1 u"""The ``lxml.etree`` module implements the extended ElementTree API
5 __docformat__ = u"restructuredtext en"
8 'AttributeBasedElementClassLookup', 'C14NError', 'CDATA',
9 'Comment', 'CommentBase', 'CustomElementClassLookup', 'DEBUG',
10 'DTD', 'DTDError', 'DTDParseError', 'DTDValidateError',
11 'DocumentInvalid', 'ETCompatXMLParser', 'ETXPath', 'Element',
12 'ElementBase', 'ElementClassLookup', 'ElementDefaultClassLookup',
13 'ElementNamespaceClassLookup', 'ElementTree', 'Entity', 'EntityBase',
14 'Error', 'ErrorDomains', 'ErrorLevels', 'ErrorTypes', 'Extension',
15 'FallbackElementClassLookup', 'FunctionNamespace', 'HTML',
16 'HTMLParser', 'LIBXML_COMPILED_VERSION', 'LIBXML_VERSION',
17 'LIBXSLT_COMPILED_VERSION', 'LIBXSLT_VERSION', 'LXML_VERSION',
18 'LxmlError', 'LxmlRegistryError', 'LxmlSyntaxError',
19 'NamespaceRegistryError', 'PI', 'PIBase', 'ParseError',
20 'ParserBasedElementClassLookup', 'ParserError', 'ProcessingInstruction',
21 'PyErrorLog', 'PythonElementClassLookup', 'QName', 'RelaxNG',
22 'RelaxNGError', 'RelaxNGErrorTypes', 'RelaxNGParseError',
23 'RelaxNGValidateError', 'Resolver', 'Schematron', 'SchematronError',
24 'SchematronParseError', 'SchematronValidateError', 'SerialisationError',
25 'SubElement', 'TreeBuilder', 'XInclude', 'XIncludeError', 'XML',
26 'XMLDTDID', 'XMLID', 'XMLParser', 'XMLSchema', 'XMLSchemaError',
27 'XMLSchemaParseError', 'XMLSchemaValidateError', 'XMLSyntaxError',
28 'XMLTreeBuilder', 'XPath', 'XPathDocumentEvaluator', 'XPathError',
29 'XPathEvalError', 'XPathEvaluator', 'XPathFunctionError', 'XPathResultError',
30 'XPathSyntaxError', 'XSLT', 'XSLTAccessControl', 'XSLTApplyError',
31 'XSLTError', 'XSLTExtension', 'XSLTExtensionError', 'XSLTParseError',
32 'XSLTSaveError', 'cleanup_namespaces', 'clear_error_log', 'dump',
33 'fromstring', 'fromstringlist', 'get_default_parser', 'iselement',
34 'iterparse', 'iterwalk', 'parse', 'parseid', 'register_namespace',
35 'set_default_parser', 'set_element_class_lookup', 'strip_attributes',
36 'strip_elements', 'strip_tags', 'tostring', 'tostringlist', 'tounicode',
37 'use_global_python_log'
40 cimport tree, python, config
41 from tree cimport xmlDoc, xmlNode, xmlAttr, xmlNs, _isElement, _getNs
42 from python cimport callable, _cstr, _isString
51 import builtins as __builtin__
55 _unicode = __builtin__.unicode
56 except AttributeError:
58 _unicode = __builtin__.str
62 cdef object os_path_abspath
63 from os.path import abspath as os_path_abspath
65 cdef object BytesIO, StringIO
67 from io import BytesIO, StringIO
68 except (ImportError, AttributeError):
69 from StringIO import StringIO, StringIO as BytesIO
71 cdef object _elementpath
83 cdef object ITER_EMPTY
86 cdef object EMPTY_READ_ONLY_DICT
87 EMPTY_READ_ONLY_DICT = python.PyDictProxy_New({})
91 # any libxml C argument/variable is prefixed with c_
92 # any non-public function/class is prefixed with an underscore
93 # instance creation is always through factories
95 # what to do with libxml2/libxslt error messages?
100 # maximum number of lines in the libxml2/xslt log if __DEBUG == 1
101 DEF __MAX_LOG_SIZE = 100
103 # make the compiled-in debug state publicly available
106 # global per-thread setup
107 tree.xmlThrDefIndentTreeOutput(1)
108 tree.xmlThrDefLineNumbersDefaultValue(1)
112 # initialize parser (and threading)
113 xmlparser.xmlInitParser()
116 cdef object _FILENAME_ENCODING
117 _FILENAME_ENCODING = sys.getfilesystemencoding()
118 if _FILENAME_ENCODING is None:
119 _FILENAME_ENCODING = sys.getdefaultencoding()
120 if _FILENAME_ENCODING is None:
121 _FILENAME_ENCODING = b'ascii'
123 _FILENAME_ENCODING = _FILENAME_ENCODING.encode(u"UTF-8")
124 cdef char* _C_FILENAME_ENCODING
125 _C_FILENAME_ENCODING = _cstr(_FILENAME_ENCODING)
127 # set up some default namespace prefixes
128 cdef object _DEFAULT_NAMESPACE_PREFIXES
129 _DEFAULT_NAMESPACE_PREFIXES = {
130 b"http://www.w3.org/XML/1998/namespace": b'xml',
131 b"http://www.w3.org/1999/xhtml": b"html",
132 b"http://www.w3.org/1999/XSL/Transform": b"xsl",
133 b"http://www.w3.org/1999/02/22-rdf-syntax-ns#": b"rdf",
134 b"http://schemas.xmlsoap.org/wsdl/": b"wsdl",
136 b"http://www.w3.org/2001/XMLSchema": b"xs",
137 b"http://www.w3.org/2001/XMLSchema-instance": b"xsi",
139 b"http://purl.org/dc/elements/1.1/": b"dc",
141 b"http://codespeak.net/lxml/objectify/pytype" : b"py",
144 cdef object _check_internal_prefix = re.compile(b"ns\d+$").match
def register_namespace(prefix, uri):
    u"""Registers a namespace prefix that newly created Elements in that
    namespace will use.  The registry is global, and any existing
    mapping for either the given prefix or the namespace URI will be
    removed.

    Raises ValueError if the prefix has the ``ns<digits>`` form, which is
    reserved for internal use.
    """
    prefix_utf, uri_utf = _utf8(prefix), _utf8(uri)
    if _check_internal_prefix(prefix_utf):
        raise ValueError("Prefix format reserved for internal use")
    _tagValidOrRaise(prefix_utf)
    _uriValidOrRaise(uri_utf)
    # Iterate over a snapshot of the registry: deleting entries while
    # looping over the live items() view raises
    # "RuntimeError: dictionary changed size during iteration" on Python 3.
    for k, v in list(_DEFAULT_NAMESPACE_PREFIXES.items()):
        if k == uri_utf or v == prefix_utf:
            del _DEFAULT_NAMESPACE_PREFIXES[k]
    _DEFAULT_NAMESPACE_PREFIXES[uri_utf] = prefix_utf
163 # Error superclass for ElementTree compatibility
164 class Error(Exception):
167 # module level superclass for all exceptions
168 class LxmlError(Error):
169 u"""Main exception base class for lxml. All other exceptions inherit from
172 def __init__(self, message, error_log=None):
173 if python.PY_VERSION_HEX >= 0x02050000:
174 # Python >= 2.5 uses new style class exceptions
175 super(_Error, self).__init__(message)
177 error_super_init(self, message)
178 if error_log is None:
179 self.error_log = __copyGlobalErrorLog()
181 self.error_log = error_log.copy()
183 cdef object _Error = Error if python.PY_VERSION_HEX >= 0x02050000 else None
184 cdef object error_super_init = Error.__init__ if python.PY_VERSION_HEX < 0x02050000 else None
187 # superclass for all syntax errors
188 class LxmlSyntaxError(LxmlError, SyntaxError):
189 u"""Base class for all syntax errors.
193 class C14NError(LxmlError):
194 u"""Error during C14N serialisation.
198 # version information
199 cdef __unpackDottedVersion(version):
200 cdef list version_list = []
201 l = (version.decode("ascii").replace(u'-', u'.').split(u'.') + [0]*4)[:4]
206 if item.startswith(u'dev'):
209 elif item.startswith(u'alpha'):
212 elif item.startswith(u'beta'):
219 version_list.append(item)
220 return tuple(version_list)
cdef __unpackIntVersion(int c_version):
    # Split a packed integer version into three two-digit (base 100)
    # components, most significant first, and return them as a tuple.
    # Floor division is spelled out with '//' because a plain '/' turns
    # into true division (float results) under Python 3 division semantics.
    return (
        ((c_version // (100*100)) % 100),
        ((c_version // 100)       % 100),
        (c_version                % 100)
        )
229 cdef int _LIBXML_VERSION_INT
231 _LIBXML_VERSION_INT = int(
232 re.match(u'[0-9]+', (tree.xmlParserVersion).decode("ascii")).group(0))
234 print u"Unknown libxml2 version: %s" % (tree.xmlParserVersion).decode("ascii")
235 _LIBXML_VERSION_INT = 0
237 LIBXML_VERSION = __unpackIntVersion(_LIBXML_VERSION_INT)
238 LIBXML_COMPILED_VERSION = __unpackIntVersion(tree.LIBXML_VERSION)
239 LXML_VERSION = __unpackDottedVersion(tree.LXML_VERSION_STRING)
241 __version__ = (tree.LXML_VERSION_STRING).decode("ascii")
244 # class for temporary storage of Python references,
245 # used e.g. for XPath results
246 cdef class _TempStore:
251 cdef int add(self, obj) except -1:
252 self._storage.append(obj)
255 cdef int clear(self) except -1:
259 # class for temporarily storing exceptions raised in extensions
260 cdef class _ExceptionContext:
261 cdef object _exc_info
262 cdef void clear(self):
263 self._exc_info = None
265 cdef void _store_raised(self):
266 self._exc_info = sys.exc_info()
268 cdef void _store_exception(self, exception):
269 self._exc_info = (exception, None, None)
271 cdef bint _has_raised(self):
272 return self._exc_info is not None
274 cdef int _raise_if_stored(self) except -1:
275 if self._exc_info is None:
277 type, value, traceback = self._exc_info
278 self._exc_info = None
279 if value is None and traceback is None:
282 raise type, value, traceback
285 # forward declarations
286 cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]
287 cdef public class _Element [ type LxmlElementType, object LxmlElement ]
288 cdef class _BaseParser
290 ctypedef public xmlNode* (*_node_to_node_function)(xmlNode*)
292 ################################################################################
295 include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.)
296 include "apihelpers.pxi" # Private helper functions
297 include "xmlerror.pxi" # Error and log handling
300 ################################################################################
303 cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]:
304 u"""Internal base class to reference a libxml document.
306 When instances of this class are garbage collected, the libxml
307 document is cleaned up.
310 cdef bytes _prefix_tail
312 cdef _BaseParser _parser
314 def __dealloc__(self):
315 # if there are no more references to the document, it is safe
316 # to clean the whole thing up, as all nodes have a reference to
318 tree.xmlFreeDoc(self._c_doc)
321 # return an element proxy for the document root
323 c_node = tree.xmlDocGetRootElement(self._c_doc)
326 return _elementFactory(self, c_node)
328 cdef bint hasdoctype(self):
329 # DOCTYPE gets parsed into internal subset (xmlDTD*)
330 return self._c_doc is not NULL and self._c_doc.intSubset is not NULL
332 cdef getdoctype(self):
333 # get doctype info: root tag, public/system ID (or None if not known)
334 cdef tree.xmlDtd* c_dtd
335 cdef xmlNode* c_root_node
338 c_dtd = self._c_doc.intSubset
339 if c_dtd is not NULL:
340 if c_dtd.ExternalID is not NULL:
341 public_id = funicode(c_dtd.ExternalID)
342 if c_dtd.SystemID is not NULL:
343 sys_url = funicode(c_dtd.SystemID)
344 c_dtd = self._c_doc.extSubset
345 if c_dtd is not NULL:
346 if not public_id and c_dtd.ExternalID is not NULL:
347 public_id = funicode(c_dtd.ExternalID)
348 if not sys_url and c_dtd.SystemID is not NULL:
349 sys_url = funicode(c_dtd.SystemID)
350 c_root_node = tree.xmlDocGetRootElement(self._c_doc)
351 if c_root_node is NULL:
354 root_name = funicode(c_root_node.name)
355 return (root_name, public_id, sys_url)
357 cdef getxmlinfo(self):
358 # return XML version and encoding (or None if not known)
359 cdef xmlDoc* c_doc = self._c_doc
360 if c_doc.version is NULL:
363 version = funicode(c_doc.version)
364 if c_doc.encoding is NULL:
367 encoding = funicode(c_doc.encoding)
368 return (version, encoding)
370 cdef isstandalone(self):
# returns True for standalone="yes", False for standalone="no" (or when
# the flag is missing from the declaration), None if there is no declaration
373 if self._c_doc.standalone == -1:
376 return <bint>(self._c_doc.standalone == 1)
378 cdef bytes buildNewPrefix(self):
379 # get a new unique prefix ("nsX") for this document
381 if self._ns_counter < len(_PREFIX_CACHE):
382 ns = _PREFIX_CACHE[self._ns_counter]
384 ns = python.PyBytes_FromFormat("ns%d", self._ns_counter)
385 if self._prefix_tail is not None:
386 ns += self._prefix_tail
387 self._ns_counter += 1
388 if self._ns_counter < 0:
391 if self._prefix_tail is None:
392 self._prefix_tail = b"A"
394 self._prefix_tail += b"A"
397 cdef xmlNs* _findOrBuildNodeNs(self, xmlNode* c_node,
398 char* c_href, char* c_prefix,
399 bint is_attribute) except NULL:
400 u"""Get or create namespace structure for a node. Reuses the prefix if
405 cdef python.PyObject* dict_result
406 if c_node.type != tree.XML_ELEMENT_NODE:
407 assert c_node.type == tree.XML_ELEMENT_NODE, \
408 u"invalid node type %d, expected %d" % (
409 c_node.type, tree.XML_ELEMENT_NODE)
410 # look for existing ns declaration
411 c_ns = _searchNsByHref(c_node, c_href, is_attribute)
413 if is_attribute and c_ns.prefix is NULL:
414 # do not put namespaced attributes into the default
415 # namespace as this would break serialisation
420 # none found => determine a suitable new prefix
422 dict_result = python.PyDict_GetItem(
423 _DEFAULT_NAMESPACE_PREFIXES, c_href)
424 if dict_result is not NULL:
425 prefix = <object>dict_result
427 prefix = self.buildNewPrefix()
428 c_prefix = _cstr(prefix)
430 # make sure the prefix is not in use already
431 while tree.xmlSearchNs(self._c_doc, c_node, c_prefix) is not NULL:
432 prefix = self.buildNewPrefix()
433 c_prefix = _cstr(prefix)
435 # declare the namespace and return it
436 c_ns = tree.xmlNewNs(c_node, c_href, c_prefix)
438 python.PyErr_NoMemory()
441 cdef int _setNodeNs(self, xmlNode* c_node, char* href) except -1:
442 u"Lookup namespace structure and set it for the node."
444 c_ns = self._findOrBuildNodeNs(c_node, href, NULL, 0)
445 tree.xmlSetNs(c_node, c_ns)
447 cdef tuple __initPrefixCache():
449 return tuple([ python.PyBytes_FromFormat("ns%d", i)
450 for i in range(30) ])
452 cdef tuple _PREFIX_CACHE
453 _PREFIX_CACHE = __initPrefixCache()
455 cdef _Document _documentFactory(xmlDoc* c_doc, _BaseParser parser):
456 cdef _Document result
457 result = _Document.__new__(_Document)
458 result._c_doc = c_doc
459 result._ns_counter = 0
460 result._prefix_tail = None
462 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
463 result._parser = parser
468 u"Document information provided by parser and DTD."
470 def __cinit__(self, tree):
471 u"Create a DocInfo object for an ElementTree object or root Element."
472 self._doc = _documentOrRaise(tree)
473 root_name, public_id, system_url = self._doc.getdoctype()
474 if not root_name and (public_id or system_url):
475 raise ValueError, u"Could not find root node"
478 u"Returns the name of the root node as defined by the DOCTYPE."
480 root_name, public_id, system_url = self._doc.getdoctype()
484 u"Returns the public ID of the DOCTYPE."
486 root_name, public_id, system_url = self._doc.getdoctype()
490 u"Returns the system ID of the DOCTYPE."
492 root_name, public_id, system_url = self._doc.getdoctype()
495 property xml_version:
496 u"Returns the XML version as declared by the document."
498 xml_version, encoding = self._doc.getxmlinfo()
502 u"Returns the encoding name as declared by the document."
504 xml_version, encoding = self._doc.getxmlinfo()
508 u"""Returns the standalone flag as declared by the document. The possible
509 values are True (``standalone='yes'``), False
510 (``standalone='no'`` or flag not provided in the declaration),
511 and None (unknown or no declaration found). Note that a
512 normal truth test on this value will always tell if the
513 ``standalone`` flag was set to ``'yes'`` or not.
516 return self._doc.isstandalone()
519 u"The source URL of the document (or None if unknown)."
521 if self._doc._c_doc.URL is NULL:
523 return _decodeFilename(self._doc._c_doc.URL)
524 def __set__(self, url):
526 url = _encodeFilename(url)
527 c_oldurl = self._doc._c_doc.URL
529 self._doc._c_doc.URL = NULL
531 self._doc._c_doc.URL = tree.xmlStrdup(_cstr(url))
532 if c_oldurl is not NULL:
533 tree.xmlFree(c_oldurl)
536 u"Returns a DOCTYPE declaration string for the document."
538 root_name, public_id, system_url = self._doc.getdoctype()
541 return u'<!DOCTYPE %s PUBLIC "%s" "%s">' % (
542 root_name, public_id, system_url)
544 return u'<!DOCTYPE %s PUBLIC "%s">' % (
545 root_name, public_id)
547 return u'<!DOCTYPE %s SYSTEM "%s">' % (
548 root_name, system_url)
549 elif self._doc.hasdoctype():
550 return u'<!DOCTYPE %s>' % root_name
554 property internalDTD:
555 u"Returns a DTD validator based on the internal subset of the document."
557 return _dtdFactory(self._doc._c_doc.intSubset)
559 property externalDTD:
560 u"Returns a DTD validator based on the external subset of the document."
562 return _dtdFactory(self._doc._c_doc.extSubset)
565 cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
568 References a document object and a libxml node.
570 By pointing to a Document instance, a reference is kept to
571 _Document as long as there is some pointer to a node in it.
573 cdef python.PyObject* _gc_doc
575 cdef xmlNode* _c_node
581 Called after object initialisation. Custom subclasses may override
582 this if they recursively call _init() in the superclasses.
585 def __dealloc__(self):
586 #print "trying to free node:", <int>self._c_node
587 #displayNode(self._c_node, 0)
588 if self._c_node is not NULL:
589 _unregisterProxy(self)
590 attemptDeallocation(self._c_node)
595 def __setitem__(self, x, value):
596 u"""__setitem__(self, x, value)
598 Replaces the given subelement index or slice.
600 cdef xmlNode* c_node = NULL
602 cdef xmlDoc* c_source_doc
603 cdef _Element element
604 cdef bint left_to_right
605 cdef Py_ssize_t slicelength = 0, step = 0
606 _assertValidNode(self)
608 raise ValueError, u"cannot assign None"
609 if python.PySlice_Check(x):
611 _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
617 _replaceSlice(self, c_node, slicelength, step, left_to_right, value)
620 # otherwise: normal item assignment
622 _assertValidNode(element)
623 c_node = _findChild(self._c_node, x)
625 raise IndexError, u"list index out of range"
626 c_source_doc = element._c_node.doc
627 c_next = element._c_node.next
628 _removeText(c_node.next)
629 tree.xmlReplaceNode(c_node, element._c_node)
630 _moveTail(c_next, element._c_node)
631 moveNodeToDocument(self._doc, c_source_doc, element._c_node)
632 if not attemptDeallocation(c_node):
633 moveNodeToDocument(self._doc, c_node.doc, c_node)
635 def __delitem__(self, x):
636 u"""__delitem__(self, x)
638 Deletes the given subelement or a slice.
640 cdef xmlNode* c_node = NULL
642 cdef Py_ssize_t step = 0, slicelength = 0
643 _assertValidNode(self)
644 if python.PySlice_Check(x):
646 if _isFullSlice(<slice>x):
647 c_node = self._c_node.children
648 if c_node is not NULL:
649 if not _isElement(c_node):
650 c_node = _nextElement(c_node)
651 while c_node is not NULL:
652 c_next = _nextElement(c_node)
653 _removeNode(self._doc, c_node)
656 _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
657 _deleteSlice(self._doc, c_node, slicelength, step)
660 c_node = _findChild(self._c_node, x)
662 raise IndexError, u"index out of range: %d" % x
663 _removeText(c_node.next)
664 _removeNode(self._doc, c_node)
def __deepcopy__(self, memo):
    u"""__deepcopy__(self, memo)

    The ``memo`` argument is not used; the work is delegated to
    ``__copy__()``.
    """
    return self.__copy__()
674 cdef _Document new_doc
675 _assertValidNode(self)
676 c_doc = _copyDocRoot(self._doc._c_doc, self._c_node) # recursive
677 new_doc = _documentFactory(c_doc, self._doc._parser)
678 root = new_doc.getroot()
682 c_node = c_doc.children
683 while c_node is not NULL and c_node.type != self._c_node.type:
687 return _elementFactory(new_doc, c_node)
def set(self, key, value):
    u"""set(self, key, value)

    Assigns ``value`` to the attribute named ``key`` on this element.
    """
    _assertValidNode(self)
    _setAttributeValue(self, key, value)
def append(self, _Element element not None):
    u"""append(self, element)

    Attaches ``element`` as the new last subelement of this element.
    """
    # both the parent and the new child must be valid proxies
    _assertValidNode(self)
    _assertValidNode(element)
    _appendChild(self, element)
def addnext(self, _Element element not None):
    u"""addnext(self, element)

    Inserts ``element`` as the sibling that directly follows this
    element.

    The usual purpose is to place a processing instruction or comment
    behind the root node of a document.  At the root level, only these
    two node types are accepted (TypeError otherwise) and their tail
    text is discarded automatically.
    """
    cdef bint at_root_level
    _assertValidNode(self)
    _assertValidNode(element)
    # a parent that exists but is not an element means that this node
    # sits at the top level of its document
    at_root_level = (self._c_node.parent != NULL and
                     not _isElement(self._c_node.parent))
    if at_root_level:
        if (element._c_node.type != tree.XML_PI_NODE and
                element._c_node.type != tree.XML_COMMENT_NODE):
            raise TypeError(u"Only processing instructions and comments can be siblings of the root element")
        element.tail = None
    _appendSibling(self, element)
def addprevious(self, _Element element not None):
    u"""addprevious(self, element)

    Inserts ``element`` as the sibling that directly precedes this
    element.

    The usual purpose is to place a processing instruction or comment
    in front of the root node of a document.  At the root level, only
    these two node types are accepted (TypeError otherwise) and their
    tail text is discarded automatically.
    """
    cdef bint at_root_level
    _assertValidNode(self)
    _assertValidNode(element)
    # a parent that exists but is not an element means that this node
    # sits at the top level of its document
    at_root_level = (self._c_node.parent != NULL and
                     not _isElement(self._c_node.parent))
    if at_root_level:
        if (element._c_node.type != tree.XML_PI_NODE and
                element._c_node.type != tree.XML_COMMENT_NODE):
            raise TypeError(u"Only processing instructions and comments can be siblings of the root element")
        element.tail = None
    _prependSibling(self, element)
def extend(self, elements):
    u"""extend(self, elements)

    Appends each element produced by the iterable, in order, as a new
    child of this element.  Raises TypeError if the iterable yields
    None.
    """
    cdef _Element child
    _assertValidNode(self)
    for child in elements:
        if child is None:
            raise TypeError(u"Node must not be None")
        _assertValidNode(child)
        _appendChild(self, child)
760 Resets an element. This function removes all subelements, clears
761 all attributes and sets the text and tail properties to None.
764 cdef xmlAttr* c_attr_next
766 cdef xmlNode* c_node_next
767 _assertValidNode(self)
768 c_node = self._c_node
769 # remove self.text and self.tail
770 _removeText(c_node.children)
771 _removeText(c_node.next)
772 # remove all attributes
773 c_attr = c_node.properties
774 while c_attr is not NULL:
775 c_attr_next = c_attr.next
776 tree.xmlRemoveProp(c_attr)
778 # remove all subelements
779 c_node = c_node.children
780 if c_node is not NULL:
781 if not _isElement(c_node):
782 c_node = _nextElement(c_node)
783 while c_node is not NULL:
784 c_node_next = _nextElement(c_node)
785 _removeNode(self._doc, c_node)
def insert(self, index, _Element element not None):
    u"""insert(self, index, element)

    Inserts a subelement at the given position in this element.  If no
    child is found at ``index``, the element is appended instead.
    """
    cdef xmlNode* c_node
    cdef xmlNode* c_next
    cdef xmlDoc* c_source_doc
    _assertValidNode(self)
    _assertValidNode(element)
    c_node = _findChild(self._c_node, index)
    if c_node is NULL:
        _appendChild(self, element)
        return
    # The source document must be the one the *inserted* element comes
    # from (as in __setitem__() and replace()).  Reading it from the
    # existing child always yields this element's own document.
    c_source_doc = element._c_node.doc
    # remember the following sibling before relinking; _moveTail() needs it
    c_next = element._c_node.next
    tree.xmlAddPrevSibling(c_node, element._c_node)
    _moveTail(c_next, element._c_node)
    moveNodeToDocument(self._doc, c_source_doc, element._c_node)
def remove(self, _Element element not None):
    u"""remove(self, element)

    Detaches the given subelement from this element.  The child is
    matched by identity, not by tag value or contents.  Raises
    ValueError if ``element`` is not a direct child of this element.
    """
    cdef xmlNode* c_child
    cdef xmlNode* c_following
    _assertValidNode(self)
    _assertValidNode(element)
    c_child = element._c_node
    if c_child.parent is not self._c_node:
        raise ValueError(u"Element is not a child of this node.")
    # keep the following sibling pointer, it is gone after unlinking
    c_following = c_child.next
    tree.xmlUnlinkNode(c_child)
    _moveTail(c_following, c_child)
    # fix namespace declarations
    moveNodeToDocument(self._doc, c_child.doc, c_child)
def replace(self, _Element old_element not None,
            _Element new_element not None):
    u"""replace(self, old_element, new_element)

    Puts ``new_element`` in the place of the subelement ``old_element``.
    Raises ValueError if ``old_element`` is not a direct child of this
    element.
    """
    cdef xmlNode* c_outgoing
    cdef xmlNode* c_outgoing_next
    cdef xmlNode* c_incoming
    cdef xmlNode* c_incoming_next
    cdef xmlDoc* c_incoming_doc
    _assertValidNode(self)
    _assertValidNode(old_element)
    _assertValidNode(new_element)
    c_outgoing = old_element._c_node
    if c_outgoing.parent is not self._c_node:
        raise ValueError(u"Element is not a child of this node.")
    c_incoming = new_element._c_node
    # capture sibling pointers and the source document before relinking
    c_outgoing_next = c_outgoing.next
    c_incoming_next = c_incoming.next
    c_incoming_doc = c_incoming.doc
    tree.xmlReplaceNode(c_outgoing, c_incoming)
    _moveTail(c_incoming_next, c_incoming)
    _moveTail(c_outgoing_next, c_outgoing)
    moveNodeToDocument(self._doc, c_incoming_doc, c_incoming)
    # fix namespace declarations
    moveNodeToDocument(self._doc, c_outgoing.doc, c_outgoing)
861 if self._tag is not None:
863 _assertValidNode(self)
864 self._tag = _namespacedName(self._c_node)
867 def __set__(self, value):
868 cdef _BaseParser parser
869 _assertValidNode(self)
870 ns, name = _getNsTag(value)
871 parser = self._doc._parser
872 if parser is not None and parser._for_html:
873 _htmlTagValidOrRaise(name)
875 _tagValidOrRaise(name)
877 tree.xmlNodeSetName(self._c_node, _cstr(name))
879 self._c_node.ns = NULL
881 self._doc._setNodeNs(self._c_node, _cstr(ns))
884 u"""Element attribute dictionary. Where possible, use get(), set(),
885 keys(), values() and items() to access element attributes.
888 _assertValidNode(self)
892 u"""Text before the first subelement. This is either a string or
893 the value None, if there was no text.
896 _assertValidNode(self)
897 return _collectText(self._c_node.children)
899 def __set__(self, value):
900 _assertValidNode(self)
901 if isinstance(value, QName):
902 value = python.PyUnicode_FromEncodedObject(
903 _resolveQNameText(self, value), 'UTF-8', 'strict')
904 _setNodeText(self._c_node, value)
906 # using 'del el.text' is the wrong thing to do
908 # _setNodeText(self._c_node, None)
911 u"""Text after this element's end tag, but before the next sibling
912 element's start tag. This is either a string or the value None, if
916 _assertValidNode(self)
917 return _collectText(self._c_node.next)
919 def __set__(self, value):
920 _assertValidNode(self)
921 _setTailText(self._c_node, value)
923 # using 'del el.tail' is the wrong thing to do
925 # _setTailText(self._c_node, None)
927 # not in ElementTree, read-only
929 u"""Namespace prefix or None.
932 if self._c_node.ns is not NULL:
933 if self._c_node.ns.prefix is not NULL:
934 return funicode(self._c_node.ns.prefix)
937 # not in ElementTree, read-only
939 u"""Original line number as found by the parser or None if unknown.
943 _assertValidNode(self)
944 line = tree.xmlGetLineNo(self._c_node)
950 def __set__(self, line):
951 _assertValidNode(self)
953 self._c_node.line = 0
955 self._c_node.line = line
957 # not in ElementTree, read-only
959 u"""Namespace prefix->URI mapping known in the context of this
960 Element. This includes all namespace declarations of the
963 Note that changing the returned dict has no effect on the Element.
969 _assertValidNode(self)
970 c_node = self._c_node
971 while c_node is not NULL and c_node.type == tree.XML_ELEMENT_NODE:
973 while c_ns is not NULL:
974 prefix = None if c_ns.prefix is NULL else funicode(c_ns.prefix)
975 if prefix not in nsmap:
976 nsmap[prefix] = None if c_ns.href is NULL else funicode(c_ns.href)
978 c_node = c_node.parent
981 # not in ElementTree, read-only
983 u"""The base URI of the Element (xml:base or HTML base URL).
984 None if the base URI is unknown.
986 Note that the value depends on the URL of the document that
987 holds the Element if there is no xml:base attribute on the
988 Element or its ancestors.
990 Setting this property will set an xml:base attribute on the
991 Element, regardless of the document type (XML or HTML).
995 _assertValidNode(self)
996 c_base = tree.xmlNodeGetBase(self._doc._c_doc, self._c_node)
998 if self._doc._c_doc.URL is NULL:
1000 return _decodeFilename(self._doc._c_doc.URL)
1001 base = _decodeFilename(c_base)
1002 tree.xmlFree(c_base)
1005 def __set__(self, url):
1007 _assertValidNode(self)
1011 url = _encodeFilename(url)
1013 tree.xmlNodeSetBase(self._c_node, c_base)
1018 return u"<Element %s at 0x%x>" % (self.tag, id(self))
1020 def __getitem__(self, x):
1021 u"""Returns the subelement at the given position or the requested
1024 cdef xmlNode* c_node = NULL
1025 cdef Py_ssize_t step = 0, slicelength = 0
1026 cdef Py_ssize_t c, i
1027 cdef _node_to_node_function next_element
1029 _assertValidNode(self)
1030 if python.PySlice_Check(x):
1032 if _isFullSlice(<slice>x):
1033 return _collectChildren(self)
1034 _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
1038 next_element = _nextElement
1041 next_element = _previousElement
1044 while c_node is not NULL and c < slicelength:
1045 result.append(_elementFactory(self._doc, c_node))
1047 for i from 0 <= i < step:
1048 c_node = next_element(c_node)
1052 c_node = _findChild(self._c_node, x)
1054 raise IndexError, u"list index out of range"
1055 return _elementFactory(self._doc, c_node)
1060 Returns the number of subelements.
1062 _assertValidNode(self)
1063 return _countElements(self._c_node.children)
1065 def __nonzero__(self):
1066 #u"__nonzero__(self)" # currently fails in Py3.1
1069 u"The behavior of this method will change in future versions. "
1070 u"Use specific 'len(elem)' or 'elem is not None' test instead.",
1073 # emulate old behaviour
1074 _assertValidNode(self)
1075 return _hasChild(self._c_node)
def __contains__(self, element):
    u"""__contains__(self, element)

    True if ``element`` is an _Element whose node has this element's
    node as its direct parent; False for any other object.
    """
    cdef xmlNode* c_candidate
    _assertValidNode(self)
    if isinstance(element, _Element):
        c_candidate = (<_Element>element)._c_node
        return c_candidate is not NULL and c_candidate.parent is self._c_node
    return False
1088 return ElementChildIterator(self)
def __reversed__(self):
    u"""__reversed__(self)

    Returns an ElementChildIterator over this element that is created
    with ``reversed=True``.
    """
    return ElementChildIterator(self, reversed=True)
1094 def index(self, _Element child not None, start=None, stop=None):
1095 u"""index(self, child, start=None, stop=None)
1097 Find the position of the child within the parent.
1099 This method is not part of the original ElementTree API.
1101 cdef Py_ssize_t k, l
1102 cdef Py_ssize_t c_start, c_stop
1103 cdef xmlNode* c_child
1104 cdef xmlNode* c_start_node
1105 _assertValidNode(self)
1106 _assertValidNode(child)
1107 c_child = child._c_node
1108 if c_child.parent is not self._c_node:
1109 raise ValueError, u"Element is not a child of this node."
1111 # handle the unbounded search straight away (normal case)
1112 if stop is None and (start is None or start == 0):
1114 c_child = c_child.prev
1115 while c_child is not NULL:
1116 if _isElement(c_child):
1118 c_child = c_child.prev
1131 c_start >= c_stop and (c_stop > 0 or c_start < 0):
1132 raise ValueError, u"list.index(x): x not in slice"
1134 # for negative slice indices, check slice before searching index
1135 if c_start < 0 or c_stop < 0:
1136 # start from right, at most up to leftmost(c_start, c_stop)
1137 if c_start < c_stop:
1141 c_start_node = self._c_node.last
1143 while c_start_node != c_child and l < k:
1144 if _isElement(c_start_node):
1146 c_start_node = c_start_node.prev
1147 if c_start_node == c_child:
1148 # found! before slice end?
1149 if c_stop < 0 and l <= -c_stop:
1150 raise ValueError, u"list.index(x): x not in slice"
1152 raise ValueError, u"list.index(x): x not in slice"
1154 # now determine the index backwards from child
1155 c_child = c_child.prev
1158 # we can optimize: stop after c_stop elements if not found
1159 while c_child != NULL and k < c_stop:
1160 if _isElement(c_child):
1162 c_child = c_child.prev
1167 while c_child != NULL:
1168 if _isElement(c_child):
1170 c_child = c_child.prev
1176 if c_start != 0 or c_stop != 0:
1177 raise ValueError, u"list.index(x): x not in slice"
1179 raise ValueError, u"list.index(x): x not in list"
def get(self, key, default=None):
    u"""get(self, key, default=None)

    Looks up the attribute ``key`` on this element.  ``default`` is
    handed to the lookup together with the key.
    """
    _assertValidNode(self)
    return _getAttributeValue(self, key, default)
1192 Gets a list of attribute names. The names are returned in an
1193 arbitrary order (just like for an ordinary Python dictionary).
1195 _assertValidNode(self)
1196 return _collectAttributes(self._c_node, 1)
1201 Gets element attribute values as a sequence of strings. The
1202 attributes are returned in an arbitrary order.
1204 _assertValidNode(self)
1205 return _collectAttributes(self._c_node, 2)
1210 Gets element attributes, as a sequence. The attributes are returned in
1213 _assertValidNode(self)
1214 return _collectAttributes(self._c_node, 3)
def getchildren(self):
    u"""getchildren(self)

    Returns all direct children of this element.

    :deprecated: This method has been deprecated since ElementTree 1.3
        and lxml 2.0.  New code should call ``list(element)`` or simply
        iterate over the element.
    """
    _assertValidNode(self)
    return _collectChildren(self)
1229 def getparent(self):
1232 Returns the parent of this element or None for the root element.
1234 cdef xmlNode* c_node
1235 #_assertValidNode(self) # not needed
1236 c_node = _parentElement(self._c_node)
1239 return _elementFactory(self._doc, c_node)
1244 Returns the following sibling of this element or None.
1246 cdef xmlNode* c_node
1247 #_assertValidNode(self) # not needed
1248 c_node = _nextElement(self._c_node)
1251 return _elementFactory(self._doc, c_node)
1253 def getprevious(self):
1254 u"""getprevious(self)
1256 Returns the preceding sibling of this element or None.
1258 cdef xmlNode* c_node
1259 #_assertValidNode(self) # not needed
1260 c_node = _previousElement(self._c_node)
1263 return _elementFactory(self._doc, c_node)
def itersiblings(self, tag=None, *, preceding=False):
    u"""itersiblings(self, tag=None, preceding=False)

    Return an iterator over the siblings of this element.

    With ``preceding=False`` (the default) the following siblings are
    visited in forward direction.  With ``preceding=True`` the
    preceding siblings are visited in reverse document order, i.e.
    starting right before this element and going left.  Pass ``tag``
    to restrict the generated elements to a specific tag name.
    """
    siblings = SiblingsIterator(self, tag, preceding=preceding)
    return siblings
def iterancestors(self, tag=None):
    u"""iterancestors(self, tag=None)

    Return an iterator over the ancestors of this element, walking
    from parent to parent.

    Pass ``tag`` to restrict the generated elements to a specific tag
    name.
    """
    ancestors = AncestorsIterator(self, tag)
    return ancestors
def iterdescendants(self, tag=None):
    u"""iterdescendants(self, tag=None)

    Return an iterator over the descendants of this element in
    document order.

    Unlike ``el.iter()``, the element itself is not yielded (the
    iterator is created with ``inclusive=False``).  Pass ``tag`` to
    restrict the generated elements to a specific tag name.
    """
    descendants = ElementDepthFirstIterator(self, tag, inclusive=False)
    return descendants
def iterchildren(self, tag=None, *, reversed=False):
    u"""iterchildren(self, tag=None, reversed=False)

    Return an iterator over the children of this element.

    In contrast to plain iteration over the element, the generated
    elements can be restricted to a specific tag name with ``tag`` and
    visited in reverse with ``reversed``.
    """
    children = ElementChildIterator(self, tag, reversed=reversed)
    return children
def getroottree(self):
    u"""getroottree(self)

    Return an ElementTree for the root node of the document that
    contains this element.

    The result is the same as following ``element.getparent()`` up the
    tree until it returns None (for the root element) and then
    building an ElementTree for the last parent that was returned.
    """
    doc = self._doc
    _assertValidDoc(doc)
    return _elementTreeFactory(doc, None)
def getiterator(self, tag=None):
    u"""getiterator(self, tag=None)

    Return an iterator over all elements in the subtree in document
    order (depth first pre-order), starting with this element.

    The result can be limited to elements with a specific tag
    (``tag="xyz"``) or from one namespace (``tag="{ns}*"``).  Passing
    one of the Element, Comment, ProcessingInstruction or Entity
    factory functions selects only that node type.

    :deprecated: This method is deprecated as of ElementTree 1.3 and
        lxml 2.0.  It returns an iterator in lxml, which diverges from
        the original ElementTree behaviour.  Use ``element.iter()`` if
        you want an efficient iterator; only use this method in new
        code if you need backwards compatibility with older versions
        of lxml or ElementTree.
    """
    subtree = ElementDepthFirstIterator(self, tag)
    return subtree
def iter(self, tag=None):
    u"""iter(self, tag=None)

    Return an iterator over all elements in the subtree in document
    order (depth first pre-order), starting with this element.

    The result can be limited to elements with a specific tag
    (``tag="xyz"``) or from one namespace (``tag="{ns}*"``).  Passing
    one of the Element, Comment, ProcessingInstruction or Entity
    factory functions selects only that node type.
    """
    subtree = ElementDepthFirstIterator(self, tag)
    return subtree
def itertext(self, tag=None, *, with_tail=True):
    u"""itertext(self, tag=None, with_tail=True)

    Return an iterator over the text content of a subtree.

    Pass ``tag`` to restrict the text content to a specific tag name.
    Set ``with_tail`` to ``False`` to skip tail text.
    """
    texts = ElementTextIterator(self, tag, with_tail=with_tail)
    return texts
def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
    u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)

    Create a new element that is associated with the same document as
    this one.
    """
    _assertValidDoc(self._doc)
    new_element = _makeElement(
        _tag, NULL, self._doc, None, None, None, attrib, nsmap, _extra)
    return new_element
def find(self, path, namespaces=None):
    u"""find(self, path, namespaces=None)

    Return the first matching subelement, looked up by tag name or
    path.

    ``namespaces`` is an optional prefix-to-namespace mapping that
    allows XPath prefixes to be used in the path expression.
    """
    # plain paths go straight through; QName objects are unwrapped to
    # their qualified-name text first
    if not isinstance(path, QName):
        return _elementpath.find(self, path, namespaces)
    return _elementpath.find(self, (<QName>path).text, namespaces)
def findtext(self, path, default=None, namespaces=None):
    u"""findtext(self, path, default=None, namespaces=None)

    Return the text of the first matching subelement, looked up by tag
    name or path.

    ``namespaces`` is an optional prefix-to-namespace mapping that
    allows XPath prefixes to be used in the path expression.
    """
    # plain paths go straight through; QName objects are unwrapped to
    # their qualified-name text first
    if not isinstance(path, QName):
        return _elementpath.findtext(self, path, default, namespaces)
    return _elementpath.findtext(
        self, (<QName>path).text, default, namespaces)
def findall(self, path, namespaces=None):
    u"""findall(self, path, namespaces=None)

    Return all matching subelements, looked up by tag name or path.

    ``namespaces`` is an optional prefix-to-namespace mapping that
    allows XPath prefixes to be used in the path expression.
    """
    # plain paths go straight through; QName objects are unwrapped to
    # their qualified-name text first
    if not isinstance(path, QName):
        return _elementpath.findall(self, path, namespaces)
    return _elementpath.findall(self, (<QName>path).text, namespaces)
def iterfind(self, path, namespaces=None):
    u"""iterfind(self, path, namespaces=None)

    Return an iterator over all matching subelements, looked up by tag
    name or path.

    ``namespaces`` is an optional prefix-to-namespace mapping that
    allows XPath prefixes to be used in the path expression.
    """
    # plain paths go straight through; QName objects are unwrapped to
    # their qualified-name text first
    if not isinstance(path, QName):
        return _elementpath.iterfind(self, path, namespaces)
    return _elementpath.iterfind(self, (<QName>path).text, namespaces)
def xpath(self, _path, *, namespaces=None, extensions=None,
          smart_strings=True, **_variables):
    u"""xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)

    Evaluate an XPath expression with this element as context node.

    A fresh ``XPathElementEvaluator`` is built for every call and then
    invoked with ``_path`` and the keyword ``_variables``.
    """
    return XPathElementEvaluator(
        self, namespaces=namespaces, extensions=extensions,
        smart_strings=smart_strings)(_path, **_variables)
cdef extern from "etree_defs.h":
    # macro call to 't->tp_new()' for fast instantiation
    # (exposed to Cython as NEW_ELEMENT, bound to the C name "PY_NEW")
    cdef object NEW_ELEMENT "PY_NEW" (object t)
1451 cdef _Element _elementFactory(_Document doc, xmlNode* c_node):
1452 cdef _Element result
1453 result = getProxy(c_node)
1454 if result is not None:
1459 element_class = LOOKUP_ELEMENT_CLASS(
1460 ELEMENT_CLASS_LOOKUP_STATE, doc, c_node)
1461 if hasProxy(c_node):
1462 # prevent re-entry race condition - we just called into Python
1463 return getProxy(c_node)
1464 result = NEW_ELEMENT(element_class)
1465 if hasProxy(c_node):
1466 # prevent re-entry race condition - we just called into Python
1467 result._c_node = NULL
1468 return getProxy(c_node)
1470 _registerProxy(result, doc, c_node)
1471 if element_class is not _Element:
1476 cdef class __ContentOnlyElement(_Element):
cdef int _raiseImmutable(self) except -1:
    # Common failure path: set(), append(), insert() and __setitem__()
    # of this class all end up here.  Always raises TypeError.
    raise TypeError, u"this element does not have children or attributes"
def set(self, key, value):
    u"set(self, key, value)"
    # not supported on this class: _raiseImmutable() raises TypeError
    self._raiseImmutable()
def append(self, value):
    u"append(self, value)"
    # not supported on this class: _raiseImmutable() raises TypeError
    self._raiseImmutable()
def insert(self, index, value):
    u"insert(self, index, value)"
    # not supported on this class: _raiseImmutable() raises TypeError
    self._raiseImmutable()
def __setitem__(self, index, value):
    u"__setitem__(self, index, value)"
    # not supported on this class: _raiseImmutable() raises TypeError
    self._raiseImmutable()
1502 _assertValidNode(self)
1503 if self._c_node.content is NULL:
1506 return funicode(self._c_node.content)
1508 def __set__(self, value):
1509 cdef tree.xmlDict* c_dict
1511 _assertValidNode(self)
1515 value = _utf8(value)
1516 c_text = _cstr(value)
1517 tree.xmlNodeSetContent(self._c_node, c_text)
1520 def __getitem__(self, x):
1521 u"__getitem__(self, x)"
1522 if python.PySlice_Check(x):
1525 raise IndexError, u"list index out of range"
1531 def get(self, key, default=None):
1532 u"get(self, key, default=None)"
1547 cdef class _Comment(__ContentOnlyElement):
1553 return u"<!--%s-->" % self.text
1555 cdef class _ProcessingInstruction(__ContentOnlyElement):
1558 return ProcessingInstruction
1561 # not in ElementTree
1563 _assertValidNode(self)
1564 return funicode(self._c_node.name)
def __set__(self, value):
    # Rename the underlying node to the new target, after converting
    # the value with _utf8().
    _assertValidNode(self)
    target_utf = _utf8(value)
    tree.xmlNodeSetName(self._c_node, _cstr(target_utf))
1575 return u"<?%s %s?>" % (self.target, text)
1577 return u"<?%s?>" % self.target
def get(self, key, default=None):
    u"""get(self, key, default=None)

    Parse pseudo-attributes from the text content of the processing
    instruction, look for one named ``key`` and return its value.

    This is only a convenience method for the most common case that
    all text content consists of attribute-like name-value pairs with
    properly quoted values.  It is not guaranteed to work for all
    possible text content.
    """
    pseudo_attributes = self.attrib
    return pseudo_attributes.get(key, default)
1594 u"""Returns a dict containing all pseudo-attributes that can be
1595 parsed from the text content of this processing instruction.
1596 Note that modifying the dict currently has no effect on the
1597 XML node, although this is not guaranteed to stay this way.
1600 return { attr : (value1 or value2)
1601 for attr, value1, value2 in _FIND_PI_ATTRIBUTES(u' ' + self.text) }
# Bound ``findall`` of a pattern that matches whitespace-led
# ``name = value`` pairs whose value is wrapped in single or double
# quotes.  Each match is a (name, single-quoted value, double-quoted
# value) tuple; the group for the quote style not used is empty.
cdef object _FIND_PI_ATTRIBUTES = re.compile(ur'\s+(\w+)\s*=\s*(?:\'([^\']*)\'|"([^"]*)")', re.U).findall
1605 cdef class _Entity(__ContentOnlyElement):
1611 # not in ElementTree
1613 _assertValidNode(self)
1614 return funicode(self._c_node.name)
def __set__(self, value):
    # Rename the entity reference node.
    #
    # Raises ValueError if the new name contains '&' or ';', the
    # delimiters of an entity reference.
    _assertValidNode(self)
    value_utf = _utf8(value)
    # Validate with a real exception instead of ``assert``: assertions
    # can be disabled, which would silently let a malformed name
    # through.  The check runs on the encoded byte string so that it
    # behaves the same for unicode and byte string input.
    if b'&' in value_utf or b';' in value_utf:
        raise ValueError, u"Invalid entity name '%s'" % value
    tree.xmlNodeSetName(self._c_node, _cstr(value_utf))
1624 # FIXME: should this be None or '&[VALUE];' or the resolved
1627 _assertValidNode(self)
1628 return u'&%s;' % funicode(self._c_node.name)
1631 return u"&%s;" % self.name
1635 u"""QName(text_or_uri_or_element, tag=None)
1637 QName wrapper for qualified XML names.
1639 Pass a tag name by itself or a namespace URI and a tag name to
1640 create a qualified name. Alternatively, pass an Element to
1641 extract its tag name.
1643 The ``text`` property holds the qualified name in
1644 ``{namespace}tagname`` notation. The ``namespace`` and
1645 ``localname`` properties hold the respective parts of the tag
1648 You can pass QName objects wherever a tag name is expected. Also,
1649 setting Element text from a QName will resolve the namespace
1650 prefix and set a qualified text value. This is helpful in XML
1651 languages like SOAP or XML-Schema that use prefixed tag names in
1654 cdef readonly object text
1655 cdef readonly object localname
1656 cdef readonly object namespace
1657 def __init__(self, text_or_uri_or_element, tag=None):
1658 if not _isString(text_or_uri_or_element):
1659 if isinstance(text_or_uri_or_element, _Element):
1660 text_or_uri_or_element = (<_Element>text_or_uri_or_element).tag
1661 if not _isString(text_or_uri_or_element):
1662 raise ValueError, (u"Invalid input tag of type %r" %
1663 type(text_or_uri_or_element))
1664 elif isinstance(text_or_uri_or_element, QName):
1665 text_or_uri_or_element = (<QName>text_or_uri_or_element).text
1667 text_or_uri_or_element = unicode(text_or_uri_or_element)
1669 ns_utf, tag_utf = _getNsTag(text_or_uri_or_element)
1671 # either ('ns', 'tag') or ('{ns}oldtag', 'newtag')
1673 ns_utf = tag_utf # case 1: namespace ended up as tag name
1674 tag_utf = _utf8(tag)
1675 _tagValidOrRaise(tag_utf)
1676 self.localname = python.PyUnicode_FromEncodedObject(
1677 tag_utf, 'UTF-8', NULL)
1679 self.namespace = None
1680 self.text = self.localname
1682 self.namespace = python.PyUnicode_FromEncodedObject(
1683 ns_utf, 'UTF-8', NULL)
1684 self.text = u"{%s}%s" % (self.namespace, self.localname)
1688 return self.text.__hash__()
1689 def __richcmp__(one, other, int op):
1690 if not _isString(one):
1692 if not _isString(other):
1693 other = unicode(other)
1694 return python.PyObject_RichCompare(one, other, op)
1697 cdef public class _ElementTree [ type LxmlElementTreeType,
1698 object LxmlElementTree ]:
1700 cdef _Element _context_node
1702 # Note that _doc is only used to store the original document if we do not
1703 # have a _context_node. All methods should prefer self._context_node._doc
1704 # to honour tree restructuring. _doc can happily be None!
cdef _assertHasRoot(self):
    u"""Check that this tree has a root element.

    We have to take care here: the document may not have a root node!
    This can happen if ElementTree() is called without any argument and
    the caller 'forgets' to call parse() afterwards, so this is a bug in
    the calling code.

    Fails with an AssertionError when ``_context_node`` is None.
    """
    assert self._context_node is not None, \
           u"ElementTree not initialized, missing root"
1715 def parse(self, source, _BaseParser parser=None, *, base_url=None):
1716 u"""parse(self, source, parser=None, base_url=None)
1718 Updates self with the content of source and returns its root
1720 cdef _Document doc = None
1722 doc = _parseDocument(source, parser, base_url)
1723 self._context_node = doc.getroot()
1724 if self._context_node is None:
1726 except _TargetParserResult, result_container:
1727 # raises a TypeError if we don't get an _Element
1728 self._context_node = result_container.result
1729 return self._context_node
1731 def _setroot(self, _Element root not None):
1732 u"""_setroot(self, root)
1734 Relocate the ElementTree to a new root node.
1736 _assertValidNode(root)
1737 if root._c_node.type != tree.XML_ELEMENT_NODE:
1738 raise TypeError, u"Only elements can be the root of an ElementTree"
1739 self._context_node = root
1745 Gets the root element for this tree.
1747 return self._context_node
1750 return _elementTreeFactory(self._doc, self._context_node)
def __deepcopy__(self, memo):
    u"""__deepcopy__(self, memo)

    Return a deep copy of this tree.

    If the tree has a context node, that node is copied together with
    its non-element siblings, and the internal and external DTD
    subsets of the source document are copied into the new document
    when it does not have them yet.  Otherwise, if only a document is
    stored, the whole document is copied.

    ``memo`` is not used.
    """
    cdef _Element root
    cdef _Document doc
    cdef xmlDoc* c_doc
    if self._context_node is not None:
        root = self._context_node.__copy__()
        _copyNonElementSiblings(self._context_node._c_node, root._c_node)
        doc = root._doc
        c_doc = self._context_node._doc._c_doc
        if c_doc.intSubset is not NULL and doc._c_doc.intSubset is NULL:
            doc._c_doc.intSubset = tree.xmlCopyDtd(c_doc.intSubset)
            if doc._c_doc.intSubset is NULL:
                python.PyErr_NoMemory()
        # Same rule as for the internal subset: copy only when the new
        # document has none.  The previous condition was inverted
        # ("not ... is NULL"), so the external subset was never copied
        # into a fresh document and an existing one was overwritten.
        if c_doc.extSubset is not NULL and doc._c_doc.extSubset is NULL:
            doc._c_doc.extSubset = tree.xmlCopyDtd(c_doc.extSubset)
            if doc._c_doc.extSubset is NULL:
                python.PyErr_NoMemory()
        return _elementTreeFactory(None, root)
    elif self._doc is not None:
        _assertValidDoc(self._doc)
        c_doc = tree.xmlCopyDoc(self._doc._c_doc, 1)
        if c_doc is NULL:
            python.PyErr_NoMemory()
        doc = _documentFactory(c_doc, self._doc._parser)
        return _elementTreeFactory(doc, None)
    # NOTE(review): no branch for a tree with neither a context node nor
    # a document is visible in this listing - confirm the fallthrough.
1781 # not in ElementTree, read-only
1783 u"""Information about the document provided by parser and DTD. This
1784 value is only defined for ElementTree objects based on the root node
1785 of a parsed document (e.g. those returned by the parse functions),
1786 not for trees that were built manually.
1789 self._assertHasRoot()
1790 return DocInfo(self._context_node._doc)
1792 # not in ElementTree, read-only
1794 u"""The parser that was used to parse the document in this ElementTree.
1797 if self._context_node is not None and \
1798 self._context_node._doc is not None:
1799 return self._context_node._doc._parser
1800 if self._doc is not None:
1801 return self._doc._parser
1804 def write(self, file, *, encoding=None, method=u"xml",
1805 pretty_print=False, xml_declaration=None, with_tail=True,
1806 standalone=None, docstring=None, compression=0,
1807 exclusive=False, with_comments=True):
1808 u"""write(self, file, encoding=None, method="xml",
1809 pretty_print=False, xml_declaration=None, with_tail=True,
1810 standalone=None, compression=0,
1811 exclusive=False, with_comments=True)
1813 Write the tree to a filename, file or file-like object.
1815 Defaults to ASCII encoding and writing a declaration as needed.
1817 The keyword argument 'method' selects the output method:
1818 'xml', 'html', 'text' or 'c14n'. Default is 'xml'.
1820 The ``exclusive`` and ``with_comments`` arguments are only
1821 used with C14N output, where they request exclusive and
1822 uncommented C14N serialisation respectively.
1824 Passing a boolean value to the ``standalone`` option will
1825 output an XML declaration with the corresponding
1826 ``standalone`` flag.
1828 The ``compression`` option enables GZip compression level 1-9.
1830 cdef bint write_declaration
1831 cdef int is_standalone
1832 self._assertHasRoot()
1833 _assertValidNode(self._context_node)
1834 if compression is None or compression < 0:
1836 # C14N serialisation
1837 if method == 'c14n':
1838 if encoding is not None:
1839 raise ValueError("Cannot specify encoding with C14N")
1841 raise ValueError("Cannot enable XML declaration in C14N")
1842 _tofilelikeC14N(file, self._context_node, exclusive, with_comments,
1845 if not with_comments:
1846 raise ValueError("Can only discard comments in C14N serialisation")
1847 # suppress decl. in default case (purely for ElementTree compatibility)
1848 if xml_declaration is not None:
1849 write_declaration = xml_declaration
1850 if encoding is None:
1853 encoding = encoding.upper()
1854 elif encoding is None:
1856 write_declaration = 0
1858 encoding = encoding.upper()
1859 write_declaration = encoding not in \
1860 (u'US-ASCII', u'ASCII', u'UTF8', u'UTF-8')
1861 if standalone is None:
1864 write_declaration = 1
1867 write_declaration = 1
1869 _tofilelike(file, self._context_node, encoding, docstring, method,
1870 write_declaration, 1, pretty_print, with_tail,
1871 is_standalone, compression)
1873 def getpath(self, _Element element not None):
1874 u"""getpath(self, element)
1876 Returns a structural, absolute XPath expression to find that element.
1882 _assertValidNode(element)
1883 if self._context_node is not None:
1884 root = self._context_node
1886 elif self._doc is not None:
1888 root = doc.getroot()
1890 raise ValueError, u"Element is not in this tree."
1891 _assertValidDoc(doc)
1892 _assertValidNode(root)
1893 if element._doc is not doc:
1894 raise ValueError, u"Element is not in this tree."
1896 c_doc = _fakeRootDoc(doc._c_doc, root._c_node)
1897 c_path = tree.xmlGetNodePath(element._c_node)
1898 _destroyFakeDoc(doc._c_doc, c_doc)
1900 python.PyErr_NoMemory()
1901 path = funicode(c_path)
1902 tree.xmlFree(c_path)
1905 def getiterator(self, tag=None):
1906 u"""getiterator(self, tag=None)
1908 Returns a sequence or iterator of all elements in document order
1909 (depth first pre-order), starting with the root element.
1911 Can be restricted to find only elements with a specific tag
1912 (pass ``tag="xyz"`` or ``tag="{ns}xyz"``) or from a namespace
1913 (pass ``tag="{ns}*"``).
1915 You can also pass the Element, Comment, ProcessingInstruction and
1916 Entity factory functions to look only for the specific element type.
1918 :deprecated: Note that this method is deprecated as of
1919 ElementTree 1.3 and lxml 2.0. It returns an iterator in
1920 lxml, which diverges from the original ElementTree
1921 behaviour. If you want an efficient iterator, use the
1922 ``tree.iter()`` method instead. You should only use this
1923 method in new code if you require backwards compatibility
1924 with older versions of lxml or ElementTree.
1926 root = self.getroot()
1929 return root.getiterator(tag)
1931 def iter(self, tag=None):
1932 u"""iter(self, tag=None)
1934 Creates an iterator for the root element. The iterator loops over
1935 all elements in this tree, in document order.
1937 root = self.getroot()
1940 return root.iter(tag)
1942 def find(self, path, namespaces=None):
1943 u"""find(self, path, namespaces=None)
1945 Finds the first toplevel element with given tag. Same as
1946 ``tree.getroot().find(path)``.
1948 The optional ``namespaces`` argument accepts a
1949 prefix-to-namespace mapping that allows the usage of XPath
1950 prefixes in the path expression.
1952 self._assertHasRoot()
1953 root = self.getroot()
1960 return root.find(path, namespaces)
1962 def findtext(self, path, default=None, namespaces=None):
1963 u"""findtext(self, path, default=None, namespaces=None)
1965 Finds the text for the first element matching the ElementPath
1966 expression. Same as getroot().findtext(path)
1968 The optional ``namespaces`` argument accepts a
1969 prefix-to-namespace mapping that allows the usage of XPath
1970 prefixes in the path expression.
1972 self._assertHasRoot()
1973 root = self.getroot()
1980 return root.findtext(path, default, namespaces)
1982 def findall(self, path, namespaces=None):
1983 u"""findall(self, path, namespaces=None)
1985 Finds all elements matching the ElementPath expression. Same as
1986 getroot().findall(path).
1988 The optional ``namespaces`` argument accepts a
1989 prefix-to-namespace mapping that allows the usage of XPath
1990 prefixes in the path expression.
1992 self._assertHasRoot()
1993 root = self.getroot()
2000 return root.findall(path, namespaces)
2002 def iterfind(self, path, namespaces=None):
2003 u"""iterfind(self, path, namespaces=None)
2005 Iterates over all elements matching the ElementPath expression.
2006 Same as getroot().iterfind(path).
2008 The optional ``namespaces`` argument accepts a
2009 prefix-to-namespace mapping that allows the usage of XPath
2010 prefixes in the path expression.
2012 self._assertHasRoot()
2013 root = self.getroot()
2020 return root.iterfind(path, namespaces)
def xpath(self, _path, *, namespaces=None, extensions=None,
          smart_strings=True, **_variables):
    u"""xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)

    Evaluate an XPath expression in the context of the document.

    ``namespaces`` is an optional dictionary with prefix to namespace
    URI mappings, used by XPath.  ``extensions`` defines additional
    extension functions.

    Returns a list (nodeset), or bool, float or string.  In case of a
    list result, Elements are returned for element nodes and strings
    for text and attribute values.

    Note: if you are going to apply multiple XPath expressions against
    the same document, it is more efficient to use XPathEvaluator
    directly.
    """
    self._assertHasRoot()
    return XPathDocumentEvaluator(
        self, namespaces=namespaces, extensions=extensions,
        smart_strings=smart_strings)(_path, **_variables)
def xslt(self, _xslt, extensions=None, access_control=None, **_kw):
    u"""xslt(self, _xslt, extensions=None, access_control=None, **_kw)

    Transform this document using another document.

    ``_xslt`` is a tree that should be XSLT; the keyword parameters
    are XSLT transformation parameters.

    Returns the transformed tree.

    Note: if you are going to apply the same XSLT stylesheet against
    multiple documents, it is more efficient to use the XSLT class
    directly.
    """
    self._assertHasRoot()
    transform = XSLT(
        _xslt, extensions=extensions, access_control=access_control)
    return transform(self, **_kw)
def relaxng(self, relaxng):
    u"""relaxng(self, relaxng)

    Validate this document using another document.

    ``relaxng`` is a tree that should contain a Relax NG schema.

    Returns True or False, depending on whether validation succeeded.

    Note: if you are going to apply the same Relax NG schema against
    multiple documents, it is more efficient to use the RelaxNG class
    directly.
    """
    self._assertHasRoot()
    return RelaxNG(relaxng).validate(self)
def xmlschema(self, xmlschema):
    u"""xmlschema(self, xmlschema)

    Validate this document using another document.

    ``xmlschema`` is a tree that should contain an XML Schema.

    Returns True or False, depending on whether validation succeeded.

    Note: if you are going to apply the same XML Schema against
    multiple documents, it is more efficient to use the XMLSchema
    class directly.
    """
    self._assertHasRoot()
    return XMLSchema(xmlschema).validate(self)
2105 Process the XInclude nodes in this document and include the
2106 referenced XML fragments.
2108 There is support for loading files through the file system, HTTP and
2111 Note that XInclude does not support custom resolvers in Python space
2112 due to restrictions of libxml2 <= 2.6.29.
2114 self._assertHasRoot()
2115 XInclude()(self._context_node)
2117 def write_c14n(self, file, *, exclusive=False, with_comments=True,
2119 u"""write_c14n(self, file, exclusive=False, with_comments=True,
2122 C14N write of document. Always writes UTF-8.
2124 The ``compression`` option enables GZip compression level 1-9.
2126 self._assertHasRoot()
2127 _assertValidNode(self._context_node)
2128 if compression is None or compression < 0:
2130 _tofilelikeC14N(file, self._context_node, exclusive, with_comments,
cdef _ElementTree _elementTreeFactory(_Document doc, _Element context_node):
    # Convenience wrapper around _newElementTree() that always uses the
    # plain _ElementTree class as base class.
    return _newElementTree(doc, context_node, _ElementTree)
2136 cdef _ElementTree _newElementTree(_Document doc, _Element context_node,
2138 cdef _ElementTree result
2139 result = baseclass()
2140 if context_node is None and doc is not None:
2141 context_node = doc.getroot()
2142 if context_node is None:
2143 _assertValidDoc(doc)
2146 _assertValidNode(context_node)
2147 result._context_node = context_node
2152 u"""A dict-like proxy for the ``Element.attrib`` property.
2154 cdef _Element _element
def __cinit__(self, _Element element not None):
    # Validate the element once up front, then keep a reference to it;
    # all other methods of the proxy operate on self._element.
    _assertValidNode(element)
    self._element = element
def __setitem__(self, key, value):
    # attrib[key] = value: delegates to the attribute setter helper
    _setAttributeValue(self._element, key, value)
def __delitem__(self, key):
    # del attrib[key]: delegates to the attribute removal helper
    _delAttribute(self._element, key)
def update(self, sequence_or_dict):
    u"""update(self, sequence_or_dict)

    Set several attributes at once.

    ``sequence_or_dict`` is either a mapping (anything that provides
    an ``items()`` method, e.g. a dict or another attribute proxy) or
    an iterable of ``(key, value)`` pairs.
    """
    # Only real dicts used to be recognised here; any other mapping was
    # iterated directly, so the pair unpacking below was applied to its
    # keys.  Checking for items() covers all mappings, while plain
    # sequences of pairs take the same path as before.
    if hasattr(sequence_or_dict, u'items'):
        sequence_or_dict = sequence_or_dict.items()
    for key, value in sequence_or_dict:
        _setAttributeValue(self._element, key, value)
2172 def pop(self, key, *default):
2173 if python.PyTuple_GET_SIZE(default) > 1:
2174 raise TypeError, u"pop expected at most 2 arguments, got %d" % (
2175 python.PyTuple_GET_SIZE(default)+1)
2176 result = _getAttributeValue(self._element, key, None)
2178 if python.PyTuple_GET_SIZE(default) == 0:
2181 result = python.PyTuple_GET_ITEM(default, 0)
2182 python.Py_INCREF(result)
2184 _delAttribute(self._element, key)
2188 cdef xmlNode* c_node
2189 c_node = self._element._c_node
2190 while c_node.properties is not NULL:
2191 tree.xmlRemoveProp(c_node.properties)
2195 return repr(dict( _attributeIteratorFactory(self._element, 3) ))
2197 def __getitem__(self, key):
2198 result = _getAttributeValue(self._element, key, None)
2204 def __nonzero__(self):
2205 cdef xmlAttr* c_attr
2206 c_attr = self._element._c_node.properties
2207 while c_attr is not NULL:
2208 if c_attr.type == tree.XML_ATTRIBUTE_NODE:
2210 c_attr = c_attr.next
2214 cdef xmlAttr* c_attr
2217 c_attr = self._element._c_node.properties
2218 while c_attr is not NULL:
2219 if c_attr.type == tree.XML_ATTRIBUTE_NODE:
2221 c_attr = c_attr.next
def get(self, key, default=None):
    # attribute lookup on the wrapped element; ``default`` is passed
    # through to the lookup helper as the fallback
    return _getAttributeValue(self._element, key, default)
2228 return _collectAttributes(self._element._c_node, 1)
2231 return iter(_collectAttributes(self._element._c_node, 1))
2234 return iter(_collectAttributes(self._element._c_node, 1))
2237 return _collectAttributes(self._element._c_node, 2)
def itervalues(self):
    # mode 2 collects the attribute values; the collected result is
    # wrapped in an iterator
    return iter(_collectAttributes(self._element._c_node, 2))
2243 return _collectAttributes(self._element._c_node, 3)
def iteritems(self):
    # mode 3 collects the attributes as items; the collected result is
    # wrapped in an iterator
    return iter(_collectAttributes(self._element._c_node, 3))
2248 def has_key(self, key):
2254 def __contains__(self, key):
2255 cdef xmlNode* c_node
2257 ns, tag = _getNsTag(key)
2258 c_node = self._element._c_node
2259 c_href = NULL if ns is None else _cstr(ns)
2260 if tree.xmlHasNsProp(c_node, _cstr(tag), c_href):
2265 def __richcmp__(one, other, int op):
2266 if not python.PyDict_Check(one):
2268 if not python.PyDict_Check(other):
2270 return python.PyObject_RichCompare(one, other, op)
2273 cdef class _AttribIterator:
2274 u"""Attribute iterator - for internal use only!
2276 # XML attributes must not be removed while running!
2278 cdef xmlAttr* _c_attr
2279 cdef int _keysvalues # 1 - keys, 2 - values, 3 - items (key, value)
2284 cdef xmlAttr* c_attr
2285 if self._node is None:
2287 c_attr = self._c_attr
2288 while c_attr is not NULL and c_attr.type != tree.XML_ATTRIBUTE_NODE:
2289 c_attr = c_attr.next
2294 self._c_attr = c_attr.next
2295 if self._keysvalues == 1:
2296 return _namespacedName(<xmlNode*>c_attr)
2297 elif self._keysvalues == 2:
2298 return _attributeValue(self._node._c_node, c_attr)
2300 return (_namespacedName(<xmlNode*>c_attr),
2301 _attributeValue(self._node._c_node, c_attr))
2303 cdef object _attributeIteratorFactory(_Element element, int keysvalues):
2304 cdef _AttribIterator attribs
2305 if element._c_node.properties is NULL:
2307 attribs = _AttribIterator()
2308 attribs._node = element
2309 attribs._c_attr = element._c_node.properties
2310 attribs._keysvalues = keysvalues
2314 cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher,
2315 type LxmlElementTagMatcherType ]:
2316 cdef object _pystrings
2320 cdef _initTagMatch(self, tag):
2325 elif tag is Comment:
2326 self._node_type = tree.XML_COMMENT_NODE
2327 elif tag is ProcessingInstruction:
2328 self._node_type = tree.XML_PI_NODE
2330 self._node_type = tree.XML_ENTITY_REF_NODE
2331 elif tag is Element:
2332 self._node_type = tree.XML_ELEMENT_NODE
2334 self._node_type = tree.XML_ELEMENT_NODE
2335 self._pystrings = _getNsTag(tag)
2336 if self._pystrings[0] is not None:
2337 self._href = _cstr(self._pystrings[0])
2338 self._name = _cstr(self._pystrings[1])
2339 if self._name[0] == c'*' and self._name[1] == c'\0':
2342 cdef public class _ElementIterator(_ElementTagMatcher) [
2343 object LxmlElementIterator, type LxmlElementIteratorType ]:
2344 # we keep Python references here to control GC
2346 cdef _node_to_node_function _next_element
2350 cdef void _storeNext(self, _Element node):
2351 cdef xmlNode* c_node
2352 c_node = self._next_element(node._c_node)
2353 while c_node is not NULL and \
2354 self._node_type != 0 and \
2355 (self._node_type != c_node.type or
2356 not _tagMatches(c_node, self._href, self._name)):
2357 c_node = self._next_element(c_node)
2362 self._node = _elementFactory(node._doc, c_node)
2365 cdef xmlNode* c_node
2366 cdef _Element current_node
2367 if self._node is None:
2370 current_node = self._node
2371 self._storeNext(current_node)
2374 cdef class ElementChildIterator(_ElementIterator):
2375 u"""ElementChildIterator(self, node, tag=None, reversed=False)
2376 Iterates over the children of an element.
2378 def __cinit__(self, _Element node not None, tag=None, *, reversed=False):
2379 cdef xmlNode* c_node
2380 _assertValidNode(node)
2381 self._initTagMatch(tag)
2383 c_node = _findChildBackwards(node._c_node, 0)
2384 self._next_element = _previousElement
2386 c_node = _findChildForwards(node._c_node, 0)
2387 self._next_element = _nextElement
2389 while c_node is not NULL and \
2390 self._node_type != 0 and \
2391 (self._node_type != c_node.type or
2392 not _tagMatches(c_node, self._href, self._name)):
2393 c_node = self._next_element(c_node)
2394 if c_node is not NULL:
2396 self._node = _elementFactory(node._doc, c_node)
# Iterator over the siblings of `node`.  `tag` initialises the inherited
# tag/type filter; `preceding` is a keyword-only argument.
2398 cdef class SiblingsIterator(_ElementIterator):
2399 u"""SiblingsIterator(self, node, tag=None, preceding=False)
2400 Iterates over the siblings of an element.
2402 You can pass the boolean keyword ``preceding`` to specify the direction.
2404 def __cinit__(self, _Element node not None, tag=None, *, preceding=False):
2405 _assertValidNode(node)
2406 self._initTagMatch(tag)
# The step function is either `_previousElement` or `_nextElement`.
# NOTE(review): presumably selected by `preceding` -- the branch lines are not
# visible here; confirm.
2408 self._next_element = _previousElement
2410 self._next_element = _nextElement
# `_storeNext` steps away from `node` before testing anything, so `node`
# itself is never a result.
2411 self._storeNext(node)
cdef class AncestorsIterator(_ElementIterator):
    u"""AncestorsIterator(self, node, tag=None)
    Iterates over the ancestors of an element (from parent to parent).

    The optional ``tag`` argument sets up the tag/type filter used while
    stepping from one parent to the next.
    """
    def __cinit__(self, _Element node not None, tag=None):
        _assertValidNode(node)
        # every step of this iterator goes through _parentElement
        self._next_element = _parentElement
        self._initTagMatch(tag)
        # pre-fetch the first result; _storeNext() steps away from ``node``
        # before testing, so ``node`` itself is never returned
        self._storeNext(node)
# Depth-first iterator over `node` and its subtree.  It keeps two Python
# references: the element to return next and the top node passed to the
# traversal macros.
2423 cdef class ElementDepthFirstIterator(_ElementTagMatcher):
2424 u"""ElementDepthFirstIterator(self, node, tag=None, inclusive=True)
2425 Iterates over an element and its sub-elements in document order (depth
2428 Note that this also includes comments, entities and processing
2429 instructions. To filter them out, check if the ``tag`` property
2430 of the returned element is a string (i.e. not None and not a
2431 factory function), or pass the ``Element`` factory for the ``tag``
2434 If the optional ``tag`` argument is not None, the iterator returns only
2435 the elements that match the respective name and namespace.
2437 The optional boolean argument 'inclusive' defaults to True and can be set
2438 to False to exclude the start element itself.
2440 Note that the behaviour of this iterator is completely undefined if the
2441 tree it traverses is modified during iteration.
2443 # we keep Python references here to control GC
2444 # keep next node to return and the (s)top node
2445 cdef _Element _next_node
2446 cdef _Element _top_node
2447 def __cinit__(self, _Element node not None, tag=None, *, inclusive=True):
2448 _assertValidNode(node)
2449 self._top_node = node
2450 self._next_node = node
2451 self._initTagMatch(tag)
# The start node is not a valid first result when `inclusive` is false, or
# when a tag was given and the node fails the type/tag filter.
# NOTE(review): the statement executed in that case is not visible here;
# judging by the comment below it presumably advances the iterator once.
2452 if not inclusive or \
2453 tag is not None and \
2454 self._node_type != 0 and \
2455 (self._node_type != node._c_node.type or
2456 not _tagMatches(node._c_node, self._href, self._name)):
2457 # this cannot raise StopIteration, self._next_node != None
# Iteration step: remember the element to hand out, then search for its
# successor starting from its C node.
2464 cdef xmlNode* c_node
2465 cdef _Element current_node
2466 if self._next_node is None:
2468 current_node = self._next_node
2469 c_node = self._next_node._c_node
# Without a name and without a namespace to compare, only the node type needs
# checking; otherwise the tag-matching walker is used.
2470 if self._name is NULL and self._href is NULL:
2471 c_node = self._nextNodeAnyTag(c_node)
2473 c_node = self._nextNodeMatchTag(c_node)
# `_next_node` ends up either None or an element built for the node found.
2475 self._next_node = None
2477 self._next_node = _elementFactory(current_node._doc, c_node)
# _nextNodeAnyTag: traverse onwards from `c_node` with the FOR_EACH_ELEMENT
# macros (which also receive `_top_node`'s C node) and test each visited node
# against `node_type`; a `node_type` of 0 accepts every node.
2480 cdef xmlNode* _nextNodeAnyTag(self, xmlNode* c_node):
2481 cdef int node_type = self._node_type
2482 tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
2483 if node_type == 0 or node_type == c_node.type:
2485 tree.END_FOR_EACH_ELEMENT_FROM(c_node)
# _nextNodeMatchTag: same traversal, but only XML_ELEMENT_NODE nodes that pass
# `_tagMatches` qualify.  When a name is set, it is first looked up with
# `xmlDictExists` in the document's dict; the result is compared by identity
# with `c_node.name` before `_tagMatches` is called.
2488 cdef xmlNode* _nextNodeMatchTag(self, xmlNode* c_node):
2489 cdef char* c_name = NULL
2490 if self._name is not NULL:
2491 c_name = tree.xmlDictExists(c_node.doc.dict, self._name, -1)
2493 # not found in dict => not in document at all
2495 tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
2496 if c_node.type == tree.XML_ELEMENT_NODE:
2497 if (c_name is NULL or c_name is c_node.name) and \
2498 _tagMatches(c_node, self._href, self._name):
2500 tree.END_FOR_EACH_ELEMENT_FROM(c_node)
# Iterator over the text content of a subtree, implemented on top of
# iterwalk(): it stores the walker's bound __next__ and the start element.
2503 cdef class ElementTextIterator:
2504 u"""ElementTextIterator(self, element, tag=None, with_tail=True)
2505 Iterates over the text content of a subtree.
2507 You can pass the ``tag`` keyword argument to restrict text content to a
2510 You can set the ``with_tail`` keyword argument to ``False`` to skip over
2513 cdef object _nextEvent
2514 cdef _Element _start_element
2515 def __cinit__(self, _Element element not None, tag=None, *, with_tail=True):
2516 _assertValidNode(element)
# Two candidate event sets: "start" + "end", or "start" only.
# NOTE(review): presumably chosen by `with_tail` -- the branch lines are not
# visible here; confirm.
2518 events = (u"start", u"end")
2520 events = (u"start",)
2521 self._start_element = element
# `tag` is handed to iterwalk(), so the filtering happens in the walker.
2522 self._nextEvent = iterwalk(element, events=events, tag=tag).__next__
# Iteration step: pull events until one yields a value that is not None.
# A "start" event contributes the element's text; any other event contributes
# the element's tail, except for the start element, whose tail is skipped.
2528 cdef _Element element
2530 while result is None:
2531 event, element = self._nextEvent() # raises StopIteration
2532 if event == u"start":
2533 result = element.text
2534 elif element is not self._start_element:
2535 result = element.tail
# Create a node named `name_utf` in `c_doc` with xmlNewDocNode; the namespace
# and content arguments are passed as NULL.  The function is declared
# `except NULL`, so a NULL return value signals an exception to Cython callers.
2538 cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL:
2539 cdef xmlNode* c_node
2540 c_node = tree.xmlNewDocNode(c_doc, NULL, _cstr(name_utf), NULL)
# Create a comment node with content `text` in `c_doc` (xmlNewDocComment).
# NOTE(review): unlike _createElement, no `except` clause is declared here.
2543 cdef xmlNode* _createComment(xmlDoc* c_doc, char* text):
2544 cdef xmlNode* c_node
2545 c_node = tree.xmlNewDocComment(c_doc, text)
# Create a processing instruction node with the given `target` and `text` in
# `c_doc` (xmlNewDocPI).
# NOTE(review): unlike _createElement, no `except` clause is declared here.
2548 cdef xmlNode* _createPI(xmlDoc* c_doc, char* target, char* text):
2549 cdef xmlNode* c_node
2550 c_node = tree.xmlNewDocPI(c_doc, target, text)
# Create a reference node for the entity `name` in `c_doc` (xmlNewReference).
# NOTE(review): unlike _createElement, no `except` clause is declared here.
2553 cdef xmlNode* _createEntity(xmlDoc* c_doc, char* name):
2554 cdef xmlNode* c_node
2555 c_node = tree.xmlNewReference(c_doc, name)
2558 # module-level API for ElementTree
def Element(_tag, attrib=None, nsmap=None, **_extra):
    u"""Element(_tag, attrib=None, nsmap=None, **_extra)

    Element factory.  Returns a new object implementing the Element
    interface.

    When an Element is needed within a specific document or parser
    context, the `_Element.makeelement()` and
    `_BaseParser.makeelement()` methods are the faster way to create it.
    """
    # no document or other context is supplied: the arguments between
    # ``_tag`` and ``attrib`` are passed as NULL / None
    element = _makeElement(_tag, NULL, None, None, None, None,
                           attrib, nsmap, _extra)
    return element
# Factory for comment elements that live in a document of their own.
2573 def Comment(text=None):
2574 u"""Comment(text=None)
2576 Comment element factory. This factory function creates a special element that will
2577 be serialized as an XML comment.
2580 cdef xmlNode* c_node
# A new XML document is created, the comment node is built from `text` and
# attached as a child of the document node, and the element for that node is
# returned.
# NOTE(review): the preparation of `text` before `_cstr(text)` is not visible
# here.
2586 c_doc = _newXMLDoc()
2587 doc = _documentFactory(c_doc, None)
2588 c_node = _createComment(c_doc, _cstr(text))
2589 tree.xmlAddChild(<xmlNode*>c_doc, c_node)
2590 return _elementFactory(doc, c_node)
# Factory for processing instruction elements that live in a document of
# their own.
2592 def ProcessingInstruction(target, text=None):
2593 u"""ProcessingInstruction(target, text=None)
2595 ProcessingInstruction element factory. This factory function creates a
2596 special element that will be serialized as an XML processing instruction.
2599 cdef xmlNode* c_node
# The target is converted with _utf8() before it is passed on as a C string.
2601 target = _utf8(target)
# A new XML document is created, the PI node is built from `target` and `text`
# and attached as a child of the document node, and the element for that node
# is returned.
# NOTE(review): the preparation of `text` before `_cstr(text)` is not visible
# here.
2606 c_doc = _newXMLDoc()
2607 doc = _documentFactory(c_doc, None)
2608 c_node = _createPI(c_doc, _cstr(target), _cstr(text))
2609 tree.xmlAddChild(<xmlNode*>c_doc, c_node)
2610 return _elementFactory(doc, c_node)
# Short alias for the factory above.
2612 PI = ProcessingInstruction
2617 CDATA factory. This factory creates an opaque data object that
2618 can be used to set Element text. The usual way to use it is::
2620 >>> from lxml import etree
2621 >>> el = etree.Element('content')
2622 >>> el.text = etree.CDATA('a string')
# The only state is the data after conversion with _utf8().
2624 cdef object _utf8_data
2625 def __init__(self, data):
2626 self._utf8_data = _utf8(data)
2631 Entity factory. This factory function creates a special element
2632 that will be serialized as an XML entity reference or character
2633 reference. Note, however, that entities will not be automatically
2634 declared in the document. A document that uses entity references
2635 requires a DTD to define the entities.
2638 cdef xmlNode* c_node
2641 name_utf = _utf8(name)
2642 c_name = _cstr(name_utf)
# Validation: a name starting with '#' is treated as a character reference
# and the part after the '#' must pass _characterReferenceIsValid(); any
# other name must pass _xmlNameIsValid().  Either failure raises ValueError.
2643 if c_name[0] == c'#':
2644 if not _characterReferenceIsValid(c_name + 1):
2645 raise ValueError, u"Invalid character reference: '%s'" % name
2646 elif not _xmlNameIsValid(c_name):
2647 raise ValueError, u"Invalid entity reference: '%s'" % name
# A new XML document is created, the reference node is attached as a child of
# the document node, and the element for that node is returned.
2648 c_doc = _newXMLDoc()
2649 doc = _documentFactory(c_doc, None)
2650 c_node = _createEntity(c_doc, c_name)
2651 tree.xmlAddChild(<xmlNode*>c_doc, c_node)
2652 return _elementFactory(doc, c_node)
def SubElement(_Element _parent not None, _tag,
               attrib=None, nsmap=None, **_extra):
    u"""SubElement(_parent, _tag, attrib=None, nsmap=None, **_extra)

    Subelement factory.  Creates a new element instance and appends it
    to the existing element ``_parent``.
    """
    # the two arguments after ``_tag`` are passed as None
    new_child = _makeSubElement(
        _parent, _tag, None, None, attrib, nsmap, _extra)
    return new_child
# Factory for ElementTree wrappers.  `file` and `parser` are keyword-only.
2663 def ElementTree(_Element element=None, *, file=None, _BaseParser parser=None):
2664 u"""ElementTree(element=None, file=None, parser=None)
2666 ElementTree wrapper class.
2668 cdef xmlNode* c_next
2669 cdef xmlNode* c_node
2670 cdef xmlNode* c_node_copy
2672 cdef _ElementTree etree
# Document selection: an explicit `element` is checked first, then `file`.
# NOTE(review): the body of the `element` branch is not visible here.
2675 if element is not None:
2677 elif file is not None:
# `file` is parsed with `parser`.  If parsing raises _TargetParserResult, the
# result it carries is returned directly instead of an ElementTree.
2679 doc = _parseDocument(file, parser, None)
2680 except _TargetParserResult, result_container:
2681 return result_container.result
# Remaining case: a new XML document is created and bound to `parser`.
2683 c_doc = _newXMLDoc()
2684 doc = _documentFactory(c_doc, parser)
2686 return _elementTreeFactory(doc, element)
# Parse an HTML string and return the document's root node.
2688 def HTML(text, _BaseParser parser=None, *, base_url=None):
2689 u"""HTML(text, parser=None, base_url=None)
2691 Parses an HTML document from a string constant. Returns the root
2692 node (or the result returned by a parser target). This function
2693 can be used to embed "HTML literals" in Python code.
2695 To override the parser with a different ``HTMLParser`` you can pass it to
2696 the ``parser`` keyword argument.
2698 The ``base_url`` keyword argument allows to set the original base URL of
2699 the document to support relative Paths when looking up external entities
2700 (DTD, XInclude, ...).
# Parser fallback: the default parser of the global parser context is looked
# up, and replaced by the module's default HTML parser if it is not an
# HTMLParser.
# NOTE(review): the guard around this lookup is not visible here; presumably
# it only runs when no `parser` was passed -- confirm.
2704 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
2705 if not isinstance(parser, HTMLParser):
2706 parser = __DEFAULT_HTML_PARSER
# If parsing raises _TargetParserResult, the result it carries is returned
# instead of the root node.
2708 doc = _parseMemoryDocument(text, base_url, parser)
2709 return doc.getroot()
2710 except _TargetParserResult, result_container:
2711 return result_container.result
# Parse an XML string and return the document's root node.
2713 def XML(text, _BaseParser parser=None, *, base_url=None):
2714 u"""XML(text, parser=None, base_url=None)
2716 Parses an XML document or fragment from a string constant.
2717 Returns the root node (or the result returned by a parser target).
2718 This function can be used to embed "XML literals" in Python code,
2721 >>> root = etree.XML("<root><test/></root>")
2723 To override the parser with a different ``XMLParser`` you can pass it to
2724 the ``parser`` keyword argument.
2726 The ``base_url`` keyword argument allows to set the original base URL of
2727 the document to support relative Paths when looking up external entities
2728 (DTD, XInclude, ...).
# Parser fallback: the default parser of the global parser context is looked
# up, and replaced by the module's default XML parser if it is not an
# XMLParser.
# NOTE(review): the guard around this lookup is not visible here; presumably
# it only runs when no `parser` was passed -- confirm.
2732 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
2733 if not isinstance(parser, XMLParser):
2734 parser = __DEFAULT_XML_PARSER
# If parsing raises _TargetParserResult, the result it carries is returned
# instead of the root node.
2736 doc = _parseMemoryDocument(text, base_url, parser)
2737 return doc.getroot()
2738 except _TargetParserResult, result_container:
2739 return result_container.result
# Parse an XML string and return the document's root node.  Unlike XML() and
# HTML(), the visible code passes `parser` on to _parseMemoryDocument() as is.
2741 def fromstring(text, _BaseParser parser=None, *, base_url=None):
2742 u"""fromstring(text, parser=None, base_url=None)
2744 Parses an XML document or fragment from a string. Returns the
2745 root node (or the result returned by a parser target).
2747 To override the default parser with a different parser you can pass it to
2748 the ``parser`` keyword argument.
2750 The ``base_url`` keyword argument allows to set the original base URL of
2751 the document to support relative Paths when looking up external entities
2752 (DTD, XInclude, ...).
# If parsing raises _TargetParserResult, the result it carries is returned
# instead of the root node.
2756 doc = _parseMemoryDocument(text, base_url, parser)
2757 return doc.getroot()
2758 except _TargetParserResult, result_container:
2759 return result_container.result
# Parse a document that is supplied as a sequence of strings.
2761 def fromstringlist(strings, _BaseParser parser=None):
2762 u"""fromstringlist(strings, parser=None)
2764 Parses an XML document from a sequence of strings. Returns the
2765 root node (or the result returned by a parser target).
2767 To override the default parser with a different parser you can pass it to
2768 the ``parser`` keyword argument.
# The default parser of the global parser context serves as fallback.
# NOTE(review): the guard around this lookup is not visible here; presumably
# it only runs when no `parser` was passed -- confirm.
2772 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
# The strings are consumed one by one, in order; the function's result is
# whatever parser.close() returns.
# NOTE(review): the loop body is not visible here; presumably each string is
# fed to the parser -- confirm.
2774 for data in strings:
2776 return parser.close()
def iselement(element):
    u"""iselement(element)

    Checks if an object appears to be a valid element object: it must be
    an ``_Element`` instance whose C node pointer is not NULL.
    """
    if not isinstance(element, _Element):
        return False
    # an _Element without a C node does not count as an element
    return (<_Element>element)._c_node is not NULL
def dump(_Element elem not None, *, bint pretty_print=True, bint with_tail=True):
    u"""dump(elem, pretty_print=True, with_tail=True)

    Writes an element tree or element structure to sys.stdout.  Meant as
    a debugging aid only.

    ``pretty_print`` and ``with_tail`` are passed through unchanged to
    the function that does the writing.
    """
    _assertValidNode(elem)
    # sys.stdout is looked up on every call, as in a direct argument use
    stream = sys.stdout
    _dumpToFile(stream, elem._c_node, pretty_print, with_tail)
# Serialise an element or an ElementTree.  All arguments after
# `element_or_tree` are keyword-only.
2794 def tostring(element_or_tree, *, encoding=None, method=u"xml",
2795 xml_declaration=None, bint pretty_print=False, bint with_tail=True,
2796 standalone=None, doctype=None,
2797 bint exclusive=False, bint with_comments=True):
2798 u"""tostring(element_or_tree, encoding=None, method="xml",
2799 xml_declaration=None, pretty_print=False, with_tail=True,
2800 standalone=None, doctype=None,
2801 exclusive=False, with_comments=True)
2803 Serialize an element to an encoded string representation of its XML
2806 Defaults to ASCII encoding without XML declaration. This
2807 behaviour can be configured with the keyword arguments 'encoding'
2808 (string) and 'xml_declaration' (bool). Note that changing the
2809 encoding to a non UTF-8 compatible encoding will enable a
2810 declaration by default.
2812 You can also serialise to a Unicode string without declaration by
2813 passing the ``unicode`` function as encoding (or ``str`` in Py3),
2814 or the name 'unicode'. This changes the return value from a byte
2815 string to an unencoded unicode string.
2817 The keyword argument 'pretty_print' (bool) enables formatted XML.
2819 The keyword argument 'method' selects the output method: 'xml',
2820 'html', plain 'text' (text content without tags) or 'c14n'.
2823 The ``exclusive`` and ``with_comments`` arguments are only used
2824 with C14N output, where they request exclusive and uncommented
2825 C14N serialisation respectively.
2827 Passing a boolean value to the ``standalone`` option will output
2828 an XML declaration with the corresponding ``standalone`` flag.
2830 The ``doctype`` option allows passing in a plain string that will
2831 be serialised before the XML tree. Note that passing in non
2832 well-formed content here will make the XML output non well-formed.
2833 Also, an existing doctype in the document tree will not be removed
2834 when serialising an ElementTree instance.
2836 You can prevent the tail text of the element from being serialised
2837 by passing the boolean ``with_tail`` option. This has no impact
2838 on the tail text of children, which will always be serialised.
2840 cdef bint write_declaration
2841 cdef int is_standalone
2842 # C14N serialisation
# C14N takes neither an encoding nor an XML declaration; both are rejected
# with ValueError before delegating to _tostringC14N().
2843 if method == 'c14n':
2844 if encoding is not None:
2845 raise ValueError("Cannot specify encoding with C14N")
2847 raise ValueError("Cannot enable XML declaration in C14N")
2848 return _tostringC14N(element_or_tree, exclusive, with_comments)
# From here on the method is not C14N, so comments cannot be discarded.
2849 if not with_comments:
2850 raise ValueError("Can only discard comments in C14N serialisation")
# Unicode output is requested either with the `_unicode` object itself or with
# the name 'unicode' in any letter case; no declaration is written for it.
2851 if encoding is _unicode or (encoding is not None and encoding.upper() == 'UNICODE'):
2854 u"Serialisation to unicode must not request an XML declaration"
2855 write_declaration = 0
2857 elif xml_declaration is None:
2858 # by default, write an XML declaration only for non-standard encodings
2859 write_declaration = encoding is not None and encoding.upper() not in \
2860 (u'ASCII', u'UTF-8', u'UTF8', u'US-ASCII')
2862 write_declaration = xml_declaration
2863 if encoding is None:
# Both visible assignments in the `standalone` handling force the XML
# declaration on.
# NOTE(review): the `is_standalone` assignments and the branch headers are
# not visible here.
2865 if standalone is None:
2868 write_declaration = 1
2871 write_declaration = 1
# Elements and ElementTrees go through the same _tostring() call; the visible
# difference is the argument after `write_declaration` (0 for an element,
# 1 for a tree) and that a tree is serialised from its `_context_node`.
2874 if isinstance(element_or_tree, _Element):
2875 return _tostring(<_Element>element_or_tree, encoding, doctype, method,
2876 write_declaration, 0, pretty_print, with_tail,
2878 elif isinstance(element_or_tree, _ElementTree):
2879 return _tostring((<_ElementTree>element_or_tree)._context_node,
2880 encoding, doctype, method, write_declaration, 1,
2881 pretty_print, with_tail, is_standalone)
2883 raise TypeError, u"Type '%s' cannot be serialized." % \
2884 python._fqtypename(element_or_tree)
def tostringlist(element_or_tree, *args, **kwargs):
    u"""tostringlist(element_or_tree, *args, **kwargs)

    ElementTree 1.3 compatibility wrapper around ``tostring()``.

    All arguments are passed on to ``tostring()``; its single result is
    returned as the only item of a new list.
    """
    serialised = tostring(element_or_tree, *args, **kwargs)
    return [serialised]
# Deprecated unicode serialiser (see the docstring); all arguments after
# `element_or_tree` are keyword-only.
2897 def tounicode(element_or_tree, *, method=u"xml", bint pretty_print=False,
2898 bint with_tail=True, doctype=None):
2899 u"""tounicode(element_or_tree, method="xml", pretty_print=False,
2900 with_tail=True, doctype=None)
2902 Serialize an element to the Python unicode representation of its XML
2905 :deprecated: use ``tostring(el, encoding=unicode)`` instead.
2907 Note that the result does not carry an XML encoding declaration and is
2908 therefore not necessarily suited for serialization to byte streams without
2911 The boolean keyword argument 'pretty_print' enables formatted XML.
2913 The keyword argument 'method' selects the output method: 'xml',
2914 'html' or plain 'text'.
2916 You can prevent the tail text of the element from being serialised
2917 by passing the boolean ``with_tail`` option. This has no impact
2918 on the tail text of children, which will always be serialised.
# Both branches call _tostring() with `_unicode` as encoding and 0 in the
# position where tostring() passes `write_declaration`; a tree is serialised
# from its `_context_node`.
2920 if isinstance(element_or_tree, _Element):
2921 return _tostring(<_Element>element_or_tree, _unicode, doctype, method,
2922 0, 0, pretty_print, with_tail, -1)
2923 elif isinstance(element_or_tree, _ElementTree):
2924 return _tostring((<_ElementTree>element_or_tree)._context_node,
2925 _unicode, doctype, method, 0, 1, pretty_print,
# NOTE(review): this message formats type(element_or_tree), whereas tostring()
# formats python._fqtypename(element_or_tree) into the same text, so the two
# error messages differ for the same object -- consider aligning them.
2928 raise TypeError, u"Type '%s' cannot be serialized." % \
2929 type(element_or_tree)
# Parse `source` into a document and return it wrapped in an ElementTree.
2931 def parse(source, _BaseParser parser=None, *, base_url=None):
2932 u"""parse(source, parser=None, base_url=None)
2934 Return an ElementTree object loaded with source elements. If no parser
2935 is provided as second argument, the default parser is used.
2937 The ``source`` can be any of the following:
2941 - a file-like object
2942 - a URL using the HTTP or FTP protocol
2944 To parse from a string, use the ``fromstring()`` function instead.
2946 Note that it is generally faster to parse from a file path or URL
2947 than from an open file object or file-like object. Transparent
2948 decompression from gzip compressed sources is supported (unless
2949 explicitly disabled in libxml2).
2951 The ``base_url`` keyword allows setting a URL for the document
2952 when parsing from a file-like object. This is needed when looking
2953 up external entities (DTD, XInclude, ...) with relative paths.
# The tree is created without an explicit context element (None).  If parsing
# raises _TargetParserResult, the result it carries is returned instead.
2957 doc = _parseDocument(source, parser, base_url)
2958 return _elementTreeFactory(doc, None)
2959 except _TargetParserResult, result_container:
2960 return result_container.result
2963 ################################################################################
2964 # Include submodules
2966 include "readonlytree.pxi" # Read-only implementation of Element proxies
2967 include "classlookup.pxi" # Element class lookup mechanisms
2968 include "nsclasses.pxi" # Namespace implementation and registry
2969 include "docloader.pxi" # Support for custom document loaders
2970 include "parser.pxi" # XML Parser
2971 include "saxparser.pxi" # SAX-like Parser interface and tree builder
2972 include "parsertarget.pxi" # ET Parser target
2973 include "serializer.pxi" # XML output functions
2974 include "iterparse.pxi" # incremental XML parsing
2975 include "xmlid.pxi" # XMLID and IDDict
2976 include "xinclude.pxi" # XInclude
2977 include "cleanup.pxi" # Cleanup and recursive element removal functions
2980 ################################################################################
2981 # Include submodules for XPath and XSLT
2983 include "extensions.pxi" # XPath/XSLT extension functions
2984 include "xpath.pxi" # XPath evaluation
2985 include "xslt.pxi" # XSL transformations
2986 include "xsltext.pxi" # XSL extension elements
2989 ################################################################################
class DocumentInvalid(LxmlError):
    u"""Validation error.

    Raised by document validators from their ``assertValid(tree)``
    method when the document does not comply with the schema.
    """
# Base class for XML validators.  It owns the _ErrorLog used by the methods
# below.
3000 cdef class _Validator:
3001 u"Base class for XML validators."
3002 cdef _ErrorLog _error_log
# Every instance gets its own _ErrorLog object.
3003 def __cinit__(self):
3004 self._error_log = _ErrorLog()
3006 def validate(self, etree):
3007 u"""validate(self, etree)
3009 Validate the document using this schema.
3011 Returns true if document is valid, false if not.
3015 def assertValid(self, etree):
3016 u"""assertValid(self, etree)
3018 Raises `DocumentInvalid` if the document does not comply with the schema.
# The exception message is built by the error log from the text passed in.
# NOTE(review): the condition guarding this raise and the end of the call are
# not visible here.
3021 raise DocumentInvalid(self._error_log._buildExceptionMessage(
3022 u"Document does not comply with schema"),
3025 def assert_(self, etree):
3026 u"""assert_(self, etree)
3028 Raises `AssertionError` if the document does not comply with the schema.
# Same message construction as in assertValid(), raised as AssertionError.
3031 raise AssertionError, self._error_log._buildExceptionMessage(
3032 u"Document does not comply with schema")
# Forwards one log entry to the error log's _receiveGeneric().
3034 cpdef _append_log_message(self, int domain, int type, int level, int line,
3036 self._error_log._receiveGeneric(domain, type, level, line, message,
# Empties the error log through its clear() method.
3039 cpdef _clear_error_log(self):
3040 self._error_log.clear()
3043 u"The log of validation errors and warnings."
# NOTE(review): the assertion message below says "XPath evaluator" although
# this is the validator base class -- looks like a copy/paste slip; confirm.
# The caller receives a copy of the log, not the log object itself.
3045 assert self._error_log is not None, "XPath evaluator not initialised"
3046 return self._error_log.copy()
3048 include "dtd.pxi" # DTD
3049 include "relaxng.pxi" # RelaxNG
3050 include "xmlschema.pxi" # XMLSchema
3051 include "schematron.pxi" # Schematron (requires libxml2 2.6.21+)
3053 ################################################################################
3056 include "public-api.pxi"