1 # Private/public helper functions for API functions
3 cdef void displayNode(xmlNode* c_node, indent):
4 # to help with debugging
# Prints the address of c_node, preceded by `indent` spaces, then recurses
# into every child with the indent increased by one.
6 print indent * u' ', <long>c_node
7 c_child = c_node.children
8 while c_child is not NULL:
9 displayNode(c_child, indent + 1)
10 c_child = c_child.next
12 cdef _Document _documentOrRaise(object input):
13 u"""Call this to get the document of a _Document, _ElementTree or _Element
14 object, or to raise an exception if it can't be determined.
16 Should be used in all API functions for consistency.
# Dispatch on the input type.  For an _ElementTree the context node is
# fetched first; it may be None, hence the check.
20 if isinstance(input, _ElementTree):
21 element = (<_ElementTree>input)._context_node
22 if element is not None:
24 elif isinstance(input, _Element):
25 doc = (<_Element>input)._doc
26 elif isinstance(input, _Document):
27 doc = <_Document>input
# the message reports the fully qualified type name of the rejected input
29 raise TypeError, u"Invalid input object: %s" % \
30 python._fqtypename(input)
# the input did not yield a document
32 raise ValueError, u"Input object has no document: %s" % \
33 python._fqtypename(input)
37 cdef _Element _rootNodeOrRaise(object input):
38 u"""Call this to get the root node of a _Document, _ElementTree or
39 _Element object, or to raise an exception if it can't be determined.
41 Should be used in all API functions for consistency.
# Dispatch on the input type: an _ElementTree contributes its context node,
# an _Element is used directly, a _Document is asked for its root.
44 if isinstance(input, _ElementTree):
45 node = (<_ElementTree>input)._context_node
46 elif isinstance(input, _Element):
47 node = <_Element>input
48 elif isinstance(input, _Document):
49 node = (<_Document>input).getroot()
# the message reports the fully qualified type name of the rejected input
51 raise TypeError, u"Invalid input object: %s" % \
52 python._fqtypename(input)
# the input did not yield an element
54 raise ValueError, u"Input object has no element: %s" % \
55 python._fqtypename(input)
59 cdef _Document _documentOf(object input):
60 # call this to get the document of a
61 # _Document, _ElementTree or _Element object
# Unlike _documentOrRaise(), no exception is raised in the visible branches.
# NOTE(review): the result for unsupported input types is not visible
# here - confirm before relying on it.
64 if isinstance(input, _ElementTree):
65 element = (<_ElementTree>input)._context_node
66 if element is not None:
68 elif isinstance(input, _Element):
69 return (<_Element>input)._doc
70 elif isinstance(input, _Document):
71 return <_Document>input
74 cdef _Element _rootNodeOf(object input):
75 # call this to get the root node of a
76 # _Document, _ElementTree or _Element object
# Unlike _rootNodeOrRaise(), no exception is raised in the visible branches.
# An _ElementTree returns its context node as is, without a None check.
78 if isinstance(input, _ElementTree):
79 return (<_ElementTree>input)._context_node
80 elif isinstance(input, _Element):
81 return <_Element>input
82 elif isinstance(input, _Document):
83 return (<_Document>input).getroot()
87 cdef _Element _makeElement(tag, xmlDoc* c_doc, _Document doc,
88 _BaseParser parser, text, tail, attrib, nsmap,
90 u"""Create a new element and initialize text content, namespaces and
93 This helper function will reuse as much of the existing document as
96 If 'parser' is None, the parser will be inherited from 'doc' or the
97 default parser will be used.
99 If 'doc' is None, 'c_doc' is used to create a new _Document and the new
100 element is made its root node.
102 If 'c_doc' is also NULL, a new xmlDoc will be created.
# split the tag into its namespace and local name
107 ns_utf, name_utf = _getNsTag(tag)
# HTML parsers get the HTML tag name validation, everything else the XML one
108 if parser is not None and parser._for_html:
109 _htmlTagValidOrRaise(name_utf)
111 c_doc = _newHTMLDoc()
113 _tagValidOrRaise(name_utf)
116 c_node = _createElement(c_doc, name_utf)
# a c_doc that is not wrapped by a _Document must be freed by hand before
# the memory error is reported
118 if doc is None and c_doc is not NULL:
119 tree.xmlFreeDoc(c_doc)
120 return python.PyErr_NoMemory()
# make the new node the root of c_doc and wrap c_doc in a _Document
123 tree.xmlDocSetRootElement(c_doc, c_node)
124 doc = _documentFactory(c_doc, parser)
126 _setNodeText(c_node, text)
128 _setTailText(c_node, tail)
129 # add namespaces to node if necessary
130 _initNodeNamespaces(c_node, doc, ns_utf, nsmap)
131 _initNodeAttributes(c_node, doc, attrib, extra_attrs)
132 return _elementFactory(doc, c_node)
134 # free allocated c_node/c_doc unless Python does it for us
135 if c_node.doc is not c_doc:
136 # node not yet in document => will not be freed by document
# the tail text nodes are siblings of c_node, so they are removed separately
138 _removeText(c_node.next) # tail
139 tree.xmlFreeNode(c_node)
141 # c_doc will not be freed by doc
142 tree.xmlFreeDoc(c_doc)
145 cdef int _initNewElement(_Element element, bint is_html, name_utf, ns_utf,
146 _BaseParser parser, attrib, nsmap, extra_attrs) except -1:
147 u"""Initialise a new Element object.
149 This is used when users instantiate a Python Element class
150 directly, without it being mapped to an existing XML node.
# validate the tag name with the HTML or the XML rules
156 _htmlTagValidOrRaise(name_utf)
157 c_doc = _newHTMLDoc()
159 _tagValidOrRaise(name_utf)
161 c_node = _createElement(c_doc, name_utf)
# nothing owns c_doc yet, so it is freed here before reporting the
# memory error
163 if c_doc is not NULL:
164 tree.xmlFreeDoc(c_doc)
165 return python.PyErr_NoMemory()
# make the new node the document root and wrap the document
166 tree.xmlDocSetRootElement(c_doc, c_node)
167 doc = _documentFactory(c_doc, parser)
168 # add namespaces to node if necessary
169 _initNodeNamespaces(c_node, doc, ns_utf, nsmap)
170 _initNodeAttributes(c_node, doc, attrib, extra_attrs)
# bind the existing Python element object to the new C node
171 _registerProxy(element, doc, c_node)
175 cdef _Element _makeSubElement(_Element parent, tag, text, tail,
176 attrib, nsmap, extra_attrs):
177 u"""Create a new child element and initialize text content, namespaces and
# a parent without a document cannot receive children
182 if parent is None or parent._doc is None:
184 ns_utf, name_utf = _getNsTag(tag)
# the child is created in the parent's document
185 c_doc = parent._doc._c_doc
# choose HTML or XML tag name validation based on the document's parser
187 if parent._doc._parser is not None and parent._doc._parser._for_html:
188 _htmlTagValidOrRaise(name_utf)
190 _tagValidOrRaise(name_utf)
192 c_node = _createElement(c_doc, name_utf)
194 return python.PyErr_NoMemory()
195 tree.xmlAddChild(parent._c_node, c_node)
199 _setNodeText(c_node, text)
201 _setTailText(c_node, tail)
203 # add namespaces to node if necessary
204 _initNodeNamespaces(c_node, parent._doc, ns_utf, nsmap)
205 _initNodeAttributes(c_node, parent._doc, attrib, extra_attrs)
206 return _elementFactory(parent._doc, c_node)
208 # make sure we clean up in case of an error
209 _removeNode(parent._doc, c_node)
212 cdef int _initNodeNamespaces(xmlNode* c_node, _Document doc,
213 object node_ns_utf, object nsmap) except -1:
214 u"""Lookup current namespace prefixes, then set namespace structure for
215 node and register new ns-prefix mappings.
217 This only works for a newly created node!
# only the node's own namespace needs to be set on this path
224 if node_ns_utf is not None:
225 doc._setNodeNs(c_node, _cstr(node_ns_utf))
228 nsdefs = list(nsmap.items())
229 if None in nsmap and python.PyList_GET_SIZE(nsdefs) > 1:
230 # Move the default namespace to the end. This makes sure libxml2
231 # prefers a prefix if the ns is defined redundantly on the same
232 # element. That way, users can work around a problem themselves
233 # where default namespace attributes on non-default namespaced
234 # elements serialise without prefix (i.e. into the non-default
236 item = (None, nsmap[None])
# declare each prefix -> href mapping of nsmap on the node
240 for prefix, href in nsdefs:
241 href_utf = _utf8(href)
242 c_href = _cstr(href_utf)
# a None prefix denotes the default namespace and skips prefix validation
243 if prefix is not None:
244 prefix_utf = _utf8(prefix)
245 _prefixValidOrRaise(prefix_utf)
246 c_prefix = _cstr(prefix_utf)
249 # add namespace with prefix if it is not already known
250 c_ns = tree.xmlSearchNs(doc._c_doc, c_node, c_prefix)
252 c_ns.href is NULL or \
253 cstd.strcmp(c_ns.href, c_href) != 0:
254 c_ns = tree.xmlNewNs(c_node, c_href, c_prefix)
# this declaration is the node's own namespace, so assign it right away
255 if href_utf == node_ns_utf:
256 tree.xmlSetNs(c_node, c_ns)
259 if node_ns_utf is not None:
260 doc._setNodeNs(c_node, _cstr(node_ns_utf))
263 cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, extra):
264 u"""Initialise the attributes of an element node.
268 # 'extra' is not checked here (expected to be a keyword dict)
# 'attrib' must provide an items() method; anything else is rejected
269 if attrib is not None and not hasattr(attrib, u'items'):
270 raise TypeError, u"Invalid attribute dictionary: %s" % \
271 python._fqtypename(attrib)
272 if extra is not None and extra:
278 is_html = doc._parser._for_html
279 for name, value in attrib.items():
280 attr_ns_utf, attr_name_utf = _getNsTag(name)
282 _attributeValidOrRaise(attr_name_utf)
283 value_utf = _utf8(value)
# attributes without a namespace are set directly; namespaced ones first
# need an xmlNs, which the document finds or builds for this node
284 if attr_ns_utf is None:
285 tree.xmlNewProp(c_node, _cstr(attr_name_utf), _cstr(value_utf))
287 c_ns = doc._findOrBuildNodeNs(c_node, _cstr(attr_ns_utf), NULL)
288 tree.xmlNewNsProp(c_node, c_ns,
289 _cstr(attr_name_utf), _cstr(value_utf))
# Record type used by _removeUnusedNamespaceDeclarations(): it pairs a
# namespace declaration (.ns) with the node that carries it (.node).
291 ctypedef struct _ns_node_ref:
295 cdef int _removeUnusedNamespaceDeclarations(xmlNode* c_element) except -1:
296 u"""Remove any namespace declarations from a subtree that are not used
297 by any of its elements (or attributes).
299 cdef _ns_node_ref* c_ns_list
300 cdef _ns_node_ref* c_nsref_ptr
303 cdef size_t c_ns_list_size
304 cdef size_t c_ns_list_len
# Phase 1: collect the declarations found on the document node, if the
# start element is a direct child of it.
311 if c_element.parent is not NULL and \
312 c_element.parent.type == tree.XML_DOCUMENT_NODE:
313 # include the document node
314 c_nsdef = c_element.parent.nsDef
315 while c_nsdef is not NULL:
# grow the list when it is full; on allocation failure a memory error is set
316 if c_ns_list_len >= c_ns_list_size:
317 if c_ns_list is NULL:
321 c_nsref_ptr = <_ns_node_ref*> cstd.realloc(
322 c_ns_list, c_ns_list_size * sizeof(_ns_node_ref))
323 if c_nsref_ptr is NULL:
324 if c_ns_list is not NULL:
326 python.PyErr_NoMemory()
328 c_ns_list = c_nsref_ptr
330 c_ns_list[c_ns_list_len].ns = c_nsdef
331 c_ns_list[c_ns_list_len].node = c_element.parent
333 c_nsdef = c_nsdef.next
# Phase 2: walk the subtree; c_element is rebound to each visited node
335 tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_element, c_element, 1)
336 # collect all new namespace declarations into the ns list
337 c_nsdef = c_element.nsDef
338 while c_nsdef is not NULL:
339 if c_ns_list_len >= c_ns_list_size:
340 if c_ns_list is NULL:
344 c_nsref_ptr = <_ns_node_ref*> cstd.realloc(
345 c_ns_list, c_ns_list_size * sizeof(_ns_node_ref))
346 if c_nsref_ptr is NULL:
347 if c_ns_list is not NULL:
349 python.PyErr_NoMemory()
351 c_ns_list = c_nsref_ptr
353 c_ns_list[c_ns_list_len].ns = c_nsdef
354 c_ns_list[c_ns_list_len].node = c_element
356 c_nsdef = c_nsdef.next
358 # remove all namespace declarations from the list that are referenced
359 if c_element.type == tree.XML_ELEMENT_NODE:
361 while c_node is not NULL:
362 if c_node.ns is not NULL:
363 for i from 0 <= i < c_ns_list_len:
364 if c_node.ns is c_ns_list[i].ns:
# drop entry i by overwriting it with the entry at index c_ns_list_len
# and clearing that slot
366 c_ns_list[i].ns = c_ns_list[c_ns_list_len].ns
367 c_ns_list[i].node = c_ns_list[c_ns_list_len].node
368 c_ns_list[c_ns_list_len].ns = NULL
369 c_ns_list[c_ns_list_len].node = NULL
371 if c_node is c_element:
372 # continue with attributes
373 c_node = <xmlNode*>c_element.properties
376 tree.END_FOR_EACH_ELEMENT_FROM(c_element)
378 if c_ns_list is NULL:
# Phase 3: whatever is still listed was never referenced
381 # free all namespace declarations that remained in the list
382 for i from 0 <= i < c_ns_list_len:
383 c_node = c_ns_list[i].node
384 c_nsdef = c_node.nsDef
# unlink the declaration from its node's nsDef chain (head or interior)
385 if c_nsdef is c_ns_list[i].ns:
386 c_node.nsDef = c_node.nsDef.next
388 while c_nsdef.next is not c_ns_list[i].ns:
389 c_nsdef = c_nsdef.next
390 c_nsdef.next = c_nsdef.next.next
391 tree.xmlFreeNs(c_ns_list[i].ns)
393 if c_ns_list is not NULL:
397 cdef int _replaceNodeByChildren(_Document doc, xmlNode* c_node) except -1:
# Splice the children of c_node into the position c_node occupies in its
# parent, then detach c_node from the tree without its children.
398 cdef xmlNode* c_parent
399 cdef xmlNode* c_child
# without children, unlinking the node is all there is to do
400 if c_node.children is NULL:
401 tree.xmlUnlinkNode(c_node)
404 c_parent = c_node.parent
405 # fix parent links of children
406 c_child = c_node.children
407 while c_child is not NULL:
408 c_child.parent = c_parent
409 c_child = c_child.next
411 # fix namespace references of children if their parent's namespace
412 # declarations get lost
413 if c_node.nsDef is not NULL:
414 c_child = c_node.children
415 while c_child is not NULL:
416 moveNodeToDocument(doc, doc._c_doc, c_child)
417 c_child = c_child.next
419 # fix sibling links to/from child slice
420 if c_node.prev is NULL:
421 c_parent.children = c_node.children
423 c_node.prev.next = c_node.children
424 c_node.children.prev = c_node.prev
425 if c_node.next is NULL:
426 c_parent.last = c_node.last
428 c_node.next.prev = c_node.last
429 c_node.last.next = c_node.next
# finally clear all links of the replaced node
432 c_node.children = c_node.last = NULL
433 c_node.parent = c_node.next = c_node.prev = NULL
436 cdef object _attributeValue(xmlNode* c_element, xmlAttr* c_attrib_node):
# Read the attribute value from c_element, looked up by the name and
# namespace of c_attrib_node, and convert it with funicode().
439 c_href = _getNs(<xmlNode*>c_attrib_node)
440 value = tree.xmlGetNsProp(c_element, c_attrib_node.name, c_href)
442 result = funicode(value)
447 cdef object _attributeValueFromNsName(xmlNode* c_element,
448 char* c_href, char* c_name):
# Look up attribute c_name in namespace c_href on c_element and convert
# the value with funicode().
449 cdef char* c_result = tree.xmlGetNsProp(c_element, c_name, c_href)
453 result = funicode(c_result)
# the C string is released with xmlFree() once it has been converted
455 tree.xmlFree(c_result)
458 cdef object _getNodeAttributeValue(xmlNode* c_node, key, default):
# Look up attribute 'key' (split into namespace and name by _getNsTag)
# on c_node.
# NOTE(review): the handling of 'default' is not visible here; presumably
# it is returned when the attribute is missing - confirm.
461 ns, tag = _getNsTag(key)
# a key without a namespace is looked up with a NULL href
462 c_href = NULL if ns is None else _cstr(ns)
463 c_result = tree.xmlGetNsProp(c_node, _cstr(tag), c_href)
465 # XXX free namespace that is not in use..?
468 result = funicode(c_result)
# the C string is released with xmlFree() once it has been converted
470 tree.xmlFree(c_result)
cdef inline object _getAttributeValue(_Element element, key, default):
    # Convenience wrapper: run the attribute lookup on the C node behind
    # 'element', passing 'key' and 'default' through unchanged.
    cdef xmlNode* c_node = element._c_node
    return _getNodeAttributeValue(c_node, key, default)
476 cdef int _setAttributeValue(_Element element, key, value) except -1:
# Set attribute 'key' (split into namespace and name by _getNsTag) on the
# element's C node.
480 ns, tag = _getNsTag(key)
# attribute names are only validated for documents not parsed as HTML
481 if not element._doc._parser._for_html:
482 _attributeValidOrRaise(tag)
# a QName value is first resolved into text via _resolveQNameText()
484 if isinstance(value, QName):
485 value = _resolveQNameText(element, value)
488 c_value = _cstr(value)
492 c_ns = element._doc._findOrBuildNodeNs(element._c_node,
494 tree.xmlSetNsProp(element._c_node, c_ns, c_tag, c_value)
497 cdef int _delAttribute(_Element element, key) except -1:
# Delete attribute 'key' from the element by delegating to
# _delAttributeFromNsName().
499 ns, tag = _getNsTag(key)
# a key without a namespace is looked up with a NULL href
500 c_href = NULL if ns is None else _cstr(ns)
501 if _delAttributeFromNsName(element._c_node, c_href, _cstr(tag)):
505 cdef int _delAttributeFromNsName(xmlNode* c_node, char* c_href, char* c_name):
# Find the attribute node for (c_href, c_name) on c_node and remove it.
507 c_attr = tree.xmlHasNsProp(c_node, c_name, c_href)
509 # XXX free namespace that is not in use..?
511 tree.xmlRemoveProp(c_attr)
514 cdef object _collectAttributes(xmlNode* c_node, int collecttype):
515 u"""Collect all attributes of a node in a list. Depending on collecttype,
516 it collects either the name (1), the value (2) or the name-value tuples.
518 cdef Py_ssize_t count
# first pass over the properties: only XML_ATTRIBUTE_NODE entries are
# considered, and 'count' is then used to size the result list
520 c_attr = c_node.properties
522 while c_attr is not NULL:
523 if c_attr.type == tree.XML_ATTRIBUTE_NODE:
530 attributes = python.PyList_New(count)
# second pass: build one item per attribute and store it in the list
531 c_attr = c_node.properties
533 while c_attr is not NULL:
534 if c_attr.type == tree.XML_ATTRIBUTE_NODE:
536 item = _namespacedName(<xmlNode*>c_attr)
537 elif collecttype == 2:
538 item = _attributeValue(c_node, c_attr)
540 item = (_namespacedName(<xmlNode*>c_attr),
541 _attributeValue(c_node, c_attr))
# PyList_SET_ITEM steals a reference, hence the explicit INCREF
543 python.Py_INCREF(item)
544 python.PyList_SET_ITEM(attributes, count, item)
# Matches the start of an XML declaration up to and including its encoding
# pseudo-attribute.  Group 1 captures everything before the encoding, so
# substituting it back removes the encoding.  The XML grammar allows the
# encoding value to be quoted with either double or single quotes, so both
# forms are accepted.
cdef object __RE_XML_ENCODING
__RE_XML_ENCODING = re.compile(
    ur'^(\s*<\?\s*xml[^>]+)\s+encoding\s*=\s*(?:"[^"]*"|\'[^\']*\')\s*', re.U)

# bound method used by _stripEncodingDeclaration()
cdef object __REPLACE_XML_ENCODING
__REPLACE_XML_ENCODING = __RE_XML_ENCODING.sub

# bound method used by _hasEncodingDeclaration()
cdef object __HAS_XML_ENCODING
__HAS_XML_ENCODING = __RE_XML_ENCODING.match
cdef object _stripEncodingDeclaration(object xml_string):
    # Hack: cut the encoding declaration out of a unicode string by
    # substituting the match with its first group (the part before the
    # encoding).
    stripped = __REPLACE_XML_ENCODING(ur'\g<1>', xml_string)
    return stripped
cdef int _hasEncodingDeclaration(object xml_string):
    # Return 1 if the (unicode) string starts with an XML declaration that
    # carries an encoding, 0 otherwise.
    if __HAS_XML_ENCODING(xml_string) is None:
        return 0
    return 1
567 cdef object _stripDeclaration(object xml_string):
568 # this is a hack to remove the XML declaration when we encode to UTF-8
# surrounding whitespace is stripped before looking for the declaration
569 xml_string = xml_string.strip()
570 if xml_string[:5] == '<?xml':
# locate the end of the declaration
571 i = xml_string.find('?>')
# the loop condition tests for newline, carriage return and space
# characters at position i
574 while xml_string[i:i+1] in '\n\r ':
576 xml_string = xml_string[i:]
cdef inline int _hasText(xmlNode* c_node):
    # A NULL node has no text; otherwise the node has text if its child
    # list starts with a text node (as decided by _textNodeOrSkip).
    if c_node is NULL:
        return 0
    return _textNodeOrSkip(c_node.children) is not NULL
cdef inline int _hasTail(xmlNode* c_node):
    # A NULL node has no tail; otherwise the node has a tail if its next
    # sibling is a text node (as decided by _textNodeOrSkip).
    if c_node is NULL:
        return 0
    return _textNodeOrSkip(c_node.next) is not NULL
585 cdef _collectText(xmlNode* c_node):
586 u"""Collect all text nodes and return them as a unicode string.
588 Start collecting at c_node.
590 If there was no text to collect, return None
592 cdef Py_ssize_t scount
594 cdef xmlNode* c_node_cur
595 # check for multiple text nodes
# c_node itself is moved to the first text node so the slow path below can
# restart from it
598 c_node_cur = c_node = _textNodeOrSkip(c_node)
599 while c_node_cur is not NULL:
# text nodes with empty content are ignored; c_text keeps the content of
# the last non-empty one
600 if c_node_cur.content[0] != c'\0':
601 c_text = c_node_cur.content
603 c_node_cur = _textNodeOrSkip(c_node_cur.next)
605 # handle two most common cases first
608 if python.IS_PYTHON3:
615 return funicode(c_text)
617 # the rest is not performance critical anymore
# concatenate the content of all text nodes, then convert once
619 while c_node is not NULL:
620 result = result + c_node.content
621 c_node = _textNodeOrSkip(c_node.next)
622 return funicode(result)
624 cdef void _removeText(xmlNode* c_node):
625 u"""Remove all text nodes.
627 Start removing at c_node.
630 c_node = _textNodeOrSkip(c_node)
631 while c_node is not NULL:
# the successor is looked up before the current node is unlinked and freed
632 c_next = _textNodeOrSkip(c_node.next)
633 tree.xmlUnlinkNode(c_node)
634 tree.xmlFreeNode(c_node)
637 cdef int _setNodeText(xmlNode* c_node, value) except -1:
# Replace the leading text of c_node by 'value', which is either a string
# or a CDATA object.
638 cdef xmlNode* c_text_node
639 # remove all text nodes at the start first
640 _removeText(c_node.children)
643 # now add new text node with value at start
644 if python._isString(value):
646 c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text))
647 elif isinstance(value, CDATA):
# CDATA blocks are created from the object's UTF-8 data and its byte length
648 c_text_node = tree.xmlNewCDataBlock(
649 c_node.doc, _cstr((<CDATA>value)._utf8_data),
650 python.PyString_GET_SIZE((<CDATA>value)._utf8_data))
652 # this will raise the right error
# insert as only child, or in front of the current first child
655 if c_node.children is NULL:
656 tree.xmlAddChild(c_node, c_text_node)
658 tree.xmlAddPrevSibling(c_node.children, c_text_node)
661 cdef int _setTailText(xmlNode* c_node, value) except -1:
# Replace the tail text of c_node (the text nodes following it) by 'value'.
662 cdef xmlNode* c_text_node
663 # remove all text nodes that follow the node (the old tail) first
664 _removeText(c_node.next)
668 c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text))
669 # XXX what if we're the top element?
670 tree.xmlAddNextSibling(c_node, c_text_node)
673 cdef _resolveQNameText(_Element element, value):
# Turn a qualified name into 'prefix:tag' text, using a namespace that the
# document finds or builds on the element's node.
675 ns, tag = _getNsTag(value)
679 c_ns = element._doc._findOrBuildNodeNs(
680 element._c_node, _cstr(ns), NULL)
681 return python.PyString_FromFormat('%s:%s', c_ns.prefix, _cstr(tag))
cdef inline bint _hasChild(xmlNode* c_node):
    # A NULL node has no children; otherwise check whether a child element
    # exists at index 0.
    if c_node is NULL:
        return False
    return _findChildForwards(c_node, 0) is not NULL
686 cdef inline Py_ssize_t _countElements(xmlNode* c_node):
687 u"Counts the elements within the following siblings and the node itself."
688 cdef Py_ssize_t count
# only nodes accepted by _isElement() are considered
690 while c_node is not NULL:
691 if _isElement(c_node):
696 cdef int _findChildSlice(
697 python.slice sliceobject, xmlNode* c_parent,
698 xmlNode** c_start_node, Py_ssize_t* c_step, Py_ssize_t* c_length) except -1:
699 u"""Resolve a children slice.
701 Returns the start node, step size and the slice length in the
704 cdef Py_ssize_t start, stop, childcount
705 childcount = _countElements(c_parent.children)
# the start node defaults to NULL until a child has been located
707 c_start_node[0] = NULL
709 if sliceobject.step is None:
712 python._PyEval_SliceIndex(sliceobject.step, c_step)
# resolve the slice against the number of child elements
714 python.PySlice_GetIndicesEx(
715 sliceobject, childcount, &start, &stop, c_step, c_length)
# a start index in the second half is searched from the end of the
# child list instead of from the front
716 if start > childcount / 2:
717 c_start_node[0] = _findChildBackwards(c_parent, childcount - start - 1)
719 c_start_node[0] = _findChild(c_parent, start)
722 cdef bint _isFullSlice(python.slice sliceobject):
723 u"""Conservative guess if this slice is a full slice as in ``s[:]``.
726 if sliceobject is None:
# only slices with neither start nor stop are candidates; the step is
# inspected next
728 if sliceobject.start is None and \
729 sliceobject.stop is None:
730 if sliceobject.step is None:
732 python._PyEval_SliceIndex(sliceobject.step, &step)
738 cdef _collectChildren(_Element element):
# Build a list with one element proxy per child element of 'element'.
740 cdef list result = []
741 c_node = element._c_node.children
742 if c_node is not NULL:
# skip ahead if the first child is not an element
743 if not _isElement(c_node):
744 c_node = _nextElement(c_node)
745 while c_node is not NULL:
746 result.append(_elementFactory(element._doc, c_node))
747 c_node = _nextElement(c_node)
750 cdef inline xmlNode* _findChild(xmlNode* c_node, Py_ssize_t index):
# Dispatch to the backward or the forward child search.  The backward call
# receives -index - 1, i.e. the position counted from the end.
752 return _findChildBackwards(c_node, -index - 1)
754 return _findChildForwards(c_node, index)
756 cdef inline xmlNode* _findChildForwards(xmlNode* c_node, Py_ssize_t index):
757 u"""Return child element of c_node with index, or return NULL if not found.
759 cdef xmlNode* c_child
# walk the child list from the first child via the .next links; only
# nodes accepted by _isElement() are considered
761 c_child = c_node.children
763 while c_child is not NULL:
764 if _isElement(c_child):
768 c_child = c_child.next
771 cdef inline xmlNode* _findChildBackwards(xmlNode* c_node, Py_ssize_t index):
772 u"""Return child element of c_node with index, or return NULL if not found.
775 cdef xmlNode* c_child
# walk the child list from the last child via the .prev links; only
# nodes accepted by _isElement() are considered
777 c_child = c_node.last
779 while c_child is not NULL:
780 if _isElement(c_child):
784 c_child = c_child.prev
787 cdef inline xmlNode* _textNodeOrSkip(xmlNode* c_node) nogil:
788 u"""Return the node if it's a text node. Skip over ignorable nodes in a
789 series of text nodes. Return NULL if a non-ignorable node is found.
791 This is used to skip over XInclude nodes when collecting adjacent text
# three node kinds are distinguished: plain text, CDATA sections and the
# XInclude start/end markers
794 while c_node is not NULL:
795 if c_node.type == tree.XML_TEXT_NODE:
797 if c_node.type == tree.XML_CDATA_SECTION_NODE:
799 elif c_node.type == tree.XML_XINCLUDE_START or \
800 c_node.type == tree.XML_XINCLUDE_END:
806 cdef inline xmlNode* _nextElement(xmlNode* c_node):
807 u"""Given a node, find the next sibling that is an element.
# siblings are tested with _isElement() until the list ends
812 while c_node is not NULL:
813 if _isElement(c_node):
818 cdef inline xmlNode* _previousElement(xmlNode* c_node):
819 u"""Given a node, find the previous sibling that is an element.
# siblings are tested with _isElement() until the list ends
824 while c_node is not NULL:
825 if _isElement(c_node):
830 cdef inline xmlNode* _parentElement(xmlNode* c_node):
831 u"Given a node, find the parent element."
# both the node itself and its parent must be non-NULL elements
832 if c_node is NULL or not _isElement(c_node):
834 c_node = c_node.parent
835 if c_node is NULL or not _isElement(c_node):
839 cdef inline bint _tagMatches(xmlNode* c_node, char* c_href, char* c_name):
840 u"""Tests if the node matches namespace URI and tag name.
842 A node matches if it matches both c_href and c_name.
844 A node matches c_href if any of the following is true:
846 * its namespace is NULL and c_href is the empty string
847 * its namespace string equals the c_href string
849 A node matches c_name if any of the following is true:
851 * its name string equals the c_name string
853 cdef char* c_node_href
856 if c_node.type != tree.XML_ELEMENT_NODE:
857 # not an element, only succeed if we match everything
858 return c_name is NULL and c_href is NULL
# namespace-only comparison: a node without namespace matches only the
# empty c_href string
864 c_node_href = _getNs(c_node)
865 if c_node_href is NULL:
866 return c_href[0] == c'\0'
868 return cstd.strcmp(c_node_href, c_href) == 0
870 if _getNs(c_node) is not NULL:
# the pointers are compared first; strcmp() only runs when they differ
872 return c_node.name == c_name or cstd.strcmp(c_node.name, c_name) == 0
# the name matched, so the result now depends on the namespace comparison
873 elif c_node.name == c_name or cstd.strcmp(c_node.name, c_name) == 0:
874 c_node_href = _getNs(c_node)
875 if c_node_href is NULL:
876 return c_href[0] == c'\0'
878 return cstd.strcmp(c_node_href, c_href) == 0
882 cdef int _removeNode(_Document doc, xmlNode* c_node) except -1:
883 u"""Unlink and free a node and subnodes if possible. Otherwise, make sure
# detach the node, then re-attach its tail text behind it so that the
# tail travels with the node
888 tree.xmlUnlinkNode(c_node)
889 _moveTail(c_next, c_node)
# if the node could not be deallocated it stays alive and needs fixing up
890 if not attemptDeallocation(c_node):
891 # make namespaces absolute
892 moveNodeToDocument(doc, c_node.doc, c_node)
895 cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target):
897 # tail support: look for any text nodes trailing this node and
# each trailing text node is unlinked and re-inserted after c_target
899 c_tail = _textNodeOrSkip(c_tail)
900 while c_tail is not NULL:
# the successor is looked up before the current node is unlinked
901 c_next = _textNodeOrSkip(c_tail.next)
902 tree.xmlUnlinkNode(c_tail)
903 tree.xmlAddNextSibling(c_target, c_tail)
907 cdef int _copyTail(xmlNode* c_tail, xmlNode* c_target) except -1:
908 cdef xmlNode* c_new_tail
909 # tail copying support: look for any text nodes trailing this node and
910 # copy it to the target node
911 c_tail = _textNodeOrSkip(c_tail)
912 while c_tail is not NULL:
# copy into the target's document when it differs from the source's
913 if c_target.doc is not c_tail.doc:
914 c_new_tail = tree.xmlDocCopyNode(c_tail, c_target.doc, 0)
916 c_new_tail = tree.xmlCopyNode(c_tail, 0)
917 if c_new_tail is NULL:
918 python.PyErr_NoMemory()
919 tree.xmlAddNextSibling(c_target, c_new_tail)
# later copies are chained behind the previous copy to keep their order
920 c_target = c_new_tail
921 c_tail = _textNodeOrSkip(c_tail.next)
924 cdef int _copyNonElementSiblings(xmlNode* c_node, xmlNode* c_target) except -1:
# Copy the processing instructions and comments that directly surround
# c_node into the document of c_target, placing them around c_target.
926 cdef xmlNode* c_sibling = c_node
# rewind to the first PI/comment of the run that precedes c_node
927 while c_sibling.prev != NULL and \
928 (c_sibling.prev.type == tree.XML_PI_NODE or \
929 c_sibling.prev.type == tree.XML_COMMENT_NODE):
930 c_sibling = c_sibling.prev
# copy the preceding run in document order, each in front of c_target
931 while c_sibling != c_node:
932 c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1)
934 python.PyErr_NoMemory()
935 tree.xmlAddPrevSibling(c_target, c_copy)
936 c_sibling = c_sibling.next
# copy the run of PIs/comments that follows c_node
937 while c_sibling.next != NULL and \
938 (c_sibling.next.type == tree.XML_PI_NODE or \
939 c_sibling.next.type == tree.XML_COMMENT_NODE):
940 c_sibling = c_sibling.next
941 c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1)
943 python.PyErr_NoMemory()
944 tree.xmlAddNextSibling(c_target, c_copy)
946 cdef int _deleteSlice(_Document doc, xmlNode* c_node,
947 Py_ssize_t count, Py_ssize_t step) except -1:
948 u"""Delete slice, ``count`` items starting with ``c_node`` with a step
953 cdef _node_to_node_function next_element
# the stepping function is either the forward or the backward sibling search
957 next_element = _nextElement
960 next_element = _previousElement
961 # now start deleting nodes
964 while c_node is not NULL and c < count:
# advance 'step' elements to find the next victim before removing this one
965 for i from 0 <= i < step:
966 c_next = next_element(c_next)
967 _removeNode(doc, c_node)
972 cdef int _replaceSlice(_Element parent, xmlNode* c_node,
973 Py_ssize_t slicelength, Py_ssize_t step,
974 bint left_to_right, elements) except -1:
975 u"""Replace the slice of ``slicelength`` elements starting at ``c_node`` with
976 positive step width ``step`` by the Elements in ``elements``. The
977 direction is given by the boolean argument ``left_to_right``.
979 ``c_node`` may be NULL to indicate the end of the children list.
981 cdef xmlNode* c_orig_neighbour
983 cdef xmlDoc* c_source_doc
984 cdef _Element element
985 cdef Py_ssize_t seqlength, i, c
986 cdef _node_to_node_function next_element
# the stepping function is either the forward or the backward sibling search
989 next_element = _nextElement
991 next_element = _previousElement
# anything that is not a list or tuple is materialised as a list
993 if not python.PyList_Check(elements) and \
994 not python.PyTuple_Check(elements):
995 elements = list(elements)
998 # *replacing* children stepwise with list => check size!
999 seqlength = len(elements)
1000 if seqlength != slicelength:
1001 raise ValueError, u"attempt to assign sequence of size %d " \
1002 u"to extended slice of size %d" % (seqlength, slicelength)
1005 # no children yet => add all elements straight away
1007 for element in elements:
1008 assert element is not None, u"Node must not be None"
1009 _appendChild(parent, element)
1011 for element in elements:
1012 assert element is not None, u"Node must not be None"
1013 _prependChild(parent, element)
1016 # remove the elements first as some might be re-added
1018 # L->R, remember left neighbour
1019 c_orig_neighbour = _previousElement(c_node)
1021 # R->L, remember right neighbour
1022 c_orig_neighbour = _nextElement(c_node)
1024 # We remove the original slice elements one by one. Since we hold
1025 # a Python reference to all elements that we will insert, it is
1026 # safe to let _removeNode() try (and fail) to free them even if
1027 # the element itself or one of its descendants will be reinserted.
1030 while c_node is not NULL and c < slicelength:
1031 for i from 0 <= i < step:
1032 c_next = next_element(c_next)
1033 _removeNode(parent._doc, c_node)
1037 # make sure each element is inserted only once
1038 elements = iter(elements)
1040 # find the first node right of the new insertion point
1042 if c_orig_neighbour is not NULL:
1043 c_node = next_element(c_orig_neighbour)
1045 # before the first element
1046 c_node = _findChildForwards(parent._c_node, 0)
1047 elif c_orig_neighbour is NULL:
1048 # at the end, but reversed stepping
1049 # append one element and go to the next insertion point
1050 for element in elements:
1051 assert element is not None, u"Node must not be None"
1052 _appendChild(parent, element)
1053 c_node = element._c_node
1055 slicelength = slicelength - 1
1056 for i from 1 <= i < step:
1057 c_node = next_element(c_node)
1061 # adjust step size after removing slice as we are not stepping
1062 # over the newly inserted elements
1065 # now insert elements where we removed them
1066 if c_node is not NULL:
1067 for element in elements:
1068 assert element is not None, u"Node must not be None"
1069 # move element and tail over
# source document and tail start are read before the node is relinked
1070 c_source_doc = element._c_node.doc
1071 c_next = element._c_node.next
1072 tree.xmlAddPrevSibling(c_node, element._c_node)
1073 _moveTail(c_next, element._c_node)
1075 # integrate element into new document
1076 moveNodeToDocument(parent._doc, c_source_doc, element._c_node)
1078 # stop at the end of the slice
1080 slicelength = slicelength - 1
1081 for i from 0 <= i < step:
1082 c_node = next_element(c_node)
1086 # everything inserted
1089 # append the remaining elements at the respective end
1091 for element in elements:
1092 assert element is not None, u"Node must not be None"
1093 _appendChild(parent, element)
1095 for element in elements:
1096 assert element is not None, u"Node must not be None"
1097 _prependChild(parent, element)
1101 cdef int _appendChild(_Element parent, _Element child) except -1:
1102 u"""Append a new child to a parent element.
1104 cdef xmlNode* c_next
1105 cdef xmlNode* c_node
1106 cdef xmlDoc* c_source_doc
1107 c_node = child._c_node
# remember the source document before the node is moved
1108 c_source_doc = c_node.doc
1109 # store possible text node
1110 c_next = c_node.next
# detach the node from its old position, add it as a child of parent and
# let its tail text follow
1112 tree.xmlUnlinkNode(c_node)
1113 tree.xmlAddChild(parent._c_node, c_node)
1114 _moveTail(c_next, c_node)
1115 # uh oh, elements may be pointing to different doc when
1116 # parent element has moved; change them too..
1117 moveNodeToDocument(parent._doc, c_source_doc, c_node)
1119 cdef int _prependChild(_Element parent, _Element child) except -1:
1120 u"""Prepend a new child to a parent element.
1122 cdef xmlNode* c_next
1123 cdef xmlNode* c_child
1124 cdef xmlNode* c_node
1125 cdef xmlDoc* c_source_doc
1126 c_node = child._c_node
# remember the source document before the node is moved
1127 c_source_doc = c_node.doc
1128 # store possible text node
1129 c_next = c_node.next
# the current first child element, if any, is the insertion anchor
1131 c_child = _findChildForwards(parent._c_node, 0)
1133 tree.xmlUnlinkNode(c_node)
1134 tree.xmlAddChild(parent._c_node, c_node)
1136 tree.xmlAddPrevSibling(c_child, c_node)
1137 _moveTail(c_next, c_node)
1138 # uh oh, elements may be pointing to different doc when
1139 # parent element has moved; change them too..
1140 moveNodeToDocument(parent._doc, c_source_doc, c_node)
1142 cdef int _appendSibling(_Element element, _Element sibling) except -1:
1143 u"""Add 'sibling' as the next sibling of 'element'.
1145 cdef xmlNode* c_next
1146 cdef xmlNode* c_node
1147 cdef xmlDoc* c_source_doc
1148 c_node = sibling._c_node
# remember the source document before the node is moved
1149 c_source_doc = c_node.doc
1150 # store possible text node
1151 c_next = c_node.next
# insert the node directly after 'element' and let its tail text follow
1153 tree.xmlAddNextSibling(element._c_node, c_node)
1154 _moveTail(c_next, c_node)
1155 # uh oh, elements may be pointing to different doc when
1156 # parent element has moved; change them too..
1157 moveNodeToDocument(element._doc, c_source_doc, c_node)
1159 cdef int _prependSibling(_Element element, _Element sibling) except -1:
1160 u"""Add 'sibling' as the previous sibling of 'element'.
1162 cdef xmlNode* c_next
1163 cdef xmlNode* c_node
1164 cdef xmlDoc* c_source_doc
1165 c_node = sibling._c_node
# remember the source document before the node is moved
1166 c_source_doc = c_node.doc
1167 # store possible text node
1168 c_next = c_node.next
# insert the node directly before 'element' and let its tail text follow
1170 tree.xmlAddPrevSibling(element._c_node, c_node)
1171 _moveTail(c_next, c_node)
1172 # uh oh, elements may be pointing to different doc when
1173 # parent element has moved; change them too..
1174 moveNodeToDocument(element._doc, c_source_doc, c_node)
1176 cdef inline int isutf8(char* s):
1186 cdef int check_string_utf8(pystring):
1187 u"""Check if a string looks like valid UTF-8 XML content. Returns 0
1188 for ASCII, 1 for UTF-8 and -1 in the case of errors, such as NULL
1189 bytes or ASCII control characters.
1194 cdef bint is_non_ascii
# c_end points just past the last byte of the string
1196 c_end = s + python.PyString_GET_SIZE(pystring)
1200 # skip the entire multi byte sequence
# bytes rejected by libxml2's xmlIsChar_ch() make the string invalid
1204 elif not tree.xmlIsChar_ch(s[0]):
1205 return -1 # invalid!
1209 cdef object funicode(char* s):
# Convert a NUL-terminated C string into a Python string object.
1210 cdef Py_ssize_t slen
1212 cdef bint is_non_ascii
# under Python 3 the string is always decoded as UTF-8
1213 if python.IS_PYTHON3:
1214 slen = cstd.strlen(s)
1215 return python.PyUnicode_DecodeUTF8(s, slen, NULL)
# otherwise the string is scanned up to its terminating NUL byte first
1218 while spos[0] != c'\0':
1223 while spos[0] != c'\0':
# two possible results: a unicode object decoded from UTF-8, or a byte
# string built from the same buffer
1227 return python.PyUnicode_DecodeUTF8(s, slen, NULL)
1228 return python.PyString_FromStringAndSize(s, slen)
1230 cdef object _utf8(object s):
# Validate a byte or unicode string for use as XML content; unicode input
# is encoded to UTF-8 bytes first.
# byte strings: the raw check result is used, so any non-zero value
# (UTF-8 as well as errors) marks the input as invalid
1232 if python.PyString_CheckExact(s):
1233 invalid = check_string_utf8(s)
# unicode strings: encode to UTF-8; only the error result (-1) is invalid
1234 elif python.PyUnicode_CheckExact(s) or python.PyUnicode_Check(s):
1235 s = python.PyUnicode_AsUTF8String(s)
1236 invalid = check_string_utf8(s) == -1
# byte string subclasses are treated like exact byte strings
1237 elif python.PyString_Check(s):
1238 invalid = check_string_utf8(s)
1240 raise TypeError, u"Argument must be string or unicode."
1243 u"All strings must be XML compatible: Unicode or ASCII, no NULL bytes"
# Heuristic over a NUL-terminated C string, applying three tests in order:
# leading '/', drive letter followed by ':', then a character scan for the
# relative-path case.  The result returned by each test is on lines not
# shown in this listing.
1246 cdef bint _isFilePath(char* c_path):
1247 u"simple heuristic to see if a path is a filename"
1249 # test if it looks like an absolute Unix path or a Windows network path
1250 if c_path[0] == c'/':
1252 # test if it looks like an absolute Windows path
# c_path[1] is only read after c_path[0] matched an ASCII letter, so the
# read never goes past the terminator of an empty string
1253 if (c_path[0] >= c'a' and c_path[0] <= c'z') or \
1254 (c_path[0] >= c'A' and c_path[0] <= c'Z'):
1255 if c_path[1] == c':':
1257 # test if it looks like a relative path
1258 while c_path[0] != c'\0':
# Return an 8-bit encoded form of ``filename``.  None and byte strings have
# their own branches (bodies not shown here).  Unicode input is first
# encoded as UTF-8; if that looks like a file path (_isFilePath), it is
# encoded with _C_FILENAME_ENCODING instead.  Any other type raises
# TypeError.
1269 cdef object _encodeFilename(object filename):
1270 u"""Make sure a filename is 8-bit encoded (or None).
1272 if filename is None:
1274 elif python.PyString_Check(filename):
1276 elif python.PyUnicode_Check(filename):
1277 filename8 = python.PyUnicode_AsEncodedString(
1278 filename, 'UTF-8', NULL)
1279 if _isFilePath(filename8):
1281 return python.PyUnicode_AsEncodedString(
1282 filename, _C_FILENAME_ENCODING, NULL)
# handler body is not shown; presumably the UTF-8 bytes in filename8 are
# used when the filesystem encoding cannot represent the name -- TODO confirm
1283 except UnicodeEncodeError:
1287 raise TypeError, u"Argument must be string or unicode."
# Decode a NUL-terminated C path into a unicode object using a cascade:
#   1. _C_FILENAME_ENCODING, only when the string looks like a file path;
#   2. UTF-8;
#   3. Latin-1 with the 'replace' error handler as last resort.
# NOTE(review): the docstring says "if we are in Py3", but no Python version
# check appears in the lines visible here, and every visible return
# produces a decoded unicode object -- confirm against the full source.
1289 cdef object _decodeFilename(char* c_path):
1290 u"""Make the filename a unicode string if we are in Py3.
1292 cdef Py_ssize_t c_len = cstd.strlen(c_path)
1293 if _isFilePath(c_path):
1295 return python.PyUnicode_Decode(
1296 c_path, c_len, _C_FILENAME_ENCODING, NULL)
1297 except UnicodeDecodeError:
1300 return python.PyUnicode_DecodeUTF8(c_path, c_len, NULL)
1301 except UnicodeDecodeError:
1302 # this is a stupid fallback, but it might still work...
1303 return python.PyUnicode_DecodeLatin1(c_path, c_len, 'replace')
# Recode ``filename`` to UTF-8 bytes.  Byte strings are decoded to unicode
# first (filesystem encoding, then UTF-8); unicode is then encoded with
# PyUnicode_AsUTF8String.  Any other type raises TypeError.  The None
# branch and the plain-ASCII branch have bodies not shown in this listing.
1305 cdef object _encodeFilenameUTF8(object filename):
1306 u"""Recode filename as UTF-8. Tries ASCII, local filesystem encoding and
1307 UTF-8 as source encoding.
1309 cdef char* c_filename
1310 if filename is None:
1312 elif python.PyString_Check(filename):
# check_string_utf8() documents 0 as "ASCII", so this is the pure-ASCII case
1313 if not check_string_utf8(filename):
1316 c_filename = _cstr(filename)
1318 # try to decode with default encoding
1319 filename = python.PyUnicode_Decode(
1320 c_filename, python.PyString_GET_SIZE(filename),
1321 _C_FILENAME_ENCODING, NULL)
# the failed decode above never rebound ``filename``, so it is still the
# original byte string when its size is read again below
1322 except UnicodeDecodeError, decode_exc:
1325 filename = python.PyUnicode_DecodeUTF8(
1326 c_filename, python.PyString_GET_SIZE(filename), NULL)
1327 except UnicodeDecodeError:
1328 raise decode_exc # otherwise re-raise original exception
1329 if python.PyUnicode_Check(filename):
1330 return python.PyUnicode_AsUTF8String(filename)
1332 raise TypeError, u"Argument must be string or unicode."
# Split a tag written as "{namespace-uri}name" into its namespace and local
# name parts.  QName instances are unwrapped to their ``text`` first.
# Raises ValueError("Invalid tag name") when an opening '{' has no closing
# '}', and ValueError("Empty tag name") for an empty tag.  The conversion
# of ``tag`` to the C pointer c_tag and the final return are on lines not
# shown in this listing.
1334 cdef _getNsTag(tag):
1335 u"""Given a tag, find namespace URI and tag name.
1336 Return None for NS uri if no namespace URI available.
1340 cdef Py_ssize_t taglen
1341 cdef Py_ssize_t nslen
1342 # _isString() is much faster than isinstance()
1343 if not _isString(tag) and isinstance(tag, QName):
1344 tag = (<QName>tag).text
1347 if c_tag[0] == c'{':
1349 c_ns_end = cstd.strchr(c_tag, c'}')
1350 if c_ns_end is NULL:
1351 raise ValueError, u"Invalid tag name"
1352 nslen = c_ns_end - c_tag
# the "- 2" accounts for the two braces; this only adds up if c_tag was
# advanced past '{' on a line not shown here -- presumably it was; confirm
1353 taglen = python.PyString_GET_SIZE(tag) - nslen - 2
1355 raise ValueError, u"Empty tag name"
1357 ns = python.PyString_FromStringAndSize(c_tag, nslen)
1358 tag = python.PyString_FromStringAndSize(c_ns_end+1, taglen)
1359 elif python.PyString_GET_SIZE(tag) == 0:
1360 raise ValueError, u"Empty tag name"
# Python-object front end for _xmlNameIsValid(): passes _cstr(name_utf8) to
# the C-level check and returns its result unchanged.
# presumably name_utf8 is a UTF-8 encoded byte string -- verify at callers
1363 cdef inline int _pyXmlNameIsValid(name_utf8):
1364 return _xmlNameIsValid(_cstr(name_utf8))
# Python-object front end for _htmlNameIsValid(): passes _cstr(name_utf8) to
# the C-level check and returns its result unchanged.
# presumably name_utf8 is a UTF-8 encoded byte string -- verify at callers
1366 cdef inline int _pyHtmlNameIsValid(name_utf8):
1367 return _htmlNameIsValid(_cstr(name_utf8))
# Returns true (1) exactly when tree.xmlValidateNCName(c_name, 0) yields 0.
# NOTE(review): per the libxml2 API, 0 means "valid NCName" and the second
# argument (0) disallows surrounding blanks -- confirm against the libxml2
# reference for the version in use.
1369 cdef inline int _xmlNameIsValid(char* c_name):
1370 return tree.xmlValidateNCName(c_name, 0) == 0
# Character-level check of an HTML name.  NULL and empty strings are
# handled up front; then each character up to the terminating NUL is tested
# against a fixed set of markup characters (& < > / " ') and whitespace
# (tab, LF, VT, FF, CR, space).  The values returned in each case are on
# lines not shown in this listing.
1372 cdef int _htmlNameIsValid(char* c_name):
1374 if c_name is NULL or c_name[0] == c'\0':
1376 while c_name[0] != c'\0':
1378 if c in (c'&', c'<', c'>', c'/', c'"', c"'",
1379 c'\t', c'\n', c'\x0B', c'\x0C', c'\r', c' '):
# Validate the numeric part of a character reference given as a C string.
# A leading 'x' is special-cased (presumably selecting hexadecimal mode and
# skipping the marker -- those lines are not shown), an empty remainder is
# checked for explicitly, and every remaining character is tested against
# '0'-'9' and then 'a'-'f' / 'A'-'F'.  Return statements are not visible in
# this listing.
1384 cdef bint _characterReferenceIsValid(char* c_name):
1386 if c_name[0] == c'x':
1391 if c_name[0] == c'\0':
1393 while c_name[0] != c'\0':
# not a decimal digit: fall through to the hex-letter tests below
1394 if c_name[0] < c'0' or c_name[0] > c'9':
1397 if not (c_name[0] >= c'a' and c_name[0] <= c'f'):
1398 if not (c_name[0] >= c'A' and c_name[0] <= c'F'):
# Raise ValueError("Invalid tag name ...") when ``tag_utf`` fails
# _pyXmlNameIsValid(); the name is decoded from UTF-8 for the message.
# Declared ``except -1``; the success return is on a line not shown here.
1403 cdef int _tagValidOrRaise(tag_utf) except -1:
1404 if not _pyXmlNameIsValid(tag_utf):
1405 raise ValueError, u"Invalid tag name %r" % \
1406 python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', NULL)
# Raise ValueError("Invalid HTML tag name ...") when ``tag_utf`` fails
# _pyHtmlNameIsValid(); the name is decoded from UTF-8 for the message.
# Declared ``except -1``; the success return is on a line not shown here.
1409 cdef int _htmlTagValidOrRaise(tag_utf) except -1:
1410 if not _pyHtmlNameIsValid(tag_utf):
1411 raise ValueError, u"Invalid HTML tag name %r" % \
1412 python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', NULL)
# Raise ValueError("Invalid attribute name ...") when ``name_utf`` fails
# _pyXmlNameIsValid(); the name is decoded from UTF-8 for the message.
# Declared ``except -1``; the success return is on a line not shown here.
1415 cdef int _attributeValidOrRaise(name_utf) except -1:
1416 if not _pyXmlNameIsValid(name_utf):
1417 raise ValueError, u"Invalid attribute name %r" % \
1418 python.PyUnicode_FromEncodedObject(name_utf, 'UTF-8', NULL)
# Raise ValueError("Invalid namespace prefix ...") when the prefix fails
# _pyXmlNameIsValid(); the value is decoded from UTF-8 for the message.
# The parameter is named ``tag_utf`` although it carries a prefix.
# Declared ``except -1``; the success return is on a line not shown here.
1421 cdef int _prefixValidOrRaise(tag_utf) except -1:
1422 if not _pyXmlNameIsValid(tag_utf):
1423 raise ValueError, u"Invalid namespace prefix %r" % \
1424 python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', NULL)
# Build the Python-level name of a C node by handing _getNs(c_node) and
# c_node.name to _namespacedNameFromNsName().
1427 cdef inline object _namespacedName(xmlNode* c_node):
1428 return _namespacedNameFromNsName(_getNs(c_node), c_node.name)
# Combine a namespace URI and a local name into "{href}name".
#   * First branch (its condition is not shown; presumably ``href is NULL``
#     -- TODO confirm): returns the bare name via funicode().
#   * python.IS_PYTHON3: formats straight into a unicode object.
#   * Otherwise: formats a byte string and decodes it as UTF-8 only when
#     isutf8() is true for either part; the remaining return is on a line
#     not shown in this listing.
1430 cdef object _namespacedNameFromNsName(char* href, char* name):
1432 return funicode(name)
1433 elif python.IS_PYTHON3:
1434 return python.PyUnicode_FromFormat("{%s}%s", href, name)
1436 s = python.PyString_FromFormat("{%s}%s", href, name)
1437 if isutf8(href) or isutf8(name):
1438 return python.PyUnicode_FromEncodedObject(s, 'UTF-8', NULL)
1442 cdef _getFilenameForFile(source):
1443 u"""Given a Python File or Gzip object, give filename back.
1445 Returns None if not a file object.
1447 # file instances have a name attribute
1448 filename = getattr3(source, u'name', None)
1449 if filename is not None:
1450 return os_path_abspath(filename)
1451 # urllib2 provides a geturl() method
1452 geturl = getattr3(source, u'geturl', None)
1453 if geturl is not None:
1455 # gzip file instances have a filename attribute (before Py3k)
1456 filename = getattr3(source, u'filename', None)
1457 if filename is not None:
1458 return os_path_abspath(filename)
1459 # can't determine filename