src/lxml/apihelpers.pxi

   1 # Private/public helper functions for API functions
   2
   3 cdef void displayNode(xmlNode* c_node, indent):
   4     # to help with debugging
   5     cdef xmlNode* c_child
   6     print indent * u' ', <long>c_node
   7     c_child = c_node.children
   8     while c_child is not NULL:
   9         displayNode(c_child, indent + 1)
  10         c_child = c_child.next
  11
  12 cdef _Document _documentOrRaise(object input):
  13     u"""Call this to get the document of a _Document, _ElementTree or _Element
  14     object, or to raise an exception if it can't be determined.
  15
  16     Should be used in all API functions for consistency.
  17     """
  18     cdef _Document doc
  19     cdef _Element element
  20     if isinstance(input, _ElementTree):
  21         element = (<_ElementTree>input)._context_node
  22         if element is not None:
  23             doc = element._doc
  24     elif isinstance(input, _Element):
  25         doc = (<_Element>input)._doc
  26     elif isinstance(input, _Document):
  27         doc = <_Document>input
  28     else:
  29         raise TypeError, u"Invalid input object: %s" % \
  30             python._fqtypename(input)
  31     if doc is None:
  32         raise ValueError, u"Input object has no document: %s" % \
  33             python._fqtypename(input)
  34     else:
  35         return doc
  36
  37 cdef _Element _rootNodeOrRaise(object input):
  38     u"""Call this to get the root node of a _Document, _ElementTree or
  39      _Element object, or to raise an exception if it can't be determined.
  40
  41     Should be used in all API functions for consistency.
  42      """
  43     cdef _Element node
  44     if isinstance(input, _ElementTree):
  45         node = (<_ElementTree>input)._context_node
  46     elif isinstance(input, _Element):
  47         node = <_Element>input
  48     elif isinstance(input, _Document):
  49         node = (<_Document>input).getroot()
  50     else:
  51         raise TypeError, u"Invalid input object: %s" % \
  52             python._fqtypename(input)
  53     if node is None:
  54         raise ValueError, u"Input object has no element: %s" % \
  55             python._fqtypename(input)
  56     else:
  57         return node
  58
  59 cdef _Document _documentOf(object input):
  60     # call this to get the document of a
  61     # _Document, _ElementTree or _Element object
  62     # may return None!
  63     cdef _Element element
  64     if isinstance(input, _ElementTree):
  65         element = (<_ElementTree>input)._context_node
  66         if element is not None:
  67             return element._doc
  68     elif isinstance(input, _Element):
  69         return (<_Element>input)._doc
  70     elif isinstance(input, _Document):
  71         return <_Document>input
  72     return None
  73
  74 cdef _Element _rootNodeOf(object input):
  75     # call this to get the root node of a
  76     # _Document, _ElementTree or _Element object
  77     # may return None!
  78     if isinstance(input, _ElementTree):
  79         return (<_ElementTree>input)._context_node
  80     elif isinstance(input, _Element):
  81         return <_Element>input
  82     elif isinstance(input, _Document):
  83         return (<_Document>input).getroot()
  84     else:
  85         return None
  86
cdef _Element _makeElement(tag, xmlDoc* c_doc, _Document doc,
                           _BaseParser parser, text, tail, attrib, nsmap,
                           extra_attrs):
    u"""Create a new element and initialize text content, namespaces and
    attributes.

    This helper function will reuse as much of the existing document as
    possible:

    If 'parser' is None, the parser will be inherited from 'doc' or the
    default parser will be used.

    If 'doc' is None, 'c_doc' is used to create a new _Document and the new
    element is made its root node.

    If 'c_doc' is also NULL, a new xmlDoc will be created.

    The tag name is validated with the HTML rules if 'parser' is an HTML
    parser (parser._for_html) and with the XML rules otherwise.  'text' and
    'tail' are only set if they are not None.  On any error during the
    initialisation, the C structures that are not owned by a Python object
    are freed before the exception is re-raised.
    """
    cdef xmlNode* c_node
    if doc is not None:
        # an existing _Document takes precedence over the 'c_doc' argument
        c_doc = doc._c_doc
    ns_utf, name_utf = _getNsTag(tag)
    if parser is not None and parser._for_html:
        _htmlTagValidOrRaise(name_utf)
        if c_doc is NULL:
            c_doc = _newHTMLDoc()
    else:
        _tagValidOrRaise(name_utf)
        if c_doc is NULL:
            c_doc = _newXMLDoc()
    c_node = _createElement(c_doc, name_utf)
    if c_node is NULL:
        # no node could be created; c_doc is only freed here if no
        # _Document was passed in that refers to it
        if doc is None and c_doc is not NULL:
            tree.xmlFreeDoc(c_doc)
        return python.PyErr_NoMemory()
    try:
        if doc is None:
            # no document given: the new node becomes the root of c_doc,
            # which gets wrapped in a new _Document
            tree.xmlDocSetRootElement(c_doc, c_node)
            doc = _documentFactory(c_doc, parser)
        if text is not None:
            _setNodeText(c_node, text)
        if tail is not None:
            _setTailText(c_node, tail)
        # add namespaces to node if necessary
        _initNodeNamespaces(c_node, doc, ns_utf, nsmap)
        _initNodeAttributes(c_node, doc, attrib, extra_attrs)
        return _elementFactory(doc, c_node)
    except:
        # free allocated c_node/c_doc unless Python does it for us
        if c_node.doc is not c_doc:
            # node not yet in document => will not be freed by document
            if tail is not None:
                _removeText(c_node.next) # tail
            tree.xmlFreeNode(c_node)
        if doc is None:
            # c_doc will not be freed by doc
            tree.xmlFreeDoc(c_doc)
        raise
 144
cdef int _initNewElement(_Element element, bint is_html, name_utf, ns_utf,
                         _BaseParser parser, attrib, nsmap, extra_attrs) except -1:
    u"""Initialise a new Element object.

    This is used when users instantiate a Python Element class
    directly, without it being mapped to an existing XML node.

    A new HTML or XML document is created (depending on 'is_html') with a
    new node named 'name_utf' as its root element.  After the namespaces
    and attributes of that node are set up, 'element' is registered as
    the proxy of the node and its _init() method is called.

    Returns 0 on success.
    """
    cdef xmlDoc* c_doc
    cdef xmlNode* c_node
    cdef _Document doc
    # validate the tag name before allocating anything
    if is_html:
        _htmlTagValidOrRaise(name_utf)
        c_doc = _newHTMLDoc()
    else:
        _tagValidOrRaise(name_utf)
        c_doc = _newXMLDoc()
    c_node = _createElement(c_doc, name_utf)
    if c_node is NULL:
        # nothing wraps c_doc yet, so it is freed here
        if c_doc is not NULL:
            tree.xmlFreeDoc(c_doc)
        return python.PyErr_NoMemory()
    tree.xmlDocSetRootElement(c_doc, c_node)
    doc = _documentFactory(c_doc, parser)
    # add namespaces to node if necessary
    _initNodeNamespaces(c_node, doc, ns_utf, nsmap)
    _initNodeAttributes(c_node, doc, attrib, extra_attrs)
    # connect the Python object with the new C node
    _registerProxy(element, doc, c_node)
    element._init()
    return 0
 174
cdef _Element _makeSubElement(_Element parent, tag, text, tail,
                              attrib, nsmap, extra_attrs):
    u"""Create a new child element and initialize text content, namespaces and
    attributes.

    The new node is created in the document of 'parent' and appended to
    the children of 'parent'.  Returns None if 'parent' is None or has no
    document.  If setting up the new node fails, it is removed from the
    tree again before the exception is re-raised.
    """
    cdef xmlNode* c_node
    cdef xmlDoc* c_doc
    if parent is None or parent._doc is None:
        return None
    ns_utf, name_utf = _getNsTag(tag)
    c_doc = parent._doc._c_doc

    # tag name validation depends on whether the document's parser is an
    # HTML parser
    if parent._doc._parser is not None and parent._doc._parser._for_html:
        _htmlTagValidOrRaise(name_utf)
    else:
        _tagValidOrRaise(name_utf)

    c_node = _createElement(c_doc, name_utf)
    if c_node is NULL:
        return python.PyErr_NoMemory()
    tree.xmlAddChild(parent._c_node, c_node)

    try:
        if text is not None:
            _setNodeText(c_node, text)
        if tail is not None:
            _setTailText(c_node, tail)

        # add namespaces to node if necessary
        _initNodeNamespaces(c_node, parent._doc, ns_utf, nsmap)
        _initNodeAttributes(c_node, parent._doc, attrib, extra_attrs)
        return _elementFactory(parent._doc, c_node)
    except:
        # make sure we clean up in case of an error
        _removeNode(parent._doc, c_node)
        raise
 211
cdef int _initNodeNamespaces(xmlNode* c_node, _Document doc,
                             object node_ns_utf, object nsmap) except -1:
    u"""Lookup current namespace prefixes, then set namespace structure for
    node and register new ns-prefix mappings.

    'nsmap' maps prefixes (or None for the default namespace) to namespace
    URIs.  'node_ns_utf' is the namespace URI of the node itself, or None.
    If one of the 'nsmap' entries has that URI, the node is put into that
    namespace declaration; otherwise doc._setNodeNs() is used to set it.

    This only works for a newly created node!

    Returns 0 on success.
    """
    cdef xmlNs* c_ns
    cdef char*  c_prefix
    cdef char*  c_href
    cdef list nsdefs
    if not nsmap:
        # no mappings to declare, only the node's own namespace (if any)
        if node_ns_utf is not None:
            doc._setNodeNs(c_node, _cstr(node_ns_utf))
        return 0

    nsdefs = list(nsmap.items())
    if None in nsmap and python.PyList_GET_SIZE(nsdefs) > 1:
        # Move the default namespace to the end.  This makes sure libxml2
        # prefers a prefix if the ns is defined redundantly on the same
        # element.  That way, users can work around a problem themselves
        # where default namespace attributes on non-default namespaced
        # elements serialise without prefix (i.e. into the non-default
        # namespace).
        item = (None, nsmap[None])
        nsdefs.remove(item)
        nsdefs.append(item)

    for prefix, href in nsdefs:
        href_utf = _utf8(href)
        c_href = _cstr(href_utf)
        if prefix is not None:
            prefix_utf = _utf8(prefix)
            _prefixValidOrRaise(prefix_utf)
            c_prefix = _cstr(prefix_utf)
        else:
            # a NULL prefix stands for the default namespace
            c_prefix = NULL
        # add namespace with prefix if it is not already known
        c_ns = tree.xmlSearchNs(doc._c_doc, c_node, c_prefix)
        if c_ns is NULL or \
                c_ns.href is NULL or \
                cstd.strcmp(c_ns.href, c_href) != 0:
            c_ns = tree.xmlNewNs(c_node, c_href, c_prefix)
        if href_utf == node_ns_utf:
            # this declaration matches the node's own namespace: use it
            # and remember that the node namespace is taken care of
            tree.xmlSetNs(c_node, c_ns)
            node_ns_utf = None

    if node_ns_utf is not None:
        # the node's namespace was not among the 'nsmap' entries
        doc._setNodeNs(c_node, _cstr(node_ns_utf))
    return 0
 262
 263 cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, extra):
 264     u"""Initialise the attributes of an element node.
 265     """
 266     cdef bint is_html
 267     cdef xmlNs* c_ns
 268     # 'extra' is not checked here (expected to be a keyword dict)
 269     if attrib is not None and not hasattr(attrib, u'items'):
 270         raise TypeError, u"Invalid attribute dictionary: %s" % \
 271             python._fqtypename(attrib)
 272     if extra is not None and extra:
 273         if attrib is None:
 274             attrib = extra
 275         else:
 276             attrib.update(extra)
 277     if attrib:
 278         is_html = doc._parser._for_html
 279         for name, value in attrib.items():
 280             attr_ns_utf, attr_name_utf = _getNsTag(name)
 281             if not is_html:
 282                 _attributeValidOrRaise(attr_name_utf)
 283             value_utf = _utf8(value)
 284             if attr_ns_utf is None:
 285                 tree.xmlNewProp(c_node, _cstr(attr_name_utf), _cstr(value_utf))
 286             else:
 287                 c_ns = doc._findOrBuildNodeNs(c_node, _cstr(attr_ns_utf), NULL)
 288                 tree.xmlNewNsProp(c_node, c_ns,
 289                                   _cstr(attr_name_utf), _cstr(value_utf))
 290
# Pairs a namespace declaration with the node that carries it in its
# nsDef list.  Used by _removeUnusedNamespaceDeclarations().
ctypedef struct _ns_node_ref:
    xmlNs* ns        # the namespace declaration
    xmlNode* node    # the node whose nsDef list contains 'ns'
 294
cdef int _removeUnusedNamespaceDeclarations(xmlNode* c_element) except -1:
    u"""Remove any namespace declarations from a subtree that are not used
    by any of its elements (or attributes).

    All declarations found in the subtree below 'c_element' (and on the
    document node, if that is the direct parent of 'c_element') are
    collected in a list.  Each declaration that an element or attribute
    refers to is dropped from the list again.  What remains in the list
    at the end is unlinked from its node and freed.

    Returns 0 on success; on allocation failure, a memory error is set and
    -1 is returned.
    """
    cdef _ns_node_ref* c_ns_list
    cdef _ns_node_ref* c_nsref_ptr
    cdef xmlNs* c_nsdef
    cdef xmlNode* c_node
    cdef size_t c_ns_list_size
    cdef size_t c_ns_list_len
    cdef size_t i

    # c_ns_list is a growable array: 'size' is its capacity, 'len' the
    # number of entries in use
    c_ns_list = NULL
    c_ns_list_size = 0
    c_ns_list_len  = 0

    if c_element.parent is not NULL and \
            c_element.parent.type == tree.XML_DOCUMENT_NODE:
        # include the document node
        c_nsdef = c_element.parent.nsDef
        while c_nsdef is not NULL:
            if c_ns_list_len >= c_ns_list_size:
                # list is full (or not allocated yet): start with 20
                # entries, then double the capacity
                if c_ns_list is NULL:
                    c_ns_list_size = 20
                else:
                    c_ns_list_size *= 2
                c_nsref_ptr = <_ns_node_ref*> cstd.realloc(
                    c_ns_list, c_ns_list_size * sizeof(_ns_node_ref))
                if c_nsref_ptr is NULL:
                    # realloc() failed: release the old block and bail out
                    if c_ns_list is not NULL:
                        cstd.free(c_ns_list)
                    python.PyErr_NoMemory()
                    return -1
                c_ns_list = c_nsref_ptr

            c_ns_list[c_ns_list_len].ns   = c_nsdef
            c_ns_list[c_ns_list_len].node = c_element.parent
            c_ns_list_len += 1
            c_nsdef = c_nsdef.next

    # the code between the BEGIN/END macros is executed with c_element
    # set to each visited node in turn
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_element, c_element, 1)
    # collect all new namespace declarations into the ns list
    c_nsdef = c_element.nsDef
    while c_nsdef is not NULL:
        if c_ns_list_len >= c_ns_list_size:
            # same growth strategy as above
            if c_ns_list is NULL:
                c_ns_list_size = 20
            else:
                c_ns_list_size *= 2
            c_nsref_ptr = <_ns_node_ref*> cstd.realloc(
                c_ns_list, c_ns_list_size * sizeof(_ns_node_ref))
            if c_nsref_ptr is NULL:
                if c_ns_list is not NULL:
                    cstd.free(c_ns_list)
                python.PyErr_NoMemory()
                return -1
            c_ns_list = c_nsref_ptr

        c_ns_list[c_ns_list_len].ns   = c_nsdef
        c_ns_list[c_ns_list_len].node = c_element
        c_ns_list_len += 1
        c_nsdef = c_nsdef.next

    # remove all namespace declarations from the list that are referenced
    if c_element.type == tree.XML_ELEMENT_NODE:
        # check the element itself first, then each of its attributes
        c_node = c_element
        while c_node is not NULL:
            if c_node.ns is not NULL:
                for i from 0 <= i < c_ns_list_len:
                    if c_node.ns is c_ns_list[i].ns:
                        # drop entry i by moving the last entry into its
                        # place and shortening the list by one
                        c_ns_list_len -= 1
                        c_ns_list[i].ns   = c_ns_list[c_ns_list_len].ns
                        c_ns_list[i].node = c_ns_list[c_ns_list_len].node
                        c_ns_list[c_ns_list_len].ns   = NULL
                        c_ns_list[c_ns_list_len].node = NULL
                        break
            if c_node is c_element:
                # continue with attributes
                c_node = <xmlNode*>c_element.properties
            else:
                c_node = c_node.next
    tree.END_FOR_EACH_ELEMENT_FROM(c_element)

    if c_ns_list is NULL:
        # no declarations were found at all
        return 0

    # free all namespace declarations that remained in the list
    for i from 0 <= i < c_ns_list_len:
        c_node = c_ns_list[i].node
        c_nsdef = c_node.nsDef
        if c_nsdef is c_ns_list[i].ns:
            # declaration is the head of the node's nsDef list
            c_node.nsDef = c_node.nsDef.next
        else:
            # find the predecessor in the nsDef list and unlink after it
            while c_nsdef.next is not c_ns_list[i].ns:
                c_nsdef = c_nsdef.next
            c_nsdef.next = c_nsdef.next.next
        tree.xmlFreeNs(c_ns_list[i].ns)

    if c_ns_list is not NULL:
        cstd.free(c_ns_list)
    return 0
 396
cdef int _replaceNodeByChildren(_Document doc, xmlNode* c_node) except -1:
    # Unlink c_node from its tree and put its children into the position
    # that c_node occupied among its siblings.  c_node itself is not freed
    # here; all of its tree links are reset to NULL.  Returns 0.
    cdef xmlNode* c_parent
    cdef xmlNode* c_child
    if c_node.children is NULL:
        # nothing to move, simply take the node out of the tree
        tree.xmlUnlinkNode(c_node)
        return 0

    c_parent = c_node.parent
    # fix parent links of children
    c_child = c_node.children
    while c_child is not NULL:
        c_child.parent = c_parent
        c_child = c_child.next

    # fix namespace references of children if their parent's namespace
    # declarations get lost
    if c_node.nsDef is not NULL:
        c_child = c_node.children
        while c_child is not NULL:
            moveNodeToDocument(doc, doc._c_doc, c_child)
            c_child = c_child.next

    # fix sibling links to/from child slice
    if c_node.prev is NULL:
        # c_node was the first child of its parent
        c_parent.children = c_node.children
    else:
        c_node.prev.next = c_node.children
        c_node.children.prev = c_node.prev
    if c_node.next is NULL:
        # c_node was the last child of its parent
        c_parent.last = c_node.last
    else:
        c_node.next.prev = c_node.last
        c_node.last.next = c_node.next

    # unlink c_node
    c_node.children = c_node.last = NULL
    c_node.parent = c_node.next = c_node.prev = NULL
    return 0
 435
 436 cdef object _attributeValue(xmlNode* c_element, xmlAttr* c_attrib_node):
 437     cdef char* value
 438     cdef char* c_href
 439     c_href = _getNs(<xmlNode*>c_attrib_node)
 440     value = tree.xmlGetNsProp(c_element, c_attrib_node.name, c_href)
 441     try:
 442         result = funicode(value)
 443     finally:
 444         tree.xmlFree(value)
 445     return result
 446
 447 cdef object _attributeValueFromNsName(xmlNode* c_element,
 448                                       char* c_href, char* c_name):
 449     cdef char* c_result = tree.xmlGetNsProp(c_element, c_name, c_href)
 450     if c_result is NULL:
 451         return None
 452     try:
 453         result = funicode(c_result)
 454     finally:
 455         tree.xmlFree(c_result)
 456     return result
 457
 458 cdef object _getNodeAttributeValue(xmlNode* c_node, key, default):
 459     cdef char* c_result
 460     cdef char* c_href
 461     ns, tag = _getNsTag(key)
 462     c_href = NULL if ns is None else _cstr(ns)
 463     c_result = tree.xmlGetNsProp(c_node, _cstr(tag), c_href)
 464     if c_result is NULL:
 465         # XXX free namespace that is not in use..?
 466         return default
 467     try:
 468         result = funicode(c_result)
 469     finally:
 470         tree.xmlFree(c_result)
 471     return result
 472
cdef inline object _getAttributeValue(_Element element, key, default):
    # Convenience wrapper: same as _getNodeAttributeValue(), applied to the
    # C node of 'element'.
    return _getNodeAttributeValue(element._c_node, key, default)
 475
cdef int _setAttributeValue(_Element element, key, value) except -1:
    # Set the attribute named by 'key' on 'element' to 'value'.
    #
    # 'key' is split into namespace and name by _getNsTag().  The name is
    # validated unless the document's parser is an HTML parser.  A QName
    # value is converted by _resolveQNameText(), anything else by _utf8().
    # Returns 0 on success.
    cdef xmlNs* c_ns
    cdef char* c_value
    cdef char* c_tag
    ns, tag = _getNsTag(key)
    if not element._doc._parser._for_html:
        _attributeValidOrRaise(tag)
    c_tag = _cstr(tag)
    if isinstance(value, QName):
        value = _resolveQNameText(element, value)
    else:
        value = _utf8(value)
    c_value = _cstr(value)
    if ns is None:
        # attribute without namespace
        c_ns = NULL
    else:
        # look up (or declare) the namespace on the element
        c_ns = element._doc._findOrBuildNodeNs(element._c_node,
                                               _cstr(ns), NULL)
    tree.xmlSetNsProp(element._c_node, c_ns, c_tag, c_value)
    return 0
 496
 497 cdef int _delAttribute(_Element element, key) except -1:
 498     cdef char* c_href
 499     ns, tag = _getNsTag(key)
 500     c_href = NULL if ns is None else _cstr(ns)
 501     if _delAttributeFromNsName(element._c_node, c_href, _cstr(tag)):
 502         raise KeyError, key
 503     return 0
 504
 505 cdef int _delAttributeFromNsName(xmlNode* c_node, char* c_href, char* c_name):
 506     cdef xmlAttr* c_attr
 507     c_attr = tree.xmlHasNsProp(c_node, c_name, c_href)
 508     if c_attr is NULL:
 509         # XXX free namespace that is not in use..?
 510         return -1
 511     tree.xmlRemoveProp(c_attr)
 512     return 0
 513
cdef object _collectAttributes(xmlNode* c_node, int collecttype):
    u"""Collect all attributes of a node in a list.  Depending on collecttype,
    it collects either the name (1), the value (2) or the name-value tuples
    (any other value).

    Only entries of type XML_ATTRIBUTE_NODE in the property list of the
    node are considered.
    """
    cdef Py_ssize_t count
    cdef xmlAttr* c_attr
    # first pass: count the attributes to learn the size of the result
    c_attr = c_node.properties
    count = 0
    while c_attr is not NULL:
        if c_attr.type == tree.XML_ATTRIBUTE_NODE:
            count = count + 1
        c_attr = c_attr.next

    if count == 0:
        return []

    # second pass: fill a list that is preallocated to the right size
    attributes = python.PyList_New(count)
    c_attr = c_node.properties
    count = 0
    while c_attr is not NULL:
        if c_attr.type == tree.XML_ATTRIBUTE_NODE:
            if collecttype == 1:
                item = _namespacedName(<xmlNode*>c_attr)
            elif collecttype == 2:
                item = _attributeValue(c_node, c_attr)
            else:
                item = (_namespacedName(<xmlNode*>c_attr),
                        _attributeValue(c_node, c_attr))

            # PyList_SET_ITEM() takes over a reference to the item, so an
            # additional one is created for it here
            python.Py_INCREF(item)
            python.PyList_SET_ITEM(attributes, count, item)
            count = count + 1
        c_attr = c_attr.next
    return attributes
 548
# Matches the beginning of a string that starts (after optional whitespace)
# with an XML declaration containing a double-quoted encoding="..." part.
# Group 1 captures everything before the encoding part.
cdef object __RE_XML_ENCODING
__RE_XML_ENCODING = re.compile(
    ur'^(\s*<\?\s*xml[^>]+)\s+encoding\s*=\s*"[^"]*"\s*', re.U)

# bound sub() method of the pattern above, used to cut out the encoding part
cdef object __REPLACE_XML_ENCODING
__REPLACE_XML_ENCODING = __RE_XML_ENCODING.sub

# bound match() method of the pattern above, used to test for the
# encoding part
cdef object __HAS_XML_ENCODING
__HAS_XML_ENCODING = __RE_XML_ENCODING.match
 558
cdef object _stripEncodingDeclaration(object xml_string):
    # this is a hack to remove the XML encoding declaration from unicode
    # strings: the text matched by __RE_XML_ENCODING is replaced by its
    # first group, i.e. the part of the XML declaration that precedes the
    # encoding.  Strings that do not match are returned unchanged.
    return __REPLACE_XML_ENCODING(ur'\g<1>', xml_string)
 562
cdef int _hasEncodingDeclaration(object xml_string):
    # check if a (unicode) string has an XML encoding declaration, i.e. if
    # its beginning is matched by __RE_XML_ENCODING; returns 1 or 0
    return __HAS_XML_ENCODING(xml_string) is not None
 566
 567 cdef object _stripDeclaration(object xml_string):
 568     # this is a hack to remove the XML declaration when we encode to UTF-8
 569     xml_string = xml_string.strip()
 570     if xml_string[:5] == '<?xml':
 571         i = xml_string.find('?>')
 572         if i != -1:
 573             i = i + 2
 574             while xml_string[i:i+1] in '\n\r ':
 575                 i = i+1
 576             xml_string = xml_string[i:]
 577     return xml_string
 578
 579 cdef inline int _hasText(xmlNode* c_node):
 580     return c_node is not NULL and _textNodeOrSkip(c_node.children) is not NULL
 581
 582 cdef inline int _hasTail(xmlNode* c_node):
 583     return c_node is not NULL and _textNodeOrSkip(c_node.next) is not NULL
 584
 585 cdef _collectText(xmlNode* c_node):
 586     u"""Collect all text nodes and return them as a unicode string.
 587
 588     Start collecting at c_node.
 589
 590     If there was no text to collect, return None
 591     """
 592     cdef Py_ssize_t scount
 593     cdef char* c_text
 594     cdef xmlNode* c_node_cur
 595     # check for multiple text nodes
 596     scount = 0
 597     c_text = NULL
 598     c_node_cur = c_node = _textNodeOrSkip(c_node)
 599     while c_node_cur is not NULL:
 600         if c_node_cur.content[0] != c'\0':
 601             c_text = c_node_cur.content
 602         scount = scount + 1
 603         c_node_cur = _textNodeOrSkip(c_node_cur.next)
 604
 605     # handle two most common cases first
 606     if c_text is NULL:
 607         if scount > 0:
 608             if python.IS_PYTHON3:
 609                 return u''
 610             else:
 611                 return ''
 612         else:
 613             return None
 614     if scount == 1:
 615         return funicode(c_text)
 616
 617     # the rest is not performance critical anymore
 618     result = ''
 619     while c_node is not NULL:
 620         result = result + c_node.content
 621         c_node = _textNodeOrSkip(c_node.next)
 622     return funicode(result)
 623
 624 cdef void _removeText(xmlNode* c_node):
 625     u"""Remove all text nodes.
 626
 627     Start removing at c_node.
 628     """
 629     cdef xmlNode* c_next
 630     c_node = _textNodeOrSkip(c_node)
 631     while c_node is not NULL:
 632         c_next = _textNodeOrSkip(c_node.next)
 633         tree.xmlUnlinkNode(c_node)
 634         tree.xmlFreeNode(c_node)
 635         c_node = c_next
 636
cdef int _setNodeText(xmlNode* c_node, value) except -1:
    # Replace the leading text of c_node by 'value'.
    #
    # 'value' may be None (existing text is only removed), a string (a text
    # node is created) or a CDATA object (a CDATA block is created).  Any
    # other value is passed to _utf8(), which is expected to raise.
    # Returns 0 on success.
    cdef xmlNode* c_text_node
    # remove all text nodes at the start first
    _removeText(c_node.children)
    if value is None:
        return 0
    # now add new text node with value at start
    if python._isString(value):
        text = _utf8(value)
        c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text))
    elif isinstance(value, CDATA):
        c_text_node = tree.xmlNewCDataBlock(
            c_node.doc, _cstr((<CDATA>value)._utf8_data),
            python.PyString_GET_SIZE((<CDATA>value)._utf8_data))
    else:
        # this will raise the right error
       _utf8(value)
       return -1
    if c_node.children is NULL:
        # no children at all: the text node becomes the only child
        tree.xmlAddChild(c_node, c_text_node)
    else:
        # insert the text node in front of the first child
        tree.xmlAddPrevSibling(c_node.children, c_text_node)
    return 0
 660
cdef int _setTailText(xmlNode* c_node, value) except -1:
    # Replace the tail text of c_node (the text nodes that directly follow
    # it) by 'value'.  If 'value' is None, the existing tail text is only
    # removed.  Returns 0 on success.
    cdef xmlNode* c_text_node
    # remove all text nodes that directly follow the node first
    _removeText(c_node.next)
    if value is None:
        return 0
    text = _utf8(value)
    c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text))
    # XXX what if we're the top element?
    tree.xmlAddNextSibling(c_node, c_text_node)
    return 0
 672
cdef _resolveQNameText(_Element element, value):
    # Convert a qualified name into text that can be stored in the tree.
    #
    # 'value' is split into namespace and tag by _getNsTag().  Without a
    # namespace, the plain tag is returned.  Otherwise, the namespace is
    # looked up (or declared) on 'element' and 'prefix:tag' is returned.
    cdef xmlNs* c_ns
    ns, tag = _getNsTag(value)
    if ns is None:
        return tag
    else:
        c_ns = element._doc._findOrBuildNodeNs(
            element._c_node, _cstr(ns), NULL)
        return python.PyString_FromFormat('%s:%s', c_ns.prefix, _cstr(tag))
 682
 683 cdef inline bint _hasChild(xmlNode* c_node):
 684     return c_node is not NULL and _findChildForwards(c_node, 0) is not NULL
 685
 686 cdef inline Py_ssize_t _countElements(xmlNode* c_node):
 687     u"Counts the elements within the following siblings and the node itself."
 688     cdef Py_ssize_t count
 689     count = 0
 690     while c_node is not NULL:
 691         if _isElement(c_node):
 692             count = count + 1
 693         c_node = c_node.next
 694     return count
 695
cdef int _findChildSlice(
    python.slice sliceobject, xmlNode* c_parent,
    xmlNode** c_start_node, Py_ssize_t* c_step, Py_ssize_t* c_length) except -1:
    u"""Resolve a children slice.

    Returns the start node, step size and the slice length in the
    pointer arguments.

    If c_parent has no child elements, the start node is set to NULL and
    the length to 0.  The function itself returns 0.
    """
    cdef Py_ssize_t start, stop, childcount
    childcount = _countElements(c_parent.children)
    if childcount == 0:
        # empty slice: only the step still needs to be determined
        c_start_node[0] = NULL
        c_length[0] = 0
        if sliceobject.step is None:
            c_step[0] = 1
        else:
            python._PyEval_SliceIndex(sliceobject.step, c_step)
        return 0
    python.PySlice_GetIndicesEx(
        sliceobject, childcount, &start, &stop, c_step, c_length)
    # search from whichever end of the child list is closer to 'start'
    if start > childcount / 2:
        c_start_node[0] = _findChildBackwards(c_parent, childcount - start - 1)
    else:
        c_start_node[0] = _findChild(c_parent, start)
    return 0
 721
 722 cdef bint _isFullSlice(python.slice sliceobject):
 723     u"""Conservative guess if this slice is a full slice as in ``s[:]``.
 724     """
 725     cdef Py_ssize_t step
 726     if sliceobject is None:
 727         return 0
 728     if sliceobject.start is None and \
 729             sliceobject.stop is None:
 730         if sliceobject.step is None:
 731             return 1
 732         python._PyEval_SliceIndex(sliceobject.step, &step)
 733         if step == 1:
 734             return 1
 735         return 0
 736     return 0
 737
 738 cdef _collectChildren(_Element element):
 739     cdef xmlNode* c_node
 740     cdef list result = []
 741     c_node = element._c_node.children
 742     if c_node is not NULL:
 743         if not _isElement(c_node):
 744             c_node = _nextElement(c_node)
 745         while c_node is not NULL:
 746             result.append(_elementFactory(element._doc, c_node))
 747             c_node = _nextElement(c_node)
 748     return result
 749
 750 cdef inline xmlNode* _findChild(xmlNode* c_node, Py_ssize_t index):
 751     if index < 0:
 752         return _findChildBackwards(c_node, -index - 1)
 753     else:
 754         return _findChildForwards(c_node, index)
 755
 756 cdef inline xmlNode* _findChildForwards(xmlNode* c_node, Py_ssize_t index):
 757     u"""Return child element of c_node with index, or return NULL if not found.
 758     """
 759     cdef xmlNode* c_child
 760     cdef Py_ssize_t c
 761     c_child = c_node.children
 762     c = 0
 763     while c_child is not NULL:
 764         if _isElement(c_child):
 765             if c == index:
 766                 return c_child
 767             c = c + 1
 768         c_child = c_child.next
 769     return NULL
 770
 771 cdef inline xmlNode* _findChildBackwards(xmlNode* c_node, Py_ssize_t index):
 772     u"""Return child element of c_node with index, or return NULL if not found.
 773     Search from the end.
 774     """
 775     cdef xmlNode* c_child
 776     cdef Py_ssize_t c
 777     c_child = c_node.last
 778     c = 0
 779     while c_child is not NULL:
 780         if _isElement(c_child):
 781             if c == index:
 782                 return c_child
 783             c = c + 1
 784         c_child = c_child.prev
 785     return NULL
 786
 787 cdef inline xmlNode* _textNodeOrSkip(xmlNode* c_node) nogil:
 788     u"""Return the node if it's a text node.  Skip over ignorable nodes in a
 789     series of text nodes.  Return NULL if a non-ignorable node is found.
 790
 791     This is used to skip over XInclude nodes when collecting adjacent text
 792     nodes.
 793     """
 794     while c_node is not NULL:
 795         if c_node.type == tree.XML_TEXT_NODE:
 796             return c_node
 797         if c_node.type == tree.XML_CDATA_SECTION_NODE:
 798             return c_node
 799         elif c_node.type == tree.XML_XINCLUDE_START or \
 800                  c_node.type == tree.XML_XINCLUDE_END:
 801             c_node = c_node.next
 802         else:
 803             return NULL
 804     return NULL
 805
 806 cdef inline xmlNode* _nextElement(xmlNode* c_node):
 807     u"""Given a node, find the next sibling that is an element.
 808     """
 809     if c_node is NULL:
 810         return NULL
 811     c_node = c_node.next
 812     while c_node is not NULL:
 813         if _isElement(c_node):
 814             return c_node
 815         c_node = c_node.next
 816     return NULL
 817
 818 cdef inline xmlNode* _previousElement(xmlNode* c_node):
 819     u"""Given a node, find the next sibling that is an element.
 820     """
 821     if c_node is NULL:
 822         return NULL
 823     c_node = c_node.prev
 824     while c_node is not NULL:
 825         if _isElement(c_node):
 826             return c_node
 827         c_node = c_node.prev
 828     return NULL
 829
 830 cdef inline xmlNode* _parentElement(xmlNode* c_node):
 831     u"Given a node, find the parent element."
 832     if c_node is NULL or not _isElement(c_node):
 833         return NULL
 834     c_node = c_node.parent
 835     if c_node is NULL or not _isElement(c_node):
 836         return NULL
 837     return c_node
 838
 839 cdef inline bint _tagMatches(xmlNode* c_node, char* c_href, char* c_name):
 840     u"""Tests if the node matches namespace URI and tag name.
 841
 842     A node matches if it matches both c_href and c_name.
 843
 844     A node matches c_href if any of the following is true:
 845     * c_href is NULL
 846     * its namespace is NULL and c_href is the empty string
 847     * its namespace string equals the c_href string
 848
 849     A node matches c_name if any of the following is true:
 850     * c_name is NULL
 851     * its name string equals the c_name string
 852     """
 853     cdef char* c_node_href
 854     if c_node is NULL:
 855         return 0
 856     if c_node.type != tree.XML_ELEMENT_NODE:
 857         # not an element, only succeed if we match everything
 858         return c_name is NULL and c_href is NULL
 859     if c_name is NULL:
 860         if c_href is NULL:
 861             # always match
 862             return 1
 863         else:
 864             c_node_href = _getNs(c_node)
 865             if c_node_href is NULL:
 866                 return c_href[0] == c'\0'
 867             else:
 868                 return cstd.strcmp(c_node_href, c_href) == 0
 869     elif c_href is NULL:
 870         if _getNs(c_node) is not NULL:
 871             return 0
 872         return c_node.name == c_name or cstd.strcmp(c_node.name, c_name) == 0
 873     elif c_node.name == c_name or cstd.strcmp(c_node.name, c_name) == 0:
 874         c_node_href = _getNs(c_node)
 875         if c_node_href is NULL:
 876             return c_href[0] == c'\0'
 877         else:
 878             return cstd.strcmp(c_node_href, c_href) == 0
 879     else:
 880         return 0
 881
 882 cdef int _removeNode(_Document doc, xmlNode* c_node) except -1:
 883     u"""Unlink and free a node and subnodes if possible.  Otherwise, make sure
 884     it's self-contained.
 885     """
 886     cdef xmlNode* c_next
 887     c_next = c_node.next
 888     tree.xmlUnlinkNode(c_node)
 889     _moveTail(c_next, c_node)
 890     if not attemptDeallocation(c_node):
 891         # make namespaces absolute
 892         moveNodeToDocument(doc, c_node.doc, c_node)
 893     return 0
 894
 895 cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target):
 896     cdef xmlNode* c_next
 897     # tail support: look for any text nodes trailing this node and
 898     # move them too
 899     c_tail = _textNodeOrSkip(c_tail)
 900     while c_tail is not NULL:
 901         c_next = _textNodeOrSkip(c_tail.next)
 902         tree.xmlUnlinkNode(c_tail)
 903         tree.xmlAddNextSibling(c_target, c_tail)
 904         c_target = c_tail
 905         c_tail = c_next
 906
 907 cdef int _copyTail(xmlNode* c_tail, xmlNode* c_target) except -1:
 908     cdef xmlNode* c_new_tail
 909     # tail copying support: look for any text nodes trailing this node and
 910     # copy it to the target node
 911     c_tail = _textNodeOrSkip(c_tail)
 912     while c_tail is not NULL:
 913         if c_target.doc is not c_tail.doc:
 914             c_new_tail = tree.xmlDocCopyNode(c_tail, c_target.doc, 0)
 915         else:
 916             c_new_tail = tree.xmlCopyNode(c_tail, 0)
 917         if c_new_tail is NULL:
 918             python.PyErr_NoMemory()
 919         tree.xmlAddNextSibling(c_target, c_new_tail)
 920         c_target = c_new_tail
 921         c_tail = _textNodeOrSkip(c_tail.next)
 922     return 0
 923
 924 cdef int _copyNonElementSiblings(xmlNode* c_node, xmlNode* c_target) except -1:
 925     cdef xmlNode* c_copy
 926     cdef xmlNode* c_sibling = c_node
 927     while c_sibling.prev != NULL and \
 928             (c_sibling.prev.type == tree.XML_PI_NODE or \
 929                  c_sibling.prev.type == tree.XML_COMMENT_NODE):
 930         c_sibling = c_sibling.prev
 931     while c_sibling != c_node:
 932         c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1)
 933         if c_copy is NULL:
 934             python.PyErr_NoMemory()
 935         tree.xmlAddPrevSibling(c_target, c_copy)
 936         c_sibling = c_sibling.next
 937     while c_sibling.next != NULL and \
 938             (c_sibling.next.type == tree.XML_PI_NODE or \
 939                  c_sibling.next.type == tree.XML_COMMENT_NODE):
 940         c_sibling = c_sibling.next
 941         c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1)
 942         if c_copy is NULL:
 943             python.PyErr_NoMemory()
 944         tree.xmlAddNextSibling(c_target, c_copy)
 945
 946 cdef int _deleteSlice(_Document doc, xmlNode* c_node,
 947                       Py_ssize_t count, Py_ssize_t step) except -1:
 948     u"""Delete slice, ``count`` items starting with ``c_node`` with a step
 949     width of ``step``.
 950     """
 951     cdef xmlNode* c_next
 952     cdef Py_ssize_t c, i
 953     cdef _node_to_node_function next_element
 954     if c_node is NULL:
 955         return 0
 956     if step > 0:
 957         next_element = _nextElement
 958     else:
 959         step = -step
 960         next_element = _previousElement
 961     # now start deleting nodes
 962     c = 0
 963     c_next = c_node
 964     while c_node is not NULL and c < count:
 965         for i from 0 <= i < step:
 966             c_next = next_element(c_next)
 967         _removeNode(doc, c_node)
 968         c = c + 1
 969         c_node = c_next
 970     return 0
 971
cdef int _replaceSlice(_Element parent, xmlNode* c_node,
                       Py_ssize_t slicelength, Py_ssize_t step,
                       bint left_to_right, elements) except -1:
    u"""Replace the slice of ``slicelength`` elements starting at ``c_node``
    with positive step width ``step`` by the Elements in ``elements``.  The
    direction is given by the boolean argument ``left_to_right``.

    ``c_node`` may be NULL to indicate the end of the children list.  In
    that case, all elements are simply appended (left-to-right) or
    prepended (right-to-left) to ``parent``.

    ``elements`` may be any iterable; it is converted to a list unless it
    already is a list or tuple.  For ``step > 1``, its length must equal
    ``slicelength``, otherwise a ValueError is raised.

    Returns 0 on success.
    """
    cdef xmlNode* c_orig_neighbour
    cdef xmlNode* c_next
    cdef xmlDoc*  c_source_doc
    cdef _Element element
    cdef Py_ssize_t seqlength, i, c
    cdef _node_to_node_function next_element
    assert step > 0
    # select the stepping function that matches the slice direction
    if left_to_right:
        next_element = _nextElement
    else:
        next_element = _previousElement

    # we need a sequence with a known length below
    if not python.PyList_Check(elements) and \
            not python.PyTuple_Check(elements):
        elements = list(elements)

    if step > 1:
        # *replacing* children stepwise with list => check size!
        seqlength = len(elements)
        if seqlength != slicelength:
            raise ValueError, u"attempt to assign sequence of size %d " \
                u"to extended slice of size %d" % (seqlength, slicelength)

    if c_node is NULL:
        # no children yet => add all elements straight away
        if left_to_right:
            for element in elements:
                assert element is not None, u"Node must not be None"
                _appendChild(parent, element)
        else:
            for element in elements:
                assert element is not None, u"Node must not be None"
                _prependChild(parent, element)
        return 0

    # remove the elements first as some might be re-added
    if left_to_right:
        # L->R, remember left neighbour
        c_orig_neighbour = _previousElement(c_node)
    else:
        # R->L, remember right neighbour
        c_orig_neighbour = _nextElement(c_node)

    # We remove the original slice elements one by one. Since we hold
    # a Python reference to all elements that we will insert, it is
    # safe to let _removeNode() try (and fail) to free them even if
    # the element itself or one of its descendents will be reinserted.
    c = 0
    c_next = c_node
    while c_node is not NULL and c < slicelength:
        # look up the next slice position before unlinking the current node
        for i from 0 <= i < step:
            c_next = next_element(c_next)
        _removeNode(parent._doc, c_node)
        c = c + 1
        c_node = c_next

    # make sure each element is inserted only once
    elements = iter(elements)

    # find the first node right of the new insertion point
    if left_to_right:
        if c_orig_neighbour is not NULL:
            c_node = next_element(c_orig_neighbour)
        else:
            # before the first element
            c_node = _findChildForwards(parent._c_node, 0)
    elif c_orig_neighbour is NULL:
        # at the end, but reversed stepping
        # append one element and go to the next insertion point
        for element in elements:
            assert element is not None, u"Node must not be None"
            _appendChild(parent, element)
            c_node = element._c_node
            if slicelength > 0:
                slicelength = slicelength - 1
                for i from 1 <= i < step:
                    c_node = next_element(c_node)
            # only the first element is handled here, the shared iterator
            # leaves the remaining ones for the loops below
            break

    if left_to_right:
        # adjust step size after removing slice as we are not stepping
        # over the newly inserted elements
        step = step - 1

    # now insert elements where we removed them
    if c_node is not NULL:
        for element in elements:
            assert element is not None, u"Node must not be None"
            # move element and tail over
            c_source_doc = element._c_node.doc
            c_next = element._c_node.next
            tree.xmlAddPrevSibling(c_node, element._c_node)
            _moveTail(c_next, element._c_node)

            # integrate element into new document
            moveNodeToDocument(parent._doc, c_source_doc, element._c_node)

            # stop at the end of the slice
            if slicelength > 0:
                slicelength = slicelength - 1
                for i from 0 <= i < step:
                    c_node = next_element(c_node)
                if c_node is NULL:
                    # ran out of siblings, the rest is handled further down
                    break
        else:
            # everything inserted
            return 0

    # append the remaining elements at the respective end
    if left_to_right:
        for element in elements:
            assert element is not None, u"Node must not be None"
            _appendChild(parent, element)
    else:
        for element in elements:
            assert element is not None, u"Node must not be None"
            _prependChild(parent, element)

    return 0
1100
cdef int _appendChild(_Element parent, _Element child) except -1:
    u"""Make ``child`` the last child of ``parent``.

    The child node is unlinked from its current position, added below the
    parent node, and its tail text is moved along with it.  Afterwards, the
    subtree is integrated into the parent's document.
    """
    cdef xmlNode* c_child = child._c_node
    # remember where the node came from before it gets relinked
    cdef xmlDoc* c_origin_doc = c_child.doc
    # the following sibling may be the start of the tail text
    cdef xmlNode* c_tail = c_child.next
    tree.xmlUnlinkNode(c_child)
    tree.xmlAddChild(parent._c_node, c_child)
    _moveTail(c_tail, c_child)
    # nodes in the moved subtree may still refer to the old document,
    # so fix them up
    moveNodeToDocument(parent._doc, c_origin_doc, c_child)
    return 0
1118
cdef int _prependChild(_Element parent, _Element child) except -1:
    u"""Make ``child`` the first child element of ``parent``.

    If the parent already has a child element, the new child is inserted
    right before the first one.  Otherwise, it is unlinked and added as a
    child.  The tail text of the child is moved along with it and the
    subtree is integrated into the parent's document.
    """
    cdef xmlNode* c_new = child._c_node
    # remember where the node came from before it gets relinked
    cdef xmlDoc* c_origin_doc = c_new.doc
    # the following sibling may be the start of the tail text
    cdef xmlNode* c_tail = c_new.next
    cdef xmlNode* c_first = _findChildForwards(parent._c_node, 0)
    if c_first is not NULL:
        tree.xmlAddPrevSibling(c_first, c_new)
    else:
        # no child elements yet
        tree.xmlUnlinkNode(c_new)
        tree.xmlAddChild(parent._c_node, c_new)
    _moveTail(c_tail, c_new)
    # nodes in the moved subtree may still refer to the old document,
    # so fix them up
    moveNodeToDocument(parent._doc, c_origin_doc, c_new)
    return 0
1141
cdef int _appendSibling(_Element element, _Element sibling) except -1:
    u"""Insert ``sibling`` as the next sibling of ``element``.

    The tail text of ``sibling`` is moved along with it and the subtree is
    integrated into the document of ``element``.
    """
    cdef xmlNode* c_new = sibling._c_node
    # remember where the node came from before it gets relinked
    cdef xmlDoc* c_origin_doc = c_new.doc
    # the following sibling may be the start of the tail text
    cdef xmlNode* c_tail = c_new.next
    tree.xmlAddNextSibling(element._c_node, c_new)
    _moveTail(c_tail, c_new)
    # nodes in the moved subtree may still refer to the old document,
    # so fix them up
    moveNodeToDocument(element._doc, c_origin_doc, c_new)
    return 0
1158
cdef int _prependSibling(_Element element, _Element sibling) except -1:
    u"""Insert ``sibling`` as the previous sibling of ``element``.

    The tail text of ``sibling`` is moved along with it and the subtree is
    integrated into the document of ``element``.
    """
    cdef xmlNode* c_new = sibling._c_node
    # remember where the node came from before it gets relinked
    cdef xmlDoc* c_origin_doc = c_new.doc
    # the following sibling may be the start of the tail text
    cdef xmlNode* c_tail = c_new.next
    tree.xmlAddPrevSibling(element._c_node, c_new)
    _moveTail(c_tail, c_new)
    # nodes in the moved subtree may still refer to the old document,
    # so fix them up
    moveNodeToDocument(element._doc, c_origin_doc, c_new)
    return 0
1175
cdef inline int isutf8(char* s):
    u"""Return 1 if the NUL terminated string ``s`` contains at least one
    byte with the high bit set (i.e. it is not plain ASCII), 0 otherwise.
    """
    while s[0] != c'\0':
        if s[0] & 0x80:
            return 1
        s += 1
    return 0
1185
cdef int check_string_utf8(pystring):
    u"""Check if a string looks like valid UTF-8 XML content.  Returns 0
    for ASCII, 1 for UTF-8 and -1 in the case of errors, such as NULL
    bytes or ASCII control characters.

    Bytes with the high bit set are only skipped, not validated.  Every
    other byte must pass ``xmlIsChar_ch()``, including a byte that directly
    follows a multi byte sequence.
    """
    cdef char* s
    cdef char* c_end
    cdef bint is_non_ascii
    s = _cstr(pystring)
    c_end = s + python.PyString_GET_SIZE(pystring)
    is_non_ascii = 0
    while s < c_end:
        if s[0] & 0x80:
            # skip the entire multi byte sequence, but stay within the string
            while s < c_end and s[0] & 0x80:
                s += 1
            is_non_ascii = 1
            # 's' now points at the first byte after the sequence; do not
            # advance here so that the next iteration validates that byte
        elif not tree.xmlIsChar_ch(s[0]):
            return -1 # invalid!
        else:
            s += 1
    return is_non_ascii
1208
cdef object funicode(char* s):
    u"""Convert the NUL terminated, UTF-8 encoded C string ``s`` into a
    Python string.

    On Python 3, the result is always decoded from UTF-8.  Otherwise, plain
    ASCII input is returned as a byte string and only input that contains
    bytes with the high bit set is decoded from UTF-8.
    """
    cdef Py_ssize_t slen
    cdef char* c_pos
    cdef bint has_high_bit
    if python.IS_PYTHON3:
        slen = cstd.strlen(s)
        return python.PyUnicode_DecodeUTF8(s, slen, NULL)
    # scan for the first non-ASCII byte
    has_high_bit = 0
    c_pos = s
    while c_pos[0] != c'\0':
        if c_pos[0] & 0x80:
            has_high_bit = 1
            break
        c_pos += 1
    # move on to the terminating NUL byte to determine the length
    c_pos += cstd.strlen(c_pos)
    slen = c_pos - s
    if not has_high_bit:
        return python.PyString_FromStringAndSize(s, slen)
    return python.PyUnicode_DecodeUTF8(s, slen, NULL)
1229
cdef object _utf8(object s):
    u"""Return ``s`` as a byte string that is safe to use as XML content.

    Unicode strings are encoded as UTF-8 and rejected only if
    ``check_string_utf8()`` reports an error.  Byte strings are returned
    unchanged and rejected unless ``check_string_utf8()`` reports plain
    ASCII.

    Raises TypeError for other input types and ValueError for rejected
    strings.
    """
    cdef bint rejected
    if python.PyUnicode_CheckExact(s) or python.PyUnicode_Check(s):
        s = python.PyUnicode_AsUTF8String(s)
        # non-ASCII is fine here, only real errors count
        rejected = check_string_utf8(s) == -1
    elif python.PyString_CheckExact(s) or python.PyString_Check(s):
        # byte strings must be plain ASCII: 1 (UTF-8) and -1 (error) both fail
        rejected = check_string_utf8(s) != 0
    else:
        raise TypeError, u"Argument must be string or unicode."
    if rejected:
        raise ValueError, \
            u"All strings must be XML compatible: Unicode or ASCII, no NULL bytes"
    return s
1245
cdef bint _isFilePath(char* c_path):
    u"""Simple heuristic to decide whether a path is a filename (returns 1)
    or rather something like a URL (returns 0).

    A leading '/' or a leading ASCII letter followed by ':' counts as a
    file path.  Otherwise, the first ':', '/' or backslash decides: a ':'
    means "not a file path", a path separator means "file path".  Strings
    without any of these characters count as file paths.
    """
    cdef char c_first = c_path[0]
    # absolute Unix path or Windows network path
    if c_first == c'/':
        return 1
    # absolute Windows path: drive letter followed by a colon
    if (c_first >= c'a' and c_first <= c'z') or \
            (c_first >= c'A' and c_first <= c'Z'):
        if c_path[1] == c':':
            return 1
    # relative path or URL: whatever comes first decides
    while c_path[0] != c'\0':
        if c_path[0] == c':':
            return 0
        if c_path[0] == c'/' or c_path[0] == c'\\':
            return 1
        c_path += 1
    return 1
1268
cdef object _encodeFilename(object filename):
    u"""Return ``filename`` as an 8-bit encoded string, or None for None.

    Byte strings are returned unchanged.  Unicode strings that look like a
    file path (see ``_isFilePath()``) are encoded with the filename
    encoding if possible; in all other cases they are encoded as UTF-8.

    Raises TypeError for other input types.
    """
    if filename is None or python.PyString_Check(filename):
        # nothing to encode
        return filename
    if not python.PyUnicode_Check(filename):
        raise TypeError, u"Argument must be string or unicode."
    utf8_name = python.PyUnicode_AsEncodedString(
        filename, 'UTF-8', NULL)
    if not _isFilePath(utf8_name):
        return utf8_name
    try:
        return python.PyUnicode_AsEncodedString(
            filename, _C_FILENAME_ENCODING, NULL)
    except UnicodeEncodeError:
        # not representable in the filename encoding => stick to UTF-8
        return utf8_name
1288
cdef object _decodeFilename(char* c_path):
    u"""Decode the NUL terminated C string ``c_path`` into a unicode string.

    If the path looks like a file path (see ``_isFilePath()``), the
    filename encoding is tried first.  Otherwise, or if that fails, UTF-8
    is tried, with Latin-1 as the last resort.
    """
    cdef Py_ssize_t path_length
    path_length = cstd.strlen(c_path)
    if _isFilePath(c_path):
        try:
            return python.PyUnicode_Decode(
                c_path, path_length, _C_FILENAME_ENCODING, NULL)
        except UnicodeDecodeError:
            # continue with the generic decoding below
            pass
    try:
        decoded = python.PyUnicode_DecodeUTF8(c_path, path_length, NULL)
    except UnicodeDecodeError:
        # this is a stupid fallback, but it might still work...
        decoded = python.PyUnicode_DecodeLatin1(
            c_path, path_length, 'replace')
    return decoded
1304
cdef object _encodeFilenameUTF8(object filename):
    u"""Return ``filename`` recoded as a UTF-8 byte string, or None for None.

    Byte strings for which ``check_string_utf8()`` returns 0 (plain ASCII)
    are returned unchanged.  Other byte strings are decoded with the
    filename encoding first and with UTF-8 second; if both fail, the
    decoding error of the first attempt is raised.  Unicode strings are
    encoded directly.

    Raises TypeError for other input types.
    """
    cdef char* c_bytes
    cdef Py_ssize_t c_size
    if filename is None:
        return None
    if python.PyString_Check(filename):
        if not check_string_utf8(filename):
            # plain ASCII!
            return filename
        c_bytes = _cstr(filename)
        c_size = python.PyString_GET_SIZE(filename)
        try:
            # try to decode with default encoding
            filename = python.PyUnicode_Decode(
                c_bytes, c_size, _C_FILENAME_ENCODING, NULL)
        except UnicodeDecodeError, first_error:
            try:
                # try if it's UTF-8
                filename = python.PyUnicode_DecodeUTF8(c_bytes, c_size, NULL)
            except UnicodeDecodeError:
                # otherwise re-raise original exception
                raise first_error
    if not python.PyUnicode_Check(filename):
        raise TypeError, u"Argument must be string or unicode."
    return python.PyUnicode_AsUTF8String(filename)
1333
cdef _getNsTag(tag):
    u"""Given a tag, find namespace URI and tag name.
    Return None for NS uri if no namespace URI available.

    ``tag`` may be a string or a QName.  It is passed through ``_utf8()``
    and split at the closing brace of a leading ``{namespace}`` part.
    Returns the tuple ``(ns, tag)`` of byte strings, where ``ns`` is None
    if the tag has no namespace part or an empty one (``{}name``).

    Raises ValueError if the closing brace is missing or if the tag name
    is empty.
    """
    cdef char* c_tag
    cdef char* c_ns_end
    cdef Py_ssize_t taglen
    cdef Py_ssize_t nslen
    # explicit default: 'ns' is only assigned below for non-empty
    # namespaces, so do not rely on implicit initialisation of locals
    ns = None
    # _isString() is much faster than isinstance()
    if not _isString(tag) and isinstance(tag, QName):
        tag = (<QName>tag).text
    tag = _utf8(tag)
    c_tag = _cstr(tag)
    if c_tag[0] == c'{':
        # skip the opening brace and look for the end of the namespace
        c_tag = c_tag + 1
        c_ns_end = cstd.strchr(c_tag, c'}')
        if c_ns_end is NULL:
            raise ValueError, u"Invalid tag name"
        nslen  = c_ns_end - c_tag
        # total length minus namespace and the two braces
        taglen = python.PyString_GET_SIZE(tag) - nslen - 2
        if taglen == 0:
            raise ValueError, u"Empty tag name"
        if nslen > 0:
            ns = python.PyString_FromStringAndSize(c_tag,   nslen)
        tag = python.PyString_FromStringAndSize(c_ns_end+1, taglen)
    elif python.PyString_GET_SIZE(tag) == 0:
        raise ValueError, u"Empty tag name"
    return ns, tag
1362
cdef inline int _pyXmlNameIsValid(name_utf8):
    u"""Apply ``_xmlNameIsValid()`` to the C string of a Python byte string.
    """
    cdef char* c_name = _cstr(name_utf8)
    return _xmlNameIsValid(c_name)
1365
cdef inline int _pyHtmlNameIsValid(name_utf8):
    u"""Apply ``_htmlNameIsValid()`` to the C string of a Python byte string.
    """
    cdef char* c_name = _cstr(name_utf8)
    return _htmlNameIsValid(c_name)
1368
cdef inline int _xmlNameIsValid(char* c_name):
    u"""Return 1 if ``xmlValidateNCName()`` accepts ``c_name`` (i.e. returns
    0 for it), 0 otherwise.
    """
    if tree.xmlValidateNCName(c_name, 0) == 0:
        return 1
    return 0
1371
cdef int _htmlNameIsValid(char* c_name):
    u"""Return 1 if ``c_name`` is usable as an HTML name, 0 otherwise.

    NULL and the empty string are rejected, as is any name that contains
    one of ``& < > / " '`` or one of the whitespace characters tab,
    newline, vertical tab, form feed, carriage return and space.
    """
    cdef char c_current
    if c_name is NULL:
        return 0
    c_current = c_name[0]
    if c_current == c'\0':
        return 0
    while c_current != c'\0':
        if c_current in (c'&', c'<', c'>', c'/', c'"', c"'",
                         c'\t', c'\n', c'\x0B', c'\x0C', c'\r', c' '):
            return 0
        c_name += 1
        c_current = c_name[0]
    return 1
1383
cdef bint _characterReferenceIsValid(char* c_name):
    u"""Check the number part of a character reference.

    Accepts a non-empty sequence of decimal digits, or an ``x`` followed
    by a non-empty sequence of hex digits (upper or lower case).  Returns
    1 for valid input and 0 otherwise.
    """
    cdef bint is_hex
    cdef char c
    is_hex = (c_name[0] == c'x')
    if is_hex:
        # skip the hex marker
        c_name += 1
    if c_name[0] == c'\0':
        # no digits at all
        return 0
    while c_name[0] != c'\0':
        c = c_name[0]
        if c >= c'0' and c <= c'9':
            # decimal digits are always fine
            pass
        elif not is_hex:
            return 0
        elif not ((c >= c'a' and c <= c'f') or (c >= c'A' and c <= c'F')):
            return 0
        c_name += 1
    return 1
1402
cdef int _tagValidOrRaise(tag_utf) except -1:
    u"""Return 0 if the UTF-8 encoded tag name passes ``_pyXmlNameIsValid()``,
    raise ValueError otherwise.
    """
    if _pyXmlNameIsValid(tag_utf):
        return 0
    raise ValueError, u"Invalid tag name %r" % \
        python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', NULL)
1408
cdef int _htmlTagValidOrRaise(tag_utf) except -1:
    u"""Return 0 if the UTF-8 encoded tag name passes
    ``_pyHtmlNameIsValid()``, raise ValueError otherwise.
    """
    if _pyHtmlNameIsValid(tag_utf):
        return 0
    raise ValueError, u"Invalid HTML tag name %r" % \
        python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', NULL)
1414
cdef int _attributeValidOrRaise(name_utf) except -1:
    u"""Return 0 if the UTF-8 encoded attribute name passes
    ``_pyXmlNameIsValid()``, raise ValueError otherwise.
    """
    if _pyXmlNameIsValid(name_utf):
        return 0
    raise ValueError, u"Invalid attribute name %r" % \
        python.PyUnicode_FromEncodedObject(name_utf, 'UTF-8', NULL)
1420
cdef int _prefixValidOrRaise(tag_utf) except -1:
    u"""Return 0 if the UTF-8 encoded namespace prefix passes
    ``_pyXmlNameIsValid()``, raise ValueError otherwise.
    """
    if _pyXmlNameIsValid(tag_utf):
        return 0
    raise ValueError, u"Invalid namespace prefix %r" % \
        python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', NULL)
1426
cdef inline object _namespacedName(xmlNode* c_node):
    u"""Build the ``{namespace}name`` string for a node from the result of
    ``_getNs()`` and the node name.
    """
    cdef char* c_href = _getNs(c_node)
    return _namespacedNameFromNsName(c_href, c_node.name)
1429
cdef object _namespacedNameFromNsName(char* href, char* name):
    u"""Build a Python string of the form ``{href}name``, or just ``name``
    if ``href`` is NULL.

    On Python 3, the result is always a unicode string.  Otherwise, a byte
    string is returned unless ``href`` or ``name`` contain non-ASCII bytes,
    in which case the result is decoded from UTF-8.
    """
    if href is NULL:
        return funicode(name)
    if python.IS_PYTHON3:
        return python.PyUnicode_FromFormat("{%s}%s", href, name)
    qualified = python.PyString_FromFormat("{%s}%s", href, name)
    if not isutf8(href) and not isutf8(name):
        # plain ASCII => keep the byte string
        return qualified
    return python.PyUnicode_FromEncodedObject(qualified, 'UTF-8', NULL)
1441
cdef _getFilenameForFile(source):
    u"""Try to determine a filename or URL for a file-like object.

    Looks at the following attributes of ``source``, in this order:

    * ``name``: returned as absolute path
    * ``geturl``: called, and its result is returned as is
    * ``filename``: returned as absolute path

    Returns None if none of them is available (or not None).
    """
    # file instances have a name attribute
    path = getattr3(source, u'name', None)
    if path is None:
        # urllib2 provides a geturl() method
        url_getter = getattr3(source, u'geturl', None)
        if url_getter is not None:
            return url_getter()
        # gzip file instances have a filename attribute (before Py3k)
        path = getattr3(source, u'filename', None)
        if path is None:
            # can't determine filename
            return None
    return os_path_abspath(path)