src/lxml/parser.pxi

   1 # Parsers for XML and HTML
   2
   3 from lxml.includes cimport xmlparser
   4 from lxml.includes cimport htmlparser
   5
   6
   7 class ParseError(LxmlSyntaxError):
   8     """Raised when an XML document cannot be parsed.
   9
  10     Exists for compatibility with ElementTree 1.3 and later.
     
         The location is stored as ``lineno`` (the line as passed in) and
         ``offset`` (the column minus one).  ``position`` exposes the same
         information as a ``(line, column)`` pair.
  11     """
  12     def __init__(self, message, code, line, column, filename=None):
  13         super(_ParseError, self).__init__(message)
             # compute the offset first, then store both location parts
             offset = column - 1
             self.lineno = line
             self.offset = offset
  15         self.code = code
  16         self.filename = filename
  17
  18     @property
  19     def position(self):
             # translate the stored offset back into a column
             column = self.offset + 1
             return self.lineno, column
  21
  22     @position.setter
  23     def position(self, new_pos):
             # unpack completely before touching any attribute
             line, column = new_pos
             self.lineno = line
  25         self.offset = column - 1
  26
  27 cdef object _ParseError = ParseError
     # Module-level alias of the ParseError class; ParseError.__init__() passes it
     # to super() instead of looking up the class by its own name.
  28
  29
  30 class XMLSyntaxError(ParseError):
  31     """Syntax error while parsing an XML document.
  32     """
         # Adds nothing to ParseError.  _raiseParseError() raises this type when a
         # parse result is rejected for reasons other than an I/O failure.
  33
  34 cdef class ParserError(LxmlError):
  35     """Internal lxml parser error.
  36     """
         # Raised by _ParserContext.prepare() when the parser lock cannot be acquired.
  37
  38
  39 @cython.final
  40 @cython.internal
  41 cdef class _ParserDictionaryContext:
  42     # Global parser context to share the string dictionary.
  43     #
  44     # This class is a delegate singleton!
  45     #
  46     # It creates _ParserDictionaryContext objects for each thread to keep thread state,
  47     # but those must never be used directly.  Always stick to using the static
  48     # __GLOBAL_PARSER_CONTEXT as defined below the class.
  49     #
         # Fields:
         #   _c_dict                   libxml2 string dictionary held by this context;
         #                             released with xmlDictFree() in __dealloc__
         #   _default_parser           default parser stored for the thread this
         #                             context belongs to (set lazily)
         #   _implied_parser_contexts  stack (list) of _ParserContext objects or None,
         #                             maintained by push/popImpliedContext()
  50
  51     cdef tree.xmlDict* _c_dict
  52     cdef _BaseParser _default_parser
  53     cdef list _implied_parser_contexts
  54
  55     def __cinit__(self):
  56         self._c_dict = NULL
  57         self._implied_parser_contexts = []
  58
  59     def __dealloc__(self):
             # give up this context's reference to its dictionary, if it has one
  60         if self._c_dict is not NULL:
  61             xmlparser.xmlDictFree(self._c_dict)
  62
  63     cdef void initMainParserContext(self):
  64         u"""Put the global context into the thread dictionary of the main
  65         thread.  To be called once and only in the main thread."""
  66         thread_dict = python.PyThreadState_GetDict()
             # silently does nothing when no thread state dict is available
  67         if thread_dict is not NULL:
  68             (<dict>thread_dict)[u"_ParserDictionaryContext"] = self
  69
  70     cdef _ParserDictionaryContext _findThreadParserContext(self):
  71         u"Find (or create) the _ParserDictionaryContext object for the current thread"
  72         cdef _ParserDictionaryContext context
  73         thread_dict = python.PyThreadState_GetDict()
  74         if thread_dict is NULL:
                 # no thread state dict => fall back to this instance itself
  75             return self
  76         d = <dict>thread_dict
  77         result = python.PyDict_GetItem(d, u"_ParserDictionaryContext")
  78         if result is not NULL:
                 # a context was already registered for this thread
  79             return <object>result
             # first use in this thread: create a fresh context and register it
             # under the same key that initMainParserContext() uses
  80         context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
  81         d[u"_ParserDictionaryContext"] = context
  82         return context
  83
  84     cdef void setDefaultParser(self, _BaseParser parser):
  85         u"Set the default parser for the current thread"
  86         cdef _ParserDictionaryContext context
  87         context = self._findThreadParserContext()
  88         context._default_parser = parser
  89
  90     cdef _BaseParser getDefaultParser(self):
  91         u"Return (or create) the default parser of the current thread"
  92         cdef _ParserDictionaryContext context
  93         context = self._findThreadParserContext()
  94         if context._default_parser is None:
                 # lazily create this instance's own default parser as a copy of
                 # __DEFAULT_XML_PARSER ...
  95             if self._default_parser is None:
  96                 self._default_parser = __DEFAULT_XML_PARSER._copy()
                 # ... and give any other thread context its own copy of that parser
  97             if context is not self:
  98                 context._default_parser = self._default_parser._copy()
  99         return context._default_parser
 100
 101     cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
 102         u"Return the thread-local dict or create a new one if necessary."
 103         cdef _ParserDictionaryContext context
 104         context = self._findThreadParserContext()
 105         if context._c_dict is NULL:
 106             # thread dict not yet set up => use default or create a new one
 107             if default is not NULL:
                     # adopt the caller's dict as the thread dict and register the
                     # additional reference that the context now holds
 108                 context._c_dict = default
 109                 xmlparser.xmlDictReference(default)
 110                 return default
                 # no default given: make sure this instance has a dict, then
                 # derive a sub-dict from it for any other thread context
 111             if self._c_dict is NULL:
 112                 self._c_dict = xmlparser.xmlDictCreate()
 113             if context is not self:
 114                 context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
 115         return context._c_dict
 116
 117     cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref):
             # Make c_dict_ref[0] point to the current thread's dictionary.
             # The dict found there is offered to _getThreadDict() as the default.
             # If the thread dict turns out to be a different one, the old dict
             # (if any) is freed and a new reference to the thread dict is taken.
 118         c_dict = c_dict_ref[0]
 119         c_thread_dict = self._getThreadDict(c_dict)
 120         if c_dict is c_thread_dict:
 121             return
 122         if c_dict is not NULL:
 123             xmlparser.xmlDictFree(c_dict)
 124         c_dict_ref[0] = c_thread_dict
 125         xmlparser.xmlDictReference(c_thread_dict)
 126
 127     cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt):
 128         u"Assure we always use the same string dictionary."
 129         self.initThreadDictRef(&pctxt.dict)
             # NOTE(review): dictNames presumably tells libxml2 to store names in
             # the dict - confirm against the libxml2 parser context documentation
 130         pctxt.dictNames = 1
 131
 132     cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt):
 133         u"Assure we always use the same string dictionary."
 134         self.initThreadDictRef(&pctxt.dict)
 135
 136     cdef void initDocDict(self, xmlDoc* result):
 137         u"Store dict of last object parsed if no shared dict yet"
 138         # XXX We also free the result dict here if there already was one.
 139         # This case should only occur for new documents with empty dicts,
 140         # otherwise we'd free data that's in use => segfault
 141         self.initThreadDictRef(&result.dict)
 142
 143     cdef _ParserContext findImpliedContext(self):
 144         u"""Return any current implied xml parser context for the current
 145         thread.  This is used when the resolver functions are called
 146         with an xmlParserCtxt that was generated from within libxml2
 147         (i.e. without a _ParserContext) - which happens when parsing
 148         schema and xinclude external references."""
 149         cdef _ParserDictionaryContext context
 150         cdef _ParserContext implied_context
 151
 152         # see if we have a current implied parser
 153         context = self._findThreadParserContext()
 154         if context._implied_parser_contexts:
                 # top of the stack; may itself be None if None was pushed
 155             implied_context = context._implied_parser_contexts[-1]
 156             return implied_context
 157         return None
 158
 159     cdef void pushImpliedContextFromParser(self, _BaseParser parser):
 160         u"Push a new implied context object taken from the parser."
             # a None parser pushes None, which keeps push/pop calls balanced
 161         if parser is not None:
 162             self.pushImpliedContext(parser._getParserContext())
 163         else:
 164             self.pushImpliedContext(None)
 165
 166     cdef void pushImpliedContext(self, _ParserContext parser_context):
 167         u"Push a new implied context object."
 168         cdef _ParserDictionaryContext context
 169         context = self._findThreadParserContext()
 170         context._implied_parser_contexts.append(parser_context)
 171
 172     cdef void popImpliedContext(self):
 173         u"Pop the current implied context object."
 174         cdef _ParserDictionaryContext context
 175         context = self._findThreadParserContext()
 176         context._implied_parser_contexts.pop()
 177
 178 cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
     # Register the singleton in the thread state dict of the thread that runs
     # this top-level code.
 179 __GLOBAL_PARSER_CONTEXT.initMainParserContext()
 180
 181 ############################################################
 182 ## support for Python unicode I/O
 183 ############################################################
 184
 185 # name of Python unicode encoding as known to libxml2
 186 cdef const_char* _UNICODE_ENCODING = NULL
     # Stays NULL unless _setupPythonUnicode() finds an encoding handler for the
     # detected encoding name.
 187
 188 cdef int _setupPythonUnicode() except -1:
 189     u"""Sets _UNICODE_ENCODING to the internal encoding name of Python unicode
 190     strings if libxml2 supports reading native Python unicode.  This depends
 191     on iconv and the local Python installation, so we simply check if we find
 192     a matching encoding handler.
 193     """
         # Always returns 0.  _UNICODE_ENCODING is left untouched (NULL) when the
         # encoding name cannot be determined or libxml2 has no handler for it.
 194     cdef tree.xmlCharEncodingHandler* enchandler
 195     cdef Py_ssize_t l
 196     cdef const_char* enc
         # a 7 character sample ("<test/>") whose raw bytes reveal how
         # Py_UNICODE characters are laid out in memory
 197     cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>']
 198     cdef const_xmlChar* buffer = <const_xmlChar*>uchars
 199     # apparently, libxml2 can't detect UTF-16 on some systems
         # => recognise the two 2-byte layouts by hand from the first four bytes
 200     if (buffer[0] == c'<' and buffer[1] == c'\0' and
 201             buffer[2] == c't' and buffer[3] == c'\0'):
 202         enc = "UTF-16LE"
 203     elif (buffer[0] == c'\0' and buffer[1] == c'<' and
 204             buffer[2] == c'\0' and buffer[3] == c't'):
 205         enc = "UTF-16BE"
 206     else:
 207         # let libxml2 give it a try
 208         enc = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7)
 209         if enc is NULL:
 210             # not my fault, it's YOUR broken system :)
 211             return 0
         # only the existence of a handler matters: it is closed again right away
         # and just the encoding name is remembered
 212     enchandler = tree.xmlFindCharEncodingHandler(enc)
 213     if enchandler is not NULL:
 214         global _UNICODE_ENCODING
 215         tree.xmlCharEncCloseFunc(enchandler)
 216         _UNICODE_ENCODING = enc
 217     return 0
 218
 219 cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
 220     u"Work around bug in libxml2: find iconv name of encoding on our own."
         # Maps the encoding detected by libxml2 for the first `size` bytes of
         # `buffer` to an encoding name, or returns NULL if nothing was detected.
         cdef tree.xmlCharEncoding c_detected
         c_detected = tree.xmlDetectCharEncoding(buffer, size)
         if c_detected == tree.XML_CHAR_ENCODING_UTF16LE:
             # a detected UTF-16LE that starts with FF FE 00 00 is really UTF-32LE
             if size < 4:
                 return "UTF-16LE"
             if (buffer[0] != <const_xmlChar>'\xFF' or
                     buffer[1] != <const_xmlChar>'\xFE' or
                     buffer[2] != 0 or buffer[3] != 0):
                 return "UTF-16LE"
             return "UTF-32LE"  # according to BOM
         if c_detected == tree.XML_CHAR_ENCODING_UTF16BE:
             return "UTF-16BE"
         if c_detected == tree.XML_CHAR_ENCODING_UCS4LE:
             return "UCS-4LE"
         if c_detected == tree.XML_CHAR_ENCODING_UCS4BE:
             return "UCS-4BE"
         if c_detected == tree.XML_CHAR_ENCODING_NONE:
             return NULL
         # everything else: ask libxml2 for the name
         # (returns a constant char*, no need to free it)
         return tree.xmlGetCharEncodingName(c_detected)
 241
 242 _setupPythonUnicode()
     # Top-level call: fills in _UNICODE_ENCODING once, when this code is first run.
 243
 244 ############################################################
 245 ## support for file-like objects
 246 ############################################################
 247
 248 @cython.final
 249 @cython.internal
 250 cdef class _FileReaderContext:
         # Adapter that feeds libxml2 from a Python file(-like) object.
         #
         # Fields:
         #   _filelike               object to read from (set to None once closed)
         #   _encoding               encoding passed to the reader / used to encode
         #                           unicode chunks, or None
         #   _url, _c_url            encoded URL and its C string view (NULL if no URL)
         #   _bytes, _bytes_read     current chunk and read position in it;
         #                           _bytes_read < 0 marks end of input
         #   _exc_context            receives exceptions raised while reading/closing
         #   _close_file_after_read  whether _close_file() actually closes the file
 251     cdef object _filelike
 252     cdef object _encoding
 253     cdef object _url
 254     cdef object _bytes
 255     cdef _ExceptionContext _exc_context
 256     cdef Py_ssize_t _bytes_read
 257     cdef char* _c_url
 258     cdef bint _close_file_after_read
 259
 260     def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
 261         self._exc_context = exc_context
 262         self._filelike = filelike
 263         self._close_file_after_read = close_file
 264         self._encoding = encoding
 265         if url is None:
 266             self._c_url = NULL
 267         else:
 268             url = _encodeFilename(url)
 269             self._c_url = _cstr(url)
             # keep the encoded URL object alive: _c_url points into it
 270         self._url = url
 271         self._bytes  = b''
 272         self._bytes_read = 0
 273
 274     cdef _close_file(self):
             # Close the file-like object once, if closing was requested.
             # Objects without a close() method are silently accepted.
 275         if self._filelike is None or not self._close_file_after_read:
 276             return
 277         try:
 278             close = self._filelike.close
 279         except AttributeError:
 280             close = None
 281         finally:
                 # drop the reference first so that close() is never called twice
 282             self._filelike = None
 283         if close is not None:
 284             close()
 285
 286     cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self):
             # Allocate a libxml2 input buffer that reads from self._filelike,
             # either through the C FILE* behind it or through the Python-level
             # read callback.  Returns NULL if the buffer could not be allocated.
 287         cdef stdio.FILE* c_stream
 288         cdef xmlparser.xmlParserInputBuffer* c_buffer
 289         c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
             if c_buffer is NULL:
                 # allocation failed => nothing to configure, report it to the caller
                 return NULL
 290         c_stream = python.PyFile_AsFile(self._filelike)
 291         if c_stream is NULL:
 292             c_buffer.readcallback  = _readFilelikeParser
 293             c_buffer.context = <python.PyObject*>self
 294         else:
 295             c_buffer.readcallback  = _readFileParser
 296             c_buffer.context = c_stream
 297         return c_buffer
 298
 299     cdef xmlparser.xmlParserInput* _createParserInput(
 300             self, xmlparser.xmlParserCtxt* ctxt):
             # Wrap a new input buffer into a parser input for ctxt.
             # Returns NULL if the input buffer could not be allocated.
 301         cdef xmlparser.xmlParserInputBuffer* c_buffer
 302         c_buffer = self._createParserInputBuffer()
             if c_buffer is NULL:
                 return NULL
 303         return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0)
 304
 305     cdef tree.xmlDtd* _readDtd(self):
             # Parse a DTD from the file-like object.
             # Returns NULL if the input buffer could not be allocated.
 306         cdef xmlparser.xmlParserInputBuffer* c_buffer
 307         c_buffer = self._createParserInputBuffer()
             if c_buffer is NULL:
                 return NULL
 308         with nogil:
 309             return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0)
 310
 311     cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
             # Parse a complete XML or HTML document (depending on ctxt.html) from
             # the file-like object and close the file afterwards.  Exceptions
             # raised while closing are stored in the exception context, never
             # propagated.
 312         cdef xmlDoc* result
 313         cdef char* c_encoding
 314         cdef stdio.FILE* c_stream
 315         cdef xmlparser.xmlInputReadCallback c_read_callback
 316         cdef xmlparser.xmlInputCloseCallback c_close_callback
 317         cdef void* c_callback_context
 318
 319         if self._encoding is None:
 320             c_encoding = NULL
 321         else:
 322             c_encoding = _cstr(self._encoding)
 323
             # same source selection as in _createParserInputBuffer()
 324         c_stream = python.PyFile_AsFile(self._filelike)
 325         if c_stream is NULL:
 326             c_read_callback  = _readFilelikeParser
 327             c_callback_context = <python.PyObject*>self
 328         else:
 329             c_read_callback  = _readFileParser
 330             c_callback_context = c_stream
 331
 332         orig_options = ctxt.options
 333         with nogil:
 334             if ctxt.html:
 335                 result = htmlparser.htmlCtxtReadIO(
 336                         ctxt, c_read_callback, NULL, c_callback_context,
 337                         self._c_url, c_encoding, options)
 338                 if result is not NULL:
                         # discard the document if its names cannot be moved
                         # into the dict
 339                     if _fixHtmlDictNames(ctxt.dict, result) < 0:
 340                         tree.xmlFreeDoc(result)
 341                         result = NULL
 342             else:
 343                 result = xmlparser.xmlCtxtReadIO(
 344                     ctxt, c_read_callback, NULL, c_callback_context,
 345                     self._c_url, c_encoding, options)
 346         ctxt.options = orig_options # work around libxml2 problem
 347         try:
 348             self._close_file()
 349         except:
 350             self._exc_context._store_raised()
 351         finally:
 352             return result  # swallow any exceptions
 353
 354     cdef int copyToBuffer(self, char* c_buffer, int c_requested):
             # Copy up to c_requested bytes of input into c_buffer.
             # Returns the number of bytes copied (0 after end of input), or -1
             # if reading failed; the exception is then stored in the exception
             # context instead of being raised.
 355         cdef int c_byte_count = 0
 356         cdef char* c_start
 357         cdef Py_ssize_t byte_count, remaining
 358         if self._bytes_read < 0:
                 # end of input was already reached in an earlier call
 359             return 0
 360         try:
 361             byte_count = python.PyBytes_GET_SIZE(self._bytes)
 362             remaining  = byte_count - self._bytes_read
 363             while c_requested > remaining:
                     # use up what is left of the current chunk ...
 364                 c_start = _cstr(self._bytes) + self._bytes_read
 365                 cstring_h.memcpy(c_buffer, c_start, remaining)
 366                 c_byte_count += remaining
 367                 c_buffer += remaining
 368                 c_requested -= remaining
 369
                     # ... then fetch the next chunk; unicode chunks get encoded
 370                 self._bytes = self._filelike.read(c_requested)
 371                 if not isinstance(self._bytes, bytes):
 372                     if isinstance(self._bytes, unicode):
 373                         if self._encoding is None:
 374                             self._bytes = (<unicode>self._bytes).encode('utf8')
 375                         else:
 376                             self._bytes = python.PyUnicode_AsEncodedString(
 377                                 self._bytes, _cstr(self._encoding), NULL)
 378                     else:
 379                         self._close_file()
 380                         raise TypeError, \
 381                             u"reading from file-like objects must return byte strings or unicode strings"
 382
 383                 remaining = python.PyBytes_GET_SIZE(self._bytes)
 384                 if remaining == 0:
                         # empty read => end of input: remember it and close
 385                     self._bytes_read = -1
 386                     self._close_file()
 387                     return c_byte_count
 388                 self._bytes_read = 0
 389
 390             if c_requested > 0:
                     # the current chunk holds enough data for the rest of the request
 391                 c_start = _cstr(self._bytes) + self._bytes_read
 392                 cstring_h.memcpy(c_buffer, c_start, c_requested)
 393                 c_byte_count += c_requested
 394                 self._bytes_read += c_requested
 395         except:
 396             c_byte_count = -1
 397             self._exc_context._store_raised()
 398             try:
 399                 self._close_file()
 400             except:
 401                 self._exc_context._store_raised()
 402         finally:
 403             return c_byte_count  # swallow any exceptions
 404
 405 cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with gil:
         # Read callback for Python file-like objects: ctxt is the
         # _FileReaderContext that was registered as callback context.
 406     return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size)
 407
 408 cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil:
         # Read callback for C-level files: ctxt is the FILE* that was registered
         # as callback context; reads up to c_size bytes with fread().
 409     return stdio.fread(c_buffer, 1,  c_size, <stdio.FILE*>ctxt)
 410
 411 ############################################################
 412 ## support for custom document loaders
 413 ############################################################
 414
 415 cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
 416                                                xmlparser.xmlParserCtxt* c_context) with gil:
         # External entity loader that is installed into libxml2 further below.
         # Asks the resolvers of the responsible _ResolverContext for the
         # document identified by c_url / c_pubid and turns their answer into a
         # parser input.  Falls back to the original libxml2 loader
         # (__DEFAULT_ENTITY_LOADER) if there is no context, if nothing was
         # resolved, or if no input could be created.  Returns NULL if a
         # resolver raised (the exception is stored in the context) or if no
         # fallback loader is available.
 417     cdef _ResolverContext context
 418     cdef xmlparser.xmlParserInput* c_input
 419     cdef _InputDocument doc_ref
 420     cdef _FileReaderContext file_context
 421     # if there is no _ParserContext associated with the xmlParserCtxt
 422     # passed, check to see if the thread state object has an implied
 423     # context.
 424     if c_context._private is not NULL:
 425         context = <_ResolverContext>c_context._private
 426     else:
 427         context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()
 428
 429     if context is None:
             # no Python-level context at all => plain libxml2 behaviour
 430         if __DEFAULT_ENTITY_LOADER is NULL:
 431             return NULL
 432         with nogil:
 433             # free the GIL as we might do serious I/O here (e.g. HTTP)
 434             c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
 435         return c_input
 436
 437     try:
 438         if c_url is NULL:
 439             url = None
 440         else:
 441             # parsing a related document (DTD etc.) => UTF-8 encoded URL?
 442             url = _decodeFilename(<const_xmlChar*>c_url)
 443         if c_pubid is NULL:
 444             pubid = None
 445         else:
 446             pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8
 447
 448         doc_ref = context._resolvers.resolve(url, pubid, context)
 449     except:
             # remember the exception in the context and report failure to libxml2
 450         context._store_raised()
 451         return NULL
 452
 453     if doc_ref is not None:
 454         if doc_ref._type == PARSER_DATA_STRING:
                 # in-memory data: the input stream points straight into the
                 # bytes object
 455             data = doc_ref._data_bytes
 456             filename = doc_ref._filename
 457             if not filename:
 458                 filename = None
 459             elif not isinstance(filename, bytes):
 460                 # most likely a text URL
 461                 filename = filename.encode('utf8')
 462                 if not isinstance(filename, bytes):
 463                     filename = None
 464
 465             c_input = xmlparser.xmlNewInputStream(c_context)
 466             if c_input is not NULL:
 467                 if filename is not None:
 468                     c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
 469                 c_input.base = _xcstr(data)
 470                 c_input.length = python.PyBytes_GET_SIZE(data)
 471                 c_input.cur = c_input.base
 472                 c_input.end = c_input.base + c_input.length
 473         elif doc_ref._type == PARSER_DATA_FILENAME:
 474             data = None
 475             c_filename = _cstr(doc_ref._filename)
 476             with nogil:
 477                 # free the GIL as we might do serious I/O here
 478                 c_input = xmlparser.xmlNewInputFromFile(
 479                     c_context, c_filename)
 480         elif doc_ref._type == PARSER_DATA_FILE:
                 # file-like object: reading goes through a _FileReaderContext
 481             file_context = _FileReaderContext(doc_ref._file, context, url,
 482                                               None, doc_ref._close_file)
 483             c_input = file_context._createParserInput(c_context)
 484             data = file_context
 485         else:
 486             data = None
 487             c_input = NULL
 488
             # hand whatever backs the input (bytes or reader object) to the
             # context's storage
             # NOTE(review): presumably this keeps it alive while libxml2 reads
             # from it - confirm against _ResolverContext._storage
 489         if data is not None:
 490             context._storage.add(data)
 491         if c_input is not NULL:
 492             return c_input
 493
         # nothing resolved, or no input could be created => original loader
 494     if __DEFAULT_ENTITY_LOADER is NULL:
 495         return NULL
 496
 497     with nogil:
 498         # free the GIL as we might do serious I/O here (e.g. HTTP)
 499         c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
 500     return c_input
 501
 502 cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
     # remember the loader that libxml2 had installed before ours; _local_resolver()
     # falls back to it
 503 __DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
 504
     # from here on, libxml2 asks _local_resolver() for external entities
 505 xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
 506
 507 ############################################################
 508 ## Parsers
 509 ############################################################
 510
 511 @cython.no_gc_clear  # May have to call "self._validator.disconnect()" on dealloc.
 512 @cython.internal
 513 cdef class _ParserContext(_ResolverContext):
         # Python-level state attached to one libxml2 parser context.
         #
         # Fields:
         #   _error_log    log that collects the errors of the current parser run
         #   _validator    optional schema validation context (None if unused)
         #   _c_ctxt       the libxml2 parser context; freed in __dealloc__
         #   _lock         lock held between prepare() and cleanup()
         #                 (NULL when threading support is disabled)
         #   _doc          document of the current parser run, if any
         #   _collect_ids  flag copied by _copy(); defaults to True
 514     cdef _ErrorLog _error_log
 515     cdef _ParserSchemaValidationContext _validator
 516     cdef xmlparser.xmlParserCtxt* _c_ctxt
 517     cdef python.PyThread_type_lock _lock
 518     cdef _Document _doc
 519     cdef bint _collect_ids
 520
 521     def __cinit__(self):
 522         self._c_ctxt = NULL
 523         self._collect_ids = True
 524         if not config.ENABLE_THREADING:
 525             self._lock = NULL
 526         else:
 527             self._lock = python.PyThread_allocate_lock()
 528         self._error_log = _ErrorLog()
 529
 530     def __dealloc__(self):
 531         if config.ENABLE_THREADING and self._lock is not NULL:
 532             python.PyThread_free_lock(self._lock)
 533             self._lock = NULL
 534         if self._c_ctxt is not NULL:
 535             if <void*>self._validator is not NULL and self._validator is not None:
 536                 # If the parser was not closed correctly (e.g. interrupted iterparse()),
 537                 # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
 538                 # validator plug might still be in place, which will make xmlFreeParserCtxt()
 539                 # crash when trying to xmlFree() a static SAX handler.
 540                 # Thus, make sure we disconnect the handler interceptor here at the latest.
 541                 self._validator.disconnect()
 542             xmlparser.xmlFreeParserCtxt(self._c_ctxt)
 543
 544     cdef _ParserContext _copy(self):
             # Create a new context of the same class that shares the settings
             # of this one (ID collection flag, validator copy, resolver copy)
             # but is not yet bound to a libxml2 parser context.
 545         cdef _ParserContext context
 546         context = self.__class__()
 547         context._collect_ids = self._collect_ids
             if self._validator is not None:
                 # contexts without schema validation have nothing to copy;
                 # the new context then keeps its initial None validator
 548             context._validator = self._validator.copy()
 549         _initParserContext(context, self._resolvers._copy(), NULL)
 550         return context
 551
 552     cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
             # Bind this object and the libxml2 context to each other; the
             # _private back-pointer is what callbacks use to find this object.
 553         self._c_ctxt = c_ctxt
 554         c_ctxt._private = <void*>self
 555
 556     cdef void _resetParserContext(self):
             # Reset the libxml2 context for the next run (HTML or XML flavour).
 557         if self._c_ctxt is not NULL:
 558             if self._c_ctxt.html:
 559                 htmlparser.htmlCtxtReset(self._c_ctxt)
 560                 self._c_ctxt.disableSAX = 0 # work around bug in libxml2
 561             else:
 562                 xmlparser.xmlClearParserCtxt(self._c_ctxt)
 563
 564     cdef int prepare(self) except -1:
             # Set up a parser run: acquire the lock (waiting without the GIL),
             # clear the error log and the document reference, install the
             # structured error callback and connect the validator, if any.
             # Raises ParserError if the lock cannot be acquired.
 565         cdef int result
 566         if config.ENABLE_THREADING and self._lock is not NULL:
 567             with nogil:
 568                 result = python.PyThread_acquire_lock(
 569                     self._lock, python.WAIT_LOCK)
 570             if result == 0:
 571                 raise ParserError, u"parser locking failed"
 572         self._error_log.clear()
 573         self._doc = None
 574         self._c_ctxt.sax.serror = _receiveParserError
 575         if self._validator is not None:
 576             self._validator.connect(self._c_ctxt, self._error_log)
 577         return 0
 578
 579     cdef int cleanup(self) except -1:
             # Undo prepare(): disconnect the validator, reset the libxml2
             # context, clear the resolver state and the document reference,
             # remove the error callback and release the lock.
             try:
 580             if self._validator is not None:
 581                 self._validator.disconnect()
 582             self._resetParserContext()
 583             self.clear()
 584             self._doc = None
 585             self._c_ctxt.sax.serror = NULL
             finally:
                 # Release the lock even if one of the steps above raised.
                 # Otherwise the next prepare() on this context would wait
                 # forever for a lock that nobody gives back.
 586             if config.ENABLE_THREADING and self._lock is not NULL:
 587                 python.PyThread_release_lock(self._lock)
 588         return 0
 589
 590     cdef object _handleParseResult(self, _BaseParser parser,
 591                                    xmlDoc* result, filename):
             # Check the parse result and return it as a Python-level document:
             # the document of this run if it already wraps the resulting C
             # document, a newly created one otherwise.
 592         c_doc = self._handleParseResultDoc(parser, result, filename)
 593         if self._doc is not None and self._doc._c_doc is c_doc:
 594             return self._doc
 595         else:
 596             return _documentFactory(c_doc, parser)
 597
 598     cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
 599                                        xmlDoc* result, filename) except NULL:
             # Check the parse result at the C level, honouring the parser's
             # recover option.  A rejected result is only freed if no document
             # of this run refers to it (self._doc is None).
 600         recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
 601         return _handleParseResult(self, self._c_ctxt, result,
 602                                   filename, recover,
 603                                   free_doc=self._doc is None)
 604
 605 cdef _initParserContext(_ParserContext context,
 606                         _ResolverRegistry resolvers,
 607                         xmlparser.xmlParserCtxt* c_ctxt):
         # Set up the resolver part of the context and, if a libxml2 parser
         # context is given, bind it to the Python-level context as well.
 608     _initResolverContext(context, resolvers)
 609     if c_ctxt is not NULL:
 610         context._initParserContext(c_ctxt)
 611
 612 cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, xmlerror.xmlError* error) with gil:
         # Acquires the GIL and passes the error to the error log of the
         # _ParserContext stored in the _private field.  The caller must make
         # sure that _private is set.
 613     (<_ParserContext>_parser_context._private)._error_log._receive(error)
 614
 615 cdef void _receiveParserError(void* c_context, xmlerror.xmlError* error) nogil:
         # Structured error callback installed by _ParserContext.prepare().
         # Does nothing unless __DEBUG is set.  Errors of parser contexts that
         # carry a Python-level context go to that context's log, everything
         # else goes to _forwardError() without a context.
         cdef xmlparser.xmlParserCtxt* c_parser_ctxt = <xmlparser.xmlParserCtxt*>c_context
         if not __DEBUG:
             return
         if c_parser_ctxt is not NULL and c_parser_ctxt._private is not NULL:
             _forwardParserError(c_parser_ctxt, error)
         else:
             _forwardError(NULL, error)
 619         else:
 620             _forwardParserError(<xmlparser.xmlParserCtxt*>c_context, error)
 621
 622 cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
 623                           _ErrorLog error_log) except -1:
         # Raise the exception that describes why parsing failed.  Never
         # returns normally.
         #
         # - IOError if a filename is known and the last libxml2 error came
         #   from the I/O layer
         # - otherwise an XMLSyntaxError built from the error log if it has entries
         # - otherwise an XMLSyntaxError built from the last libxml2 error
         # - otherwise an XMLSyntaxError without message (internal error code)
 624     if filename is not None and \
 625            ctxt.lastError.domain == xmlerror.XML_FROM_IO:
 626         if isinstance(filename, bytes):
 627             filename = _decodeFilenameWithLength(
 628                 <bytes>filename, len(<bytes>filename))
 629         if ctxt.lastError.message is not NULL:
 630             try:
 631                 message = ctxt.lastError.message.decode('utf-8')
 632             except UnicodeDecodeError:
 633                 # the filename may be in there => play it safe
 634                 message = ctxt.lastError.message.decode('iso8859-1')
 635             message = f"Error reading file '{filename}': {message.strip()}"
 636         else:
 637             message = f"Error reading '{filename}'"
 638         raise IOError, message
 639     elif error_log:
 640         raise error_log._buildParseException(
 641             XMLSyntaxError, u"Document is not well formed")
 642     elif ctxt.lastError.message is not NULL:
             # The message is a C string.  Decode it the same way as in the I/O
             # branch above, so that the exception carries text and the f-string
             # below does not format a byte string.
             try:
                 message = ctxt.lastError.message.decode('utf-8')
             except UnicodeDecodeError:
                 message = ctxt.lastError.message.decode('iso8859-1')
 643         message = message.strip()
 644         code = ctxt.lastError.code
 645         line = ctxt.lastError.line
 646         column = ctxt.lastError.int2
 647         if ctxt.lastError.line > 0:
 648             message = f"line {line}: {message}"
 649         raise XMLSyntaxError(message, code, line, column, filename)
 650     else:
 651         raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
 652                              filename)
 653
 654 cdef xmlDoc* _handleParseResult(_ParserContext context,
 655                                 xmlparser.xmlParserCtxt* c_ctxt,
 656                                 xmlDoc* result, filename,
 657                                 bint recover, bint free_doc) except NULL:
         # Decide whether a parse result is acceptable and return it, or raise.
         #
         # context   Python-level parser context; may be None, in which case all
         #           checks that need it (validator, error log, stored
         #           exceptions) are skipped
         # c_ctxt    libxml2 parser context that produced the result
         # result    parsed document, or NULL if parsing failed
         # filename  used for error messages and as URL of the document
         # recover   accept the document despite errors
         # free_doc  free a rejected document (otherwise it is only dropped)
         #
         # Raises a stored exception of the context, or whatever
         # _raiseParseError() raises, if there is no acceptable result.
 658     cdef bint well_formed
 659     if result is not NULL:
 660         __GLOBAL_PARSER_CONTEXT.initDocDict(result)
 661
         # a left-over document in the parser context that is not the result
         # gets freed here
 662     if c_ctxt.myDoc is not NULL:
 663         if c_ctxt.myDoc is not result:
 664             __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
 665             tree.xmlFreeDoc(c_ctxt.myDoc)
 666         c_ctxt.myDoc = NULL
 667
 668     if result is not NULL:
 669         if (context is not None and context._validator is not None and
 670                 not context._validator.isvalid()):
 671             well_formed = 0  # actually not 'valid', but anyway ...
 672         elif (not c_ctxt.wellFormed and not c_ctxt.html and
 673                 c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
                     context is not None and
 674                 [1 for error in context._error_log
 675                  if error.type == ErrorTypes.ERR_INVALID_CHAR]):
 676             # An encoding error occurred and libxml2 switched from UTF-8
 677             # input to (undecoded) Latin-1, at some arbitrary point in the
 678             # document.  Better raise an error than allowing for a broken
 679             # tree with mixed encodings.
 680             well_formed = 0
 681         elif recover or (c_ctxt.wellFormed and
 682                          c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
 683             well_formed = 1
 684         elif not c_ctxt.replaceEntities and not c_ctxt.validate \
 685                  and context is not None:
 686             # in this mode, we ignore errors about undefined entities
 687             for error in context._error_log.filter_from_errors():
 688                 if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
 689                        error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
 690                     well_formed = 0
 691                     break
 692             else:
 693                 well_formed = 1
 694         else:
 695             well_formed = 0
 696
 697         if not well_formed:
 698             if free_doc:
 699                 tree.xmlFreeDoc(result)
 700             result = NULL
 701
         # an exception stored during parsing (e.g. by a callback) takes
         # precedence over any parse result
 702     if context is not None and context._has_raised():
 703         if result is not NULL:
 704             if free_doc:
 705                 tree.xmlFreeDoc(result)
 706             result = NULL
 707         context._raise_if_stored()
 708
 709     if result is NULL:
 710         if context is not None:
 711             _raiseParseError(c_ctxt, filename, context._error_log)
 712         else:
 713             _raiseParseError(c_ctxt, filename, None)
 714     else:
 715         if result.URL is NULL and filename is not None:
 716             result.URL = tree.xmlStrdup(_xcstr(filename))
 717         if result.encoding is NULL:
 718             result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
 719
         # guard against a None context here as well, like the checks above do
 720     if context is not None and context._validator is not None and \
 721            context._validator._add_default_attributes:
 722         # we currently need to do this here as libxml2 does not
 723         # support inserting default attributes during parse-time
 724         # validation
 725         context._validator.inject_default_attributes(result)
 726
 727     return result
 728
cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
    """
    Move the tag and attribute names of the elements in c_doc to c_dict.

    Returns 0 on success, including the case that c_doc is NULL, and -1
    as soon as fixing the names of one element fails.
    """
    cdef xmlNode* c_node
    if c_doc is NULL:
        return 0
    c_node = c_doc.children
    # NOTE(review): the BEGIN/END macro pair presumably expands to a tree
    # traversal loop that uses the lines in between as its loop body -
    # confirm against the macro definitions.
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0
 740
cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
                                  xmlNode* c_start_node) nogil:
    """
    Move names to the dict, iterating in document order, starting at
    c_start_node. This is used in incremental parsing after each chunk.

    Returns 0 on success (also when c_doc is NULL) and -1 if fixing the
    names of an element fails.  Without a start node, the whole document
    is processed by _fixHtmlDictNames().
    """
    cdef xmlNode* c_node
    if not c_doc:
        return 0
    if not c_start_node:
        return _fixHtmlDictNames(c_dict, c_doc)
    c_node = c_start_node
    # NOTE(review): the BEGIN/END macro pair presumably expands to a tree
    # traversal loop that uses the lines in between as its loop body -
    # confirm against the macro definitions.
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0
 759
 760 cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
 761                                       xmlNode* c_node) nogil:
 762     cdef xmlNode* c_attr
 763     c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
 764     if c_name is NULL:
 765         return -1
 766     if c_name is not c_node.name:
 767         tree.xmlFree(<char*>c_node.name)
 768         c_node.name = c_name
 769     c_attr = <xmlNode*>c_node.properties
 770     while c_attr is not NULL:
 771         c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1)
 772         if c_name is NULL:
 773             return -1
 774         if c_name is not c_attr.name:
 775             tree.xmlFree(<char*>c_attr.name)
 776             c_attr.name = c_name
 777         c_attr = c_attr.next
 778     return 0
 779
@cython.internal
cdef class _BaseParser:
    # Common base class of the XML and HTML parsers.  It holds the parser
    # configuration and the lazily created parser contexts.

    # element class lookup set via set_element_class_lookup(), may be None
    cdef ElementClassLookup _class_lookup
    # registry of custom resolvers, exposed as the 'resolvers' property
    cdef _ResolverRegistry _resolvers
    # context of the normal parse functions, created by _getParserContext()
    cdef _ParserContext _parser_context
    # context of the feed interface, created by _getPushParserContext()
    cdef _ParserContext _push_parser_context
    # libxml2 parser option flags that are passed into each parser run
    cdef int _parse_options
    # selects the HTML parser functions instead of the XML ones
    cdef bint _for_html
    # the next three flags switch off SAX callbacks in _configureSaxContext()
    cdef bint _remove_comments
    cdef bint _remove_pis
    cdef bint _strip_cdata
    # copied into each parser context on creation
    cdef bint _collect_ids
    # optional schema from which a SAX validator is created per context
    cdef XMLSchema _schema
    # encoded base URL as set by _setBaseURL(), may be None
    cdef bytes _filename
    # parser target object as passed into the constructor, may be None
    cdef readonly object target
    # encoding name to use when the input does not specify one, or None
    cdef object _default_encoding
    cdef tuple _events_to_collect  # (event_types, tag)
 797
 798     def __init__(self, int parse_options, bint for_html, XMLSchema schema,
 799                  remove_comments, remove_pis, strip_cdata, collect_ids,
 800                  target, encoding):
 801         cdef tree.xmlCharEncodingHandler* enchandler
 802         cdef int c_encoding
 803         if not isinstance(self, (XMLParser, HTMLParser)):
 804             raise TypeError, u"This class cannot be instantiated"
 805
 806         self._parse_options = parse_options
 807         self.target = target
 808         self._for_html = for_html
 809         self._remove_comments = remove_comments
 810         self._remove_pis = remove_pis
 811         self._strip_cdata = strip_cdata
 812         self._collect_ids = collect_ids
 813         self._schema = schema
 814
 815         self._resolvers = _ResolverRegistry()
 816
 817         if encoding is None:
 818             self._default_encoding = None
 819         else:
 820             encoding = _utf8(encoding)
 821             enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding))
 822             if enchandler is NULL:
 823                 raise LookupError, f"unknown encoding: '{encoding}'"
 824             tree.xmlCharEncCloseFunc(enchandler)
 825             self._default_encoding = encoding
 826
 827     cdef _setBaseURL(self, base_url):
 828         self._filename = _encodeFilename(base_url)
 829
 830     cdef _collectEvents(self, event_types, tag):
 831         if event_types is None:
 832             event_types = ()
 833         else:
 834             event_types = tuple(set(event_types))
 835             _buildParseEventFilter(event_types)  # purely for validation
 836         self._events_to_collect = (event_types, tag)
 837
 838     cdef _ParserContext _getParserContext(self):
 839         cdef xmlparser.xmlParserCtxt* pctxt
 840         if self._parser_context is None:
 841             self._parser_context = self._createContext(self.target, None)
 842             self._parser_context._collect_ids = self._collect_ids
 843             if self._schema is not None:
 844                 self._parser_context._validator = \
 845                     self._schema._newSaxValidator(
 846                         self._parse_options & xmlparser.XML_PARSE_DTDATTR)
 847             pctxt = self._newParserCtxt()
 848             _initParserContext(self._parser_context, self._resolvers, pctxt)
 849             self._configureSaxContext(pctxt)
 850         return self._parser_context
 851
 852     cdef _ParserContext _getPushParserContext(self):
 853         cdef xmlparser.xmlParserCtxt* pctxt
 854         if self._push_parser_context is None:
 855             self._push_parser_context = self._createContext(
 856                 self.target, self._events_to_collect)
 857             self._push_parser_context._collect_ids = self._collect_ids
 858             if self._schema is not None:
 859                 self._push_parser_context._validator = \
 860                     self._schema._newSaxValidator(
 861                         self._parse_options & xmlparser.XML_PARSE_DTDATTR)
 862             pctxt = self._newPushParserCtxt()
 863             _initParserContext(
 864                 self._push_parser_context, self._resolvers, pctxt)
 865             self._configureSaxContext(pctxt)
 866         return self._push_parser_context
 867
 868     cdef _ParserContext _createContext(self, target, events_to_collect):
 869         cdef _SaxParserContext sax_context
 870         if target is not None:
 871             sax_context = _TargetParserContext(self)
 872             (<_TargetParserContext>sax_context)._setTarget(target)
 873         elif events_to_collect:
 874             sax_context = _SaxParserContext(self)
 875         else:
 876             # nothing special to configure
 877             return _ParserContext()
 878         if events_to_collect:
 879             events, tag = events_to_collect
 880             sax_context._setEventFilter(events, tag)
 881         return sax_context
 882
 883     @cython.final
 884     cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
 885         if self._remove_comments:
 886             pctxt.sax.comment = NULL
 887         if self._remove_pis:
 888             pctxt.sax.processingInstruction = NULL
 889         if self._strip_cdata:
 890             # hard switch-off for CDATA nodes => makes them plain text
 891             pctxt.sax.cdataBlock = NULL
 892
    cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
        # Install _receiveParserError as structured error callback of an
        # HTML parser context.  Does nothing if the context has no SAX
        # handler, or if the handler is uninitialised or already marked as
        # SAX2.  Raises MemoryError if the handler copy cannot be allocated.
        cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
        if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
            # need to extend SAX1 context to SAX2 to get proper error reports
            if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
                # the context uses the shared default handler => work on a
                # private copy instead of modifying the global struct
                sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
                if sax is NULL:
                    raise MemoryError()
                # only the size of the default handler is copied; the
                # remaining fields of the larger struct are assigned below
                cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
                                 sizeof(htmlparser.htmlDefaultSAXHandler))
                c_ctxt.sax = sax
            sax.initialized = xmlparser.XML_SAX2_MAGIC
            sax.serror = _receiveParserError
            sax.startElementNs = NULL
            sax.endElementNs = NULL
            sax._private = NULL
        return 0
 910
 911     cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
 912         cdef xmlparser.xmlParserCtxt* c_ctxt
 913         if self._for_html:
 914             c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
 915             if c_ctxt is not NULL:
 916                 self._registerHtmlErrorHandler(c_ctxt)
 917         else:
 918             c_ctxt = xmlparser.xmlNewParserCtxt()
 919         if c_ctxt is NULL:
 920             raise MemoryError
 921         c_ctxt.sax.startDocument = _initSaxDocument
 922         return c_ctxt
 923
 924     cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
 925         cdef xmlparser.xmlParserCtxt* c_ctxt
 926         cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
 927         if self._for_html:
 928             c_ctxt = htmlparser.htmlCreatePushParserCtxt(
 929                 NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
 930             if c_ctxt is not NULL:
 931                 self._registerHtmlErrorHandler(c_ctxt)
 932                 htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
 933         else:
 934             c_ctxt = xmlparser.xmlCreatePushParserCtxt(
 935                 NULL, NULL, NULL, 0, c_filename)
 936             if c_ctxt is not NULL:
 937                 xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
 938         if c_ctxt is NULL:
 939             raise MemoryError()
 940         c_ctxt.sax.startDocument = _initSaxDocument
 941         return c_ctxt
 942
 943     @property
 944     def error_log(self):
 945         """The error log of the last parser run.
 946         """
 947         cdef _ParserContext context
 948         context = self._getParserContext()
 949         return context._error_log.copy()
 950
 951     @property
 952     def resolvers(self):
 953         """The custom resolver registry of this parser."""
 954         return self._resolvers
 955
 956     @property
 957     def version(self):
 958         """The version of the underlying XML parser."""
 959         return u"libxml2 %d.%d.%d" % LIBXML_VERSION
 960
 961     def setElementClassLookup(self, ElementClassLookup lookup = None):
 962         u":deprecated: use ``parser.set_element_class_lookup(lookup)`` instead."
 963         self.set_element_class_lookup(lookup)
 964
 965     def set_element_class_lookup(self, ElementClassLookup lookup = None):
 966         u"""set_element_class_lookup(self, lookup = None)
 967
 968         Set a lookup scheme for element classes generated from this parser.
 969
 970         Reset it by passing None or nothing.
 971         """
 972         self._class_lookup = lookup
 973
 974     cdef _BaseParser _copy(self):
 975         u"Create a new parser with the same configuration."
 976         cdef _BaseParser parser
 977         parser = self.__class__()
 978         parser._parse_options = self._parse_options
 979         parser._for_html = self._for_html
 980         parser._remove_comments = self._remove_comments
 981         parser._remove_pis = self._remove_pis
 982         parser._strip_cdata = self._strip_cdata
 983         parser._filename = self._filename
 984         parser._resolvers = self._resolvers
 985         parser.target = self.target
 986         parser._class_lookup  = self._class_lookup
 987         parser._default_encoding = self._default_encoding
 988         parser._schema = self._schema
 989         parser._events_to_collect = self._events_to_collect
 990         return parser
 991
 992     def copy(self):
 993         u"""copy(self)
 994
 995         Create a new parser with the same configuration.
 996         """
 997         return self._copy()
 998
 999     def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
1000         u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
1001
1002         Creates a new element associated with this parser.
1003         """
1004         return _makeElement(_tag, NULL, None, self, None, None,
1005                             attrib, nsmap, _extra)
1006
1007     # internal parser methods
1008
    cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
        u"""Parse unicode document, share dictionary if possible.

        The internal character buffer of the unicode string is passed to
        libxml2 directly, together with the name of the encoding that
        matches its memory layout.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef int buffer_len, c_kind
        cdef const_char* c_text
        cdef const_char* c_encoding = _UNICODE_ENCODING
        cdef bint is_pep393_string = (
            python.PEP393_ENABLED and python.PyUnicode_IS_READY(utext))
        if is_pep393_string:
            # PEP 393 strings store 1, 2 or 4 bytes per character
            c_text = <const_char*>python.PyUnicode_DATA(utext)
            py_buffer_len = python.PyUnicode_GET_LENGTH(utext)
            c_kind = python.PyUnicode_KIND(utext)
            # turn the character count into a byte count and select the
            # encoding that corresponds to the character width
            if c_kind == 1:
                c_encoding = 'ISO-8859-1'
            elif c_kind == 2:
                py_buffer_len *= 2
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UTF-16BE'  # actually UCS-2
                else:
                    c_encoding = 'UTF-16LE'  # actually UCS-2
            elif c_kind == 4:
                py_buffer_len *= 4
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UCS-4BE'
                else:
                    c_encoding = 'UCS-4LE'
            else:
                assert False, f"Illegal Unicode kind {c_kind}"
        else:
            # legacy unicode buffer, parsed using the default _UNICODE_ENCODING
            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
            c_text = python.PyUnicode_AS_DATA(utext)
        # the libxml2 read functions take the buffer size as a C int
        assert 0 <= py_buffer_len <= limits.INT_MAX
        buffer_len = py_buffer_len

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
            # remember the context options to restore them after the parser run
            orig_options = pctxt.options
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
                    if result is not NULL:
                        # a failed name fix-up discards the parsed document
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
            pctxt.options = orig_options # work around libxml2 problem

            return context._handleParseResultDoc(self, result, None)
        finally:
            # always paired with the prepare() call above
            context.cleanup()
1071
    cdef xmlDoc* _parseDoc(self, char* c_text, int c_len,
                           char* c_filename) except NULL:
        u"""Parse document, share dictionary if possible.

        Parses the c_len bytes at c_text.  Unless the parser has a default
        encoding configured, UTF-32 input is detected here and announced to
        libxml2 explicitly.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef char* c_encoding
        cdef tree.xmlCharEncoding enc
        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

            if self._default_encoding is None:
                # NULL leaves the encoding detection to libxml2
                c_encoding = NULL
                # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs
                # NOTE: limit to problematic cases because it changes character offsets
                if c_len >= 4 and (c_text[0] == '\xFF' and c_text[1] == '\xFE' and
                                   c_text[2] == 0 and c_text[3] == 0):
                    c_encoding = "UTF-32LE"
                    # skip the four BOM bytes
                    c_text += 4
                    c_len -= 4
                elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and
                                     c_text[2] == '\xFE' and c_text[3] == '\xFF'):
                    c_encoding = "UTF-32BE"
                    # skip the four BOM bytes
                    c_text += 4
                    c_len -= 4
                else:
                    # no BOM => try to determine encoding
                    enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len)
                    if enc == tree.XML_CHAR_ENCODING_UCS4LE:
                        c_encoding = 'UTF-32LE'
                    elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
                        c_encoding = 'UTF-32BE'
            else:
                c_encoding = _cstr(self._default_encoding)

            # remember the context options to restore them after the parser run
            orig_options = pctxt.options
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)
                    if result is not NULL:
                        # a failed name fix-up discards the parsed document
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)
            pctxt.options = orig_options # work around libxml2 problem

            return context._handleParseResultDoc(self, result, None)
        finally:
            # always paired with the prepare() call above
            context.cleanup()
1130
1131     cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
1132         cdef _ParserContext context
1133         cdef xmlDoc* result
1134         cdef xmlparser.xmlParserCtxt* pctxt
1135         cdef char* c_encoding
1136         result = NULL
1137
1138         context = self._getParserContext()
1139         context.prepare()
1140         try:
1141             pctxt = context._c_ctxt
1142             __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1143
1144             if self._default_encoding is None:
1145                 c_encoding = NULL
1146             else:
1147                 c_encoding = _cstr(self._default_encoding)
1148
1149             orig_options = pctxt.options
1150             with nogil:
1151                 if self._for_html:
1152                     result = htmlparser.htmlCtxtReadFile(
1153                         pctxt, c_filename, c_encoding, self._parse_options)
1154                     if result is not NULL:
1155                         if _fixHtmlDictNames(pctxt.dict, result) < 0:
1156                             tree.xmlFreeDoc(result)
1157                             result = NULL
1158                 else:
1159                     result = xmlparser.xmlCtxtReadFile(
1160                         pctxt, c_filename, c_encoding, self._parse_options)
1161             pctxt.options = orig_options # work around libxml2 problem
1162
1163             return context._handleParseResultDoc(self, result, c_filename)
1164         finally:
1165             context.cleanup()
1166
1167     cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename,
1168                                        encoding) except NULL:
1169         cdef _ParserContext context
1170         cdef _FileReaderContext file_context
1171         cdef xmlDoc* result
1172         cdef xmlparser.xmlParserCtxt* pctxt
1173         cdef char* c_filename
1174         if not filename:
1175             filename = None
1176
1177         context = self._getParserContext()
1178         context.prepare()
1179         try:
1180             pctxt = context._c_ctxt
1181             __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1182             file_context = _FileReaderContext(
1183                 filelike, context, filename,
1184                 encoding or self._default_encoding)
1185             result = file_context._readDoc(pctxt, self._parse_options)
1186
1187             return context._handleParseResultDoc(
1188                 self, result, filename)
1189         finally:
1190             context.cleanup()
1191
1192
cdef void _initSaxDocument(void* ctxt) with gil:
    # SAX 'startDocument' callback that the parsers install on their
    # libxml2 contexts.  It runs the libxml2 default handler and then sets
    # up the name dictionary and the XML ID hash table of the new document.
    xmlparser.xmlSAX2StartDocument(ctxt)
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    c_doc = c_ctxt.myDoc

    # set up document dict
    if c_doc and c_ctxt.dict and not c_doc.dict:
        # I have no idea why libxml2 disables this - we need it
        c_ctxt.dictNames = 1
        # the document shares the parser dict and holds its own reference
        c_doc.dict = c_ctxt.dict
        xmlparser.xmlDictReference(c_ctxt.dict)

    # set up XML ID hash table
    if c_ctxt._private:
        # _private is taken to hold the _ParserContext of this parser run
        context = <_ParserContext>c_ctxt._private
        if context._collect_ids:
            # keep the global parser dict from filling up with XML IDs
            if c_doc and not c_doc.ids:
                # memory errors are not fatal here
                c_dict = xmlparser.xmlDictCreate()
                if c_dict:
                    c_doc.ids = tree.xmlHashCreateDict(0, c_dict)
                    # NOTE(review): this assumes that the hash table keeps
                    # its own reference to the dict - confirm with libxml2
                    xmlparser.xmlDictFree(c_dict)
                else:
                    # fall back to a hash table without a separate dict
                    c_doc.ids = tree.xmlHashCreate(0)
        else:
            # tell libxml2 not to collect IDs at all
            c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
            if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids):
                # already initialised but empty => clear
                tree.xmlHashFree(c_doc.ids, NULL)
                c_doc.ids = NULL
1224
1225
1226 ############################################################
1227 ## ET feed parser
1228 ############################################################
1229
cdef class _FeedParser(_BaseParser):
    # Adds the feed()/close() interface for incremental parsing.

    # set by the first feed() call of a parser run, reset by close() and
    # when feed() fails
    cdef bint _feed_parser_running
1232
1233     @property
1234     def feed_error_log(self):
1235         """The error log of the last (or current) run of the feed parser.
1236
1237         Note that this is local to the feed parser and thus is
1238         different from what the ``error_log`` property returns.
1239         """
1240         return self._getPushParserContext()._error_log.copy()
1241
    cpdef feed(self, data):
        u"""feed(self, data)

        Feeds data to the parser.  The argument should be an 8-bit string
        buffer containing encoded data, although Unicode is supported as long
        as both string types are not mixed.

        This is the main entry point to the consumer interface of a
        parser.  The parser will parse as much of the XML stream as it
        can on each call.  To finish parsing or to reset the parser,
        call the ``close()`` method.  Both methods may raise
        ParseError if errors occur in the input data.  If an error is
        raised, there is no longer a need to call ``close()``.

        The feed parser interface is independent of the normal parser
        usage.  You can use the same parser as a feed parser and in
        the ``parse()`` function concurrently.
        """
        cdef _ParserContext context
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef const_char* c_data
        cdef const_char* c_encoding
        cdef int buffer_len
        cdef int error
        cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
        # select the raw input buffer and the encoding to announce to libxml2
        if isinstance(data, bytes):
            if self._default_encoding is None:
                c_encoding = NULL
            else:
                c_encoding = self._default_encoding
            c_data = _cstr(data)
            py_buffer_len = python.PyBytes_GET_SIZE(data)
        elif isinstance(data, unicode):
            if _UNICODE_ENCODING is NULL:
                raise ParserError, \
                    u"Unicode parsing is not supported on this platform"
            c_encoding = _UNICODE_ENCODING
            c_data = python.PyUnicode_AS_DATA(data)
            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data)
        else:
            raise TypeError, u"Parsing requires string data"

        context = self._getPushParserContext()
        pctxt = context._c_ctxt
        error = 0
        if not self._feed_parser_running:
            # first chunk of a new document => reset the push parser
            context.prepare()
            self._feed_parser_running = 1
            c_filename = (_cstr(self._filename)
                          if self._filename is not None else NULL)

            # We have to give *mlCtxtResetPush() enough input to figure
            # out the character encoding (at least four bytes),
            # however if we give it all we got, we'll have nothing for
            # *mlParseChunk() and things go wrong.
            buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len
            if self._for_html:
                error = _htmlCtxtResetPush(
                    pctxt, c_data, buffer_len, c_filename, c_encoding,
                    self._parse_options)
            else:
                xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
                error = xmlparser.xmlCtxtResetPush(
                    pctxt, c_data, buffer_len, c_filename, c_encoding)
            # the bytes passed into the reset call are consumed
            py_buffer_len -= buffer_len
            c_data += buffer_len
            if error:
                raise MemoryError()
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding

        fixup_error = 0
        # feed the remaining data in chunks of at most INT_MAX bytes
        while py_buffer_len > 0 and (error == 0 or recover):
            with nogil:
                if py_buffer_len > limits.INT_MAX:
                    buffer_len = limits.INT_MAX
                else:
                    buffer_len = <int>py_buffer_len
                if self._for_html:
                    c_node = pctxt.node  # last node where the parser stopped
                    error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0)
                    # and now for the fun part: move node names to the dict
                    if pctxt.myDoc:
                        fixup_error = _fixHtmlDictSubtreeNames(
                            pctxt.dict, pctxt.myDoc, c_node)
                        # make the document use the dict of the parser context
                        if pctxt.myDoc.dict and pctxt.myDoc.dict is not pctxt.dict:
                            xmlparser.xmlDictFree(pctxt.myDoc.dict)
                            pctxt.myDoc.dict = pctxt.dict
                            xmlparser.xmlDictReference(pctxt.dict)
                else:
                    error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0)
                py_buffer_len -= buffer_len
                c_data += buffer_len

            if fixup_error:
                # the name fix-up only fails when a dict lookup returns NULL
                context.store_exception(MemoryError())

            if context._has_raised():
                # propagate Python exceptions immediately
                recover = 0
                error = 1
                break

            if error and not pctxt.replaceEntities and not pctxt.validate:
                # in this mode, we ignore errors about undefined entities
                for entry in context._error_log.filter_from_errors():
                    if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                           entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                        break
                else:
                    # only undefined entity errors were logged
                    error = 0

        if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised():
            # propagate Python exceptions immediately
            recover = 0
            error = 1

        if fixup_error or not recover and (error or not pctxt.wellFormed):
            # parsing failed => end this parser run and let the result
            # handling raise the appropriate exception
            self._feed_parser_running = 0
            try:
                context._handleParseResult(self, pctxt.myDoc, None)
            finally:
                context.cleanup()
1367
1368     cpdef close(self):
1369         u"""close(self)
1370
1371         Terminates feeding data to this parser.  This tells the parser to
1372         process any remaining data in the feed buffer, and then returns the
1373         root Element of the tree that was parsed.
1374
1375         This method must be called after passing the last chunk of data into
1376         the ``feed()`` method.  It should only be called when using the feed
1377         parser interface, all other usage is undefined.
1378         """
1379         if not self._feed_parser_running:
1380             raise XMLSyntaxError(u"no element found",
1381                                  xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
1382                                  self._filename)
1383
1384         context = self._getPushParserContext()
1385         pctxt = context._c_ctxt
1386
1387         self._feed_parser_running = 0
1388         if self._for_html:
1389             htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
1390         else:
1391             xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
1392
1393         if (pctxt.recovery and not pctxt.disableSAX and
1394                 isinstance(context, _SaxParserContext)):
1395             # apply any left-over 'end' events
1396             (<_SaxParserContext>context).flushEvents()
1397
1398         try:
1399             result = context._handleParseResult(self, pctxt.myDoc, None)
1400         finally:
1401             context.cleanup()
1402
1403         if isinstance(result, _Document):
1404             return (<_Document>result).getroot()
1405         else:
1406             return result
1407
1408
cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
                             const_char* c_data, int buffer_len,
                             const_char* c_filename, const_char* c_encoding,
                             int parse_options) except -1:
    # Reset a push parser context for parsing a new HTML document.
    # Returns the non-zero error code of the XML reset function if that
    # fails, and 0 otherwise.

    # libxml2 lacks an HTML push parser setup function
    error = xmlparser.xmlCtxtResetPush(
        c_ctxt, c_data, buffer_len, c_filename, c_encoding)
    if not error:
        # fix libxml2 setup for HTML
        c_ctxt.progressive = 1
        c_ctxt.html = 1
        htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)
    return error
1426
1427
1428 ############################################################
1429 ## XML parser
1430 ############################################################
1431
# Default libxml2 option flags of the XML parser.
# NOTE(review): the per-flag notes follow the libxml2 option names and the
# defaults documented for XMLParser - confirm against the libxml2 docs.
cdef int _XML_DEFAULT_PARSE_OPTIONS
_XML_DEFAULT_PARSE_OPTIONS = (
    xmlparser.XML_PARSE_NOENT   |  # resolve_entities=True
    xmlparser.XML_PARSE_NOCDATA |  # strip_cdata=True
    xmlparser.XML_PARSE_NONET   |  # no_network=True
    xmlparser.XML_PARSE_COMPACT |  # compact=True
    xmlparser.XML_PARSE_BIG_LINES
    )
1440
cdef class XMLParser(_FeedParser):
    u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True)

    The XML parser.

    A parser can be passed as an additional argument to the various parse
    functions of the lxml API.  There is always a default parser available,
    which can be replaced by calling the global function
    'set_default_parser'.  Creating new parsers is possible at any time
    and does not incur a major run-time overhead.

    The keyword arguments of the constructor mostly mirror the libxml2
    parser configuration.  Requesting DTD validation or attribute default
    values also loads the DTD (unless you additionally provide an XMLSchema
    from which the default attributes can be read).

    Available boolean keyword arguments:

    - attribute_defaults - inject default attributes from DTD or XMLSchema
    - dtd_validation     - validate against a DTD referenced by the document
    - load_dtd           - use DTD for parsing
    - no_network         - prevent network access for related files (default: True)
    - ns_clean           - clean up redundant namespace declarations
    - recover            - try hard to parse through broken XML
    - remove_blank_text  - discard blank text nodes that appear ignorable
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
    - resolve_entities   - replace entities by their text value (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads.  While this is
    not harmful, it is more efficient to use separate parsers.  This does not
    apply to the default parser.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, XMLSchema schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 collect_ids=True, target=None, compact=True):
        cdef int parse_options = _XML_DEFAULT_PARSE_OPTIONS

        # features that are off by default: add their flags on request
        if load_dtd:
            parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if dtd_validation:
            # validating requires the DTD to be loaded as well
            parse_options |= (xmlparser.XML_PARSE_DTDVALID |
                              xmlparser.XML_PARSE_DTDLOAD)
        if attribute_defaults:
            parse_options |= xmlparser.XML_PARSE_DTDATTR
            if schema is None:
                # no schema to take the defaults from => load the DTD
                parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if ns_clean:
            parse_options |= xmlparser.XML_PARSE_NSCLEAN
        if recover:
            parse_options |= xmlparser.XML_PARSE_RECOVER
        if remove_blank_text:
            parse_options |= xmlparser.XML_PARSE_NOBLANKS
        if huge_tree:
            parse_options |= xmlparser.XML_PARSE_HUGE

        # features that are part of _XML_DEFAULT_PARSE_OPTIONS: toggling
        # the (set) flag switches them off again
        if not no_network:
            parse_options ^= xmlparser.XML_PARSE_NONET
        if not compact:
            parse_options ^= xmlparser.XML_PARSE_COMPACT
        if not resolve_entities:
            parse_options ^= xmlparser.XML_PARSE_NOENT
        if not strip_cdata:
            parse_options ^= xmlparser.XML_PARSE_NOCDATA

        # second positional argument: 0 (the HTML parser passes 1 here)
        _BaseParser.__init__(self, parse_options, 0, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
1523
1524
cdef class XMLPullParser(XMLParser):
    """XMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    XML parser that collects parse events in an iterator.

    The events are the same ones that iterparse() produces.  However, this
    parser is non-blocking in the sense that it does not read from a
    file(-like) object all by itself.  Instead, it receives its input
    incrementally as data chunks that are passed to its .feed() method.

    Only Element end events are collected by default.  To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.

    All other keyword arguments are passed on to XMLParser.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        XMLParser.__init__(self, **kwargs)
        self._setBaseURL(base_url)
        # fall back to collecting 'end' events only
        self._collectEvents(('end',) if events is None else events, tag)

    def read_events(self):
        """read_events(self)

        Return the events iterator of the push parser context.
        """
        cdef _SaxParserContext context
        context = <_SaxParserContext?>self._getPushParserContext()
        return context.events_iterator
1552
1553
cdef class ETCompatXMLParser(XMLParser):
    u"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
                 dtd_validation=False, load_dtd=False, no_network=True, \
                 ns_clean=False, recover=False, schema=None, \
                 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
                 remove_comments=True, remove_pis=True, strip_cdata=True, \
                 target=None, compact=True, collect_ids=True)

    An XML parser with an ElementTree compatible default setup.

    See the XMLParser class for details.

    This parser has ``remove_comments`` and ``remove_pis`` enabled by default
    and thus ignores comments and processing instructions.  All keyword
    arguments are passed on to XMLParser unchanged.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=True, remove_pis=True, strip_cdata=True,
                 target=None, compact=True, collect_ids=True):
        # 'collect_ids' is accepted and forwarded so that this class supports
        # the same set of keyword arguments as XMLParser.
        XMLParser.__init__(self,
                           attribute_defaults=attribute_defaults,
                           dtd_validation=dtd_validation,
                           load_dtd=load_dtd,
                           no_network=no_network,
                           ns_clean=ns_clean,
                           recover=recover,
                           remove_blank_text=remove_blank_text,
                           huge_tree=huge_tree,
                           compact=compact,
                           resolve_entities=resolve_entities,
                           remove_comments=remove_comments,
                           remove_pis=remove_pis,
                           strip_cdata=strip_cdata,
                           collect_ids=collect_ids,
                           target=target,
                           encoding=encoding,
                           schema=schema)
1592
# ET 1.2 compatible name
XMLTreeBuilder = ETCompatXMLParser


# Pre-installed XML parser with default options.  It is registered as the
# initial default parser, and set_default_parser() falls back to it when
# it is called without a parser.
cdef XMLParser __DEFAULT_XML_PARSER = XMLParser()

__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
1601
def set_default_parser(_BaseParser parser=None):
    u"""set_default_parser(parser=None)

    Set a default parser for the current thread.  The various parse
    functions of the lxml API use this parser globally whenever they are
    called without a parser.  Calling this function without a parser (or
    with None) resets the default parser to the original configuration.

    Note that the pre-installed default parser is not thread-safe.  Avoid the
    default parser in multi-threaded environments.  You can create a separate
    parser for each thread explicitly or use a parser pool.
    """
    if parser is not None:
        __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser)
    else:
        # reset to the pre-installed parser
        __GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
1617
def get_default_parser():
    u"""get_default_parser()

    Return the default parser as provided by the global parser context.
    """
    default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return default_parser
1621
1622 ############################################################
1623 ## HTML parser
1624 ############################################################
1625
# Option set that every HTMLParser starts from.  HTMLParser.__init__()
# removes these flags again when the corresponding keyword argument
# (recover, no_network, compact) is disabled.
cdef int _HTML_DEFAULT_PARSE_OPTIONS = (
    htmlparser.HTML_PARSE_RECOVER
    | htmlparser.HTML_PARSE_NONET
    | htmlparser.HTML_PARSE_COMPACT
)
1632
cdef class HTMLParser(_FeedParser):
    u"""HTMLParser(self, encoding=None, remove_blank_text=False, \
                   remove_comments=False, remove_pis=False, strip_cdata=True, \
                   no_network=True, target=None, schema: XMLSchema =None, \
                   recover=True, compact=True, default_doctype=True, \
                   collect_ids=True, huge_tree=False)

    The HTML parser.

    This parser allows reading HTML into a normal XML tree.  By
    default, it can read broken (non well-formed) HTML, depending on
    the capabilities of libxml2.  Use the 'recover' option to switch
    this off.

    Available boolean keyword arguments:

    - recover            - try hard to parse through broken HTML (default: True)
    - no_network         - prevent network access for related files (default: True)
    - remove_blank_text  - discard empty text nodes that are ignorable (i.e. not actual text content)
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - default_doctype    - add a default doctype even if it is not found in the HTML (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads for performance
    reasons.
    """
    def __init__(self, *, encoding=None, remove_blank_text=False,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 no_network=True, target=None, XMLSchema schema=None,
                 recover=True, compact=True, default_doctype=True,
                 collect_ids=True, huge_tree=False):
        cdef int parse_options
        parse_options = _HTML_DEFAULT_PARSE_OPTIONS

        # Flags that are part of _HTML_DEFAULT_PARSE_OPTIONS: toggling the
        # (set) flag switches the feature off again.
        if not recover:
            parse_options = parse_options ^ htmlparser.HTML_PARSE_RECOVER
        if not no_network:
            parse_options = parse_options ^ htmlparser.HTML_PARSE_NONET
        if not compact:
            parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT

        # Flags that are not part of the defaults: set them explicitly.
        # NODEFDTD must be OR-ed in; an XOR would only set it as long as
        # the flag never becomes part of the default options.
        if remove_blank_text:
            parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS
        if not default_doctype:
            parse_options = parse_options | htmlparser.HTML_PARSE_NODEFDTD
        if huge_tree:
            # the HTML parser is configured with the XML option constant here
            parse_options = parse_options | xmlparser.XML_PARSE_HUGE

        # second positional argument: 1 (the XML parser passes 0 here)
        _BaseParser.__init__(self, parse_options, 1, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
1692
1693
# Pre-installed HTML parser instance with default options.
cdef HTMLParser __DEFAULT_HTML_PARSER = HTMLParser()
1696
1697
cdef class HTMLPullParser(HTMLParser):
    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    HTML parser that collects parse events in an iterator.

    The events are the same ones that iterparse() produces.  However, this
    parser is non-blocking in the sense that it does not read from a
    file(-like) object all by itself.  Instead, it receives its input
    incrementally as data chunks that are passed to its .feed() method.

    Only Element end events are collected by default.  To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.

    All other keyword arguments are passed on to HTMLParser.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        HTMLParser.__init__(self, **kwargs)
        self._setBaseURL(base_url)
        # fall back to collecting 'end' events only
        self._collectEvents(('end',) if events is None else events, tag)

    def read_events(self):
        """read_events(self)

        Return the events iterator of the push parser context.
        """
        cdef _SaxParserContext context
        context = <_SaxParserContext?>self._getPushParserContext()
        return context.events_iterator
1725
1726
1727 ############################################################
1728 ## helper functions for document creation
1729 ############################################################
1730
cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    # Parse a document from an in-memory string ('unicode' or bytes) and
    # return the resulting xmlDoc.
    #
    # 'parser' may be None, in which case the default parser of the global
    # parser context is used.  'filename' may be empty/None, in which case
    # no file name is passed on to the parser.
    #
    # Input that is larger than INT_MAX bytes, and unicode input that cannot
    # be handed over from its internal buffer, is parsed through a file-like
    # wrapper (StringIO/BytesIO) instead of directly from memory.
    cdef char* c_filename
    cdef char* c_text
    cdef Py_ssize_t c_len
    cdef bint is_pep393_string
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    if not filename:
        c_filename = NULL
    else:
        # c_filename is taken from 'filename_utf', which stays referenced
        # until the end of this function
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)
    if isinstance(text, unicode):
        is_pep393_string = (
            python.PEP393_ENABLED and python.PyUnicode_IS_READY(text))
        # c_len: size of the string data in bytes
        if is_pep393_string:
            c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
        else:
            c_len = python.PyUnicode_GET_DATA_SIZE(text)
        if c_len > limits.INT_MAX:
            # too large => read it through a file-like object
            # NOTE(review): presumably the in-memory code path takes an
            # 'int' length - confirm against _BaseParser._parseUnicodeDoc()
            return (<_BaseParser>parser)._parseDocFromFilelike(
                StringIO(text), filename, None)
        if _UNICODE_ENCODING is NULL and not is_pep393_string:
            # no encoding name available for the internal unicode buffer
            # => recode to UTF-8 bytes and parse those instead
            text = (<unicode>text).encode('utf8')
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, "UTF-8")
        return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
    else:
        # anything that is not 'unicode' is handled as a bytes object here
        c_len = python.PyBytes_GET_SIZE(text)
        if c_len > limits.INT_MAX:
            # too large => read it through a file-like object
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, None)
        c_text = _cstr(text)
        return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)
1765
cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    # Parse the file named by 'filename8' with 'parser', falling back to
    # the default parser of the global parser context if 'parser' is None.
    cdef char* c_filename = _cstr(filename8)
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return parser._parseDocFromFile(c_filename)
1770
cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    # Parse from the file-like object 'source' with 'parser', falling back
    # to the default parser of the global parser context if 'parser' is
    # None.  No encoding override is passed on (third argument is None).
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return parser._parseDocFromFilelike(source, filename, None)
1776
cdef xmlDoc* _newXMLDoc() except NULL:
    # Create a new, empty XML document.  Its encoding is set to "UTF-8"
    # if libxml2 did not set one, and the document is passed through
    # initDocDict() of the global parser context.
    # Raises MemoryError if the document could not be allocated.
    cdef xmlDoc* c_doc = tree.xmlNewDoc(NULL)
    if c_doc is NULL:
        raise MemoryError()
    if c_doc.encoding is NULL:
        c_doc.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
1786
cdef xmlDoc* _newHTMLDoc() except NULL:
    # Create a new, empty HTML document and pass it through initDocDict()
    # of the global parser context.
    # Raises MemoryError if the document could not be allocated.
    cdef xmlDoc* c_doc = tree.htmlNewDoc(NULL, NULL)
    if c_doc is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
1794
cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    # Copy 'c_doc' and pass the copy through initDocDict() of the global
    # parser context.  A non-zero 'recursive' value is handed on to
    # xmlCopyDoc(); that copy runs with the GIL released.  The
    # non-recursive copy is done while holding the GIL.
    # Raises MemoryError if the copy could not be created.
    cdef xmlDoc* c_copy
    if not recursive:
        c_copy = tree.xmlCopyDoc(c_doc, 0)
    else:
        with nogil:
            c_copy = tree.xmlCopyDoc(c_doc, recursive)
    if c_copy is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_copy)
    return c_copy
1806
cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    u"""Recursively copy the document and make c_new_root the new root node.

    The document itself is copied non-recursively; only the subtree below
    c_new_root is copied into the new document (with the GIL released).

    Raises MemoryError if either the document or the node copy fails.
    """
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0) # non recursive
    if result is NULL:
        # must not pass a NULL document on to initDocDict()
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive
    if c_node is NULL:
        # do not leak the new document when the node copy fails
        tree.xmlFreeDoc(result)
        raise MemoryError()
    tree.xmlDocSetRootElement(result, c_node)
    _copyTail(c_new_root.next, c_node)
    return result
1820
cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    u"""Return a recursive copy of c_node that is created for c_doc.

    c_doc itself is not modified.  Raises MemoryError if copying fails.
    """
    cdef xmlNode* c_copy = tree.xmlDocCopyNode(c_node, c_doc, 1)  # 1: recursive
    if c_copy is NULL:
        raise MemoryError()
    # let _copyTail() handle what follows the original node
    _copyTail(c_node.next, c_copy)
    return c_copy
1829
1830
1831 ############################################################
1832 ## API level helper functions for _Document creation
1833 ############################################################
1834
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    # Parse 'source' into a _Document.
    #
    # 'source' may be a string (parsed as a file name/URL), an object with
    # 'getvalue()' and 'tell()' that is positioned at its start (its value
    # is parsed from memory), or any object with a 'read' attribute.
    # Anything else raises TypeError.
    #
    # 'base_url', if not None, replaces the URL of a document that was
    # parsed from a string source and is otherwise used in place of the
    # file name that _getFilenameForFile() reports for 'source'.
    cdef _Document doc
    if _isString(source):
        # parse the file directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        if base_url is None:
            return doc
        # replace the document URL by the requested base URL
        base_url = _encodeFilenameUTF8(base_url)
        if doc._c_doc.URL is not NULL:
            tree.xmlFree(<char*>doc._c_doc.URL)
        doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
        return doc

    url = _getFilenameForFile(source) if base_url is None else base_url

    # StringIO-like object that is still positioned at its start:
    # parse its complete value from memory
    if (hasattr(source, u'getvalue') and hasattr(source, u'tell')
            and source.tell() == 0):
        return _parseMemoryDocument(source.getvalue(), url, parser)

    # Support for file-like objects (urlgrabber.urlopen, ...)
    if hasattr(source, u'read'):
        return _parseFilelikeDocument(source, url, parser)

    raise TypeError(
        f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'")
1863
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    # Parse the file at 'url' and wrap the result in a _Document.
    cdef xmlDoc* c_doc = _parseDocFromFile(url, parser)
    return _documentFactory(c_doc, parser)
1867
cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    # Parse an in-memory string and wrap the result in a _Document.
    #
    # Raises ValueError for input that is neither 'unicode' nor bytes, and
    # for unicode input for which _hasEncodingDeclaration() is true.
    cdef xmlDoc* c_doc
    if not isinstance(text, unicode):
        if not isinstance(text, bytes):
            raise ValueError(u"can only parse strings")
    elif _hasEncodingDeclaration(text):
        raise ValueError(
            u"Unicode strings with encoding declaration are not supported. "
            u"Please use bytes input or XML fragments without declaration.")
    c_doc = _parseDoc(text, url, parser)
    return _documentFactory(c_doc, parser)
1878
cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    # Parse from the file-like object 'source' and wrap the result in a
    # _Document.
    cdef xmlDoc* c_doc = _parseDocFromFilelike(source, url, parser)
    return _documentFactory(c_doc, parser)