src/lxml/parser.pxi

   1 # Parsers for XML and HTML
   2
   3 from lxml.includes cimport xmlparser
   4 from lxml.includes cimport htmlparser
   5
   6
   7 class ParseError(LxmlSyntaxError):
   8     """Syntax error while parsing an XML document.
   9
  10     For compatibility with ElementTree 1.3 and later.
  11     """
  12     def __init__(self, message, code, line, column, filename=None):
  13         super(_ParseError, self).__init__(message)
  14         self.lineno, self.offset = (line, column - 1)
  15         self.code = code
  16         self.filename = filename
  17
  18     @property
  19     def position(self):
  20         return self.lineno, self.offset + 1
  21
  22     @position.setter
  23     def position(self, new_pos):
  24         self.lineno, column = new_pos
  25         self.offset = column - 1
  26
  27 cdef object _ParseError = ParseError
  28
  29
class XMLSyntaxError(ParseError):
    """Syntax error while parsing an XML document.

    A plain subclass of ParseError that adds no behaviour of its own.
    """
  33
cdef class ParserError(LxmlError):
    """Internal lxml parser error.

    Derived from LxmlError directly, i.e. it is not a ParseError.
    """
  37
  38
@cython.final
@cython.internal
cdef class _ParserDictionaryContext:
    # Global parser context to share the string dictionary.
    #
    # This class is a delegate singleton!
    #
    # It creates _ParserDictionaryContext objects for each thread to keep thread state,
    # but those must never be used directly.  Always stick to using the static
    # __GLOBAL_PARSER_CONTEXT as defined below the class.
    #

    # libxml2 string dictionary owned by this context (NULL until first needed)
    cdef tree.xmlDict* _c_dict
    # default parser of this context (None until set or first requested)
    cdef _BaseParser _default_parser
    # stack of implied parser contexts; the last item is the current one
    cdef list _implied_parser_contexts

    def __cinit__(self):
        self._c_dict = NULL
        self._implied_parser_contexts = []

    def __dealloc__(self):
        # drop the reference this context holds on its dictionary, if any
        if self._c_dict is not NULL:
            xmlparser.xmlDictFree(self._c_dict)

    cdef void initMainParserContext(self):
        u"""Put the global context into the thread dictionary of the main
        thread.  To be called once and only in the main thread."""
        thread_dict = python.PyThreadState_GetDict()
        # nothing is registered if no thread state dict is available
        if thread_dict is not NULL:
            (<dict>thread_dict)[u"_ParserDictionaryContext"] = self

    cdef _ParserDictionaryContext _findThreadParserContext(self):
        u"Find (or create) the _ParserDictionaryContext object for the current thread"
        cdef _ParserDictionaryContext context
        thread_dict = python.PyThreadState_GetDict()
        if thread_dict is NULL:
            # no thread state dict available => fall back to this instance
            return self
        d = <dict>thread_dict
        result = python.PyDict_GetItem(d, u"_ParserDictionaryContext")
        if result is not NULL:
            # a context was already registered for this thread
            return <object>result
        # first use in this thread: create a fresh context and register it
        context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
        d[u"_ParserDictionaryContext"] = context
        return context

    cdef void setDefaultParser(self, _BaseParser parser):
        u"Set the default parser for the current thread"
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._default_parser = parser

    cdef _BaseParser getDefaultParser(self):
        u"Return (or create) the default parser of the current thread"
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        if context._default_parser is None:
            # lazily create the default parser of this instance as a copy
            # of __DEFAULT_XML_PARSER
            if self._default_parser is None:
                self._default_parser = __DEFAULT_XML_PARSER._copy()
            # a distinct per-thread context gets its own copy of that parser
            if context is not self:
                context._default_parser = self._default_parser._copy()
        return context._default_parser

    cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
        u"Return the thread-local dict or create a new one if necessary."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        if context._c_dict is NULL:
            # thread dict not yet set up => use default or create a new one
            if default is not NULL:
                # adopt the caller's dict and take a reference for the context
                context._c_dict = default
                xmlparser.xmlDictReference(default)
                return default
            if self._c_dict is NULL:
                self._c_dict = xmlparser.xmlDictCreate()
            if context is not self:
                # per-thread contexts use a sub-dictionary of this instance's dict
                context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
        return context._c_dict

    cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref):
        # Make c_dict_ref[0] point to the dict of the current thread: a
        # differing previous dict is released with xmlDictFree() and a new
        # reference to the thread dict is taken on behalf of the slot.
        c_dict = c_dict_ref[0]
        c_thread_dict = self._getThreadDict(c_dict)
        if c_dict is c_thread_dict:
            # already using the thread dict => nothing to replace
            return
        if c_dict is not NULL:
            xmlparser.xmlDictFree(c_dict)
        c_dict_ref[0] = c_thread_dict
        xmlparser.xmlDictReference(c_thread_dict)

    cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt):
        u"Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)
        pctxt.dictNames = 1

    cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt):
        u"Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)

    cdef void initDocDict(self, xmlDoc* result):
        u"Store dict of last object parsed if no shared dict yet"
        # XXX We also free the result dict here if there already was one.
        # This case should only occur for new documents with empty dicts,
        # otherwise we'd free data that's in use => segfault
        self.initThreadDictRef(&result.dict)

    cdef _ParserContext findImpliedContext(self):
        u"""Return any current implied xml parser context for the current
        thread.  This is used when the resolver functions are called
        with an xmlParserCtxt that was generated from within libxml2
        (i.e. without a _ParserContext) - which happens when parsing
        schema and xinclude external references."""
        cdef _ParserDictionaryContext context
        cdef _ParserContext implied_context

        # see if we have a current implied parser
        context = self._findThreadParserContext()
        if context._implied_parser_contexts:
            # top of the stack is the most recently pushed context
            implied_context = context._implied_parser_contexts[-1]
            return implied_context
        return None

    cdef void pushImpliedContextFromParser(self, _BaseParser parser):
        u"Push a new implied context object taken from the parser."
        # None is pushed as a placeholder when there is no parser
        if parser is not None:
            self.pushImpliedContext(parser._getParserContext())
        else:
            self.pushImpliedContext(None)

    cdef void pushImpliedContext(self, _ParserContext parser_context):
        u"Push a new implied context object."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._implied_parser_contexts.append(parser_context)

    cdef void popImpliedContext(self):
        u"Pop the current implied context object."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._implied_parser_contexts.pop()
 177
# The one delegate instance through which all per-thread parser state is
# looked up.
cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
# Register it in the thread state dict of the thread that runs this
# module-level code.
__GLOBAL_PARSER_CONTEXT.initMainParserContext()
 180
 181 ############################################################
 182 ## support for Python unicode I/O
 183 ############################################################
 184
# name of Python unicode encoding as known to libxml2
# (stays NULL unless _setupPythonUnicode() finds a matching encoding handler)
cdef const_char* _UNICODE_ENCODING = NULL
 187
 188 cdef int _setupPythonUnicode() except -1:
 189     u"""Sets _UNICODE_ENCODING to the internal encoding name of Python unicode
 190     strings if libxml2 supports reading native Python unicode.  This depends
 191     on iconv and the local Python installation, so we simply check if we find
 192     a matching encoding handler.
 193     """
 194     cdef tree.xmlCharEncodingHandler* enchandler
 195     cdef Py_ssize_t l
 196     cdef const_char* enc
 197     cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>']
 198     cdef const_xmlChar* buffer = <const_xmlChar*>uchars
 199     # apparently, libxml2 can't detect UTF-16 on some systems
 200     if (buffer[0] == c'<' and buffer[1] == c'\0' and
 201             buffer[2] == c't' and buffer[3] == c'\0'):
 202         enc = "UTF-16LE"
 203     elif (buffer[0] == c'\0' and buffer[1] == c'<' and
 204             buffer[2] == c'\0' and buffer[3] == c't'):
 205         enc = "UTF-16BE"
 206     else:
 207         # let libxml2 give it a try
 208         enc = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7)
 209         if enc is NULL:
 210             # not my fault, it's YOUR broken system :)
 211             return 0
 212     enchandler = tree.xmlFindCharEncodingHandler(enc)
 213     if enchandler is not NULL:
 214         global _UNICODE_ENCODING
 215         tree.xmlCharEncCloseFunc(enchandler)
 216         _UNICODE_ENCODING = enc
 217     return 0
 218
 219 cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
 220     u"Work around bug in libxml2: find iconv name of encoding on our own."
 221     cdef tree.xmlCharEncoding enc
 222     enc = tree.xmlDetectCharEncoding(buffer, size)
 223     if enc == tree.XML_CHAR_ENCODING_UTF16LE:
 224         if size >= 4 and (buffer[0] == <const_xmlChar>'\xFF' and
 225                           buffer[1] == <const_xmlChar>'\xFE' and
 226                           buffer[2] == 0 and buffer[3] == 0):
 227             return "UTF-32LE"  # according to BOM
 228         else:
 229             return "UTF-16LE"
 230     elif enc == tree.XML_CHAR_ENCODING_UTF16BE:
 231         return "UTF-16BE"
 232     elif enc == tree.XML_CHAR_ENCODING_UCS4LE:
 233         return "UCS-4LE"
 234     elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
 235         return "UCS-4BE"
 236     elif enc == tree.XML_CHAR_ENCODING_NONE:
 237         return NULL
 238     else:
 239         # returns a constant char*, no need to free it
 240         return tree.xmlGetCharEncodingName(enc)
 241
# Run the probe once when this module-level code executes, so that
# _UNICODE_ENCODING gets its value.
_setupPythonUnicode()
 243
 244 ############################################################
 245 ## support for file-like objects
 246 ############################################################
 247
 248 @cython.final
 249 @cython.internal
 250 cdef class _FileReaderContext:
 251     cdef object _filelike
 252     cdef object _encoding
 253     cdef object _url
 254     cdef object _bytes
 255     cdef _ExceptionContext _exc_context
 256     cdef Py_ssize_t _bytes_read
 257     cdef char* _c_url
 258     cdef bint _close_file_after_read
 259
 260     def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
 261         self._exc_context = exc_context
 262         self._filelike = filelike
 263         self._close_file_after_read = close_file
 264         self._encoding = encoding
 265         if url is None:
 266             self._c_url = NULL
 267         else:
 268             url = _encodeFilename(url)
 269             self._c_url = _cstr(url)
 270         self._url = url
 271         self._bytes  = b''
 272         self._bytes_read = 0
 273
 274     cdef _close_file(self):
 275         if self._filelike is None or not self._close_file_after_read:
 276             return
 277         try:
 278             close = self._filelike.close
 279         except AttributeError:
 280             close = None
 281         finally:
 282             self._filelike = None
 283         if close is not None:
 284             close()
 285
 286     cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self):
 287         cdef stdio.FILE* c_stream
 288         cdef xmlparser.xmlParserInputBuffer* c_buffer
 289         c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
 290         c_stream = python.PyFile_AsFile(self._filelike)
 291         if c_stream is NULL:
 292             c_buffer.readcallback  = _readFilelikeParser
 293             c_buffer.context = <python.PyObject*>self
 294         else:
 295             c_buffer.readcallback  = _readFileParser
 296             c_buffer.context = c_stream
 297         return c_buffer
 298
 299     cdef xmlparser.xmlParserInput* _createParserInput(
 300             self, xmlparser.xmlParserCtxt* ctxt):
 301         cdef xmlparser.xmlParserInputBuffer* c_buffer
 302         c_buffer = self._createParserInputBuffer()
 303         return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0)
 304
 305     cdef tree.xmlDtd* _readDtd(self):
 306         cdef xmlparser.xmlParserInputBuffer* c_buffer
 307         c_buffer = self._createParserInputBuffer()
 308         with nogil:
 309             return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0)
 310
 311     cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
 312         cdef xmlDoc* result
 313         cdef char* c_encoding
 314         cdef stdio.FILE* c_stream
 315         cdef xmlparser.xmlInputReadCallback c_read_callback
 316         cdef xmlparser.xmlInputCloseCallback c_close_callback
 317         cdef void* c_callback_context
 318
 319         if self._encoding is None:
 320             c_encoding = NULL
 321         else:
 322             c_encoding = _cstr(self._encoding)
 323
 324         c_stream = python.PyFile_AsFile(self._filelike)
 325         if c_stream is NULL:
 326             c_read_callback  = _readFilelikeParser
 327             c_callback_context = <python.PyObject*>self
 328         else:
 329             c_read_callback  = _readFileParser
 330             c_callback_context = c_stream
 331
 332         orig_options = ctxt.options
 333         with nogil:
 334             if ctxt.html:
 335                 result = htmlparser.htmlCtxtReadIO(
 336                         ctxt, c_read_callback, NULL, c_callback_context,
 337                         self._c_url, c_encoding, options)
 338                 if result is not NULL:
 339                     if _fixHtmlDictNames(ctxt.dict, result) < 0:
 340                         tree.xmlFreeDoc(result)
 341                         result = NULL
 342             else:
 343                 result = xmlparser.xmlCtxtReadIO(
 344                     ctxt, c_read_callback, NULL, c_callback_context,
 345                     self._c_url, c_encoding, options)
 346         ctxt.options = orig_options # work around libxml2 problem
 347         try:
 348             self._close_file()
 349         except:
 350             self._exc_context._store_raised()
 351         finally:
 352             return result  # swallow any exceptions
 353
 354     cdef int copyToBuffer(self, char* c_buffer, int c_requested):
 355         cdef int c_byte_count = 0
 356         cdef char* c_start
 357         cdef Py_ssize_t byte_count, remaining
 358         if self._bytes_read < 0:
 359             return 0
 360         try:
 361             byte_count = python.PyBytes_GET_SIZE(self._bytes)
 362             remaining  = byte_count - self._bytes_read
 363             while c_requested > remaining:
 364                 c_start = _cstr(self._bytes) + self._bytes_read
 365                 cstring_h.memcpy(c_buffer, c_start, remaining)
 366                 c_byte_count += remaining
 367                 c_buffer += remaining
 368                 c_requested -= remaining
 369
 370                 self._bytes = self._filelike.read(c_requested)
 371                 if not isinstance(self._bytes, bytes):
 372                     if isinstance(self._bytes, unicode):
 373                         if self._encoding is None:
 374                             self._bytes = (<unicode>self._bytes).encode('utf8')
 375                         else:
 376                             self._bytes = python.PyUnicode_AsEncodedString(
 377                                 self._bytes, _cstr(self._encoding), NULL)
 378                     else:
 379                         self._close_file()
 380                         raise TypeError, \
 381                             u"reading from file-like objects must return byte strings or unicode strings"
 382
 383                 remaining = python.PyBytes_GET_SIZE(self._bytes)
 384                 if remaining == 0:
 385                     self._bytes_read = -1
 386                     self._close_file()
 387                     return c_byte_count
 388                 self._bytes_read = 0
 389
 390             if c_requested > 0:
 391                 c_start = _cstr(self._bytes) + self._bytes_read
 392                 cstring_h.memcpy(c_buffer, c_start, c_requested)
 393                 c_byte_count += c_requested
 394                 self._bytes_read += c_requested
 395         except:
 396             c_byte_count = -1
 397             self._exc_context._store_raised()
 398             try:
 399                 self._close_file()
 400             except:
 401                 self._exc_context._store_raised()
 402         finally:
 403             return c_byte_count  # swallow any exceptions
 404
 405 cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with gil:
 406     return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size)
 407
 408 cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil:
 409     return stdio.fread(c_buffer, 1,  c_size, <stdio.FILE*>ctxt)
 410
 411 ############################################################
 412 ## support for custom document loaders
 413 ############################################################
 414
# External entity loader that dispatches to the Python-level resolvers.
#
# Looks up the resolver context for 'c_context', asks its resolver registry
# for the document identified by 'c_url' / 'c_pubid' and converts the answer
# into a libxml2 parser input.  Falls back to __DEFAULT_ENTITY_LOADER (if
# there is one) when no resolver context is found, when the resolvers return
# None or when no parser input could be created.  Returns NULL if resolving
# raised an exception, which is then stored in the resolver context.
cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
                                               xmlparser.xmlParserCtxt* c_context) with gil:
    cdef _ResolverContext context
    cdef xmlparser.xmlParserInput* c_input
    cdef _InputDocument doc_ref
    cdef _FileReaderContext file_context
    # if there is no _ParserContext associated with the xmlParserCtxt
    # passed, check to see if the thread state object has an implied
    # context.
    if c_context._private is not NULL:
        context = <_ResolverContext>c_context._private
    else:
        context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()

    if context is None:
        # no Python-level context at all => leave it to the default loader
        if __DEFAULT_ENTITY_LOADER is NULL:
            return NULL
        with nogil:
            # free the GIL as we might do serious I/O here (e.g. HTTP)
            c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
        return c_input

    try:
        if c_url is NULL:
            url = None
        else:
            # parsing a related document (DTD etc.) => UTF-8 encoded URL?
            url = _decodeFilename(<const_xmlChar*>c_url)
        if c_pubid is NULL:
            pubid = None
        else:
            pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8

        doc_ref = context._resolvers.resolve(url, pubid, context)
    except:
        context._store_raised()
        return NULL

    if doc_ref is not None:
        if doc_ref._type == PARSER_DATA_STRING:
            # in-memory data: let the input stream point into the bytes object
            data = doc_ref._data_bytes
            filename = doc_ref._filename
            if not filename:
                filename = None
            elif not isinstance(filename, bytes):
                # most likely a text URL
                filename = filename.encode('utf8')
                if not isinstance(filename, bytes):
                    filename = None

            c_input = xmlparser.xmlNewInputStream(c_context)
            if c_input is not NULL:
                if filename is not None:
                    c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
                c_input.base = _xcstr(data)
                c_input.length = python.PyBytes_GET_SIZE(data)
                c_input.cur = c_input.base
                c_input.end = c_input.base + c_input.length
        elif doc_ref._type == PARSER_DATA_FILENAME:
            data = None
            c_filename = _cstr(doc_ref._filename)
            with nogil:
                # free the GIL as we might do serious I/O here
                c_input = xmlparser.xmlNewInputFromFile(
                    c_context, c_filename)
        elif doc_ref._type == PARSER_DATA_FILE:
            # file-like object: read through a _FileReaderContext
            file_context = _FileReaderContext(doc_ref._file, context, url,
                                              None, doc_ref._close_file)
            c_input = file_context._createParserInput(c_context)
            data = file_context
        else:
            data = None
            c_input = NULL

        # hand the data object to the context's storage
        # (presumably to keep it alive while libxml2 reads from it - confirm)
        if data is not None:
            context._storage.add(data)
        if c_input is not NULL:
            return c_input

    # nothing usable came out of the resolvers => try the default loader
    if __DEFAULT_ENTITY_LOADER is NULL:
        return NULL

    with nogil:
        # free the GIL as we might do serious I/O here (e.g. HTTP)
        c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
    return c_input
 501
# The external entity loader that libxml2 reports when this module-level
# code runs; _local_resolver() falls back to it.
cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
 504
 505
 506 cdef xmlparser.xmlExternalEntityLoader _register_document_loader() nogil:
 507     cdef xmlparser.xmlExternalEntityLoader old = xmlparser.xmlGetExternalEntityLoader()
 508     xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
 509     return old
 510
# Make 'old' libxml2's external entity loader again, e.g. the value that
# _register_document_loader() returned.
cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) nogil:
    xmlparser.xmlSetExternalEntityLoader(old)
 513
 514
 515 ############################################################
 516 ## Parsers
 517 ############################################################
 518
 519 @cython.no_gc_clear  # May have to call "self._validator.disconnect()" on dealloc.
 520 @cython.internal
 521 cdef class _ParserContext(_ResolverContext):
 522     cdef _ErrorLog _error_log
 523     cdef _ParserSchemaValidationContext _validator
 524     cdef xmlparser.xmlParserCtxt* _c_ctxt
 525     cdef xmlparser.xmlExternalEntityLoader _orig_loader
 526     cdef python.PyThread_type_lock _lock
 527     cdef _Document _doc
 528     cdef bint _collect_ids
 529
 530     def __cinit__(self):
 531         self._c_ctxt = NULL
 532         self._collect_ids = True
 533         if not config.ENABLE_THREADING:
 534             self._lock = NULL
 535         else:
 536             self._lock = python.PyThread_allocate_lock()
 537         self._error_log = _ErrorLog()
 538
 539     def __dealloc__(self):
 540         if config.ENABLE_THREADING and self._lock is not NULL:
 541             python.PyThread_free_lock(self._lock)
 542             self._lock = NULL
 543         if self._c_ctxt is not NULL:
 544             if <void*>self._validator is not NULL and self._validator is not None:
 545                 # If the parser was not closed correctly (e.g. interrupted iterparse()),
 546                 # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
 547                 # validator plug might still be in place, which will make xmlFreeParserCtxt()
 548                 # crash when trying to xmlFree() a static SAX handler.
 549                 # Thus, make sure we disconnect the handler interceptor here at the latest.
 550                 self._validator.disconnect()
 551             xmlparser.xmlFreeParserCtxt(self._c_ctxt)
 552
 553     cdef _ParserContext _copy(self):
 554         cdef _ParserContext context
 555         context = self.__class__()
 556         context._collect_ids = self._collect_ids
 557         context._validator = self._validator.copy()
 558         _initParserContext(context, self._resolvers._copy(), NULL)
 559         return context
 560
 561     cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
 562         self._c_ctxt = c_ctxt
 563         c_ctxt._private = <void*>self
 564
 565     cdef void _resetParserContext(self):
 566         if self._c_ctxt is not NULL:
 567             if self._c_ctxt.html:
 568                 htmlparser.htmlCtxtReset(self._c_ctxt)
 569                 self._c_ctxt.disableSAX = 0 # work around bug in libxml2
 570             else:
 571                 xmlparser.xmlClearParserCtxt(self._c_ctxt)
 572
 573     cdef int prepare(self, bint set_document_loader=True) except -1:
 574         cdef int result
 575         if config.ENABLE_THREADING and self._lock is not NULL:
 576             with nogil:
 577                 result = python.PyThread_acquire_lock(
 578                     self._lock, python.WAIT_LOCK)
 579             if result == 0:
 580                 raise ParserError, u"parser locking failed"
 581         self._error_log.clear()
 582         self._doc = None
 583         self._c_ctxt.sax.serror = _receiveParserError
 584         self._orig_loader = _register_document_loader() if set_document_loader else NULL
 585         if self._validator is not None:
 586             self._validator.connect(self._c_ctxt, self._error_log)
 587         return 0
 588
 589     cdef int cleanup(self) except -1:
 590         if self._orig_loader is not NULL:
 591             _reset_document_loader(self._orig_loader)
 592         try:
 593             if self._validator is not None:
 594                 self._validator.disconnect()
 595             self._resetParserContext()
 596             self.clear()
 597             self._doc = None
 598             self._c_ctxt.sax.serror = NULL
 599         finally:
 600             if config.ENABLE_THREADING and self._lock is not NULL:
 601                 python.PyThread_release_lock(self._lock)
 602         return 0
 603
 604     cdef object _handleParseResult(self, _BaseParser parser,
 605                                    xmlDoc* result, filename):
 606         c_doc = self._handleParseResultDoc(parser, result, filename)
 607         if self._doc is not None and self._doc._c_doc is c_doc:
 608             return self._doc
 609         else:
 610             return _documentFactory(c_doc, parser)
 611
 612     cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
 613                                        xmlDoc* result, filename) except NULL:
 614         recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
 615         return _handleParseResult(self, self._c_ctxt, result,
 616                                   filename, recover,
 617                                   free_doc=self._doc is None)
 618
 619 cdef _initParserContext(_ParserContext context,
 620                         _ResolverRegistry resolvers,
 621                         xmlparser.xmlParserCtxt* c_ctxt):
 622     _initResolverContext(context, resolvers)
 623     if c_ctxt is not NULL:
 624         context._initParserContext(c_ctxt)
 625
 626 cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, xmlerror.xmlError* error) with gil:
 627     (<_ParserContext>_parser_context._private)._error_log._receive(error)
 628
 629 cdef void _receiveParserError(void* c_context, xmlerror.xmlError* error) nogil:
 630     if __DEBUG:
 631         if c_context is NULL or (<xmlparser.xmlParserCtxt*>c_context)._private is NULL:
 632             _forwardError(NULL, error)
 633         else:
 634             _forwardParserError(<xmlparser.xmlParserCtxt*>c_context, error)
 635
 636 cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
 637                           _ErrorLog error_log) except -1:
 638     if filename is not None and \
 639            ctxt.lastError.domain == xmlerror.XML_FROM_IO:
 640         if isinstance(filename, bytes):
 641             filename = _decodeFilenameWithLength(
 642                 <bytes>filename, len(<bytes>filename))
 643         if ctxt.lastError.message is not NULL:
 644             try:
 645                 message = ctxt.lastError.message.decode('utf-8')
 646             except UnicodeDecodeError:
 647                 # the filename may be in there => play it safe
 648                 message = ctxt.lastError.message.decode('iso8859-1')
 649             message = f"Error reading file '{filename}': {message.strip()}"
 650         else:
 651             message = f"Error reading '{filename}'"
 652         raise IOError, message
 653     elif error_log:
 654         raise error_log._buildParseException(
 655             XMLSyntaxError, u"Document is not well formed")
 656     elif ctxt.lastError.message is not NULL:
 657         message = ctxt.lastError.message.strip()
 658         code = ctxt.lastError.code
 659         line = ctxt.lastError.line
 660         column = ctxt.lastError.int2
 661         if ctxt.lastError.line > 0:
 662             message = f"line {line}: {message}"
 663         raise XMLSyntaxError(message, code, line, column, filename)
 664     else:
 665         raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
 666                              filename)
 667
 668 cdef xmlDoc* _handleParseResult(_ParserContext context,
 669                                 xmlparser.xmlParserCtxt* c_ctxt,
 670                                 xmlDoc* result, filename,
 671                                 bint recover, bint free_doc) except NULL:
 672     cdef bint well_formed
 673     if result is not NULL:
 674         __GLOBAL_PARSER_CONTEXT.initDocDict(result)
 675
 676     if c_ctxt.myDoc is not NULL:
 677         if c_ctxt.myDoc is not result:
 678             __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
 679             tree.xmlFreeDoc(c_ctxt.myDoc)
 680         c_ctxt.myDoc = NULL
 681
 682     if result is not NULL:
 683         if (context._validator is not None and
 684                 not context._validator.isvalid()):
 685             well_formed = 0  # actually not 'valid', but anyway ...
 686         elif (not c_ctxt.wellFormed and not c_ctxt.html and
 687                 c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
 688                 [1 for error in context._error_log
 689                  if error.type == ErrorTypes.ERR_INVALID_CHAR]):
 690             # An encoding error occurred and libxml2 switched from UTF-8
 691             # input to (undecoded) Latin-1, at some arbitrary point in the
 692             # document.  Better raise an error than allowing for a broken
 693             # tree with mixed encodings.
 694             well_formed = 0
 695         elif recover or (c_ctxt.wellFormed and
 696                          c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
 697             well_formed = 1
 698         elif not c_ctxt.replaceEntities and not c_ctxt.validate \
 699                  and context is not None:
 700             # in this mode, we ignore errors about undefined entities
 701             for error in context._error_log.filter_from_errors():
 702                 if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
 703                        error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
 704                     well_formed = 0
 705                     break
 706             else:
 707                 well_formed = 1
 708         else:
 709             well_formed = 0
 710
 711         if not well_formed:
 712             if free_doc:
 713                 tree.xmlFreeDoc(result)
 714             result = NULL
 715
 716     if context is not None and context._has_raised():
 717         if result is not NULL:
 718             if free_doc:
 719                 tree.xmlFreeDoc(result)
 720             result = NULL
 721         context._raise_if_stored()
 722
 723     if result is NULL:
 724         if context is not None:
 725             _raiseParseError(c_ctxt, filename, context._error_log)
 726         else:
 727             _raiseParseError(c_ctxt, filename, None)
 728     else:
 729         if result.URL is NULL and filename is not None:
 730             result.URL = tree.xmlStrdup(_xcstr(filename))
 731         if result.encoding is NULL:
 732             result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
 733
 734     if context._validator is not None and \
 735            context._validator._add_default_attributes:
 736         # we currently need to do this here as libxml2 does not
 737         # support inserting default attributes during parse-time
 738         # validation
 739         context._validator.inject_default_attributes(result)
 740
 741     return result
 742
 743 cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
 744     cdef xmlNode* c_node
 745     if c_doc is NULL:
 746         return 0
 747     c_node = c_doc.children
 748     tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
 749     if c_node.type == tree.XML_ELEMENT_NODE:
 750         if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
 751             return -1
 752     tree.END_FOR_EACH_ELEMENT_FROM(c_node)
 753     return 0
 754
 755 cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
 756                                   xmlNode* c_start_node) nogil:
 757     """
 758     Move names to the dict, iterating in document order, starting at
 759     c_start_node. This is used in incremental parsing after each chunk.
 760     """
 761     cdef xmlNode* c_node
 762     if not c_doc:
 763         return 0
 764     if not c_start_node:
 765         return _fixHtmlDictNames(c_dict, c_doc)
 766     c_node = c_start_node
 767     tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
 768     if c_node.type == tree.XML_ELEMENT_NODE:
 769         if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
 770             return -1
 771     tree.END_FOR_EACH_ELEMENT_FROM(c_node)
 772     return 0
 773
 774 cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
 775                                       xmlNode* c_node) nogil:
 776     cdef xmlNode* c_attr
 777     c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
 778     if c_name is NULL:
 779         return -1
 780     if c_name is not c_node.name:
 781         tree.xmlFree(<char*>c_node.name)
 782         c_node.name = c_name
 783     c_attr = <xmlNode*>c_node.properties
 784     while c_attr is not NULL:
 785         c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1)
 786         if c_name is NULL:
 787             return -1
 788         if c_name is not c_attr.name:
 789             tree.xmlFree(<char*>c_attr.name)
 790             c_attr.name = c_name
 791         c_attr = c_attr.next
 792     return 0
 793
@cython.internal
cdef class _BaseParser:
    # Common base class of the XML and HTML parsers.  It keeps the parser
    # configuration and lazily creates the libxml2 parser contexts.

    # element class lookup, set via set_element_class_lookup()
    cdef ElementClassLookup _class_lookup
    # resolver registry, exposed through the 'resolvers' property
    cdef _ResolverRegistry _resolvers
    # context used by the _parse*() methods, created on first use
    cdef _ParserContext _parser_context
    # separate context used by the feed parser interface, created on first use
    cdef _ParserContext _push_parser_context
    # libxml2 parser option flags that are passed into each parser run
    cdef int _parse_options
    # selects the HTML parser functions of libxml2 instead of the XML ones
    cdef bint _for_html
    # these three switch off the corresponding SAX callbacks,
    # see _configureSaxContext()
    cdef bint _remove_comments
    cdef bint _remove_pis
    cdef bint _strip_cdata
    # copied into each parser context, evaluated in _initSaxDocument()
    cdef bint _collect_ids
    # if set, used to create a SAX validator for each parser context
    cdef XMLSchema _schema
    # encoded base URL, set by _setBaseURL()
    cdef bytes _filename
    # parser target object as passed into __init__() (None if unused)
    cdef readonly object target
    # encoding name as returned by _utf8(), or None
    cdef object _default_encoding
    cdef tuple _events_to_collect  # (event_types, tag)
 811
 812     def __init__(self, int parse_options, bint for_html, XMLSchema schema,
 813                  remove_comments, remove_pis, strip_cdata, collect_ids,
 814                  target, encoding):
 815         cdef tree.xmlCharEncodingHandler* enchandler
 816         cdef int c_encoding
 817         if not isinstance(self, (XMLParser, HTMLParser)):
 818             raise TypeError, u"This class cannot be instantiated"
 819
 820         self._parse_options = parse_options
 821         self.target = target
 822         self._for_html = for_html
 823         self._remove_comments = remove_comments
 824         self._remove_pis = remove_pis
 825         self._strip_cdata = strip_cdata
 826         self._collect_ids = collect_ids
 827         self._schema = schema
 828
 829         self._resolvers = _ResolverRegistry()
 830
 831         if encoding is None:
 832             self._default_encoding = None
 833         else:
 834             encoding = _utf8(encoding)
 835             enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding))
 836             if enchandler is NULL:
 837                 raise LookupError, f"unknown encoding: '{encoding}'"
 838             tree.xmlCharEncCloseFunc(enchandler)
 839             self._default_encoding = encoding
 840
 841     cdef _setBaseURL(self, base_url):
 842         self._filename = _encodeFilename(base_url)
 843
 844     cdef _collectEvents(self, event_types, tag):
 845         if event_types is None:
 846             event_types = ()
 847         else:
 848             event_types = tuple(set(event_types))
 849             _buildParseEventFilter(event_types)  # purely for validation
 850         self._events_to_collect = (event_types, tag)
 851
 852     cdef _ParserContext _getParserContext(self):
 853         cdef xmlparser.xmlParserCtxt* pctxt
 854         if self._parser_context is None:
 855             self._parser_context = self._createContext(self.target, None)
 856             self._parser_context._collect_ids = self._collect_ids
 857             if self._schema is not None:
 858                 self._parser_context._validator = \
 859                     self._schema._newSaxValidator(
 860                         self._parse_options & xmlparser.XML_PARSE_DTDATTR)
 861             pctxt = self._newParserCtxt()
 862             _initParserContext(self._parser_context, self._resolvers, pctxt)
 863             self._configureSaxContext(pctxt)
 864         return self._parser_context
 865
 866     cdef _ParserContext _getPushParserContext(self):
 867         cdef xmlparser.xmlParserCtxt* pctxt
 868         if self._push_parser_context is None:
 869             self._push_parser_context = self._createContext(
 870                 self.target, self._events_to_collect)
 871             self._push_parser_context._collect_ids = self._collect_ids
 872             if self._schema is not None:
 873                 self._push_parser_context._validator = \
 874                     self._schema._newSaxValidator(
 875                         self._parse_options & xmlparser.XML_PARSE_DTDATTR)
 876             pctxt = self._newPushParserCtxt()
 877             _initParserContext(
 878                 self._push_parser_context, self._resolvers, pctxt)
 879             self._configureSaxContext(pctxt)
 880         return self._push_parser_context
 881
 882     cdef _ParserContext _createContext(self, target, events_to_collect):
 883         cdef _SaxParserContext sax_context
 884         if target is not None:
 885             sax_context = _TargetParserContext(self)
 886             (<_TargetParserContext>sax_context)._setTarget(target)
 887         elif events_to_collect:
 888             sax_context = _SaxParserContext(self)
 889         else:
 890             # nothing special to configure
 891             return _ParserContext()
 892         if events_to_collect:
 893             events, tag = events_to_collect
 894             sax_context._setEventFilter(events, tag)
 895         return sax_context
 896
 897     @cython.final
 898     cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
 899         if self._remove_comments:
 900             pctxt.sax.comment = NULL
 901         if self._remove_pis:
 902             pctxt.sax.processingInstruction = NULL
 903         if self._strip_cdata:
 904             # hard switch-off for CDATA nodes => makes them plain text
 905             pctxt.sax.cdataBlock = NULL
 906
    cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
        # Install _receiveParserError as structured error handler of the
        # given parser context.  Nothing is changed if the context has no
        # SAX handler, if the handler is not initialised or if it is
        # already marked as SAX2 handler.
        # Returns 0, raises MemoryError if the handler cannot be copied.
        cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
        if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
            # need to extend SAX1 context to SAX2 to get proper error reports
            if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
                # do not modify the shared default handler, work on a copy
                sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
                if sax is NULL:
                    raise MemoryError()
                # The memory block has the size of the larger struct, but
                # only as many bytes as the default handler has are copied.
                # The fields assigned below are not part of that copy.
                cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
                                 sizeof(htmlparser.htmlDefaultSAXHandler))
                c_ctxt.sax = sax
            sax.initialized = xmlparser.XML_SAX2_MAGIC
            sax.serror = _receiveParserError
            sax.startElementNs = NULL
            sax.endElementNs = NULL
            sax._private = NULL
        return 0
 924
 925     cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
 926         cdef xmlparser.xmlParserCtxt* c_ctxt
 927         if self._for_html:
 928             c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
 929             if c_ctxt is not NULL:
 930                 self._registerHtmlErrorHandler(c_ctxt)
 931         else:
 932             c_ctxt = xmlparser.xmlNewParserCtxt()
 933         if c_ctxt is NULL:
 934             raise MemoryError
 935         c_ctxt.sax.startDocument = _initSaxDocument
 936         return c_ctxt
 937
 938     cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
 939         cdef xmlparser.xmlParserCtxt* c_ctxt
 940         cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
 941         if self._for_html:
 942             c_ctxt = htmlparser.htmlCreatePushParserCtxt(
 943                 NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
 944             if c_ctxt is not NULL:
 945                 self._registerHtmlErrorHandler(c_ctxt)
 946                 htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
 947         else:
 948             c_ctxt = xmlparser.xmlCreatePushParserCtxt(
 949                 NULL, NULL, NULL, 0, c_filename)
 950             if c_ctxt is not NULL:
 951                 xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
 952         if c_ctxt is NULL:
 953             raise MemoryError()
 954         c_ctxt.sax.startDocument = _initSaxDocument
 955         return c_ctxt
 956
 957     @property
 958     def error_log(self):
 959         """The error log of the last parser run.
 960         """
 961         cdef _ParserContext context
 962         context = self._getParserContext()
 963         return context._error_log.copy()
 964
 965     @property
 966     def resolvers(self):
 967         """The custom resolver registry of this parser."""
 968         return self._resolvers
 969
 970     @property
 971     def version(self):
 972         """The version of the underlying XML parser."""
 973         return u"libxml2 %d.%d.%d" % LIBXML_VERSION
 974
 975     def setElementClassLookup(self, ElementClassLookup lookup = None):
 976         u":deprecated: use ``parser.set_element_class_lookup(lookup)`` instead."
 977         self.set_element_class_lookup(lookup)
 978
 979     def set_element_class_lookup(self, ElementClassLookup lookup = None):
 980         u"""set_element_class_lookup(self, lookup = None)
 981
 982         Set a lookup scheme for element classes generated from this parser.
 983
 984         Reset it by passing None or nothing.
 985         """
 986         self._class_lookup = lookup
 987
 988     cdef _BaseParser _copy(self):
 989         u"Create a new parser with the same configuration."
 990         cdef _BaseParser parser
 991         parser = self.__class__()
 992         parser._parse_options = self._parse_options
 993         parser._for_html = self._for_html
 994         parser._remove_comments = self._remove_comments
 995         parser._remove_pis = self._remove_pis
 996         parser._strip_cdata = self._strip_cdata
 997         parser._filename = self._filename
 998         parser._resolvers = self._resolvers
 999         parser.target = self.target
1000         parser._class_lookup  = self._class_lookup
1001         parser._default_encoding = self._default_encoding
1002         parser._schema = self._schema
1003         parser._events_to_collect = self._events_to_collect
1004         return parser
1005
1006     def copy(self):
1007         u"""copy(self)
1008
1009         Create a new parser with the same configuration.
1010         """
1011         return self._copy()
1012
1013     def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
1014         u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
1015
1016         Creates a new element associated with this parser.
1017         """
1018         return _makeElement(_tag, NULL, None, self, None, None,
1019                             attrib, nsmap, _extra)
1020
1021     # internal parser methods
1022
    cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
        u"""Parse unicode document, share dictionary if possible.

        The character buffer of ``utext`` is passed to libxml2 as is,
        together with the name of the encoding that matches the buffer
        layout.  The parse result is post-processed by the parser context
        (``_handleParseResultDoc()``) before it is returned.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef int buffer_len, c_kind
        cdef const_char* c_text
        cdef const_char* c_encoding = _UNICODE_ENCODING
        cdef bint is_pep393_string = (
            python.PEP393_ENABLED and python.PyUnicode_IS_READY(utext))
        if is_pep393_string:
            # The string 'kind' selects the encoding, and the factor that
            # turns the character count into the buffer size in bytes.
            c_text = <const_char*>python.PyUnicode_DATA(utext)
            py_buffer_len = python.PyUnicode_GET_LENGTH(utext)
            c_kind = python.PyUnicode_KIND(utext)
            if c_kind == 1:
                c_encoding = 'ISO-8859-1'
            elif c_kind == 2:
                py_buffer_len *= 2
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UTF-16BE'  # actually UCS-2
                else:
                    c_encoding = 'UTF-16LE'  # actually UCS-2
            elif c_kind == 4:
                py_buffer_len *= 4
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UCS-4BE'
                else:
                    c_encoding = 'UCS-4LE'
            else:
                assert False, f"Illegal Unicode kind {c_kind}"
        else:
            # not a PEP 393 string: use the data buffer of the string
            # together with the preset _UNICODE_ENCODING
            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
            c_text = python.PyUnicode_AS_DATA(utext)
        # the libxml2 functions below take the buffer length as C int
        assert 0 <= py_buffer_len <= limits.INT_MAX
        buffer_len = py_buffer_len

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
            orig_options = pctxt.options
            # the parser run itself does not need the GIL
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
                    if result is not NULL:
                        # move the names of the HTML tree into the parser
                        # dict, discard the document if that fails
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
            pctxt.options = orig_options # work around libxml2 problem

            return context._handleParseResultDoc(self, result, None)
        finally:
            # runs for successful and failed parser runs alike
            context.cleanup()
1085
    cdef xmlDoc* _parseDoc(self, char* c_text, int c_len,
                           char* c_filename) except NULL:
        u"""Parse document, share dictionary if possible.

        Parses the ``c_len`` bytes at ``c_text``.  If the parser has no
        default encoding, UTF-32 input is detected here and named
        explicitly, everything else is left to libxml2.  The parse result
        is post-processed by the parser context
        (``_handleParseResultDoc()``) before it is returned.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef char* c_encoding
        cdef tree.xmlCharEncoding enc
        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

            if self._default_encoding is None:
                # NULL lets libxml2 determine the encoding itself
                c_encoding = NULL
                # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs
                # NOTE: limit to problematic cases because it changes character offsets
                if c_len >= 4 and (c_text[0] == '\xFF' and c_text[1] == '\xFE' and
                                   c_text[2] == 0 and c_text[3] == 0):
                    # little endian BOM: skip its four bytes
                    c_encoding = "UTF-32LE"
                    c_text += 4
                    c_len -= 4
                elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and
                                     c_text[2] == '\xFE' and c_text[3] == '\xFF'):
                    # big endian BOM: skip its four bytes
                    c_encoding = "UTF-32BE"
                    c_text += 4
                    c_len -= 4
                else:
                    # no BOM => try to determine encoding
                    enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len)
                    if enc == tree.XML_CHAR_ENCODING_UCS4LE:
                        c_encoding = 'UTF-32LE'
                    elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
                        c_encoding = 'UTF-32BE'
            else:
                c_encoding = _cstr(self._default_encoding)

            orig_options = pctxt.options
            # the parser run itself does not need the GIL
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)
                    if result is not NULL:
                        # move the names of the HTML tree into the parser
                        # dict, discard the document if that fails
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)
            pctxt.options = orig_options # work around libxml2 problem

            return context._handleParseResultDoc(self, result, None)
        finally:
            # runs for successful and failed parser runs alike
            context.cleanup()
1144
1145     cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
1146         cdef _ParserContext context
1147         cdef xmlDoc* result
1148         cdef xmlparser.xmlParserCtxt* pctxt
1149         cdef char* c_encoding
1150         result = NULL
1151
1152         context = self._getParserContext()
1153         context.prepare()
1154         try:
1155             pctxt = context._c_ctxt
1156             __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1157
1158             if self._default_encoding is None:
1159                 c_encoding = NULL
1160             else:
1161                 c_encoding = _cstr(self._default_encoding)
1162
1163             orig_options = pctxt.options
1164             with nogil:
1165                 if self._for_html:
1166                     result = htmlparser.htmlCtxtReadFile(
1167                         pctxt, c_filename, c_encoding, self._parse_options)
1168                     if result is not NULL:
1169                         if _fixHtmlDictNames(pctxt.dict, result) < 0:
1170                             tree.xmlFreeDoc(result)
1171                             result = NULL
1172                 else:
1173                     result = xmlparser.xmlCtxtReadFile(
1174                         pctxt, c_filename, c_encoding, self._parse_options)
1175             pctxt.options = orig_options # work around libxml2 problem
1176
1177             return context._handleParseResultDoc(self, result, c_filename)
1178         finally:
1179             context.cleanup()
1180
1181     cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename,
1182                                        encoding) except NULL:
1183         cdef _ParserContext context
1184         cdef _FileReaderContext file_context
1185         cdef xmlDoc* result
1186         cdef xmlparser.xmlParserCtxt* pctxt
1187         cdef char* c_filename
1188         if not filename:
1189             filename = None
1190
1191         context = self._getParserContext()
1192         context.prepare()
1193         try:
1194             pctxt = context._c_ctxt
1195             __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1196             file_context = _FileReaderContext(
1197                 filelike, context, filename,
1198                 encoding or self._default_encoding)
1199             result = file_context._readDoc(pctxt, self._parse_options)
1200
1201             return context._handleParseResultDoc(
1202                 self, result, filename)
1203         finally:
1204             context.cleanup()
1205
1206
cdef void _initSaxDocument(void* ctxt) with gil:
    # startDocument SAX callback that _newParserCtxt() and
    # _newPushParserCtxt() install.  It runs the SAX2 default
    # implementation first and then configures the dict and the ID hash
    # table of the new document.
    xmlparser.xmlSAX2StartDocument(ctxt)
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    c_doc = c_ctxt.myDoc

    # set up document dict
    if c_doc and c_ctxt.dict and not c_doc.dict:
        # I have no idea why libxml2 disables this - we need it
        c_ctxt.dictNames = 1
        # let the document use the parser dict and take a reference for it
        c_doc.dict = c_ctxt.dict
        xmlparser.xmlDictReference(c_ctxt.dict)

    # set up XML ID hash table
    if c_ctxt._private:
        # '_private' is expected to refer to the _ParserContext object
        context = <_ParserContext>c_ctxt._private
        if context._collect_ids:
            # keep the global parser dict from filling up with XML IDs
            if c_doc and not c_doc.ids:
                # memory errors are not fatal here
                c_dict = xmlparser.xmlDictCreate()
                if c_dict:
                    c_doc.ids = tree.xmlHashCreateDict(0, c_dict)
                    # give up our own reference to the new dict
                    xmlparser.xmlDictFree(c_dict)
                else:
                    # fall back to a hash table without dict
                    c_doc.ids = tree.xmlHashCreate(0)
        else:
            c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
            if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids):
                # already initialised but empty => clear
                tree.xmlHashFree(c_doc.ids, NULL)
                c_doc.ids = NULL
1238
1239
1240 ############################################################
1241 ## ET feed parser
1242 ############################################################
1243
cdef class _FeedParser(_BaseParser):
    # Adds the feed parser interface (feed() / close()) to the parser.

    # set by feed() when a feed run starts, reset by close() and by feed()
    # when a run ends with an error
    cdef bint _feed_parser_running
1246
1247     @property
1248     def feed_error_log(self):
1249         """The error log of the last (or current) run of the feed parser.
1250
1251         Note that this is local to the feed parser and thus is
1252         different from what the ``error_log`` property returns.
1253         """
1254         return self._getPushParserContext()._error_log.copy()
1255
    cpdef feed(self, data):
        u"""feed(self, data)

        Feeds data to the parser.  The argument should be an 8-bit string
        buffer containing encoded data, although Unicode is supported as long
        as both string types are not mixed.

        This is the main entry point to the consumer interface of a
        parser.  The parser will parse as much of the XML stream as it
        can on each call.  To finish parsing or to reset the parser,
        call the ``close()`` method.  Both methods may raise
        ParseError if errors occur in the input data.  If an error is
        raised, there is no longer a need to call ``close()``.

        The feed parser interface is independent of the normal parser
        usage.  You can use the same parser as a feed parser and in
        the ``parse()`` function concurrently.

        Raises TypeError if ``data`` is neither a byte string nor a
        Unicode string.
        """
        cdef _ParserContext context
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef const_char* c_data
        cdef const_char* c_encoding
        cdef int buffer_len
        cdef int error
        cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
        # determine the data buffer, its size in bytes and its encoding
        if isinstance(data, bytes):
            if self._default_encoding is None:
                c_encoding = NULL
            else:
                c_encoding = self._default_encoding
            c_data = _cstr(data)
            py_buffer_len = python.PyBytes_GET_SIZE(data)
        elif isinstance(data, unicode):
            if _UNICODE_ENCODING is NULL:
                raise ParserError, \
                    u"Unicode parsing is not supported on this platform"
            c_encoding = _UNICODE_ENCODING
            c_data = python.PyUnicode_AS_DATA(data)
            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data)
        else:
            raise TypeError, u"Parsing requires string data"

        context = self._getPushParserContext()
        pctxt = context._c_ctxt
        error = 0
        if not self._feed_parser_running:
            # first chunk of a new feed run: (re-)initialise the push parser
            context.prepare(set_document_loader=False)
            self._feed_parser_running = 1
            c_filename = (_cstr(self._filename)
                          if self._filename is not None else NULL)

            # We have to give *mlCtxtResetPush() enough input to figure
            # out the character encoding (at least four bytes),
            # however if we give it all we got, we'll have nothing for
            # *mlParseChunk() and things go wrong.
            buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len
            orig_loader = _register_document_loader()
            if self._for_html:
                error = _htmlCtxtResetPush(
                    pctxt, c_data, buffer_len, c_filename, c_encoding,
                    self._parse_options)
            else:
                xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
                error = xmlparser.xmlCtxtResetPush(
                    pctxt, c_data, buffer_len, c_filename, c_encoding)
            _reset_document_loader(orig_loader)
            # the bytes passed into the reset function are consumed
            py_buffer_len -= buffer_len
            c_data += buffer_len
            if error:
                raise MemoryError()
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding

        fixup_error = 0
        # feed the remaining data in pieces of at most INT_MAX bytes
        while py_buffer_len > 0 and (error == 0 or recover):
            with nogil:
                if py_buffer_len > limits.INT_MAX:
                    buffer_len = limits.INT_MAX
                else:
                    buffer_len = <int>py_buffer_len
                if self._for_html:
                    c_node = pctxt.node  # last node where the parser stopped
                    orig_loader = _register_document_loader()
                    error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0)
                    _reset_document_loader(orig_loader)
                    # and now for the fun part: move node names to the dict
                    if pctxt.myDoc:
                        fixup_error = _fixHtmlDictSubtreeNames(
                            pctxt.dict, pctxt.myDoc, c_node)
                        # make the document use the dict of the parser
                        if pctxt.myDoc.dict and pctxt.myDoc.dict is not pctxt.dict:
                            xmlparser.xmlDictFree(pctxt.myDoc.dict)
                            pctxt.myDoc.dict = pctxt.dict
                            xmlparser.xmlDictReference(pctxt.dict)
                else:
                    orig_loader = _register_document_loader()
                    error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0)
                    _reset_document_loader(orig_loader)
                py_buffer_len -= buffer_len
                c_data += buffer_len

            if fixup_error:
                # the name fix-up only fails if a dict lookup returned NULL
                context.store_exception(MemoryError())

            if context._has_raised():
                # propagate Python exceptions immediately
                recover = 0
                error = 1
                break

            if error and not pctxt.replaceEntities and not pctxt.validate:
                # in this mode, we ignore errors about undefined entities
                for entry in context._error_log.filter_from_errors():
                    if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                           entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                        break
                else:
                    # nothing but undeclared entities => not an error
                    error = 0

        if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised():
            # propagate Python exceptions immediately
            recover = 0
            error = 1

        if fixup_error or not recover and (error or not pctxt.wellFormed):
            # end the feed run, the result handler gets to report the problem
            self._feed_parser_running = 0
            try:
                context._handleParseResult(self, pctxt.myDoc, None)
            finally:
                context.cleanup()
1388     cpdef close(self):
1389         u"""close(self)
1390
1391         Terminates feeding data to this parser.  This tells the parser to
1392         process any remaining data in the feed buffer, and then returns the
1393         root Element of the tree that was parsed.
1394
1395         This method must be called after passing the last chunk of data into
1396         the ``feed()`` method.  It should only be called when using the feed
1397         parser interface, all other usage is undefined.
1398         """
1399         if not self._feed_parser_running:
1400             raise XMLSyntaxError(u"no element found",
1401                                  xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
1402                                  self._filename)
1403
1404         context = self._getPushParserContext()
1405         pctxt = context._c_ctxt
1406
1407         self._feed_parser_running = 0
1408         if self._for_html:
1409             htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
1410         else:
1411             xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
1412
1413         if (pctxt.recovery and not pctxt.disableSAX and
1414                 isinstance(context, _SaxParserContext)):
1415             # apply any left-over 'end' events
1416             (<_SaxParserContext>context).flushEvents()
1417
1418         try:
1419             result = context._handleParseResult(self, pctxt.myDoc, None)
1420         finally:
1421             context.cleanup()
1422
1423         if isinstance(result, _Document):
1424             return (<_Document>result).getroot()
1425         else:
1426             return result
1427
1428
cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
                             const_char* c_data, int buffer_len,
                             const_char* c_filename, const_char* c_encoding,
                             int parse_options) except -1:
    # Reset a parser context for HTML push parsing.
    #
    # libxml2 lacks an HTML push parser setup function, so the context is
    # reset through the XML function and then switched over to HTML mode.
    #
    # Returns 0 on success.  If xmlCtxtResetPush() reports an error, its
    # non-zero result is returned and the context is not touched any further.
    error = xmlparser.xmlCtxtResetPush(
        c_ctxt, c_data, buffer_len, c_filename, c_encoding)
    if error:
        return error

    # fix libxml2 setup for HTML
    c_ctxt.progressive = 1
    c_ctxt.html = 1
    htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)

    return 0
1446
1447
1448 ############################################################
1449 ## XML parser
1450 ############################################################
1451
# Option flags that every XMLParser starts out with.  XMLParser.__init__()
# adds further flags or takes some of these away again, depending on the
# keyword arguments it receives.
cdef int _XML_DEFAULT_PARSE_OPTIONS = (
    xmlparser.XML_PARSE_BIG_LINES
    | xmlparser.XML_PARSE_COMPACT    # on while 'compact' is true
    | xmlparser.XML_PARSE_NONET      # on while 'no_network' is true
    | xmlparser.XML_PARSE_NOCDATA    # on while 'strip_cdata' is true
    | xmlparser.XML_PARSE_NOENT      # on while 'resolve_entities' is true
    )
1460
cdef class XMLParser(_FeedParser):
    u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True)

    The XML parser.

    Parsers can be supplied as additional argument to various parse
    functions of the lxml API.  A default parser is always available
    and can be replaced by a call to the global function
    'set_default_parser'.  New parsers can be created at any time
    without a major run-time overhead.

    The keyword arguments in the constructor are mainly based on the
    libxml2 parser configuration.  A DTD will also be loaded if DTD
    validation or attribute default values are requested (unless you
    additionally provide an XMLSchema from which the default
    attributes can be read).

    Available boolean keyword arguments:

    - attribute_defaults - inject default attributes from DTD or XMLSchema
    - dtd_validation     - validate against a DTD referenced by the document
    - load_dtd           - use DTD for parsing
    - no_network         - prevent network access for related files (default: True)
    - ns_clean           - clean up redundant namespace declarations
    - recover            - try hard to parse through broken XML
    - remove_blank_text  - discard blank text nodes that appear ignorable
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
    - resolve_entities   - replace entities by their text value (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads.  While this is
    not harmful, it is more efficient to use separate parsers.  This does not
    apply to the default parser.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, XMLSchema schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 collect_ids=True, target=None, compact=True):
        cdef int parse_options
        parse_options = _XML_DEFAULT_PARSE_OPTIONS

        # options that are off by default and get switched on by request
        if load_dtd:
            parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if dtd_validation:
            # validating against the DTD also requests loading it
            parse_options |= (xmlparser.XML_PARSE_DTDVALID |
                              xmlparser.XML_PARSE_DTDLOAD)
        if attribute_defaults:
            parse_options |= xmlparser.XML_PARSE_DTDATTR
            if schema is None:
                # no schema to read the defaults from, so load the DTD
                parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if ns_clean:
            parse_options |= xmlparser.XML_PARSE_NSCLEAN
        if recover:
            parse_options |= xmlparser.XML_PARSE_RECOVER
        if remove_blank_text:
            parse_options |= xmlparser.XML_PARSE_NOBLANKS
        if huge_tree:
            parse_options |= xmlparser.XML_PARSE_HUGE

        # Options that are part of the defaults and get switched off by
        # request.  Masking the bit out always clears it, whereas XOR would
        # set it if the flag ever stopped being part of the defaults.
        if not no_network:
            parse_options &= ~xmlparser.XML_PARSE_NONET
        if not compact:
            parse_options &= ~xmlparser.XML_PARSE_COMPACT
        if not resolve_entities:
            parse_options &= ~xmlparser.XML_PARSE_NOENT
        if not strip_cdata:
            parse_options &= ~xmlparser.XML_PARSE_NOCDATA

        # the second argument (0) is passed as 1 by HTMLParser
        _BaseParser.__init__(self, parse_options, 0, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
1543
1544
cdef class XMLPullParser(XMLParser):
    """XMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    Non-blocking XML parser that gathers parse events for iteration.

    It collects the same events as iterparse().  Instead of reading
    from a file(-like) object all by itself, the parser is handed its
    data chunk by chunk through the .feed() method.

    Without further configuration, only Element end events are
    collected.  Pass any subset of ``'start'``, ``'end'``,
    ``'start-ns'``, ``'end-ns'``, ``'comment'``, ``'pi'`` as the
    ``events`` argument to select something else.

    Pass the ``base_url`` to support loading external dependencies
    relative to the input source.  All other keyword arguments are
    handed on to XMLParser.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        XMLParser.__init__(self, **kwargs)
        self._setBaseURL(base_url)
        # without an explicit selection, only 'end' events are collected
        self._collectEvents(('end',) if events is None else events, tag)

    def read_events(self):
        """read_events(self)

        Return the events iterator of this parser's push parser context.
        """
        cdef _SaxParserContext sax_context
        sax_context = <_SaxParserContext?>self._getPushParserContext()
        return sax_context.events_iterator
1572
1573
cdef class ETCompatXMLParser(XMLParser):
    u"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
                 dtd_validation=False, load_dtd=False, no_network=True, \
                 ns_clean=False, recover=False, schema=None, \
                 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
                 remove_comments=True, remove_pis=True, strip_cdata=True, \
                 collect_ids=True, target=None, compact=True)

    An XML parser with an ElementTree compatible default setup.

    See the XMLParser class for details.

    This parser has ``remove_comments`` and ``remove_pis`` enabled by default
    and thus ignores comments and processing instructions.  All keyword
    arguments are handed on to XMLParser unchanged.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=True, remove_pis=True, strip_cdata=True,
                 collect_ids=True, target=None, compact=True):
        # 'collect_ids' is forwarded like every other XMLParser option; its
        # default matches the one of XMLParser, so existing callers see no
        # change in behaviour.
        XMLParser.__init__(self,
                           attribute_defaults=attribute_defaults,
                           dtd_validation=dtd_validation,
                           load_dtd=load_dtd,
                           no_network=no_network,
                           ns_clean=ns_clean,
                           recover=recover,
                           remove_blank_text=remove_blank_text,
                           huge_tree=huge_tree,
                           compact=compact,
                           resolve_entities=resolve_entities,
                           remove_comments=remove_comments,
                           remove_pis=remove_pis,
                           strip_cdata=strip_cdata,
                           collect_ids=collect_ids,
                           target=target,
                           encoding=encoding,
                           schema=schema)
1612
# Alias that keeps the ET 1.2 compatible name available.
XMLTreeBuilder = ETCompatXMLParser


# The pre-installed parser: an XMLParser built with all keyword defaults.
# set_default_parser() falls back to it when it is called without a parser.
cdef XMLParser __DEFAULT_XML_PARSER = XMLParser()

# Register it with the global parser context as the initial default parser.
__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
1621
def set_default_parser(_BaseParser parser=None):
    u"""set_default_parser(parser=None)

    Set a default parser for the current thread.

    The default parser is used globally by the various parse functions of
    the lxml API whenever they are called without a parser.  Calling this
    function without a parser (or with None) resets the default parser to
    the original configuration.

    Note that the pre-installed default parser is not thread-safe.  Avoid the
    default parser in multi-threaded environments.  You can create a separate
    parser for each thread explicitly or use a parser pool.
    """
    if parser is not None:
        __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser)
    else:
        # no parser given: go back to the pre-installed one
        __GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
1637
def get_default_parser():
    u"""get_default_parser()

    Return the parser that the global parser context currently provides
    as default parser.
    """
    default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return default_parser
1641
1642 ############################################################
1643 ## HTML parser
1644 ############################################################
1645
# Option flags that every HTMLParser starts out with.  HTMLParser.__init__()
# adds further flags or takes some of these away again, depending on the
# keyword arguments it receives.
cdef int _HTML_DEFAULT_PARSE_OPTIONS = (
    htmlparser.HTML_PARSE_COMPACT    # on while 'compact' is true
    | htmlparser.HTML_PARSE_NONET    # on while 'no_network' is true
    | htmlparser.HTML_PARSE_RECOVER  # on while 'recover' is true
    )
1652
cdef class HTMLParser(_FeedParser):
    u"""HTMLParser(self, encoding=None, remove_blank_text=False, \
                   remove_comments=False, remove_pis=False, strip_cdata=True, \
                   no_network=True, target=None, schema: XMLSchema =None, \
                   recover=True, compact=True, default_doctype=True, \
                   collect_ids=True, huge_tree=False)

    The HTML parser.

    This parser allows reading HTML into a normal XML tree.  By
    default, it can read broken (non well-formed) HTML, depending on
    the capabilities of libxml2.  Use the 'recover' option to switch
    this off.

    Available boolean keyword arguments:

    - recover            - try hard to parse through broken HTML (default: True)
    - no_network         - prevent network access for related files (default: True)
    - remove_blank_text  - discard empty text nodes that are ignorable (i.e. not actual text content)
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - default_doctype    - add a default doctype even if it is not found in the HTML (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads for performance
    reasons.
    """
    def __init__(self, *, encoding=None, remove_blank_text=False,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 no_network=True, target=None, XMLSchema schema=None,
                 recover=True, compact=True, default_doctype=True,
                 collect_ids=True, huge_tree=False):
        cdef int parse_options
        parse_options = _HTML_DEFAULT_PARSE_OPTIONS

        # options that are off by default and get switched on by request
        if remove_blank_text:
            parse_options |= htmlparser.HTML_PARSE_NOBLANKS
        if not default_doctype:
            # NODEFDTD is not part of the defaults: set it explicitly
            # instead of relying on XOR to flip an unset bit
            parse_options |= htmlparser.HTML_PARSE_NODEFDTD
        if huge_tree:
            parse_options |= xmlparser.XML_PARSE_HUGE

        # Options that are part of the defaults and get switched off by
        # request.  Masking the bit out always clears it, whereas XOR would
        # set it if the flag ever stopped being part of the defaults.
        if not recover:
            parse_options &= ~htmlparser.HTML_PARSE_RECOVER
        if not no_network:
            parse_options &= ~htmlparser.HTML_PARSE_NONET
        if not compact:
            parse_options &= ~htmlparser.HTML_PARSE_COMPACT

        # the second argument (1) is passed as 0 by XMLParser
        _BaseParser.__init__(self, parse_options, 1, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
1712
1713
# Module-wide HTMLParser instance, built with all keyword defaults.
cdef HTMLParser __DEFAULT_HTML_PARSER = HTMLParser()
1716
1717
cdef class HTMLPullParser(HTMLParser):
    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    Non-blocking HTML parser that gathers parse events for iteration.

    It collects the same events as iterparse().  Instead of reading
    from a file(-like) object all by itself, the parser is handed its
    data chunk by chunk through the .feed() method.

    Without further configuration, only Element end events are
    collected.  Pass any subset of ``'start'``, ``'end'``,
    ``'start-ns'``, ``'end-ns'``, ``'comment'``, ``'pi'`` as the
    ``events`` argument to select something else.

    Pass the ``base_url`` to support loading external dependencies
    relative to the input source.  All other keyword arguments are
    handed on to HTMLParser.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        HTMLParser.__init__(self, **kwargs)
        self._setBaseURL(base_url)
        # without an explicit selection, only 'end' events are collected
        self._collectEvents(('end',) if events is None else events, tag)

    def read_events(self):
        """read_events(self)

        Return the events iterator of this parser's push parser context.
        """
        cdef _SaxParserContext sax_context
        sax_context = <_SaxParserContext?>self._getPushParserContext()
        return sax_context.events_iterator
1745
1746
1747 ############################################################
1748 ## helper functions for document creation
1749 ############################################################
1750
cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    # Parse an in-memory string (unicode or bytes) into a new xmlDoc.
    #
    # Uses the default parser of the global parser context if 'parser' is
    # None.  Input whose byte size exceeds INT_MAX is handed to the parser
    # as a file-like object instead of as a memory buffer.
    cdef char* c_filename = NULL
    cdef char* c_text
    cdef Py_ssize_t c_len
    cdef bint is_pep393_string
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    if filename:
        # c_filename is presumably a pointer into filename_utf, so that
        # object has to stay referenced for as long as c_filename is used
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)

    if not isinstance(text, unicode):
        # byte string input
        c_len = python.PyBytes_GET_SIZE(text)
        if c_len > limits.INT_MAX:
            return parser._parseDocFromFilelike(BytesIO(text), filename, None)
        c_text = _cstr(text)
        return parser._parseDoc(c_text, c_len, c_filename)

    # unicode input: determine the size of its internal buffer in bytes
    is_pep393_string = (
        python.PEP393_ENABLED and python.PyUnicode_IS_READY(text))
    if is_pep393_string:
        c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
    else:
        c_len = python.PyUnicode_GET_DATA_SIZE(text)
    if c_len > limits.INT_MAX:
        return parser._parseDocFromFilelike(StringIO(text), filename, None)
    if _UNICODE_ENCODING is NULL and not is_pep393_string:
        # no usable encoding name for the internal buffer: go through UTF-8
        text = (<unicode>text).encode('utf8')
        return parser._parseDocFromFilelike(BytesIO(text), filename, "UTF-8")
    return parser._parseUnicodeDoc(text, c_filename)
1785
cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    # Parse the file named by 'filename8', using the default parser of the
    # global parser context if no parser was passed.
    cdef _BaseParser active_parser = parser
    if active_parser is None:
        active_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return active_parser._parseDocFromFile(_cstr(filename8))
1790
cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    # Parse from the file-like object 'source', using the default parser of
    # the global parser context if no parser was passed.  No encoding
    # override is handed to the parser (third argument is None).
    cdef _BaseParser active_parser = parser
    if active_parser is None:
        active_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return active_parser._parseDocFromFilelike(source, filename, None)
1796
cdef xmlDoc* _newXMLDoc() except NULL:
    # Create an empty XML document.  The encoding is set to "UTF-8" if
    # libxml2 left it unset.  Raises MemoryError if the allocation failed.
    cdef xmlDoc* c_doc = tree.xmlNewDoc(NULL)
    if c_doc is NULL:
        raise MemoryError()
    if c_doc.encoding is NULL:
        c_doc.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
    # let the global parser context initialise the document's dictionary
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
1806
cdef xmlDoc* _newHTMLDoc() except NULL:
    # Create an empty HTML document.  Raises MemoryError if the allocation
    # failed.
    cdef xmlDoc* c_doc = tree.htmlNewDoc(NULL, NULL)
    if c_doc is NULL:
        raise MemoryError()
    # let the global parser context initialise the document's dictionary
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
1814
cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    # Copy a document.  'recursive' is handed on to xmlCopyDoc(); a copy
    # with a non-zero value runs with the GIL released.  Raises MemoryError
    # if libxml2 could not create the copy.
    cdef xmlDoc* c_copy
    if not recursive:
        c_copy = tree.xmlCopyDoc(c_doc, 0)
    else:
        with nogil:
            c_copy = tree.xmlCopyDoc(c_doc, recursive)
    if c_copy is NULL:
        raise MemoryError()
    # let the global parser context initialise the copy's dictionary
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_copy)
    return c_copy
1826
cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    u"""Copy the document and make a copy of c_new_root its root node.

    The document itself is copied non-recursively; c_new_root is then
    copied recursively into the new document and installed as its root
    element.  Raises MemoryError if either copy fails.
    """
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0) # non recursive
    if result is NULL:
        # same check as in _copyDoc(): never touch a failed copy
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive
    if c_node is NULL:
        # nobody else owns the new document yet, so free it before raising
        tree.xmlFreeDoc(result)
        raise MemoryError()
    tree.xmlDocSetRootElement(result, c_node)
    # presumably carries over the tail text that follows the original node
    _copyTail(c_new_root.next, c_node)
    return result
1840
cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    u"Recursively copy the element into the document. c_doc is not modified."
    cdef xmlNode* c_copy = tree.xmlDocCopyNode(c_node, c_doc, 1) # recursive
    if c_copy is NULL:
        raise MemoryError()
    # presumably carries over the tail text that follows the original node
    _copyTail(c_node.next, c_copy)
    return c_copy
1849
1850
1851 ############################################################
1852 ## API level helper functions for _Document creation
1853 ############################################################
1854
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    # Parse 'source' into a _Document.
    #
    # 'source' may be a string (taken as a filename/URL), an object with
    # getvalue() and tell() that is positioned at 0 (parsed from its value),
    # or any object with a read() method.  Anything else raises TypeError.
    cdef _Document doc
    if _isString(source):
        # parse the file directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        if base_url is None:
            return doc
        # fix base URL if requested: replace the URL stored in the document
        base_url = _encodeFilenameUTF8(base_url)
        if doc._c_doc.URL is not NULL:
            tree.xmlFree(<char*>doc._c_doc.URL)
        doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
        return doc

    # an explicit base URL wins over the name of the file object
    url = base_url if base_url is not None else _getFilenameForFile(source)

    # StringIO - reading from start?
    if (hasattr(source, u'getvalue') and hasattr(source, u'tell')
            and source.tell() == 0):
        return _parseMemoryDocument(source.getvalue(), url, parser)

    # Support for file-like objects (urlgrabber.urlopen, ...)
    if hasattr(source, u'read'):
        return _parseFilelikeDocument(source, url, parser)

    raise TypeError(f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'")
1883
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    # Parse the file at 'url' and wrap the resulting xmlDoc in a _Document.
    cdef xmlDoc* c_parsed = _parseDocFromFile(url, parser)
    return _documentFactory(c_parsed, parser)
1887
cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    # Parse a unicode or bytes string and wrap the result in a _Document.
    #
    # Raises ValueError for input that is neither unicode nor bytes, and for
    # unicode input that carries an encoding declaration.
    cdef xmlDoc* c_parsed
    if not isinstance(text, unicode):
        if not isinstance(text, bytes):
            raise ValueError(u"can only parse strings")
    elif _hasEncodingDeclaration(text):
        raise ValueError(
            u"Unicode strings with encoding declaration are not supported. "
            u"Please use bytes input or XML fragments without declaration.")
    c_parsed = _parseDoc(text, url, parser)
    return _documentFactory(c_parsed, parser)
1898
cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    # Parse from a file-like object and wrap the resulting xmlDoc in a
    # _Document.
    cdef xmlDoc* c_parsed = _parseDocFromFilelike(source, url, parser)
    return _documentFactory(c_parsed, parser)