1 # Parsers for XML and HTML
3 from lxml.includes cimport xmlparser
4 from lxml.includes cimport htmlparser
# Exception type for XML syntax errors.  The constructor stores the line in
# `lineno` and the column, reduced by one, in `offset`; the `position`
# accessor below converts back by adding one to the offset.
# NOTE(review): the embedded numbering skips 11, 15, 18-19 and 22 here, so the
# docstring terminator, part of __init__ and the first `position` definition
# (with any decorators) are not visible in this extract -- confirm against
# the complete file.
7 class ParseError(LxmlSyntaxError):
8 """Syntax error while parsing an XML document.
10 For compatibility with ElementTree 1.3 and later.
12 def __init__(self, message, code, line, column, filename=None):
13 super(_ParseError, self).__init__(message)
14 self.lineno, self.offset = (line, column - 1)
16 self.filename = filename
20 return self.lineno, self.offset + 1
23 def position(self, new_pos):
24 self.lineno, column = new_pos
25 self.offset = column - 1
# Module-private alias of the class; it is the name used by the super() call
# in ParseError.__init__ above.
27 cdef object _ParseError = ParseError
# ParseError subclass; this is the exception class that _raiseParseError()
# in this file raises or asks the error log to build.
# NOTE(review): the docstring terminator (embedded line 32) is not visible.
30 class XMLSyntaxError(ParseError):
31 """Syntax error while parsing an XML document.
# Error type for internal parser failures; within this file it is raised by
# _ParserContext.prepare() with the message "parser locking failed".
# NOTE(review): the docstring terminator is not visible in this extract.
34 cdef class ParserError(LxmlError):
35 """Internal lxml parser error.
# Holder for parser state that is looked up per thread: a libxml2 string
# dictionary, a default parser and a stack of implied parser contexts.
41 cdef class _ParserDictionaryContext:
42 # Global parser context to share the string dictionary.
44 # This class is a delegate singleton!
46 # It creates _ParserDictionaryContext objects for each thread to keep thread state,
47 # but those must never be used directly. Always stick to using the static
48 # __GLOBAL_PARSER_CONTEXT as defined below the class.
# libxml2 string dictionary of this context; released in __dealloc__
51 cdef tree.xmlDict* _c_dict
# default parser, set by setDefaultParser() / created by getDefaultParser()
52 cdef _BaseParser _default_parser
# stack used by pushImpliedContext() / popImpliedContext()
53 cdef list _implied_parser_contexts
# NOTE(review): embedded numbering skips 54-56; the signature line of the
# initialiser that contains the following statement is not visible here.
57 self._implied_parser_contexts = []
def __dealloc__(self):
    # Nothing to release if this context never obtained a dictionary.
    if self._c_dict is NULL:
        return
    # Give up this context's reference to the libxml2 string dictionary.
    xmlparser.xmlDictFree(self._c_dict)
cdef void initMainParserContext(self):
    u"""Register this (global) context in the thread state dictionary of
    the calling thread.  Meant to run once, from the main thread only."""
    main_thread_state = python.PyThreadState_GetDict()
    if main_thread_state is NULL:
        # no thread state dictionary available => nothing to register
        return
    (<dict>main_thread_state)[u"_ParserDictionaryContext"] = self
# Looks up the context object stored under the key
# u"_ParserDictionaryContext" in the current thread state dictionary and
# creates and stores a fresh one (via __new__) when needed.
# NOTE(review): embedded numbering skips 75-76, 79 and 82.  The bodies of both
# `if` statements, the definition of `d` and the final return are not visible
# in this extract; presumably `d` is the thread dict cast to `dict` -- confirm.
70 cdef _ParserDictionaryContext _findThreadParserContext(self):
71 u"Find (or create) the _ParserDictionaryContext object for the current thread"
72 cdef _ParserDictionaryContext context
73 thread_dict = python.PyThreadState_GetDict()
74 if thread_dict is NULL:
77 result = python.PyDict_GetItem(d, u"_ParserDictionaryContext")
78 if result is not NULL:
80 context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
81 d[u"_ParserDictionaryContext"] = context
cdef void setDefaultParser(self, _BaseParser parser):
    u"Install `parser` as the default parser of the calling thread"
    cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
    thread_context._default_parser = parser
cdef _BaseParser getDefaultParser(self):
    u"Return the calling thread's default parser, creating it on first use"
    cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
    if thread_context._default_parser is not None:
        return thread_context._default_parser
    # Lazily derive this context's own default from the module-level
    # default XML parser.
    if self._default_parser is None:
        self._default_parser = __DEFAULT_XML_PARSER._copy()
    # A context other than `self` receives its own copy of that parser.
    if thread_context is not self:
        thread_context._default_parser = self._default_parser._copy()
    return thread_context._default_parser
# Returns the libxml2 dictionary of the current thread's context.  When the
# context has none yet, the visible code either adopts `default` (taking an
# additional reference on it) or creates the dictionary of `self` and derives
# a sub-dictionary from it for a context other than `self`.
# NOTE(review): embedded line 110 is missing between the two alternatives;
# presumably an `else:` -- confirm against the complete file.
101 cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
102 u"Return the thread-local dict or create a new one if necessary."
103 cdef _ParserDictionaryContext context
104 context = self._findThreadParserContext()
105 if context._c_dict is NULL:
106 # thread dict not yet set up => use default or create a new one
107 if default is not NULL:
108 context._c_dict = default
109 xmlparser.xmlDictReference(default)
111 if self._c_dict is NULL:
112 self._c_dict = xmlparser.xmlDictCreate()
113 if context is not self:
114 context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
115 return context._c_dict
# Makes the dictionary slot pointed to by `c_dict_ref` refer to the current
# thread's dictionary: a different, non-NULL dictionary found in the slot is
# freed, the thread dictionary is stored and an extra reference is taken.
# NOTE(review): embedded line 121 (the body of the identity check, presumably
# an early return) is not visible in this extract.
117 cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref):
118 c_dict = c_dict_ref[0]
119 c_thread_dict = self._getThreadDict(c_dict)
120 if c_dict is c_thread_dict:
122 if c_dict is not NULL:
123 xmlparser.xmlDictFree(c_dict)
124 c_dict_ref[0] = c_thread_dict
125 xmlparser.xmlDictReference(c_thread_dict)
# Points the `dict` slot of a libxml2 parser context at the thread dictionary
# by delegating to initThreadDictRef().
# NOTE(review): embedded numbering jumps from 129 to 132, so a further
# statement of this method may be missing from this extract.
127 cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt):
128 u"Assure we always use the same string dictionary."
129 self.initThreadDictRef(&pctxt.dict)
cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt):
    u"Make the XPath context use the thread's shared string dictionary."
    cdef tree.xmlDict** c_dict_slot = &pctxt.dict
    self.initThreadDictRef(c_dict_slot)
cdef void initDocDict(self, xmlDoc* result):
    u"Make the document use the thread's shared string dictionary."
    # XXX A dictionary already attached to the document gets freed by the
    # call below.  That is only expected for new documents with empty
    # dicts; freeing a dict whose data is in use would segfault.
    cdef tree.xmlDict** c_dict_slot = &result.dict
    self.initThreadDictRef(c_dict_slot)
# Returns the top entry of the current thread's implied-context stack when
# that stack is non-empty.
# NOTE(review): embedded line 157 is missing; the result for an empty stack
# is not visible in this extract (presumably None -- confirm).
143 cdef _ParserContext findImpliedContext(self):
144 u"""Return any current implied xml parser context for the current
145 thread. This is used when the resolver functions are called
146 with an xmlParserCtxt that was generated from within libxml2
147 (i.e. without a _ParserContext) - which happens when parsing
148 schema and xinclude external references."""
149 cdef _ParserDictionaryContext context
150 cdef _ParserContext implied_context
152 # see if we have a current implied parser
153 context = self._findThreadParserContext()
154 if context._implied_parser_contexts:
155 implied_context = context._implied_parser_contexts[-1]
156 return implied_context
cdef void pushImpliedContextFromParser(self, _BaseParser parser):
    u"""Push a new implied context object taken from the parser.

    Pushes the parser's own context, or None when no parser is given.
    Exactly one entry is pushed per call, so that a single
    popImpliedContext() call undoes it.
    """
    cdef _ParserContext parser_context = None
    if parser is not None:
        parser_context = parser._getParserContext()
    # Single push on both paths: pushing the parser's context AND None
    # would leave a stale entry behind after the matching pop.
    self.pushImpliedContext(parser_context)
cdef void pushImpliedContext(self, _ParserContext parser_context):
    u"Put `parser_context` on top of the calling thread's implied context stack."
    cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
    thread_context._implied_parser_contexts.append(parser_context)
cdef void popImpliedContext(self):
    u"Drop the top entry of the calling thread's implied context stack."
    cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
    thread_context._implied_parser_contexts.pop()
# Module-wide context instance that the rest of this file calls into.  It is
# created at import time and immediately registered in the thread state
# dictionary of the importing thread.
178 cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
179 __GLOBAL_PARSER_CONTEXT.initMainParserContext()
181 ############################################################
182 ## support for Python unicode I/O
183 ############################################################
185 # name of Python unicode encoding as known to libxml2
# Stays NULL until _setupPythonUnicode() finds a matching encoding handler;
# _parseUnicodeDoc() reads it as its initial `c_encoding` value.
186 cdef const_char* _UNICODE_ENCODING = NULL
# Probes the in-memory byte layout of a short Py_UNICODE sample ("<test/>")
# to pick an encoding name, then stores that name in _UNICODE_ENCODING if
# libxml2 can find an encoding handler for it.
# NOTE(review): embedded numbering skips 193, 195-196, 202, 205-206, 209, 211
# and 217-218.  The docstring terminator, the assignments made in the two
# byte-order branches, the fallback branch structure and the return value are
# not visible in this extract.
188 cdef int _setupPythonUnicode() except -1:
189 u"""Sets _UNICODE_ENCODING to the internal encoding name of Python unicode
190 strings if libxml2 supports reading native Python unicode. This depends
191 on iconv and the local Python installation, so we simply check if we find
192 a matching encoding handler.
194 cdef tree.xmlCharEncodingHandler* enchandler
197 cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>']
198 cdef const_xmlChar* buffer = <const_xmlChar*>uchars
199 # apparently, libxml2 can't detect UTF-16 on some systems
200 if (buffer[0] == c'<' and buffer[1] == c'\0' and
201 buffer[2] == c't' and buffer[3] == c'\0'):
203 elif (buffer[0] == c'\0' and buffer[1] == c'<' and
204 buffer[2] == c'\0' and buffer[3] == c't'):
207 # let libxml2 give it a try
208 enc = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7)
210 # not my fault, it's YOUR broken system :)
212 enchandler = tree.xmlFindCharEncodingHandler(enc)
213 if enchandler is not NULL:
214 global _UNICODE_ENCODING
# the handler was only needed as an existence check => close it again
215 tree.xmlCharEncCloseFunc(enchandler)
216 _UNICODE_ENCODING = enc
# Maps the encoding detected by libxml2 for `buffer` to an encoding name.
# For a UTF-16LE detection, a leading FF FE 00 00 byte sequence is reported
# as "UTF-32LE" instead; the fallback at the end asks libxml2 for the name.
# NOTE(review): embedded numbering skips 228-229, 231, 233, 235 and 237-238,
# so the names returned by the remaining branches are not visible here.
219 cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
220 u"Work around bug in libxml2: find iconv name of encoding on our own."
221 cdef tree.xmlCharEncoding enc
222 enc = tree.xmlDetectCharEncoding(buffer, size)
223 if enc == tree.XML_CHAR_ENCODING_UTF16LE:
224 if size >= 4 and (buffer[0] == <const_xmlChar>'\xFF' and
225 buffer[1] == <const_xmlChar>'\xFE' and
226 buffer[2] == 0 and buffer[3] == 0):
227 return "UTF-32LE" # according to BOM
230 elif enc == tree.XML_CHAR_ENCODING_UTF16BE:
232 elif enc == tree.XML_CHAR_ENCODING_UCS4LE:
234 elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
236 elif enc == tree.XML_CHAR_ENCODING_NONE:
239 # returns a constant char*, no need to free it
240 return tree.xmlGetCharEncodingName(enc)
# Run the probe once at module initialisation so that _UNICODE_ENCODING is
# settled before any parser function reads it.
242 _setupPythonUnicode()
244 ############################################################
245 ## support for file-like objects
246 ############################################################
# Adapter that feeds data from a Python file(-like) object into libxml2
# through the read callbacks defined below the class.
# NOTE(review): embedded numbering skips 253-254 and 257; the declarations of
# `_c_url` and `_bytes`, which the methods use, are not visible here.
250 cdef class _FileReaderContext:
# source object; read via .read() in copyToBuffer()
251 cdef object _filelike
# encoding passed to the constructor (may be None)
252 cdef object _encoding
# receives exceptions via _store_raised() instead of letting them propagate
255 cdef _ExceptionContext _exc_context
# offset into the current chunk; copyToBuffer() treats a negative value specially
256 cdef Py_ssize_t _bytes_read
# whether _close_file() may close the source object
258 cdef bint _close_file_after_read
# Stores the constructor arguments and keeps a C string of the encoded URL.
# NOTE(review): embedded numbering skips 265-267 and 270-273; any condition
# guarding the URL handling and any further initialisation are not visible.
260 def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
261 self._exc_context = exc_context
262 self._filelike = filelike
263 self._close_file_after_read = close_file
264 self._encoding = encoding
268 url = _encodeFilename(url)
# `_c_url` is derived from `url` via _cstr() and is later passed to the
# libxml2 read functions in _readDoc()
269 self._c_url = _cstr(url)
# Closes the wrapped file-like object if closing was requested, tolerating
# objects without a `close` attribute, and drops the reference to it.
# NOTE(review): embedded numbering skips 276-277, 280-281 and 284-285; the
# early-exit body, the `try:` line, the AttributeError handler body and the
# actual close call are not visible in this extract.
274 cdef _close_file(self):
275 if self._filelike is None or not self._close_file_after_read:
278 close = self._filelike.close
279 except AttributeError:
282 self._filelike = None
283 if close is not None:
# Allocates a libxml2 input buffer and wires it to one of the two read
# callbacks: _readFilelikeParser with this object as context, or
# _readFileParser with the C FILE* obtained from the file object.
# NOTE(review): embedded numbering skips 291, 294 and 296-297; the condition
# selecting between the two set-ups and the return statement are not visible.
286 cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self):
287 cdef stdio.FILE* c_stream
288 cdef xmlparser.xmlParserInputBuffer* c_buffer
289 c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
290 c_stream = python.PyFile_AsFile(self._filelike)
292 c_buffer.readcallback = _readFilelikeParser
293 c_buffer.context = <python.PyObject*>self
295 c_buffer.readcallback = _readFileParser
296 c_buffer.context = c_stream
cdef xmlparser.xmlParserInput* _createParserInput(
        self, xmlparser.xmlParserCtxt* ctxt):
    # Build an input buffer for the wrapped file and turn it into a parser
    # input stream that belongs to `ctxt`.
    cdef xmlparser.xmlParserInputBuffer* c_input_buffer = self._createParserInputBuffer()
    return xmlparser.xmlNewIOInputStream(ctxt, c_input_buffer, 0)
# Parses a DTD from the wrapped file through a freshly created input buffer.
# NOTE(review): embedded line 308 is missing between the buffer creation and
# the parse call (possibly a `with nogil:` line -- confirm).
305 cdef tree.xmlDtd* _readDtd(self):
306 cdef xmlparser.xmlParserInputBuffer* c_buffer
307 c_buffer = self._createParserInputBuffer()
309 return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0)
# Parses a document from the wrapped file with the given parser context and
# options, using either the HTML or the XML libxml2 reader and one of the two
# read callbacks.  Exceptions are handed to the exception context instead of
# being propagated (see the trailing comment on the return line).
# NOTE(review): embedded numbering skips 312, 318, 320-321, 323, 325, 328,
# 331, 333-334, 341-342, 347-349 and 351.  The conditions choosing between
# callbacks and between HTML/XML, the try/except/finally structure and the
# handling after xmlFreeDoc() are not visible in this extract.
311 cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
313 cdef char* c_encoding
314 cdef stdio.FILE* c_stream
315 cdef xmlparser.xmlInputReadCallback c_read_callback
316 cdef xmlparser.xmlInputCloseCallback c_close_callback
317 cdef void* c_callback_context
319 if self._encoding is None:
322 c_encoding = _cstr(self._encoding)
324 c_stream = python.PyFile_AsFile(self._filelike)
326 c_read_callback = _readFilelikeParser
327 c_callback_context = <python.PyObject*>self
329 c_read_callback = _readFileParser
330 c_callback_context = c_stream
# remember the context options so they can be restored after the read call
332 orig_options = ctxt.options
335 result = htmlparser.htmlCtxtReadIO(
336 ctxt, c_read_callback, NULL, c_callback_context,
337 self._c_url, c_encoding, options)
338 if result is not NULL:
339 if _fixHtmlDictNames(ctxt.dict, result) < 0:
340 tree.xmlFreeDoc(result)
343 result = xmlparser.xmlCtxtReadIO(
344 ctxt, c_read_callback, NULL, c_callback_context,
345 self._c_url, c_encoding, options)
346 ctxt.options = orig_options # work around libxml2 problem
350 self._exc_context._store_raised()
352 return result # swallow any exceptions
# Read callback worker: copies up to `c_requested` bytes from the wrapped
# file-like object into `c_buffer` and returns the number of bytes copied.
# Data is consumed from the current chunk (`self._bytes`, starting at
# `self._bytes_read`); while the chunk is too short, its remainder is copied
# and a new chunk is read.  Unicode chunks are encoded to bytes first
# (UTF-8 when no encoding is set).  Exceptions are stored in the exception
# context rather than propagated.
# NOTE(review): embedded numbering skips 356, 359-360, 369, 375, 378-380, 382,
# 384, 386-390, 395-396 and 398-400.  The try/except structure, the handling
# of a negative `_bytes_read`, the end-of-input handling and the raise around
# the error message below are not visible in this extract.
354 cdef int copyToBuffer(self, char* c_buffer, int c_requested):
355 cdef int c_byte_count = 0
357 cdef Py_ssize_t byte_count, remaining
358 if self._bytes_read < 0:
361 byte_count = python.PyBytes_GET_SIZE(self._bytes)
362 remaining = byte_count - self._bytes_read
363 while c_requested > remaining:
# drain what is left of the current chunk into the output buffer
364 c_start = _cstr(self._bytes) + self._bytes_read
365 cstring_h.memcpy(c_buffer, c_start, remaining)
366 c_byte_count += remaining
367 c_buffer += remaining
368 c_requested -= remaining
370 self._bytes = self._filelike.read(c_requested)
371 if not isinstance(self._bytes, bytes):
372 if isinstance(self._bytes, unicode):
373 if self._encoding is None:
374 self._bytes = (<unicode>self._bytes).encode('utf8')
376 self._bytes = python.PyUnicode_AsEncodedString(
377 self._bytes, _cstr(self._encoding), NULL)
381 u"reading from file-like objects must return byte strings or unicode strings"
383 remaining = python.PyBytes_GET_SIZE(self._bytes)
385 self._bytes_read = -1
# copy the requested part of the current chunk and advance the offset
391 c_start = _cstr(self._bytes) + self._bytes_read
392 cstring_h.memcpy(c_buffer, c_start, c_requested)
393 c_byte_count += c_requested
394 self._bytes_read += c_requested
397 self._exc_context._store_raised()
401 self._exc_context._store_raised()
403 return c_byte_count # swallow any exceptions
cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with gil:
    # Read callback for Python file-like objects: `ctxt` is the
    # _FileReaderContext that was registered as the callback context.
    cdef _FileReaderContext reader = <_FileReaderContext>ctxt
    return reader.copyToBuffer(c_buffer, c_size)
cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil:
    # Read callback for C streams: `ctxt` is the FILE* that was registered
    # as the callback context; read up to `c_size` single bytes from it.
    cdef stdio.FILE* c_stream = <stdio.FILE*>ctxt
    return stdio.fread(c_buffer, 1, c_size, c_stream)
411 ############################################################
412 ## support for custom document loaders
413 ############################################################
# External entity loader installed by _register_document_loader().  It finds
# the resolver context (from the parser context's `_private` pointer, or else
# the thread's implied context), asks its resolver registry for the document
# and builds a libxml2 parser input from the answer: in-memory bytes, a file
# name, or a file-like object.  The original libxml2 loader
# (__DEFAULT_ENTITY_LOADER) is used as fallback.
# NOTE(review): embedded numbering has many gaps in this function (e.g. 423,
# 426, 428-429, 431-432, 435-440, 443-445, 447, 449, 451-452, 457-458,
# 463-464, 474, 476, 484-489, 492-493, 495-497, 500-501).  Branch conditions,
# try/except lines, `with nogil:` lines and all return statements are not
# visible in this extract, so the exact control flow must be confirmed
# against the complete file.
415 cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
416 xmlparser.xmlParserCtxt* c_context) with gil:
417 cdef _ResolverContext context
418 cdef xmlparser.xmlParserInput* c_input
419 cdef _InputDocument doc_ref
420 cdef _FileReaderContext file_context
421 # if there is no _ParserContext associated with the xmlParserCtxt
422 # passed, check to see if the thread state object has an implied
424 if c_context._private is not NULL:
425 context = <_ResolverContext>c_context._private
427 context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()
430 if __DEFAULT_ENTITY_LOADER is NULL:
433 # free the GIL as we might do serious I/O here (e.g. HTTP)
434 c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
441 # parsing a related document (DTD etc.) => UTF-8 encoded URL?
442 url = _decodeFilename(<const_xmlChar*>c_url)
446 pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8
448 doc_ref = context._resolvers.resolve(url, pubid, context)
450 context._store_raised()
453 if doc_ref is not None:
454 if doc_ref._type == PARSER_DATA_STRING:
455 data = doc_ref._data_bytes
456 filename = doc_ref._filename
459 elif not isinstance(filename, bytes):
460 # most likely a text URL
461 filename = filename.encode('utf8')
462 if not isinstance(filename, bytes):
465 c_input = xmlparser.xmlNewInputStream(c_context)
466 if c_input is not NULL:
467 if filename is not None:
468 c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
# the input stream points directly into the memory of `data`
469 c_input.base = _xcstr(data)
470 c_input.length = python.PyBytes_GET_SIZE(data)
471 c_input.cur = c_input.base
472 c_input.end = c_input.base + c_input.length
473 elif doc_ref._type == PARSER_DATA_FILENAME:
475 c_filename = _cstr(doc_ref._filename)
477 # free the GIL as we might do serious I/O here
478 c_input = xmlparser.xmlNewInputFromFile(
479 c_context, c_filename)
480 elif doc_ref._type == PARSER_DATA_FILE:
481 file_context = _FileReaderContext(doc_ref._file, context, url,
482 None, doc_ref._close_file)
483 c_input = file_context._createParserInput(c_context)
490 context._storage.add(data)
491 if c_input is not NULL:
494 if __DEFAULT_ENTITY_LOADER is NULL:
498 # free the GIL as we might do serious I/O here (e.g. HTTP)
499 c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
# The external entity loader that libxml2 had registered when this module
# was initialised; _local_resolver() calls it as its fallback loader.
502 cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
503 __DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
cdef xmlparser.xmlExternalEntityLoader _register_document_loader() nogil:
    # Install _local_resolver as libxml2's external entity loader and return
    # the loader that was registered before, so that the caller can restore
    # it later through _reset_document_loader().
    cdef xmlparser.xmlExternalEntityLoader old = xmlparser.xmlGetExternalEntityLoader()
    xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
    # The caller stores this value; without the return the previous loader
    # would be lost and could never be reinstated.
    return old
# Re-installs `old` as libxml2's external entity loader.  In this file it is
# called from _ParserContext.cleanup() with the loader saved by prepare().
511 cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) nogil:
512 xmlparser.xmlSetExternalEntityLoader(old)
515 ############################################################
517 ############################################################
# Python-side companion of a libxml2 parser context: owns the C context, an
# error log, an optional schema validator, the saved entity loader and a lock.
# NOTE(review): embedded numbering skips 520, 527, 529-531 and 534-535.  A
# further decorator, a field declaration and the initialiser's signature line
# are not visible; the statements from embedded line 532 on belong to that
# initialiser.
519 @cython.no_gc_clear # May have to call "self._validator.disconnect()" on dealloc.
521 cdef class _ParserContext(_ResolverContext):
# receives the errors reported by libxml2 during a parser run
522 cdef _ErrorLog _error_log
# optional schema validator; checked against None before use in prepare()/cleanup()
523 cdef _ParserSchemaValidationContext _validator
# the wrapped libxml2 parser context; freed in __dealloc__
524 cdef xmlparser.xmlParserCtxt* _c_ctxt
# entity loader saved by prepare() and restored by cleanup()
525 cdef xmlparser.xmlExternalEntityLoader _orig_loader
# lock acquired in prepare() and released in cleanup() when threading is enabled
526 cdef python.PyThread_type_lock _lock
528 cdef bint _collect_ids
532 self._collect_ids = True
533 if not config.ENABLE_THREADING:
536 self._lock = python.PyThread_allocate_lock()
537 self._error_log = _ErrorLog()
# Releases the lock and the libxml2 parser context.  A validator that is
# still attached is disconnected first (see the inline explanation below).
# NOTE(review): embedded line 542 is missing after the lock is freed
# (presumably the lock field is reset -- confirm).
539 def __dealloc__(self):
540 if config.ENABLE_THREADING and self._lock is not NULL:
541 python.PyThread_free_lock(self._lock)
543 if self._c_ctxt is not NULL:
# the raw pointer check comes first, then the None check on the object
544 if <void*>self._validator is not NULL and self._validator is not None:
545 # If the parser was not closed correctly (e.g. interrupted iterparse()),
546 # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
547 # validator plug might still be in place, which will make xmlFreeParserCtxt()
548 # crash when trying to xmlFree() a static SAX handler.
549 # Thus, make sure we disconnect the handler interceptor here at the latest.
550 self._validator.disconnect()
551 xmlparser.xmlFreeParserCtxt(self._c_ctxt)
cdef _ParserContext _copy(self):
    u"""Create a new context of the same class with this context's settings.

    The ID collection flag is carried over, the validator (if any) is
    copied, and the resolver registry is copied into the new context.
    No libxml2 parser context is attached to the copy (NULL is passed).
    """
    cdef _ParserContext context
    context = self.__class__()
    context._collect_ids = self._collect_ids
    # The validator is optional everywhere else in this class, so only
    # copy it when there is one instead of calling .copy() on None.
    if self._validator is not None:
        context._validator = self._validator.copy()
    _initParserContext(context, self._resolvers._copy(), NULL)
    return context
cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
    # Link both directions: libxml2's context keeps a back pointer to this
    # object in `_private`, and this object remembers the C context.
    c_ctxt._private = <void*>self
    self._c_ctxt = c_ctxt
# Resets the wrapped libxml2 context for reuse, using the HTML reset call
# for HTML contexts and xmlClearParserCtxt() otherwise.
# NOTE(review): embedded line 570 is missing before the xmlClearParserCtxt()
# call (presumably an `else:`) -- confirm against the complete file.
565 cdef void _resetParserContext(self):
566 if self._c_ctxt is not NULL:
567 if self._c_ctxt.html:
568 htmlparser.htmlCtxtReset(self._c_ctxt)
569 self._c_ctxt.disableSAX = 0 # work around bug in libxml2
571 xmlparser.xmlClearParserCtxt(self._c_ctxt)
# Prepares the context for a parser run: acquires the lock (when threading is
# enabled), clears the error log, installs the structured error callback,
# optionally registers the document loader and connects the validator.
# Raises ParserError when the lock cannot be acquired.
# NOTE(review): embedded numbering skips 574, 576, 579, 582 and 587-588; the
# `with nogil:` / result-check lines and the return value are not visible.
573 cdef int prepare(self, bint set_document_loader=True) except -1:
575 if config.ENABLE_THREADING and self._lock is not NULL:
577 result = python.PyThread_acquire_lock(
578 self._lock, python.WAIT_LOCK)
580 raise ParserError, u"parser locking failed"
581 self._error_log.clear()
583 self._c_ctxt.sax.serror = _receiveParserError
# remember the previous loader (or NULL) so that cleanup() can restore it
584 self._orig_loader = _register_document_loader() if set_document_loader else NULL
585 if self._validator is not None:
586 self._validator.connect(self._c_ctxt, self._error_log)
# Counterpart of prepare(): restores the saved document loader, disconnects
# the validator, resets the libxml2 context, removes the error callback and
# releases the lock.
# NOTE(review): embedded numbering skips 592, 596-597, 599 and 602-603; any
# try/finally structure and the return value are not visible in this extract.
589 cdef int cleanup(self) except -1:
590 if self._orig_loader is not NULL:
591 _reset_document_loader(self._orig_loader)
593 if self._validator is not None:
594 self._validator.disconnect()
595 self._resetParserContext()
598 self._c_ctxt.sax.serror = NULL
600 if config.ENABLE_THREADING and self._lock is not NULL:
601 python.PyThread_release_lock(self._lock)
# Validates the parse result via _handleParseResultDoc() and wraps the
# resulting C document in a document object for `parser`.
# NOTE(review): embedded lines 608-609 are missing; what happens when the
# context's own `_doc` already wraps `c_doc` is not visible in this extract.
604 cdef object _handleParseResult(self, _BaseParser parser,
605 xmlDoc* result, filename):
606 c_doc = self._handleParseResultDoc(parser, result, filename)
607 if self._doc is not None and self._doc._c_doc is c_doc:
610 return _documentFactory(c_doc, parser)
# Delegates to the module-level _handleParseResult() function, passing
# whether the parser runs in recovery mode (XML_PARSE_RECOVER option) and
# allowing the document to be freed only when this context has no `_doc`.
# NOTE(review): embedded line 616 (the middle of the argument list) is not
# visible in this extract.
612 cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
613 xmlDoc* result, filename) except NULL:
614 recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
615 return _handleParseResult(self, self._c_ctxt, result,
617 free_doc=self._doc is None)
cdef _initParserContext(_ParserContext context,
                        _ResolverRegistry resolvers,
                        xmlparser.xmlParserCtxt* c_ctxt):
    # Set up the resolver part of the context first; attaching a libxml2
    # parser context is optional (callers may pass NULL).
    _initResolverContext(context, resolvers)
    if c_ctxt is NULL:
        return
    context._initParserContext(c_ctxt)
cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, xmlerror.xmlError* error) with gil:
    # The `_private` pointer of the libxml2 context refers to the owning
    # _ParserContext; hand the error to that object's error log.
    cdef _ParserContext owner = <_ParserContext>_parser_context._private
    owner._error_log._receive(error)
# Structured error callback installed on parser contexts (see prepare()).
# Errors without a usable parser context go to _forwardError(); all others
# are routed to the owning _ParserContext through _forwardParserError().
# NOTE(review): embedded lines 630 and 633 are missing (presumably a guard
# and an `else:`) -- confirm against the complete file.
629 cdef void _receiveParserError(void* c_context, xmlerror.xmlError* error) nogil:
631 if c_context is NULL or (<xmlparser.xmlParserCtxt*>c_context)._private is NULL:
632 _forwardError(NULL, error)
634 _forwardParserError(<xmlparser.xmlParserCtxt*>c_context, error)
# Always raises: turns the last error of the libxml2 context into a Python
# exception.  I/O-domain errors with a known filename become IOError;
# otherwise an XMLSyntaxError is raised, built either from the error log or
# from the last error's message, code, line and column.
# NOTE(review): embedded numbering skips 644, 650, 653, 664 and 666-667.  The
# `try:` line, two `else:`/`elif` lines (including the condition guarding the
# error-log branch) and the tail of the final raise are not visible here.
636 cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
637 _ErrorLog error_log) except -1:
638 if filename is not None and \
639 ctxt.lastError.domain == xmlerror.XML_FROM_IO:
640 if isinstance(filename, bytes):
641 filename = _decodeFilenameWithLength(
642 <bytes>filename, len(<bytes>filename))
643 if ctxt.lastError.message is not NULL:
645 message = ctxt.lastError.message.decode('utf-8')
646 except UnicodeDecodeError:
647 # the filename may be in there => play it safe
648 message = ctxt.lastError.message.decode('iso8859-1')
649 message = f"Error reading file '{filename}': {message.strip()}"
651 message = f"Error reading '{filename}'"
652 raise IOError, message
654 raise error_log._buildParseException(
655 XMLSyntaxError, u"Document is not well formed")
656 elif ctxt.lastError.message is not NULL:
657 message = ctxt.lastError.message.strip()
658 code = ctxt.lastError.code
659 line = ctxt.lastError.line
660 column = ctxt.lastError.int2
# prefix the message with the line number when libxml2 reported one
661 if ctxt.lastError.line > 0:
662 message = f"line {line}: {message}"
663 raise XMLSyntaxError(message, code, line, column, filename)
665 raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
# Post-processes the outcome of a libxml2 parser run.  The visible code
# attaches the thread dictionary to the result, frees a stray `myDoc` of the
# parser context, decides whether the result counts as well-formed, re-raises
# exceptions stored in the context, raises a parse error for failed runs, and
# finally fills in a missing URL/encoding and injects default attributes.
# NOTE(review): embedded numbering has many gaps here (675, 680-681, 694,
# 697, 704-712, 714-715, 718, 720, 722-723, 726, 728, 733, 738, 740-742).  The
# assignments to `well_formed` in most branches, several branch heads and the
# return statement are not visible in this extract.
668 cdef xmlDoc* _handleParseResult(_ParserContext context,
669 xmlparser.xmlParserCtxt* c_ctxt,
670 xmlDoc* result, filename,
671 bint recover, bint free_doc) except NULL:
672 cdef bint well_formed
673 if result is not NULL:
674 __GLOBAL_PARSER_CONTEXT.initDocDict(result)
676 if c_ctxt.myDoc is not NULL:
677 if c_ctxt.myDoc is not result:
678 __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
679 tree.xmlFreeDoc(c_ctxt.myDoc)
682 if result is not NULL:
683 if (context._validator is not None and
684 not context._validator.isvalid()):
685 well_formed = 0 # actually not 'valid', but anyway ...
686 elif (not c_ctxt.wellFormed and not c_ctxt.html and
687 c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
688 [1 for error in context._error_log
689 if error.type == ErrorTypes.ERR_INVALID_CHAR]):
690 # An encoding error occurred and libxml2 switched from UTF-8
691 # input to (undecoded) Latin-1, at some arbitrary point in the
692 # document. Better raise an error than allowing for a broken
693 # tree with mixed encodings.
695 elif recover or (c_ctxt.wellFormed and
696 c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
698 elif not c_ctxt.replaceEntities and not c_ctxt.validate \
699 and context is not None:
700 # in this mode, we ignore errors about undefined entities
701 for error in context._error_log.filter_from_errors():
702 if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
703 error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
713 tree.xmlFreeDoc(result)
716 if context is not None and context._has_raised():
717 if result is not NULL:
719 tree.xmlFreeDoc(result)
721 context._raise_if_stored()
724 if context is not None:
725 _raiseParseError(c_ctxt, filename, context._error_log)
727 _raiseParseError(c_ctxt, filename, None)
729 if result.URL is NULL and filename is not None:
730 result.URL = tree.xmlStrdup(_xcstr(filename))
# documents without a recorded encoding are labelled as UTF-8
731 if result.encoding is NULL:
732 result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
734 if context._validator is not None and \
735 context._validator._add_default_attributes:
736 # we currently need to do this here as libxml2 does not
737 # support inserting default attributes during parse-time
739 context._validator.inject_default_attributes(result)
# Walks all nodes of `c_doc`, starting at its first child, and runs
# _fixHtmlDictNodeNames() on every element node.  Callers in this file treat
# a negative result as failure and free the document.
# NOTE(review): embedded numbering skips 744-746, 751 and 753-754; local
# declarations, an early exit, the failure return and the final return are
# not visible in this extract.
743 cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
747 c_node = c_doc.children
748 tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
749 if c_node.type == tree.XML_ELEMENT_NODE:
750 if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
752 tree.END_FOR_EACH_ELEMENT_FROM(c_node)
# Same traversal as _fixHtmlDictNames(), but starting at `c_start_node`; one
# visible path falls back to processing the whole document instead.
# NOTE(review): embedded numbering skips 757, 760-764, 770 and 772-773.  The
# docstring delimiters, the condition guarding the whole-document fallback
# and the return statements are not visible in this extract.
755 cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
756 xmlNode* c_start_node) nogil:
758 Move names to the dict, iterating in document order, starting at
759 c_start_node. This is used in incremental parsing after each chunk.
765 return _fixHtmlDictNames(c_dict, c_doc)
766 c_node = c_start_node
767 tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
768 if c_node.type == tree.XML_ELEMENT_NODE:
769 if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
771 tree.END_FOR_EACH_ELEMENT_FROM(c_node)
# Looks up the element name and each attribute name in the dictionary
# `c_dict`; where the lookup yields a different pointer, the old name string
# is freed with xmlFree().
# NOTE(review): embedded numbering skips 776, 778-779, 782, 786-787 and
# 790-792.  The NULL checks on the lookup result, the assignments of the new
# name pointers, the loop advance and the return value are not visible here.
774 cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
775 xmlNode* c_node) nogil:
777 c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
780 if c_name is not c_node.name:
781 tree.xmlFree(<char*>c_node.name)
783 c_attr = <xmlNode*>c_node.properties
784 while c_attr is not NULL:
785 c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1)
788 if c_name is not c_attr.name:
789 tree.xmlFree(<char*>c_attr.name)
# Common base of the parser classes.  __init__ only accepts instances of
# XMLParser or HTMLParser.
# NOTE(review): embedded numbering skips 801 and 807; the declarations of
# `_for_html` and `_filename`, which the methods use, are not visible here.
795 cdef class _BaseParser:
796 cdef ElementClassLookup _class_lookup
797 cdef _ResolverRegistry _resolvers
# lazily created contexts, see _getParserContext() / _getPushParserContext()
798 cdef _ParserContext _parser_context
799 cdef _ParserContext _push_parser_context
# libxml2 parse option flags passed to the read functions
800 cdef int _parse_options
802 cdef bint _remove_comments
803 cdef bint _remove_pis
804 cdef bint _strip_cdata
805 cdef bint _collect_ids
806 cdef XMLSchema _schema
808 cdef readonly object target
809 cdef object _default_encoding
810 cdef tuple _events_to_collect # (event_types, tag)
# Stores the parser configuration.  Raises TypeError unless `self` is an
# XMLParser or HTMLParser instance, and LookupError when libxml2 has no
# handler for the requested default encoding.
# NOTE(review): embedded numbering skips 814, 816, 819, 821, 828, 830-831 and
# 833.  The end of the signature, the handling of `target`/`filename` and the
# branch heads around the encoding set-up are not visible in this extract.
812 def __init__(self, int parse_options, bint for_html, XMLSchema schema,
813 remove_comments, remove_pis, strip_cdata, collect_ids,
815 cdef tree.xmlCharEncodingHandler* enchandler
817 if not isinstance(self, (XMLParser, HTMLParser)):
818 raise TypeError, u"This class cannot be instantiated"
820 self._parse_options = parse_options
822 self._for_html = for_html
823 self._remove_comments = remove_comments
824 self._remove_pis = remove_pis
825 self._strip_cdata = strip_cdata
826 self._collect_ids = collect_ids
827 self._schema = schema
829 self._resolvers = _ResolverRegistry()
832 self._default_encoding = None
834 encoding = _utf8(encoding)
835 enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding))
836 if enchandler is NULL:
837 raise LookupError, f"unknown encoding: '{encoding}'"
# the handler was only looked up to validate the name => close it again
838 tree.xmlCharEncCloseFunc(enchandler)
839 self._default_encoding = encoding
cdef _setBaseURL(self, base_url):
    # Keep the base URL in its encoded filename form; it is later handed
    # to libxml2 as a C string when a push parser context is created.
    encoded_url = _encodeFilename(base_url)
    self._filename = encoded_url
# Records which parse events (and which tag) should be collected.  The event
# types are de-duplicated into a tuple and validated by building a filter.
# NOTE(review): embedded lines 846-847 are missing; the handling of
# `event_types is None` (presumably a default and an `else:`) is not visible.
844 cdef _collectEvents(self, event_types, tag):
845 if event_types is None:
848 event_types = tuple(set(event_types))
849 _buildParseEventFilter(event_types) # purely for validation
850 self._events_to_collect = (event_types, tag)
cdef _ParserContext _getParserContext(self):
    # Return this parser's context, building and configuring it on the
    # first call: ID collection flag, optional schema validator, a new
    # libxml2 parser context and the SAX callback configuration.
    cdef xmlparser.xmlParserCtxt* c_ctxt
    cdef _ParserContext parser_context = self._parser_context
    if parser_context is not None:
        return parser_context
    parser_context = self._createContext(self.target, None)
    self._parser_context = parser_context
    parser_context._collect_ids = self._collect_ids
    if self._schema is not None:
        parser_context._validator = self._schema._newSaxValidator(
            self._parse_options & xmlparser.XML_PARSE_DTDATTR)
    c_ctxt = self._newParserCtxt()
    _initParserContext(parser_context, self._resolvers, c_ctxt)
    self._configureSaxContext(c_ctxt)
    return parser_context
# Push-parser counterpart of _getParserContext(): lazily creates the push
# context (passing the events to collect), applies the ID collection flag and
# optional schema validator, and attaches a new libxml2 push parser context.
# NOTE(review): embedded line 877 is missing; the callee of the argument list
# on embedded line 878 is not visible (presumably _initParserContext).
866 cdef _ParserContext _getPushParserContext(self):
867 cdef xmlparser.xmlParserCtxt* pctxt
868 if self._push_parser_context is None:
869 self._push_parser_context = self._createContext(
870 self.target, self._events_to_collect)
871 self._push_parser_context._collect_ids = self._collect_ids
872 if self._schema is not None:
873 self._push_parser_context._validator = \
874 self._schema._newSaxValidator(
875 self._parse_options & xmlparser.XML_PARSE_DTDATTR)
876 pctxt = self._newPushParserCtxt()
878 self._push_parser_context, self._resolvers, pctxt)
879 self._configureSaxContext(pctxt)
880 return self._push_parser_context
# Chooses the context class: a _TargetParserContext when a parser target is
# given, a _SaxParserContext when events are to be collected, and a plain
# _ParserContext otherwise.  An event filter is set when events are given.
# NOTE(review): embedded numbering skips 889 and 895-896; an `else:` line and
# the final return of `sax_context` are not visible in this extract.
882 cdef _ParserContext _createContext(self, target, events_to_collect):
883 cdef _SaxParserContext sax_context
884 if target is not None:
885 sax_context = _TargetParserContext(self)
886 (<_TargetParserContext>sax_context)._setTarget(target)
887 elif events_to_collect:
888 sax_context = _SaxParserContext(self)
890 # nothing special to configure
891 return _ParserContext()
892 if events_to_collect:
893 events, tag = events_to_collect
894 sax_context._setEventFilter(events, tag)
# Disables SAX callbacks according to the parser options by setting them to
# NULL: comments, processing instructions and CDATA blocks.
# NOTE(review): embedded line 901 is missing (presumably the `_remove_pis`
# check guarding the processing-instruction line), as is the return value.
898 cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
899 if self._remove_comments:
900 pctxt.sax.comment = NULL
902 pctxt.sax.processingInstruction = NULL
903 if self._strip_cdata:
904 # hard switch-off for CDATA nodes => makes them plain text
905 pctxt.sax.cdataBlock = NULL
# Upgrades an initialised non-SAX2 handler of an HTML parser context to SAX2
# so that the structured error callback can be installed.  If the context
# still uses the static default HTML handler, a private copy is allocated
# first.  The namespace-aware element callbacks are set to NULL.
# NOTE(review): embedded numbering skips 913-914, 917 and 922-923; the
# allocation failure handling, the assignment of the copy to the context and
# the return value are not visible in this extract.
907 cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
908 cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
909 if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
910 # need to extend SAX1 context to SAX2 to get proper error reports
911 if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
912 sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
915 cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
916 sizeof(htmlparser.htmlDefaultSAXHandler))
918 sax.initialized = xmlparser.XML_SAX2_MAGIC
919 sax.serror = _receiveParserError
920 sax.startElementNs = NULL
921 sax.endElementNs = NULL
# Creates a new libxml2 parser context: an HTML memory parser context (with
# the HTML error handler registered) or a plain XML parser context.  The
# start-document SAX callback is replaced by _initSaxDocument.
# NOTE(review): embedded numbering skips 927, 931, 933-934 and 936-937; the
# HTML/XML condition, the NULL check and the return are not visible here.
925 cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
926 cdef xmlparser.xmlParserCtxt* c_ctxt
928 c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
929 if c_ctxt is not NULL:
930 self._registerHtmlErrorHandler(c_ctxt)
932 c_ctxt = xmlparser.xmlNewParserCtxt()
935 c_ctxt.sax.startDocument = _initSaxDocument
# Creates a new libxml2 push parser context (HTML or XML) for this parser's
# filename and applies the parse options to it.  The start-document SAX
# callback is replaced by _initSaxDocument.
# NOTE(review): embedded numbering skips 941, 947, 952-953 and 955-956; the
# HTML/XML condition, the NULL check and the return are not visible here.
938 cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
939 cdef xmlparser.xmlParserCtxt* c_ctxt
940 cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
942 c_ctxt = htmlparser.htmlCreatePushParserCtxt(
943 NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
944 if c_ctxt is not NULL:
945 self._registerHtmlErrorHandler(c_ctxt)
946 htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
948 c_ctxt = xmlparser.xmlCreatePushParserCtxt(
949 NULL, NULL, NULL, 0, c_filename)
950 if c_ctxt is not NULL:
951 xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
954 c_ctxt.sax.startDocument = _initSaxDocument
# Bodies of three read accessors: a copy of the parser context's error log,
# the resolver registry, and a libxml2 version string.
# NOTE(review): the definition lines of all three accessors (embedded lines
# 957-958, 965-966 and 970-971) and the end of the first docstring are not
# visible in this extract; their names and decorators must be taken from the
# complete file.
959 """The error log of the last parser run.
961 cdef _ParserContext context
962 context = self._getParserContext()
963 return context._error_log.copy()
967 """The custom resolver registry of this parser."""
968 return self._resolvers
972 """The version of the underlying XML parser."""
973 return u"libxml2 %d.%d.%d" % LIBXML_VERSION
# Deprecated camelCase alias that simply forwards to
# set_element_class_lookup().
975 def setElementClassLookup(self, ElementClassLookup lookup = None):
976 u":deprecated: use ``parser.set_element_class_lookup(lookup)`` instead."
977 self.set_element_class_lookup(lookup)
# Stores `lookup` (or None) as this parser's element class lookup.
# NOTE(review): embedded lines 981, 983 and 985 are missing, including the
# docstring terminator.
979 def set_element_class_lookup(self, ElementClassLookup lookup = None):
980 u"""set_element_class_lookup(self, lookup = None)
982 Set a lookup scheme for element classes generated from this parser.
984 Reset it by passing None or nothing.
986 self._class_lookup = lookup
cdef _BaseParser _copy(self):
    u"""Create a new parser with the same configuration.

    A fresh instance of this parser's class is created with its default
    arguments and then overwritten field by field.  The resolver
    registry, target, class lookup and schema are shared with the
    original, not copied.
    """
    cdef _BaseParser parser
    parser = self.__class__()
    parser._parse_options = self._parse_options
    parser._for_html = self._for_html
    parser._remove_comments = self._remove_comments
    parser._remove_pis = self._remove_pis
    parser._strip_cdata = self._strip_cdata
    # ID collection is part of the configuration as well; without this
    # the copy silently falls back to the class default.
    parser._collect_ids = self._collect_ids
    parser._filename = self._filename
    parser._resolvers = self._resolvers
    parser.target = self.target
    parser._class_lookup = self._class_lookup
    parser._default_encoding = self._default_encoding
    parser._schema = self._schema
    parser._events_to_collect = self._events_to_collect
    return parser
1009 Create a new parser with the same configuration.
# Creates an element through _makeElement(), passing this parser along with
# the tag, attributes, namespace map and extra keyword attributes.
# NOTE(review): embedded lines 1015 and 1017 are missing, including the
# docstring terminator.
1013 def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
1014 u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
1016 Creates a new element associated with this parser.
1018 return _makeElement(_tag, NULL, None, self, None, None,
1019 attrib, nsmap, _extra)
1021 # internal parser methods
# Parses a document directly from the internal buffer of a Python unicode
# string.  For PEP 393 strings the encoding name handed to libxml2 is chosen
# from the string kind and the platform byte order; otherwise the probed
# _UNICODE_ENCODING is used.  The buffer length must fit into a C int (see
# the assert).  The result is checked by the context's _handleParseResultDoc().
# NOTE(review): embedded numbering skips 1025, 1027, 1039, 1041-1042, 1045,
# 1047-1048, 1051, 1053, 1055, 1060, 1062-1063, 1067-1068, 1075-1076, 1081
# and 1083-1085.  The docstring terminator, the per-kind branch heads and
# buffer length scaling, the context prepare/cleanup (try/finally) lines and
# the HTML/XML condition are not visible in this extract.
1023 cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
1024 u"""Parse unicode document, share dictionary if possible.
1026 cdef _ParserContext context
1028 cdef xmlparser.xmlParserCtxt* pctxt
1029 cdef Py_ssize_t py_buffer_len
1030 cdef int buffer_len, c_kind
1031 cdef const_char* c_text
1032 cdef const_char* c_encoding = _UNICODE_ENCODING
1033 cdef bint is_pep393_string = (
1034 python.PEP393_ENABLED and python.PyUnicode_IS_READY(utext))
1035 if is_pep393_string:
1036 c_text = <const_char*>python.PyUnicode_DATA(utext)
1037 py_buffer_len = python.PyUnicode_GET_LENGTH(utext)
1038 c_kind = python.PyUnicode_KIND(utext)
1040 c_encoding = 'ISO-8859-1'
1043 if python.PY_BIG_ENDIAN:
1044 c_encoding = 'UTF-16BE' # actually UCS-2
1046 c_encoding = 'UTF-16LE' # actually UCS-2
1049 if python.PY_BIG_ENDIAN:
1050 c_encoding = 'UCS-4BE'
1052 c_encoding = 'UCS-4LE'
1054 assert False, f"Illegal Unicode kind {c_kind}"
1056 py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
1057 c_text = python.PyUnicode_AS_DATA(utext)
1058 assert 0 <= py_buffer_len <= limits.INT_MAX
1059 buffer_len = py_buffer_len
1061 context = self._getParserContext()
1064 pctxt = context._c_ctxt
1065 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
# remember the context options so they can be restored after the read call
1066 orig_options = pctxt.options
1069 result = htmlparser.htmlCtxtReadMemory(
1070 pctxt, c_text, buffer_len, c_filename, c_encoding,
1071 self._parse_options)
1072 if result is not NULL:
1073 if _fixHtmlDictNames(pctxt.dict, result) < 0:
1074 tree.xmlFreeDoc(result)
1077 result = xmlparser.xmlCtxtReadMemory(
1078 pctxt, c_text, buffer_len, c_filename, c_encoding,
1079 self._parse_options)
1080 pctxt.options = orig_options # work around libxml2 problem
1082 return context._handleParseResultDoc(self, result, None)
cdef xmlDoc* _parseDoc(self, char* c_text, int c_len,
                       char* c_filename) except NULL:
    u"""Parse document, share dictionary if possible.

    Parses ``c_len`` bytes starting at ``c_text``.  Unless the parser has a
    default encoding configured, UTF-32 input is detected here because
    libxml2 does not reliably do that itself.
    """
    cdef _ParserContext context
    cdef xmlDoc* result
    cdef xmlparser.xmlParserCtxt* pctxt
    cdef char* c_encoding
    cdef tree.xmlCharEncoding enc
    context = self._getParserContext()
    context.prepare()
    try:
        pctxt = context._c_ctxt
        __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        if self._default_encoding is None:
            c_encoding = NULL
            # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs
            # NOTE: limit to problematic cases because it changes character offsets
            if c_len >= 4 and (c_text[0] == '\xFF' and c_text[1] == '\xFE' and
                               c_text[2] == 0 and c_text[3] == 0):
                c_encoding = "UTF-32LE"
                # skip the BOM
                c_text += 4
                c_len -= 4
            elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and
                                 c_text[2] == '\xFE' and c_text[3] == '\xFF'):
                c_encoding = "UTF-32BE"
                # skip the BOM
                c_text += 4
                c_len -= 4
            else:
                # no BOM => try to determine encoding
                enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len)
                if enc == tree.XML_CHAR_ENCODING_UCS4LE:
                    c_encoding = 'UTF-32LE'
                elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
                    c_encoding = 'UTF-32BE'
        else:
            c_encoding = _cstr(self._default_encoding)

        orig_options = pctxt.options
        with nogil:
            if self._for_html:
                result = htmlparser.htmlCtxtReadMemory(
                    pctxt, c_text, c_len, c_filename,
                    c_encoding, self._parse_options)
                if result is not NULL:
                    if _fixHtmlDictNames(pctxt.dict, result) < 0:
                        tree.xmlFreeDoc(result)
                        result = NULL
            else:
                result = xmlparser.xmlCtxtReadMemory(
                    pctxt, c_text, c_len, c_filename,
                    c_encoding, self._parse_options)
        pctxt.options = orig_options  # work around libxml2 problem

        return context._handleParseResultDoc(self, result, None)
    finally:
        context.cleanup()
cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
    u"""Parse the file named by ``c_filename`` with this parser's options.

    The parser's default encoding, if set, overrides the document encoding.
    """
    cdef _ParserContext context
    cdef xmlDoc* result
    cdef xmlparser.xmlParserCtxt* pctxt
    cdef char* c_encoding
    result = NULL

    context = self._getParserContext()
    context.prepare()
    try:
        pctxt = context._c_ctxt
        __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        if self._default_encoding is None:
            c_encoding = NULL
        else:
            c_encoding = _cstr(self._default_encoding)

        orig_options = pctxt.options
        with nogil:
            if self._for_html:
                result = htmlparser.htmlCtxtReadFile(
                    pctxt, c_filename, c_encoding, self._parse_options)
                if result is not NULL:
                    if _fixHtmlDictNames(pctxt.dict, result) < 0:
                        tree.xmlFreeDoc(result)
                        result = NULL
            else:
                result = xmlparser.xmlCtxtReadFile(
                    pctxt, c_filename, c_encoding, self._parse_options)
        pctxt.options = orig_options  # work around libxml2 problem

        return context._handleParseResultDoc(self, result, c_filename)
    finally:
        context.cleanup()
cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename,
                                   encoding) except NULL:
    u"""Parse a document by reading from a file-like object.

    ``encoding`` takes precedence over the parser's default encoding.
    An empty ``filename`` is normalised to None.
    """
    cdef _ParserContext context
    cdef _FileReaderContext file_context
    cdef xmlDoc* result
    cdef xmlparser.xmlParserCtxt* pctxt
    cdef char* c_filename
    if not filename:
        filename = None

    context = self._getParserContext()
    context.prepare()
    try:
        pctxt = context._c_ctxt
        __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
        file_context = _FileReaderContext(
            filelike, context, filename,
            encoding or self._default_encoding)
        result = file_context._readDoc(pctxt, self._parse_options)

        return context._handleParseResultDoc(
            self, result, filename)
    finally:
        context.cleanup()
cdef void _initSaxDocument(void* ctxt) with gil:
    # SAX start-document callback: runs libxml2's default handler, then
    # wires up the name dictionary and the XML ID hash table of the new
    # document.
    xmlparser.xmlSAX2StartDocument(ctxt)
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    c_doc = c_ctxt.myDoc

    # set up document dict
    if c_doc and c_ctxt.dict and not c_doc.dict:
        # I have no idea why libxml2 disables this - we need it
        c_ctxt.dictNames = 1
        c_doc.dict = c_ctxt.dict
        xmlparser.xmlDictReference(c_ctxt.dict)

    # set up XML ID hash table
    if c_ctxt._private:
        context = <_ParserContext>c_ctxt._private
        if context._collect_ids:
            # keep the global parser dict from filling up with XML IDs
            if c_doc and not c_doc.ids:
                # memory errors are not fatal here
                c_dict = xmlparser.xmlDictCreate()
                if c_dict:
                    c_doc.ids = tree.xmlHashCreateDict(0, c_dict)
                    # drop our own reference to the dict again
                    xmlparser.xmlDictFree(c_dict)
                else:
                    c_doc.ids = tree.xmlHashCreate(0)
        else:
            c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
            if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids):
                # already initialised but empty => clear
                tree.xmlHashFree(c_doc.ids, NULL)
                c_doc.ids = NULL
1240 ############################################################
1242 ############################################################
cdef class _FeedParser(_BaseParser):
    # True between the first feed() call and close() (or a fatal error).
    cdef bint _feed_parser_running

    @property
    def feed_error_log(self):
        """The error log of the last (or current) run of the feed parser.

        Note that this is local to the feed parser and thus is
        different from what the ``error_log`` property returns.
        """
        return self._getPushParserContext()._error_log.copy()

    cpdef feed(self, data):
        u"""feed(self, data)

        Feeds data to the parser.  The argument should be an 8-bit string
        buffer containing encoded data, although Unicode is supported as long
        as both string types are not mixed.

        This is the main entry point to the consumer interface of a
        parser.  The parser will parse as much of the XML stream as it
        can on each call.  To finish parsing or to reset the parser,
        call the ``close()`` method.  Both methods may raise
        ParseError if errors occur in the input data.  If an error is
        raised, there is no longer a need to call ``close()``.

        The feed parser interface is independent of the normal parser
        usage.  You can use the same parser as a feed parser and in
        the ``parse()`` function concurrently.
        """
        cdef _ParserContext context
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef const_char* c_data
        cdef const_char* c_encoding
        cdef int buffer_len
        cdef int error
        cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
        if isinstance(data, bytes):
            if self._default_encoding is None:
                c_encoding = NULL
            else:
                c_encoding = self._default_encoding
            c_data = _cstr(data)
            py_buffer_len = python.PyBytes_GET_SIZE(data)
        elif isinstance(data, unicode):
            if _UNICODE_ENCODING is NULL:
                raise ParserError, \
                    u"Unicode parsing is not supported on this platform"
            c_encoding = _UNICODE_ENCODING
            c_data = python.PyUnicode_AS_DATA(data)
            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data)
        else:
            raise TypeError, u"Parsing requires string data"

        context = self._getPushParserContext()
        pctxt = context._c_ctxt
        error = 0
        if not self._feed_parser_running:
            # first chunk: (re-)initialise the push parser
            context.prepare(set_document_loader=False)
            self._feed_parser_running = 1
            c_filename = (_cstr(self._filename)
                          if self._filename is not None else NULL)

            # We have to give *mlCtxtResetPush() enough input to figure
            # out the character encoding (at least four bytes),
            # however if we give it all we got, we'll have nothing for
            # *mlParseChunk() and things go wrong.
            buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len
            orig_loader = _register_document_loader()
            if self._for_html:
                error = _htmlCtxtResetPush(
                    pctxt, c_data, buffer_len, c_filename, c_encoding,
                    self._parse_options)
            else:
                xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
                error = xmlparser.xmlCtxtResetPush(
                    pctxt, c_data, buffer_len, c_filename, c_encoding)
            _reset_document_loader(orig_loader)
            py_buffer_len -= buffer_len
            c_data += buffer_len
            if error:
                raise MemoryError()
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding

        fixup_error = 0
        while py_buffer_len > 0 and (error == 0 or recover):
            with nogil:
                # libxml2 takes int sized chunks only
                if py_buffer_len > limits.INT_MAX:
                    buffer_len = limits.INT_MAX
                else:
                    buffer_len = <int>py_buffer_len
                if self._for_html:
                    c_node = pctxt.node  # last node where the parser stopped
                    orig_loader = _register_document_loader()
                    error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0)
                    _reset_document_loader(orig_loader)
                    # and now for the fun part: move node names to the dict
                    if pctxt.myDoc:
                        fixup_error = _fixHtmlDictSubtreeNames(
                            pctxt.dict, pctxt.myDoc, c_node)
                        if pctxt.myDoc.dict and pctxt.myDoc.dict is not pctxt.dict:
                            xmlparser.xmlDictFree(pctxt.myDoc.dict)
                            pctxt.myDoc.dict = pctxt.dict
                            xmlparser.xmlDictReference(pctxt.dict)
                else:
                    orig_loader = _register_document_loader()
                    error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0)
                    _reset_document_loader(orig_loader)
                py_buffer_len -= buffer_len
                c_data += buffer_len

            if fixup_error:
                context.store_exception(MemoryError())

            if context._has_raised():
                # propagate Python exceptions immediately
                recover = 0
                error = 1
                break

            if error and not pctxt.replaceEntities and not pctxt.validate:
                # in this mode, we ignore errors about undefined entities
                for entry in context._error_log.filter_from_errors():
                    if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                           entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                        break
                else:
                    error = 0

        if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised():
            # propagate Python exceptions immediately
            recover = 0
            error = 1

        if fixup_error or not recover and (error or not pctxt.wellFormed):
            # fatal: stop the feed run and let the context raise the error
            self._feed_parser_running = 0
            try:
                context._handleParseResult(self, pctxt.myDoc, None)
            finally:
                context.cleanup()

    cpdef close(self):
        u"""close(self)

        Terminates feeding data to this parser.  This tells the parser to
        process any remaining data in the feed buffer, and then returns the
        root Element of the tree that was parsed.

        This method must be called after passing the last chunk of data into
        the ``feed()`` method.  It should only be called when using the feed
        parser interface, all other usage is undefined.
        """
        if not self._feed_parser_running:
            raise XMLSyntaxError(u"no element found",
                                 xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                                 self._filename)

        context = self._getPushParserContext()
        pctxt = context._c_ctxt

        self._feed_parser_running = 0
        # an empty chunk with the 'terminate' flag set finishes the parse
        if self._for_html:
            htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
        else:
            xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)

        if (pctxt.recovery and not pctxt.disableSAX and
                isinstance(context, _SaxParserContext)):
            # apply any left-over 'end' events
            (<_SaxParserContext>context).flushEvents()

        try:
            result = context._handleParseResult(self, pctxt.myDoc, None)
        finally:
            context.cleanup()

        if isinstance(result, _Document):
            return (<_Document>result).getroot()
        else:
            return result
cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
                             const_char* c_data, int buffer_len,
                             const_char* c_filename, const_char* c_encoding,
                             int parse_options) except -1:
    # Reset a parser context for HTML push parsing.  Returns 0 on success
    # or the non-zero error code of xmlCtxtResetPush().
    cdef xmlparser.xmlParserInput* c_input_stream
    # libxml2 lacks an HTML push parser setup function
    error = xmlparser.xmlCtxtResetPush(
        c_ctxt, c_data, buffer_len, c_filename, c_encoding)
    if error:
        return error

    # fix libxml2 setup for HTML
    c_ctxt.progressive = 1
    # NOTE(review): this statement was reconstructed - confirm against upstream
    c_ctxt.charset = tree.XML_CHAR_ENCODING_NONE
    htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)

    return 0
1448 ############################################################
1450 ############################################################
# Option set every XMLParser starts from; the constructor adds to it or
# removes from it according to its keyword arguments.
cdef int _XML_DEFAULT_PARSE_OPTIONS
_XML_DEFAULT_PARSE_OPTIONS = (
    xmlparser.XML_PARSE_NOENT   |
    xmlparser.XML_PARSE_NOCDATA |
    xmlparser.XML_PARSE_NONET   |
    xmlparser.XML_PARSE_COMPACT |
    xmlparser.XML_PARSE_BIG_LINES
    )
cdef class XMLParser(_FeedParser):
    u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True)

    The XML parser.

    Parsers can be supplied as additional argument to various parse
    functions of the lxml API.  A default parser is always available
    and can be replaced by a call to the global function
    'set_default_parser'.  New parsers can be created at any time
    without a major run-time overhead.

    The keyword arguments in the constructor are mainly based on the
    libxml2 parser configuration.  A DTD will also be loaded if DTD
    validation or attribute default values are requested (unless you
    additionally provide an XMLSchema from which the default
    attributes can be read).

    Available boolean keyword arguments:

    - attribute_defaults - inject default attributes from DTD or XMLSchema
    - dtd_validation     - validate against a DTD referenced by the document
    - load_dtd           - use DTD for parsing
    - no_network         - prevent network access for related files (default: True)
    - ns_clean           - clean up redundant namespace declarations
    - recover            - try hard to parse through broken XML
    - remove_blank_text  - discard blank text nodes that appear ignorable
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
    - resolve_entities   - replace entities by their text value (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads.  While this is
    not harmful, it is more efficient to use separate parsers.  This does not
    apply to the default parser.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, XMLSchema schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 collect_ids=True, target=None, compact=True):
        cdef int parse_options
        parse_options = _XML_DEFAULT_PARSE_OPTIONS

        # options that are off by default: switch them on with '|'
        if load_dtd:
            parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
        if dtd_validation:
            parse_options = parse_options | xmlparser.XML_PARSE_DTDVALID | \
                            xmlparser.XML_PARSE_DTDLOAD
        if attribute_defaults:
            parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR
            if schema is None:
                # without a schema, the defaults must come from the DTD
                parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
        if ns_clean:
            parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN
        if recover:
            parse_options = parse_options | xmlparser.XML_PARSE_RECOVER
        if remove_blank_text:
            parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS
        if huge_tree:
            parse_options = parse_options | xmlparser.XML_PARSE_HUGE

        # options that are part of the defaults: clear them explicitly
        # ('& ~flag' instead of a '^' toggle, so that a flag can never
        # be switched on by accident if the defaults change)
        if not no_network:
            parse_options = parse_options & ~xmlparser.XML_PARSE_NONET
        if not compact:
            parse_options = parse_options & ~xmlparser.XML_PARSE_COMPACT
        if not resolve_entities:
            parse_options = parse_options & ~xmlparser.XML_PARSE_NOENT
        if not strip_cdata:
            parse_options = parse_options & ~xmlparser.XML_PARSE_NOCDATA

        _BaseParser.__init__(self, parse_options, 0, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
cdef class XMLPullParser(XMLParser):
    """XMLPullParser(self, events=None, *, tag=None, **kwargs)

    XML parser that collects parse events in an iterator.

    The collected events are the same as for iterparse(), but the
    parser itself is non-blocking in the sense that it receives
    data chunks incrementally through its .feed() method, instead
    of reading them directly from a file(-like) object all by itself.

    By default, it collects Element end events.  To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        XMLParser.__init__(self, **kwargs)
        if events is None:
            events = ('end',)
        self._setBaseURL(base_url)
        self._collectEvents(events, tag)

    def read_events(self):
        # Returns the events iterator of the push parser context.
        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
cdef class ETCompatXMLParser(XMLParser):
    u"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
                 dtd_validation=False, load_dtd=False, no_network=True, \
                 ns_clean=False, recover=False, schema=None, \
                 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
                 remove_comments=True, remove_pis=True, strip_cdata=True, \
                 target=None, compact=True)

    An XML parser with an ElementTree compatible default setup.

    See the XMLParser class for details.

    This parser has ``remove_comments`` and ``remove_pis`` enabled by default
    and thus ignores comments and processing instructions.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=True, remove_pis=True, strip_cdata=True,
                 target=None, compact=True):
        # Everything is passed through to XMLParser; only the defaults of
        # 'remove_comments' and 'remove_pis' differ.
        XMLParser.__init__(self,
                           attribute_defaults=attribute_defaults,
                           dtd_validation=dtd_validation,
                           load_dtd=load_dtd,
                           no_network=no_network,
                           ns_clean=ns_clean,
                           recover=recover,
                           remove_blank_text=remove_blank_text,
                           huge_tree=huge_tree,
                           compact=compact,
                           resolve_entities=resolve_entities,
                           remove_comments=remove_comments,
                           remove_pis=remove_pis,
                           strip_cdata=strip_cdata,
                           target=target,
                           encoding=encoding,
                           schema=schema)
# ET 1.2 compatible name
XMLTreeBuilder = ETCompatXMLParser


# The pre-installed default parser; set_default_parser() falls back to it
# when called without a parser.
cdef XMLParser __DEFAULT_XML_PARSER
__DEFAULT_XML_PARSER = XMLParser()

__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
def set_default_parser(_BaseParser parser=None):
    u"""set_default_parser(parser=None)

    Set a default parser for the current thread.  This parser is used
    globally whenever no parser is supplied to the various parse functions of
    the lxml API.  If this function is called without a parser (or if it is
    None), the default parser is reset to the original configuration.

    Note that the pre-installed default parser is not thread-safe.  Avoid the
    default parser in multi-threaded environments.  You can create a separate
    parser for each thread explicitly or use a parser pool.
    """
    if parser is None:
        parser = __DEFAULT_XML_PARSER
    __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser)
def get_default_parser():
    u"""get_default_parser()

    Return the default parser as provided by the global parser context.
    """
    return __GLOBAL_PARSER_CONTEXT.getDefaultParser()
1642 ############################################################
1644 ############################################################
# Option set every HTMLParser starts from; the constructor adds to it or
# removes from it according to its keyword arguments.
cdef int _HTML_DEFAULT_PARSE_OPTIONS
_HTML_DEFAULT_PARSE_OPTIONS = (
    htmlparser.HTML_PARSE_RECOVER |
    htmlparser.HTML_PARSE_NONET   |
    htmlparser.HTML_PARSE_COMPACT
    )
cdef class HTMLParser(_FeedParser):
    u"""HTMLParser(self, encoding=None, remove_blank_text=False, \
                   remove_comments=False, remove_pis=False, strip_cdata=True, \
                   no_network=True, target=None, schema: XMLSchema =None, \
                   recover=True, compact=True, collect_ids=True, huge_tree=False)

    The HTML parser.

    This parser allows reading HTML into a normal XML tree.  By
    default, it can read broken (non well-formed) HTML, depending on
    the capabilities of libxml2.  Use the 'recover' option to switch
    this off.

    Available boolean keyword arguments:

    - recover            - try hard to parse through broken HTML (default: True)
    - no_network         - prevent network access for related files (default: True)
    - remove_blank_text  - discard empty text nodes that are ignorable (i.e. not actual text content)
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - default_doctype    - add a default doctype even if it is not found in the HTML (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads for performance
    reasons.
    """
    def __init__(self, *, encoding=None, remove_blank_text=False,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 no_network=True, target=None, XMLSchema schema=None,
                 recover=True, compact=True, default_doctype=True,
                 collect_ids=True, huge_tree=False):
        cdef int parse_options
        parse_options = _HTML_DEFAULT_PARSE_OPTIONS
        if remove_blank_text:
            parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS

        # options that are part of the defaults: clear them explicitly
        if not recover:
            parse_options = parse_options & ~htmlparser.HTML_PARSE_RECOVER
        if not no_network:
            parse_options = parse_options & ~htmlparser.HTML_PARSE_NONET
        if not compact:
            parse_options = parse_options & ~htmlparser.HTML_PARSE_COMPACT

        # NODEFDTD is not part of the defaults, so it has to be switched on
        # ('|' states that directly, where a '^' toggle only worked by luck)
        if not default_doctype:
            parse_options = parse_options | htmlparser.HTML_PARSE_NODEFDTD
        if huge_tree:
            parse_options = parse_options | xmlparser.XML_PARSE_HUGE

        _BaseParser.__init__(self, parse_options, 1, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
# Module-level HTML parser instance with the default configuration.
cdef HTMLParser __DEFAULT_HTML_PARSER
__DEFAULT_HTML_PARSER = HTMLParser()
cdef class HTMLPullParser(HTMLParser):
    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    HTML parser that collects parse events in an iterator.

    The collected events are the same as for iterparse(), but the
    parser itself is non-blocking in the sense that it receives
    data chunks incrementally through its .feed() method, instead
    of reading them directly from a file(-like) object all by itself.

    By default, it collects Element end events.  To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        HTMLParser.__init__(self, **kwargs)
        if events is None:
            events = ('end',)
        self._setBaseURL(base_url)
        self._collectEvents(events, tag)

    def read_events(self):
        # Returns the events iterator of the push parser context.
        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
1747 ############################################################
1748 ## helper functions for document creation
1749 ############################################################
cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    # Parse an in-memory string (bytes or unicode) with the given parser,
    # or with the default parser if 'parser' is None.  Input that is too
    # large for an int sized buffer is parsed through a file-like wrapper.
    cdef char* c_filename
    cdef char* c_text
    cdef Py_ssize_t c_len
    cdef bint is_pep393_string
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    if not filename:
        c_filename = NULL
    else:
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)
    if isinstance(text, unicode):
        is_pep393_string = (
            python.PEP393_ENABLED and python.PyUnicode_IS_READY(text))
        if is_pep393_string:
            # byte length = number of characters * bytes per character
            c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
        else:
            c_len = python.PyUnicode_GET_DATA_SIZE(text)
        if c_len > limits.INT_MAX:
            return (<_BaseParser>parser)._parseDocFromFilelike(
                StringIO(text), filename, None)
        if _UNICODE_ENCODING is NULL and not is_pep393_string:
            # no direct unicode parsing available => go through UTF-8
            text = (<unicode>text).encode('utf8')
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, "UTF-8")
        return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
    else:
        c_len = python.PyBytes_GET_SIZE(text)
        if c_len > limits.INT_MAX:
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, None)
        c_text = _cstr(text)
        return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)
cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    # Parse the named file, using the default parser if none is given.
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8))
cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    # Parse from a file-like object, using the default parser if none is
    # given.  No encoding override is passed on.
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return (<_BaseParser>parser)._parseDocFromFilelike(source, filename, None)
cdef xmlDoc* _newXMLDoc() except NULL:
    # Create an empty XML document that uses the shared parser dictionary.
    # Raises MemoryError if libxml2 cannot allocate the document.
    cdef xmlDoc* result
    result = tree.xmlNewDoc(NULL)
    if result is NULL:
        raise MemoryError()
    if result.encoding is NULL:
        result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result
cdef xmlDoc* _newHTMLDoc() except NULL:
    # Create an empty HTML document that uses the shared parser dictionary.
    # Raises MemoryError if libxml2 cannot allocate the document.
    cdef xmlDoc* result
    result = tree.htmlNewDoc(NULL, NULL)
    if result is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result
cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    # Copy a document, optionally including its content.
    # Raises MemoryError if the copy fails.
    cdef xmlDoc* result
    if recursive:
        # a deep copy may take a while => release the GIL
        with nogil:
            result = tree.xmlCopyDoc(c_doc, recursive)
    else:
        result = tree.xmlCopyDoc(c_doc, 0)
    if result is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result
cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    u"""Recursively copy the document and make c_new_root the new root node.

    Raises MemoryError if either the document or the node copy fails.
    """
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0)  # non recursive
    if result is NULL:
        # same check as in _copyDoc(): never touch a failed copy
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1)  # recursive
    if c_node is NULL:
        # do not leak the freshly created document
        tree.xmlFreeDoc(result)
        raise MemoryError()
    tree.xmlDocSetRootElement(result, c_node)
    _copyTail(c_new_root.next, c_node)
    return result
cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    u"""Recursively copy the element into the document. c_doc is not modified.

    Raises MemoryError if the node copy fails.
    """
    cdef xmlNode* c_root
    c_root = tree.xmlDocCopyNode(c_node, c_doc, 1)  # recursive
    if c_root is NULL:
        raise MemoryError()
    _copyTail(c_node.next, c_root)
    return c_root
1851 ############################################################
1852 ## API level helper functions for _Document creation
1853 ############################################################
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    # Parse 'source' into a _Document.  'source' may be a file name/URL
    # string, an in-memory buffer object (getvalue()/tell()) or any object
    # with a read() method.  Raises TypeError for anything else.
    cdef _Document doc
    if _isString(source):
        # parse the file directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        # fix base URL if requested
        if base_url is not None:
            base_url = _encodeFilenameUTF8(base_url)
            if doc._c_doc.URL is not NULL:
                tree.xmlFree(<char*>doc._c_doc.URL)
            doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
        return doc

    if base_url is not None:
        url = base_url
    else:
        url = _getFilenameForFile(source)

    if hasattr(source, u'getvalue') and hasattr(source, u'tell'):
        # StringIO - reading from start?
        if source.tell() == 0:
            return _parseMemoryDocument(source.getvalue(), url, parser)

    # Support for file-like objects (urlgrabber.urlopen, ...)
    if hasattr(source, u'read'):
        return _parseFilelikeDocument(source, url, parser)

    raise TypeError, f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'"
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    # Parse the file at 'url' and wrap the result in a _Document.
    c_doc = _parseDocFromFile(url, parser)
    return _documentFactory(c_doc, parser)
cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    # Parse a bytes or unicode string and wrap the result in a _Document.
    # Raises ValueError for other input types and for unicode strings that
    # carry an encoding declaration.
    if isinstance(text, unicode):
        if _hasEncodingDeclaration(text):
            raise ValueError(
                u"Unicode strings with encoding declaration are not supported. "
                u"Please use bytes input or XML fragments without declaration.")
    elif not isinstance(text, bytes):
        raise ValueError, u"can only parse strings"
    c_doc = _parseDoc(text, url, parser)
    return _documentFactory(c_doc, parser)
cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    # Parse from a file-like object and wrap the result in a _Document.
    c_doc = _parseDocFromFilelike(source, url, parser)
    return _documentFactory(c_doc, parser)