# Parsers for XML and HTML
3 from lxml.includes cimport xmlparser
4 from lxml.includes cimport htmlparser
class ParseError(LxmlSyntaxError):
    """Syntax error while parsing an XML document.

    For compatibility with ElementTree 1.3 and later.

    The ``position`` property exposes ``(lineno, column)``; the column is
    stored internally as ``offset = column - 1``.
    """
    def __init__(self, message, code, line, column, filename=None):
        super(_ParseError, self).__init__(message)
        self.lineno, self.offset = (line, column - 1)
        # keep the error code that was passed in available to callers
        self.code = code
        self.filename = filename

    @property
    def position(self):
        # convert the stored offset back into the column that was passed in
        return self.lineno, self.offset + 1

    @position.setter
    def position(self, new_pos):
        self.lineno, column = new_pos
        self.offset = column - 1

# Alias used by ParseError.__init__() in its super() call.
cdef object _ParseError = ParseError
class XMLSyntaxError(ParseError):
    """Syntax error while parsing an XML document.
    """
cdef class ParserError(LxmlError):
    """Internal lxml parser error.
    """
cdef class _ParserDictionaryContext:
    # Global parser context to share the string dictionary.
    #
    # This class is a delegate singleton!
    #
    # It creates _ParserDictionaryContext objects for each thread to keep thread state,
    # but those must never be used directly. Always stick to using the static
    # __GLOBAL_PARSER_CONTEXT as defined below the class.
    #

    cdef tree.xmlDict* _c_dict
    cdef _BaseParser _default_parser
    cdef list _implied_parser_contexts

    def __cinit__(self):
        self._c_dict = NULL
        self._implied_parser_contexts = []

    def __dealloc__(self):
        if self._c_dict is not NULL:
            xmlparser.xmlDictFree(self._c_dict)

    cdef void initMainParserContext(self):
        u"""Put the global context into the thread dictionary of the main
        thread. To be called once and only in the main thread."""
        thread_dict = python.PyThreadState_GetDict()
        if thread_dict is not NULL:
            (<dict>thread_dict)[u"_ParserDictionaryContext"] = self

    cdef _ParserDictionaryContext _findThreadParserContext(self):
        u"Find (or create) the _ParserDictionaryContext object for the current thread"
        cdef _ParserDictionaryContext context
        thread_dict = python.PyThreadState_GetDict()
        if thread_dict is NULL:
            # no thread state dict available => fall back to this context
            return self
        d = <dict>thread_dict
        result = python.PyDict_GetItem(d, u"_ParserDictionaryContext")
        if result is not NULL:
            return <object>result
        # first use in this thread => create and register a fresh context
        context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
        d[u"_ParserDictionaryContext"] = context
        return context

    cdef void setDefaultParser(self, _BaseParser parser):
        u"Set the default parser for the current thread"
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._default_parser = parser

    cdef _BaseParser getDefaultParser(self):
        u"Return (or create) the default parser of the current thread"
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        if context._default_parser is None:
            if self._default_parser is None:
                self._default_parser = __DEFAULT_XML_PARSER._copy()
            if context is not self:
                # other contexts get their own copy of this context's default
                context._default_parser = self._default_parser._copy()
        return context._default_parser

    cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
        u"Return the thread-local dict or create a new one if necessary."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        if context._c_dict is NULL:
            # thread dict not yet set up => use default or create a new one
            if default is not NULL:
                context._c_dict = default
                xmlparser.xmlDictReference(default)
                return default
            if self._c_dict is NULL:
                self._c_dict = xmlparser.xmlDictCreate()
            if context is not self:
                context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
        return context._c_dict

    cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref):
        # Replace the dict behind 'c_dict_ref' with the thread dict, adjusting
        # the reference counts of both dicts accordingly.
        c_dict = c_dict_ref[0]
        c_thread_dict = self._getThreadDict(c_dict)
        if c_dict is c_thread_dict:
            return
        if c_dict is not NULL:
            xmlparser.xmlDictFree(c_dict)
        c_dict_ref[0] = c_thread_dict
        xmlparser.xmlDictReference(c_thread_dict)

    cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt):
        u"Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)
        # NOTE(review): restored from a dropped line - confirm against upstream
        pctxt.dictNames = 1

    cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt):
        u"Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)

    cdef void initDocDict(self, xmlDoc* result):
        u"Store dict of last object parsed if no shared dict yet"
        # XXX We also free the result dict here if there already was one.
        # This case should only occur for new documents with empty dicts,
        # otherwise we'd free data that's in use => segfault
        self.initThreadDictRef(&result.dict)

    cdef _ParserContext findImpliedContext(self):
        u"""Return any current implied xml parser context for the current
        thread. This is used when the resolver functions are called
        with an xmlParserCtxt that was generated from within libxml2
        (i.e. without a _ParserContext) - which happens when parsing
        schema and xinclude external references."""
        cdef _ParserDictionaryContext context
        cdef _ParserContext implied_context

        # see if we have a current implied parser
        context = self._findThreadParserContext()
        if context._implied_parser_contexts:
            implied_context = context._implied_parser_contexts[-1]
            return implied_context
        return None

    cdef void pushImpliedContextFromParser(self, _BaseParser parser):
        u"Push a new implied context object taken from the parser."
        if parser is not None:
            self.pushImpliedContext(parser._getParserContext())
        else:
            self.pushImpliedContext(None)

    cdef void pushImpliedContext(self, _ParserContext parser_context):
        u"Push a new implied context object."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._implied_parser_contexts.append(parser_context)

    cdef void popImpliedContext(self):
        u"Pop the current implied context object."
        cdef _ParserDictionaryContext context
        context = self._findThreadParserContext()
        context._implied_parser_contexts.pop()
# Module-wide delegate instance; the per-thread contexts are looked up through it.
cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
# Store the instance in the thread dict of the thread running this module code.
__GLOBAL_PARSER_CONTEXT.initMainParserContext()
############################################################
## support for Python unicode I/O
############################################################
# name of Python unicode encoding as known to libxml2
# (stays NULL until _setupPythonUnicode() finds a matching encoding handler)
cdef const_char* _UNICODE_ENCODING = NULL
cdef int _setupPythonUnicode() except -1:
    u"""Sets _UNICODE_ENCODING to the internal encoding name of Python unicode
    strings if libxml2 supports reading native Python unicode. This depends
    on iconv and the local Python installation, so we simply check if we find
    a matching encoding handler.
    """
    cdef tree.xmlCharEncodingHandler* enchandler
    cdef const_char* enc
    cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>']
    cdef const_xmlChar* buffer = <const_xmlChar*>uchars
    # apparently, libxml2 can't detect UTF-16 on some systems
    if (buffer[0] == c'<' and buffer[1] == c'\0' and
            buffer[2] == c't' and buffer[3] == c'\0'):
        enc = "UTF-16LE"
    elif (buffer[0] == c'\0' and buffer[1] == c'<' and
            buffer[2] == c'\0' and buffer[3] == c't'):
        enc = "UTF-16BE"
    else:
        # let libxml2 give it a try
        enc = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7)
        if enc is NULL:
            # not my fault, it's YOUR broken system :)
            return 0
    enchandler = tree.xmlFindCharEncodingHandler(enc)
    if enchandler is not NULL:
        global _UNICODE_ENCODING
        # the handler was only looked up to test for support => close it again
        tree.xmlCharEncCloseFunc(enchandler)
        _UNICODE_ENCODING = enc
    return 0
cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
    u"Work around bug in libxml2: find iconv name of encoding on our own."
    cdef tree.xmlCharEncoding enc
    enc = tree.xmlDetectCharEncoding(buffer, size)
    if enc == tree.XML_CHAR_ENCODING_UTF16LE:
        # FF FE 00 00 is reported as UTF-16LE but is really a UTF-32LE BOM
        if size >= 4 and (buffer[0] == <const_xmlChar>'\xFF' and
                          buffer[1] == <const_xmlChar>'\xFE' and
                          buffer[2] == 0 and buffer[3] == 0):
            return "UTF-32LE"  # according to BOM
        else:
            return "UTF-16LE"
    elif enc == tree.XML_CHAR_ENCODING_UTF16BE:
        return "UTF-16BE"
    elif enc == tree.XML_CHAR_ENCODING_UCS4LE:
        return "UCS-4LE"
    elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
        return "UCS-4BE"
    elif enc == tree.XML_CHAR_ENCODING_NONE:
        return NULL
    else:
        # returns a constant char*, no need to free it
        return tree.xmlGetCharEncodingName(enc)
# Run the encoding detection while this module-level code executes.
_setupPythonUnicode()
############################################################
## support for file-like objects
############################################################
cdef class _FileReaderContext:
    # Adapter that feeds data from a Python file(-like) object into libxml2.
    cdef object _filelike
    cdef object _encoding
    cdef object _url
    cdef object _bytes
    cdef _ExceptionContext _exc_context
    cdef Py_ssize_t _bytes_read
    cdef char* _c_url
    cdef bint _close_file_after_read

    def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
        self._exc_context = exc_context
        self._filelike = filelike
        self._close_file_after_read = close_file
        self._encoding = encoding
        if url is None:
            self._c_url = NULL
        else:
            url = _encodeFilename(url)
            self._c_url = _cstr(url)
        # keep the encoded URL object alive, _c_url points into it
        self._url = url
        self._bytes = b''
        self._bytes_read = 0

    cdef _close_file(self):
        # Close the file-like object once, if closing was requested.
        if self._filelike is None or not self._close_file_after_read:
            return
        try:
            close = self._filelike.close
        except AttributeError:
            close = None
        finally:
            self._filelike = None
        if close is not None:
            close()

    cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self):
        cdef stdio.FILE* c_stream
        cdef xmlparser.xmlParserInputBuffer* c_buffer
        c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
        c_stream = python.PyFile_AsFile(self._filelike)
        if c_stream is NULL:
            # no C stream available => read through the Python object
            c_buffer.readcallback = _readFilelikeParser
            c_buffer.context = <python.PyObject*>self
        else:
            c_buffer.readcallback = _readFileParser
            c_buffer.context = c_stream
        return c_buffer

    cdef xmlparser.xmlParserInput* _createParserInput(
            self, xmlparser.xmlParserCtxt* ctxt):
        cdef xmlparser.xmlParserInputBuffer* c_buffer
        c_buffer = self._createParserInputBuffer()
        return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0)

    cdef tree.xmlDtd* _readDtd(self):
        cdef xmlparser.xmlParserInputBuffer* c_buffer
        c_buffer = self._createParserInputBuffer()
        with nogil:
            return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0)

    cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
        # Parse a document from the file(-like) object; returns NULL on failure.
        cdef xmlDoc* result
        cdef char* c_encoding
        cdef stdio.FILE* c_stream
        cdef xmlparser.xmlInputReadCallback c_read_callback
        cdef xmlparser.xmlInputCloseCallback c_close_callback
        cdef void* c_callback_context

        if self._encoding is None:
            c_encoding = NULL
        else:
            c_encoding = _cstr(self._encoding)

        c_stream = python.PyFile_AsFile(self._filelike)
        if c_stream is NULL:
            c_read_callback = _readFilelikeParser
            c_callback_context = <python.PyObject*>self
        else:
            c_read_callback = _readFileParser
            c_callback_context = c_stream

        orig_options = ctxt.options
        with nogil:
            if ctxt.html:
                result = htmlparser.htmlCtxtReadIO(
                    ctxt, c_read_callback, NULL, c_callback_context,
                    self._c_url, c_encoding, options)
                if result is not NULL:
                    if _fixHtmlDictNames(ctxt.dict, result) < 0:
                        tree.xmlFreeDoc(result)
                        result = NULL
            else:
                result = xmlparser.xmlCtxtReadIO(
                    ctxt, c_read_callback, NULL, c_callback_context,
                    self._c_url, c_encoding, options)
        ctxt.options = orig_options # work around libxml2 problem
        try:
            self._close_file()
        except:
            self._exc_context._store_raised()
        finally:
            return result  # swallow any exceptions

    cdef int copyToBuffer(self, char* c_buffer, int c_requested):
        # Copy up to 'c_requested' bytes into 'c_buffer'.
        # Returns the number of bytes copied, or -1 if an exception occurred
        # (which is then stored in the exception context).
        cdef int c_byte_count = 0
        cdef char* c_start
        cdef Py_ssize_t byte_count, remaining
        if self._bytes_read < 0:
            # end of input was already reached
            return 0
        try:
            byte_count = python.PyBytes_GET_SIZE(self._bytes)
            remaining = byte_count - self._bytes_read
            while c_requested > remaining:
                # hand out what is left of the current chunk, then read more
                c_start = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_start, remaining)
                c_byte_count += remaining
                c_buffer += remaining
                c_requested -= remaining

                self._bytes = self._filelike.read(c_requested)
                if not isinstance(self._bytes, bytes):
                    if isinstance(self._bytes, unicode):
                        if self._encoding is None:
                            self._bytes = (<unicode>self._bytes).encode('utf8')
                        else:
                            self._bytes = python.PyUnicode_AsEncodedString(
                                self._bytes, _cstr(self._encoding), NULL)
                    else:
                        self._close_file()
                        raise TypeError, \
                            u"reading from file-like objects must return byte strings or unicode strings"

                remaining = python.PyBytes_GET_SIZE(self._bytes)
                if remaining == 0:
                    # empty read => mark end of input
                    self._bytes_read = -1
                    self._close_file()
                    return c_byte_count
                self._bytes_read = 0

            if c_requested > 0:
                c_start = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_start, c_requested)
                c_byte_count += c_requested
                self._bytes_read += c_requested
        except:
            c_byte_count = -1
            self._exc_context._store_raised()
            try:
                self._close_file()
            except:
                self._exc_context._store_raised()
        finally:
            return c_byte_count  # swallow any exceptions
cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with gil:
    # Read callback: 'ctxt' is a _FileReaderContext, delegate the read to it.
    cdef _FileReaderContext reader = <_FileReaderContext>ctxt
    return reader.copyToBuffer(c_buffer, c_size)
cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil:
    # Read callback: 'ctxt' is a C FILE* stream, read from it with fread().
    cdef stdio.FILE* c_stream = <stdio.FILE*>ctxt
    return stdio.fread(c_buffer, 1, c_size, c_stream)
############################################################
## support for custom document loaders
############################################################
cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
                                               xmlparser.xmlParserCtxt* c_context) with gil:
    # External entity loader installed into libxml2. Asks the registered
    # resolvers of the current parser context first and falls back to the
    # original libxml2 loader (__DEFAULT_ENTITY_LOADER) otherwise.
    cdef _ResolverContext context
    cdef xmlparser.xmlParserInput* c_input
    cdef _InputDocument doc_ref
    cdef _FileReaderContext file_context
    # if there is no _ParserContext associated with the xmlParserCtxt
    # passed, check to see if the thread state object has an implied
    # context.
    if c_context._private is not NULL:
        context = <_ResolverContext>c_context._private
    else:
        context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()

    if context is None:
        if __DEFAULT_ENTITY_LOADER is NULL:
            return NULL
        with nogil:
            # free the GIL as we might do serious I/O here (e.g. HTTP)
            c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
        return c_input

    try:
        if c_url is NULL:
            url = None
        else:
            # parsing a related document (DTD etc.) => UTF-8 encoded URL?
            url = _decodeFilename(<const_xmlChar*>c_url)
        if c_pubid is NULL:
            pubid = None
        else:
            pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8

        doc_ref = context._resolvers.resolve(url, pubid, context)
    except:
        context._store_raised()
        return NULL

    if doc_ref is not None:
        if doc_ref._type == PARSER_DATA_STRING:
            data = doc_ref._data_bytes
            filename = doc_ref._filename
            if not filename:
                filename = None
            elif not isinstance(filename, bytes):
                # most likely a text URL
                filename = filename.encode('utf8')
                if not isinstance(filename, bytes):
                    filename = None

            c_input = xmlparser.xmlNewInputStream(c_context)
            if c_input is not NULL:
                if filename is not None:
                    c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
                # the input points directly into the bytes object 'data'
                c_input.base = _xcstr(data)
                c_input.length = python.PyBytes_GET_SIZE(data)
                c_input.cur = c_input.base
                c_input.end = c_input.base + c_input.length
        elif doc_ref._type == PARSER_DATA_FILENAME:
            data = None
            c_filename = _cstr(doc_ref._filename)
            with nogil:
                # free the GIL as we might do serious I/O here
                c_input = xmlparser.xmlNewInputFromFile(
                    c_context, c_filename)
        elif doc_ref._type == PARSER_DATA_FILE:
            file_context = _FileReaderContext(doc_ref._file, context, url,
                                              None, doc_ref._close_file)
            c_input = file_context._createParserInput(c_context)
            data = file_context
        else:
            data = None
            c_input = NULL

        if data is not None:
            # keep a reference to the object that backs the input
            context._storage.add(data)
        if c_input is not NULL:
            return c_input

    if __DEFAULT_ENTITY_LOADER is NULL:
        return NULL

    with nogil:
        # free the GIL as we might do serious I/O here (e.g. HTTP)
        c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
    return c_input
# Remember the loader that was active before ours gets installed;
# _local_resolver() calls it as its fallback.
cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()

# Install _local_resolver() as the external entity loader.
xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
############################################################
## Parsers
############################################################
@cython.no_gc_clear  # May have to call "self._validator.disconnect()" on dealloc.
cdef class _ParserContext(_ResolverContext):
    # Owns the libxml2 parser context together with its error log, the
    # optional schema validator and an optional lock.
    cdef _ErrorLog _error_log
    cdef _ParserSchemaValidationContext _validator
    cdef xmlparser.xmlParserCtxt* _c_ctxt
    cdef python.PyThread_type_lock _lock
    cdef _Document _doc
    cdef bint _collect_ids

    def __cinit__(self):
        self._c_ctxt = NULL
        self._collect_ids = True
        if not config.ENABLE_THREADING:
            self._lock = NULL
        else:
            self._lock = python.PyThread_allocate_lock()
        self._error_log = _ErrorLog()

    def __dealloc__(self):
        if config.ENABLE_THREADING and self._lock is not NULL:
            python.PyThread_free_lock(self._lock)
            self._lock = NULL
        if self._c_ctxt is not NULL:
            if <void*>self._validator is not NULL and self._validator is not None:
                # If the parser was not closed correctly (e.g. interrupted iterparse()),
                # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
                # validator plug might still be in place, which will make xmlFreeParserCtxt()
                # crash when trying to xmlFree() a static SAX handler.
                # Thus, make sure we disconnect the handler interceptor here at the latest.
                self._validator.disconnect()
            xmlparser.xmlFreeParserCtxt(self._c_ctxt)

    cdef _ParserContext _copy(self):
        # New context of the same class; it gets no libxml2 context (NULL).
        cdef _ParserContext context
        context = self.__class__()
        context._collect_ids = self._collect_ids
        context._validator = self._validator.copy()
        _initParserContext(context, self._resolvers._copy(), NULL)
        return context

    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
        self._c_ctxt = c_ctxt
        # back-reference, read by the libxml2 callbacks via '_private'
        c_ctxt._private = <void*>self

    cdef void _resetParserContext(self):
        if self._c_ctxt is not NULL:
            if self._c_ctxt.html:
                htmlparser.htmlCtxtReset(self._c_ctxt)
                self._c_ctxt.disableSAX = 0 # work around bug in libxml2
            else:
                xmlparser.xmlClearParserCtxt(self._c_ctxt)

    cdef int prepare(self) except -1:
        # Acquire the lock (if any) and set up error reporting and validation.
        cdef int result
        if config.ENABLE_THREADING and self._lock is not NULL:
            with nogil:
                result = python.PyThread_acquire_lock(
                    self._lock, python.WAIT_LOCK)
            if result == 0:
                raise ParserError, u"parser locking failed"
        self._error_log.clear()
        self._doc = None
        self._c_ctxt.sax.serror = _receiveParserError
        if self._validator is not None:
            self._validator.connect(self._c_ctxt, self._error_log)
        return 0

    cdef int cleanup(self) except -1:
        # Counterpart of prepare(): reset the state and release the lock.
        if self._validator is not None:
            self._validator.disconnect()
        self._resetParserContext()
        self.clear()
        self._doc = None
        self._c_ctxt.sax.serror = NULL
        if config.ENABLE_THREADING and self._lock is not NULL:
            python.PyThread_release_lock(self._lock)
        return 0

    cdef object _handleParseResult(self, _BaseParser parser,
                                   xmlDoc* result, filename):
        c_doc = self._handleParseResultDoc(parser, result, filename)
        if self._doc is not None and self._doc._c_doc is c_doc:
            # the document already has a proxy => reuse it
            return self._doc
        else:
            return _documentFactory(c_doc, parser)

    cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
                                       xmlDoc* result, filename) except NULL:
        recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
        return _handleParseResult(self, self._c_ctxt, result,
                                  filename, recover,
                                  free_doc=self._doc is None)
cdef _initParserContext(_ParserContext context,
                        _ResolverRegistry resolvers,
                        xmlparser.xmlParserCtxt* c_ctxt):
    # Initialise the resolver part first, then attach the libxml2 parser
    # context if one was passed.
    _initResolverContext(context, resolvers)
    if c_ctxt is NULL:
        return
    context._initParserContext(c_ctxt)
cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, xmlerror.xmlError* error) with gil:
    # Hand the error to the log of the _ParserContext stored in '_private'.
    cdef _ParserContext context = <_ParserContext>_parser_context._private
    context._error_log._receive(error)
cdef void _receiveParserError(void* c_context, xmlerror.xmlError* error) nogil:
    # Structured error callback installed as 'sax.serror'.
    # NOTE(review): the '__DEBUG' guard is restored from a dropped line and
    # refers to a module-level flag defined outside this section - confirm.
    if __DEBUG:
        if c_context is NULL or (<xmlparser.xmlParserCtxt*>c_context)._private is NULL:
            # no owning _ParserContext => pass NULL as the context
            _forwardError(NULL, error)
        else:
            _forwardParserError(<xmlparser.xmlParserCtxt*>c_context, error)
cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
                          _ErrorLog error_log) except -1:
    # Always raises: IOError for I/O level errors when a filename is given,
    # otherwise an XMLSyntaxError built from the error log or the last
    # libxml2 error.
    if filename is not None and \
           ctxt.lastError.domain == xmlerror.XML_FROM_IO:
        if isinstance(filename, bytes):
            filename = _decodeFilenameWithLength(
                <bytes>filename, len(<bytes>filename))
        if ctxt.lastError.message is not NULL:
            try:
                message = ctxt.lastError.message.decode('utf-8')
            except UnicodeDecodeError:
                # the filename may be in there => play it safe
                message = ctxt.lastError.message.decode('iso8859-1')
            message = f"Error reading file '{filename}': {message.strip()}"
        else:
            message = f"Error reading '{filename}'"
        raise IOError, message
    elif error_log:
        raise error_log._buildParseException(
            XMLSyntaxError, u"Document is not well formed")
    elif ctxt.lastError.message is not NULL:
        message = ctxt.lastError.message.strip()
        code = ctxt.lastError.code
        line = ctxt.lastError.line
        column = ctxt.lastError.int2
        if ctxt.lastError.line > 0:
            message = f"line {line}: {message}"
        raise XMLSyntaxError(message, code, line, column, filename)
    else:
        raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                             filename)
cdef xmlDoc* _handleParseResult(_ParserContext context,
                                xmlparser.xmlParserCtxt* c_ctxt,
                                xmlDoc* result, filename,
                                bint recover, bint free_doc) except NULL:
    # Check the outcome of a parser run. Returns the document on success;
    # otherwise frees it (if 'free_doc' is set) and raises the stored or a
    # newly built exception.
    cdef bint well_formed
    if result is not NULL:
        __GLOBAL_PARSER_CONTEXT.initDocDict(result)

    if c_ctxt.myDoc is not NULL:
        if c_ctxt.myDoc is not result:
            # the context holds a different document => free it
            __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
            tree.xmlFreeDoc(c_ctxt.myDoc)
        c_ctxt.myDoc = NULL

    if result is not NULL:
        if (context._validator is not None and
                not context._validator.isvalid()):
            well_formed = 0  # actually not 'valid', but anyway ...
        elif (not c_ctxt.wellFormed and not c_ctxt.html and
                c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
                [1 for error in context._error_log
                 if error.type == ErrorTypes.ERR_INVALID_CHAR]):
            # An encoding error occurred and libxml2 switched from UTF-8
            # input to (undecoded) Latin-1, at some arbitrary point in the
            # document. Better raise an error than allowing for a broken
            # tree with mixed encodings.
            well_formed = 0
        elif recover or (c_ctxt.wellFormed and
                         c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
            well_formed = 1
        elif not c_ctxt.replaceEntities and not c_ctxt.validate \
                 and context is not None:
            # in this mode, we ignore errors about undefined entities
            for error in context._error_log.filter_from_errors():
                if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                       error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                    well_formed = 0
                    break
            else:
                well_formed = 1
        else:
            well_formed = 0

        if not well_formed:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL

    if context is not None and context._has_raised():
        # an exception was stored during parsing => drop the result, re-raise
        if result is not NULL:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL
        context._raise_if_stored()

    if result is NULL:
        if context is not None:
            _raiseParseError(c_ctxt, filename, context._error_log)
        else:
            _raiseParseError(c_ctxt, filename, None)
    else:
        if result.URL is NULL and filename is not None:
            result.URL = tree.xmlStrdup(_xcstr(filename))
        if result.encoding is NULL:
            result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")

    if context._validator is not None and \
           context._validator._add_default_attributes:
        # we currently need to do this here as libxml2 does not
        # support inserting default attributes during parse-time
        # validation
        context._validator.inject_default_attributes(result)

    return result
cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
    # Move the names of all elements in the document (and of their
    # attributes) into 'c_dict'. Returns 0 on success, -1 on failure.
    cdef xmlNode* c_node
    if c_doc is NULL:
        return 0
    c_node = c_doc.children
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0
cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
                                  xmlNode* c_start_node) nogil:
    """
    Move names to the dict, iterating in document order, starting at
    c_start_node. This is used in incremental parsing after each chunk.
    """
    cdef xmlNode* c_node
    if not c_doc:
        return 0
    if not c_start_node:
        # no start node given => process the whole document
        return _fixHtmlDictNames(c_dict, c_doc)
    c_node = c_start_node
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0
cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
                                      xmlNode* c_node) nogil:
    # Replace the name of the node and of each of its attributes by the
    # string that the dict lookup returns. Returns 0 on success, -1 if a
    # lookup fails.
    cdef xmlNode* c_attr
    c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
    if c_name is NULL:
        return -1
    if c_name is not c_node.name:
        # the lookup returned a different string => free the old name
        tree.xmlFree(<char*>c_node.name)
        c_node.name = c_name
    c_attr = <xmlNode*>c_node.properties
    while c_attr is not NULL:
        c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1)
        if c_name is NULL:
            return -1
        if c_name is not c_attr.name:
            tree.xmlFree(<char*>c_attr.name)
            c_attr.name = c_name
        c_attr = c_attr.next
    return 0
cdef class _BaseParser:
    # Common configuration and context handling of the XML and HTML parsers.
    cdef ElementClassLookup _class_lookup
    cdef _ResolverRegistry _resolvers
    cdef _ParserContext _parser_context
    cdef _ParserContext _push_parser_context
    cdef int _parse_options
    cdef bint _for_html
    cdef bint _remove_comments
    cdef bint _remove_pis
    cdef bint _strip_cdata
    cdef bint _collect_ids
    cdef XMLSchema _schema
    cdef bytes _filename
    cdef readonly object target
    cdef object _default_encoding
    cdef tuple _events_to_collect  # (event_types, tag)

    def __init__(self, int parse_options, bint for_html, XMLSchema schema,
                 remove_comments, remove_pis, strip_cdata, collect_ids,
                 target, encoding):
        cdef tree.xmlCharEncodingHandler* enchandler
        if not isinstance(self, (XMLParser, HTMLParser)):
            raise TypeError, u"This class cannot be instantiated"

        self._parse_options = parse_options
        self.target = target
        self._for_html = for_html
        self._remove_comments = remove_comments
        self._remove_pis = remove_pis
        self._strip_cdata = strip_cdata
        self._collect_ids = collect_ids
        self._schema = schema

        self._resolvers = _ResolverRegistry()

        if encoding is None:
            self._default_encoding = None
        else:
            # only accept encodings that libxml2 has a handler for
            encoding = _utf8(encoding)
            enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding))
            if enchandler is NULL:
                raise LookupError, f"unknown encoding: '{encoding}'"
            tree.xmlCharEncCloseFunc(enchandler)
            self._default_encoding = encoding

    cdef _setBaseURL(self, base_url):
        self._filename = _encodeFilename(base_url)

    cdef _collectEvents(self, event_types, tag):
        if event_types is None:
            event_types = ()
        else:
            event_types = tuple(set(event_types))
            _buildParseEventFilter(event_types)  # purely for validation
        self._events_to_collect = (event_types, tag)

    cdef _ParserContext _getParserContext(self):
        # Create the parser context on first use.
        cdef xmlparser.xmlParserCtxt* pctxt
        if self._parser_context is None:
            self._parser_context = self._createContext(self.target, None)
            self._parser_context._collect_ids = self._collect_ids
            if self._schema is not None:
                self._parser_context._validator = \
                    self._schema._newSaxValidator(
                        self._parse_options & xmlparser.XML_PARSE_DTDATTR)
            pctxt = self._newParserCtxt()
            _initParserContext(self._parser_context, self._resolvers, pctxt)
            self._configureSaxContext(pctxt)
        return self._parser_context

    cdef _ParserContext _getPushParserContext(self):
        # Create the push parser context on first use.
        cdef xmlparser.xmlParserCtxt* pctxt
        if self._push_parser_context is None:
            self._push_parser_context = self._createContext(
                self.target, self._events_to_collect)
            self._push_parser_context._collect_ids = self._collect_ids
            if self._schema is not None:
                self._push_parser_context._validator = \
                    self._schema._newSaxValidator(
                        self._parse_options & xmlparser.XML_PARSE_DTDATTR)
            pctxt = self._newPushParserCtxt()
            _initParserContext(
                self._push_parser_context, self._resolvers, pctxt)
            self._configureSaxContext(pctxt)
        return self._push_parser_context

    cdef _ParserContext _createContext(self, target, events_to_collect):
        # Pick the context class based on whether a target or events are used.
        cdef _SaxParserContext sax_context
        if target is not None:
            sax_context = _TargetParserContext(self)
            (<_TargetParserContext>sax_context)._setTarget(target)
        elif events_to_collect:
            sax_context = _SaxParserContext(self)
        else:
            # nothing special to configure
            return _ParserContext()
        if events_to_collect:
            events, tag = events_to_collect
            sax_context._setEventFilter(events, tag)
        return sax_context

    cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
        # Unset the SAX callbacks for content this parser should drop.
        if self._remove_comments:
            pctxt.sax.comment = NULL
        if self._remove_pis:
            pctxt.sax.processingInstruction = NULL
        if self._strip_cdata:
            # hard switch-off for CDATA nodes => makes them plain text
            pctxt.sax.cdataBlock = NULL

    cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
        cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
        if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
            # need to extend SAX1 context to SAX2 to get proper error reports
            if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
                # the default handler must not be modified => work on a copy
                sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
                if sax is NULL:
                    raise MemoryError()
                cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
                                 sizeof(htmlparser.htmlDefaultSAXHandler))
                c_ctxt.sax = sax
            sax.initialized = xmlparser.XML_SAX2_MAGIC
            sax.serror = _receiveParserError
            sax.startElementNs = NULL
            sax.endElementNs = NULL
            sax._private = NULL
        return 0

    cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
        cdef xmlparser.xmlParserCtxt* c_ctxt
        if self._for_html:
            c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
            if c_ctxt is not NULL:
                self._registerHtmlErrorHandler(c_ctxt)
        else:
            c_ctxt = xmlparser.xmlNewParserCtxt()
        if c_ctxt is NULL:
            raise MemoryError
        c_ctxt.sax.startDocument = _initSaxDocument
        return c_ctxt

    cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
        cdef xmlparser.xmlParserCtxt* c_ctxt
        cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
        if self._for_html:
            c_ctxt = htmlparser.htmlCreatePushParserCtxt(
                NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
            if c_ctxt is not NULL:
                self._registerHtmlErrorHandler(c_ctxt)
                htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
        else:
            c_ctxt = xmlparser.xmlCreatePushParserCtxt(
                NULL, NULL, NULL, 0, c_filename)
            if c_ctxt is not NULL:
                xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
        if c_ctxt is NULL:
            raise MemoryError()
        c_ctxt.sax.startDocument = _initSaxDocument
        return c_ctxt

    @property
    def error_log(self):
        """The error log of the last parser run.
        """
        cdef _ParserContext context
        context = self._getParserContext()
        return context._error_log.copy()

    @property
    def resolvers(self):
        """The custom resolver registry of this parser."""
        return self._resolvers

    @property
    def version(self):
        """The version of the underlying XML parser."""
        return u"libxml2 %d.%d.%d" % LIBXML_VERSION

    def setElementClassLookup(self, ElementClassLookup lookup = None):
        u":deprecated: use ``parser.set_element_class_lookup(lookup)`` instead."
        self.set_element_class_lookup(lookup)

    def set_element_class_lookup(self, ElementClassLookup lookup = None):
        u"""set_element_class_lookup(self, lookup = None)

        Set a lookup scheme for element classes generated from this parser.

        Reset it by passing None or nothing.
        """
        self._class_lookup = lookup

    cdef _BaseParser _copy(self):
        u"Create a new parser with the same configuration."
        cdef _BaseParser parser
        parser = self.__class__()
        parser._parse_options = self._parse_options
        parser._for_html = self._for_html
        parser._remove_comments = self._remove_comments
        parser._remove_pis = self._remove_pis
        parser._strip_cdata = self._strip_cdata
        # also carry over the ID collection setting, it is part of the
        # configuration (and _ParserContext._copy() copies it as well)
        parser._collect_ids = self._collect_ids
        parser._filename = self._filename
        parser._resolvers = self._resolvers
        parser.target = self.target
        parser._class_lookup = self._class_lookup
        parser._default_encoding = self._default_encoding
        parser._schema = self._schema
        parser._events_to_collect = self._events_to_collect
        return parser

    def copy(self):
        u"""copy(self)

        Create a new parser with the same configuration.
        """
        return self._copy()

    def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
        u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)

        Creates a new element associated with this parser.
        """
        return _makeElement(_tag, NULL, None, self, None, None,
                            attrib, nsmap, _extra)

    # internal parser methods

    cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
        u"""Parse unicode document, share dictionary if possible.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef int buffer_len, c_kind
        cdef const_char* c_text
        cdef const_char* c_encoding = _UNICODE_ENCODING
        cdef bint is_pep393_string = (
            python.PEP393_ENABLED and python.PyUnicode_IS_READY(utext))
        if is_pep393_string:
            # parse straight from the string's internal buffer; the encoding
            # name and the byte length depend on the character width
            c_text = <const_char*>python.PyUnicode_DATA(utext)
            py_buffer_len = python.PyUnicode_GET_LENGTH(utext)
            c_kind = python.PyUnicode_KIND(utext)
            if c_kind == 1:
                c_encoding = 'ISO-8859-1'
            elif c_kind == 2:
                py_buffer_len *= 2
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UTF-16BE'  # actually UCS-2
                else:
                    c_encoding = 'UTF-16LE'  # actually UCS-2
            elif c_kind == 4:
                py_buffer_len *= 4
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UCS-4BE'
                else:
                    c_encoding = 'UCS-4LE'
            else:
                assert False, f"Illegal Unicode kind {c_kind}"
        else:
            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
            c_text = python.PyUnicode_AS_DATA(utext)
        # the libxml2 calls below take an int length
        assert 0 <= py_buffer_len <= limits.INT_MAX
        buffer_len = py_buffer_len

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
            orig_options = pctxt.options
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
                    if result is not NULL:
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
            pctxt.options = orig_options # work around libxml2 problem

            return context._handleParseResultDoc(self, result, None)
        finally:
            # always undo prepare(), including the lock release
            context.cleanup()
cdef xmlDoc* _parseDoc(self, char* c_text, int c_len,
                       char* c_filename) except NULL:
    u"""Parse document, share dictionary if possible.

    ``c_text``/``c_len`` describe an encoded byte buffer.  If the parser
    has no default encoding, UTF-32 input is detected here (by BOM or by
    libxml2's encoding detection) and the encoding is passed explicitly.
    """
    cdef _ParserContext context
    cdef xmlDoc* result
    cdef xmlparser.xmlParserCtxt* pctxt
    cdef char* c_encoding
    cdef tree.xmlCharEncoding enc
    context = self._getParserContext()
    context.prepare()
    try:
        pctxt = context._c_ctxt
        __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        if self._default_encoding is None:
            c_encoding = NULL
            # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs
            # NOTE: limit to problematic cases because it changes character offsets
            if c_len >= 4 and (c_text[0] == '\xFF' and c_text[1] == '\xFE' and
                               c_text[2] == 0 and c_text[3] == 0):
                c_encoding = "UTF-32LE"
                # skip the BOM
                c_text += 4
                c_len -= 4
            elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and
                                 c_text[2] == '\xFE' and c_text[3] == '\xFF'):
                c_encoding = "UTF-32BE"
                # skip the BOM
                c_text += 4
                c_len -= 4
            else:
                # no BOM => try to determine encoding
                enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len)
                if enc == tree.XML_CHAR_ENCODING_UCS4LE:
                    c_encoding = 'UTF-32LE'
                elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
                    c_encoding = 'UTF-32BE'
        else:
            c_encoding = _cstr(self._default_encoding)

        orig_options = pctxt.options
        with nogil:
            if self._for_html:
                result = htmlparser.htmlCtxtReadMemory(
                    pctxt, c_text, c_len, c_filename,
                    c_encoding, self._parse_options)
                if result is not NULL:
                    if _fixHtmlDictNames(pctxt.dict, result) < 0:
                        tree.xmlFreeDoc(result)
                        # never hand a freed document to the result handler
                        result = NULL
            else:
                result = xmlparser.xmlCtxtReadMemory(
                    pctxt, c_text, c_len, c_filename,
                    c_encoding, self._parse_options)
        pctxt.options = orig_options # work around libxml2 problem

        return context._handleParseResultDoc(self, result, None)
    finally:
        context.cleanup()
cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
    u"""Parse the file named ``c_filename`` with this parser's options.

    Uses the parser's default encoding if one is set, otherwise leaves
    encoding detection to libxml2 (NULL encoding).
    """
    cdef _ParserContext context
    cdef xmlDoc* result
    cdef xmlparser.xmlParserCtxt* pctxt
    cdef char* c_encoding
    result = NULL

    context = self._getParserContext()
    context.prepare()
    try:
        pctxt = context._c_ctxt
        __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        if self._default_encoding is None:
            c_encoding = NULL
        else:
            c_encoding = _cstr(self._default_encoding)

        orig_options = pctxt.options
        with nogil:
            if self._for_html:
                result = htmlparser.htmlCtxtReadFile(
                    pctxt, c_filename, c_encoding, self._parse_options)
                if result is not NULL:
                    if _fixHtmlDictNames(pctxt.dict, result) < 0:
                        tree.xmlFreeDoc(result)
                        # never hand a freed document to the result handler
                        result = NULL
            else:
                result = xmlparser.xmlCtxtReadFile(
                    pctxt, c_filename, c_encoding, self._parse_options)
        pctxt.options = orig_options # work around libxml2 problem

        return context._handleParseResultDoc(self, result, c_filename)
    finally:
        context.cleanup()
cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename,
                                   encoding) except NULL:
    u"""Parse a document by reading from the file-like object ``filelike``.

    ``encoding`` overrides the parser's default encoding when given.
    An empty ``filename`` is treated like None.
    """
    cdef _ParserContext context
    cdef _FileReaderContext file_context
    cdef xmlDoc* result
    cdef xmlparser.xmlParserCtxt* pctxt
    if not filename:
        filename = None

    context = self._getParserContext()
    context.prepare()
    try:
        pctxt = context._c_ctxt
        __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
        file_context = _FileReaderContext(
            filelike, context, filename,
            encoding or self._default_encoding)
        result = file_context._readDoc(pctxt, self._parse_options)

        return context._handleParseResultDoc(
            self, result, filename)
    finally:
        context.cleanup()
cdef void _initSaxDocument(void* ctxt) with gil:
    # SAX "start document" hook: runs libxml2's default handler, then
    # attaches the parser dict and configures XML ID collection on the doc.
    xmlparser.xmlSAX2StartDocument(ctxt)
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    c_doc = c_ctxt.myDoc

    # set up document dict
    if c_doc and c_ctxt.dict and not c_doc.dict:
        # I have no idea why libxml2 disables this - we need it
        c_ctxt.dictNames = 1
        c_doc.dict = c_ctxt.dict
        xmlparser.xmlDictReference(c_ctxt.dict)

    # set up XML ID hash table
    if c_ctxt._private:
        context = <_ParserContext>c_ctxt._private
        if context._collect_ids:
            # keep the global parser dict from filling up with XML IDs
            if c_doc and not c_doc.ids:
                # memory errors are not fatal here
                c_dict = xmlparser.xmlDictCreate()
                if c_dict:
                    c_doc.ids = tree.xmlHashCreateDict(0, c_dict)
                    # drop our own reference to the dict again
                    xmlparser.xmlDictFree(c_dict)
                else:
                    c_doc.ids = tree.xmlHashCreate(0)
        else:
            c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
            if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids):
                # already initialised but empty => clear
                tree.xmlHashFree(c_doc.ids, NULL)
                # do not leave a dangling pointer to the freed table
                c_doc.ids = NULL
############################################################
## ET feed parser
############################################################
cdef class _FeedParser(_BaseParser):
    # Adds the incremental feed()/close() interface on top of _BaseParser.
    # True between the first feed() call and close() or a parse failure.
    cdef bint _feed_parser_running

    @property
    def feed_error_log(self):
        """The error log of the last (or current) run of the feed parser.

        Note that this is local to the feed parser and thus is
        different from what the ``error_log`` property returns.
        """
        return self._getPushParserContext()._error_log.copy()

    cpdef feed(self, data):
        u"""feed(self, data)

        Feeds data to the parser.  The argument should be an 8-bit string
        buffer containing encoded data, although Unicode is supported as long
        as both string types are not mixed.

        This is the main entry point to the consumer interface of a
        parser.  The parser will parse as much of the XML stream as it
        can on each call.  To finish parsing or to reset the parser,
        call the ``close()`` method.  Both methods may raise
        ParseError if errors occur in the input data.  If an error is
        raised, there is no longer a need to call ``close()``.

        The feed parser interface is independent of the normal parser
        usage.  You can use the same parser as a feed parser and in
        the ``parse()`` function concurrently.
        """
        cdef _ParserContext context
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef const_char* c_data
        cdef const_char* c_encoding
        cdef int buffer_len
        cdef int error
        cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
        if isinstance(data, bytes):
            if self._default_encoding is None:
                c_encoding = NULL
            else:
                c_encoding = self._default_encoding
            c_data = _cstr(data)
            py_buffer_len = python.PyBytes_GET_SIZE(data)
        elif isinstance(data, unicode):
            if _UNICODE_ENCODING is NULL:
                raise ParserError, \
                    u"Unicode parsing is not supported on this platform"
            c_encoding = _UNICODE_ENCODING
            c_data = python.PyUnicode_AS_DATA(data)
            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data)
        else:
            raise TypeError, u"Parsing requires string data"

        context = self._getPushParserContext()
        pctxt = context._c_ctxt
        error = 0
        if not self._feed_parser_running:
            # first chunk of a new run: (re-)initialise the push parser
            context.prepare()
            self._feed_parser_running = 1
            c_filename = (_cstr(self._filename)
                          if self._filename is not None else NULL)

            # We have to give *mlCtxtResetPush() enough input to figure
            # out the character encoding (at least four bytes),
            # however if we give it all we got, we'll have nothing for
            # *mlParseChunk() and things go wrong.
            buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len
            if self._for_html:
                error = _htmlCtxtResetPush(
                    pctxt, c_data, buffer_len, c_filename, c_encoding,
                    self._parse_options)
            else:
                xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
                error = xmlparser.xmlCtxtResetPush(
                    pctxt, c_data, buffer_len, c_filename, c_encoding)
            py_buffer_len -= buffer_len
            c_data += buffer_len
            if error:
                raise MemoryError()
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding

        fixup_error = 0
        while py_buffer_len > 0 and (error == 0 or recover):
            with nogil:
                # libxml2 takes int lengths => feed at most INT_MAX per call
                if py_buffer_len > limits.INT_MAX:
                    buffer_len = limits.INT_MAX
                else:
                    buffer_len = <int>py_buffer_len
                if self._for_html:
                    c_node = pctxt.node  # last node where the parser stopped
                    error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0)
                    # and now for the fun part: move node names to the dict
                    if pctxt.myDoc:
                        fixup_error = _fixHtmlDictSubtreeNames(
                            pctxt.dict, pctxt.myDoc, c_node)
                        if pctxt.myDoc.dict and pctxt.myDoc.dict is not pctxt.dict:
                            xmlparser.xmlDictFree(pctxt.myDoc.dict)
                            pctxt.myDoc.dict = pctxt.dict
                            xmlparser.xmlDictReference(pctxt.dict)
                else:
                    error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0)
                py_buffer_len -= buffer_len
                c_data += buffer_len

            if fixup_error:
                context.store_exception(MemoryError())

            if context._has_raised():
                # propagate Python exceptions immediately
                recover = 0
                error = 1
                break

            if error and not pctxt.replaceEntities and not pctxt.validate:
                # in this mode, we ignore errors about undefined entities
                for entry in context._error_log.filter_from_errors():
                    if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                           entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                        break
                else:
                    # only undeclared-entity errors were logged
                    error = 0

        if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised():
            # propagate Python exceptions immediately
            recover = 0
            error = 1

        if fixup_error or (not recover and (error or not pctxt.wellFormed)):
            # the run failed: end it here and let the context raise
            self._feed_parser_running = 0
            try:
                context._handleParseResult(self, pctxt.myDoc, None)
            finally:
                context.cleanup()

    cpdef close(self):
        u"""close(self)

        Terminates feeding data to this parser.  This tells the parser to
        process any remaining data in the feed buffer, and then returns the
        root Element of the tree that was parsed.

        This method must be called after passing the last chunk of data into
        the ``feed()`` method.  It should only be called when using the feed
        parser interface, all other usage is undefined.
        """
        if not self._feed_parser_running:
            raise XMLSyntaxError(u"no element found",
                                 xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                                 self._filename)

        context = self._getPushParserContext()
        pctxt = context._c_ctxt

        self._feed_parser_running = 0
        # an empty chunk with terminate=1 finishes the push parser run
        if self._for_html:
            htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
        else:
            xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)

        if (pctxt.recovery and not pctxt.disableSAX and
                isinstance(context, _SaxParserContext)):
            # apply any left-over 'end' events
            (<_SaxParserContext>context).flushEvents()

        try:
            result = context._handleParseResult(self, pctxt.myDoc, None)
        finally:
            context.cleanup()

        if isinstance(result, _Document):
            return (<_Document>result).getroot()
        else:
            return result
cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
                             const_char* c_data, int buffer_len,
                             const_char* c_filename, const_char* c_encoding,
                             int parse_options) except -1:
    # Reset a parser context for HTML push parsing.
    # Returns 0 on success, or the non-zero error code of xmlCtxtResetPush().
    # libxml2 lacks an HTML push parser setup function
    error = xmlparser.xmlCtxtResetPush(
        c_ctxt, c_data, buffer_len, c_filename, c_encoding)
    if error:
        return error

    # fix libxml2 setup for HTML
    c_ctxt.progressive = 1
    c_ctxt.charset = tree.XML_CHAR_ENCODING_NONE
    htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)

    return 0
############################################################
## XML parser
############################################################
# Option bits that every XMLParser starts from; XMLParser.__init__()
# adds to or removes from this set according to its keyword arguments.
cdef int _XML_DEFAULT_PARSE_OPTIONS
_XML_DEFAULT_PARSE_OPTIONS = (
    xmlparser.XML_PARSE_NOENT |
    xmlparser.XML_PARSE_NOCDATA |
    xmlparser.XML_PARSE_NONET |
    xmlparser.XML_PARSE_COMPACT |
    xmlparser.XML_PARSE_BIG_LINES
    )
cdef class XMLParser(_FeedParser):
    u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True)

    The XML parser.

    Parsers can be supplied as additional argument to various parse
    functions of the lxml API.  A default parser is always available
    and can be replaced by a call to the global function
    'set_default_parser'.  New parsers can be created at any time
    without a major run-time overhead.

    The keyword arguments in the constructor are mainly based on the
    libxml2 parser configuration.  A DTD will also be loaded if DTD
    validation or attribute default values are requested (unless you
    additionally provide an XMLSchema from which the default
    attributes can be read).

    Available boolean keyword arguments:

    - attribute_defaults - inject default attributes from DTD or XMLSchema
    - dtd_validation     - validate against a DTD referenced by the document
    - load_dtd           - use DTD for parsing
    - no_network         - prevent network access for related files (default: True)
    - ns_clean           - clean up redundant namespace declarations
    - recover            - try hard to parse through broken XML
    - remove_blank_text  - discard blank text nodes that appear ignorable
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
    - resolve_entities   - replace entities by their text value (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads.  While this is
    not harmful, it is more efficient to use separate parsers.  This does not
    apply to the default parser.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, XMLSchema schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 collect_ids=True, target=None, compact=True):
        cdef int parse_options
        parse_options = _XML_DEFAULT_PARSE_OPTIONS

        # options that are off by default: switch on when requested
        if load_dtd:
            parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
        if dtd_validation:
            parse_options = parse_options | xmlparser.XML_PARSE_DTDVALID | \
                            xmlparser.XML_PARSE_DTDLOAD
        if attribute_defaults:
            parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR
            if schema is None:
                # no schema to read the defaults from => needs the DTD
                parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
        if ns_clean:
            parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN
        if recover:
            parse_options = parse_options | xmlparser.XML_PARSE_RECOVER
        if remove_blank_text:
            parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS
        if huge_tree:
            parse_options = parse_options | xmlparser.XML_PARSE_HUGE

        # options that are part of the defaults: clear the bit explicitly
        # (rather than toggling it) when the feature is switched off
        if not no_network:
            parse_options = parse_options & ~xmlparser.XML_PARSE_NONET
        if not compact:
            parse_options = parse_options & ~xmlparser.XML_PARSE_COMPACT
        if not resolve_entities:
            parse_options = parse_options & ~xmlparser.XML_PARSE_NOENT
        if not strip_cdata:
            parse_options = parse_options & ~xmlparser.XML_PARSE_NOCDATA

        _BaseParser.__init__(self, parse_options, 0, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
cdef class XMLPullParser(XMLParser):
    """XMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    XML parser that collects parse events in an iterator.

    The collected events are the same as for iterparse(), but the
    parser itself is non-blocking in the sense that it receives
    data chunks incrementally through its .feed() method, instead
    of reading them directly from a file(-like) object all by itself.

    By default, it collects Element end events.  To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        XMLParser.__init__(self, **kwargs)
        if events is None:
            events = ('end',)
        self._setBaseURL(base_url)
        self._collectEvents(events, tag)

    def read_events(self):
        # the events iterator lives on the push parser context
        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
cdef class ETCompatXMLParser(XMLParser):
    u"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
                 dtd_validation=False, load_dtd=False, no_network=True, \
                 ns_clean=False, recover=False, schema=None, \
                 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
                 remove_comments=True, remove_pis=True, strip_cdata=True, \
                 target=None, compact=True)

    An XML parser with an ElementTree compatible default setup.

    See the XMLParser class for details.

    This parser has ``remove_comments`` and ``remove_pis`` enabled by default
    and thus ignores comments and processing instructions.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=True, remove_pis=True, strip_cdata=True,
                 target=None, compact=True):
        # forward every accepted keyword; only the defaults differ
        # from those of XMLParser
        XMLParser.__init__(self,
                           attribute_defaults=attribute_defaults,
                           dtd_validation=dtd_validation,
                           load_dtd=load_dtd,
                           no_network=no_network,
                           ns_clean=ns_clean,
                           recover=recover,
                           remove_blank_text=remove_blank_text,
                           huge_tree=huge_tree,
                           compact=compact,
                           resolve_entities=resolve_entities,
                           remove_comments=remove_comments,
                           remove_pis=remove_pis,
                           strip_cdata=strip_cdata,
                           target=target,
                           encoding=encoding,
                           schema=schema)
# ET 1.2 compatible name
XMLTreeBuilder = ETCompatXMLParser


# Module-level XMLParser created with default arguments.
# set_default_parser() falls back to it when called without a parser.
cdef XMLParser __DEFAULT_XML_PARSER
__DEFAULT_XML_PARSER = XMLParser()

# install it as the default parser of the global parser context
__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
def set_default_parser(_BaseParser parser=None):
    u"""set_default_parser(parser=None)

    Set a default parser for the current thread.  This parser is used
    globally whenever no parser is supplied to the various parse functions of
    the lxml API.  If this function is called without a parser (or if it is
    None), the default parser is reset to the original configuration.

    Note that the pre-installed default parser is not thread-safe.  Avoid the
    default parser in multi-threaded environments.  You can create a separate
    parser for each thread explicitly or use a parser pool.
    """
    if parser is None:
        # reset to the pre-installed default parser
        parser = __DEFAULT_XML_PARSER
    __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser)
def get_default_parser():
    u"""get_default_parser()

    Return the default parser as reported by the global parser context.
    """
    default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return default_parser
############################################################
## HTML parser
############################################################
# Option bits that every HTMLParser starts from; HTMLParser.__init__()
# adds to or removes from this set according to its keyword arguments.
cdef int _HTML_DEFAULT_PARSE_OPTIONS
_HTML_DEFAULT_PARSE_OPTIONS = (
    htmlparser.HTML_PARSE_RECOVER |
    htmlparser.HTML_PARSE_NONET |
    htmlparser.HTML_PARSE_COMPACT
    )
cdef class HTMLParser(_FeedParser):
    u"""HTMLParser(self, encoding=None, remove_blank_text=False, \
                   remove_comments=False, remove_pis=False, strip_cdata=True, \
                   no_network=True, target=None, schema: XMLSchema =None, \
                   recover=True, compact=True, default_doctype=True, \
                   collect_ids=True, huge_tree=False)

    The HTML parser.

    This parser allows reading HTML into a normal XML tree.  By
    default, it can read broken (non well-formed) HTML, depending on
    the capabilities of libxml2.  Use the 'recover' option to switch
    this off.

    Available boolean keyword arguments:

    - recover            - try hard to parse through broken HTML (default: True)
    - no_network         - prevent network access for related files (default: True)
    - remove_blank_text  - discard empty text nodes that are ignorable (i.e. not actual text content)
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - default_doctype    - add a default doctype even if it is not found in the HTML (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads for performance
    reasons.
    """
    def __init__(self, *, encoding=None, remove_blank_text=False,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 no_network=True, target=None, XMLSchema schema=None,
                 recover=True, compact=True, default_doctype=True,
                 collect_ids=True, huge_tree=False):
        cdef int parse_options
        parse_options = _HTML_DEFAULT_PARSE_OPTIONS
        if remove_blank_text:
            parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS

        # options that are part of the defaults: clear the bit explicitly
        # (rather than toggling it) when the feature is switched off
        if not recover:
            parse_options = parse_options & ~htmlparser.HTML_PARSE_RECOVER
        if not no_network:
            parse_options = parse_options & ~htmlparser.HTML_PARSE_NONET
        if not compact:
            parse_options = parse_options & ~htmlparser.HTML_PARSE_COMPACT

        # NODEFDTD is not part of the defaults, so it must be set, not toggled
        if not default_doctype:
            parse_options = parse_options | htmlparser.HTML_PARSE_NODEFDTD
        if huge_tree:
            parse_options = parse_options | xmlparser.XML_PARSE_HUGE

        _BaseParser.__init__(self, parse_options, 1, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
# Module-level HTMLParser created with default arguments.
cdef HTMLParser __DEFAULT_HTML_PARSER
__DEFAULT_HTML_PARSER = HTMLParser()
cdef class HTMLPullParser(HTMLParser):
    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    HTML parser that collects parse events in an iterator.

    The collected events are the same as for iterparse(), but the
    parser itself is non-blocking in the sense that it receives
    data chunks incrementally through its .feed() method, instead
    of reading them directly from a file(-like) object all by itself.

    By default, it collects Element end events.  To change that,
    pass any subset of the available events into the ``events``
    argument: ``'start'``, ``'end'``, ``'start-ns'``,
    ``'end-ns'``, ``'comment'``, ``'pi'``.

    To support loading external dependencies relative to the input
    source, you can pass the ``base_url``.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        HTMLParser.__init__(self, **kwargs)
        if events is None:
            events = ('end',)
        self._setBaseURL(base_url)
        self._collectEvents(events, tag)

    def read_events(self):
        # the events iterator lives on the push parser context
        return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
1727 ############################################################
1728 ## helper functions for document creation
1729 ############################################################
cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    # Parse an in-memory document from a unicode or bytes string.
    # Uses the default parser if `parser` is None.  Input that is too
    # large for an int length is parsed through a file-like wrapper instead.
    cdef char* c_filename
    cdef char* c_text
    cdef Py_ssize_t c_len
    cdef bint is_pep393_string
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    if not filename:
        c_filename = NULL
    else:
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)
    if isinstance(text, unicode):
        is_pep393_string = (
            python.PEP393_ENABLED and python.PyUnicode_IS_READY(text))
        # size of the string buffer in bytes
        if is_pep393_string:
            c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
        else:
            c_len = python.PyUnicode_GET_DATA_SIZE(text)
        if c_len > limits.INT_MAX:
            return (<_BaseParser>parser)._parseDocFromFilelike(
                StringIO(text), filename, None)
        if _UNICODE_ENCODING is NULL and not is_pep393_string:
            # no encoding name for the internal buffer => go through UTF-8
            text = (<unicode>text).encode('utf8')
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, "UTF-8")
        return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
    else:
        c_len = python.PyBytes_GET_SIZE(text)
        if c_len > limits.INT_MAX:
            return (<_BaseParser>parser)._parseDocFromFilelike(
                BytesIO(text), filename, None)
        c_text = _cstr(text)
        return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)
cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    # Parse the file named by the byte string `filename8`.
    # Uses the default parser if `parser` is None.
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8))
cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    # Parse from the file-like object `source` without an encoding override.
    # Uses the default parser if `parser` is None.
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return (<_BaseParser>parser)._parseDocFromFilelike(source, filename, None)
cdef xmlDoc* _newXMLDoc() except NULL:
    # Create an empty XML document that is registered with the global
    # parser dict.  Raises MemoryError if libxml2 cannot allocate it.
    cdef xmlDoc* result
    result = tree.xmlNewDoc(NULL)
    if result is NULL:
        raise MemoryError()
    if result.encoding is NULL:
        result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result
cdef xmlDoc* _newHTMLDoc() except NULL:
    # Create an empty HTML document that is registered with the global
    # parser dict.  Raises MemoryError if libxml2 cannot allocate it.
    cdef xmlDoc* result
    result = tree.htmlNewDoc(NULL, NULL)
    if result is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result
cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    # Copy `c_doc`, including its content if `recursive` is non-zero.
    # Raises MemoryError if the copy fails.
    cdef xmlDoc* result
    if recursive:
        # a deep copy can take a while => release the GIL
        with nogil:
            result = tree.xmlCopyDoc(c_doc, recursive)
    else:
        result = tree.xmlCopyDoc(c_doc, 0)
    if result is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    return result
cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    u"Recursively copy the document and make c_new_root the new root node."
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0) # non recursive
    if result is NULL:
        # must not pass a failed copy on to initDocDict()
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive
    if c_node is NULL:
        # do not leak the document copy when the node copy fails
        tree.xmlFreeDoc(result)
        raise MemoryError()
    tree.xmlDocSetRootElement(result, c_node)
    _copyTail(c_new_root.next, c_node)
    return result
cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    u"Recursively copy the element into the document. c_doc is not modified."
    cdef xmlNode* c_root
    c_root = tree.xmlDocCopyNode(c_node, c_doc, 1) # recursive
    if c_root is NULL:
        raise MemoryError()
    # also copy what follows the source node as tail of the copy
    _copyTail(c_node.next, c_root)
    return c_root
1831 ############################################################
1832 ## API level helper functions for _Document creation
1833 ############################################################
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    # Parse `source` into a _Document.  `source` may be a filename/URL
    # string, an in-memory buffer object (getvalue()/tell()) or any object
    # with a read() method.  Raises TypeError for anything else.
    cdef _Document doc
    if _isString(source):
        # parse the file directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        # fix base URL if requested
        if base_url is not None:
            base_url = _encodeFilenameUTF8(base_url)
            if doc._c_doc.URL is not NULL:
                tree.xmlFree(<char*>doc._c_doc.URL)
            doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
        return doc

    # an explicit base URL takes precedence over the name of the file object
    if base_url is not None:
        url = base_url
    else:
        url = _getFilenameForFile(source)

    if hasattr(source, u'getvalue') and hasattr(source, u'tell'):
        # StringIO - reading from start?
        if source.tell() == 0:
            return _parseMemoryDocument(source.getvalue(), url, parser)

    # Support for file-like objects (urlgrabber.urlopen, ...)
    if hasattr(source, u'read'):
        return _parseFilelikeDocument(source, url, parser)

    raise TypeError, f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'"
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    # Parse the file at `url` and wrap the resulting C document.
    cdef xmlDoc* c_parsed = _parseDocFromFile(url, parser)
    return _documentFactory(c_parsed, parser)
cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    # Parse a unicode or bytes string into a _Document.
    # Raises ValueError for other input types and for unicode strings
    # that carry an XML encoding declaration.
    if isinstance(text, unicode):
        if _hasEncodingDeclaration(text):
            raise ValueError(
                u"Unicode strings with encoding declaration are not supported. "
                u"Please use bytes input or XML fragments without declaration.")
    elif not isinstance(text, bytes):
        raise ValueError, u"can only parse strings"
    c_doc = _parseDoc(text, url, parser)
    return _documentFactory(c_doc, parser)
cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    # Parse from the file-like object `source` and wrap the resulting C document.
    cdef xmlDoc* c_parsed = _parseDocFromFilelike(source, url, parser)
    return _documentFactory(c_parsed, parser)