1 # Parsers for XML and HTML
6 cdef class _ParserContext(_ResolverContext)
7 cdef class _SaxParserContext(_ParserContext)
8 cdef class _TargetParserContext(_SaxParserContext)
9 cdef class _ParserSchemaValidationContext
11 cdef class XMLSchema(_Validator)
13 class ParseError(LxmlSyntaxError):
14 u"""Syntax error while parsing an XML document.
16 For compatibility with ElementTree 1.3 and later.
18 def __init__(self, message, code, line, column):
19 if python.PY_VERSION_HEX >= 0x02050000:
20 # Python >= 2.5 uses new style class exceptions
21 super(_ParseError, self).__init__(message)
23 _LxmlSyntaxError.__init__(self, message)
24 self.position = (line, column)
27 cdef object _LxmlSyntaxError = LxmlSyntaxError
28 cdef object _ParseError = ParseError
30 class XMLSyntaxError(ParseError):
31 u"""Syntax error while parsing an XML document.
35 class ParserError(LxmlError):
36 u"""Internal lxml parser error.
40 cdef class _ParserDictionaryContext:
41 # Global parser context to share the string dictionary.
43 # This class is a delegate singleton!
45 # It creates _ParserDictionaryContext objects for each thread to keep thread state,
46 # but those must never be used directly. Always stick to using the static
47 # __GLOBAL_PARSER_CONTEXT as defined below the class.
50 cdef tree.xmlDict* _c_dict
51 cdef _BaseParser _default_parser
52 cdef list _implied_parser_contexts
56 self._implied_parser_contexts = []
def __dealloc__(self):
    # Drop this context's reference to its libxml2 string dictionary.
    # A context that never had a dictionary assigned has nothing to free.
    cdef tree.xmlDict* c_dict = self._c_dict
    if c_dict is NULL:
        return
    xmlparser.xmlDictFree(c_dict)
cdef void initMainParserContext(self):
    u"""Put the global context into the thread dictionary of the main
    thread.  To be called once and only in the main thread."""
    # PyThreadState_GetDict() may return NULL; in that case there is
    # nowhere to register the context and we silently skip it.
    cdef python.PyObject* thread_dict = python.PyThreadState_GetDict()
    if thread_dict is not NULL:
        # Same key that _findThreadParserContext() looks up per thread.
        (<dict>thread_dict)[u"_ParserDictionaryContext"] = self
71 cdef _ParserDictionaryContext _findThreadParserContext(self):
72 u"Find (or create) the _ParserDictionaryContext object for the current thread"
73 cdef python.PyObject* thread_dict
74 cdef python.PyObject* result
75 cdef _ParserDictionaryContext context
76 thread_dict = python.PyThreadState_GetDict()
77 if thread_dict is NULL:
80 result = python.PyDict_GetItem(d, u"_ParserDictionaryContext")
81 if result is not NULL:
83 context = _ParserDictionaryContext()
84 d[u"_ParserDictionaryContext"] = context
cdef void setDefaultParser(self, _BaseParser parser):
    u"Install 'parser' as the default parser of the calling thread."
    # The default is stored on the per-thread context object, not
    # necessarily on 'self' (the global delegate).
    cdef _ParserDictionaryContext thread_context = \
        self._findThreadParserContext()
    thread_context._default_parser = parser
cdef _BaseParser getDefaultParser(self):
    u"Return the calling thread's default parser, creating it on demand."
    cdef _ParserDictionaryContext thread_context
    thread_context = self._findThreadParserContext()
    if thread_context._default_parser is not None:
        # fast path: this thread already has its parser
        return thread_context._default_parser

    # Lazily create the parser stored on this (delegate) context ...
    if self._default_parser is None:
        self._default_parser = __DEFAULT_XML_PARSER._copy()
    # ... and hand any other thread context its own copy of it.
    if thread_context is not self:
        thread_context._default_parser = self._default_parser._copy()
    return thread_context._default_parser
104 cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
105 u"Return the thread-local dict or create a new one if necessary."
106 cdef _ParserDictionaryContext context
107 context = self._findThreadParserContext()
108 if context._c_dict is NULL:
109 # thread dict not yet set up => use default or create a new one
110 if default is not NULL:
111 context._c_dict = default
112 xmlparser.xmlDictReference(default)
114 if self._c_dict is NULL:
115 self._c_dict = xmlparser.xmlDictCreate()
116 if context is not self:
117 context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
118 return context._c_dict
120 cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref):
121 cdef tree.xmlDict* c_dict
122 cdef tree.xmlDict* c_thread_dict
123 c_dict = c_dict_ref[0]
124 c_thread_dict = self._getThreadDict(c_dict)
125 if c_dict is c_thread_dict:
127 if c_dict is not NULL:
128 xmlparser.xmlDictFree(c_dict)
129 c_dict_ref[0] = c_thread_dict
130 xmlparser.xmlDictReference(c_thread_dict)
cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt):
    u"Make the parser context use the current thread's string dictionary."
    # The address of the struct field is passed so that the reference
    # can be swapped in place by initThreadDictRef().
    self.initThreadDictRef(&pctxt.dict)
cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt):
    u"Make the XPath context use the current thread's string dictionary."
    # The address of the struct field is passed so that the reference
    # can be swapped in place by initThreadDictRef().
    self.initThreadDictRef(&pctxt.dict)
cdef void initDocDict(self, xmlDoc* result):
    u"""Align the document's string dictionary with the thread dictionary.

    If the thread has no dictionary yet, the document's one is adopted
    as the thread dictionary.
    """
    # XXX: if the document already carries a different dictionary,
    # initThreadDictRef() frees it.  That is only expected for new
    # documents with empty dicts; otherwise we would free data that is
    # still in use => segfault.
    self.initThreadDictRef(&result.dict)
cdef _ParserContext findImpliedContext(self):
    u"""Return the innermost implied parser context of the current
    thread, or None if no implied context is active.

    Needed when resolver functions are called with an xmlParserCtxt
    that libxml2 created internally (i.e. one without a _ParserContext),
    which happens when parsing schema and xinclude external references.
    """
    cdef _ParserDictionaryContext thread_context
    cdef list context_stack
    thread_context = self._findThreadParserContext()
    context_stack = thread_context._implied_parser_contexts
    if not python.PyList_GET_SIZE(context_stack):
        # nothing pushed for this thread
        return None
    # most recently pushed entry sits at the end of the list
    return context_stack[-1]
163 cdef void pushImpliedContextFromParser(self, _BaseParser parser):
164 u"Push a new implied context object taken from the parser."
165 if parser is not None:
166 self.pushImpliedContext(parser._getParserContext())
168 self.pushImpliedContext(None)
cdef void pushImpliedContext(self, _ParserContext parser_context):
    u"Append 'parser_context' to the current thread's implied context stack."
    # 'parser_context' may be None (see pushImpliedContextFromParser);
    # it is stored as-is so that push/pop calls stay balanced.
    self._findThreadParserContext()._implied_parser_contexts.append(
        parser_context)
cdef void popImpliedContext(self):
    u"Remove the most recently pushed implied context of the current thread."
    # The popped object is intentionally discarded.
    self._findThreadParserContext()._implied_parser_contexts.pop()
182 cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT
183 __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
184 __GLOBAL_PARSER_CONTEXT.initMainParserContext()
186 cdef int _checkThreadDict(tree.xmlDict* c_dict):
187 u"""Check that c_dict is either the local thread dictionary or the global
190 #if __GLOBAL_PARSER_CONTEXT._c_dict is c_dict:
191 # return 1 # main thread
192 if __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL) is c_dict:
193 return 1 # local thread dict
196 ############################################################
197 ## support for Python unicode I/O
198 ############################################################
200 # name of Python unicode encoding as known to libxml2
201 cdef char* _UNICODE_ENCODING = NULL
203 cdef int _setupPythonUnicode() except -1:
204 u"""Sets _UNICODE_ENCODING to the internal encoding name of Python unicode
205 strings if libxml2 supports reading native Python unicode. This depends
206 on iconv and the local Python installation, so we simply check if we find
207 a matching encoding handler.
209 cdef tree.xmlCharEncodingHandler* enchandler
213 utext = python.PyUnicode_DecodeUTF8("<test/>", 7, NULL)
214 l = python.PyUnicode_GET_DATA_SIZE(utext)
215 buffer = python.PyUnicode_AS_DATA(utext)
216 enc = _findEncodingName(buffer, l)
218 # apparently, libxml2 can't detect UTF-16 on some systems
220 buffer[0] == c'<' and buffer[1] == c'\0' and \
221 buffer[2] == c't' and buffer[3] == c'\0':
224 buffer[0] == c'\0' and buffer[1] == c'<' and \
225 buffer[2] == c'\0' and buffer[3] == c't':
228 # not my fault, it's YOUR broken system :)
230 enchandler = tree.xmlFindCharEncodingHandler(enc)
231 if enchandler is not NULL:
232 global _UNICODE_ENCODING
233 tree.xmlCharEncCloseFunc(enchandler)
234 _UNICODE_ENCODING = enc
237 cdef char* _findEncodingName(char* buffer, int size):
238 u"Work around bug in libxml2: find iconv name of encoding on our own."
239 cdef tree.xmlCharEncoding enc
240 enc = tree.xmlDetectCharEncoding(buffer, size)
241 if enc == tree.XML_CHAR_ENCODING_UTF16LE:
243 elif enc == tree.XML_CHAR_ENCODING_UTF16BE:
245 elif enc == tree.XML_CHAR_ENCODING_UCS4LE:
247 elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
249 elif enc == tree.XML_CHAR_ENCODING_NONE:
252 # returns a constant char*, no need to free it
253 return tree.xmlGetCharEncodingName(enc)
255 _setupPythonUnicode()
257 ############################################################
258 ## support for file-like objects
259 ############################################################
261 cdef class _FileReaderContext:
262 cdef object _filelike
263 cdef object _encoding
266 cdef _ExceptionContext _exc_context
267 cdef Py_ssize_t _bytes_read
269 cdef bint _close_file_after_read
271 def __cinit__(self, filelike, exc_context, url, encoding=None, bint close_file=False):
272 self._exc_context = exc_context
273 self._filelike = filelike
274 self._close_file_after_read = close_file
275 self._encoding = encoding
279 url = _encodeFilename(url)
280 self._c_url = _cstr(url)
285 cdef _close_file(self):
286 if self._filelike is None or not self._close_file_after_read:
289 close = self._filelike.close
290 except AttributeError:
293 self._filelike = None
294 if close is not None:
297 cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self):
298 cdef cstd.FILE* c_stream
299 cdef xmlparser.xmlParserInputBuffer* c_buffer
300 c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
301 c_stream = python.PyFile_AsFile(self._filelike)
303 c_buffer.readcallback = _readFilelikeParser
304 c_buffer.context = <python.PyObject*>self
306 c_buffer.readcallback = _readFileParser
307 c_buffer.context = c_stream
cdef xmlparser.xmlParserInput* _createParserInput(
        self, xmlparser.xmlParserCtxt* ctxt):
    # Wrap the file-like object in a libxml2 input buffer and turn that
    # into an input stream for 'ctxt'.  The trailing 0 is the encoding
    # argument (presumably XML_CHAR_ENCODING_NONE - confirm against the
    # libxml2 headers).
    return xmlparser.xmlNewIOInputStream(
        ctxt, self._createParserInputBuffer(), 0)
316 cdef tree.xmlDtd* _readDtd(self):
317 cdef xmlparser.xmlParserInputBuffer* c_buffer
318 c_buffer = self._createParserInputBuffer()
320 return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0)
322 cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
324 cdef char* c_encoding
325 cdef cstd.FILE* c_stream
326 cdef xmlparser.xmlInputReadCallback c_read_callback
327 cdef xmlparser.xmlInputCloseCallback c_close_callback
328 cdef void* c_callback_context
330 if self._encoding is None:
333 c_encoding = _cstr(self._encoding)
335 c_stream = python.PyFile_AsFile(self._filelike)
337 c_read_callback = _readFilelikeParser
338 c_callback_context = <python.PyObject*>self
340 c_read_callback = _readFileParser
341 c_callback_context = c_stream
345 result = htmlparser.htmlCtxtReadIO(
346 ctxt, c_read_callback, NULL, c_callback_context,
347 self._c_url, c_encoding, options)
348 if result is not NULL:
349 if _fixHtmlDictNames(ctxt.dict, result) < 0:
350 tree.xmlFreeDoc(result)
353 result = xmlparser.xmlCtxtReadIO(
354 ctxt, c_read_callback, NULL, c_callback_context,
355 self._c_url, c_encoding, options)
359 cdef int copyToBuffer(self, char* c_buffer, int c_requested):
360 cdef int c_byte_count
362 cdef Py_ssize_t byte_count, remaining
363 if self._bytes_read < 0:
367 byte_count = python.PyBytes_GET_SIZE(self._bytes)
368 remaining = byte_count - self._bytes_read
369 while c_requested > remaining:
370 c_start = _cstr(self._bytes) + self._bytes_read
371 cstd.memcpy(c_buffer, c_start, remaining)
372 c_byte_count += remaining
373 c_buffer += remaining
374 c_requested -= remaining
376 self._bytes = self._filelike.read(c_requested)
377 if not python.PyBytes_Check(self._bytes):
378 if python.PyUnicode_Check(self._bytes):
379 if self._encoding is None:
380 self._bytes = python.PyUnicode_AsUTF8String(self._bytes)
382 self._bytes = python.PyUnicode_AsEncodedString(
383 self._bytes, _cstr(self._encoding), NULL)
387 u"reading from file-like objects must return byte strings or unicode strings"
389 remaining = python.PyBytes_GET_SIZE(self._bytes)
391 self._bytes_read = -1
397 c_start = _cstr(self._bytes) + self._bytes_read
398 cstd.memcpy(c_buffer, c_start, c_requested)
399 c_byte_count += c_requested
400 self._bytes_read += c_requested
403 self._exc_context._store_raised()
cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with gil:
    # libxml2 read callback for Python file-like objects: 'ctxt' is the
    # _FileReaderContext registered as callback context.  Runs with the
    # GIL held because it calls back into Python code.
    cdef _FileReaderContext reader = <_FileReaderContext>ctxt
    return reader.copyToBuffer(c_buffer, c_size)
cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil:
    # libxml2 read callback for real C streams: 'ctxt' is the FILE*
    # registered as callback context.  Pure C, so no GIL is required.
    cdef cstd.FILE* c_stream = <cstd.FILE*>ctxt
    return cstd.fread(c_buffer, 1, c_size, c_stream)
413 ############################################################
414 ## support for custom document loaders
415 ############################################################
417 cdef xmlparser.xmlParserInput* _local_resolver(char* c_url, char* c_pubid,
418 xmlparser.xmlParserCtxt* c_context) with gil:
419 cdef _ResolverContext context
420 cdef xmlparser.xmlParserInput* c_input
421 cdef _InputDocument doc_ref
422 cdef _FileReaderContext file_context
423 # if there is no _ParserContext associated with the xmlParserCtxt
424 # passed, check to see if the thread state object has an implied
426 if c_context._private is not NULL:
427 context = <_ResolverContext>c_context._private
429 context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()
432 if __DEFAULT_ENTITY_LOADER is NULL:
434 return __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
440 # parsing a related document (DTD etc.) => UTF-8 encoded URL?
441 url = _decodeFilename(c_url)
445 pubid = funicode(c_pubid) # always UTF-8
447 doc_ref = context._resolvers.resolve(url, pubid, context)
449 context._store_raised()
452 if doc_ref is not None:
453 if doc_ref._type == PARSER_DATA_STRING:
454 data = doc_ref._data_bytes
455 c_input = xmlparser.xmlNewInputStream(c_context)
456 if c_input is not NULL:
457 c_input.base = _cstr(data)
458 c_input.length = python.PyBytes_GET_SIZE(data)
459 c_input.cur = c_input.base
460 c_input.end = &c_input.base[c_input.length]
461 elif doc_ref._type == PARSER_DATA_FILENAME:
463 c_input = xmlparser.xmlNewInputFromFile(
464 c_context, _cstr(doc_ref._filename))
465 elif doc_ref._type == PARSER_DATA_FILE:
466 file_context = _FileReaderContext(doc_ref._file, context, url,
467 None, doc_ref._close_file)
468 c_input = file_context._createParserInput(c_context)
475 context._storage.add(data)
476 if c_input is not NULL:
479 if __DEFAULT_ENTITY_LOADER is NULL:
481 return __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
483 cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
484 __DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
486 xmlparser.xmlSetExternalEntityLoader(_local_resolver)
488 ############################################################
490 ############################################################
492 cdef class _ParserContext(_ResolverContext):
493 cdef _ErrorLog _error_log
494 cdef _ParserSchemaValidationContext _validator
495 cdef xmlparser.xmlParserCtxt* _c_ctxt
496 cdef python.PyThread_type_lock _lock
499 if not config.ENABLE_THREADING:
502 self._lock = python.PyThread_allocate_lock()
503 self._error_log = _ErrorLog()
def __dealloc__(self):
    # Tear-down order: validator link, thread lock, C parser context.
    cdef _ParserSchemaValidationContext validator = self._validator
    if validator is not None:
        # detach the validator before the parser context goes away
        validator.disconnect()
    if self._lock is not NULL:
        # a lock is only allocated when threading support is enabled
        python.PyThread_free_lock(self._lock)
    if self._c_ctxt is not NULL:
        xmlparser.xmlFreeParserCtxt(self._c_ctxt)
cdef _ParserContext _copy(self):
    u"""Create a new context of the same class as this one.

    The copy receives a copy of the validator (if one is set) and a copy
    of the resolver registry.  No C parser context is attached to it
    (NULL is passed to _initParserContext).
    """
    cdef _ParserContext context
    context = self.__class__()
    # A context without schema validation has no validator; calling
    # .copy() on None would raise AttributeError.
    if self._validator is not None:
        context._validator = self._validator.copy()
    _initParserContext(context, self._resolvers._copy(), NULL)
    return context
cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
    # Link both directions: the C context keeps a back pointer to this
    # object in its '_private' slot (read again in _local_resolver), and
    # this object remembers the C context it drives.
    c_ctxt._private = <void*>self
    self._c_ctxt = c_ctxt
cdef void _resetParserContext(self):
    # Bring the C parser context back into a reusable state, if any.
    cdef xmlparser.xmlParserCtxt* c_ctxt = self._c_ctxt
    if c_ctxt is NULL:
        return
    if c_ctxt.html:
        htmlparser.htmlCtxtReset(c_ctxt)
        c_ctxt.disableSAX = 0 # work around bug in libxml2
    elif c_ctxt.spaceTab is not NULL or \
            _LIBXML_VERSION_INT >= 20629: # work around bug in libxml2
        xmlparser.xmlClearParserCtxt(c_ctxt)
533 cdef int prepare(self) except -1:
535 if config.ENABLE_THREADING and self._lock is not NULL:
537 result = python.PyThread_acquire_lock(
538 self._lock, python.WAIT_LOCK)
540 raise ParserError, u"parser locking failed"
541 self._error_log.connect()
542 if self._validator is not None:
543 self._validator.connect(self._c_ctxt)
546 cdef int cleanup(self) except -1:
547 if self._validator is not None:
548 self._validator.disconnect()
549 self._resetParserContext()
551 self._error_log.disconnect()
552 if config.ENABLE_THREADING and self._lock is not NULL:
553 python.PyThread_release_lock(self._lock)
556 cdef object _handleParseResult(self, _BaseParser parser,
557 xmlDoc* result, filename):
560 recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
561 c_doc = _handleParseResult(self, self._c_ctxt, result,
563 return _documentFactory(c_doc, parser)
565 cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
566 xmlDoc* result, filename) except NULL:
568 recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
569 return _handleParseResult(self, self._c_ctxt, result,
cdef _initParserContext(_ParserContext context,
                        _ResolverRegistry resolvers,
                        xmlparser.xmlParserCtxt* c_ctxt):
    # Set up the resolver part of the context, then attach the C parser
    # context if the caller supplied one (NULL means "none yet").
    _initResolverContext(context, resolvers)
    if c_ctxt is NULL:
        return
    context._initParserContext(c_ctxt)
579 cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
580 _ErrorLog error_log) except 0:
581 if filename is not None and \
582 ctxt.lastError.domain == xmlerror.XML_FROM_IO:
583 if ctxt.lastError.message is not NULL:
584 message = u"Error reading file '%s': %s" % (
585 filename, (ctxt.lastError.message).strip())
587 message = u"Error reading '%s'" % filename
588 raise IOError, message
590 raise error_log._buildParseException(
591 XMLSyntaxError, u"Document is not well formed")
592 elif ctxt.lastError.message is not NULL:
593 message = (ctxt.lastError.message).strip()
594 code = ctxt.lastError.code
595 line = ctxt.lastError.line
596 column = ctxt.lastError.int2
597 if ctxt.lastError.line > 0:
598 message = u"line %d: %s" % (line, message)
599 raise XMLSyntaxError(message, code, line, column)
601 raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0)
603 cdef xmlDoc* _handleParseResult(_ParserContext context,
604 xmlparser.xmlParserCtxt* c_ctxt,
605 xmlDoc* result, filename,
606 bint recover) except NULL:
607 cdef bint well_formed
608 if result is not NULL:
609 __GLOBAL_PARSER_CONTEXT.initDocDict(result)
611 if c_ctxt.myDoc is not NULL:
612 if c_ctxt.myDoc is not result:
613 __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
614 tree.xmlFreeDoc(c_ctxt.myDoc)
617 if result is not NULL:
618 if context._validator is not None and \
619 not context._validator.isvalid():
620 well_formed = 0 # actually not 'valid', but anyway ...
621 elif recover or (c_ctxt.wellFormed and \
622 c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
624 elif not c_ctxt.replaceEntities and not c_ctxt.validate \
625 and context is not None:
626 # in this mode, we ignore errors about undefined entities
627 for error in context._error_log.filter_from_errors():
628 if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
629 error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
638 # free broken document
639 tree.xmlFreeDoc(result)
642 if context is not None and context._has_raised():
643 if result is not NULL:
644 tree.xmlFreeDoc(result)
646 context._raise_if_stored()
649 if context is not None:
650 _raiseParseError(c_ctxt, filename, context._error_log)
652 _raiseParseError(c_ctxt, filename, None)
654 if result.URL is NULL and filename is not None:
655 result.URL = tree.xmlStrdup(_cstr(filename))
656 if result.encoding is NULL:
657 result.encoding = tree.xmlStrdup("UTF-8")
659 if context._validator is not None and \
660 context._validator._add_default_attributes:
661 # we currently need to do this here as libxml2 does not
662 # support inserting default attributes during parse-time
664 context._validator.inject_default_attributes(result)
668 cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
672 c_node = c_doc.children
673 tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 0)
674 if c_node.type == tree.XML_ELEMENT_NODE:
675 if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
677 tree.END_FOR_EACH_ELEMENT_FROM(c_node)
680 cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
681 xmlNode* c_node) nogil:
684 c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
687 if c_name is not c_node.name:
688 tree.xmlFree(c_node.name)
690 c_attr = <xmlNode*>c_node.properties
691 while c_attr is not NULL:
692 c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1)
695 if c_name is not c_attr.name:
696 tree.xmlFree(c_attr.name)
701 cdef class _BaseParser:
702 cdef ElementClassLookup _class_lookup
703 cdef _ResolverRegistry _resolvers
704 cdef _ParserContext _parser_context
705 cdef _ParserContext _push_parser_context
706 cdef int _parse_options
708 cdef bint _remove_comments
709 cdef bint _remove_pis
710 cdef bint _strip_cdata
711 cdef XMLSchema _schema
712 cdef object _filename
713 cdef readonly object target
714 cdef object _default_encoding
716 def __init__(self, int parse_options, bint for_html, XMLSchema schema,
717 remove_comments, remove_pis, strip_cdata, target,
719 cdef tree.xmlCharEncodingHandler* enchandler
721 if not isinstance(self, HTMLParser) and \
722 not isinstance(self, XMLParser) and \
723 not isinstance(self, iterparse):
724 raise TypeError, u"This class cannot be instantiated"
726 self._parse_options = parse_options
727 self._filename = filename
729 self._for_html = for_html
730 self._remove_comments = remove_comments
731 self._remove_pis = remove_pis
732 self._strip_cdata = strip_cdata
733 self._schema = schema
735 self._resolvers = _ResolverRegistry()
738 self._default_encoding = None
740 encoding = _utf8(encoding)
741 enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding))
742 if enchandler is NULL:
743 raise LookupError, u"unknown encoding: '%s'" % encoding
744 tree.xmlCharEncCloseFunc(enchandler)
745 self._default_encoding = encoding
747 cdef _ParserContext _getParserContext(self):
748 cdef xmlparser.xmlParserCtxt* pctxt
749 if self._parser_context is None:
750 self._parser_context = self._createContext(self.target)
751 if self._schema is not None:
752 self._parser_context._validator = \
753 self._schema._newSaxValidator(
754 self._parse_options & xmlparser.XML_PARSE_DTDATTR)
755 pctxt = self._newParserCtxt()
757 python.PyErr_NoMemory()
758 _initParserContext(self._parser_context, self._resolvers, pctxt)
759 if self._remove_comments:
760 pctxt.sax.comment = NULL
762 pctxt.sax.processingInstruction = NULL
763 if self._strip_cdata:
764 # hard switch-off for CDATA nodes => makes them plain text
765 pctxt.sax.cdataBlock = NULL
766 return self._parser_context
768 cdef _ParserContext _getPushParserContext(self):
769 cdef xmlparser.xmlParserCtxt* pctxt
770 if self._push_parser_context is None:
771 self._push_parser_context = self._createContext(self.target)
772 if self._schema is not None:
773 self._push_parser_context._validator = \
774 self._schema._newSaxValidator(
775 self._parse_options & xmlparser.XML_PARSE_DTDATTR)
776 pctxt = self._newPushParserCtxt()
778 python.PyErr_NoMemory()
780 self._push_parser_context, self._resolvers, pctxt)
781 if self._remove_comments:
782 pctxt.sax.comment = NULL
784 pctxt.sax.processingInstruction = NULL
785 if self._strip_cdata:
786 # hard switch-off for CDATA nodes => makes them plain text
787 pctxt.sax.cdataBlock = NULL
788 return self._push_parser_context
790 cdef _ParserContext _createContext(self, target):
791 cdef _TargetParserContext context
793 return _ParserContext()
794 context = _TargetParserContext()
795 context._setTarget(target)
798 cdef xmlparser.xmlParserCtxt* _newParserCtxt(self):
800 return htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
802 return xmlparser.xmlNewParserCtxt()
804 cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self):
805 cdef xmlparser.xmlParserCtxt* c_ctxt
806 cdef char* c_filename
807 if self._filename is not None:
808 c_filename = _cstr(self._filename)
812 c_ctxt = htmlparser.htmlCreatePushParserCtxt(
813 NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
814 if c_ctxt is not NULL:
815 htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
817 c_ctxt = xmlparser.xmlCreatePushParserCtxt(
818 NULL, NULL, NULL, 0, c_filename)
819 if c_ctxt is not NULL:
820 xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
824 u"""The error log of the last parser run.
827 cdef _ParserContext context
828 context = self._getParserContext()
829 return context._error_log.copy()
832 u"The custom resolver registry of this parser."
834 return self._resolvers
837 u"The version of the underlying XML parser."
839 return u"libxml2 %d.%d.%d" % LIBXML_VERSION
def setElementClassLookup(self, ElementClassLookup lookup = None):
    u":deprecated: use ``parser.set_element_class_lookup(lookup)`` instead."
    # Legacy camelCase alias kept for backwards compatibility; simply
    # forwards to the snake_case method.
    self.set_element_class_lookup(lookup)
845 def set_element_class_lookup(self, ElementClassLookup lookup = None):
846 u"""set_element_class_lookup(self, lookup = None)
848 Set a lookup scheme for element classes generated from this parser.
850 Reset it by passing None or nothing.
852 self._class_lookup = lookup
854 cdef _BaseParser _copy(self):
855 u"Create a new parser with the same configuration."
856 cdef _BaseParser parser
857 parser = self.__class__()
858 parser._parse_options = self._parse_options
859 parser._for_html = self._for_html
860 parser._remove_comments = self._remove_comments
861 parser._remove_pis = self._remove_pis
862 parser._strip_cdata = self._strip_cdata
863 parser._filename = self._filename
864 parser._resolvers = self._resolvers
865 parser.target = self.target
866 parser._class_lookup = self._class_lookup
867 parser._default_encoding = self._default_encoding
868 parser._schema = self._schema
874 Create a new parser with the same configuration.
878 def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
879 u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
881 Creates a new element associated with this parser.
883 return _makeElement(_tag, NULL, None, self, None, None,
884 attrib, nsmap, _extra)
886 # internal parser methods
888 cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
889 u"""Parse unicode document, share dictionary if possible.
891 cdef _ParserContext context
893 cdef xmlparser.xmlParserCtxt* pctxt
894 cdef Py_ssize_t py_buffer_len
897 py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
898 if py_buffer_len > python.INT_MAX or _UNICODE_ENCODING is NULL:
899 text_utf = python.PyUnicode_AsUTF8String(utext)
900 py_buffer_len = python.PyBytes_GET_SIZE(text_utf)
901 return self._parseDoc(_cstr(text_utf), py_buffer_len, c_filename)
902 buffer_len = py_buffer_len
904 context = self._getParserContext()
907 pctxt = context._c_ctxt
908 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
910 c_text = python.PyUnicode_AS_DATA(utext)
913 result = htmlparser.htmlCtxtReadMemory(
914 pctxt, c_text, buffer_len, c_filename, _UNICODE_ENCODING,
916 if result is not NULL:
917 if _fixHtmlDictNames(pctxt.dict, result) < 0:
918 tree.xmlFreeDoc(result)
921 result = xmlparser.xmlCtxtReadMemory(
922 pctxt, c_text, buffer_len, c_filename, _UNICODE_ENCODING,
925 return context._handleParseResultDoc(self, result, None)
929 cdef xmlDoc* _parseDoc(self, char* c_text, Py_ssize_t c_len,
930 char* c_filename) except NULL:
931 u"""Parse document, share dictionary if possible.
933 cdef _ParserContext context
935 cdef xmlparser.xmlParserCtxt* pctxt
936 cdef char* c_encoding
937 if c_len > python.INT_MAX:
938 raise ParserError, u"string is too long to parse it with libxml2"
940 context = self._getParserContext()
943 pctxt = context._c_ctxt
944 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
946 if self._default_encoding is None:
949 c_encoding = _cstr(self._default_encoding)
953 result = htmlparser.htmlCtxtReadMemory(
954 pctxt, c_text, c_len, c_filename,
955 c_encoding, self._parse_options)
956 if result is not NULL:
957 if _fixHtmlDictNames(pctxt.dict, result) < 0:
958 tree.xmlFreeDoc(result)
961 result = xmlparser.xmlCtxtReadMemory(
962 pctxt, c_text, c_len, c_filename,
963 c_encoding, self._parse_options)
965 return context._handleParseResultDoc(self, result, None)
969 cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
970 cdef _ParserContext context
972 cdef xmlparser.xmlParserCtxt* pctxt
973 cdef int orig_options
974 cdef char* c_encoding
977 context = self._getParserContext()
980 pctxt = context._c_ctxt
981 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
983 if self._default_encoding is None:
986 c_encoding = _cstr(self._default_encoding)
988 orig_options = pctxt.options
991 result = htmlparser.htmlCtxtReadFile(
992 pctxt, c_filename, c_encoding, self._parse_options)
993 if result is not NULL:
994 if _fixHtmlDictNames(pctxt.dict, result) < 0:
995 tree.xmlFreeDoc(result)
998 result = xmlparser.xmlCtxtReadFile(
999 pctxt, c_filename, c_encoding, self._parse_options)
1000 pctxt.options = orig_options # work around libxml2 problem
1002 return context._handleParseResultDoc(self, result, c_filename)
1006 cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename) except NULL:
1007 cdef _ParserContext context
1008 cdef _FileReaderContext file_context
1010 cdef xmlparser.xmlParserCtxt* pctxt
1011 cdef char* c_filename
1015 context = self._getParserContext()
1018 pctxt = context._c_ctxt
1019 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1020 file_context = _FileReaderContext(
1021 filelike, context, filename, self._default_encoding)
1022 result = file_context._readDoc(pctxt, self._parse_options)
1024 return context._handleParseResultDoc(
1025 self, result, filename)
1029 ############################################################
1031 ############################################################
# Incremental ("feed") parser interface on top of _BaseParser.  Input is
# pushed chunk-wise through feed(); the push parser context obtained from
# self._getPushParserContext() carries the libxml2 state between calls.
# NOTE(review): several short lines of this class (branch headers such as
# ``else:`` and a few ``cdef`` declarations) are not visible in this listing;
# the comments below describe only the statements that are shown.
1033 cdef class _FeedParser(_BaseParser):
# Set to 1 by feed() when a parser run starts; reset to 0 when a run fails
# in feed() or is finished by the closing method further down.
1034 cdef bint _feed_parser_running
1036 property feed_error_log:
1037 u"""The error log of the last (or current) run of the feed parser.
1039 Note that this is local to the feed parser and thus is
1040 different from what the ``error_log`` property returns.
1043 cdef _ParserContext context
# Return a copy of the error log held by the push parser context.
1044 context = self._getPushParserContext()
1045 return context._error_log.copy()
1047 def feed(self, data):
1048 u"""feed(self, data)
1050 Feeds data to the parser. The argument should be an 8-bit string
1051 buffer containing encoded data, although Unicode is supported as long
1052 as both string types are not mixed.
1054 This is the main entry point to the consumer interface of a
1055 parser. The parser will parse as much of the XML stream as it
1056 can on each call. To finish parsing or to reset the parser,
1057 call the ``close()`` method. Both methods may raise
1058 ParseError if errors occur in the input data. If an error is
1059 raised, there is no longer a need to call ``close()``.
1061 The feed parser interface is independent of the normal parser
1062 usage. You can use the same parser as a feed parser and in
1063 the ``parse()`` function concurrently.
1065 cdef _ParserContext context
1066 cdef xmlparser.xmlParserCtxt* pctxt
1067 cdef Py_ssize_t py_buffer_len
1069 cdef char* c_encoding
# True when XML_PARSE_RECOVER is part of the parser options; it keeps the
# chunk loop going after an error and suppresses the failure check below.
1072 cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
# Byte string input: take the raw buffer and its size; the encoding passed
# to libxml2 is derived from self._default_encoding.
1073 if python.PyBytes_Check(data):
1074 if self._default_encoding is None:
1077 c_encoding = self._default_encoding
1078 c_data = _cstr(data)
1079 py_buffer_len = python.PyBytes_GET_SIZE(data)
# Unicode input: hand libxml2 the internal unicode buffer, labelled with
# _UNICODE_ENCODING.  Refused when that encoding is not available (NULL).
1080 elif python.PyUnicode_Check(data):
1081 if _UNICODE_ENCODING is NULL:
1082 raise ParserError, \
1083 u"Unicode parsing is not supported on this platform"
1084 c_encoding = _UNICODE_ENCODING
1085 c_data = python.PyUnicode_AS_DATA(data)
1086 py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data)
# Presumably the fall-through for input that is neither bytes nor unicode.
1088 raise TypeError, u"Parsing requires string data"
1090 context = self._getPushParserContext()
1091 pctxt = context._c_ctxt
# First feed() call of a run: mark the run as started, attach the shared
# parser dictionary and reset the libxml2 context for push parsing.
1093 if not self._feed_parser_running:
1095 self._feed_parser_running = 1
1096 __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
# NOTE(review): an HTML reset (_htmlCtxtResetPush) and an XML reset
# (xmlCtxtUseOptions + xmlCtxtResetPush) both follow; the condition that
# selects between them is not visible here.
1098 error = _htmlCtxtResetPush(
1099 pctxt, NULL, 0, c_encoding, self._parse_options)
1101 xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
1102 error = xmlparser.xmlCtxtResetPush(
1103 pctxt, NULL, 0, NULL, c_encoding)
1105 #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding
# Feed the buffer in pieces of at most INT_MAX bytes, because the chunk
# length handed to libxml2 is a C int while the Python size is Py_ssize_t.
# The loop stops early on an error unless recovery is enabled.
1107 while py_buffer_len > 0 and (error == 0 or recover):
1109 if py_buffer_len > python.INT_MAX:
1110 buffer_len = python.INT_MAX
1112 buffer_len = <int>py_buffer_len
# Last argument 0: more data may follow (libxml2 "terminate" flag).
1114 error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0)
1116 error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0)
1117 py_buffer_len -= buffer_len
1118 c_data += buffer_len
# When entities are neither replaced nor validated, the error log is scanned
# for entries other than "undeclared entity" warnings/errors.
# NOTE(review): the statements executed inside/after this loop are not
# visible here — confirm how ``error`` is adjusted.
1120 if error and not pctxt.replaceEntities and not pctxt.validate:
1121 # in this mode, we ignore errors about undefined entities
1122 for entry in context._error_log.filter_from_errors():
1123 if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
1124 entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
# Without recovery, a reported error or a non-well-formed document ends the
# run; _handleParseResult() is called without a document (NULL) and is
# presumably what raises the ParseError mentioned in the docstring.
1129 if not recover and (error or not pctxt.wellFormed):
1130 self._feed_parser_running = 0
1132 context._handleParseResult(self, NULL, None)
1139 Terminates feeding data to this parser. This tells the parser to
1140 process any remaining data in the feed buffer, and then returns the
1141 root Element of the tree that was parsed.
1143 This method must be called after passing the last chunk of data into
1144 the ``feed()`` method. It should only be called when using the feed
1145 parser interface, all other usage is undefined.
1147 cdef _ParserContext context
1148 cdef xmlparser.xmlParserCtxt* pctxt
# Closing without a running feed parser is reported as a syntax error with
# code XML_ERR_INTERNAL_ERROR at line 0, column 0.
1151 if not self._feed_parser_running:
1152 raise XMLSyntaxError(u"no element found",
1153 xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0)
1155 context = self._getPushParserContext()
1156 pctxt = context._c_ctxt
# The run ends here regardless of the outcome of the final parse step.
1158 self._feed_parser_running = 0
# Empty chunk with last argument 1: tells libxml2 that the input is complete
# (the feed() calls above pass 0).
1160 htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
1162 xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
# Convert the parsed libxml2 document into the Python-level result.
1164 result = context._handleParseResult(self, pctxt.myDoc, None)
# A _Document result is unwrapped to its root element; what is returned for
# other result types is not visible here.
1168 if isinstance(result, _Document):
1169 return (<_Document>result).getroot()
# Reset the libxml2 parser context ``c_ctxt`` so it can be used as an HTML
# push parser, optionally parsing a first chunk of data right away.
#
#   c_ctxt        -- parser context to reset
#   c_data        -- optional initial data; may be NULL
#   buffer_len    -- length passed to htmlParseChunk() for c_data
#   c_encoding    -- encoding passed on to xmlCtxtResetPush()
#   parse_options -- option bitmask applied through htmlCtxtUseOptions()
#
# Declared ``except -1``.  With initial data, the htmlParseChunk() result is
# returned; the other return paths are not visible in this listing.
1173 cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
1174 char* c_data, int buffer_len,
1175 char* c_encoding, int parse_options) except -1:
1176 cdef xmlparser.xmlParserInput* c_input_stream
1177 # libxml2 crashes if spaceTab is not initialised
# Workaround only for libxml2 builds with version int < 20629 (presumably
# 2.6.29): allocate a 10-entry table and record its capacity in spaceMax.
1178 if _LIBXML_VERSION_INT < 20629 and c_ctxt.spaceTab is NULL:
1179 c_ctxt.spaceTab = <int*>tree.xmlMalloc(10 * sizeof(int))
1180 c_ctxt.spaceMax = 10
1182 # libxml2 lacks an HTML push parser setup function
# Hence the XML reset function is used, with no initial chunk (NULL, 0).
1183 error = xmlparser.xmlCtxtResetPush(c_ctxt, NULL, 0, NULL, c_encoding)
1187 # fix libxml2 setup for HTML
1188 c_ctxt.progressive = 1
1190 htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)
# Parse the initial data, if any, as a non-final chunk (last argument 0).
1192 if c_data is not NULL and buffer_len > 0:
1193 return htmlparser.htmlParseChunk(c_ctxt, c_data, buffer_len, 0)
1196 ############################################################
1198 ############################################################
# libxml2 option bitmask that XMLParser.__init__() starts from before it
# applies its keyword arguments: NOENT, NOCDATA, NONET and COMPACT are set.
1200 cdef int _XML_DEFAULT_PARSE_OPTIONS
1201 _XML_DEFAULT_PARSE_OPTIONS = (
1202 xmlparser.XML_PARSE_NOENT |
1203 xmlparser.XML_PARSE_NOCDATA |
1204 xmlparser.XML_PARSE_NONET |
1205 xmlparser.XML_PARSE_COMPACT
# Public XML parser class.  The constructor translates its keyword arguments
# into a libxml2 option bitmask and passes it to _BaseParser.__init__().
# NOTE(review): the docstring entry for ``compact`` reads "safe memory",
# presumably meant as "save memory"; the signature line of the docstring
# also omits ``huge_tree`` although __init__() accepts it.
# NOTE(review): most ``if <keyword>:`` guards of __init__() are not visible
# in this listing; only the option updates themselves are shown.
1208 cdef class XMLParser(_FeedParser):
1209 u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, XMLSchema schema=None, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, target=None, compact=True)
1213 Parsers can be supplied as additional argument to various parse
1214 functions of the lxml API. A default parser is always available
1215 and can be replaced by a call to the global function
1216 'set_default_parser'. New parsers can be created at any time
1217 without a major run-time overhead.
1219 The keyword arguments in the constructor are mainly based on the
1220 libxml2 parser configuration. A DTD will also be loaded if DTD
1221 validation or attribute default values are requested (unless you
1222 additionally provide an XMLSchema from which the default
1223 attributes can be read).
1225 Available boolean keyword arguments:
1227 - attribute_defaults - inject default attributes from DTD or XMLSchema
1228 - dtd_validation - validate against a DTD referenced by the document
1229 - load_dtd - use DTD for parsing
1230 - no_network - prevent network access for related files (default: True)
1231 - ns_clean - clean up redundant namespace declarations
1232 - recover - try hard to parse through broken XML
1233 - remove_blank_text - discard blank text nodes
1234 - remove_comments - discard comments
1235 - remove_pis - discard processing instructions
1236 - strip_cdata - replace CDATA sections by normal text content (default: True)
1237 - compact - safe memory for short text content (default: True)
1238 - resolve_entities - replace entities by their text value (default: True)
1239 - huge_tree - disable security restrictions and support very deep trees
1240 and very long text content (only affects libxml2 2.7+)
1242 Other keyword arguments:
1244 - encoding - override the document encoding
1245 - target - a parser target object that will receive the parse events
1246 - schema - an XMLSchema to validate against
1248 Note that you should avoid sharing parsers between threads. While this is
1249 not harmful, it is more efficient to use separate parsers. This does not
1250 apply to the default parser.
1252 def __init__(self, *, encoding=None, attribute_defaults=False,
1253 dtd_validation=False, load_dtd=False, no_network=True,
1254 ns_clean=False, recover=False, XMLSchema schema=None,
1255 huge_tree=False, remove_blank_text=False, resolve_entities=True,
1256 remove_comments=False, remove_pis=False, strip_cdata=True,
1257 target=None, compact=True):
1258 cdef int parse_options
1259 parse_options = _XML_DEFAULT_PARSE_OPTIONS
# Options that are not part of the defaults are switched on with ``|``.
# DTD validation and attribute defaults also request DTD loading
# (XML_PARSE_DTDLOAD), matching the class docstring.
1261 parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
1263 parse_options = parse_options | xmlparser.XML_PARSE_DTDVALID | \
1264 xmlparser.XML_PARSE_DTDLOAD
1265 if attribute_defaults:
1266 parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR
1268 parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
1270 parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN
1272 parse_options = parse_options | xmlparser.XML_PARSE_RECOVER
1273 if remove_blank_text:
1274 parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS
1276 parse_options = parse_options | xmlparser.XML_PARSE_HUGE
# NONET, COMPACT, NOENT and NOCDATA are set in _XML_DEFAULT_PARSE_OPTIONS,
# so XOR-ing them here switches them off.
# NOTE(review): ``^`` toggles rather than clears; this relies on the flags
# being set in the defaults (``& ~FLAG`` would not).
1278 parse_options = parse_options ^ xmlparser.XML_PARSE_NONET
1280 parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT
1281 if not resolve_entities:
1282 parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
1284 parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA
# Second argument is 0 here and 1 in HTMLParser — presumably the
# "for HTML" flag of _BaseParser; verify against _BaseParser.__init__.
1286 _BaseParser.__init__(self, parse_options, 0, schema,
1287 remove_comments, remove_pis, strip_cdata,
1288 target, None, encoding)
# XMLParser subclass whose constructor defaults differ from XMLParser's in
# two places: remove_comments=True and remove_pis=True.  It forwards its
# keyword arguments to XMLParser.__init__().
# NOTE(review): ``schema`` is untyped here but declared ``XMLSchema schema``
# in XMLParser.__init__(); some forwarded keyword lines of the call below
# are not visible in this listing.
1290 cdef class ETCompatXMLParser(XMLParser):
1291 u"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
1292 dtd_validation=False, load_dtd=False, no_network=True, \
1293 ns_clean=False, recover=False, schema=None, \
1294 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
1295 remove_comments=True, remove_pis=True, strip_cdata=True, \
1296 target=None, compact=True)
1298 An XML parser with an ElementTree compatible default setup.
1300 See the XMLParser class for details.
1302 This parser has ``remove_comments`` and ``remove_pis`` enabled by default
1303 and thus ignores comments and processing instructions.
1305 def __init__(self, *, encoding=None, attribute_defaults=False,
1306 dtd_validation=False, load_dtd=False, no_network=True,
1307 ns_clean=False, recover=False, schema=None,
1308 huge_tree=False, remove_blank_text=False, resolve_entities=True,
1309 remove_comments=True, remove_pis=True, strip_cdata=True,
1310 target=None, compact=True):
# Delegate to the base class, passing each option through by keyword.
1311 XMLParser.__init__(self,
1312 attribute_defaults=attribute_defaults,
1313 dtd_validation=dtd_validation,
1315 no_network=no_network,
1318 remove_blank_text=remove_blank_text,
1319 huge_tree=huge_tree,
1321 resolve_entities=resolve_entities,
1322 remove_comments=remove_comments,
1323 remove_pis=remove_pis,
1324 strip_cdata=strip_cdata,
1329 # ET 1.2 compatible name
1330 XMLTreeBuilder = ETCompatXMLParser
# Module-level XMLParser built with all-default options; set_default_parser()
# falls back to this instance when the default is reset.
1333 cdef XMLParser __DEFAULT_XML_PARSER
1334 __DEFAULT_XML_PARSER = XMLParser()
# Register it as the default parser of the global parser context.
1336 __GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
# Public function: install ``parser`` as the default parser by passing it to
# __GLOBAL_PARSER_CONTEXT.setDefaultParser().  ``parser`` is typed as
# _BaseParser and defaults to None.
1338 def set_default_parser(_BaseParser parser=None):
1339 u"""set_default_parser(parser=None)
1341 Set a default parser for the current thread. This parser is used
1342 globally whenever no parser is supplied to the various parse functions of
1343 the lxml API. If this function is called without a parser (or if it is
1344 None), the default parser is reset to the original configuration.
1346 Note that the pre-installed default parser is not thread-safe. Avoid the
1347 default parser in multi-threaded environments. You can create a separate
1348 parser for each thread explicitly or use a parser pool.
1351 parser = __DEFAULT_XML_PARSER
# NOTE(review): the guard for the assignment above is not visible in this
# listing; per the docstring it applies when no parser (None) was given.
1352 __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser)
def get_default_parser():
    u"get_default_parser()"
    # The global parser context keeps track of the default parser (it is also
    # the object that set_default_parser() stores it on), so ask it.
    current_default = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return current_default
1358 ############################################################
1360 ############################################################
# libxml2 option bitmask that HTMLParser.__init__() starts from before it
# applies its keyword arguments: RECOVER, NONET and COMPACT are set.
1362 cdef int _HTML_DEFAULT_PARSE_OPTIONS
1363 _HTML_DEFAULT_PARSE_OPTIONS = (
1364 htmlparser.HTML_PARSE_RECOVER |
1365 htmlparser.HTML_PARSE_NONET |
1366 htmlparser.HTML_PARSE_COMPACT
# Public HTML parser class.  The constructor translates its keyword
# arguments into a libxml2 HTML option bitmask and passes it to
# _BaseParser.__init__().
# NOTE(review): the docstring entry for ``compact`` reads "safe memory",
# presumably meant as "save memory".
# NOTE(review): most ``if <keyword>:`` guards of __init__() are not visible
# in this listing; only the option updates themselves are shown.
1369 cdef class HTMLParser(_FeedParser):
1370 u"""HTMLParser(self, encoding=None, remove_blank_text=False, \
1371 remove_comments=False, remove_pis=False, strip_cdata=True, \
1372 no_network=True, target=None, XMLSchema schema=None, \
1373 recover=True, compact=True)
1377 This parser allows reading HTML into a normal XML tree. By
1378 default, it can read broken (non well-formed) HTML, depending on
1379 the capabilities of libxml2. Use the 'recover' option to switch
1382 Available boolean keyword arguments:
1384 - recover - try hard to parse through broken HTML (default: True)
1385 - no_network - prevent network access for related files (default: True)
1386 - remove_blank_text - discard empty text nodes
1387 - remove_comments - discard comments
1388 - remove_pis - discard processing instructions
1389 - strip_cdata - replace CDATA sections by normal text content (default: True)
1390 - compact - safe memory for short text content (default: True)
1392 Other keyword arguments:
1394 - encoding - override the document encoding
1395 - target - a parser target object that will receive the parse events
1396 - schema - an XMLSchema to validate against
1398 Note that you should avoid sharing parsers between threads for performance
1401 def __init__(self, *, encoding=None, remove_blank_text=False,
1402 remove_comments=False, remove_pis=False, strip_cdata=True,
1403 no_network=True, target=None, XMLSchema schema=None,
1404 recover=True, compact=True):
1405 cdef int parse_options
1406 parse_options = _HTML_DEFAULT_PARSE_OPTIONS
# NOBLANKS is not part of the defaults and is switched on with ``|``.
1407 if remove_blank_text:
1408 parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS
# RECOVER, NONET and COMPACT are set in _HTML_DEFAULT_PARSE_OPTIONS, so
# XOR-ing them here switches them off.
# NOTE(review): ``^`` toggles rather than clears; this relies on the flags
# being set in the defaults.
1410 parse_options = parse_options ^ htmlparser.HTML_PARSE_RECOVER
1412 parse_options = parse_options ^ htmlparser.HTML_PARSE_NONET
1414 parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
# Second argument is 1 here and 0 in XMLParser — presumably the
# "for HTML" flag of _BaseParser; verify against _BaseParser.__init__.
1416 _BaseParser.__init__(self, parse_options, 1, schema,
1417 remove_comments, remove_pis, strip_cdata,
1418 target, None, encoding)
# Module-level HTMLParser built with all-default options.  Unlike
# __DEFAULT_XML_PARSER, no setDefaultParser() call for it is visible here.
1420 cdef HTMLParser __DEFAULT_HTML_PARSER
1421 __DEFAULT_HTML_PARSER = HTMLParser()
1423 ############################################################
1424 ## helper functions for document creation
1425 ############################################################
# Parse an in-memory string into a libxml2 document.
#
#   text     -- unicode or byte string holding the document
#   filename -- name/URL for the document; encoded with _encodeFilenameUTF8()
#   parser   -- parser to use; the global default parser is looked up below
#               (the guard for that lookup is not visible in this listing)
#
# Returns an xmlDoc*; declared ``except NULL``.
1427 cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
1428 cdef char* c_filename
1430 cdef Py_ssize_t c_len
1432 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
1436 filename_utf = _encodeFilenameUTF8(filename)
1437 c_filename = _cstr(filename_utf)
# Data larger than INT_MAX bytes is wrapped in a StringIO/BytesIO object and
# parsed through the file-like path instead of being passed as one buffer.
1438 if python.PyUnicode_Check(text):
1439 c_len = python.PyUnicode_GET_DATA_SIZE(text)
1440 if c_len > python.INT_MAX:
1441 return (<_BaseParser>parser)._parseDocFromFilelike(
1442 StringIO(text), filename)
1443 return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
# Remaining case: ``text`` is handled as a byte string.
1445 c_len = python.PyBytes_GET_SIZE(text)
1446 if c_len > python.INT_MAX:
1447 return (<_BaseParser>parser)._parseDocFromFilelike(
1448 BytesIO(text), filename)
1449 c_text = _cstr(text)
1450 return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)
# Parse the file named by the byte string ``filename8`` by delegating to the
# parser's _parseDocFromFile().  The global default parser is looked up
# below (the guard for that lookup is not visible in this listing).
# Returns an xmlDoc*; declared ``except NULL``.
1452 cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
1454 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
1455 return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8))
# Parse from the file-like object ``source`` by delegating to the parser's
# _parseDocFromFilelike(); ``filename`` is passed through unchanged.  The
# global default parser is looked up below (the guard for that lookup is not
# visible in this listing).  Returns an xmlDoc*; declared ``except NULL``.
1457 cdef xmlDoc* _parseDocFromFilelike(source, filename,
1458 _BaseParser parser) except NULL:
1460 parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
1461 return (<_BaseParser>parser)._parseDocFromFilelike(source, filename)
# Create a new, empty XML document with xmlNewDoc(), give it the encoding
# "UTF-8" if it has none, and register it with the global parser context via
# initDocDict().  Declared ``except NULL``.
1463 cdef xmlDoc* _newXMLDoc() except NULL:
1465 result = tree.xmlNewDoc(NULL)
# Presumably reached only when the allocation above failed (guard not
# visible in this listing).
1467 python.PyErr_NoMemory()
1468 if result.encoding is NULL:
1469 result.encoding = tree.xmlStrdup("UTF-8")
1470 __GLOBAL_PARSER_CONTEXT.initDocDict(result)
# Create a new, empty HTML document with htmlNewDoc(NULL, NULL) and register
# it with the global parser context via initDocDict().
# Declared ``except NULL``.
1473 cdef xmlDoc* _newHTMLDoc() except NULL:
1475 result = tree.htmlNewDoc(NULL, NULL)
# Presumably reached only when the allocation above failed (guard not
# visible in this listing).
1477 python.PyErr_NoMemory()
1478 __GLOBAL_PARSER_CONTEXT.initDocDict(result)
# Copy the libxml2 document ``c_doc`` with xmlCopyDoc() and register the
# copy with the global parser context via initDocDict().
# Declared ``except NULL``.
1481 cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
# Two copy calls follow, one passing ``recursive`` and one passing 0; the
# condition selecting between them is not visible in this listing.
1485 result = tree.xmlCopyDoc(c_doc, recursive)
1487 result = tree.xmlCopyDoc(c_doc, 0)
# Presumably reached only when the copy failed (guard not visible).
1489 python.PyErr_NoMemory()
1490 __GLOBAL_PARSER_CONTEXT.initDocDict(result)
# Build a new document from ``c_doc`` whose root element is a deep copy of
# ``c_new_root``.  Declared ``except NULL``.
1493 cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
1494 u"Recursively copy the document and make c_new_root the new root node."
1496 cdef xmlNode* c_node
# NOTE(review): no NULL check on ``result`` is visible between the document
# copy and initDocDict() below — confirm that a failed xmlCopyDoc() cannot
# reach initDocDict() with a NULL pointer.
1497 result = tree.xmlCopyDoc(c_doc, 0) # non recursive
1498 __GLOBAL_PARSER_CONTEXT.initDocDict(result)
1500 c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive
# Presumably reached only when the node copy failed (guard not visible).
1502 python.PyErr_NoMemory()
1503 tree.xmlDocSetRootElement(result, c_node)
# Hand the siblings following the original root to _copyTail(), which
# presumably copies the root's tail text onto the new node.
1504 _copyTail(c_new_root.next, c_node)
# Deep-copy ``c_node`` with xmlDocCopyNode() so that the copy belongs to
# ``c_doc``.  Declared ``except NULL``.
1507 cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
1508 u"Recursively copy the element into the document. c_doc is not modified."
1509 cdef xmlNode* c_root
1510 c_root = tree.xmlDocCopyNode(c_node, c_doc, 1) # recursive
# Presumably reached only when the copy failed (guard not visible in this
# listing).
1512 python.PyErr_NoMemory()
# Hand the siblings following the original node to _copyTail(), which
# presumably copies the node's tail text onto the copy.
1513 _copyTail(c_node.next, c_root)
1517 ############################################################
1518 ## API level helper functions for _Document creation
1519 ## (here we convert to UTF-8)
1520 ############################################################
# Parse ``source`` into a _Document, dispatching on what kind of object it
# is:
#   * a string is treated as a file name/URL and parsed from there,
#   * an object with getvalue() and tell() that is positioned at 0 is parsed
#     from its in-memory value,
#   * any other object with read() is parsed as a file-like object,
#   * anything else raises TypeError.
# ``base_url``, when given, overrides the URL recorded for the document.
1522 cdef _Document _parseDocument(source, _BaseParser parser, base_url):
1524 if _isString(source):
1525 # parse the file directly from the filesystem
1526 doc = _parseDocumentFromURL(_encodeFilename(source), parser)
1527 # fix base URL if requested
1528 if base_url is not None:
1529 base_url = _encodeFilenameUTF8(base_url)
# Free the URL set by the parser before storing a copy of the new one.
1530 if doc._c_doc.URL is not NULL:
1531 tree.xmlFree(doc._c_doc.URL)
1532 doc._c_doc.URL = tree.xmlStrdup(_cstr(base_url))
# Non-string sources: choose the URL to report.  The visible branches test
# base_url and fall back to _getFilenameForFile(source); the assignment for
# the base_url case is not visible in this listing.
1535 if base_url is not None:
1538 url = _getFilenameForFile(source)
1540 if hasattr(source, u'getvalue') and hasattr(source, u'tell'):
1541 # StringIO - reading from start?
1542 if source.tell() == 0:
1543 return _parseMemoryDocument(
1544 source.getvalue(), _encodeFilenameUTF8(url), parser)
1546 # Support for file-like objects (urlgrabber.urlopen, ...)
1547 if hasattr(source, u'read'):
1548 return _parseFilelikeDocument(
1549 source, _encodeFilenameUTF8(url), parser)
1551 raise TypeError, u"cannot parse from '%s'" % funicode(python._fqtypename(source))
# Parse the file/URL ``url`` with _parseDocFromFile() and wrap the resulting
# libxml2 document in a _Document via _documentFactory().
1553 cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
1555 c_doc = _parseDocFromFile(url, parser)
1556 return _documentFactory(c_doc, parser)
# Parse the in-memory string ``text`` (unicode or bytes) with _parseDoc()
# and wrap the result in a _Document.  ``url`` is encoded to UTF-8 bytes
# first if it is a unicode string.  Raises ValueError for non-string input.
1558 cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
1560 if python.PyUnicode_Check(text):
# Unicode input carrying an encoding declaration is rejected; the ``raise``
# line that belongs to the message below is not visible in this listing.
1561 if _hasEncodingDeclaration(text):
1563 u"Unicode strings with encoding declaration are not supported."
1564 # pass native unicode only if libxml2 can handle it
# Otherwise fall back to a UTF-8 encoded byte string.
1565 if _UNICODE_ENCODING is NULL:
1566 text = python.PyUnicode_AsUTF8String(text)
1567 elif not python.PyBytes_Check(text):
1568 raise ValueError, u"can only parse strings"
1569 if python.PyUnicode_Check(url):
1570 url = python.PyUnicode_AsUTF8String(url)
1571 c_doc = _parseDoc(text, url, parser)
1572 return _documentFactory(c_doc, parser)
# Parse from the file-like object ``source`` with _parseDocFromFilelike()
# and wrap the result in a _Document.  ``url`` is encoded to UTF-8 bytes
# first if it is a unicode string.
1574 cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
1576 if python.PyUnicode_Check(url):
1577 url = python.PyUnicode_AsUTF8String(url)
1578 c_doc = _parseDocFromFilelike(source, url, parser)
1579 return _documentFactory(c_doc, parser)