src/lxml/parser.pxi

   1 # Parsers for XML and HTML
   2
   3 cimport xmlparser
   4 cimport htmlparser
   5
   6 cdef class _ParserContext(_ResolverContext)  # forward declaration, the body follows further down in this file
   7 cdef class _SaxParserContext(_ParserContext)
   8 cdef class _TargetParserContext(_SaxParserContext)
   9 cdef class _ParserSchemaValidationContext
  10 cdef class _Validator
  11 cdef class XMLSchema(_Validator)
     # NOTE(review): apart from _ParserContext, the bodies of the types declared
     # above are not part of this section -- presumably they live in other
     # parts of the lxml sources; confirm before relying on it.
  12
  13 class ParseError(LxmlSyntaxError):
  14     u"""Raised when a document could not be parsed.
  15
  16     Exists for compatibility with ElementTree 1.3 and later.  Instances
  17     provide ``code`` and ``position``, a ``(line, column)`` tuple.
     """
  18     def __init__(self, message, code, line, column):
  19         if python.PY_VERSION_HEX < 0x02050000:
  20             # before Python 2.5: call the base class initialiser directly
  21             _LxmlSyntaxError.__init__(self, message)
  22         else:
  23             super(_ParseError, self).__init__(message)
  24         self.code = code
  25         self.position = (line, column)
  26
  27 cdef object _LxmlSyntaxError = LxmlSyntaxError  # used by ParseError.__init__ on Python < 2.5
  28 cdef object _ParseError = ParseError  # used for the super() call in ParseError.__init__
  29
  30 class XMLSyntaxError(ParseError):
  31     u"""Raised for syntax errors found while parsing an XML document."""
  34
  35 class ParserError(LxmlError):
  36     u"""Raised for internal errors of the lxml parser machinery."""
  39
  40 cdef class _ParserDictionaryContext:
  41     # Global parser context to share the string dictionary.
  42     #
  43     # This class is a delegate singleton!
  44     #
  45     # It creates _ParserDictionaryContext objects for each thread to keep thread state,
  46     # but those must never be used directly.  Always stick to using the static
  47     # __GLOBAL_PARSER_CONTEXT as defined below the class.
  48     #
         # Methods that need per-thread state fetch the object of the calling
         # thread through _findThreadParserContext() and work on that one.
  49
         # libxml2 string dictionary held by this object (released in __dealloc__)
  50     cdef tree.xmlDict* _c_dict
         # default parser handed out by getDefaultParser(), created on first use
  51     cdef _BaseParser _default_parser
         # stack of implied parser contexts: pushed and popped at the list end
  52     cdef list _implied_parser_contexts
  53
  54     def __cinit__(self):
  55         self._c_dict = NULL
  56         self._implied_parser_contexts = []
  57
  58     def __dealloc__(self):
             # give up the reference this object holds on its dictionary
  59         if self._c_dict is not NULL:
  60             xmlparser.xmlDictFree(self._c_dict)
  61
  62     cdef void initMainParserContext(self):
  63         u"""Put the global context into the thread dictionary of the main
  64         thread.  To be called once and only in the main thread."""
  65         cdef python.PyObject* thread_dict
             # NOTE(review): 'result' is declared but never used in this method
  66         cdef python.PyObject* result
  67         thread_dict = python.PyThreadState_GetDict()
             # silently does nothing if no thread state dict is available
  68         if thread_dict is not NULL:
  69             (<dict>thread_dict)[u"_ParserDictionaryContext"] = self
  70
  71     cdef _ParserDictionaryContext _findThreadParserContext(self):
  72         u"Find (or create) the _ParserDictionaryContext object for the current thread"
  73         cdef python.PyObject* thread_dict
  74         cdef python.PyObject* result
  75         cdef _ParserDictionaryContext context
  76         thread_dict = python.PyThreadState_GetDict()
  77         if thread_dict is NULL:
                 # no thread state dict => fall back to this (the global) object
  78             return self
  79         d = <dict>thread_dict
  80         result = python.PyDict_GetItem(d, u"_ParserDictionaryContext")
  81         if result is not NULL:
  82             return <object>result
             # first call in this thread: create a context and remember it
  83         context = _ParserDictionaryContext()
  84         d[u"_ParserDictionaryContext"] = context
  85         return context
  86
  87     cdef void setDefaultParser(self, _BaseParser parser):
  88         u"Set the default parser for the current thread"
  89         cdef _ParserDictionaryContext context
  90         context = self._findThreadParserContext()
  91         context._default_parser = parser
  92
  93     cdef _BaseParser getDefaultParser(self):
  94         u"Return (or create) the default parser of the current thread"
  95         cdef _ParserDictionaryContext context
  96         context = self._findThreadParserContext()
  97         if context._default_parser is None:
                 # this object's own default parser is a copy of __DEFAULT_XML_PARSER
  98             if self._default_parser is None:
  99                 self._default_parser = __DEFAULT_XML_PARSER._copy()
                 # any other thread context gets its own copy of that parser
 100             if context is not self:
 101                 context._default_parser = self._default_parser._copy()
 102         return context._default_parser
 103
 104     cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
 105         u"Return the thread-local dict or create a new one if necessary."
 106         cdef _ParserDictionaryContext context
 107         context = self._findThreadParserContext()
 108         if context._c_dict is NULL:
 109             # thread dict not yet set up => use default or create a new one
 110             if default is not NULL:
                     # adopt the dict passed in and take a reference for the context
 111                 context._c_dict = default
 112                 xmlparser.xmlDictReference(default)
 113                 return default
 114             if self._c_dict is NULL:
 115                 self._c_dict = xmlparser.xmlDictCreate()
                 # other thread contexts get a sub-dictionary of this object's dict
 116             if context is not self:
 117                 context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
 118         return context._c_dict
 119
 120     cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref):
             # Make c_dict_ref[0] refer to the dictionary of the current thread.
             # The current value of c_dict_ref[0] is offered to _getThreadDict()
             # as the default in case the thread has no dictionary yet.
 121         cdef tree.xmlDict* c_dict
 122         cdef tree.xmlDict* c_thread_dict
 123         c_dict = c_dict_ref[0]
 124         c_thread_dict = self._getThreadDict(c_dict)
 125         if c_dict is c_thread_dict:
                 # already the thread dictionary => nothing to replace
 126             return
             # drop the old dictionary (if any) and store the thread dictionary
             # instead, taking a reference on behalf of the new holder
 127         if c_dict is not NULL:
 128             xmlparser.xmlDictFree(c_dict)
 129         c_dict_ref[0] = c_thread_dict
 130         xmlparser.xmlDictReference(c_thread_dict)
 131
 132     cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt):
 133         u"Assure we always use the same string dictionary."
 134         self.initThreadDictRef(&pctxt.dict)
 135
 136     cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt):
 137         u"Assure we always use the same string dictionary."
 138         self.initThreadDictRef(&pctxt.dict)
 139
 140     cdef void initDocDict(self, xmlDoc* result):
 141         u"Store dict of last object parsed if no shared dict yet"
 142         # XXX We also free the result dict here if there already was one.
 143         # This case should only occur for new documents with empty dicts,
 144         # otherwise we'd free data that's in use => segfault
 145         self.initThreadDictRef(&result.dict)
 146
 147     cdef _ParserContext findImpliedContext(self):
 148         u"""Return any current implied xml parser context for the current
 149         thread.  This is used when the resolver functions are called
 150         with an xmlParserCtxt that was generated from within libxml2
 151         (i.e. without a _ParserContext) - which happens when parsing
 152         schema and xinclude external references."""
 153         cdef _ParserDictionaryContext context
 154         cdef _ParserContext implied_context
 155
 156         # see if we have a current implied parser
 157         context = self._findThreadParserContext()
 158         if python.PyList_GET_SIZE(context._implied_parser_contexts):
                 # the most recently pushed entry is the current one (may be None)
 159             implied_context = context._implied_parser_contexts[-1]
 160             return implied_context
 161         return None
 162
 163     cdef void pushImpliedContextFromParser(self, _BaseParser parser):
 164         u"Push a new implied context object taken from the parser."
 165         if parser is not None:
 166             self.pushImpliedContext(parser._getParserContext())
 167         else:
                 # no parser => push None so that the matching pop stays balanced
 168             self.pushImpliedContext(None)
 169
 170     cdef void pushImpliedContext(self, _ParserContext parser_context):
 171         u"Push a new implied context object."
 172         cdef _ParserDictionaryContext context
 173         context = self._findThreadParserContext()
 174         context._implied_parser_contexts.append(parser_context)
 175
 176     cdef void popImpliedContext(self):
 177         u"Pop the current implied context object."
 178         cdef _ParserDictionaryContext context
 179         context = self._findThreadParserContext()
 180         context._implied_parser_contexts.pop()
 181
 182 cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT  # the instance that all code is meant to go through
 183 __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
     # store the instance under u"_ParserDictionaryContext" in the thread state
     # dict of the thread that executes this module code
 184 __GLOBAL_PARSER_CONTEXT.initMainParserContext()
 185
 186 cdef int _checkThreadDict(tree.xmlDict* c_dict):
 187     u"""Return 1 if c_dict is the string dictionary of the current thread
 188     and 0 otherwise.
 189     """
 190     cdef tree.xmlDict* c_thread_dict
 191     # passing NULL: do not offer a default dict, use what the thread has
 192     c_thread_dict = __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL)
 193     if c_thread_dict is not c_dict:
 194         return 0
         return 1 # local thread dict
 195
 196 ############################################################
 197 ## support for Python unicode I/O
 198 ############################################################
 199
 200 # name of Python unicode encoding as known to libxml2
     # (stays NULL unless _setupPythonUnicode() finds a matching encoding handler)
 201 cdef char* _UNICODE_ENCODING = NULL
 202
 203 cdef int _setupPythonUnicode() except -1:
 204     u"""Determine the libxml2 name of the encoding that Python uses for the
 205     data buffer of unicode strings and store it in _UNICODE_ENCODING.
 206
 207     The name is only stored if libxml2 has an encoding handler for it, which
 208     depends on iconv and the local Python installation.  Always returns 0.
 209     """
 210     cdef tree.xmlCharEncodingHandler* c_handler
 211     cdef Py_ssize_t c_size
 212     cdef char* c_data
 213     cdef char* c_name
 214     global _UNICODE_ENCODING
 215     probe = python.PyUnicode_DecodeUTF8("<test/>", 7, NULL)
 216     c_size = python.PyUnicode_GET_DATA_SIZE(probe)
 217     c_data = python.PyUnicode_AS_DATA(probe)
 218     c_name = _findEncodingName(c_data, c_size)
 219     if c_name is NULL and c_size >= 4:
 220         # apparently, libxml2 can't detect UTF-16 on some systems
 221         # => look at the byte pattern of the leading "<t" ourselves
 222         if c_data[0] == c'<' and c_data[1] == c'\0' and \
 223                c_data[2] == c't' and c_data[3] == c'\0':
 224             c_name = "UTF-16LE"
 225         elif c_data[0] == c'\0' and c_data[1] == c'<' and \
 226                c_data[2] == c'\0' and c_data[3] == c't':
 227             c_name = "UTF-16BE"
 228     if c_name is NULL:
 229         # encoding could not be determined => leave _UNICODE_ENCODING unset
 230         return 0
 231     c_handler = tree.xmlFindCharEncodingHandler(c_name)
 232     if c_handler is NULL:
 233         return 0
 234     # the handler was only needed as a proof of existence
 235     tree.xmlCharEncCloseFunc(c_handler)
         _UNICODE_ENCODING = c_name
         return 0
 236
 237 cdef char* _findEncodingName(char* buffer, int size):
 238     u"""Work around bug in libxml2: map the encoding detected for the
 239     buffer content to its iconv name.  Returns NULL if none was detected.
 240     """
 241     cdef tree.xmlCharEncoding c_detected
 242     c_detected = tree.xmlDetectCharEncoding(buffer, size)
 243     if c_detected == tree.XML_CHAR_ENCODING_NONE:
 244         return NULL
 245     if c_detected == tree.XML_CHAR_ENCODING_UTF16LE:
 246         return "UTF-16LE"
 247     if c_detected == tree.XML_CHAR_ENCODING_UTF16BE:
 248         return "UTF-16BE"
 249     if c_detected == tree.XML_CHAR_ENCODING_UCS4LE:
 250         return "UCS-4LE"
 251     if c_detected == tree.XML_CHAR_ENCODING_UCS4BE:
 252         return "UCS-4BE"
 253     # all other encodings: let libxml2 name them
 254     # (returns a constant char*, no need to free it)
 255     return tree.xmlGetCharEncodingName(c_detected)
 254
 255 _setupPythonUnicode()  # executed with the module code; sets _UNICODE_ENCODING if a handler is found
 256
 257 ############################################################
 258 ## support for file-like objects
 259 ############################################################
 260
 261 cdef class _FileReaderContext:
         # Feeds the content of a Python file or file-like object to libxml2.
         #
         # If PyFile_AsFile() provides a C stream for the object, libxml2 reads
         # from that stream directly (_readFileParser).  Otherwise libxml2 calls
         # back into copyToBuffer() (through _readFilelikeParser), which uses
         # the .read() method of the object.
         #
         # The input buffers created here store a plain pointer to this object,
         # so the object has to be kept alive for as long as they are in use.
 262     cdef object _filelike
 263     cdef object _encoding
 264     cdef object _url
         # data returned by the last .read() call and the number of bytes of it
         # that were already handed to libxml2 (-1: end of input was reached)
 265     cdef object _bytes
 266     cdef _ExceptionContext _exc_context
 267     cdef Py_ssize_t _bytes_read
         # points into the string that self._url keeps alive
 268     cdef char* _c_url
 269     cdef bint _close_file_after_read
 270
 271     def __cinit__(self, filelike, exc_context, url, encoding=None, bint close_file=False):
 272         self._exc_context = exc_context
 273         self._filelike = filelike
 274         self._close_file_after_read = close_file
 275         self._encoding = encoding
 276         if url is None:
 277             self._c_url = NULL
 278         else:
 279             url = _encodeFilename(url)
 280             self._c_url = _cstr(url)
 281         self._url = url
 282         self._bytes  = b''
 283         self._bytes_read = 0
 284
 285     cdef _close_file(self):
             # Call .close() on the file-like object if closing was requested.
             # The reference to the object is dropped before close() is called,
             # so the file is closed at most once.
 286         if self._filelike is None or not self._close_file_after_read:
 287             return
 288         try:
 289             close = self._filelike.close
 290         except AttributeError:
 291             close = None
 292         finally:
 293             self._filelike = None
 294         if close is not None:
 295             close()
 296
 297     cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self):
             # Create a libxml2 input buffer that reads from the file(-like)
             # object.  Returns NULL if the buffer could not be allocated.
 298         cdef cstd.FILE* c_stream
 299         cdef xmlparser.xmlParserInputBuffer* c_buffer
 300         c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
             if c_buffer is NULL:
                 # allocation failed: do not write through the NULL pointer, let
                 # the callers pass the failure on to libxml2
                 return NULL
 301         c_stream = python.PyFile_AsFile(self._filelike)
 302         if c_stream is NULL:
                 # not a real file => read through the Python level callback
 303             c_buffer.readcallback  = _readFilelikeParser
 304             c_buffer.context = <python.PyObject*>self
 305         else:
 306             c_buffer.readcallback  = _readFileParser
 307             c_buffer.context = c_stream
 308         return c_buffer
 309
 310     cdef xmlparser.xmlParserInput* _createParserInput(
 311             self, xmlparser.xmlParserCtxt* ctxt):
             # Wrap a new input buffer into a parser input for ctxt.
 312         cdef xmlparser.xmlParserInputBuffer* c_buffer
 313         c_buffer = self._createParserInputBuffer()
 314         return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0)
 315
 316     cdef tree.xmlDtd* _readDtd(self):
             # Parse a DTD from the file(-like) object.
 317         cdef xmlparser.xmlParserInputBuffer* c_buffer
 318         c_buffer = self._createParserInputBuffer()
 319         with nogil:
 320             return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0)
 321
 322     cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
             # Parse a document from the file(-like) object using ctxt and the
             # given parse options.  The HTML or the XML reader is chosen by
             # ctxt.html.  Returns the result of the libxml2 call, or NULL if
             # fixing the names of an HTML result failed.
 323         cdef xmlDoc* result
 324         cdef char* c_encoding
 325         cdef cstd.FILE* c_stream
 326         cdef xmlparser.xmlInputReadCallback c_read_callback
 327         cdef xmlparser.xmlInputCloseCallback c_close_callback
 328         cdef void* c_callback_context
 329
 330         if self._encoding is None:
 331             c_encoding = NULL
 332         else:
 333             c_encoding = _cstr(self._encoding)
 334
 335         c_stream = python.PyFile_AsFile(self._filelike)
 336         if c_stream is NULL:
 337             c_read_callback  = _readFilelikeParser
 338             c_callback_context = <python.PyObject*>self
 339         else:
 340             c_read_callback  = _readFileParser
 341             c_callback_context = c_stream
 342
             # no close callback is passed (NULL): the file is closed below
 343         with nogil:
 344             if ctxt.html:
 345                 result = htmlparser.htmlCtxtReadIO(
 346                         ctxt, c_read_callback, NULL, c_callback_context,
 347                         self._c_url, c_encoding, options)
 348                 if result is not NULL:
 349                     if _fixHtmlDictNames(ctxt.dict, result) < 0:
 350                         tree.xmlFreeDoc(result)
 351                         result = NULL
 352             else:
 353                 result = xmlparser.xmlCtxtReadIO(
 354                     ctxt, c_read_callback, NULL, c_callback_context,
 355                     self._c_url, c_encoding, options)
 356         self._close_file()
 357         return result
 358
 359     cdef int copyToBuffer(self, char* c_buffer, int c_requested):
             # Copy up to c_requested bytes of input data into c_buffer.
             #
             # Returns the number of bytes copied, which is less than requested
             # (possibly 0) once .read() returns no more data, or -1 if an
             # exception occurred.  The exception is stored in the exception
             # context, it is never propagated to the (C) caller.
 360         cdef int c_byte_count
 361         cdef char* c_start
 362         cdef Py_ssize_t byte_count, remaining
 363         if self._bytes_read < 0:
                 # end of input was already reached by an earlier call
 364             return 0
 365         try:
 366             c_byte_count = 0
 367             byte_count = python.PyBytes_GET_SIZE(self._bytes)
 368             remaining  = byte_count - self._bytes_read
 369             while c_requested > remaining:
                     # use up what is left of the current chunk, then read more
 370                 c_start = _cstr(self._bytes) + self._bytes_read
 371                 cstd.memcpy(c_buffer, c_start, remaining)
 372                 c_byte_count += remaining
 373                 c_buffer += remaining
 374                 c_requested -= remaining
 375
 376                 self._bytes = self._filelike.read(c_requested)
 377                 if not python.PyBytes_Check(self._bytes):
 378                     if python.PyUnicode_Check(self._bytes):
                             # unicode data is encoded before it is passed on
 379                         if self._encoding is None:
 380                             self._bytes = python.PyUnicode_AsUTF8String(self._bytes)
 381                         else:
 382                             self._bytes = python.PyUnicode_AsEncodedString(
 383                                 self._bytes, _cstr(self._encoding), NULL)
 384                     else:
 385                         self._close_file()
 386                         raise TypeError, \
 387                             u"reading from file-like objects must return byte strings or unicode strings"
 388
 389                 remaining = python.PyBytes_GET_SIZE(self._bytes)
 390                 if remaining == 0:
                         # nothing more to read: remember that and stop
 391                     self._bytes_read = -1
 392                     self._close_file()
 393                     return c_byte_count
 394                 self._bytes_read = 0
 395
 396             if c_requested > 0:
                     # the current chunk holds enough data for the rest
 397                 c_start = _cstr(self._bytes) + self._bytes_read
 398                 cstd.memcpy(c_buffer, c_start, c_requested)
 399                 c_byte_count += c_requested
 400                 self._bytes_read += c_requested
 401             return c_byte_count
 402         except:
                 # catch everything: this runs as a callback of the C parser
 403             self._exc_context._store_raised()
 404             self._close_file()
 405             return -1
 406
 407 cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with gil:
         # read callback for Python file-like objects: ctxt is the
         # _FileReaderContext that does the actual reading
         cdef _FileReaderContext reader
         reader = <_FileReaderContext>ctxt
 408     return reader.copyToBuffer(c_buffer, c_size)
 409
 410 cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil:
         # read callback for C level files: ctxt is the FILE* to read from
         cdef cstd.FILE* c_stream
         c_stream = <cstd.FILE*>ctxt
 411     return cstd.fread(c_buffer, 1, c_size, c_stream)
 412
 413 ############################################################
 414 ## support for custom document loaders
 415 ############################################################
 416
 417 cdef xmlparser.xmlParserInput* _local_resolver(char* c_url, char* c_pubid,
 418                                                xmlparser.xmlParserCtxt* c_context) with gil:
         # External entity loader that asks the resolvers of the responsible
         # _ResolverContext for the input.  Falls back to the entity loader
         # that was installed before (if any) when no context is available or
         # no resolver provided a usable input.  Returns NULL if an exception
         # occurred; the exception is stored in the context.
 419     cdef _ResolverContext context
 420     cdef xmlparser.xmlParserInput* c_input
 421     cdef _InputDocument doc_ref
 422     cdef _FileReaderContext file_context
 423     # if there is no _ParserContext associated with the xmlParserCtxt
 424     # passed, check to see if the thread state object has an implied
 425     # context.
 426     if c_context._private is not NULL:
 427         context = <_ResolverContext>c_context._private
 428     else:
 429         context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()
 430
 431     if context is None:
 432         if __DEFAULT_ENTITY_LOADER is NULL:
 433             return NULL
 434         return __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
 435
         c_input = NULL
         # Everything that runs Python code is kept inside the try block: this
         # function is called from C, so an exception that escapes it cannot
         # reach the user.  Storing it in the context allows reporting it later.
 436     try:
 437         if c_url is NULL:
 438             url = None
 439         else:
 440             # parsing a related document (DTD etc.) => UTF-8 encoded URL?
 441             url = _decodeFilename(c_url)
 442         if c_pubid is NULL:
 443             pubid = None
 444         else:
 445             pubid = funicode(c_pubid) # always UTF-8
 446
 447         doc_ref = context._resolvers.resolve(url, pubid, context)

 452         if doc_ref is not None:
 453             if doc_ref._type == PARSER_DATA_STRING:
 454                 data = doc_ref._data_bytes
 455                 c_input = xmlparser.xmlNewInputStream(c_context)
 456                 if c_input is not NULL:
                         # the input refers to the string data directly
 457                     c_input.base = _cstr(data)
 458                     c_input.length = python.PyBytes_GET_SIZE(data)
 459                     c_input.cur = c_input.base
 460                     c_input.end = &c_input.base[c_input.length]
 461             elif doc_ref._type == PARSER_DATA_FILENAME:
 462                 data = None
 463                 c_input = xmlparser.xmlNewInputFromFile(
 464                     c_context, _cstr(doc_ref._filename))
 465             elif doc_ref._type == PARSER_DATA_FILE:
 466                 file_context = _FileReaderContext(doc_ref._file, context, url,
 467                                                   None, doc_ref._close_file)
 468                 c_input = file_context._createParserInput(c_context)
 469                 data = file_context
 470             else:
 471                 data = None
 472                 c_input = NULL
 473
 474             if data is not None:
                     # the input only borrows 'data' => keep it alive in the context
 475                 context._storage.add(data)
 448     except:
             # NOTE(review): an input that was created before the failure is not
             # freed here
 449         context._store_raised()
 450         return NULL
 451
 476     if c_input is not NULL:
 477         return c_input
 478
         # no input from the resolvers => let the previous loader try
 479     if __DEFAULT_ENTITY_LOADER is NULL:
 480         return NULL
 481     return __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
 482
 483 cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER  # loader found at setup time; fallback used by _local_resolver
 484 __DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
 485
     # from here on, libxml2 asks _local_resolver for external entities
 486 xmlparser.xmlSetExternalEntityLoader(_local_resolver)
 487
 488 ############################################################
 489 ## Parsers
 490 ############################################################
 491
 492 cdef class _ParserContext(_ResolverContext):
         # Python level companion of a libxml2 parser context (xmlParserCtxt).
         #
         # Holds the error log, an optional schema validator and a lock that is
         # acquired in prepare() and released in cleanup().  The C context is
         # freed together with this object.
 493     cdef _ErrorLog _error_log
         # may be None (no validation)
 494     cdef _ParserSchemaValidationContext _validator
 495     cdef xmlparser.xmlParserCtxt* _c_ctxt
         # NULL if threading support is disabled
 496     cdef python.PyThread_type_lock _lock
 497     def __cinit__(self):
 498         self._c_ctxt = NULL
 499         if not config.ENABLE_THREADING:
 500             self._lock = NULL
 501         else:
 502             self._lock = python.PyThread_allocate_lock()
 503         self._error_log = _ErrorLog()
 504
 505     def __dealloc__(self):
 506         if self._validator is not None:
 507             self._validator.disconnect()
 508         if self._lock is not NULL:
 509             python.PyThread_free_lock(self._lock)
 510         if self._c_ctxt is not NULL:
 511             xmlparser.xmlFreeParserCtxt(self._c_ctxt)
 512
 513     cdef _ParserContext _copy(self):
             # Create a new context of the same class with a copy of the
             # validator (if there is one) and of the resolvers.  The copy has
             # no C parser context attached.
 514         cdef _ParserContext context
 515         context = self.__class__()
             # _validator is None when no validation was configured; calling a
             # C method on None must be avoided
             if self._validator is not None:
 516             context._validator = self._validator.copy()
 517         _initParserContext(context, self._resolvers._copy(), NULL)
 518         return context
 519
 520     cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
             # Attach the C parser context and let it point back to this object
             # (_local_resolver reads the _private field).
 521         self._c_ctxt = c_ctxt
 522         c_ctxt._private = <void*>self
 523
 524     cdef void _resetParserContext(self):
             # Reset the C parser context (if any) so that it can be used again.
 525         if self._c_ctxt is not NULL:
 526             if self._c_ctxt.html:
 527                 htmlparser.htmlCtxtReset(self._c_ctxt)
 528                 self._c_ctxt.disableSAX = 0 # work around bug in libxml2
 529             elif self._c_ctxt.spaceTab is not NULL or \
 530                     _LIBXML_VERSION_INT >= 20629: # work around bug in libxml2
 531                 xmlparser.xmlClearParserCtxt(self._c_ctxt)
 532
 533     cdef int prepare(self) except -1:
             # Get the context ready for a parser run: acquire the lock (waiting
             # without holding the GIL), connect the error log and the validator.
             # Raises ParserError if the lock could not be acquired.
 534         cdef int result
 535         if config.ENABLE_THREADING and self._lock is not NULL:
 536             with nogil:
 537                 result = python.PyThread_acquire_lock(
 538                     self._lock, python.WAIT_LOCK)
 539             if result == 0:
 540                 raise ParserError, u"parser locking failed"
 541         self._error_log.connect()
 542         if self._validator is not None:
 543             self._validator.connect(self._c_ctxt)
 544         return 0
 545
 546     cdef int cleanup(self) except -1:
             # Counterpart of prepare(): disconnect the validator, reset the C
             # context, clear this context, disconnect the error log and
             # release the lock.
 547         if self._validator is not None:
 548             self._validator.disconnect()
 549         self._resetParserContext()
 550         self.clear()
 551         self._error_log.disconnect()
 552         if config.ENABLE_THREADING and self._lock is not NULL:
 553             python.PyThread_release_lock(self._lock)
 554         return 0
 555
 556     cdef object _handleParseResult(self, _BaseParser parser,
 557                                    xmlDoc* result, filename):
             # Check the parse result (raising on errors) and wrap the
             # resulting C document using _documentFactory().
 558         cdef xmlDoc* c_doc
 559         cdef bint recover
 560         recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
 561         c_doc = _handleParseResult(self, self._c_ctxt, result,
 562                                    filename, recover)
 563         return _documentFactory(c_doc, parser)
 564
 565     cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
 566                                        xmlDoc* result, filename) except NULL:
             # Like _handleParseResult(), but returns the checked C document.
 567         cdef bint recover
 568         recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
 569         return _handleParseResult(self, self._c_ctxt, result,
 570                                    filename, recover)
 571
 572 cdef _initParserContext(_ParserContext context,
 573                         _ResolverRegistry resolvers,
 574                         xmlparser.xmlParserCtxt* c_ctxt):
         # Set up the resolver part of the context and attach the C parser
         # context, if one was passed.
 575     _initResolverContext(context, resolvers)
 576     if c_ctxt is NULL:
             # no C parser context to attach
 577         return
         context._initParserContext(c_ctxt)
 578
 579 cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
 580                           _ErrorLog error_log) except 0:
         # Raise the exception that describes the failed parser run.  Every
         # path through this function raises.
 581     if filename is not None and \
 582            ctxt.lastError.domain == xmlerror.XML_FROM_IO:
             # problems with reading a named input are reported as IOError
 583         if ctxt.lastError.message is NULL:
 584             raise IOError, u"Error reading '%s'" % filename
 585         reason = (ctxt.lastError.message).strip()
 586         raise IOError, u"Error reading file '%s': %s" % (filename, reason)
 587
 588     if error_log:
             # let a non-empty error log build the exception
 589         raise error_log._buildParseException(
 590             XMLSyntaxError, u"Document is not well formed")
 591
 592     if ctxt.lastError.message is NULL:
             # nothing is known about the error at all
 593         raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0)
 594
         # otherwise, describe the last error of the parser context
 595     line = ctxt.lastError.line
 596     message = (ctxt.lastError.message).strip()
 597     if ctxt.lastError.line > 0:
 598         message = u"line %d: %s" % (line, message)
 599     raise XMLSyntaxError(
 600         message, ctxt.lastError.code, line, ctxt.lastError.int2)
 602
 603 cdef xmlDoc* _handleParseResult(_ParserContext context,
 604                                 xmlparser.xmlParserCtxt* c_ctxt,
 605                                 xmlDoc* result, filename,
 606                                 bint recover) except NULL:
         # Check the outcome of a parser run and return the document if it is
         # acceptable.
         #
         # Frees the document and raises if it is not well formed (unless
         # 'recover' is set), if validation failed or if an exception was
         # stored in the context.  Otherwise makes sure that the document has
         # a URL (taken from 'filename', if given) and an encoding.
         #
         # 'context' may be None; all accesses to it are guarded accordingly.
 607     cdef bint well_formed
 608     if result is not NULL:
 609         __GLOBAL_PARSER_CONTEXT.initDocDict(result)
 610
 611     if c_ctxt.myDoc is not NULL:
             # a document that is still attached to the parser context and is
             # not the result gets freed
 612         if c_ctxt.myDoc is not result:
 613             __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
 614             tree.xmlFreeDoc(c_ctxt.myDoc)
 615         c_ctxt.myDoc = NULL
 616
 617     if result is not NULL:
 618         if context is not None and context._validator is not None and \
 619                 not context._validator.isvalid():
 620             well_formed = 0 # actually not 'valid', but anyway ...
 621         elif recover or (c_ctxt.wellFormed and \
 622                              c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
 623             well_formed = 1
 624         elif not c_ctxt.replaceEntities and not c_ctxt.validate \
 625                  and context is not None:
 626             # in this mode, we ignore errors about undefined entities
 627             for error in context._error_log.filter_from_errors():
 628                 if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
 629                        error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
 630                     well_formed = 0
 631                     break
 632             else:
 633                 well_formed = 1
 634         else:
 635             well_formed = 0
 636
 637         if not well_formed:
 638             # free broken document
 639             tree.xmlFreeDoc(result)
 640             result = NULL
 641
 642     if context is not None and context._has_raised():
             # an exception stored during parsing takes precedence over the result
 643         if result is not NULL:
 644             tree.xmlFreeDoc(result)
 645             result = NULL
 646         context._raise_if_stored()
 647
 648     if result is NULL:
 649         if context is not None:
 650             _raiseParseError(c_ctxt, filename, context._error_log)
 651         else:
 652             _raiseParseError(c_ctxt, filename, None)
 653     else:
 654         if result.URL is NULL and filename is not None:
 655             result.URL = tree.xmlStrdup(_cstr(filename))
 656         if result.encoding is NULL:
 657             result.encoding = tree.xmlStrdup("UTF-8")
 658
         # result cannot be NULL here: _raiseParseError() always raises
 659     if context is not None and context._validator is not None and \
 660            context._validator._add_default_attributes:
 661         # we currently need to do this here as libxml2 does not
 662         # support inserting default attributes during parse-time
 663         # validation
 664         context._validator.inject_default_attributes(result)
 665
 666     return result
 667
 668 cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
         # Make the element nodes of c_doc (and their attributes) use names
         # that are stored in c_dict, see _fixHtmlDictNodeNames().
         # Returns 0 on success or if c_doc is NULL, -1 if fixing a node failed.
 669     cdef xmlNode* c_node
 670     if c_doc is NULL:
 671         return 0
 672     c_node = c_doc.children
         # the two macros wrap the statements between them into a loop over
         # nodes -- presumably a traversal of the tree below c_doc; TODO confirm
         # against the macro definitions
 673     tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 0)
 674     if c_node.type == tree.XML_ELEMENT_NODE:
 675         if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
 676             return -1
 677     tree.END_FOR_EACH_ELEMENT_FROM(c_node)
 678     return 0
 679
 680 cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
 681                                       xmlNode* c_node) nogil:
         # Replace the name of c_node and the names of its attributes by the
         # corresponding entries of c_dict.  A name that is not the dictionary
         # entry itself is freed.  Returns -1 if a dictionary lookup failed,
         # 0 otherwise.
 682     cdef xmlNode* c_prop
 683     cdef char* c_entry
 684     c_entry = tree.xmlDictLookup(c_dict, c_node.name, -1)
 685     if c_entry is NULL:
 686         return -1
 687     if c_node.name is not c_entry:
 688         tree.xmlFree(c_node.name)
 689         c_node.name = c_entry
         # same procedure for each attribute of the node
 690     c_prop = <xmlNode*>c_node.properties
 691     while c_prop is not NULL:
 692         c_entry = tree.xmlDictLookup(c_dict, c_prop.name, -1)
 693         if c_entry is NULL:
 694             return -1
 695         if c_prop.name is not c_entry:
 696             tree.xmlFree(c_prop.name)
 697             c_prop.name = c_entry
 698         c_prop = c_prop.next
 699     return 0
 700
 701 cdef class _BaseParser:
 702     cdef ElementClassLookup _class_lookup
 703     cdef _ResolverRegistry _resolvers
 704     cdef _ParserContext _parser_context
 705     cdef _ParserContext _push_parser_context
 706     cdef int _parse_options
 707     cdef bint _for_html
 708     cdef bint _remove_comments
 709     cdef bint _remove_pis
 710     cdef bint _strip_cdata
 711     cdef XMLSchema _schema
 712     cdef object _filename
 713     cdef readonly object target
 714     cdef object _default_encoding
 715
 716     def __init__(self, int parse_options, bint for_html, XMLSchema schema,
 717                  remove_comments, remove_pis, strip_cdata, target,
 718                  filename, encoding):
             # Store the parser configuration.  Only HTMLParser, XMLParser and
             # iterparse instances may be created (TypeError otherwise).
             # Raises LookupError if libxml2 has no handler for 'encoding'.
 719         cdef tree.xmlCharEncodingHandler* enchandler
 721         if not isinstance(self, HTMLParser) and \
 722                 not isinstance(self, XMLParser) and \
 723                 not isinstance(self, iterparse):
 724             raise TypeError, u"This class cannot be instantiated"
 725
 726         self._parse_options = parse_options
 727         self._filename = filename
 728         self.target = target
 729         self._for_html = for_html
 730         self._remove_comments = remove_comments
 731         self._remove_pis = remove_pis
 732         self._strip_cdata = strip_cdata
 733         self._schema = schema
 734
 735         self._resolvers = _ResolverRegistry()
 736
 737         if encoding is None:
 738             self._default_encoding = None
 739         else:
                 # presumably _utf8() returns the name as a UTF-8 byte string
                 # (it is passed to _cstr() below) -- TODO confirm
 740             encoding_utf = _utf8(encoding)
 741             enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding_utf))
 742             if enchandler is NULL:
                     # report the name as passed by the caller, not its
                     # encoded form
 743                 raise LookupError, u"unknown encoding: '%s'" % encoding
                 # the handler was only looked up to validate the name
 744             tree.xmlCharEncCloseFunc(enchandler)
 745             self._default_encoding = encoding_utf
 746
 747     cdef _ParserContext _getParserContext(self):
 748         cdef xmlparser.xmlParserCtxt* pctxt
 749         if self._parser_context is None:
 750             self._parser_context = self._createContext(self.target)
 751             if self._schema is not None:
 752                 self._parser_context._validator = \
 753                     self._schema._newSaxValidator(
 754                         self._parse_options & xmlparser.XML_PARSE_DTDATTR)
 755             pctxt = self._newParserCtxt()
 756             if pctxt is NULL:
 757                 python.PyErr_NoMemory()
 758             _initParserContext(self._parser_context, self._resolvers, pctxt)
 759             if self._remove_comments:
 760                 pctxt.sax.comment = NULL
 761             if self._remove_pis:
 762                 pctxt.sax.processingInstruction = NULL
 763             if self._strip_cdata:
 764                 # hard switch-off for CDATA nodes => makes them plain text
 765                 pctxt.sax.cdataBlock = NULL
 766         return self._parser_context
 767
 768     cdef _ParserContext _getPushParserContext(self):
 769         cdef xmlparser.xmlParserCtxt* pctxt
 770         if self._push_parser_context is None:
 771             self._push_parser_context = self._createContext(self.target)
 772             if self._schema is not None:
 773                 self._push_parser_context._validator = \
 774                     self._schema._newSaxValidator(
 775                         self._parse_options & xmlparser.XML_PARSE_DTDATTR)
 776             pctxt = self._newPushParserCtxt()
 777             if pctxt is NULL:
 778                 python.PyErr_NoMemory()
 779             _initParserContext(
 780                 self._push_parser_context, self._resolvers, pctxt)
 781             if self._remove_comments:
 782                 pctxt.sax.comment = NULL
 783             if self._remove_pis:
 784                 pctxt.sax.processingInstruction = NULL
 785             if self._strip_cdata:
 786                 # hard switch-off for CDATA nodes => makes them plain text
 787                 pctxt.sax.cdataBlock = NULL
 788         return self._push_parser_context
 789
 790     cdef _ParserContext _createContext(self, target):
 791         cdef _TargetParserContext context
 792         if target is None:
 793             return _ParserContext()
 794         context = _TargetParserContext()
 795         context._setTarget(target)
 796         return context
 797
 798     cdef xmlparser.xmlParserCtxt* _newParserCtxt(self):
 799         if self._for_html:
 800             return htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
 801         else:
 802             return xmlparser.xmlNewParserCtxt()
 803
 804     cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self):
 805         cdef xmlparser.xmlParserCtxt* c_ctxt
 806         cdef char* c_filename
 807         if self._filename is not None:
 808             c_filename = _cstr(self._filename)
 809         else:
 810             c_filename = NULL
 811         if self._for_html:
 812             c_ctxt = htmlparser.htmlCreatePushParserCtxt(
 813                 NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
 814             if c_ctxt is not NULL:
 815                 htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
 816         else:
 817             c_ctxt = xmlparser.xmlCreatePushParserCtxt(
 818                 NULL, NULL, NULL, 0, c_filename)
 819             if c_ctxt is not NULL:
 820                 xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
 821         return c_ctxt
 822
 823     property error_log:
 824         u"""The error log of the last parser run.
 825         """
 826         def __get__(self):
 827             cdef _ParserContext context
 828             context = self._getParserContext()
 829             return context._error_log.copy()
 830
 831     property resolvers:
 832         u"The custom resolver registry of this parser."
 833         def __get__(self):
 834             return self._resolvers
 835
 836     property version:
 837         u"The version of the underlying XML parser."
 838         def __get__(self):
 839             return u"libxml2 %d.%d.%d" % LIBXML_VERSION
 840
 841     def setElementClassLookup(self, ElementClassLookup lookup = None):
 842         u":deprecated: use ``parser.set_element_class_lookup(lookup)`` instead."
 843         self.set_element_class_lookup(lookup)
 844
 845     def set_element_class_lookup(self, ElementClassLookup lookup = None):
 846         u"""set_element_class_lookup(self, lookup = None)
 847
 848         Set a lookup scheme for element classes generated from this parser.
 849
 850         Reset it by passing None or nothing.
 851         """
 852         self._class_lookup = lookup
 853
 854     cdef _BaseParser _copy(self):
 855         u"Create a new parser with the same configuration."
 856         cdef _BaseParser parser
 857         parser = self.__class__()
 858         parser._parse_options = self._parse_options
 859         parser._for_html = self._for_html
 860         parser._remove_comments = self._remove_comments
 861         parser._remove_pis = self._remove_pis
 862         parser._strip_cdata = self._strip_cdata
 863         parser._filename = self._filename
 864         parser._resolvers = self._resolvers
 865         parser.target = self.target
 866         parser._class_lookup  = self._class_lookup
 867         parser._default_encoding = self._default_encoding
 868         parser._schema = self._schema
 869         return parser
 870
 871     def copy(self):
 872         u"""copy(self)
 873
 874         Create a new parser with the same configuration.
 875         """
 876         return self._copy()
 877
 878     def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
 879         u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
 880
 881         Creates a new element associated with this parser.
 882         """
 883         return _makeElement(_tag, NULL, None, self, None, None,
 884                             attrib, nsmap, _extra)
 885
 886     # internal parser methods
 887
    cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
        u"""Parse unicode document, share dictionary if possible.

        The raw data buffer of ``utext`` is passed to libxml2 together with
        the ``_UNICODE_ENCODING`` name.  If that name is not available or
        the buffer is larger than INT_MAX bytes, the text is encoded as
        UTF-8 and handed over to ``_parseDoc()`` instead.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef int buffer_len
        cdef char* c_text
        py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
        if py_buffer_len > python.INT_MAX or _UNICODE_ENCODING is NULL:
            # fallback: parse a UTF-8 encoded copy as byte string
            text_utf = python.PyUnicode_AsUTF8String(utext)
            py_buffer_len = python.PyBytes_GET_SIZE(text_utf)
            return self._parseDoc(_cstr(text_utf), py_buffer_len, c_filename)
        # safe to narrow: the size was checked against INT_MAX above
        buffer_len = py_buffer_len

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

            c_text = python.PyUnicode_AS_DATA(utext)
            # the libxml2 calls run without holding the GIL
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, _UNICODE_ENCODING,
                        self._parse_options)
                    if result is not NULL:
                        # discard the document if fixing the names failed
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, _UNICODE_ENCODING,
                        self._parse_options)

            return context._handleParseResultDoc(self, result, None)
        finally:
            # always paired with the prepare() call above
            context.cleanup()
 928
    cdef xmlDoc* _parseDoc(self, char* c_text, Py_ssize_t c_len,
                           char* c_filename) except NULL:
        u"""Parse document, share dictionary if possible.

        ``c_text`` is a buffer of ``c_len`` bytes.  The parser's default
        encoding, if set, is passed to libxml2 as the encoding to use.
        Raises ParserError if ``c_len`` exceeds INT_MAX.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef char* c_encoding
        if c_len > python.INT_MAX:
            raise ParserError, u"string is too long to parse it with libxml2"

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

            # NULL => no encoding override is passed to libxml2
            if self._default_encoding is None:
                c_encoding = NULL
            else:
                c_encoding = _cstr(self._default_encoding)

            # the libxml2 calls run without holding the GIL
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)
                    if result is not NULL:
                        # discard the document if fixing the names failed
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)

            return context._handleParseResultDoc(self, result, None)
        finally:
            # always paired with the prepare() call above
            context.cleanup()
 968
    cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
        # Parse the file named by ``c_filename`` by letting libxml2 read it
        # directly.  The parser's default encoding, if set, is passed to
        # libxml2 as the encoding to use.
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef int orig_options
        cdef char* c_encoding
        result = NULL

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

            # NULL => no encoding override is passed to libxml2
            if self._default_encoding is None:
                c_encoding = NULL
            else:
                c_encoding = _cstr(self._default_encoding)

            # remember the context options, they are restored after the read
            orig_options = pctxt.options
            # the libxml2 calls run without holding the GIL
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadFile(
                        pctxt, c_filename, c_encoding, self._parse_options)
                    if result is not NULL:
                        # discard the document if fixing the names failed
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadFile(
                        pctxt, c_filename, c_encoding, self._parse_options)
            pctxt.options = orig_options # work around libxml2 problem

            return context._handleParseResultDoc(self, result, c_filename)
        finally:
            # always paired with the prepare() call above
            context.cleanup()
1005
1006     cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename) except NULL:
1007         cdef _ParserContext context
1008         cdef _FileReaderContext file_context
1009         cdef xmlDoc* result
1010         cdef xmlparser.xmlParserCtxt* pctxt
1011         cdef char* c_filename
1012         if not filename:
1013             filename = None
1014
1015         context = self._getParserContext()
1016         context.prepare()
1017         try:
1018             pctxt = context._c_ctxt
1019             __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
1020             file_context = _FileReaderContext(
1021                 filelike, context, filename, self._default_encoding)
1022             result = file_context._readDoc(pctxt, self._parse_options)
1023
1024             return context._handleParseResultDoc(
1025                 self, result, filename)
1026         finally:
1027             context.cleanup()
1028
1029 ############################################################
1030 ## ET feed parser
1031 ############################################################
1032
cdef class _FeedParser(_BaseParser):
    # Adds the feed()/close() push parser interface to the base parser.

    # set by feed() when a push parser run starts, reset by close() and
    # by feed() when it reports a parse error
    cdef bint _feed_parser_running

    property feed_error_log:
        u"""The error log of the last (or current) run of the feed parser.

        Note that this is local to the feed parser and thus is
        different from what the ``error_log`` property returns.
        The returned object is a copy of the push parser context's log.
        """
        def __get__(self):
            cdef _ParserContext context
            context = self._getPushParserContext()
            return context._error_log.copy()
1046
    def feed(self, data):
        u"""feed(self, data)

        Feeds data to the parser.  The argument should be an 8-bit string
        buffer containing encoded data, although Unicode is supported as long
        as both string types are not mixed.

        This is the main entry point to the consumer interface of a
        parser.  The parser will parse as much of the XML stream as it
        can on each call.  To finish parsing or to reset the parser,
        call the ``close()`` method.  Both methods may raise
        ParseError if errors occur in the input data.  If an error is
        raised, there is no longer a need to call ``close()``.

        The feed parser interface is independent of the normal parser
        usage.  You can use the same parser as a feed parser and in
        the ``parse()`` function concurrently.

        Raises TypeError if ``data`` is neither a byte string nor a unicode
        string, and ParserError if unicode data is passed but unicode
        parsing is not available.
        """
        cdef _ParserContext context
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef char* c_data
        cdef char* c_encoding
        cdef int buffer_len
        cdef int error
        cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
        # select the data buffer, its byte length and the encoding name
        if python.PyBytes_Check(data):
            if self._default_encoding is None:
                c_encoding = NULL
            else:
                c_encoding = self._default_encoding
            c_data = _cstr(data)
            py_buffer_len = python.PyBytes_GET_SIZE(data)
        elif python.PyUnicode_Check(data):
            if _UNICODE_ENCODING is NULL:
                raise ParserError, \
                    u"Unicode parsing is not supported on this platform"
            c_encoding = _UNICODE_ENCODING
            c_data = python.PyUnicode_AS_DATA(data)
            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data)
        else:
            raise TypeError, u"Parsing requires string data"

        context = self._getPushParserContext()
        pctxt = context._c_ctxt
        error = 0
        if not self._feed_parser_running:
            # first chunk of a new run: prepare the context and reset the
            # push parser with the encoding selected above
            context.prepare()
            self._feed_parser_running = 1
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
            if self._for_html:
                error = _htmlCtxtResetPush(
                    pctxt, NULL, 0, c_encoding, self._parse_options)
            else:
                xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
                error = xmlparser.xmlCtxtResetPush(
                    pctxt, NULL, 0, NULL, c_encoding)

        #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding

        # feed the buffer in pieces of at most INT_MAX bytes; stop at the
        # first error unless the parser runs in recover mode
        while py_buffer_len > 0 and (error == 0 or recover):
            with nogil:
                if py_buffer_len > python.INT_MAX:
                    buffer_len = python.INT_MAX
                else:
                    buffer_len = <int>py_buffer_len
                if self._for_html:
                    error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0)
                else:
                    error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0)
                py_buffer_len -= buffer_len
                c_data += buffer_len

            if error and not pctxt.replaceEntities and not pctxt.validate:
                # in this mode, we ignore errors about undefined entities
                for entry in context._error_log.filter_from_errors():
                    if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                           entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                        break
                else:
                    # only undeclared entity entries were found
                    error = 0

        if not recover and (error or not pctxt.wellFormed):
            # end the run and let the context handle the failed parse
            self._feed_parser_running = 0
            try:
                context._handleParseResult(self, NULL, None)
            finally:
                context.cleanup()
1135
1136     def close(self):
1137         u"""close(self)
1138
1139         Terminates feeding data to this parser.  This tells the parser to
1140         process any remaining data in the feed buffer, and then returns the
1141         root Element of the tree that was parsed.
1142
1143         This method must be called after passing the last chunk of data into
1144         the ``feed()`` method.  It should only be called when using the feed
1145         parser interface, all other usage is undefined.
1146         """
1147         cdef _ParserContext context
1148         cdef xmlparser.xmlParserCtxt* pctxt
1149         cdef xmlDoc* c_doc
1150         cdef _Document doc
1151         if not self._feed_parser_running:
1152             raise XMLSyntaxError(u"no element found",
1153                                  xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0)
1154
1155         context = self._getPushParserContext()
1156         pctxt = context._c_ctxt
1157
1158         self._feed_parser_running = 0
1159         if self._for_html:
1160             htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
1161         else:
1162             xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
1163         try:
1164             result = context._handleParseResult(self, pctxt.myDoc, None)
1165         finally:
1166             context.cleanup()
1167
1168         if isinstance(result, _Document):
1169             return (<_Document>result).getroot()
1170         else:
1171             return result
1172
cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
                             char* c_data, int buffer_len,
                             char* c_encoding, int parse_options) except -1:
    # Reset a parser context for HTML push parsing.
    #
    # Resets the context through xmlCtxtResetPush() using ``c_encoding``,
    # switches it to HTML mode, applies ``parse_options`` and finally feeds
    # ``c_data`` (``buffer_len`` bytes) if any was given.  Returns 0 on
    # success or the error value of the failing libxml2 call.  Raises
    # MemoryError if the space table cannot be allocated.
    cdef int error
    # libxml2 crashes if spaceTab is not initialised
    if _LIBXML_VERSION_INT < 20629 and c_ctxt.spaceTab is NULL:
        c_ctxt.spaceTab = <int*>tree.xmlMalloc(10 * sizeof(int))
        if c_ctxt.spaceTab is NULL:
            # do not announce a capacity for a table that does not exist
            raise MemoryError()
        c_ctxt.spaceMax = 10

    # libxml2 lacks an HTML push parser setup function
    error = xmlparser.xmlCtxtResetPush(c_ctxt, NULL, 0, NULL, c_encoding)
    if error:
        return error

    # fix libxml2 setup for HTML
    c_ctxt.progressive = 1
    c_ctxt.html = 1
    htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)

    if c_data is not NULL and buffer_len > 0:
        return htmlparser.htmlParseChunk(c_ctxt, c_data, buffer_len, 0)
    return 0
1195
1196 ############################################################
1197 ## XML parser
1198 ############################################################
1199
# Option flags that XMLParser.__init__() starts from.  Its ``no_network``,
# ``compact``, ``resolve_entities`` and ``strip_cdata`` arguments switch the
# NONET, COMPACT, NOENT and NOCDATA flags off again when passed as false.
cdef int _XML_DEFAULT_PARSE_OPTIONS
_XML_DEFAULT_PARSE_OPTIONS = (
    xmlparser.XML_PARSE_NOENT   |
    xmlparser.XML_PARSE_NOCDATA |
    xmlparser.XML_PARSE_NONET   |
    xmlparser.XML_PARSE_COMPACT
    )
1207
cdef class XMLParser(_FeedParser):
    u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, XMLSchema schema=None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, target=None, compact=True)

    The XML parser.

    Parsers can be supplied as additional argument to various parse
    functions of the lxml API.  A default parser is always available
    and can be replaced by a call to the global function
    'set_default_parser'.  New parsers can be created at any time
    without a major run-time overhead.

    The keyword arguments in the constructor are mainly based on the
    libxml2 parser configuration.  A DTD will also be loaded if DTD
    validation or attribute default values are requested (unless you
    additionally provide an XMLSchema from which the default
    attributes can be read).

    Available boolean keyword arguments:

    - attribute_defaults - inject default attributes from DTD or XMLSchema
    - dtd_validation     - validate against a DTD referenced by the document
    - load_dtd           - use DTD for parsing
    - no_network         - prevent network access for related files (default: True)
    - ns_clean           - clean up redundant namespace declarations
    - recover            - try hard to parse through broken XML
    - remove_blank_text  - discard blank text nodes
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - resolve_entities   - replace entities by their text value (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads.  While this is
    not harmful, it is more efficient to use separate parsers.  This does not
    apply to the default parser.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, XMLSchema schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 target=None, compact=True):
        cdef int parse_options
        parse_options = _XML_DEFAULT_PARSE_OPTIONS

        # options that are off by default: switch them on when requested
        if load_dtd:
            parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
        if dtd_validation:
            parse_options = parse_options | xmlparser.XML_PARSE_DTDVALID | \
                            xmlparser.XML_PARSE_DTDLOAD
        if attribute_defaults:
            parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR
            if schema is None:
                # without a schema, the defaults must come from the DTD
                parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
        if ns_clean:
            parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN
        if recover:
            parse_options = parse_options | xmlparser.XML_PARSE_RECOVER
        if remove_blank_text:
            parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS
        if huge_tree:
            parse_options = parse_options | xmlparser.XML_PARSE_HUGE

        # options that are on by default: mask them out explicitly rather
        # than toggling them, so that the result does not depend on the
        # flag being part of the defaults
        if not no_network:
            parse_options = parse_options & ~xmlparser.XML_PARSE_NONET
        if not compact:
            parse_options = parse_options & ~xmlparser.XML_PARSE_COMPACT
        if not resolve_entities:
            parse_options = parse_options & ~xmlparser.XML_PARSE_NOENT
        if not strip_cdata:
            parse_options = parse_options & ~xmlparser.XML_PARSE_NOCDATA

        _BaseParser.__init__(self, parse_options, 0, schema,
                             remove_comments, remove_pis, strip_cdata,
                             target, None, encoding)
1289
cdef class ETCompatXMLParser(XMLParser):
    u"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
                 dtd_validation=False, load_dtd=False, no_network=True, \
                 ns_clean=False, recover=False, schema=None, \
                 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
                 remove_comments=True, remove_pis=True, strip_cdata=True, \
                 target=None, compact=True)

    An XMLParser variant whose defaults are compatible with ElementTree.

    It differs from XMLParser only in that ``remove_comments`` and
    ``remove_pis`` are switched on by default, so that comments and
    processing instructions are ignored.  See the XMLParser class for the
    description of the arguments.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=True, remove_pis=True, strip_cdata=True,
                 target=None, compact=True):
        # pass everything through by keyword, in signature order
        XMLParser.__init__(
            self,
            encoding=encoding,
            attribute_defaults=attribute_defaults,
            dtd_validation=dtd_validation,
            load_dtd=load_dtd,
            no_network=no_network,
            ns_clean=ns_clean,
            recover=recover,
            schema=schema,
            huge_tree=huge_tree,
            remove_blank_text=remove_blank_text,
            resolve_entities=resolve_entities,
            remove_comments=remove_comments,
            remove_pis=remove_pis,
            strip_cdata=strip_cdata,
            target=target,
            compact=compact)
1328
# ET 1.2 compatible name
XMLTreeBuilder = ETCompatXMLParser


# XMLParser instance built with default arguments.  It is registered below
# as the initial default parser and is what set_default_parser() falls back
# to when it is called without a parser.
cdef XMLParser __DEFAULT_XML_PARSER
__DEFAULT_XML_PARSER = XMLParser()

__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
1337
def set_default_parser(_BaseParser parser=None):
    u"""set_default_parser(parser=None)

    Set a default parser for the current thread.  The default parser is the
    one that the parse functions of the lxml API use whenever they are
    called without a parser.  Calling this function without a parser (or
    with None) resets the default parser to the original configuration.

    Note that the pre-installed default parser is not thread-safe.  Avoid the
    default parser in multi-threaded environments.  You can create a separate
    parser for each thread explicitly or use a parser pool.
    """
    cdef _BaseParser new_default = parser
    if new_default is None:
        new_default = __DEFAULT_XML_PARSER
    __GLOBAL_PARSER_CONTEXT.setDefaultParser(new_default)
1353
def get_default_parser():
    u"""get_default_parser()

    Return the default parser as provided by the global parser context.
    """
    return __GLOBAL_PARSER_CONTEXT.getDefaultParser()
1357
1358 ############################################################
1359 ## HTML parser
1360 ############################################################
1361
# Option flags that HTMLParser.__init__() starts from.  Its ``recover``,
# ``no_network`` and ``compact`` arguments switch the corresponding flags
# off again when passed as false.
cdef int _HTML_DEFAULT_PARSE_OPTIONS
_HTML_DEFAULT_PARSE_OPTIONS = (
    htmlparser.HTML_PARSE_RECOVER |
    htmlparser.HTML_PARSE_NONET   |
    htmlparser.HTML_PARSE_COMPACT
    )
1368
cdef class HTMLParser(_FeedParser):
    u"""HTMLParser(self, encoding=None, remove_blank_text=False, \
                   remove_comments=False, remove_pis=False, strip_cdata=True, \
                   no_network=True, target=None, XMLSchema schema=None, \
                   recover=True, compact=True)

    The HTML parser.

    This parser allows reading HTML into a normal XML tree.  By
    default, it can read broken (non well-formed) HTML, depending on
    the capabilities of libxml2.  Use the 'recover' option to switch
    this off.

    Available boolean keyword arguments:

    - recover            - try hard to parse through broken HTML (default: True)
    - no_network         - prevent network access for related files (default: True)
    - remove_blank_text  - discard empty text nodes
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)

    Other keyword arguments:

    - encoding - override the document encoding
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads for performance
    reasons.
    """
    def __init__(self, *, encoding=None, remove_blank_text=False,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 no_network=True, target=None, XMLSchema schema=None,
                 recover=True, compact=True):
        cdef int parse_options
        parse_options = _HTML_DEFAULT_PARSE_OPTIONS
        if remove_blank_text:
            parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS

        # options that are on by default: mask them out explicitly rather
        # than toggling them, so that the result does not depend on the
        # flag being part of the defaults
        if not recover:
            parse_options = parse_options & ~htmlparser.HTML_PARSE_RECOVER
        if not no_network:
            parse_options = parse_options & ~htmlparser.HTML_PARSE_NONET
        if not compact:
            parse_options = parse_options & ~htmlparser.HTML_PARSE_COMPACT

        _BaseParser.__init__(self, parse_options, 1, schema,
                             remove_comments, remove_pis, strip_cdata,
                             target, None, encoding)
1419
# HTMLParser instance built with default arguments
cdef HTMLParser __DEFAULT_HTML_PARSER
__DEFAULT_HTML_PARSER = HTMLParser()
1422
1423 ############################################################
1424 ## helper functions for document creation
1425 ############################################################
1426
cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    # Parse an in-memory document with ``parser`` (or the default parser if
    # None).  Unicode text goes through _parseUnicodeDoc(), anything else is
    # treated as a byte string.  Input whose size exceeds INT_MAX is parsed
    # through an in-memory file object instead.
    cdef char* c_filename = NULL
    cdef Py_ssize_t c_len
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    if filename:
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)

    if python.PyUnicode_Check(text):
        c_len = python.PyUnicode_GET_DATA_SIZE(text)
        if c_len > python.INT_MAX:
            return parser._parseDocFromFilelike(StringIO(text), filename)
        return parser._parseUnicodeDoc(text, c_filename)

    c_len = python.PyBytes_GET_SIZE(text)
    if c_len > python.INT_MAX:
        return parser._parseDocFromFilelike(BytesIO(text), filename)
    return parser._parseDoc(_cstr(text), c_len, c_filename)
1451
cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    u"""Parse the file named by ``filename8`` with ``parser``.

    A ``parser`` of None selects the default parser of the global parser
    context.
    """
    cdef _BaseParser active_parser = parser
    if active_parser is None:
        active_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return active_parser._parseDocFromFile(_cstr(filename8))
1456
cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    u"""Parse the file-like object ``source`` with ``parser``.

    A ``parser`` of None selects the default parser of the global parser
    context.  ``filename`` is passed through to the parser unchanged.
    """
    cdef _BaseParser active_parser = parser
    if active_parser is None:
        active_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return active_parser._parseDocFromFilelike(source, filename)
1462
cdef xmlDoc* _newXMLDoc() except NULL:
    u"""Create an empty XML document.

    The encoding is set to "UTF-8" if libxml2 left it unset, and the new
    document is passed to the global parser context's initDocDict().
    """
    cdef xmlDoc* c_doc = tree.xmlNewDoc(NULL)
    if c_doc is NULL:
        python.PyErr_NoMemory()
    if c_doc.encoding is NULL:
        c_doc.encoding = tree.xmlStrdup("UTF-8")
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
1472
cdef xmlDoc* _newHTMLDoc() except NULL:
    u"""Create an empty HTML document.

    The new document is passed to the global parser context's
    initDocDict() before it is returned.
    """
    cdef xmlDoc* c_doc = tree.htmlNewDoc(NULL, NULL)
    if c_doc is NULL:
        python.PyErr_NoMemory()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
1480
cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    u"""Copy ``c_doc`` using xmlCopyDoc().

    With a non-zero ``recursive`` the copy is made with the GIL released;
    otherwise xmlCopyDoc() is called with 0 while holding the GIL.
    """
    cdef xmlDoc* c_copy
    if not recursive:
        c_copy = tree.xmlCopyDoc(c_doc, 0)
    else:
        with nogil:
            c_copy = tree.xmlCopyDoc(c_doc, recursive)
    if c_copy is NULL:
        python.PyErr_NoMemory()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_copy)
    return c_copy
1492
cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    u"""Recursively copy the document and make c_new_root the new root node.

    The document itself is copied non-recursively; c_new_root is then
    deep-copied into the new document (with the GIL released) and set as
    its root element.  The siblings following c_new_root are handed to
    _copyTail() together with the copied node.

    Raises via PyErr_NoMemory() if either copy step fails.
    """
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0) # non recursive
    if result is NULL:
        # same check as in _copyDoc(): never touch a failed copy
        python.PyErr_NoMemory()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive
    if c_node is NULL:
        # the new document is owned only by this function so far,
        # release it before reporting the error to avoid leaking it
        tree.xmlFreeDoc(result)
        python.PyErr_NoMemory()
    tree.xmlDocSetRootElement(result, c_node)
    _copyTail(c_new_root.next, c_node)
    return result
1506
cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    u"""Deep-copy ``c_node`` for use in ``c_doc`` and return the copy.

    c_doc itself is not modified.  The siblings following c_node are
    handed to _copyTail() together with the copied node.
    """
    cdef xmlNode* c_copy = tree.xmlDocCopyNode(c_node, c_doc, 1) # recursive
    if c_copy is NULL:
        python.PyErr_NoMemory()
    _copyTail(c_node.next, c_copy)
    return c_copy
1515
1516
1517 ############################################################
1518 ## API level helper functions for _Document creation
1519 ## (here we convert to UTF-8)
1520 ############################################################
1521
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    u"""Parse ``source`` into a _Document, dispatching on its kind.

    - a string is treated as a filename/URL and parsed from there; a
      given ``base_url`` then replaces the URL stored in the document
    - an object with ``getvalue()`` and ``tell()`` that is positioned at
      offset 0 is parsed from its complete in-memory value
    - any other object with a ``read`` attribute is parsed as a file-like

    Raises TypeError for everything else.
    """
    cdef _Document doc
    if _isString(source):
        # parse the file directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        if base_url is None:
            return doc
        # override the document URL with the requested base URL
        base_url = _encodeFilenameUTF8(base_url)
        if doc._c_doc.URL is not NULL:
            tree.xmlFree(doc._c_doc.URL)
        doc._c_doc.URL = tree.xmlStrdup(_cstr(base_url))
        return doc

    url = _getFilenameForFile(source) if base_url is None else base_url

    # StringIO-like object that has not been read from yet?
    if (hasattr(source, u'getvalue') and hasattr(source, u'tell')
            and source.tell() == 0):
        return _parseMemoryDocument(
            source.getvalue(), _encodeFilenameUTF8(url), parser)

    # Support for file-like objects (urlgrabber.urlopen, ...)
    if not hasattr(source, u'read'):
        raise TypeError(
            u"cannot parse from '%s'" % funicode(python._fqtypename(source)))
    return _parseFilelikeDocument(
        source, _encodeFilenameUTF8(url), parser)
1552
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    u"""Parse the file at ``url`` and wrap the result in a _Document."""
    cdef xmlDoc* c_parsed = _parseDocFromFile(url, parser)
    return _documentFactory(c_parsed, parser)
1557
cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    u"""Parse a unicode or byte string and wrap the result in a _Document.

    Raises ValueError if ``text`` is neither unicode nor bytes, or if it
    is a unicode string that carries an encoding declaration.  A unicode
    ``url`` is encoded to UTF-8 before parsing.
    """
    cdef xmlDoc* c_doc
    is_unicode = python.PyUnicode_Check(text)
    if not is_unicode and not python.PyBytes_Check(text):
        raise ValueError(u"can only parse strings")
    if is_unicode:
        if _hasEncodingDeclaration(text):
            raise ValueError(
                u"Unicode strings with encoding declaration are not supported.")
        if _UNICODE_ENCODING is NULL:
            # libxml2 cannot take native unicode here, fall back to UTF-8
            text = python.PyUnicode_AsUTF8String(text)
    if python.PyUnicode_Check(url):
        url = python.PyUnicode_AsUTF8String(url)
    c_doc = _parseDoc(text, url, parser)
    return _documentFactory(c_doc, parser)
1573
cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    u"""Parse a file-like object and wrap the result in a _Document.

    A unicode ``url`` is encoded to UTF-8 before it is passed on.
    """
    cdef xmlDoc* c_parsed
    url_is_unicode = python.PyUnicode_Check(url)
    if url_is_unicode:
        url = python.PyUnicode_AsUTF8String(url)
    c_parsed = _parseDocFromFilelike(source, url, parser)
    return _documentFactory(c_parsed, parser)