1 # -*- coding: iso-8859-1 -*-
2 """ A SAX2 driver for libxml2, on top of its XmlReader API
5 # put this file (drv_libxml2.py) in PYTHONPATH
7 reader = xml.sax.make_parser(["drv_libxml2"])
8 # ...and the rest is standard python sax.
11 - Lexical handlers are supported, except for start/endEntity
12 (waiting for XmlReader.ResolveEntity) and start/endDTD
13 - Error callbacks are not exactly synchronous, they tend
14 to be invoked before the corresponding content callback,
15 because the underlying reader interface parses
16 data by chunks of 512 bytes
20 - some ErrorHandler events (warning)
21 - some ContentHandler events (setDocumentLocator, skippedEntity)
22 - EntityResolver (using libxml2.?)
23 - DTDHandler (if/when libxml2 exposes such node types)
24 - DeclHandler (if/when libxml2 exposes such node types)
25 - property_xml_string?
26 - feature_string_interning?
28 - additional performance tuning:
29 - one might cache callbacks to avoid some name lookups
30 - one might implement a smarter way to pass attributes to startElement
31 (some kind of lazy evaluation?)
32 - there might be room for improvement in start/endPrefixMapping
# Module author string; the non-ASCII character relies on the coding
# declaration on the first line of the file.
37 __author__ = "Stéphane Bidoul <sbi@skynet.be>"
# Python 2 branch: the names below are only bound when the interpreter's
# major version is lower than 3.
43 if sys.version_info[0] < 3:
# Rebind __author__ to the first item returned by
# codecs.unicode_escape_decode(), i.e. the decoded text.
44 __author__ = codecs.unicode_escape_decode(__author__)[0]
# Types accepted as "a file name" by LibXml2Reader.parse(), which tests
# isinstance(source, StringTypes).
46 StringTypes = (str, unicode)
47 # libxml2 returns strings as UTF8
# Item 1 of the codecs.lookup() result is the decoding function.
48 _decoder = codecs.lookup("utf8")[1]
# NOTE(review): the definitions of the _d() helper used throughout parse()
# are not visible in this extract; presumably one variant decodes via
# _decoder and the other returns its argument unchanged -- confirm.
56 # s is Unicode `str` already
60 from xml.sax._exceptions import *
61 from xml.sax import xmlreader, saxutils
62 from xml.sax.handler import \
64 feature_namespace_prefixes, \
65 feature_string_interning, \
67 feature_external_ges, \
68 feature_external_pes, \
69 property_lexical_handler, \
70 property_declaration_handler, \
77 raise SAXReaderNotAvailable("libxml2 not available: " \
78 "import error was: %s" % sys.exc_info()[1])
class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator.

    Only the line number and the system identifier are read from the
    wrapped locator object; the column number and the public identifier
    are reported as unavailable.
    """

    def __init__(self, locator):
        """Wrap *locator*, the object libxml2 passes to the error callback."""
        self.__locator = locator

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        Always -1, the SAX value for "not available".
        """
        return -1

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        return self.__locator.LineNumber()

    def getPublicId(self):
        """Return the public identifier for the current event.

        Always None: no public identifier is read from the wrapped locator.
        """
        return None

    def getSystemId(self):
        """Return the system identifier for the current event.

        This is the base URI reported by the wrapped locator.
        """
        return self.__locator.BaseURI()
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls events from a libxml2 text reader.

    parse() drives the libxml2 reader node by node and translates each
    node into the matching ContentHandler / LexicalHandler callback.
    Errors reported by libxml2 are accumulated by _errorHandler() and
    forwarded to the SAX ErrorHandler after each Read() call.

    DTDHandler, EntityResolver and the declaration handler property are
    not supported and raise SAXNotSupportedException when set.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag (features cannot be changed while it is set)
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None when empty, otherwise a list of
        # (severity, SAXParseException) tuples
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """libxml2 error callback: queue the error for _reportErrors().

        arg is the user data given to SetErrorHandler (unused here).
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward the queued errors to the ErrorHandler and clear the queue.

        Warnings go to warning(); everything else goes to error(), except
        that when *fatal* is true the last queued error is reported
        through fatalError().
        """
        for severity, exception in self.__errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            # when fatal is set, the parse will stop;
            # we consider that the last error reported
            # is the fatal one.
            elif fatal and exception is self.__errors[-1][1]:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)
        # empty the accumulator so the same errors are not reported again
        self.__errors = None

    def parse(self, source):
        """Parse *source* and emit SAX events to the registered handlers.

        source is either a file name / URL string, or anything accepted by
        xml.sax.saxutils.prepare_input_source().

        Raises SAXException when the reader returns an unexpected node
        type; parse errors are delivered through the ErrorHandler.
        """
        self.__parsing = 1
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping): one list per open
            # element, pushed on start and popped on end
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while True:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break  # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break  # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # only "xmlns" and "xmlns:prefix" declare a
                            # namespace; other names that merely begin
                            # with "xmlns" are ordinary attributes
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue  # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity events belong to the lexical handler; the
                    # reader itself has no startEntity/endEntity methods
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says the driver is
                    # still waiting for XmlReader.ResolveEntity -- confirm
                    # the binding in use provides it.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass  # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass  # TODO
                # Entity
                elif nodeType == 6:
                    pass  # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass  # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            # endDocument is only reported when the input was fully read
            if r == 0:
                self._cont_handler.endDocument()
            reader.Close()
        finally:
            self.__parsing = 0

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the feature called *name*.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set the feature called *name* to *state*.

        Raises SAXNotSupportedException while a parse is in progress or
        when external general entities are switched off, and
        SAXNotRecognizedException for an unknown feature name.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            # getFeature() always reports this feature as on, so only
            # switching it off has to be rejected
            if not state:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of the property called *name*.

        Raises SAXNotRecognizedException for an unknown property name.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set the property called *name* to *value*.

        Only the lexical handler can be set; the declaration handler
        raises SAXNotSupportedException and any other name raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: remove if/when libxml2 supports dtd events, and store
            # the handler instead:  self.__decl_handler = value
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
def create_parser():
    """Return a new LibXml2Reader.

    This is the factory function that xml.sax.make_parser() calls on a
    driver module named in its parser list.
    """
    return LibXml2Reader()