1 # -*- coding: iso-8859-1 -*-
2 """ A SAX2 driver for libxml2, on top of its XmlReader API
5 # put this file (drv_libxml2.py) in PYTHONPATH
7 reader = xml.sax.make_parser(["drv_libxml2"])
8 # ...and the rest is standard python sax.
11 - Lexical handlers are supported, except for start/endEntity
12 (waiting for XmlReader.ResolveEntity) and start/endDTD
13 - Error callbacks are not exactly synchronous: they tend
14 to be invoked before the corresponding content callback,
15 because the underlying reader interface parses
16 data in chunks of 512 bytes
20 - some ErrorHandler events (warning)
21 - some ContentHandler events (setDocumentLocator, skippedEntity)
22 - EntityResolver (using libxml2.?)
23 - DTDHandler (if/when libxml2 exposes such node types)
24 - DeclHandler (if/when libxml2 exposes such node types)
25 - property_xml_string?
26 - feature_string_interning?
28 - additional performance tuning:
29 - one might cache callbacks to avoid some name lookups
30 - one might implement a smarter way to pass attributes to startElement
31 (some kind of lazy evaluation?)
32 - there might be room for improvement in start/endPrefixMapping
# Module author. The name contains a non-ASCII character, which is why the
# file declares an explicit source encoding on its first line.
37 __author__ = "Stéphane Bidoul <sbi@skynet.be>"
# On Python 2 a plain string literal is a byte string; decode it so that
# __author__ is a unicode object there. unicode_escape_decode() returns a
# (text, length) tuple, hence the [0].
43 if sys.version_info[0] < 3:
44 __author__ = codecs.unicode_escape_decode(__author__)[0]
# String types that parse() treats as a file name (see the isinstance() test
# there). `unicode` only exists on Python 2, so this assignment is presumably
# Python-2-only; the branch it belongs to is not visible in this view.
46 StringTypes = (str, unicode)
50 from xml.sax._exceptions import *
51 from xml.sax import xmlreader, saxutils
52 from xml.sax.handler import \
54 feature_namespace_prefixes, \
55 feature_string_interning, \
57 feature_external_ges, \
58 feature_external_pes, \
59 property_lexical_handler, \
60 property_declaration_handler, \
64 # libxml2 returns strings as UTF8
# codecs.lookup() returns (encode, decode, reader, writer); index 1 is the
# decode function.
65 _decoder = codecs.lookup("utf8")[1]
# Raised when libxml2 cannot be imported (per the message text). The
# surrounding try/except is not visible in this view. sys.exc_info()[1] is the
# exception currently being handled, formatted into the message.
75 raise SAXReaderNotAvailable("libxml2 not available: " \
76 "import error was: %s" % sys.exc_info()[1])
# Adapter exposing a libxml2 reader locator through the xml.sax Locator
# interface. The wrapped object is stored at construction time and the
# accessors below delegate to it where a delegation is visible.
78 class Locator(xmlreader.Locator):
79 """SAX Locator adapter for libxml2.xmlTextReaderLocator"""
# locator: the libxml2 locator object to wrap; it is stored as-is.
81 def __init__(self,locator):
82 self.__locator = locator
# NOTE(review): the return statement of getColumnNumber is not visible in
# this view; confirm what value is reported for the column.
84 def getColumnNumber(self):
85 "Return the column number where the current event ends."
88 def getLineNumber(self):
89 "Return the line number where the current event ends."
# Delegates to the wrapped locator's LineNumber().
90 return self.__locator.LineNumber()
# NOTE(review): the return statement of getPublicId is not visible in this
# view; confirm what value is reported for the public identifier.
92 def getPublicId(self):
93 "Return the public identifier for the current event."
96 def getSystemId(self):
97 "Return the system identifier for the current event."
# The system identifier reported is the wrapped locator's BaseURI().
98 return self.__locator.BaseURI()
# SAX2 XMLReader implementation: parse() drives a libxml2 text reader and
# forwards what it reads to the registered SAX handlers.
100 class LibXml2Reader(xmlreader.XMLReader):
# NOTE(review): the `def __init__` line and some initialisations are not
# visible in this view; the statements below are the visible part of the
# constructor.
# Initialise the inherited XMLReader state first.
103 xmlreader.XMLReader.__init__(self)
111 # additional handlers
# Lexical handler installed through property_lexical_handler; parse() only
# emits lexical events (CDATA, comments) when this is not None.
112 self.__lex_handler = None
# Value returned by getProperty() for property_declaration_handler.
113 self.__decl_handler = None
# The assignment that follows this comment is not visible here; presumably it
# initialises self.__errors, which _errorHandler and _reportErrors use.
114 # error messages accumulator
# Error callback registered on the libxml2 reader by parse() via
# reader.SetErrorHandler(self._errorHandler, None).
# Each reported problem is queued on self.__errors as a
# (severity, SAXParseException) pair; _reportErrors() later dispatches them.
#
# arg:      the second value given to SetErrorHandler (None in parse()).
# msg:      message text, used as the SAXParseException message.
# severity: compared against libxml2.PARSER_SEVERITY_* in _reportErrors().
# locator:  libxml2 locator for the error; its use is on a line that is not
#           visible in this view (presumably wrapped in a Locator - confirm).
117 def _errorHandler(self,arg,msg,severity,locator):
# The accumulator is created on demand; the statement that creates it is not
# visible in this view.
118 if self.__errors is None:
120 self.__errors.append((severity,
121 SAXParseException(msg,None,
# Dispatch every queued (severity, exception) pair to the SAX error handler.
#
# fatal: truthy when the caller is about to stop parsing; in that case the
#        last queued exception is reported through fatalError().
#
# NOTE(review): the `else:` lines separating the branches below, and whatever
# follows the loop (presumably resetting self.__errors), are not visible in
# this view.
124 def _reportErrors(self,fatal):
125 for severity,exception in self.__errors:
# Both warning severities go to ErrorHandler.warning().
126 if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
127 libxml2.PARSER_SEVERITY_WARNING):
128 self._err_handler.warning(exception)
130 # when fatal is set, the parse will stop;
131 # we consider that the last error reported
# Identity test against the exception stored in the last queued pair.
133 if fatal and exception is self.__errors[-1][1]:
134 self._err_handler.fatalError(exception)
# Remaining non-warning entries are reported as recoverable errors.
136 self._err_handler.error(exception)
# Parse `source` with a libxml2 text reader and replay what it reads as SAX
# events on the registered handlers.
#
# source: a string (handed to libxml2 as a file name) or any object accepted
#         by saxutils.prepare_input_source().
# Raises SAXException for a node type this loop does not handle (see the
# "Unexpected node type" raise near the end). Parse problems are delivered to
# the error handler through _reportErrors().
#
# NOTE(review): several lines of this method (branch headers, the read loop
# header, some continuation lines) are not visible in this view, so the
# comments below describe only the statements that are shown.
139 def parse(self, source):
142 # prepare source and create reader
# A string is treated as a file name; anything else is normalised to an
# InputSource and read through its byte stream.
143 if isinstance(source, StringTypes):
144 reader = libxml2.newTextReaderFilename(source)
146 source = saxutils.prepare_input_source(source)
# NOTE: `input` shadows the builtin of the same name inside this method.
147 input = libxml2.inputBuffer(source.getByteStream())
148 reader = input.newTextReader(source.getSystemId())
# libxml2 reports problems through this callback; they are queued and later
# dispatched by _reportErrors().
149 reader.SetErrorHandler(self._errorHandler,None)
# Parser properties. The condition choosing between the first group and the
# lone PARSER_LOADDTD=0 call is not visible in this view (presumably
# self.__extparams - confirm).
152 reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
153 reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
154 reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
# Validation follows the feature_validation setting stored by setFeature().
155 reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
157 reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
158 # we reuse attribute maps (for a slight performance gain)
160 attributesNSImpl = xmlreader.AttributesNSImpl({},{})
162 attributesImpl = xmlreader.AttributesImpl({})
163 # prefixes to pop (for endPrefixMapping)
166 self._cont_handler.startDocument()
# Queued errors are flushed at three points around the read step (the read
# call and its result tests are not visible in this view). Only the last
# flush passes fatal=1.
171 if not self.__errors is None:
172 self._reportErrors(0)
174 if not self.__errors is None:
175 self._reportErrors(0)
178 if not self.__errors is None:
179 self._reportErrors(1)
# Generic fatal error, presumably used when the read failed without any
# queued details (the separating `else:` is not visible in this view).
181 self._err_handler.fatalError(\
182 SAXException("Read failed (no details available)"))
183 break # fatal parse error
# Dispatch on the reader's node type; most of the `elif nodeType == ...`
# headers are not visible in this view.
185 nodeType = reader.NodeType()
# --- Element start, namespace-aware variant ---
# Names are (namespace URI, local name) tuples plus the qualified name.
189 eltName = (_d(reader.NamespaceUri()),\
190 _d(reader.LocalName()))
191 eltQName = _d(reader.Name())
# Swap fresh dicts into the reused AttributesNSImpl instead of building a
# new attributes object per element.
192 attributesNSImpl._attrs = attrs = {}
193 attributesNSImpl._qnames = qnames = {}
195 while reader.MoveToNextAttribute():
196 qname = _d(reader.Name())
197 value = _d(reader.Value())
# Attributes whose name starts with "xmlns" become startPrefixMapping events
# instead of attributes; qname[6:] drops a leading "xmlns:". Some lines of
# this branch are not visible in this view.
198 if qname.startswith("xmlns"):
200 newPrefix = qname[6:]
203 newPrefixes.append(newPrefix)
204 self._cont_handler.startPrefixMapping(\
207 continue # don't report xmlns attribute
208 attName = (_d(reader.NamespaceUri()),
209 _d(reader.LocalName()))
210 qnames[attName] = qname
211 attrs[attName] = value
# Reposition the reader on the element after walking its attributes.
212 reader.MoveToElement()
213 self._cont_handler.startElementNS( \
214 eltName,eltQName,attributesNSImpl)
# For an empty element the end events are emitted right away, presumably
# because the reader yields no separate end node for it - confirm.
215 if reader.IsEmptyElement():
216 self._cont_handler.endElementNS(eltName,eltQName)
217 for newPrefix in newPrefixes:
218 self._cont_handler.endPrefixMapping(newPrefix)
# Otherwise remember this element's prefixes so that the matching end element
# can pop them and end the mappings.
220 prefixes.append(newPrefixes)
# --- Element start, non-namespace variant: plain qualified names ---
222 eltName = _d(reader.Name())
223 attributesImpl._attrs = attrs = {}
224 while reader.MoveToNextAttribute():
225 attName = _d(reader.Name())
226 attrs[attName] = _d(reader.Value())
227 reader.MoveToElement()
228 self._cont_handler.startElement( \
229 eltName,attributesImpl)
230 if reader.IsEmptyElement():
231 self._cont_handler.endElement(eltName)
# --- Element end: namespace-aware variant first, then the plain one ---
235 self._cont_handler.endElementNS( \
236 (_d(reader.NamespaceUri()),_d(reader.LocalName())),
# End the prefix mappings recorded when the element was opened.
238 for prefix in prefixes.pop():
239 self._cont_handler.endPrefixMapping(prefix)
241 self._cont_handler.endElement(_d(reader.Name()))
# --- Character data, reported through characters() ---
244 self._cont_handler.characters(_d(reader.Value()))
# --- Whitespace reported as ignorable ---
247 self._cont_handler.ignorableWhitespace(_d(reader.Value()))
248 # SignificantWhitespace
250 self._cont_handler.characters(_d(reader.Value()))
# --- CDATA: content goes to characters(), bracketed by startCDATA/endCDATA
# when a lexical handler is installed ---
253 if not self.__lex_handler is None:
254 self.__lex_handler.startCDATA()
255 self._cont_handler.characters(_d(reader.Value()))
256 if not self.__lex_handler is None:
257 self.__lex_handler.endCDATA()
# --- Entity reference start ---
# NOTE(review): the guard tests self.__lex_handler but the call goes to
# self.startEntity, and no startEntity method is defined on this class in the
# visible code. self.__lex_handler.startEntity was probably intended - confirm.
260 if not self.__lex_handler is None:
261 self.startEntity(_d(reader.Name()))
262 reader.ResolveEntity()
# --- Entity end ---
# NOTE(review): same concern as above for self.endEntity versus
# self.__lex_handler.endEntity.
265 if not self.__lex_handler is None:
266 self.endEntity(_d(reader.Name()))
267 # ProcessingInstruction
269 self._cont_handler.processingInstruction( \
270 _d(reader.Name()),_d(reader.Value()))
# --- Comment: forwarded only when a lexical handler is installed ---
273 if not self.__lex_handler is None:
274 self.__lex_handler.comment(_d(reader.Value()))
# --- Document type: startDTD/endDTD are not emitted yet ---
277 #if not self.__lex_handler is None:
278 # self.__lex_handler.startDTD()
279 pass # TODO (how to detect endDTD? on first non-dtd event?)
285 pass # TODO (entity decl)
289 # Attribute (never in this loop)
292 # Document (not exposed)
295 # DocumentFragment (never returned by XmlReader)
296 #elif nodeType == 11:
# Any node type without a branch above is reported as a SAXException.
303 raise SAXException("Unexpected node type %d" % nodeType)
# endDocument() follows the node loop; whether it is skipped after a fatal
# error cannot be told from this view.
305 self._cont_handler.endDocument()
def setDTDHandler(self, handler):
    """Refuse to register a DTD handler.

    DTD handlers are not supported by this reader yet, so the call always
    raises SAXNotSupportedException and `handler` is never stored.
    """
    # TODO: once DTD handlers are supported this override can be dropped,
    # because the inherited implementation already does what is needed.
    message = "DTDHandler not supported"
    raise SAXNotSupportedException(message)
def setEntityResolver(self, resolver):
    """Refuse to register an entity resolver.

    Entity resolvers are not supported by this reader yet, so the call
    always raises SAXNotSupportedException and `resolver` is never stored.
    """
    # TODO: once entity resolvers are supported this override can be dropped,
    # because the inherited implementation already does what is needed.
    message = "EntityResolver not supported"
    raise SAXNotSupportedException(message)
# Return the current state of the SAX feature called `name`.
# A name matching none of the tests below ends in the
# SAXNotRecognizedException at the bottom (its `else:` line and the
# continuation of the raise are not visible in this view).
318 def getFeature(self, name):
# The values returned for the two namespace features are on lines that are
# not visible in this view.
319 if name == feature_namespaces:
321 elif name == feature_namespace_prefixes:
323 elif name == feature_validation:
324 return self.__validate
# External general entities are always reported as enabled.
325 elif name == feature_external_ges:
326 return 1 # TODO (does that relate to PARSER_LOADDTD)?
327 elif name == feature_external_pes:
328 return self.__extparams
330 raise SAXNotRecognizedException("Feature '%s' not recognized" % \
# Set the SAX feature called `name` to `state`.
# Raises SAXNotSupportedException while a parse is in progress (per the
# message text) and for feature_external_ges, and SAXNotRecognizedException
# for a name matching none of the tests below.
#
# NOTE(review): the guard in front of the first raise, the statements for the
# two namespace features, the final `else:` and the continuation lines of the
# raises are not visible in this view.
333 def setFeature(self, name, state):
335 raise SAXNotSupportedException("Cannot set feature %s " \
336 "while parsing" % name)
337 if name == feature_namespaces:
339 elif name == feature_namespace_prefixes:
# Stored here and handed to libxml2.PARSER_VALIDATE by parse().
341 elif name == feature_validation:
342 self.__validate = state
343 elif name == feature_external_ges:
345 # TODO (does that relate to PARSER_LOADDTD)?
346 raise SAXNotSupportedException("Feature '%s' not supported" % \
# Stored here and returned by getFeature() for the same name.
348 elif name == feature_external_pes:
349 self.__extparams = state
351 raise SAXNotRecognizedException("Feature '%s' not recognized" % \
# Return the value of the SAX property called `name`: the lexical handler or
# the declaration handler stored on this reader.
# Any other name ends in SAXNotRecognizedException (the `else:` line and the
# continuation of the raise are not visible in this view).
354 def getProperty(self, name):
355 if name == property_lexical_handler:
356 return self.__lex_handler
357 elif name == property_declaration_handler:
358 return self.__decl_handler
360 raise SAXNotRecognizedException("Property '%s' not recognized" % \
# Set the SAX property called `name` to `value`.
# Only the lexical handler can be installed. The declaration handler is
# rejected with SAXNotSupportedException, and any other name ends in
# SAXNotRecognizedException (the `else:` line and the continuation lines of
# the raises are not visible in this view).
363 def setProperty(self, name, value):
# parse() consults this handler for CDATA and comment events.
364 if name == property_lexical_handler:
365 self.__lex_handler = value
366 elif name == property_declaration_handler:
367 # TODO: remove if/when libxml2 supports dtd events
368 raise SAXNotSupportedException("Property '%s' not supported" % \
# NOTE(review): this assignment appears to follow the raise in the same
# branch, which would make it unreachable until the raise is removed (see the
# TODO above) - confirm against the original indentation.
370 self.__decl_handler = value
372 raise SAXNotRecognizedException("Property '%s' not recognized" % \
# Factory body: hands back a fresh LibXml2Reader instance.
# NOTE(review): the `def` line is not visible in this view; presumably this is
# the module-level create_parser() hook that xml.sax.make_parser() calls on a
# driver module - confirm.
376 return LibXml2Reader()