71b1c67dba9933355ad6d6ead5b7d929cfc38f49
[platform/upstream/libxml2.git] / python / drv_libxml2.py
1 # -*- coding: iso-8859-1 -*-
2 """ A SAX2 driver for libxml2, on top of its XmlReader API
3
4 USAGE
5     # put this file (drv_libxml2.py) in PYTHONPATH
6     import xml.sax
7     reader = xml.sax.make_parser(["drv_libxml2"])
8     # ...and the rest is standard python sax.
9
10 CAVEATS
11     - Lexical handlers are supported, except for start/endEntity
12       (waiting for XmlReader.ResolveEntity) and start/endDTD
13     - Error callbacks are not exactly synchronous, they tend
14       to be invoked before the corresponding content callback,
15       because the underlying reader interface parses
16       data by chunks of 512 bytes
17     
18 TODO
19     - search for TODO
20     - some ErrorHandler events (warning)
21     - some ContentHandler events (setDocumentLocator, skippedEntity)
22     - EntityResolver (using libxml2.?)
23     - DTDHandler (if/when libxml2 exposes such node types)
24     - DeclHandler (if/when libxml2 exposes such node types)
25     - property_xml_string?
26     - feature_string_interning?
27     - Incremental parser
28     - additional performance tuning:
29       - one might cache callbacks to avoid some name lookups
30       - one might implement a smarter way to pass attributes to startElement
31         (some kind of lazy evaluation?)
32       - there might be room for improvement in start/endPrefixMapping
33       - other?
34
35 """
36
37 __author__  = "Stéphane Bidoul <sbi@skynet.be>"
38 __version__ = "0.3"
39
40 import sys
41 import codecs
42
if sys.version_info[0] >= 3:
    # Python 3: only one text type exists and, as far as this driver is
    # concerned, s is Unicode `str` already, so decoding is a no-op.
    StringTypes = str

    def _d(s):
        return s
else:
    # Python 2: turn the escaped author byte string into a unicode object.
    __author__  = codecs.unicode_escape_decode(__author__)[0]

    # Both byte strings and unicode strings count as "a string" here.
    StringTypes = (str, unicode)
    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        # None passes through untouched; anything else is decoded.
        if s is None:
            return s
        return _decoder(s)[0]
59
60 from xml.sax._exceptions import *
61 from xml.sax import xmlreader, saxutils
62 from xml.sax.handler import \
63      feature_namespaces, \
64      feature_namespace_prefixes, \
65      feature_string_interning, \
66      feature_validation, \
67      feature_external_ges, \
68      feature_external_pes, \
69      property_lexical_handler, \
70      property_declaration_handler, \
71      property_dom_node, \
72      property_xml_string
73
74 try:
75     import libxml2
76 except ImportError:
77     raise SAXReaderNotAvailable("libxml2 not available: " \
78                                 "import error was: %s" % sys.exc_info()[1])
79
class Locator(xmlreader.Locator):
    """Expose a libxml2 xmlTextReaderLocator through the SAX Locator API.

    Only the line number and the system identifier are taken from the
    wrapped object; the column number and the public identifier are
    reported with fixed values (-1 and None respectively).
    """

    def __init__(self, locator):
        # Wrapped object; must provide LineNumber() and BaseURI().
        self.__locator = locator

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        Always -1: no column is read from the wrapped locator.
        """
        return -1

    def getSystemId(self):
        """Return the system identifier for the current event."""
        wrapped = self.__locator
        return wrapped.BaseURI()

    def getPublicId(self):
        """Return the public identifier for the current event.

        Always None: no public identifier is read from the wrapped locator.
        """
        return None
101
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls events from a libxml2 text reader.

    parse() drives the reader node by node and translates every node into
    the matching ContentHandler / lexical handler callback.  Errors raised
    by libxml2 are queued by _errorHandler() and delivered to the SAX
    ErrorHandler after each Read() call.

    Supported features: namespaces, namespace_prefixes, validation,
    external_ges (always on) and external_pes.  The only supported
    property is the lexical handler.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator (None when nothing is pending)
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Error callback registered on the reader: queue the error.

        The error is stored with its severity and reported later by
        _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Deliver all queued errors to the SAX error handler.

        Warnings go to warning(); everything else goes to error(), except
        that when `fatal` is true the last queued error is reported via
        fatalError() (the parse stops, so the last one is considered the
        fatal one).
        """
        # Detach the queue *before* dispatching: error handlers commonly
        # raise, and a queue left in place would be replayed by the next
        # parse() on this reader.
        errors, self.__errors = self.__errors, None
        if not errors:
            return
        lastException = errors[-1][1]
        warningSeverities = (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                             libxml2.PARSER_SEVERITY_WARNING)
        for severity, exception in errors:
            if severity in warningSeverities:
                self._err_handler.warning(exception)
            elif fatal and exception is lastException:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    @staticmethod
    def _xmlnsPrefix(qname):
        """Classify an attribute qname as a namespace declaration.

        Returns (True, prefix) for "xmlns:prefix", (True, None) for the
        default declaration "xmlns", and (False, None) for any other
        attribute -- including names that merely start with the letters
        "xmlns", such as "xmlnsfoo".
        """
        if qname == "xmlns":
            return (True, None)
        if qname.startswith("xmlns:"):
            return (True, qname[6:])
        return (False, None)

    def _lexicalEntityEvent(self, event, name):
        """Forward startEntity/endEntity to the lexical handler.

        Nothing happens when no lexical handler is set or when the
        handler does not implement the requested callback.
        """
        if self.__lex_handler is None:
            return
        callback = getattr(self.__lex_handler, event, None)
        if callback is not None:
            callback(name)

    def _startElementNS(self, reader, attributes, prefixes):
        """Emit namespace-aware events for the element under the reader.

        `attributes` is the reused AttributesNSImpl instance; `prefixes`
        is the stack of prefix lists to unmap at the matching end tags.
        """
        eltName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
        eltQName = _d(reader.Name())
        attributes._attrs = attrs = {}
        attributes._qnames = qnames = {}
        newPrefixes = []
        # compare with 1 so that an error return (-1) also ends the loop
        while reader.MoveToNextAttribute() == 1:
            qname = _d(reader.Name())
            value = _d(reader.Value())
            isDeclaration, newPrefix = self._xmlnsPrefix(qname)
            if isDeclaration:
                newPrefixes.append(newPrefix)
                self._cont_handler.startPrefixMapping(newPrefix, value)
                if not self.__nspfx:
                    continue # don't report xmlns attribute
            attName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
            qnames[attName] = qname
            attrs[attName] = value
        reader.MoveToElement()
        self._cont_handler.startElementNS(eltName, eltQName, attributes)
        if reader.IsEmptyElement():
            # no EndElement node will follow: close everything right away
            self._cont_handler.endElementNS(eltName, eltQName)
            for newPrefix in newPrefixes:
                self._cont_handler.endPrefixMapping(newPrefix)
        else:
            prefixes.append(newPrefixes)

    def _startElement(self, reader, attributes):
        """Emit non-namespace events for the element under the reader.

        `attributes` is the reused AttributesImpl instance.
        """
        eltName = _d(reader.Name())
        attributes._attrs = attrs = {}
        # compare with 1 so that an error return (-1) also ends the loop
        while reader.MoveToNextAttribute() == 1:
            attName = _d(reader.Name())
            attrs[attName] = _d(reader.Value())
        reader.MoveToElement()
        self._cont_handler.startElement(eltName, attributes)
        if reader.IsEmptyElement():
            # no EndElement node will follow
            self._cont_handler.endElement(eltName)

    def parse(self, source):
        """Parse `source` and fire SAX events on the registered handlers.

        `source` is either a string (handed to libxml2 as a file name/URI)
        or anything accepted by saxutils.prepare_input_source().
        endDocument() is only sent when the input was read to its end.
        Exceptions raised by handlers propagate to the caller; the reader
        is closed and the parsing flag cleared in every case.
        """
        self.__parsing = 1
        # forget errors left behind by a previous, aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                # kept in a local so the buffer lives as long as the parse
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributes = xmlreader.AttributesNSImpl({}, {})
            else:
                attributes = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1 or r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    if r == 0:
                        break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        self._startElementNS(reader, attributes, prefixes)
                    else:
                        self._startElement(reader, attributes)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    self._lexicalEntityEvent("startEntity", _d(reader.Name()))
                    # NOTE(review): the module docstring lists
                    # start/endEntity as unsupported while "waiting for
                    # XmlReader.ResolveEntity" -- confirm that the libxml2
                    # binding in use actually provides this method.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    self._lexicalEntityEvent("endEntity", _d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Not handled on purpose:
                #   2  Attribute (never in this loop)
                #   9  Document (not exposed)
                #   11 DocumentFragment (never returned by XmlReader)
                #   0  None
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            # Close the reader even when a handler raised, and always
            # clear the parsing flag so features can be changed again.
            try:
                if reader is not None:
                    reader.Close()
            finally:
                self.__parsing = 0

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the SAX feature `name`.

        Raises SAXNotRecognizedException for unknown features.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set the SAX feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is running or when
        trying to switch external_ges off, and SAXNotRecognizedException
        for unknown features.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of the SAX property `name`.

        Raises SAXNotRecognizedException for unknown properties.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set the SAX property `name` to `value`.

        Only the lexical handler can be set.  The declaration handler is
        recognized but rejected with SAXNotSupportedException; any other
        name raises SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler in self.__decl_handler instead
            # if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
376
def create_parser():
    """Build and return a fresh LibXml2Reader.

    This is the module-level factory looked up by xml.sax.make_parser()
    when this driver is requested by module name.
    """
    parser = LibXml2Reader()
    return parser
379