Imported Upstream version 2.9.2
[platform/upstream/libxml2.git] / python / drv_libxml2.py
1 # -*- coding: iso-8859-1 -*-
2 """ A SAX2 driver for libxml2, on top of its XmlReader API
3
4 USAGE
5     # put this file (drv_libxml2.py) in PYTHONPATH
6     import xml.sax
7     reader = xml.sax.make_parser(["drv_libxml2"])
8     # ...and the rest is standard python sax.
9
10 CAVEATS
11     - Lexical handlers are supported, except for start/endEntity
12       (waiting for XmlReader.ResolveEntity) and start/endDTD
13     - Error callbacks are not exactly synchronous, they tend
14       to be invoked before the corresponding content callback,
15       because the underlying reader interface parses
16       data by chunks of 512 bytes
17     
18 TODO
19     - search for TODO
20     - some ErrorHandler events (warning)
21     - some ContentHandler events (setDocumentLocator, skippedEntity)
22     - EntityResolver (using libxml2.?)
23     - DTDHandler (if/when libxml2 exposes such node types)
24     - DeclHandler (if/when libxml2 exposes such node types)
25     - property_xml_string?
26     - feature_string_interning?
27     - Incremental parser
28     - additional performance tuning:
29       - one might cache callbacks to avoid some name lookups
30       - one might implement a smarter way to pass attributes to startElement
31         (some kind of lazy evaluation?)
32       - there might be room for improvement in start/endPrefixMapping
33       - other?
34
35 """
36
37 __author__  = "Stéphane Bidoul <sbi@skynet.be>"
38 __version__ = "0.3"
39
40 import sys
41 import codecs
42
# StringTypes is what parse() accepts as a "filename-like" source:
# Python 3 has a single text type, Python 2 has both str and unicode.
if sys.version_info[0] >= 3:
    StringTypes = str
else:
    # Python 2 only: turn the byte-string author name into a unicode object.
    __author__  = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)
49
50 from xml.sax._exceptions import *
51 from xml.sax import xmlreader, saxutils
52 from xml.sax.handler import \
53      feature_namespaces, \
54      feature_namespace_prefixes, \
55      feature_string_interning, \
56      feature_validation, \
57      feature_external_ges, \
58      feature_external_pes, \
59      property_lexical_handler, \
60      property_declaration_handler, \
61      property_dom_node, \
62      property_xml_string
63
64 # libxml2 returns strings as UTF8
65 _decoder = codecs.lookup("utf8")[1]
66 def _d(s):
67     if s is None:
68         return s
69     else:
70         return _decoder(s)[0]
71
# A missing libxml2 binding is reported as a SAX "reader not available"
# error that carries the original import failure in its message.
try:
    import libxml2
except ImportError:
    raise SAXReaderNotAvailable(
        "libxml2 not available: import error was: %s" % (sys.exc_info()[1],))
77
class Locator(xmlreader.Locator):
    """Present a libxml2 text-reader locator through the SAX Locator API."""

    def __init__(self,locator):
        # The wrapped object is only queried when a getter is called.
        self.__locator = locator

    def getSystemId(self):
        "System identifier of the current event: the locator's base URI."
        wrapped = self.__locator
        return wrapped.BaseURI()

    def getPublicId(self):
        "Public identifier of the current event: always None here."
        return None

    def getLineNumber(self):
        "Line number reported by the wrapped locator."
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getColumnNumber(self):
        "Column number of the current event: always -1 in this adapter."
        return -1
99
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls events from a libxml2 text reader.

    Recognized features: namespaces, namespace-prefixes, validation,
    external-general-entities (always on) and external-parameter-entities.
    Recognized properties: lexical-handler (supported) and
    declaration-handler (readable, but setting it is not supported).
    DTDHandler and EntityResolver cannot be set.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) filled by _errorHandler
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Error callback installed on the reader: queue one error.

        Errors are only queued here; parse() hands them to the SAX
        ErrorHandler (through _reportErrors) after the Read() call returns.
        `arg` is the user data given to SetErrorHandler and is unused.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Hand every queued error to the SAX ErrorHandler and empty the queue.

        Warning severities go to warning(); all others go to error(),
        except that when `fatal` is true the last queued error is
        reported through fatalError().
        """
        errors = self.__errors
        # Empty the queue before dispatching: ErrorHandler methods may raise
        # (the default xml.sax ErrorHandler does), and errors left behind
        # would otherwise be reported again during a later parse.
        self.__errors = None
        if not errors:
            return
        warning_levels = (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                          libxml2.PARSER_SEVERITY_WARNING)
        # when fatal is set, the parse will stop; we consider that the
        # last error reported is the fatal one.
        last = errors[-1][1]
        for severity, exception in errors:
            if severity in warning_levels:
                self._err_handler.warning(exception)
            elif fatal and exception is last:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def _startElementNS(self, reader, attributes, prefixes):
        """Report the current element in namespace mode.

        Emits startPrefixMapping for each namespace declaration, then
        startElementNS.  For an empty element the matching endElementNS and
        endPrefixMapping events follow immediately; otherwise the declared
        prefixes are pushed on `prefixes` for the later EndElement node.
        """
        eltName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
        eltQName = _d(reader.Name())
        # the attribute map object is reused; only its dicts are replaced
        attributes._attrs = attrs = {}
        attributes._qnames = qnames = {}
        newPrefixes = []
        while reader.MoveToNextAttribute():
            qname = _d(reader.Name())
            value = _d(reader.Value())
            # Only "xmlns" and "xmlns:prefix" are namespace declarations;
            # an ordinary attribute such as "xmlnsfoo" is not.
            if qname == "xmlns" or qname.startswith("xmlns:"):
                # plain "xmlns" declares the default namespace (prefix None)
                newPrefix = qname[6:] or None
                newPrefixes.append(newPrefix)
                self._cont_handler.startPrefixMapping(newPrefix, value)
                if not self.__nspfx:
                    continue # don't report xmlns attribute
            attName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
            qnames[attName] = qname
            attrs[attName] = value
        reader.MoveToElement()
        self._cont_handler.startElementNS(eltName, eltQName, attributes)
        if reader.IsEmptyElement():
            # no EndElement node follows an empty element
            self._cont_handler.endElementNS(eltName, eltQName)
            for newPrefix in newPrefixes:
                self._cont_handler.endPrefixMapping(newPrefix)
        else:
            prefixes.append(newPrefixes)

    def _startElement(self, reader, attributes):
        """Report the current element (and its end, if empty) in non-namespace mode."""
        eltName = _d(reader.Name())
        # the attribute map object is reused; only its dict is replaced
        attributes._attrs = attrs = {}
        while reader.MoveToNextAttribute():
            attName = _d(reader.Name())
            attrs[attName] = _d(reader.Value())
        reader.MoveToElement()
        self._cont_handler.startElement(eltName, attributes)
        if reader.IsEmptyElement():
            # no EndElement node follows an empty element
            self._cont_handler.endElement(eltName)

    def parse(self, source):
        """Parse `source` and fire the SAX events on the registered handlers.

        `source` is either a string, handed to libxml2 as a file name, or
        anything accepted by saxutils.prepare_input_source.  Parse errors
        are forwarded to the ErrorHandler; endDocument() is only emitted
        when the document was read to its end.  The underlying reader is
        closed even when a handler raises.

        Raises SAXException for a node type this driver does not know.
        """
        self.__parsing = 1
        # never start with errors left over from an earlier parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributes = xmlreader.AttributesNSImpl({}, {})
            else:
                attributes = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping), one list per open element
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        self._startElementNS(reader, attributes, prefixes)
                    else:
                        self._startElement(reader, attributes)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity events belong to the lexical handler; the reader
                    # itself has no startEntity/endEntity methods
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): relies on the libxml2 binding exposing
                    # ResolveEntity on the reader -- confirm
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    pass # TODO (startDTD; how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Not handled on purpose: Attribute (2, never in this loop),
                # Document (9, not exposed), DocumentFragment (11, never
                # returned by XmlReader) and None (0).
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            # reset the flag first so the reader stays usable even if
            # closing fails
            self.__parsing = 0
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of feature `name`.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is in progress or
        when asked to turn external general entities off, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of property `name`.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set property `name`; only the lexical handler can be set.

        Raises SAXNotSupportedException for the declaration handler and
        SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
374
375 def create_parser():
376     return LibXml2Reader()
377