upload tizen1.0 source
[external/libxml2.git] / python / drv_libxml2.py
1 # -*- coding: iso-8859-1 -*-
2 """ A SAX2 driver for libxml2, on top of its XmlReader API
3
4 USAGE
5     # put this file (drv_libxml2.py) in PYTHONPATH
6     import xml.sax
7     reader = xml.sax.make_parser(["drv_libxml2"])
8     # ...and the rest is standard python sax.
9
10 CAVEATS
11     - Lexical handlers are supported, except for start/endEntity
12       (waiting for XmlReader.ResolveEntity) and start/endDTD
13     - Error callbacks are not exactly synchronous, they tend
14       to be invoked before the corresponding content callback,
15       because the underlying reader interface parses
16       data by chunks of 512 bytes
17     
18 TODO
19     - search for TODO
20     - some ErrorHandler events (warning)
21     - some ContentHandler events (setDocumentLocator, skippedEntity)
22     - EntityResolver (using libxml2.?)
23     - DTDHandler (if/when libxml2 exposes such node types)
24     - DeclHandler (if/when libxml2 exposes such node types)
25     - property_xml_string?
26     - feature_string_interning?
27     - Incremental parser
28     - additional performance tuning:
29       - one might cache callbacks to avoid some name lookups
30       - one might implement a smarter way to pass attributes to startElement
31         (some kind of lazy evaluation?)
32       - there might be room for improvement in start/endPrefixMapping
33       - other?
34
35 """
36
37 __author__  = u"Stéphane Bidoul <sbi@skynet.be>"
38 __version__ = "0.3"
39
40 import codecs
41 from types import StringType, UnicodeType
42 StringTypes = (StringType,UnicodeType)
43
44 from xml.sax._exceptions import *
45 from xml.sax import xmlreader, saxutils
46 from xml.sax.handler import \
47      feature_namespaces, \
48      feature_namespace_prefixes, \
49      feature_string_interning, \
50      feature_validation, \
51      feature_external_ges, \
52      feature_external_pes, \
53      property_lexical_handler, \
54      property_declaration_handler, \
55      property_dom_node, \
56      property_xml_string
57
58 # libxml2 returns strings as UTF8
59 _decoder = codecs.lookup("utf8")[1]
60 def _d(s):
61     if s is None:
62         return s
63     else:
64         return _decoder(s)[0]
65
66 try:
67     import libxml2
68 except ImportError, e:
69     raise SAXReaderNotAvailable("libxml2 not available: " \
70                                 "import error was: %s" % e)
71
class Locator(xmlreader.Locator):
    """Present a libxml2 xmlTextReaderLocator through the SAX Locator API.

    Only the line number and the system identifier are taken from the
    wrapped locator; the column number and the public identifier are
    reported as unknown.
    """

    def __init__(self,locator):
        # the wrapped libxml2 locator, queried on demand by the getters
        self.__locator = locator

    def getLineNumber(self):
        """Line number, as given by the wrapped locator's LineNumber()."""
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getColumnNumber(self):
        """Column number; this adapter always answers -1."""
        return -1

    def getSystemId(self):
        """System identifier, as given by the wrapped locator's BaseURI()."""
        wrapped = self.__locator
        return wrapped.BaseURI()

    def getPublicId(self):
        """Public identifier; this adapter always answers None."""
        return None
93
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls nodes from a libxml2 text reader.

    parse() walks the document with reader.Read() and translates every
    node into the matching ContentHandler / lexical-handler callback.
    Errors collected by the libxml2 error callback are forwarded to the
    SAX ErrorHandler after each Read().

    Supported features: namespaces, namespace-prefixes, validation,
    external-general-entities (always on) and external-parameter-entities.
    Supported property: the lexical handler.  DTDHandler, EntityResolver
    and the declaration handler are rejected with
    SAXNotSupportedException.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag (features cannot be changed while it is set)
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) pairs awaiting _reportErrors()
        self.__errors = None

    def _errorHandler(self,arg,msg,severity,locator):
        """Error callback registered on the libxml2 reader.

        Only records the error; it is handed to the SAX ErrorHandler
        later by _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg,None,
                                                Locator(locator))))

    def _reportErrors(self,fatal):
        """Forward the accumulated errors to the SAX ErrorHandler.

        Warnings go to warning(), everything else to error(), except
        that when `fatal` is true the last recorded error is reported
        through fatalError().
        """
        # Detach the pending list *before* invoking any handler: handlers
        # commonly raise, and the accumulator must not keep stale entries
        # that would be reported again by a later parse().
        errors = self.__errors
        self.__errors = None
        if not errors:
            return
        warning_levels = (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                          libxml2.PARSER_SEVERITY_WARNING)
        # when fatal is set, the parse will stop; we consider that the
        # last error reported is the fatal one.
        last_exception = errors[-1][1]
        for severity,exception in errors:
            if severity in warning_levels:
                self._err_handler.warning(exception)
            elif fatal and exception is last_exception:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse `source` and fire the SAX events on the registered handlers.

        `source` is either a string, handed to
        libxml2.newTextReaderFilename(), or anything accepted by
        saxutils.prepare_input_source(), whose byte stream is then read.

        Raises SAXException on a node type this driver does not know;
        exceptions raised by the handlers propagate to the caller.  The
        libxml2 reader is closed in every case.
        """
        self.__parsing = 1
        # forget errors left behind by an earlier, aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler,None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({},{})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # stack with, for every open element, the prefixes to pop
            # (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # Only "xmlns" and "xmlns:<prefix>" declare a
                            # namespace; an ordinary attribute whose name
                            # merely begins with "xmlns" does not.
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix,value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName,eltQName,attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element,
                            # so close it and its prefix mappings right away
                            self._cont_handler.endElementNS(eltName,eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName,attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                             (_d(reader.NamespaceUri()),_d(reader.LocalName())),
                             _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity events belong to the lexical handler; the
                    # reader itself has no startEntity/endEntity methods
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring lists
                    # start/endEntity as unsupported pending
                    # XmlReader.ResolveEntity -- confirm that the libxml2
                    # binding in use actually exposes this call.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()),_d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if self.__lex_handler is not None:
                    #    self.__lex_handler.startDTD()
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Not handled on purpose:
                #   2  Attribute (never in this loop)
                #   9  Document (not exposed)
                #   11 DocumentFragment (never returned by XmlReader)
                #   0  None
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            # endDocument is only reported when the input was read to its end
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # release the reader even when a handler raised
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the state of feature `name`.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is running or when
        external general entities are switched off, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the value of property `name`.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Set property `name` to `value`.

        Only the lexical handler can be set.  The declaration handler
        raises SAXNotSupportedException, any other name raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: if/when libxml2 supports dtd events, store the value
            # in self.__decl_handler instead of raising
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)
368
def create_parser():
    """Build and return a fresh LibXml2Reader (driver factory function)."""
    parser = LibXml2Reader()
    return parser
371