1 # -*- test-case-name: twisted.web.test.test_xml -*-
3 # Copyright (c) Twisted Matrix Laboratories.
4 # See LICENSE for details.
8 *S*mall, *U*ncomplicated *X*ML.
10 This is a very simple implementation of XML/HTML as a network
11 protocol. It is not at all clever. Its main features are that it
15 - mung mnemonic entity references
17 - perform *any* external actions (such as fetching URLs or writing files)
18 under *any* circumstances
19 - has lots and lots of horrible hacks for supporting broken HTML (as an
20 option, they're not on by default).
23 from twisted.internet.protocol import Protocol
24 from twisted.python.reflect import prefixedMethodNames
28 # Elements of the three-tuples in the state table.
34 lenientIdentChars = identChars + ';+#/%~'
44 d = dict([(x, 1) for x in l])
48 def zipfndict(*args, **kw):
49 default = kw.get('default', nop)
51 for key in unionlist(*[fndict.keys() for fndict in args]):
52 d[key] = tuple([x.get(key, default) for x in args])
def prefixedMethodClassDict(clazz, prefix):
    """
    Collect the attributes of a class whose names start with C{prefix}.

    @param clazz: the class to inspect.
    @param prefix: the name prefix that selects the attributes of interest.
    @return: a C{dict} keyed by the names reported by
        C{prefixedMethodNames(clazz, prefix)}; each value is the result of
        C{getattr(clazz, prefix + name)}.
    """
    methods = {}
    for shortName in prefixedMethodNames(clazz, prefix):
        # The prefix is put back on to look the attribute up on the class.
        methods[shortName] = getattr(clazz, prefix + shortName)
    return methods
def prefixedMethodObjDict(obj, prefix):
    """
    Collect the attributes of an object whose names start with C{prefix}.

    The candidate names come from the object's class, but the values are
    looked up on the object itself.

    @param obj: the instance to inspect.
    @param prefix: the name prefix that selects the attributes of interest.
    @return: a C{dict} keyed by the names reported by
        C{prefixedMethodNames(obj.__class__, prefix)}; each value is the
        result of C{getattr(obj, prefix + name)}.
    """
    methods = {}
    for shortName in prefixedMethodNames(obj.__class__, prefix):
        methods[shortName] = getattr(obj, prefix + shortName)
    return methods
64 class ParseError(Exception):
66 def __init__(self, filename, line, col, message):
67 self.filename = filename
70 self.message = message
73 return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
76 class XMLParser(Protocol):
81 beExtremelyLenient = 0
84 # _leadingBodyData will sometimes be set before switching to the
85 # 'bodydata' state, when we "accidentally" read a byte of bodydata
86 # in a different state.
87 _leadingBodyData = None
89 def connectionMade(self):
95 '''Get the line number and column of the last character parsed'''
96 # This gets replaced during dataReceived, restored afterwards
97 return (self.lineno, self.colno)
def _parseError(self, message):
    """
    Abort parsing by raising a L{ParseError}.

    @param message: human-readable description of what went wrong.
    @raise ParseError: always. Its arguments are C{self.filename}, followed
        by the items of the tuple returned by C{self.saveMark()}, followed
        by C{message}.
    """
    errorArgs = (self.filename,) + self.saveMark() + (message,)
    raise ParseError(*errorArgs)
102 def _buildStateTable(self):
103 '''Return a dictionary of begin, do, end state function tuples'''
104 # _buildStateTable leaves something to be desired but it does what it
105 # does.. probably slowly, so I'm doing some evil caching so it doesn't
106 # get called more than once per class.
107 stateTable = getattr(self.__class__, '__stateTable', None)
108 if stateTable is None:
109 stateTable = self.__class__.__stateTable = zipfndict(
110 *[prefixedMethodObjDict(self, prefix)
111 for prefix in ('begin_', 'do_', 'end_')])
114 def _decode(self, data):
115 if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
116 assert not len(data) & 1, 'UTF-16 must come in pairs for now'
118 data = self._prepend + data
119 for encoding in self.encodings:
120 data = unicode(data, encoding)
123 def maybeBodyData(self):
127 # Get ready for fun! We're going to allow
128 # <script>if (foo < bar)</script> to work!
129 # We do this by making everything between <script> and
131 # BUT <script src="foo"> will be special-cased to do regular,
132 # lenient behavior, because those may not have </script>
135 if (self.tagName == 'script'
136 and not self.tagAttributes.has_key('src')):
137 # we do this ourselves rather than having begin_waitforendscript
138 # because that can get called multiple times and we don't want
139 # bodydata to get reset other than the first time.
140 self.begin_bodydata(None)
141 return 'waitforendscript'
146 def dataReceived(self, data):
147 stateTable = self._buildStateTable()
149 # all UTF-16 starts with this string
150 if data.startswith('\xff\xfe'):
151 self._prepend = '\xff\xfe'
152 self.encodings.append('UTF-16')
154 elif data.startswith('\xfe\xff'):
155 self._prepend = '\xfe\xff'
156 self.encodings.append('UTF-16')
160 data = self._decode(data)
161 # bring state, lineno, colno into local scope
162 lineno, colno = self.lineno, self.colno
163 curState = self.state
164 # replace saveMark with a nested scope function
165 _saveMark = self.saveMark
167 return (lineno, colno)
168 self.saveMark = saveMark
169 # fetch functions from the stateTable
170 beginFn, doFn, endFn = stateTable[curState]
179 newState = doFn(byte)
180 if newState is not None and newState != curState:
181 # this is the endFn from the previous state
184 beginFn, doFn, endFn = stateTable[curState]
187 self.saveMark = _saveMark
188 self.lineno, self.colno = lineno, colno
189 # state doesn't make sense if there's an exception..
190 self.state = curState
193 def connectionLost(self, reason):
195 End the last state we were in.
197 stateTable = self._buildStateTable()
198 stateTable[self.state][END_HANDLER]()
203 def do_begin(self, byte):
207 if self.beExtremelyLenient:
208 self._leadingBodyData = byte
210 self._parseError("First char of document [%r] wasn't <" % (byte,))
213 def begin_comment(self, byte):
216 def do_comment(self, byte):
217 self.commentbuf += byte
218 if self.commentbuf.endswith('-->'):
219 self.gotComment(self.commentbuf[:-3])
222 def begin_tagstart(self, byte):
223 self.tagName = '' # name of the tag
224 self.tagAttributes = {} # attributes of the tag
225 self.termtag = 0 # is the tag self-terminating
228 def do_tagstart(self, byte):
229 if byte.isalnum() or byte in identChars:
231 if self.tagName == '!--':
236 # properly strict thing to do here is probably to only
241 self._parseError("Whitespace before tag-name")
244 self.gotTagEnd(self.tagName)
247 self.gotTagStart(self.tagName, {})
248 return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
256 if not self.beExtremelyLenient:
257 self._parseError("Invalid character in tag-name")
262 if self.tagName == '!':
265 self._parseError("Invalid '[' in tag-name")
267 if self.beExtremelyLenient:
270 self._parseError('Invalid tag character: %r'% byte)
def begin_unentity(self, byte):
    """
    Begin handler for the 'unentity' state.

    @param byte: the character handed to this handler; it is appended to
        the text collected so far in C{self.bodydata}.
    """
    self.bodydata += byte
275 def do_unentity(self, byte):
276 self.bodydata += byte
def end_unentity(self):
    """
    End handler for the 'unentity' state.

    Passes the text collected in C{self.bodydata} to C{self.gotText}.
    """
    deliver = self.gotText
    deliver(self.bodydata)
282 def begin_expectcdata(self, byte):
285 def do_expectcdata(self, byte):
286 self.cdatabuf += byte
289 if len(cd) > len(cdb):
290 if cd.startswith(cdb):
292 elif self.beExtremelyLenient:
293 ## WHAT THE CRAP!? MSWord9 generates HTML that includes these
294 ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
295 ## 'em as best I can. this should really be a separate parse
296 ## state but I don't even have any idea what these _are_.
299 self._parseError("Mal-formed CDATA header")
303 self._parseError("Mal-formed CDATA header")
305 def do_cdata(self, byte):
306 self.cdatabuf += byte
307 if self.cdatabuf.endswith("]]>"):
308 self.cdatabuf = self.cdatabuf[:-3]
312 self.gotCData(self.cdatabuf)
315 def do_attrs(self, byte):
316 if byte.isalnum() or byte in identChars:
317 # XXX FIXME really handle !DOCTYPE at some point
318 if self.tagName == '!DOCTYPE':
320 if self.tagName[0] in '!?':
326 self.gotTagStart(self.tagName, self.tagAttributes)
327 return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
330 elif self.beExtremelyLenient:
331 # discard and move on? Only case I've seen of this so far was:
334 self._parseError("Unexpected character: %r" % byte)
336 def begin_doctype(self, byte):
339 def do_doctype(self, byte):
344 def end_doctype(self):
345 self.gotDoctype(self.doctype)
348 def do_waitforgt(self, byte):
350 if self.endtag or not self.beExtremelyLenient:
352 return self.maybeBodyData()
354 def begin_attrname(self, byte):
356 self._attrname_termtag = 0
358 def do_attrname(self, byte):
359 if byte.isalnum() or byte in identChars:
360 self.attrname += byte
363 return 'beforeattrval'
366 elif self.beExtremelyLenient:
369 if byte in lenientIdentChars or byte.isalnum():
370 self.attrname += byte
373 self._attrname_termtag = 1
376 self.attrval = 'True'
377 self.tagAttributes[self.attrname] = self.attrval
378 self.gotTagStart(self.tagName, self.tagAttributes)
379 if self._attrname_termtag:
380 self.gotTagEnd(self.tagName)
382 return self.maybeBodyData()
383 # something is really broken. let's leave this attribute where it
384 # is and move on to the next thing
386 self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))
388 def do_beforeattrval(self, byte):
393 elif self.beExtremelyLenient:
394 if byte in lenientIdentChars or byte.isalnum():
397 self.attrval = 'True'
398 self.tagAttributes[self.attrname] = self.attrval
399 self.gotTagStart(self.tagName, self.tagAttributes)
400 return self.maybeBodyData()
402 # I saw this in actual HTML once:
403 # <font size=\"3\"><sup>SM</sup></font>
405 self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)
def begin_beforeeq(self, byte):
    """
    Begin handler for the 'beforeeq' state.

    Clears C{self._beforeeq_termtag}. C{do_beforeeq} checks that flag to
    decide whether to call C{gotTagEnd} after C{gotTagStart}.

    @param byte: the character handed to this handler; not used here.
    """
    self._beforeeq_termtag = 0
413 def do_beforeeq(self, byte):
415 return 'beforeattrval'
418 elif self.beExtremelyLenient:
419 if byte.isalnum() or byte in identChars:
420 self.attrval = 'True'
421 self.tagAttributes[self.attrname] = self.attrval
424 self.attrval = 'True'
425 self.tagAttributes[self.attrname] = self.attrval
426 self.gotTagStart(self.tagName, self.tagAttributes)
427 if self._beforeeq_termtag:
428 self.gotTagEnd(self.tagName)
430 return self.maybeBodyData()
432 self._beforeeq_termtag = 1
434 self._parseError("Invalid attribute")
436 def begin_attrval(self, byte):
437 self.quotetype = byte
440 def do_attrval(self, byte):
441 if byte == self.quotetype:
def end_attrval(self):
    """
    End handler for the 'attrval' state.

    Stores the finished attribute in C{self.tagAttributes} under
    C{self.attrname}, then resets the name and value buffers to empty
    strings.
    """
    self.tagAttributes[self.attrname] = self.attrval
    # Clear the scratch buffers now that the attribute has been recorded.
    self.attrname = ''
    self.attrval = ''
449 def begin_messyattr(self, byte):
452 def do_messyattr(self, byte):
457 if self.attrval.endswith('/'):
459 self.attrval = self.attrval[:-1]
460 self.tagAttributes[self.attrname] = self.attrval
461 self.gotTagStart(self.tagName, self.tagAttributes)
463 self.gotTagEnd(self.tagName)
465 return self.maybeBodyData()
469 def end_messyattr(self):
471 self.tagAttributes[self.attrname] = self.attrval
def begin_afterslash(self, byte):
    """
    Begin handler for the 'afterslash' state.

    Clears C{self._after_slash_closed}. C{do_afterslash} sets that flag
    before reporting the tag and raises a parse error if it is already set.

    @param byte: the character handed to this handler; not used here.
    """
    self._after_slash_closed = 0
476 def do_afterslash(self, byte):
477 # this state is only after a self-terminating slash, e.g. <foo/>
478 if self._after_slash_closed:
479 self._parseError("Mal-formed")#XXX When does this happen??
481 if self.beExtremelyLenient:
484 self._parseError("No data allowed after '/'")
485 self._after_slash_closed = 1
486 self.gotTagStart(self.tagName, self.tagAttributes)
487 self.gotTagEnd(self.tagName)
488 # don't need maybeBodyData here because there better not be
489 # any javascript code after a <script/>... we'll see :(
492 def begin_bodydata(self, byte):
493 if self._leadingBodyData:
494 self.bodydata = self._leadingBodyData
495 del self._leadingBodyData
499 def do_bodydata(self, byte):
504 self.bodydata += byte
506 def end_bodydata(self):
507 self.gotText(self.bodydata)
510 def do_waitforendscript(self, byte):
512 return 'waitscriptendtag'
513 self.bodydata += byte
515 def begin_waitscriptendtag(self, byte):
516 self.temptagdata = ''
520 def do_waitscriptendtag(self, byte):
521 # 1 enforce / as first byte read
522 # 2 enforce following bytes to be subset of "script" until
523 # tagName == "script"
524 # 2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
525 # 3 spaces can happen anywhere, they're ignored
527 # 4 anything else causes all data I've read to be moved to the
528 # bodydata, and switch back to waitforendscript state
530 # If it turns out this _isn't_ a </script>, we need to
531 # remember all the data we've been through so we can append it
533 self.temptagdata += byte
538 elif not self.endtag:
539 self.bodydata += "<" + self.temptagdata
540 return 'waitforendscript'
542 elif byte.isalnum() or byte in identChars:
544 if not 'script'.startswith(self.tagName):
545 self.bodydata += "<" + self.temptagdata
546 return 'waitforendscript'
547 elif self.tagName == 'script':
548 self.gotText(self.bodydata)
549 self.gotTagEnd(self.tagName)
553 return 'waitscriptendtag'
556 self.bodydata += "<" + self.temptagdata
557 return 'waitforendscript'
560 def begin_entityref(self, byte):
562 self.erefextra = '' # extra bit for lenient mode
564 def do_entityref(self, byte):
565 if byte.isspace() or byte == "<":
566 if self.beExtremelyLenient:
567 # a bare '&foo' was probably meant to be '&amp;foo'
568 if self.erefbuf and self.erefbuf != "amp":
569 self.erefextra = self.erefbuf
574 self.erefextra += byte
575 return 'spacebodydata'
576 self._parseError("Bad entity reference")
def end_entityref(self):
    """
    End handler for the 'entityref' state.

    Passes the reference name collected in C{self.erefbuf} to
    C{self.gotEntityReference}.
    """
    report = self.gotEntityReference
    report(self.erefbuf)
585 # Hacky support for a space after '&' in an entity reference; this state
586 # should only be reached when beExtremelyLenient is set.
def begin_spacebodydata(self, byte):
    """
    Begin handler for the 'spacebodydata' state.

    Seeds C{self.bodydata} with the text that C{do_entityref} stored in
    C{self.erefextra}, then sets C{self.erefextra} to C{None}.

    @param byte: the character handed to this handler; not used here.
    """
    self.bodydata, self.erefextra = self.erefextra, None
590 do_spacebodydata = do_bodydata
591 end_spacebodydata = end_bodydata
595 def gotTagStart(self, name, attributes):
596 '''Encountered an opening tag.
598 Default behaviour is to print.'''
599 print 'begin', name, attributes
601 def gotText(self, data):
604 Default behaviour is to print.'''
605 print 'text:', repr(data)
607 def gotEntityReference(self, entityRef):
608 '''Encountered mnemonic entity reference
610 Default behaviour is to print.'''
611 print 'entityRef: &%s;' % entityRef
613 def gotComment(self, comment):
614 '''Encountered comment.
616 Default behaviour is to ignore.'''
619 def gotCData(self, cdata):
622 Default behaviour is to call the gotText method'''
625 def gotDoctype(self, doctype):
626 """Encountered DOCTYPE
628 This is really grotty: it basically just gives you everything between
629 '<!DOCTYPE' and '>' as an argument.
631 print '!DOCTYPE', repr(doctype)
633 def gotTagEnd(self, name):
634 '''Encountered closing tag
636 Default behaviour is to print.'''