twisted/web/sux.py

   1 # -*- test-case-name: twisted.web.test.test_xml -*-
   2 #
   3 # Copyright (c) Twisted Matrix Laboratories.
   4 # See LICENSE for details.
   5
   6
   7 """
   8 *S*mall, *U*ncomplicated *X*ML.
   9
  10 This is a very simple implementation of XML/HTML as a network
  11 protocol.  It is not at all clever.  Its main features are that it
  12 does not:
  13
  14   - support namespaces
  15   - mung mnemonic entity references
  16   - validate
  17   - perform *any* external actions (such as fetching URLs or writing files)
  18     under *any* circumstances

It does, however, have lots and lots of horrible hacks for supporting broken
HTML (as an option; they're not on by default).
  21 """
  22
  23 from twisted.internet.protocol import Protocol
  24 from twisted.python.reflect import prefixedMethodNames
  25
  26
  27
# Indices of the begin/do/end handlers within each three-tuple stored in the
# parser's state table (see XMLParser._buildStateTable).
BEGIN_HANDLER = 0
DO_HANDLER = 1
END_HANDLER = 2

# Punctuation accepted, in addition to alphanumerics, while reading tag and
# attribute names.
identChars = '.-_:'
# Extra punctuation tolerated in attribute names and at the start of unquoted
# attribute values when the parser runs with beExtremelyLenient set.
lenientIdentChars = identChars + ';+#/%~'
  35
  36 def nop(*args, **kw):
  37     "Do nothing."
  38
  39
  40 def unionlist(*args):
  41     l = []
  42     for x in args:
  43         l.extend(x)
  44     d = dict([(x, 1) for x in l])
  45     return d.keys()
  46
  47
  48 def zipfndict(*args, **kw):
  49     default = kw.get('default', nop)
  50     d = {}
  51     for key in unionlist(*[fndict.keys() for fndict in args]):
  52         d[key] = tuple([x.get(key, default) for x in args])
  53     return d
  54
  55
  56 def prefixedMethodClassDict(clazz, prefix):
  57     return dict([(name, getattr(clazz, prefix + name)) for name in prefixedMethodNames(clazz, prefix)])
  58
  59
  60 def prefixedMethodObjDict(obj, prefix):
  61     return dict([(name, getattr(obj, prefix + name)) for name in prefixedMethodNames(obj.__class__, prefix)])
  62
  63
  64 class ParseError(Exception):
  65
  66     def __init__(self, filename, line, col, message):
  67         self.filename = filename
  68         self.line = line
  69         self.col = col
  70         self.message = message
  71
  72     def __str__(self):
  73        return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
  74                                 self.message)
  75
  76 class XMLParser(Protocol):
  77
  78     state = None
  79     encodings = None
  80     filename = "<xml />"
  81     beExtremelyLenient = 0
  82     _prepend = None
  83
  84     # _leadingBodyData will sometimes be set before switching to the
  85     # 'bodydata' state, when we "accidentally" read a byte of bodydata
  86     # in a different state.
  87     _leadingBodyData = None
  88
  89     def connectionMade(self):
  90         self.lineno = 1
  91         self.colno = 0
  92         self.encodings = []
  93
  94     def saveMark(self):
  95         '''Get the line number and column of the last character parsed'''
  96         # This gets replaced during dataReceived, restored afterwards
  97         return (self.lineno, self.colno)
  98
  99     def _parseError(self, message):
 100         raise ParseError(*((self.filename,)+self.saveMark()+(message,)))
 101
 102     def _buildStateTable(self):
 103         '''Return a dictionary of begin, do, end state function tuples'''
 104         # _buildStateTable leaves something to be desired but it does what it
 105         # does.. probably slowly, so I'm doing some evil caching so it doesn't
 106         # get called more than once per class.
 107         stateTable = getattr(self.__class__, '__stateTable', None)
 108         if stateTable is None:
 109             stateTable = self.__class__.__stateTable = zipfndict(
 110                 *[prefixedMethodObjDict(self, prefix)
 111                   for prefix in ('begin_', 'do_', 'end_')])
 112         return stateTable
 113
 114     def _decode(self, data):
 115         if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
 116             assert not len(data) & 1, 'UTF-16 must come in pairs for now'
 117         if self._prepend:
 118             data = self._prepend + data
 119         for encoding in self.encodings:
 120             data = unicode(data, encoding)
 121         return data
 122
 123     def maybeBodyData(self):
 124         if self.endtag:
 125             return 'bodydata'
 126
 127         # Get ready for fun! We're going to allow
 128         # <script>if (foo < bar)</script> to work!
 129         # We do this by making everything between <script> and
 130         # </script> a Text
 131         # BUT <script src="foo"> will be special-cased to do regular,
 132         # lenient behavior, because those may not have </script>
 133         # -radix
 134
 135         if (self.tagName == 'script'
 136             and not self.tagAttributes.has_key('src')):
 137             # we do this ourselves rather than having begin_waitforendscript
 138             # becuase that can get called multiple times and we don't want
 139             # bodydata to get reset other than the first time.
 140             self.begin_bodydata(None)
 141             return 'waitforendscript'
 142         return 'bodydata'
 143
 144
 145
 146     def dataReceived(self, data):
 147         stateTable = self._buildStateTable()
 148         if not self.state:
 149             # all UTF-16 starts with this string
 150             if data.startswith('\xff\xfe'):
 151                 self._prepend = '\xff\xfe'
 152                 self.encodings.append('UTF-16')
 153                 data = data[2:]
 154             elif data.startswith('\xfe\xff'):
 155                 self._prepend = '\xfe\xff'
 156                 self.encodings.append('UTF-16')
 157                 data = data[2:]
 158             self.state = 'begin'
 159         if self.encodings:
 160             data = self._decode(data)
 161         # bring state, lineno, colno into local scope
 162         lineno, colno = self.lineno, self.colno
 163         curState = self.state
 164         # replace saveMark with a nested scope function
 165         _saveMark = self.saveMark
 166         def saveMark():
 167             return (lineno, colno)
 168         self.saveMark = saveMark
 169         # fetch functions from the stateTable
 170         beginFn, doFn, endFn = stateTable[curState]
 171         try:
 172             for byte in data:
 173                 # do newline stuff
 174                 if byte == '\n':
 175                     lineno += 1
 176                     colno = 0
 177                 else:
 178                     colno += 1
 179                 newState = doFn(byte)
 180                 if newState is not None and newState != curState:
 181                     # this is the endFn from the previous state
 182                     endFn()
 183                     curState = newState
 184                     beginFn, doFn, endFn = stateTable[curState]
 185                     beginFn(byte)
 186         finally:
 187             self.saveMark = _saveMark
 188             self.lineno, self.colno = lineno, colno
 189         # state doesn't make sense if there's an exception..
 190         self.state = curState
 191
 192
 193     def connectionLost(self, reason):
 194         """
 195         End the last state we were in.
 196         """
 197         stateTable = self._buildStateTable()
 198         stateTable[self.state][END_HANDLER]()
 199
 200
 201     # state methods
 202
 203     def do_begin(self, byte):
 204         if byte.isspace():
 205             return
 206         if byte != '<':
 207             if self.beExtremelyLenient:
 208                 self._leadingBodyData = byte
 209                 return 'bodydata'
 210             self._parseError("First char of document [%r] wasn't <" % (byte,))
 211         return 'tagstart'
 212
 213     def begin_comment(self, byte):
 214         self.commentbuf = ''
 215
 216     def do_comment(self, byte):
 217         self.commentbuf += byte
 218         if self.commentbuf.endswith('-->'):
 219             self.gotComment(self.commentbuf[:-3])
 220             return 'bodydata'
 221
 222     def begin_tagstart(self, byte):
 223         self.tagName = ''               # name of the tag
 224         self.tagAttributes = {}         # attributes of the tag
 225         self.termtag = 0                # is the tag self-terminating
 226         self.endtag = 0
 227
 228     def do_tagstart(self, byte):
 229         if byte.isalnum() or byte in identChars:
 230             self.tagName += byte
 231             if self.tagName == '!--':
 232                 return 'comment'
 233         elif byte.isspace():
 234             if self.tagName:
 235                 if self.endtag:
 236                     # properly strict thing to do here is probably to only
 237                     # accept whitespace
 238                     return 'waitforgt'
 239                 return 'attrs'
 240             else:
 241                 self._parseError("Whitespace before tag-name")
 242         elif byte == '>':
 243             if self.endtag:
 244                 self.gotTagEnd(self.tagName)
 245                 return 'bodydata'
 246             else:
 247                 self.gotTagStart(self.tagName, {})
 248                 return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
 249         elif byte == '/':
 250             if self.tagName:
 251                 return 'afterslash'
 252             else:
 253                 self.endtag = 1
 254         elif byte in '!?':
 255             if self.tagName:
 256                 if not self.beExtremelyLenient:
 257                     self._parseError("Invalid character in tag-name")
 258             else:
 259                 self.tagName += byte
 260                 self.termtag = 1
 261         elif byte == '[':
 262             if self.tagName == '!':
 263                 return 'expectcdata'
 264             else:
 265                 self._parseError("Invalid '[' in tag-name")
 266         else:
 267             if self.beExtremelyLenient:
 268                 self.bodydata = '<'
 269                 return 'unentity'
 270             self._parseError('Invalid tag character: %r'% byte)
 271
 272     def begin_unentity(self, byte):
 273         self.bodydata += byte
 274
 275     def do_unentity(self, byte):
 276         self.bodydata += byte
 277         return 'bodydata'
 278
 279     def end_unentity(self):
 280         self.gotText(self.bodydata)
 281
 282     def begin_expectcdata(self, byte):
 283         self.cdatabuf = byte
 284
 285     def do_expectcdata(self, byte):
 286         self.cdatabuf += byte
 287         cdb = self.cdatabuf
 288         cd = '[CDATA['
 289         if len(cd) > len(cdb):
 290             if cd.startswith(cdb):
 291                 return
 292             elif self.beExtremelyLenient:
 293                 ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
 294                 ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
 295                 ## 'em as best I can.  this should really be a separate parse
 296                 ## state but I don't even have any idea what these _are_.
 297                 return 'waitforgt'
 298             else:
 299                 self._parseError("Mal-formed CDATA header")
 300         if cd == cdb:
 301             self.cdatabuf = ''
 302             return 'cdata'
 303         self._parseError("Mal-formed CDATA header")
 304
 305     def do_cdata(self, byte):
 306         self.cdatabuf += byte
 307         if self.cdatabuf.endswith("]]>"):
 308             self.cdatabuf = self.cdatabuf[:-3]
 309             return 'bodydata'
 310
 311     def end_cdata(self):
 312         self.gotCData(self.cdatabuf)
 313         self.cdatabuf = ''
 314
 315     def do_attrs(self, byte):
 316         if byte.isalnum() or byte in identChars:
 317             # XXX FIXME really handle !DOCTYPE at some point
 318             if self.tagName == '!DOCTYPE':
 319                 return 'doctype'
 320             if self.tagName[0] in '!?':
 321                 return 'waitforgt'
 322             return 'attrname'
 323         elif byte.isspace():
 324             return
 325         elif byte == '>':
 326             self.gotTagStart(self.tagName, self.tagAttributes)
 327             return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
 328         elif byte == '/':
 329             return 'afterslash'
 330         elif self.beExtremelyLenient:
 331             # discard and move on?  Only case I've seen of this so far was:
 332             # <foo bar="baz"">
 333             return
 334         self._parseError("Unexpected character: %r" % byte)
 335
 336     def begin_doctype(self, byte):
 337         self.doctype = byte
 338
 339     def do_doctype(self, byte):
 340         if byte == '>':
 341             return 'bodydata'
 342         self.doctype += byte
 343
 344     def end_doctype(self):
 345         self.gotDoctype(self.doctype)
 346         self.doctype = None
 347
 348     def do_waitforgt(self, byte):
 349         if byte == '>':
 350             if self.endtag or not self.beExtremelyLenient:
 351                 return 'bodydata'
 352             return self.maybeBodyData()
 353
 354     def begin_attrname(self, byte):
 355         self.attrname = byte
 356         self._attrname_termtag = 0
 357
 358     def do_attrname(self, byte):
 359         if byte.isalnum() or byte in identChars:
 360             self.attrname += byte
 361             return
 362         elif byte == '=':
 363             return 'beforeattrval'
 364         elif byte.isspace():
 365             return 'beforeeq'
 366         elif self.beExtremelyLenient:
 367             if byte in '"\'':
 368                 return 'attrval'
 369             if byte in lenientIdentChars or byte.isalnum():
 370                 self.attrname += byte
 371                 return
 372             if byte == '/':
 373                 self._attrname_termtag = 1
 374                 return
 375             if byte == '>':
 376                 self.attrval = 'True'
 377                 self.tagAttributes[self.attrname] = self.attrval
 378                 self.gotTagStart(self.tagName, self.tagAttributes)
 379                 if self._attrname_termtag:
 380                     self.gotTagEnd(self.tagName)
 381                     return 'bodydata'
 382                 return self.maybeBodyData()
 383             # something is really broken. let's leave this attribute where it
 384             # is and move on to the next thing
 385             return
 386         self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))
 387
 388     def do_beforeattrval(self, byte):
 389         if byte in '"\'':
 390             return 'attrval'
 391         elif byte.isspace():
 392             return
 393         elif self.beExtremelyLenient:
 394             if byte in lenientIdentChars or byte.isalnum():
 395                 return 'messyattr'
 396             if byte == '>':
 397                 self.attrval = 'True'
 398                 self.tagAttributes[self.attrname] = self.attrval
 399                 self.gotTagStart(self.tagName, self.tagAttributes)
 400                 return self.maybeBodyData()
 401             if byte == '\\':
 402                 # I saw this in actual HTML once:
 403                 # <font size=\"3\"><sup>SM</sup></font>
 404                 return
 405         self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)
 406
 407     attrname = ''
 408     attrval = ''
 409
 410     def begin_beforeeq(self,byte):
 411         self._beforeeq_termtag = 0
 412
 413     def do_beforeeq(self, byte):
 414         if byte == '=':
 415             return 'beforeattrval'
 416         elif byte.isspace():
 417             return
 418         elif self.beExtremelyLenient:
 419             if byte.isalnum() or byte in identChars:
 420                 self.attrval = 'True'
 421                 self.tagAttributes[self.attrname] = self.attrval
 422                 return 'attrname'
 423             elif byte == '>':
 424                 self.attrval = 'True'
 425                 self.tagAttributes[self.attrname] = self.attrval
 426                 self.gotTagStart(self.tagName, self.tagAttributes)
 427                 if self._beforeeq_termtag:
 428                     self.gotTagEnd(self.tagName)
 429                     return 'bodydata'
 430                 return self.maybeBodyData()
 431             elif byte == '/':
 432                 self._beforeeq_termtag = 1
 433                 return
 434         self._parseError("Invalid attribute")
 435
 436     def begin_attrval(self, byte):
 437         self.quotetype = byte
 438         self.attrval = ''
 439
 440     def do_attrval(self, byte):
 441         if byte == self.quotetype:
 442             return 'attrs'
 443         self.attrval += byte
 444
 445     def end_attrval(self):
 446         self.tagAttributes[self.attrname] = self.attrval
 447         self.attrname = self.attrval = ''
 448
 449     def begin_messyattr(self, byte):
 450         self.attrval = byte
 451
 452     def do_messyattr(self, byte):
 453         if byte.isspace():
 454             return 'attrs'
 455         elif byte == '>':
 456             endTag = 0
 457             if self.attrval.endswith('/'):
 458                 endTag = 1
 459                 self.attrval = self.attrval[:-1]
 460             self.tagAttributes[self.attrname] = self.attrval
 461             self.gotTagStart(self.tagName, self.tagAttributes)
 462             if endTag:
 463                 self.gotTagEnd(self.tagName)
 464                 return 'bodydata'
 465             return self.maybeBodyData()
 466         else:
 467             self.attrval += byte
 468
 469     def end_messyattr(self):
 470         if self.attrval:
 471             self.tagAttributes[self.attrname] = self.attrval
 472
 473     def begin_afterslash(self, byte):
 474         self._after_slash_closed = 0
 475
 476     def do_afterslash(self, byte):
 477         # this state is only after a self-terminating slash, e.g. <foo/>
 478         if self._after_slash_closed:
 479             self._parseError("Mal-formed")#XXX When does this happen??
 480         if byte != '>':
 481             if self.beExtremelyLenient:
 482                 return
 483             else:
 484                 self._parseError("No data allowed after '/'")
 485         self._after_slash_closed = 1
 486         self.gotTagStart(self.tagName, self.tagAttributes)
 487         self.gotTagEnd(self.tagName)
 488         # don't need maybeBodyData here because there better not be
 489         # any javascript code after a <script/>... we'll see :(
 490         return 'bodydata'
 491
 492     def begin_bodydata(self, byte):
 493         if self._leadingBodyData:
 494             self.bodydata = self._leadingBodyData
 495             del self._leadingBodyData
 496         else:
 497             self.bodydata = ''
 498
 499     def do_bodydata(self, byte):
 500         if byte == '<':
 501             return 'tagstart'
 502         if byte == '&':
 503             return 'entityref'
 504         self.bodydata += byte
 505
 506     def end_bodydata(self):
 507         self.gotText(self.bodydata)
 508         self.bodydata = ''
 509
 510     def do_waitforendscript(self, byte):
 511         if byte == '<':
 512             return 'waitscriptendtag'
 513         self.bodydata += byte
 514
 515     def begin_waitscriptendtag(self, byte):
 516         self.temptagdata = ''
 517         self.tagName = ''
 518         self.endtag = 0
 519
 520     def do_waitscriptendtag(self, byte):
 521         # 1 enforce / as first byte read
 522         # 2 enforce following bytes to be subset of "script" until
 523         #   tagName == "script"
 524         #   2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
 525         # 3 spaces can happen anywhere, they're ignored
 526         #   e.g. < / script >
 527         # 4 anything else causes all data I've read to be moved to the
 528         #   bodydata, and switch back to waitforendscript state
 529
 530         # If it turns out this _isn't_ a </script>, we need to
 531         # remember all the data we've been through so we can append it
 532         # to bodydata
 533         self.temptagdata += byte
 534
 535         # 1
 536         if byte == '/':
 537             self.endtag = True
 538         elif not self.endtag:
 539             self.bodydata += "<" + self.temptagdata
 540             return 'waitforendscript'
 541         # 2
 542         elif byte.isalnum() or byte in identChars:
 543             self.tagName += byte
 544             if not 'script'.startswith(self.tagName):
 545                 self.bodydata += "<" + self.temptagdata
 546                 return 'waitforendscript'
 547             elif self.tagName == 'script':
 548                 self.gotText(self.bodydata)
 549                 self.gotTagEnd(self.tagName)
 550                 return 'waitforgt'
 551         # 3
 552         elif byte.isspace():
 553             return 'waitscriptendtag'
 554         # 4
 555         else:
 556             self.bodydata += "<" + self.temptagdata
 557             return 'waitforendscript'
 558
 559
 560     def begin_entityref(self, byte):
 561         self.erefbuf = ''
 562         self.erefextra = '' # extra bit for lenient mode
 563
 564     def do_entityref(self, byte):
 565         if byte.isspace() or byte == "<":
 566             if self.beExtremelyLenient:
 567                 # '&foo' probably was '&amp;foo'
 568                 if self.erefbuf and self.erefbuf != "amp":
 569                     self.erefextra = self.erefbuf
 570                 self.erefbuf = "amp"
 571                 if byte == "<":
 572                     return "tagstart"
 573                 else:
 574                     self.erefextra += byte
 575                     return 'spacebodydata'
 576             self._parseError("Bad entity reference")
 577         elif byte != ';':
 578             self.erefbuf += byte
 579         else:
 580             return 'bodydata'
 581
 582     def end_entityref(self):
 583         self.gotEntityReference(self.erefbuf)
 584
 585     # hacky support for space after & in entityref in beExtremelyLenient
 586     # state should only happen in that case
 587     def begin_spacebodydata(self, byte):
 588         self.bodydata = self.erefextra
 589         self.erefextra = None
 590     do_spacebodydata = do_bodydata
 591     end_spacebodydata = end_bodydata
 592
 593     # Sorta SAX-ish API
 594
 595     def gotTagStart(self, name, attributes):
 596         '''Encountered an opening tag.
 597
 598         Default behaviour is to print.'''
 599         print 'begin', name, attributes
 600
 601     def gotText(self, data):
 602         '''Encountered text
 603
 604         Default behaviour is to print.'''
 605         print 'text:', repr(data)
 606
 607     def gotEntityReference(self, entityRef):
 608         '''Encountered mnemonic entity reference
 609
 610         Default behaviour is to print.'''
 611         print 'entityRef: &%s;' % entityRef
 612
 613     def gotComment(self, comment):
 614         '''Encountered comment.
 615
 616         Default behaviour is to ignore.'''
 617         pass
 618
 619     def gotCData(self, cdata):
 620         '''Encountered CDATA
 621
 622         Default behaviour is to call the gotText method'''
 623         self.gotText(cdata)
 624
 625     def gotDoctype(self, doctype):
 626         """Encountered DOCTYPE
 627
 628         This is really grotty: it basically just gives you everything between
 629         '<!DOCTYPE' and '>' as an argument.
 630         """
 631         print '!DOCTYPE', repr(doctype)
 632
 633     def gotTagEnd(self, name):
 634         '''Encountered closing tag
 635
 636         Default behaviour is to print.'''
 637         print 'end', name