twisted/words/xish/xpathparser.py

   1 # Copyright (c) Twisted Matrix Laboratories.
   2 # See LICENSE for details.
   3
   4 # DO NOT EDIT xpathparser.py!
   5 #
   6 # It is generated from xpathparser.g using Yapps. Make needed changes there.
   7 # This also means that the generated Python may not conform to Twisted's coding
   8 # standards.
   9
  10 # HOWTO Generate me:
  11 #
  12 # 1.) Grab a copy of yapps2, version 2.1.1:
  13 #         http://theory.stanford.edu/~amitp/Yapps/
  14 #
  15 #     Note: Do NOT use the package in debian/ubuntu as it has incompatible
  16 #     modifications.
  17 #
  18 # 2.) Generate the grammar:
  19 #
  20 #         yapps2 xpathparser.g xpathparser.py.proto
  21 #
  22 # 3.) Edit the output to depend on the embedded runtime, not yappsrt.
  23 #
  24 #         sed -e '/^import yapps/d' -e '/^[^#]/s/yappsrt\.//g' \
  25 #             xpathparser.py.proto > xpathparser.py
  26
  27 """
  28 XPath Parser.
  29
  30 Besides the parser code produced by Yapps, this module also defines the
  31 parse-time exception classes, a scanner class, a base class for parsers
  32 produced by Yapps, and a context class that keeps track of the parse stack.
  33 These have been copied from the Yapps runtime.
  34 """
  35
  36 import sys, re
  37
  38 class SyntaxError(Exception):
  39     """When we run into an unexpected token, this is the exception to use"""
  40     def __init__(self, charpos=-1, msg="Bad Token", context=None):
  41         Exception.__init__(self)
  42         self.charpos = charpos
  43         self.msg = msg
  44         self.context = context
  45
  46     def __str__(self):
  47         if self.charpos < 0: return 'SyntaxError'
  48         else: return 'SyntaxError@char%s(%s)' % (repr(self.charpos), self.msg)
  49
  50 class NoMoreTokens(Exception):
  51     """Another exception object, for when we run out of tokens"""
  52     pass
  53
  54 class Scanner:
  55     """Yapps scanner.
  56
  57     The Yapps scanner can work in context sensitive or context
  58     insensitive modes.  The token(i) method is used to retrieve the
  59     i-th token.  It takes a restrict set that limits the set of tokens
  60     it is allowed to return.  In context sensitive mode, this restrict
  61     set guides the scanner.  In context insensitive mode, there is no
  62     restriction (the set is always the full set of tokens).
  63
  64     """
  65
  66     def __init__(self, patterns, ignore, input):
  67         """Initialize the scanner.
  68
  69         @param patterns: [(terminal, uncompiled regex), ...] or C{None}
  70         @param ignore: [terminal,...]
  71         @param input: string
  72
  73         If patterns is C{None}, we assume that the subclass has defined
  74         C{self.patterns} : [(terminal, compiled regex), ...]. Note that the
  75         patterns parameter expects uncompiled regexes, whereas the
  76         C{self.patterns} field expects compiled regexes.
  77         """
  78         self.tokens = [] # [(begin char pos, end char pos, token name, matched text), ...]
  79         self.restrictions = []
  80         self.input = input
  81         self.pos = 0
  82         self.ignore = ignore
  83         self.first_line_number = 1
  84
  85         if patterns is not None:
  86             # Compile the regex strings into regex objects
  87             self.patterns = []
  88             for terminal, regex in patterns:
  89                 self.patterns.append( (terminal, re.compile(regex)) )
  90
  91     def get_token_pos(self):
  92         """Get the current token position in the input text."""
  93         return len(self.tokens)
  94
  95     def get_char_pos(self):
  96         """Get the current char position in the input text."""
  97         return self.pos
  98
  99     def get_prev_char_pos(self, i=None):
 100         """Get the previous position (one token back) in the input text."""
 101         if self.pos == 0: return 0
 102         if i is None: i = -1
 103         return self.tokens[i][0]
 104
 105     def get_line_number(self):
 106         """Get the line number of the current position in the input text."""
 107         # TODO: make this work at any token/char position
 108         return self.first_line_number + self.get_input_scanned().count('\n')
 109
 110     def get_column_number(self):
 111         """Get the column number of the current position in the input text."""
 112         s = self.get_input_scanned()
 113         i = s.rfind('\n') # may be -1, but that's okay in this case
 114         return len(s) - (i+1)
 115
 116     def get_input_scanned(self):
 117         """Get the portion of the input that has been tokenized."""
 118         return self.input[:self.pos]
 119
 120     def get_input_unscanned(self):
 121         """Get the portion of the input that has not yet been tokenized."""
 122         return self.input[self.pos:]
 123
 124     def token(self, i, restrict=None):
 125         """Get the i'th token in the input.
 126
 127         If C{i} is one past the end, then scan for another token.
 128
 129         @param i: token index
 130
 131         @param restrict: [token, ...] or C{None}; if restrict is
 132                          C{None}, then any token is allowed.  You may call
 133                          token(i) more than once.  However, the restrict set
 134                          may never be larger than what was passed in on the
 135                          first call to token(i).
 136         """
 137         if i == len(self.tokens):
 138             self.scan(restrict)
 139         if i < len(self.tokens):
 140             # Make sure the restriction is more restricted.  This
 141             # invariant is needed to avoid ruining tokenization at
 142             # position i+1 and higher.
 143             if restrict and self.restrictions[i]:
 144                 for r in restrict:
 145                     if r not in self.restrictions[i]:
 146                         raise NotImplementedError("Unimplemented: restriction set changed")
 147             return self.tokens[i]
 148         raise NoMoreTokens()
 149
 150     def __repr__(self):
 151         """Print the last 10 tokens that have been scanned in"""
 152         output = ''
 153         for t in self.tokens[-10:]:
 154             output = '%s\n  (@%s)  %s  =  %s' % (output,t[0],t[2],repr(t[3]))
 155         return output
 156
 157     def scan(self, restrict):
 158         """Should scan another token and add it to the list, self.tokens,
 159         and add the restriction to self.restrictions"""
 160         # Keep looking for a token, ignoring any in self.ignore
 161         while 1:
 162             # Search the patterns for the longest match, with earlier
 163             # tokens in the list having preference
 164             best_match = -1
 165             best_pat = '(error)'
 166             for p, regexp in self.patterns:
 167                 # First check to see if we're ignoring this token
 168                 if restrict and p not in restrict and p not in self.ignore:
 169                     continue
 170                 m = regexp.match(self.input, self.pos)
 171                 if m and len(m.group(0)) > best_match:
 172                     # We got a match that's better than the previous one
 173                     best_pat = p
 174                     best_match = len(m.group(0))
 175
 176             # If we didn't find anything, raise an error
 177             if best_pat == '(error)' and best_match < 0:
 178                 msg = 'Bad Token'
 179                 if restrict:
 180                     msg = 'Trying to find one of '+', '.join(restrict)
 181                 raise SyntaxError(self.pos, msg)
 182
 183             # If we found something that isn't to be ignored, return it
 184             if best_pat not in self.ignore:
 185                 # Create a token with this data
 186                 token = (self.pos, self.pos+best_match, best_pat,
 187                          self.input[self.pos:self.pos+best_match])
 188                 self.pos = self.pos + best_match
 189                 # Only add this token if it's not in the list
 190                 # (to prevent looping)
 191                 if not self.tokens or token != self.tokens[-1]:
 192                     self.tokens.append(token)
 193                     self.restrictions.append(restrict)
 194                 return
 195             else:
 196                 # This token should be ignored ..
 197                 self.pos = self.pos + best_match
 198
 199 class Parser:
 200     """Base class for Yapps-generated parsers.
 201
 202     """
 203
 204     def __init__(self, scanner):
 205         self._scanner = scanner
 206         self._pos = 0
 207
 208     def _peek(self, *types):
 209         """Returns the token type for lookahead; if there are any args
 210         then the list of args is the set of token types to allow"""
 211         tok = self._scanner.token(self._pos, types)
 212         return tok[2]
 213
 214     def _scan(self, type):
 215         """Returns the matched text, and moves to the next token"""
 216         tok = self._scanner.token(self._pos, [type])
 217         if tok[2] != type:
 218             raise SyntaxError(tok[0], 'Trying to find '+type+' :'+ ' ,'.join(self._scanner.restrictions[self._pos]))
 219         self._pos = 1 + self._pos
 220         return tok[3]
 221
 222 class Context:
 223     """Class to represent the parser's call stack.
 224
 225     Every rule creates a Context that links to its parent rule.  The
 226     contexts can be used for debugging.
 227
 228     """
 229
 230     def __init__(self, parent, scanner, tokenpos, rule, args=()):
 231         """Create a new context.
 232
 233         @param parent: Context object or C{None}
 234         @param scanner: Scanner object
 235         @param tokenpos: scanner token position
 236         @type tokenpos: C{int}
 237         @param rule: name of the rule
 238         @type rule: C{str}
 239         @param args: tuple listing parameters to the rule
 240
 241         """
 242         self.parent = parent
 243         self.scanner = scanner
 244         self.tokenpos = tokenpos
 245         self.rule = rule
 246         self.args = args
 247
 248     def __str__(self):
 249         output = ''
 250         if self.parent: output = str(self.parent) + ' > '
 251         output += self.rule
 252         return output
 253
 254 def print_line_with_pointer(text, p):
 255     """Print the line of 'text' that includes position 'p',
 256     along with a second line with a single caret (^) at position p"""
 257
 258     # TODO: separate out the logic for determining the line/character
 259     # location from the logic for determining how to display an
 260     # 80-column line to stderr.
 261
 262     # Now try printing part of the line
 263     text = text[max(p-80, 0):p+80]
 264     p = p - max(p-80, 0)
 265
 266     # Strip to the left
 267     i = text[:p].rfind('\n')
 268     j = text[:p].rfind('\r')
 269     if i < 0 or (0 <= j < i): i = j
 270     if 0 <= i < p:
 271         p = p - i - 1
 272         text = text[i+1:]
 273
 274     # Strip to the right
 275     i = text.find('\n', p)
 276     j = text.find('\r', p)
 277     if i < 0 or (0 <= j < i): i = j
 278     if i >= 0:
 279         text = text[:i]
 280
 281     # Now shorten the text
 282     while len(text) > 70 and p > 60:
 283         # Cut off 10 chars
 284         text = "..." + text[10:]
 285         p = p - 7
 286
 287     # Now print the string, along with an indicator
 288     print >>sys.stderr, '> ',text
 289     print >>sys.stderr, '> ',' '*p + '^'
 290
 291 def print_error(input, err, scanner):
 292     """Print error messages, the parser stack, and the input text -- for human-readable error messages."""
 293     # NOTE: this function assumes 80 columns :-(
 294     # Figure out the line number
 295     line_number = scanner.get_line_number()
 296     column_number = scanner.get_column_number()
 297     print >>sys.stderr, '%d:%d: %s' % (line_number, column_number, err.msg)
 298
 299     context = err.context
 300     if not context:
 301         print_line_with_pointer(input, err.charpos)
 302
 303     while context:
 304         # TODO: add line number
 305         print >>sys.stderr, 'while parsing %s%s:' % (context.rule, tuple(context.args))
 306         print_line_with_pointer(input, context.scanner.get_prev_char_pos(context.tokenpos))
 307         context = context.parent
 308
 309 def wrap_error_reporter(parser, rule):
 310     try:
 311         return getattr(parser, rule)()
 312     except SyntaxError, e:
 313         input = parser._scanner.input
 314         print_error(input, e, parser._scanner)
 315     except NoMoreTokens:
 316         print >>sys.stderr, 'Could not complete parsing; stopped around here:'
 317         print >>sys.stderr, parser._scanner
 318
 319
 320 from twisted.words.xish.xpath import AttribValue, BooleanValue, CompareValue
 321 from twisted.words.xish.xpath import Function, IndexValue, LiteralValue
 322 from twisted.words.xish.xpath import _AnyLocation, _Location
 323
 324
 325 # Begin -- grammar generated by Yapps
 326 import sys, re
 327
class XPathParserScanner(Scanner):
    """
    Scanner for the XPath grammar: supplies the table of terminals as
    precompiled regular expressions and ignores whitespace.
    """
    # (terminal name, compiled regex) pairs.  The order is significant: when
    # two patterns match text of the same length, Scanner.scan keeps the one
    # listed first.  Terminals whose names are quoted (e.g. '"/"') are the
    # literal tokens of the grammar.
    patterns = [
        ('","', re.compile(',')),
        ('"@"', re.compile('@')),
        ('"\\)"', re.compile('\\)')),
        ('"\\("', re.compile('\\(')),
        ('"\\]"', re.compile('\\]')),
        ('"\\["', re.compile('\\[')),
        ('"//"', re.compile('//')),
        ('"/"', re.compile('/')),
        ('\\s+', re.compile('\\s+')),
        ('INDEX', re.compile('[0-9]+')),
        ('WILDCARD', re.compile('\\*')),
        ('IDENTIFIER', re.compile('[a-zA-Z][a-zA-Z0-9_\\-]*')),
        ('ATTRIBUTE', re.compile('\\@[a-zA-Z][a-zA-Z0-9_\\-]*')),
        ('FUNCNAME', re.compile('[a-zA-Z][a-zA-Z0-9_]*')),
        ('CMP_EQ', re.compile('\\=')),
        ('CMP_NE', re.compile('\\!\\=')),
        ('STR_DQ', re.compile('"([^"]|(\\"))*?"')),
        ('STR_SQ', re.compile("'([^']|(\\'))*?'")),
        ('OP_AND', re.compile('and')),
        ('OP_OR', re.compile('or')),
        # END matches the empty string at the end of the input.
        ('END', re.compile('$')),
    ]
    def __init__(self, str):
        """
        @param str: the text to tokenize.
        """
        # patterns=None tells Scanner.__init__ to use the class-level,
        # already compiled C{patterns}; the whitespace terminal is the only
        # one that is ignored.
        Scanner.__init__(self,None,['\\s+'],str)
 354
class XPathParser(Parser):
    """
    Yapps-generated recursive-descent parser for the XPath grammar.

    Every grammar rule is a method.  Each one records a L{Context} for error
    reporting, uses C{_peek} to choose an alternative from the lookahead
    token, and uses C{_scan} to consume terminals.  The results are built
    from the classes imported from C{twisted.words.xish.xpath}.
    """
    Context = Context
    def XPATH(self, _parent=None):
        """
        XPATH: PATH PATH* END

        Parse one or more location steps up to the end of the input.  Each
        further step is stored as the C{childLocation} of the previous one.

        @return: the first location step.
        """
        _context = self.Context(_parent, self._scanner, self._pos, 'XPATH', [])
        PATH = self.PATH(_context)
        result = PATH; current = result
        while self._peek('END', '"/"', '"//"') != 'END':
            PATH = self.PATH(_context)
            # Chain the new step below the previous one and descend to it.
            current.childLocation = PATH; current = current.childLocation
        # Generated check: the lookahead must be a token that may follow
        # the loop above.
        if self._peek() not in ['END', '"/"', '"//"']:
            raise SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['END', '"/"', '"//"']))
        END = self._scan('END')
        return  result

    def PATH(self, _parent=None):
        """
        PATH: ("/" | "//") (IDENTIFIER | WILDCARD) ("[" PREDICATE "]")*

        A leading "/" creates a C{_Location}, a leading "//" an
        C{_AnyLocation}.  C{elementName} is set to the identifier text, or
        to C{None} for a wildcard.  Every predicate is appended to the
        location's C{predicates}.

        @return: the location object.
        """
        _context = self.Context(_parent, self._scanner, self._pos, 'PATH', [])
        _token = self._peek('"/"', '"//"')
        if _token == '"/"':
            self._scan('"/"')
            result = _Location()
        else: # == '"//"'
            self._scan('"//"')
            result = _AnyLocation()
        _token = self._peek('IDENTIFIER', 'WILDCARD')
        if _token == 'IDENTIFIER':
            IDENTIFIER = self._scan('IDENTIFIER')
            result.elementName = IDENTIFIER
        else: # == 'WILDCARD'
            WILDCARD = self._scan('WILDCARD')
            result.elementName = None
        # Zero or more bracketed predicates.
        while self._peek('"\\["', 'END', '"/"', '"//"') == '"\\["':
            self._scan('"\\["')
            PREDICATE = self.PREDICATE(_context)
            result.predicates.append(PREDICATE)
            self._scan('"\\]"')
        # Generated check: the lookahead must be a token that may follow
        # the loop above.
        if self._peek() not in ['"\\["', 'END', '"/"', '"//"']:
            raise SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['"\\["', 'END', '"/"', '"//"']))
        return result

    def PREDICATE(self, _parent=None):
        """
        PREDICATE: EXPR | INDEX

        @return: C{IndexValue} built from the matched digits when the
            lookahead is an INDEX, otherwise the result of EXPR.
        """
        _context = self.Context(_parent, self._scanner, self._pos, 'PREDICATE', [])
        _token = self._peek('INDEX', '"\\("', '"@"', 'FUNCNAME', 'STR_DQ', 'STR_SQ')
        if _token != 'INDEX':
            EXPR = self.EXPR(_context)
            return EXPR
        else: # == 'INDEX'
            INDEX = self._scan('INDEX')
            return IndexValue(INDEX)

    def EXPR(self, _parent=None):
        """
        EXPR: FACTOR (BOOLOP FACTOR)*

        Operands are combined left to right: each operator wraps the
        expression built so far and the next factor in a C{BooleanValue}.

        @return: the single factor, or the outermost C{BooleanValue}.
        """
        _context = self.Context(_parent, self._scanner, self._pos, 'EXPR', [])
        FACTOR = self.FACTOR(_context)
        e = FACTOR
        while self._peek('OP_AND', 'OP_OR', '"\\)"', '"\\]"') in ['OP_AND', 'OP_OR']:
            BOOLOP = self.BOOLOP(_context)
            FACTOR = self.FACTOR(_context)
            e = BooleanValue(e, BOOLOP, FACTOR)
        # Generated check: the lookahead must be a token that may follow
        # the loop above.
        if self._peek() not in ['OP_AND', 'OP_OR', '"\\)"', '"\\]"']:
            raise SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['OP_AND', 'OP_OR', '"\\)"', '"\\]"']))
        return e

    def BOOLOP(self, _parent=None):
        """
        BOOLOP: OP_AND | OP_OR

        @return: the matched operator text.
        """
        _context = self.Context(_parent, self._scanner, self._pos, 'BOOLOP', [])
        _token = self._peek('OP_AND', 'OP_OR')
        if _token == 'OP_AND':
            OP_AND = self._scan('OP_AND')
            return OP_AND
        else: # == 'OP_OR'
            OP_OR = self._scan('OP_OR')
            return OP_OR

    def FACTOR(self, _parent=None):
        """
        FACTOR: TERM | "(" EXPR ")"

        @return: the result of TERM, or of the parenthesized EXPR.
        """
        _context = self.Context(_parent, self._scanner, self._pos, 'FACTOR', [])
        _token = self._peek('"\\("', '"@"', 'FUNCNAME', 'STR_DQ', 'STR_SQ')
        if _token != '"\\("':
            TERM = self.TERM(_context)
            return TERM
        else: # == '"\\("'
            self._scan('"\\("')
            EXPR = self.EXPR(_context)
            self._scan('"\\)"')
            return EXPR

    def TERM(self, _parent=None):
        """
        TERM: VALUE [CMP VALUE]

        @return: the single value, or a C{CompareValue} of the two values
            and the comparison operator text.
        """
        _context = self.Context(_parent, self._scanner, self._pos, 'TERM', [])
        VALUE = self.VALUE(_context)
        t = VALUE
        if self._peek('CMP_EQ', 'CMP_NE', 'OP_AND', 'OP_OR', '"\\)"', '"\\]"') in ['CMP_EQ', 'CMP_NE']:
            CMP = self.CMP(_context)
            VALUE = self.VALUE(_context)
            t = CompareValue(t, CMP, VALUE)
        return t

    def VALUE(self, _parent=None):
        """
        VALUE: "@" IDENTIFIER
             | FUNCNAME "(" [VALUE ("," VALUE)*] ")"
             | STR

        @return: an C{AttribValue} for an attribute reference, a
            C{Function} with its parameters set for a call, or a
            C{LiteralValue} for a quoted string.
        """
        _context = self.Context(_parent, self._scanner, self._pos, 'VALUE', [])
        _token = self._peek('"@"', 'FUNCNAME', 'STR_DQ', 'STR_SQ')
        if _token == '"@"':
            self._scan('"@"')
            IDENTIFIER = self._scan('IDENTIFIER')
            return AttribValue(IDENTIFIER)
        elif _token == 'FUNCNAME':
            FUNCNAME = self._scan('FUNCNAME')
            f = Function(FUNCNAME); args = []
            self._scan('"\\("')
            # Optional comma-separated argument list.
            if self._peek('"\\)"', '"@"', 'FUNCNAME', '","', 'STR_DQ', 'STR_SQ') not in ['"\\)"', '","']:
                VALUE = self.VALUE(_context)
                args.append(VALUE)
                while self._peek('","', '"\\)"') == '","':
                    self._scan('","')
                    VALUE = self.VALUE(_context)
                    args.append(VALUE)
                # Generated check: the lookahead must be a token that may
                # follow the loop above.
                if self._peek() not in ['","', '"\\)"']:
                    raise SyntaxError(charpos=self._scanner.get_prev_char_pos(), context=_context, msg='Need one of ' + ', '.join(['","', '"\\)"']))
            self._scan('"\\)"')
            f.setParams(*args); return f
        else: # in ['STR_DQ', 'STR_SQ']
            STR = self.STR(_context)
            # Drop the first and last character, i.e. the surrounding quotes.
            return LiteralValue(STR[1:len(STR)-1])

    def CMP(self, _parent=None):
        """
        CMP: CMP_EQ | CMP_NE

        @return: the matched operator text.
        """
        _context = self.Context(_parent, self._scanner, self._pos, 'CMP', [])
        _token = self._peek('CMP_EQ', 'CMP_NE')
        if _token == 'CMP_EQ':
            CMP_EQ = self._scan('CMP_EQ')
            return CMP_EQ
        else: # == 'CMP_NE'
            CMP_NE = self._scan('CMP_NE')
            return CMP_NE

    def STR(self, _parent=None):
        """
        STR: STR_DQ | STR_SQ

        @return: the matched text, still including its quotes.
        """
        _context = self.Context(_parent, self._scanner, self._pos, 'STR', [])
        _token = self._peek('STR_DQ', 'STR_SQ')
        if _token == 'STR_DQ':
            STR_DQ = self._scan('STR_DQ')
            return STR_DQ
        else: # == 'STR_SQ'
            STR_SQ = self._scan('STR_SQ')
            return STR_SQ
 493
 494
 495 def parse(rule, text):
 496     P = XPathParser(XPathParserScanner(text))
 497     return wrap_error_reporter(P, rule)
 498
 499 if __name__ == '__main__':
 500     from sys import argv, stdin
 501     if len(argv) >= 2:
 502         if len(argv) >= 3:
 503             f = open(argv[2],'r')
 504         else:
 505             f = stdin
 506         print parse(argv[1], f.read())
 507     else: print >>sys.stderr, 'Args:  <rule> [<filename>]'
 508 # End -- grammar generated by Yapps