src/third_party/closure_linter/closure_linter/common/tokenizer.py

   1 #!/usr/bin/env python
   2 #
   3 # Copyright 2007 The Closure Linter Authors. All Rights Reserved.
   4 #
   5 # Licensed under the Apache License, Version 2.0 (the "License");
   6 # you may not use this file except in compliance with the License.
   7 # You may obtain a copy of the License at
   8 #
   9 #      http://www.apache.org/licenses/LICENSE-2.0
  10 #
  11 # Unless required by applicable law or agreed to in writing, software
  12 # distributed under the License is distributed on an "AS-IS" BASIS,
  13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 # See the License for the specific language governing permissions and
  15 # limitations under the License.
  16
  17 """Regular expression based lexer."""
  18
  19 __author__ = ('robbyw@google.com (Robert Walker)',
  20               'ajp@google.com (Andy Perelson)')
  21
  22 from closure_linter.common import tokens
  23
  24 # Shorthand
  25 Type = tokens.TokenType
  26
  27
  28 class Tokenizer(object):
  29   """General purpose tokenizer.
  30
  31   Attributes:
  32     mode: The latest mode of the tokenizer.  This allows patterns to distinguish
  33         if they are mid-comment, mid-parameter list, etc.
  34     matchers: Dictionary of modes to sequences of matchers that define the
  35         patterns to check at any given time.
  36     default_types: Dictionary of modes to types, defining what type to give
  37         non-matched text when in the given mode.  Defaults to Type.NORMAL.
  38   """
  39
  40   def __init__(self, starting_mode, matchers, default_types):
  41     """Initialize the tokenizer.
  42
  43     Args:
  44       starting_mode: Mode to start in.
  45       matchers: Dictionary of modes to sequences of matchers that defines the
  46           patterns to check at any given time.
  47       default_types: Dictionary of modes to types, defining what type to give
  48           non-matched text when in the given mode.  Defaults to Type.NORMAL.
  49     """
  50     self.__starting_mode = starting_mode
  51     self.matchers = matchers
  52     self.default_types = default_types
  53
  54   def TokenizeFile(self, file):
  55     """Tokenizes the given file.
  56
  57     Args:
  58       file: An iterable that yields one line of the file at a time.
  59
  60     Returns:
  61       The first token in the file
  62     """
  63     # The current mode.
  64     self.mode = self.__starting_mode
  65     # The first token in the stream.
  66     self.__first_token = None
  67     # The last token added to the token stream.
  68     self.__last_token = None
  69     # The current line number.
  70     self.__line_number = 0
  71
  72     for line in file:
  73       self.__line_number += 1
  74       self.__TokenizeLine(line)
  75
  76     return self.__first_token
  77
  78   def _CreateToken(self, string, token_type, line, line_number, values=None):
  79     """Creates a new Token object (or subclass).
  80
  81     Args:
  82       string: The string of input the token represents.
  83       token_type: The type of token.
  84       line: The text of the line this token is in.
  85       line_number: The line number of the token.
  86       values: A dict of named values within the token.  For instance, a
  87         function declaration may have a value called 'name' which captures the
  88         name of the function.
  89
  90     Returns:
  91       The newly created Token object.
  92     """
  93     return tokens.Token(string, token_type, line, line_number, values,
  94                         line_number)
  95
  96   def __TokenizeLine(self, line):
  97     """Tokenizes the given line.
  98
  99     Args:
 100       line: The contents of the line.
 101     """
 102     string = line.rstrip('\n\r\f')
 103     line_number = self.__line_number
 104     self.__start_index = 0
 105
 106     if not string:
 107       self.__AddToken(self._CreateToken('', Type.BLANK_LINE, line, line_number))
 108       return
 109
 110     normal_token = ''
 111     index = 0
 112     while index < len(string):
 113       for matcher in self.matchers[self.mode]:
 114         if matcher.line_start and index > 0:
 115           continue
 116
 117         match = matcher.regex.match(string, index)
 118
 119         if match:
 120           if normal_token:
 121             self.__AddToken(
 122                 self.__CreateNormalToken(self.mode, normal_token, line,
 123                                          line_number))
 124             normal_token = ''
 125
 126           # Add the match.
 127           self.__AddToken(self._CreateToken(match.group(), matcher.type, line,
 128                                             line_number, match.groupdict()))
 129
 130           # Change the mode to the correct one for after this match.
 131           self.mode = matcher.result_mode or self.mode
 132
 133           # Shorten the string to be matched.
 134           index = match.end()
 135
 136           break
 137
 138       else:
 139         # If the for loop finishes naturally (i.e. no matches) we just add the
 140         # first character to the string of consecutive non match characters.
 141         # These will constitute a NORMAL token.
 142         if string:
 143           normal_token += string[index:index + 1]
 144           index += 1
 145
 146     if normal_token:
 147       self.__AddToken(
 148           self.__CreateNormalToken(self.mode, normal_token, line, line_number))
 149
 150   def __CreateNormalToken(self, mode, string, line, line_number):
 151     """Creates a normal token.
 152
 153     Args:
 154       mode: The current mode.
 155       string: The string to tokenize.
 156       line: The line of text.
 157       line_number: The line number within the file.
 158
 159     Returns:
 160       A Token object, of the default type for the current mode.
 161     """
 162     type = Type.NORMAL
 163     if mode in self.default_types:
 164       type = self.default_types[mode]
 165     return self._CreateToken(string, type, line, line_number)
 166
 167   def __AddToken(self, token):
 168     """Add the given token to the token stream.
 169
 170     Args:
 171       token: The token to add.
 172     """
 173     # Store the first token, or point the previous token to this one.
 174     if not self.__first_token:
 175       self.__first_token = token
 176     else:
 177       self.__last_token.next = token
 178
 179     # Establish the doubly linked list
 180     token.previous = self.__last_token
 181     self.__last_token = token
 182
 183     # Compute the character indices
 184     token.start_index = self.__start_index
 185     self.__start_index += token.length