src/third_party/closure_linter/closure_linter/common/tokenizer.py

   1 #!/usr/bin/env python
   2 #
   3 # Copyright 2007 The Closure Linter Authors. All Rights Reserved.
   4 #
   5 # Licensed under the Apache License, Version 2.0 (the "License");
   6 # you may not use this file except in compliance with the License.
   7 # You may obtain a copy of the License at
   8 #
   9 #      http://www.apache.org/licenses/LICENSE-2.0
  10 #
  11 # Unless required by applicable law or agreed to in writing, software
  12 # distributed under the License is distributed on an "AS-IS" BASIS,
  13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 # See the License for the specific language governing permissions and
  15 # limitations under the License.
  16
  17 """Regular expression based lexer."""
  18
  19 __author__ = ('robbyw@google.com (Robert Walker)',
  20               'ajp@google.com (Andy Perelson)')
  21
  22 from closure_linter.common import tokens
  23
# Shorthand for tokens.TokenType; its members (e.g. Type.NORMAL and
# Type.BLANK_LINE) are referenced by the tokenizer below.
Type = tokens.TokenType
  26
  27
  28 class Tokenizer(object):
  29   """General purpose tokenizer.
  30
  31   Attributes:
  32     mode: The latest mode of the tokenizer.  This allows patterns to distinguish
  33         if they are mid-comment, mid-parameter list, etc.
  34     matchers: Dictionary of modes to sequences of matchers that define the
  35         patterns to check at any given time.
  36     default_types: Dictionary of modes to types, defining what type to give
  37         non-matched text when in the given mode.  Defaults to Type.NORMAL.
  38   """
  39
  40   def __init__(self, starting_mode, matchers, default_types):
  41     """Initialize the tokenizer.
  42
  43     Args:
  44       starting_mode: Mode to start in.
  45       matchers: Dictionary of modes to sequences of matchers that defines the
  46           patterns to check at any given time.
  47       default_types: Dictionary of modes to types, defining what type to give
  48           non-matched text when in the given mode.  Defaults to Type.NORMAL.
  49     """
  50     self.__starting_mode = starting_mode
  51     self.matchers = matchers
  52     self.default_types = default_types
  53
  54   def TokenizeFile(self, file):
  55     """Tokenizes the given file.
  56
  57     Args:
  58       file: An iterable that yields one line of the file at a time.
  59
  60     Returns:
  61       The first token in the file
  62     """
  63     # The current mode.
  64     self.mode = self.__starting_mode
  65     # The first token in the stream.
  66     self.__first_token = None
  67     # The last token added to the token stream.
  68     self.__last_token = None
  69     # The current line number.
  70     self.__line_number = 0
  71
  72     for line in file:
  73       self.__line_number += 1
  74       self.__TokenizeLine(line)
  75
  76     return self.__first_token
  77
  78   def _CreateToken(self, string, token_type, line, line_number, values=None):
  79     """Creates a new Token object (or subclass).
  80
  81     Args:
  82       string: The string of input the token represents.
  83       token_type: The type of token.
  84       line: The text of the line this token is in.
  85       line_number: The line number of the token.
  86       values: A dict of named values within the token.  For instance, a
  87         function declaration may have a value called 'name' which captures the
  88         name of the function.
  89
  90     Returns:
  91       The newly created Token object.
  92     """
  93     return tokens.Token(string, token_type, line, line_number, values)
  94
  95   def __TokenizeLine(self, line):
  96     """Tokenizes the given line.
  97
  98     Args:
  99       line: The contents of the line.
 100     """
 101     string = line.rstrip('\n\r\f')
 102     line_number = self.__line_number
 103     self.__start_index = 0
 104
 105     if not string:
 106       self.__AddToken(self._CreateToken('', Type.BLANK_LINE, line, line_number))
 107       return
 108
 109     normal_token = ''
 110     index = 0
 111     while index < len(string):
 112       for matcher in self.matchers[self.mode]:
 113         if matcher.line_start and index > 0:
 114           continue
 115
 116         match = matcher.regex.match(string, index)
 117
 118         if match:
 119           if normal_token:
 120             self.__AddToken(
 121                 self.__CreateNormalToken(self.mode, normal_token, line,
 122                                          line_number))
 123             normal_token = ''
 124
 125           # Add the match.
 126           self.__AddToken(self._CreateToken(match.group(), matcher.type, line,
 127                                             line_number, match.groupdict()))
 128
 129           # Change the mode to the correct one for after this match.
 130           self.mode = matcher.result_mode or self.mode
 131
 132           # Shorten the string to be matched.
 133           index = match.end()
 134
 135           break
 136
 137       else:
 138         # If the for loop finishes naturally (i.e. no matches) we just add the
 139         # first character to the string of consecutive non match characters.
 140         # These will constitute a NORMAL token.
 141         if string:
 142           normal_token += string[index:index + 1]
 143           index += 1
 144
 145     if normal_token:
 146       self.__AddToken(
 147           self.__CreateNormalToken(self.mode, normal_token, line, line_number))
 148
 149   def __CreateNormalToken(self, mode, string, line, line_number):
 150     """Creates a normal token.
 151
 152     Args:
 153       mode: The current mode.
 154       string: The string to tokenize.
 155       line: The line of text.
 156       line_number: The line number within the file.
 157
 158     Returns:
 159       A Token object, of the default type for the current mode.
 160     """
 161     type = Type.NORMAL
 162     if mode in self.default_types:
 163       type = self.default_types[mode]
 164     return self._CreateToken(string, type, line, line_number)
 165
 166   def __AddToken(self, token):
 167     """Add the given token to the token stream.
 168
 169     Args:
 170       token: The token to add.
 171     """
 172     # Store the first token, or point the previous token to this one.
 173     if not self.__first_token:
 174       self.__first_token = token
 175     else:
 176       self.__last_token.next = token
 177
 178     # Establish the doubly linked list
 179     token.previous = self.__last_token
 180     self.__last_token = token
 181
 182     # Compute the character indices
 183     token.start_index = self.__start_index
 184     self.__start_index += token.length