# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
18 """Tokenize C++ source code."""
20 __author__ = 'nnorwitz@google.com (Neal Norwitz)'
28 import __builtin__ as builtins
36 if not hasattr(builtins, 'set'):
37 # Nominal support for Python 2.3.
38 from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('0123456789eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)
65 """Data container to represent a C++ token.
67 Tokens can be identifiers, syntax char(s), constants, or
68 pre-processor directives.
70 start contains the index of the first char of the token in the source
71 end contains the index of the last char of the token in the source
74 def __init__(self, token_type, name, start, end):
75 self.token_type = token_type
79 self.whence = WHENCE_STREAM
83 return 'Token(%r)' % self.name
84 return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)
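

# A minimal usage sketch, added for illustration (not part of the original
# module; `_demo_token` is a name invented here).  A Token's name is meant
# to be the exact source[start:end] slice.
def _demo_token():
    tok = Token(NAME, 'foo', 0, 3)
    assert tok.whence == WHENCE_STREAM  # tokens always come from the stream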


def _GetString(source, start, i):
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other,
        # so this quote really terminates the string.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1
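

# Hedged sketch of _GetString's contract, added for illustration
# (`_demo_get_string` is a name invented here): given the index of the
# opening quote, it returns the index one past the closing quote, skipping
# quotes escaped by an odd run of backslashes.
def _demo_get_string():
    src = r'"a\"b";'
    # The quote at index 3 is escaped, so the scan continues to the real
    # terminator at index 5 and returns 6.
    assert _GetString(src, 0, 0) == 6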


def _GetChar(source, start, i):
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1
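

# Hedged sketch of _GetChar, added for illustration (`_demo_get_char` is a
# name invented here): it returns the index one past the closing single
# quote, handling both escaped quotes and the special-cased '\\' literal.
def _demo_get_char():
    assert _GetChar(r"'\n';", 0, 0) == 4   # escaped newline
    assert _GetChar(r"'\\';", 0, 0) == 4   # escaped backslash special case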


def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                # A string prefix (e.g. u8, LR) followed by a double quote
                # starts a string constant, not a name.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
211 elif c == '"': # Find string.
212 token_type = CONSTANT
213 i = _GetString(source, start, i)
214 elif c == "'": # Find char.
215 token_type = CONSTANT
216 i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                                condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled.  Since we will ignore
            # it anyway, this is probably fine.  So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
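

# Hedged usage sketch, added for illustration (`_demo_get_tokens` is a name
# invented here): tokenize a small snippet and check the resulting stream.
def _demo_get_tokens():
    tokens = list(GetTokens('int x = 0x1f;\n'))
    assert [t.name for t in tokens] == ['int', 'x', '=', '0x1f', ';']
    assert tokens[0].token_type == NAME       # identifier
    assert tokens[3].token_type == CONSTANT   # hex literal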


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')

    main(sys.argv)
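
# Example invocation, assuming this module lives at cpp/tokenize.py in the
# usual cppclean layout (the path is an assumption, not verified here):
#   python -m cpp.tokenize some_file.cc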