3 # Copyright 2007 The Closure Linter Authors. All Rights Reserved.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS-IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 """Regular expression based JavaScript parsing classes."""
19 __author__ = ('robbyw@google.com (Robert Walker)',
20 'ajp@google.com (Andy Perelson)')
import copy
import re

from closure_linter import javascripttokens
from closure_linter.common import matcher
from closure_linter.common import tokenizer
30 Type = javascripttokens.JavaScriptTokenType
31 Matcher = matcher.Matcher
class JavaScriptModes(object):
    """Enumeration of the different matcher modes used for JavaScript.

    Each attribute is a plain string naming one tokenizer mode.  The values
    are used as keys of the per-mode matcher table and as the "next mode"
    argument of individual matchers, so they must stay unique.
    """

    # Mode for ordinary code.  It is referenced as JavaScriptModes.TEXT_MODE
    # by the matcher tables and by the tokenizer constructor, so it has to be
    # defined here.
    # NOTE(review): the value 'text' follows the naming pattern of the sibling
    # constants -- confirm against the upstream definition.
    TEXT_MODE = 'text'

    # Inside a '...' or "..." string literal.
    SINGLE_QUOTE_STRING_MODE = 'single_quote_string'
    DOUBLE_QUOTE_STRING_MODE = 'double_quote_string'

    # Inside /* ... */, /** ... */ and // comments respectively.
    BLOCK_COMMENT_MODE = 'block_comment'
    DOC_COMMENT_MODE = 'doc_comment'
    # Doc comment variant in which whitespace is tokenized separately.
    DOC_COMMENT_LEX_SPACES_MODE = 'doc_comment_spaces'
    LINE_COMMENT_MODE = 'line_comment'

    # Between the parentheses of a function's parameter list, and directly
    # after the 'function' keyword.
    PARAMETER_MODE = 'parameter'
    FUNCTION_MODE = 'function'
47 class JavaScriptTokenizer(tokenizer.Tokenizer):
48 """JavaScript tokenizer.
50 Convert JavaScript code into an array of tokens.
53 # Useful patterns for JavaScript parsing.
54 IDENTIFIER_CHAR = r'A-Za-z0-9_$.'
56 # Number patterns based on:
57 # http://www.mozilla.org/js/language/js20-2000-07/formal/lexer-grammar.html
59 (\d+(?!\.)) | # Matches '10'
60 (\d+\.(?!\d)) | # Matches '10.'
61 (\d*\.\d+) # Matches '.5' or '10.5'
63 DECIMAL_LITERAL = r'(%s)([eE][-+]?\d+)?' % MANTISSA
64 HEX_LITERAL = r'0[xX][0-9a-fA-F]+'
65 NUMBER = re.compile(r"""
67 """ % (HEX_LITERAL, DECIMAL_LITERAL), re.VERBOSE)
69 # Strings come in three parts - first we match the start of the string, then
70 # the contents, then the end. The contents consist of any character except a
71 # backslash or end of string, or a backslash followed by any character, or a
72 # backslash followed by end of line to support correct parsing of multi-line
74 SINGLE_QUOTE = re.compile(r"'")
75 SINGLE_QUOTE_TEXT = re.compile(r"([^'\\]|\\(.|$))+")
76 DOUBLE_QUOTE = re.compile(r'"')
77 DOUBLE_QUOTE_TEXT = re.compile(r'([^"\\]|\\(.|$))+')
79 START_SINGLE_LINE_COMMENT = re.compile(r'//')
80 END_OF_LINE_SINGLE_LINE_COMMENT = re.compile(r'//$')
82 START_DOC_COMMENT = re.compile(r'/\*\*')
83 START_BLOCK_COMMENT = re.compile(r'/\*')
84 END_BLOCK_COMMENT = re.compile(r'\*/')
85 BLOCK_COMMENT_TEXT = re.compile(r'([^*]|\*(?!/))+')
87 # Comment text is anything that we are not going to parse into another special
88 # token like (inline) flags or end comments. Complicated regex to match
89 # most normal characters, and '*', '{', '}', and '@' when we are sure that
90 # it is safe. Expression [^*{}\s]@ must come first, or the other options will
91 # match everything before @, and we won't match @'s that aren't part of flags
92 # like in email addresses in the @author tag.
93 DOC_COMMENT_TEXT = re.compile(r'([^*{}\s]@|[^*{}@]|\*(?!/))+')
94 DOC_COMMENT_NO_SPACES_TEXT = re.compile(r'([^*{}\s]@|[^*{}@\s]|\*(?!/))+')
96 # Match the prefix ' * ' that starts every line of jsdoc. Want to include
97 # spaces after the '*', but nothing else that occurs after a '*', and don't
98 # want to match the '*' in '*/'.
99 DOC_PREFIX = re.compile(r'\s*\*(\s+|(?!/))')
101 START_BLOCK = re.compile('{')
102 END_BLOCK = re.compile('}')
104 REGEX_CHARACTER_CLASS = r"""
106 ([^\]\\]|\\.)* # Anything but a ] or \,
107 # or a backslash followed by anything
110 # We ensure the regex is followed by one of the above tokens to avoid
111 # incorrectly parsing something like x / y / z as x REGEX(/ y /) z
113 ';', ',', r'\.', r'\)', r'\]', '$', r'\/\/', r'\/\*', ':', '}']
115 REGEX = re.compile(r"""
117 (?!\*) # not the start of a comment
118 (\\.|[^\[\/\\]|(%s))* # a backslash followed by anything,
119 # or anything but a / or [ or \,
120 # or a character class
122 [gimsx]* # optional modifiers
124 """ % (REGEX_CHARACTER_CLASS, '|'.join(POST_REGEX_LIST)),
127 ANYTHING = re.compile(r'.*')
128 PARAMETERS = re.compile(r'[^\)]+')
129 CLOSING_PAREN_WITH_SPACE = re.compile(r'\)\s*')
131 FUNCTION_DECLARATION = re.compile(r'\bfunction\b')
133 OPENING_PAREN = re.compile(r'\(')
134 CLOSING_PAREN = re.compile(r'\)')
136 OPENING_BRACKET = re.compile(r'\[')
137 CLOSING_BRACKET = re.compile(r'\]')
139 # We omit these JS keywords from the list:
140 # function - covered by FUNCTION_DECLARATION.
141 # delete, in, instanceof, new, typeof - included as operators.
142 # this - included in identifiers.
143 # null, undefined - not included, should go in some "special constant" list.
144 KEYWORD_LIST = ['break', 'case', 'catch', 'continue', 'default', 'do', 'else',
145 'finally', 'for', 'if', 'return', 'switch', 'throw', 'try', 'var',
147 # Match a keyword string followed by a non-identifier character in order to
148 # not match something like doSomething as do + Something.
149 KEYWORD = re.compile('(%s)((?=[^%s])|$)' % (
150 '|'.join(KEYWORD_LIST), IDENTIFIER_CHAR))
# List of regular expressions to match as operators.  Some notes: for our
# purposes, the comma behaves similarly enough to a normal operator that we
# include it here.  r'\bin\b' actually matches 'in' surrounded by boundary
# characters - this may not match some very esoteric uses of the in operator.
# Operators that are subsets of larger operators must come later in this list
# for proper matching, e.g., '>>' must come AFTER '>>>'.
#
# Every entry containing a backslash is a raw string so that the escape
# reaches the regex engine untouched; non-raw '\^=' and '\?' are invalid
# Python escape sequences and trigger a warning on modern interpreters.
#
# NOTE(review): the list has '\^=' but no entry for a bare '^' or for '~' --
# confirm whether those operators are intentionally left out.
OPERATOR_LIST = [
    ',', r'\+\+', '===', '!==', '>>>=', '>>>', '==', '>=', '<=',
    '!=', '<<=', '>>=', '<<', '>>', '>', '<', r'\+=', r'\+',
    '--', r'\^=', '-=', '-', '/=', '/', r'\*=', r'\*', '%=', '%',
    '&&', r'\|\|', '&=', '&', r'\|=', r'\|', '=', '!', ':', r'\?',
    r'\bdelete\b', r'\bin\b', r'\binstanceof\b', r'\bnew\b',
    r'\btypeof\b', r'\bvoid\b']
# Alternation of all operators; order in OPERATOR_LIST decides which
# alternative wins when several could match at the same position.
OPERATOR = re.compile('|'.join(OPERATOR_LIST))
166 WHITESPACE = re.compile(r'\s+')
167 SEMICOLON = re.compile(r';')
168 # Technically JavaScript identifiers can't contain '.', but we treat a set of
169 # nested identifiers as a single identifier.
170 NESTED_IDENTIFIER = r'[a-zA-Z_$][%s.]*' % IDENTIFIER_CHAR
171 IDENTIFIER = re.compile(NESTED_IDENTIFIER)
173 SIMPLE_LVALUE = re.compile(r"""
174 (?P<identifier>%s) # a valid identifier
175 (?=\s* # optional whitespace
176 \= # look ahead to equal sign
177 (?!=)) # not follwed by equal
178 """ % NESTED_IDENTIFIER, re.VERBOSE)
180 # A doc flag is a @ sign followed by non-space characters that appears at the
181 # beginning of the line, after whitespace, or after a '{'. The look-behind
182 # check is necessary to not match someone@google.com as a flag.
183 DOC_FLAG = re.compile(r'(^|(?<=\s))@(?P<name>[a-zA-Z]+)')
184 # To properly parse parameter names, we need to tokenize whitespace into a
186 DOC_FLAG_LEX_SPACES = re.compile(r'(^|(?<=\s))@(?P<name>%s)\b' %
189 DOC_INLINE_FLAG = re.compile(r'(?<={)@(?P<name>[a-zA-Z]+)')
191 # Star followed by non-slash, i.e. a star that does not end a comment.
192 # This is used for TYPE_GROUP below.
193 SAFE_STAR = r'(\*(?!/))'
195 COMMON_DOC_MATCHERS = [
196 # Find the end of the comment.
197 Matcher(END_BLOCK_COMMENT, Type.END_DOC_COMMENT,
198 JavaScriptModes.TEXT_MODE),
200 # Tokenize documented flags like @private.
201 Matcher(DOC_INLINE_FLAG, Type.DOC_INLINE_FLAG),
202 Matcher(DOC_FLAG_LEX_SPACES, Type.DOC_FLAG,
203 JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE),
205 # Encountering a doc flag should leave lex spaces mode.
206 Matcher(DOC_FLAG, Type.DOC_FLAG, JavaScriptModes.DOC_COMMENT_MODE),
208 # Tokenize braces so we can find types.
209 Matcher(START_BLOCK, Type.DOC_START_BRACE),
210 Matcher(END_BLOCK, Type.DOC_END_BRACE),
211 Matcher(DOC_PREFIX, Type.DOC_PREFIX, None, True)]
214 # The token matcher groups work as follows: it is a list of Matcher objects.
215 # The matchers will be tried in this order, and the first to match will be
216 # returned. Hence the order is important because the matchers that come first
217 # overrule the matchers that come later.
218 JAVASCRIPT_MATCHERS = {
219 # Matchers for basic text mode.
220 JavaScriptModes.TEXT_MODE: [
221 # Check a big group - strings, starting comments, and regexes - all
222 # of which could be intertwined. 'string with /regex/',
223 # /regex with 'string'/, /* comment with /regex/ and string */ (and so
225 Matcher(START_DOC_COMMENT, Type.START_DOC_COMMENT,
226 JavaScriptModes.DOC_COMMENT_MODE),
227 Matcher(START_BLOCK_COMMENT, Type.START_BLOCK_COMMENT,
228 JavaScriptModes.BLOCK_COMMENT_MODE),
229 Matcher(END_OF_LINE_SINGLE_LINE_COMMENT,
230 Type.START_SINGLE_LINE_COMMENT),
231 Matcher(START_SINGLE_LINE_COMMENT, Type.START_SINGLE_LINE_COMMENT,
232 JavaScriptModes.LINE_COMMENT_MODE),
233 Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_START,
234 JavaScriptModes.SINGLE_QUOTE_STRING_MODE),
235 Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_START,
236 JavaScriptModes.DOUBLE_QUOTE_STRING_MODE),
237 Matcher(REGEX, Type.REGEX),
239 # Next we check for start blocks appearing outside any of the items
241 Matcher(START_BLOCK, Type.START_BLOCK),
242 Matcher(END_BLOCK, Type.END_BLOCK),
244 # Then we search for function declarations.
245 Matcher(FUNCTION_DECLARATION, Type.FUNCTION_DECLARATION,
246 JavaScriptModes.FUNCTION_MODE),
248 # Next, we convert non-function related parens to tokens.
249 Matcher(OPENING_PAREN, Type.START_PAREN),
250 Matcher(CLOSING_PAREN, Type.END_PAREN),
252 # Next, we convert brackets to tokens.
253 Matcher(OPENING_BRACKET, Type.START_BRACKET),
254 Matcher(CLOSING_BRACKET, Type.END_BRACKET),
256 # Find numbers. This has to happen before operators because scientific
257 # notation numbers can have + and - in them.
258 Matcher(NUMBER, Type.NUMBER),
260 # Find operators and simple assignments
261 Matcher(SIMPLE_LVALUE, Type.SIMPLE_LVALUE),
262 Matcher(OPERATOR, Type.OPERATOR),
264 # Find key words and whitespace.
265 Matcher(KEYWORD, Type.KEYWORD),
266 Matcher(WHITESPACE, Type.WHITESPACE),
269 Matcher(IDENTIFIER, Type.IDENTIFIER),
271 # Finally, we convert semicolons to tokens.
272 Matcher(SEMICOLON, Type.SEMICOLON)],
274 # Matchers for single quote strings.
275 JavaScriptModes.SINGLE_QUOTE_STRING_MODE: [
276 Matcher(SINGLE_QUOTE_TEXT, Type.STRING_TEXT),
277 Matcher(SINGLE_QUOTE, Type.SINGLE_QUOTE_STRING_END,
278 JavaScriptModes.TEXT_MODE)],
280 # Matchers for double quote strings.
281 JavaScriptModes.DOUBLE_QUOTE_STRING_MODE: [
282 Matcher(DOUBLE_QUOTE_TEXT, Type.STRING_TEXT),
283 Matcher(DOUBLE_QUOTE, Type.DOUBLE_QUOTE_STRING_END,
284 JavaScriptModes.TEXT_MODE)],
286 # Matchers for block comments.
287 JavaScriptModes.BLOCK_COMMENT_MODE: [
288 # First we check for exiting a block comment.
289 Matcher(END_BLOCK_COMMENT, Type.END_BLOCK_COMMENT,
290 JavaScriptModes.TEXT_MODE),
292 # Match non-comment-ending text..
293 Matcher(BLOCK_COMMENT_TEXT, Type.COMMENT)],
295 # Matchers for doc comments.
296 JavaScriptModes.DOC_COMMENT_MODE: COMMON_DOC_MATCHERS + [
297 Matcher(DOC_COMMENT_TEXT, Type.COMMENT)],
299 JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: COMMON_DOC_MATCHERS + [
300 Matcher(WHITESPACE, Type.COMMENT),
301 Matcher(DOC_COMMENT_NO_SPACES_TEXT, Type.COMMENT)],
303 # Matchers for single line comments.
304 JavaScriptModes.LINE_COMMENT_MODE: [
305 # We greedy match until the end of the line in line comment mode.
306 Matcher(ANYTHING, Type.COMMENT, JavaScriptModes.TEXT_MODE)],
308 # Matchers for code after the function keyword.
309 JavaScriptModes.FUNCTION_MODE: [
310 # Must match open paren before anything else and move into parameter
311 # mode, otherwise everything inside the parameter list is parsed
313 Matcher(OPENING_PAREN, Type.START_PARAMETERS,
314 JavaScriptModes.PARAMETER_MODE),
315 Matcher(WHITESPACE, Type.WHITESPACE),
316 Matcher(IDENTIFIER, Type.FUNCTION_NAME)],
318 # Matchers for function parameters
319 JavaScriptModes.PARAMETER_MODE: [
320 # When in function parameter mode, a closing paren is treated specially.
321 # Everything else is treated as lines of parameters.
322 Matcher(CLOSING_PAREN_WITH_SPACE, Type.END_PARAMETERS,
323 JavaScriptModes.TEXT_MODE),
324 Matcher(PARAMETERS, Type.PARAMETERS, JavaScriptModes.PARAMETER_MODE)]}
326 # When text is not matched, it is given this default type based on mode.
327 # If unspecified in this map, the default default is Type.NORMAL.
328 JAVASCRIPT_DEFAULT_TYPES = {
329 JavaScriptModes.DOC_COMMENT_MODE: Type.COMMENT,
330 JavaScriptModes.DOC_COMMENT_LEX_SPACES_MODE: Type.COMMENT
def __init__(self, parse_js_doc=True):
    """Create a tokenizer object.

    Args:
      parse_js_doc: Whether to do detailed parsing of javascript doc comments,
          or simply treat them as normal comments.  Defaults to parsing JsDoc.
    """
    matchers = self.JAVASCRIPT_MATCHERS
    if not parse_js_doc:
        # Make a copy so the original doesn't get modified: the table is a
        # class attribute shared by every tokenizer instance.
        matchers = copy.deepcopy(matchers)
        # Treat doc comments like plain block comments by reusing the block
        # comment matchers for the doc comment mode.
        matchers[JavaScriptModes.DOC_COMMENT_MODE] = matchers[
            JavaScriptModes.BLOCK_COMMENT_MODE]

    # Tokenizing always starts in text mode.
    tokenizer.Tokenizer.__init__(self, JavaScriptModes.TEXT_MODE, matchers,
                                 self.JAVASCRIPT_DEFAULT_TYPES)
350 def _CreateToken(self, string, token_type, line, line_number, values=None):
351 """Creates a new JavaScriptToken object.
354 string: The string of input the token contains.
355 token_type: The type of token.
356 line: The text of the line this token is in.
357 line_number: The line number of the token.
358 values: A dict of named values within the token. For instance, a
359 function declaration may have a value called 'name' which captures the
360 name of the function.
362 return javascripttokens.JavaScriptToken(string, token_type, line,