STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
// Table of one-character tokens, by character (0x00..0x7f only).
-static byte one_char_tokens[] = {
+static const byte one_char_tokens[] = {
Token::ILLEGAL,
Token::ILLEGAL,
Token::ILLEGAL,
}
+// ----------------------------------------------------------------------------
+// Keyword Matcher
+
+#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
+ KEYWORD_GROUP('b') \
+ KEYWORD("break", BREAK) \
+ KEYWORD_GROUP('c') \
+ KEYWORD("case", CASE) \
+ KEYWORD("catch", CATCH) \
+ KEYWORD("class", FUTURE_RESERVED_WORD) \
+ KEYWORD("const", CONST) \
+ KEYWORD("continue", CONTINUE) \
+ KEYWORD_GROUP('d') \
+ KEYWORD("debugger", DEBUGGER) \
+ KEYWORD("default", DEFAULT) \
+ KEYWORD("delete", DELETE) \
+ KEYWORD("do", DO) \
+ KEYWORD_GROUP('e') \
+ KEYWORD("else", ELSE) \
+ KEYWORD("enum", FUTURE_RESERVED_WORD) \
+ KEYWORD("export", FUTURE_RESERVED_WORD) \
+ KEYWORD("extends", FUTURE_RESERVED_WORD) \
+ KEYWORD_GROUP('f') \
+ KEYWORD("false", FALSE_LITERAL) \
+ KEYWORD("finally", FINALLY) \
+ KEYWORD("for", FOR) \
+ KEYWORD("function", FUNCTION) \
+ KEYWORD_GROUP('i') \
+ KEYWORD("if", IF) \
+ KEYWORD("implements", FUTURE_STRICT_RESERVED_WORD) \
+ KEYWORD("import", FUTURE_RESERVED_WORD) \
+ KEYWORD("in", IN) \
+ KEYWORD("instanceof", INSTANCEOF) \
+ KEYWORD("interface", FUTURE_STRICT_RESERVED_WORD) \
+ KEYWORD_GROUP('l') \
+ KEYWORD("let", FUTURE_STRICT_RESERVED_WORD) \
+ KEYWORD_GROUP('n') \
+ KEYWORD("new", NEW) \
+ KEYWORD("null", NULL_LITERAL) \
+ KEYWORD_GROUP('p') \
+ KEYWORD("package", FUTURE_STRICT_RESERVED_WORD) \
+ KEYWORD("private", FUTURE_STRICT_RESERVED_WORD) \
+ KEYWORD("protected", FUTURE_STRICT_RESERVED_WORD) \
+ KEYWORD("public", FUTURE_STRICT_RESERVED_WORD) \
+ KEYWORD_GROUP('r') \
+ KEYWORD("return", RETURN) \
+ KEYWORD_GROUP('s') \
+ KEYWORD("static", FUTURE_STRICT_RESERVED_WORD) \
+ KEYWORD("super", FUTURE_RESERVED_WORD) \
+ KEYWORD("switch", SWITCH) \
+ KEYWORD_GROUP('t') \
+ KEYWORD("this", THIS) \
+ KEYWORD("throw", THROW) \
+ KEYWORD("true", TRUE_LITERAL) \
+ KEYWORD("try", TRY) \
+ KEYWORD("typeof", TYPEOF) \
+ KEYWORD_GROUP('v') \
+ KEYWORD("var", VAR) \
+ KEYWORD("void", VOID) \
+ KEYWORD_GROUP('w') \
+ KEYWORD("while", WHILE) \
+ KEYWORD("with", WITH) \
+ KEYWORD_GROUP('y') \
+ KEYWORD("yield", FUTURE_STRICT_RESERVED_WORD)
+
+
+static Token::Value KeywordOrIdentifierToken(const char* input,
+ int input_length) {
+ ASSERT(input_length >= 1);
+ const int kMinLength = 2;
+ const int kMaxLength = 10;
+ if (input_length < kMinLength || input_length > kMaxLength) {
+ return Token::IDENTIFIER;
+ }
+ switch (input[0]) {
+ default:
+#define KEYWORD_GROUP_CASE(ch) \
+ break; \
+ case ch:
+#define KEYWORD(keyword, token) \
+ { \
+ /* 'keyword' is a char array, so sizeof(keyword) is */ \
+ /* strlen(keyword) plus 1 for the NUL char. */ \
+ const int keyword_length = sizeof(keyword) - 1; \
+ STATIC_ASSERT(keyword_length >= kMinLength); \
+ STATIC_ASSERT(keyword_length <= kMaxLength); \
+ if (input_length == keyword_length && \
+ input[1] == keyword[1] && \
+ (keyword_length <= 2 || input[2] == keyword[2]) && \
+ (keyword_length <= 3 || input[3] == keyword[3]) && \
+ (keyword_length <= 4 || input[4] == keyword[4]) && \
+ (keyword_length <= 5 || input[5] == keyword[5]) && \
+ (keyword_length <= 6 || input[6] == keyword[6]) && \
+ (keyword_length <= 7 || input[7] == keyword[7]) && \
+ (keyword_length <= 8 || input[8] == keyword[8]) && \
+ (keyword_length <= 9 || input[9] == keyword[9])) { \
+ return Token::token; \
+ } \
+ }
+ KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
+ }
+ return Token::IDENTIFIER;
+}
+
+
Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
ASSERT(unicode_cache_->IsIdentifierStart(c0_));
LiteralScope literal(this);
- KeywordMatcher keyword_match;
// Scan identifier start character.
if (c0_ == '\\') {
uc32 c = ScanIdentifierUnicodeEscape();
uc32 first_char = c0_;
Advance();
AddLiteralChar(first_char);
- if (!keyword_match.AddChar(first_char)) {
- return ScanIdentifierSuffix(&literal);
- }
// Scan the rest of the identifier characters.
while (unicode_cache_->IsIdentifierPart(c0_)) {
uc32 next_char = c0_;
Advance();
AddLiteralChar(next_char);
- if (keyword_match.AddChar(next_char)) continue;
+ continue;
}
- // Fallthrough if no loner able to complete keyword.
+ // Fallthrough if no longer able to complete keyword.
return ScanIdentifierSuffix(&literal);
}
+
literal.Complete();
- return keyword_match.token();
+ if (next_.literal_chars->is_ascii()) {
+ Vector<const char> chars = next_.literal_chars->ascii_literal();
+ return KeywordOrIdentifierToken(chars.start(), chars.length());
+ }
+
+ return Token::IDENTIFIER;
}
return true;
}
-// ----------------------------------------------------------------------------
-// Keyword Matcher
-
-const KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
- { "break", KEYWORD_PREFIX, Token::BREAK },
- { NULL, C, Token::ILLEGAL },
- { NULL, D, Token::ILLEGAL },
- { NULL, E, Token::ILLEGAL },
- { NULL, F, Token::ILLEGAL },
- { NULL, UNMATCHABLE, Token::ILLEGAL },
- { NULL, UNMATCHABLE, Token::ILLEGAL },
- { NULL, I, Token::ILLEGAL },
- { NULL, UNMATCHABLE, Token::ILLEGAL },
- { NULL, UNMATCHABLE, Token::ILLEGAL },
- { "let", KEYWORD_PREFIX, Token::FUTURE_STRICT_RESERVED_WORD },
- { NULL, UNMATCHABLE, Token::ILLEGAL },
- { NULL, N, Token::ILLEGAL },
- { NULL, UNMATCHABLE, Token::ILLEGAL },
- { NULL, P, Token::ILLEGAL },
- { NULL, UNMATCHABLE, Token::ILLEGAL },
- { "return", KEYWORD_PREFIX, Token::RETURN },
- { NULL, S, Token::ILLEGAL },
- { NULL, T, Token::ILLEGAL },
- { NULL, UNMATCHABLE, Token::ILLEGAL },
- { NULL, V, Token::ILLEGAL },
- { NULL, W, Token::ILLEGAL },
- { NULL, UNMATCHABLE, Token::ILLEGAL },
- { "yield", KEYWORD_PREFIX, Token::FUTURE_STRICT_RESERVED_WORD }
-};
-
-
-void KeywordMatcher::Step(unibrow::uchar input) {
- switch (state_) {
- case INITIAL: {
- // matching the first character is the only state with significant fanout.
- // Match only lower-case letters in range 'b'..'y'.
- unsigned int offset = input - kFirstCharRangeMin;
- if (offset < kFirstCharRangeLength) {
- state_ = first_states_[offset].state;
- if (state_ == KEYWORD_PREFIX) {
- keyword_ = first_states_[offset].keyword;
- counter_ = 1;
- keyword_token_ = first_states_[offset].token;
- }
- return;
- }
- break;
- }
- case KEYWORD_PREFIX:
- if (static_cast<unibrow::uchar>(keyword_[counter_]) == input) {
- counter_++;
- if (keyword_[counter_] == '\0') {
- state_ = KEYWORD_MATCHED;
- token_ = keyword_token_;
- }
- return;
- }
- break;
- case KEYWORD_MATCHED:
- token_ = Token::IDENTIFIER;
- break;
- case C:
- if (MatchState(input, 'a', CA)) return;
- if (MatchKeywordStart(input, "class", 1,
- Token::FUTURE_RESERVED_WORD)) return;
- if (MatchState(input, 'o', CO)) return;
- break;
- case CA:
- if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
- if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
- break;
- case CO:
- if (MatchState(input, 'n', CON)) return;
- break;
- case CON:
- if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
- if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
- break;
- case D:
- if (MatchState(input, 'e', DE)) return;
- if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
- break;
- case DE:
- if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
- if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
- if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
- break;
- case E:
- if (MatchKeywordStart(input, "else", 1, Token::ELSE)) return;
- if (MatchKeywordStart(input, "enum", 1,
- Token::FUTURE_RESERVED_WORD)) return;
- if (MatchState(input, 'x', EX)) return;
- break;
- case EX:
- if (MatchKeywordStart(input, "export", 2,
- Token::FUTURE_RESERVED_WORD)) return;
- if (MatchKeywordStart(input, "extends", 2,
- Token::FUTURE_RESERVED_WORD)) return;
- break;
- case F:
- if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
- if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
- if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
- if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
- break;
- case I:
- if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
- if (MatchState(input, 'm', IM)) return;
- if (MatchKeyword(input, 'n', IN, Token::IN)) return;
- break;
- case IM:
- if (MatchState(input, 'p', IMP)) return;
- break;
- case IMP:
- if (MatchKeywordStart(input, "implements", 3,
- Token::FUTURE_STRICT_RESERVED_WORD )) return;
- if (MatchKeywordStart(input, "import", 3,
- Token::FUTURE_RESERVED_WORD)) return;
- break;
- case IN:
- token_ = Token::IDENTIFIER;
- if (MatchKeywordStart(input, "interface", 2,
- Token::FUTURE_STRICT_RESERVED_WORD)) return;
- if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) return;
- break;
- case N:
- if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
- if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
- break;
- case P:
- if (MatchKeywordStart(input, "package", 1,
- Token::FUTURE_STRICT_RESERVED_WORD)) return;
- if (MatchState(input, 'r', PR)) return;
- if (MatchKeywordStart(input, "public", 1,
- Token::FUTURE_STRICT_RESERVED_WORD)) return;
- break;
- case PR:
- if (MatchKeywordStart(input, "private", 2,
- Token::FUTURE_STRICT_RESERVED_WORD)) return;
- if (MatchKeywordStart(input, "protected", 2,
- Token::FUTURE_STRICT_RESERVED_WORD)) return;
- break;
- case S:
- if (MatchKeywordStart(input, "static", 1,
- Token::FUTURE_STRICT_RESERVED_WORD)) return;
- if (MatchKeywordStart(input, "super", 1,
- Token::FUTURE_RESERVED_WORD)) return;
- if (MatchKeywordStart(input, "switch", 1,
- Token::SWITCH)) return;
- break;
- case T:
- if (MatchState(input, 'h', TH)) return;
- if (MatchState(input, 'r', TR)) return;
- if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
- break;
- case TH:
- if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
- if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
- break;
- case TR:
- if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
- if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
- break;
- case V:
- if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
- if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
- break;
- case W:
- if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
- if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
- break;
- case UNMATCHABLE:
- break;
- }
- // On fallthrough, it's a failure.
- state_ = UNMATCHABLE;
-}
-
} } // namespace v8::internal
bool has_multiline_comment_before_next_;
};
-
-// ----------------------------------------------------------------------------
-// Keyword matching state machine.
-
-class KeywordMatcher {
-// Incrementally recognize keywords.
-//
-// We distinguish between normal future reserved words and words that are
-// considered to be future reserved words only in strict mode as required by
-// ECMA-262 7.6.1.2.
-//
-// Recognized as keywords:
-// break, case, catch, const*, continue, debugger, default, delete, do,
-// else, finally, false, for, function, if, in, instanceof, new, null,
-// return, switch, this, throw, true, try, typeof, var, void, while, with.
-//
-// Recognized as Future Reserved Keywords:
-// class, enum, export, extends, import, super.
-//
-// Recognized as Future Reserved Keywords (strict mode only):
-// implements, interface, let, package, private, protected, public,
-// static, yield.
-//
-// *: Actually a "future reserved keyword". It's the only one we are
-// recognizing outside of ES5 strict mode, the remaining are allowed
-// as identifiers.
-//
- public:
- KeywordMatcher()
- : state_(INITIAL),
- token_(Token::IDENTIFIER),
- keyword_(NULL),
- counter_(0),
- keyword_token_(Token::ILLEGAL) {}
-
- Token::Value token() { return token_; }
-
- inline bool AddChar(unibrow::uchar input) {
- if (state_ != UNMATCHABLE) {
- Step(input);
- }
- return state_ != UNMATCHABLE;
- }
-
- void Fail() {
- token_ = Token::IDENTIFIER;
- state_ = UNMATCHABLE;
- }
-
- private:
- enum State {
- UNMATCHABLE,
- INITIAL,
- KEYWORD_PREFIX,
- KEYWORD_MATCHED,
- C,
- CA,
- CO,
- CON,
- D,
- DE,
- E,
- EX,
- F,
- I,
- IM,
- IMP,
- IN,
- N,
- P,
- PR,
- S,
- T,
- TH,
- TR,
- V,
- W,
- LAST_STATE = W
- };
-
-
- STATIC_ASSERT(LAST_STATE <= 0xFF);
- STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
- struct FirstState {
- const char* keyword;
- State state : 8;
- Token::Value token : 8;
- };
-
- // Range of possible first characters of a keyword.
- static const unsigned int kFirstCharRangeMin = 'b';
- static const unsigned int kFirstCharRangeMax = 'y';
- static const unsigned int kFirstCharRangeLength =
- kFirstCharRangeMax - kFirstCharRangeMin + 1;
- // State map for first keyword character range.
- static const FirstState first_states_[kFirstCharRangeLength];
-
- // If input equals keyword's character at position, continue matching keyword
- // from that position.
- inline bool MatchKeywordStart(unibrow::uchar input,
- const char* keyword,
- int position,
- Token::Value token_if_match) {
- if (input != static_cast<unibrow::uchar>(keyword[position])) {
- return false;
- }
- state_ = KEYWORD_PREFIX;
- this->keyword_ = keyword;
- this->counter_ = position + 1;
- this->keyword_token_ = token_if_match;
- return true;
- }
-
- // If input equals match character, transition to new state and return true.
- inline bool MatchState(unibrow::uchar input, char match, State new_state) {
- if (input != static_cast<unibrow::uchar>(match)) {
- return false;
- }
- state_ = new_state;
- return true;
- }
-
- inline bool MatchKeyword(unibrow::uchar input,
- char match,
- State new_state,
- Token::Value keyword_token) {
- if (input != static_cast<unibrow::uchar>(match)) {
- return false;
- }
- state_ = new_state;
- token_ = keyword_token;
- return true;
- }
-
- void Step(unibrow::uchar input);
-
- // Current state.
- State state_;
- // Token for currently added characters.
- Token::Value token_;
-
- // Matching a specific keyword string (there is only one possible valid
- // keyword with the current prefix).
- const char* keyword_;
- int counter_;
- Token::Value keyword_token_;
-};
-
-
} } // namespace v8::internal
#endif // V8_SCANNER_BASE_H_
namespace i = ::v8::internal;
-TEST(KeywordMatcher) {
+TEST(ScanKeywords) {
struct KeywordToken {
const char* keyword;
i::Token::Value token;
static const KeywordToken keywords[] = {
#define KEYWORD(t, s, d) { s, i::Token::t },
-#define IGNORE(t, s, d) /* */
- TOKEN_LIST(IGNORE, KEYWORD, IGNORE)
+ TOKEN_LIST(IGNORE_TOKEN, KEYWORD)
#undef KEYWORD
{ NULL, i::Token::IDENTIFIER }
};
- static const char* future_keywords[] = {
-#define FUTURE(t, s, d) s,
- TOKEN_LIST(IGNORE, IGNORE, FUTURE)
-#undef FUTURE
-#undef IGNORE
- NULL
- };
-
KeywordToken key_token;
+ i::UnicodeCache unicode_cache;
+ i::byte buffer[32];
for (int i = 0; (key_token = keywords[i]).keyword != NULL; i++) {
- i::KeywordMatcher matcher;
- const char* keyword = key_token.keyword;
- int length = i::StrLength(keyword);
- for (int j = 0; j < length; j++) {
- if (key_token.token == i::Token::INSTANCEOF && j == 2) {
- // "in" is a prefix of "instanceof". It's the only keyword
- // that is a prefix of another.
- CHECK_EQ(i::Token::IN, matcher.token());
- } else {
- CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
- }
- matcher.AddChar(keyword[j]);
+ const i::byte* keyword =
+ reinterpret_cast<const i::byte*>(key_token.keyword);
+ int length = i::StrLength(key_token.keyword);
+ CHECK(static_cast<int>(sizeof(buffer)) >= length);
+ {
+ i::Utf8ToUC16CharacterStream stream(keyword, length);
+ i::JavaScriptScanner scanner(&unicode_cache);
+ scanner.Initialize(&stream);
+ CHECK_EQ(key_token.token, scanner.Next());
+ CHECK_EQ(i::Token::EOS, scanner.Next());
}
- CHECK_EQ(key_token.token, matcher.token());
- // Adding more characters will make keyword matching fail.
- matcher.AddChar('z');
- CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
- // Adding a keyword later will not make it match again.
- matcher.AddChar('i');
- matcher.AddChar('f');
- CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
- }
-
- // Future keywords are not recognized.
- const char* future_keyword;
- for (int i = 0; (future_keyword = future_keywords[i]) != NULL; i++) {
- i::KeywordMatcher matcher;
- int length = i::StrLength(future_keyword);
- for (int j = 0; j < length; j++) {
- matcher.AddChar(future_keyword[j]);
+ // Removing characters will make keyword matching fail.
+ {
+ i::Utf8ToUC16CharacterStream stream(keyword, length - 1);
+ i::JavaScriptScanner scanner(&unicode_cache);
+ scanner.Initialize(&stream);
+ CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
+ CHECK_EQ(i::Token::EOS, scanner.Next());
+ }
+ // Adding characters will make keyword matching fail.
+ static const char chars_to_append[] = { 'z', '0', '_' };
+ for (int j = 0; j < static_cast<int>(ARRAY_SIZE(chars_to_append)); ++j) {
+ memmove(buffer, keyword, length);
+ buffer[length] = chars_to_append[j];
+ i::Utf8ToUC16CharacterStream stream(buffer, length + 1);
+ i::JavaScriptScanner scanner(&unicode_cache);
+ scanner.Initialize(&stream);
+ CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
+ CHECK_EQ(i::Token::EOS, scanner.Next());
+ }
+ // Replacing characters will make keyword matching fail.
+ {
+ memmove(buffer, keyword, length);
+ buffer[length - 1] = '_';
+ i::Utf8ToUC16CharacterStream stream(buffer, length);
+ i::JavaScriptScanner scanner(&unicode_cache);
+ scanner.Initialize(&stream);
+ CHECK_EQ(i::Token::IDENTIFIER, scanner.Next());
+ CHECK_EQ(i::Token::EOS, scanner.Next());
}
- CHECK_EQ(i::Token::IDENTIFIER, matcher.token());
}
-
- // Zero isn't ignored at first.
- i::KeywordMatcher bad_start;
- bad_start.AddChar(0);
- CHECK_EQ(i::Token::IDENTIFIER, bad_start.token());
- bad_start.AddChar('i');
- bad_start.AddChar('f');
- CHECK_EQ(i::Token::IDENTIFIER, bad_start.token());
-
- // Zero isn't ignored at end.
- i::KeywordMatcher bad_end;
- bad_end.AddChar('i');
- bad_end.AddChar('f');
- CHECK_EQ(i::Token::IF, bad_end.token());
- bad_end.AddChar(0);
- CHECK_EQ(i::Token::IDENTIFIER, bad_end.token());
-
- // Case isn't ignored.
- i::KeywordMatcher bad_case;
- bad_case.AddChar('i');
- bad_case.AddChar('F');
- CHECK_EQ(i::Token::IDENTIFIER, bad_case.token());
-
- // If we mark it as failure, continuing won't help.
- i::KeywordMatcher full_stop;
- full_stop.AddChar('i');
- CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
- full_stop.Fail();
- CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
- full_stop.AddChar('f');
- CHECK_EQ(i::Token::IDENTIFIER, full_stop.token());
}