From 43cddf412d3f6a53b2599add1828b05b65e3da70 Mon Sep 17 00:00:00 2001 From: "kasperl@chromium.org" Date: Wed, 13 May 2009 13:40:02 +0000 Subject: [PATCH] Optimize the lexical scanner by selective inlining, and by dealing with whitespace as part of the token scanning instead of as a separate step before it. Review URL: http://codereview.chromium.org/113336 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@1934 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/scanner.cc | 458 +++++++++++++++++++++++++++++-------------------- src/scanner.h | 32 +++- src/token.h | 2 +- 3 files changed, 296 insertions(+), 196 deletions(-) diff --git a/src/scanner.cc b/src/scanner.cc index 65ec0f8b7..aab3ca342 100644 --- a/src/scanner.cc +++ b/src/scanner.cc @@ -48,8 +48,12 @@ StaticResource Scanner::utf8_decoder_; // ---------------------------------------------------------------------------- // UTF8Buffer -UTF8Buffer::UTF8Buffer() : data_(NULL) { - Initialize(NULL, 0); +UTF8Buffer::UTF8Buffer() { + static const int kInitialCapacity = 1 * KB; + data_ = NewArray(kInitialCapacity); + limit_ = ComputeLimit(data_, kInitialCapacity); + Reset(); + ASSERT(Capacity() == kInitialCapacity && pos() == 0); } @@ -58,33 +62,27 @@ UTF8Buffer::~UTF8Buffer() { } -void UTF8Buffer::Initialize(char* src, int length) { - DeleteArray(data_); - data_ = src; - size_ = length; - Reset(); -} - - -void UTF8Buffer::AddChar(uc32 c) { - const int min_size = 1024; - if (pos_ + static_cast(unibrow::Utf8::kMaxEncodedSize) > size_) { - int new_size = size_ * 2; - if (new_size < min_size) { - new_size = min_size; - } - char* new_data = NewArray(new_size); - memcpy(new_data, data_, pos_); +void UTF8Buffer::AddCharSlow(uc32 c) { + static const int kCapacityGrowthLimit = 1 * MB; + if (cursor_ > limit_) { + int old_capacity = Capacity(); + int old_position = pos(); + int new_capacity = + Min(old_capacity * 2, old_capacity + kCapacityGrowthLimit); + char* new_data = NewArray(new_capacity); + memcpy(new_data, data_, old_position); DeleteArray(data_); data_ = new_data; - size_ = new_size; + cursor_ = new_data + old_position; + limit_ = ComputeLimit(new_data, new_capacity); + ASSERT(Capacity() == new_capacity && pos() == old_position); } - if (static_cast(c) < unibrow::Utf8::kMaxOneByteChar) { - data_[pos_++] = c; // common case: 7bit ASCII + if (static_cast(c) <= unibrow::Utf8::kMaxOneByteChar) { + *cursor_++ = c; // Common case: 7-bit ASCII. } else { - pos_ += unibrow::Utf8::Encode(&data_[pos_], c); + cursor_ += unibrow::Utf8::Encode(cursor_, c); } - ASSERT(pos_ <= size_); + ASSERT(pos() <= Capacity()); } @@ -172,9 +170,10 @@ void Scanner::Init(Handle source, unibrow::CharacterStream* stream, ASSERT(kCharacterLookaheadBufferSize == 1); Advance(); - // Skip initial whitespace (allowing HTML comment ends) and scan - // first token. - SkipWhiteSpace(true); + // Skip initial whitespace allowing HTML comment ends just like + // after a newline and scan first token. + has_line_terminator_before_next_ = true; + SkipWhiteSpace(); Scan(); } @@ -246,18 +245,19 @@ static inline bool IsByteOrderMark(uc32 c) { } -void Scanner::SkipWhiteSpace(bool initial) { - has_line_terminator_before_next_ = initial; +bool Scanner::SkipWhiteSpace() { + int start_position = source_pos(); while (true) { // We treat byte-order marks (BOMs) as whitespace for better // compatibility with Spidermonkey and other JavaScript engines. while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) { // IsWhiteSpace() includes line terminators! - if (kIsLineTerminator.get(c0_)) + if (kIsLineTerminator.get(c0_)) { // Ignore line terminators, but remember them. This is necessary // for automatic semicolon insertion. has_line_terminator_before_next_ = true; + } Advance(); } @@ -279,7 +279,8 @@ void Scanner::SkipWhiteSpace(bool initial) { } PushBack('-'); // undo Advance() } - return; + // Return whether or not we skipped any characters. + return source_pos() != start_position; } } @@ -296,7 +297,7 @@ Token::Value Scanner::SkipSingleLineComment() { Advance(); } - return Token::COMMENT; + return Token::WHITESPACE; } @@ -316,7 +317,7 @@ Token::Value Scanner::SkipMultiLineComment() { // matches the behaviour of SpiderMonkey and KJS. if (ch == '*' && c0_ == '/') { c0_ = ' '; - return Token::COMMENT; + return Token::WHITESPACE; } } @@ -342,18 +343,238 @@ Token::Value Scanner::ScanHtmlComment() { void Scanner::Scan() { Token::Value token; - bool has_line_terminator = false; + has_line_terminator_before_next_ = false; do { - SkipWhiteSpace(has_line_terminator); - - // Remember the line terminator in previous loop - has_line_terminator = has_line_terminator_before_next(); - // Remember the position of the next token next_.location.beg_pos = source_pos(); - token = ScanToken(); - } while (token == Token::COMMENT); + switch (c0_) { + case ' ': + case '\t': + Advance(); + token = Token::WHITESPACE; + break; + + case '\n': + Advance(); + has_line_terminator_before_next_ = true; + token = Token::WHITESPACE; + break; + + case '"': case '\'': + token = ScanString(); + break; + + case '<': + // < <= << <<= -= + Advance(); + if (c0_ == '-') { + Advance(); + if (c0_ == '>' && has_line_terminator_before_next_) { + // For compatibility with SpiderMonkey, we skip lines that + // start with an HTML comment end '-->'. + token = SkipSingleLineComment(); + } else { + token = Token::DEC; + } + } else if (c0_ == '=') { + token = Select(Token::ASSIGN_SUB); + } else { + token = Token::SUB; + } + break; + + case '*': + // * *= + token = Select('=', Token::ASSIGN_MUL, Token::MUL); + break; + + case '%': + // % %= + token = Select('=', Token::ASSIGN_MOD, Token::MOD); + break; + + case '/': + // / // /* /= + Advance(); + if (c0_ == '/') { + token = SkipSingleLineComment(); + } else if (c0_ == '*') { + token = SkipMultiLineComment(); + } else if (c0_ == '=') { + token = Select(Token::ASSIGN_DIV); + } else { + token = Token::DIV; + } + break; + + case '&': + // & && &= + Advance(); + if (c0_ == '&') { + token = Select(Token::AND); + } else if (c0_ == '=') { + token = Select(Token::ASSIGN_BIT_AND); + } else { + token = Token::BIT_AND; + } + break; + + case '|': + // | || |= + Advance(); + if (c0_ == '|') { + token = Select(Token::OR); + } else if (c0_ == '=') { + token = Select(Token::ASSIGN_BIT_OR); + } else { + token = Token::BIT_OR; + } + break; + + case '^': + // ^ ^= + token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); + break; + + case '.': + // . Number + Advance(); + if (IsDecimalDigit(c0_)) { + token = ScanNumber(true); + } else { + token = Token::PERIOD; + } + break; + + case ':': + token = Select(Token::COLON); + break; + + case ';': + token = Select(Token::SEMICOLON); + break; + + case ',': + token = Select(Token::COMMA); + break; + + case '(': + token = Select(Token::LPAREN); + break; + + case ')': + token = Select(Token::RPAREN); + break; + + case '[': + token = Select(Token::LBRACK); + break; + + case ']': + token = Select(Token::RBRACK); + break; + + case '{': + token = Select(Token::LBRACE); + break; + + case '}': + token = Select(Token::RBRACE); + break; + + case '?': + token = Select(Token::CONDITIONAL); + break; + + case '~': + token = Select(Token::BIT_NOT); + break; + + default: + if (kIsIdentifierStart.get(c0_)) { + token = ScanIdentifier(); + } else if (IsDecimalDigit(c0_)) { + token = ScanNumber(false); + } else if (SkipWhiteSpace()) { + token = Token::WHITESPACE; + } else if (c0_ < 0) { + token = Token::EOS; + } else { + token = Select(Token::ILLEGAL); + } + break; + } + + // Continue scanning for tokens as long as we're just skipping + // whitespace. + } while (token == Token::WHITESPACE); next_.location.end_pos = source_pos(); next_.token = token; @@ -495,147 +716,6 @@ Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) { } -Token::Value Scanner::ScanToken() { - switch (c0_) { - // strings - case '"': case '\'': - return ScanString(); - - case '<': - // < <= << <<=