Rename scanner.* to scanner-character-streams.* and scanner-base.* to scanner.*

author lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Thu, 8 Sep 2011 13:06:44 +0000 (13:06 +0000)

committer lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Thu, 8 Sep 2011 13:06:44 +0000 (13:06 +0000)
author lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Thu, 8 Sep 2011 13:06:44 +0000 (13:06 +0000)
committer lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Thu, 8 Sep 2011 13:06:44 +0000 (13:06 +0000)
diff --git a/src/SConscript b/src/SConscript

index 070827dd125765fe273a5a6cee38019a8c4bc382..52607f15c5e4d9dd4e640af12bc7d28af4b5ddf2 100644 (file)
--- a/src/SConscript
+++ b/src/SConscript
@@ -111,8 +111,8 @@ SOURCES = {
      runtime.cc
      runtime-profiler.cc
      safepoint-table.cc
-    scanner-base.cc
      scanner.cc
+    scanner-character-streams.cc
      scopeinfo.cc
      scopes.cc
      serialize.cc
@@ -245,7 +245,7 @@ PREPARSER_SOURCES = {
      preparse-data.cc
      preparser.cc
      preparser-api.cc
-    scanner-base.cc
+    scanner.cc
      strtod.cc
      token.cc
      unicode.cc
diff --git a/src/api.cc b/src/api.cc

index 0207f51eb428f19dced5e0c519472636424ed777..26558c42924954037831b23aa2f5de26f3fb9df1 100644 (file)
--- a/src/api.cc
+++ b/src/api.cc
@@ -44,6 +44,7 @@
  #include "platform.h"
  #include "profile-generator-inl.h"
  #include "runtime-profiler.h"
+#include "scanner-character-streams.h"
  #include "serialize.h"
  #include "snapshot.h"
  #include "v8threads.h"
diff --git a/src/compiler.cc b/src/compiler.cc

index c7e78067cfebce5161baaaf18ecc4520e33cfa92..ba6bb42bfa2862ad21932d47d1e82ea7b3b60f00 100644 (file)
--- a/src/compiler.cc
+++ b/src/compiler.cc
@@ -41,6 +41,7 @@
  #include "parser.h"
  #include "rewriter.h"
  #include "runtime-profiler.h"
+#include "scanner-character-streams.h"
  #include "scopeinfo.h"
  #include "scopes.h"
  #include "vm-state-inl.h"
diff --git a/src/conversions-inl.h b/src/conversions-inl.h

index a3c2c160f77da19cb3fc0dbdb89e759a7e7fe51f..41cf0d54c2196e4ca7f8cf33984b3a323b26aac0 100644 (file)
--- a/src/conversions-inl.h
+++ b/src/conversions-inl.h
@@ -38,9 +38,10 @@
  // Extra POSIX/ANSI functions for Win32/MSVC.
  
  #include "conversions.h"
-#include "strtod.h"
-#include "platform.h"
  #include "double.h"
+#include "platform.h"
+#include "scanner.h"
+#include "strtod.h"
  
  namespace v8 {
  namespace internal {
diff --git a/src/conversions.cc b/src/conversions.cc

index b634ba799c582fb0cff966adaa9819ec59f08c56..5bfddd04c01c025b3de2caf9cafbcbb0c6f8b549 100644 (file)
--- a/src/conversions.cc
+++ b/src/conversions.cc
@@ -31,7 +31,6 @@
  
  #include "conversions-inl.h"
  #include "dtoa.h"
-#include "scanner-base.h"
  #include "strtod.h"
  #include "utils.h"
  
diff --git a/src/conversions.h b/src/conversions.h

index 632419fab2a52ee93fb699ff3fc1413d0117c071..e51ad6501cb7628e1f60eeec608bdf6d574a426d 100644 (file)
--- a/src/conversions.h
+++ b/src/conversions.h
@@ -30,11 +30,13 @@
  
  #include <limits>
  
-#include "scanner-base.h"
+#include "utils.h"
  
  namespace v8 {
  namespace internal {
  
+class UnicodeCache;
+
  // Maximum number of significant digits in decimal representation.
  // The longest possible double in decimal representation is
  // (2^53 - 1) * 2 ^ -1074 that is (2 ^ 53 - 1) * 5 ^ 1074 / 10 ^ 1074
diff --git a/src/dateparser.h b/src/dateparser.h

index 4bd320e901d585c4df3c7763f600dc01505d38cb..4777e35f66c1fe3f1e46df3798aef43809bdaa4c 100644 (file)
--- a/src/dateparser.h
+++ b/src/dateparser.h
@@ -30,7 +30,6 @@
  
  #include "allocation.h"
  #include "char-predicates-inl.h"
-#include "scanner-base.h"
  
  namespace v8 {
  namespace internal {
diff --git a/src/heap.cc b/src/heap.cc

index 971a8e793860d964284f3eeac7527293123c3f3a..0ca138f337f8502f023ecda9d9328849cb131d02 100644 (file)
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -41,7 +41,6 @@
  #include "natives.h"
  #include "objects-visiting.h"
  #include "runtime-profiler.h"
-#include "scanner-base.h"
  #include "scopeinfo.h"
  #include "snapshot.h"
  #include "v8threads.h"
diff --git a/src/isolate.cc b/src/isolate.cc

index d36690297782ae5367d84d6260ed28b4c61cb5d8..afb9624875cc2756c0edb7f25d1c84138870bac2 100644 (file)
--- a/src/isolate.cc
+++ b/src/isolate.cc
@@ -43,7 +43,6 @@
  #include "messages.h"
  #include "regexp-stack.h"
  #include "runtime-profiler.h"
-#include "scanner.h"
  #include "scopeinfo.h"
  #include "serialize.h"
  #include "simulator.h"
diff --git a/src/objects.cc b/src/objects.cc

index 76b57d86aa2cf3b57b9665c82f555ea28e315ebd..00ea4f23db502aefcb91454502c3293c4a47d9e2 100644 (file)
--- a/src/objects.cc
+++ b/src/objects.cc
@@ -41,7 +41,6 @@
  #include "objects-visiting.h"
  #include "macro-assembler.h"
  #include "safepoint-table.h"
-#include "scanner-base.h"
  #include "string-stream.h"
  #include "utils.h"
  #include "vm-state-inl.h"
diff --git a/src/parser.cc b/src/parser.cc

index 31c5dc818cabe629255203e7a661cd3f4e1a2897..d64e7b7600ea3349f1bda03b6b9321f6aefde6bb 100644 (file)
--- a/src/parser.cc
+++ b/src/parser.cc
@@ -39,6 +39,7 @@
  #include "platform.h"
  #include "preparser.h"
  #include "runtime.h"
+#include "scanner-character-streams.h"
  #include "scopeinfo.h"
  #include "string-stream.h"
  
diff --git a/src/parser.h b/src/parser.h

index 381ff27143b4faf7cc028d7952e5471e0fa85e8e..3312f2f56a85de7c66c0d7db34768157eeb8752a 100644 (file)
--- a/src/parser.h
+++ b/src/parser.h
@@ -30,10 +30,9 @@
  
  #include "allocation.h"
  #include "ast.h"
-#include "scanner.h"
-#include "scopes.h"
  #include "preparse-data-format.h"
  #include "preparse-data.h"
+#include "scopes.h"
  
  namespace v8 {
  namespace internal {
diff --git a/src/preparser-api.cc b/src/preparser-api.cc

index 6a4d896f7550fd0918047e16c94188a27a5328b0..899489e25002d38ca6580586699b86eb03b3d13d 100644 (file)
--- a/src/preparser-api.cc
+++ b/src/preparser-api.cc
@@ -38,7 +38,6 @@
  #include "utils.h"
  #include "list.h"
  #include "hashmap.h"
-#include "scanner-base.h"
  #include "preparse-data-format.h"
  #include "preparse-data.h"
  #include "preparser.h"
diff --git a/src/preparser.cc b/src/preparser.cc

index c21dc2795c0cbb22c23ed42a68fbd21f6e999c48..310aeacfb9a1f40e293c89a15f435c8521f7639d 100644 (file)
--- a/src/preparser.cc
+++ b/src/preparser.cc
@@ -28,21 +28,19 @@
  #include <math.h>
  
  #include "../include/v8stdint.h"
-#include "unicode.h"
-#include "globals.h"
-#include "checks.h"
+
  #include "allocation.h"
-#include "utils.h"
-#include "list.h"
+#include "checks.h"
  #include "conversions.h"
+#include "conversions-inl.h"
+#include "globals.h"
  #include "hashmap.h"
-
-#include "scanner-base.h"
+#include "list.h"
  #include "preparse-data-format.h"
  #include "preparse-data.h"
  #include "preparser.h"
-
-#include "conversions-inl.h"
+#include "unicode.h"
+#include "utils.h"
  
  namespace v8 {
  
diff --git a/src/preparser.h b/src/preparser.h

index 9dc36f18a3fbaca89ad4821ebedb76e8e9ebd08f..010c5e2cf6e0080f880256f774fc8298add1ee43 100644 (file)
--- a/src/preparser.h
+++ b/src/preparser.h
@@ -28,7 +28,15 @@
  #ifndef V8_PREPARSER_H
  #define V8_PREPARSER_H
  
+#include "token.h"
+#include "scanner.h"
+
  namespace v8 {
+
+namespace internal {
+class UnicodeCache;
+}
+
  namespace preparser {
  
  typedef uint8_t byte;
@@ -106,7 +114,7 @@ class PreParser {
      kPreParseSuccess
    };
  
-  ~PreParser() { }
+  ~PreParser() {}
  
    // Pre-parse the program from the character stream; returns true on
    // success (even if parsing failed, the pre-parse data successfully
diff --git a/src/scanner-base.cc b/src/scanner-base.cc

deleted file mode 100644 (file)

index 62eee1a..0000000
--- a/src/scanner-base.cc
+++ /dev/null
@@ -1,1090 +0,0 @@
-// Copyright 2011 the V8 project authors. All rights reserved.
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-//       notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-//       copyright notice, this list of conditions and the following
-//       disclaimer in the documentation and/or other materials provided
-//       with the distribution.
-//     * Neither the name of Google Inc. nor the names of its
-//       contributors may be used to endorse or promote products derived
-//       from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Features shared by parsing and pre-parsing scanners.
-
-#include "../include/v8stdint.h"
-#include "scanner-base.h"
-#include "char-predicates-inl.h"
-
-namespace v8 {
-namespace internal {
-
-// ----------------------------------------------------------------------------
-// Scanner
-
-Scanner::Scanner(UnicodeCache* unicode_cache)
-    : unicode_cache_(unicode_cache) { }
-
-
-uc32 Scanner::ScanHexNumber(int expected_length) {
-  ASSERT(expected_length <= 4);  // prevent overflow
-
-  uc32 digits[4] = { 0, 0, 0, 0 };
-  uc32 x = 0;
-  for (int i = 0; i < expected_length; i++) {
-    digits[i] = c0_;
-    int d = HexValue(c0_);
-    if (d < 0) {
-      // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
-      // should be illegal, but other JS VMs just return the
-      // non-escaped version of the original character.
-
-      // Push back digits that we have advanced past.
-      for (int j = i-1; j >= 0; j--) {
-        PushBack(digits[j]);
-      }
-      return -1;
-    }
-    x = x * 16 + d;
-    Advance();
-  }
-
-  return x;
-}
-
-
-
-// ----------------------------------------------------------------------------
-// JavaScriptScanner
-
-JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)
-    : Scanner(scanner_contants),
-      octal_pos_(Location::invalid()),
-      harmony_block_scoping_(false) { }
-
-
-void JavaScriptScanner::Initialize(UC16CharacterStream* source) {
-  source_ = source;
-  // Need to capture identifiers in order to recognize "get" and "set"
-  // in object literals.
-  Init();
-  // Skip initial whitespace allowing HTML comment ends just like
-  // after a newline and scan first token.
-  has_line_terminator_before_next_ = true;
-  SkipWhiteSpace();
-  Scan();
-}
-
-
-// Ensure that tokens can be stored in a byte.
-STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
-
-// Table of one-character tokens, by character (0x00..0x7f only).
-static const byte one_char_tokens[] = {
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::LPAREN,       // 0x28
-  Token::RPAREN,       // 0x29
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::COMMA,        // 0x2c
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::COLON,        // 0x3a
-  Token::SEMICOLON,    // 0x3b
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::CONDITIONAL,  // 0x3f
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::LBRACK,     // 0x5b
-  Token::ILLEGAL,
-  Token::RBRACK,     // 0x5d
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::ILLEGAL,
-  Token::LBRACE,       // 0x7b
-  Token::ILLEGAL,
-  Token::RBRACE,       // 0x7d
-  Token::BIT_NOT,      // 0x7e
-  Token::ILLEGAL
-};
-
-
-Token::Value JavaScriptScanner::Next() {
-  current_ = next_;
-  has_line_terminator_before_next_ = false;
-  has_multiline_comment_before_next_ = false;
-  if (static_cast<unsigned>(c0_) <= 0x7f) {
-    Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
-    if (token != Token::ILLEGAL) {
-      int pos = source_pos();
-      next_.token = token;
-      next_.location.beg_pos = pos;
-      next_.location.end_pos = pos + 1;
-      Advance();
-      return current_.token;
-    }
-  }
-  Scan();
-  return current_.token;
-}
-
-
-static inline bool IsByteOrderMark(uc32 c) {
-  // The Unicode value U+FFFE is guaranteed never to be assigned as a
-  // Unicode character; this implies that in a Unicode context the
-  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
-  // character expressed in little-endian byte order (since it could
-  // not be a U+FFFE character expressed in big-endian byte
-  // order). Nevertheless, we check for it to be compatible with
-  // Spidermonkey.
-  return c == 0xFEFF || c == 0xFFFE;
-}
-
-
-bool JavaScriptScanner::SkipWhiteSpace() {
-  int start_position = source_pos();
-
-  while (true) {
-    // We treat byte-order marks (BOMs) as whitespace for better
-    // compatibility with Spidermonkey and other JavaScript engines.
-    while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
-      // IsWhiteSpace() includes line terminators!
-      if (unicode_cache_->IsLineTerminator(c0_)) {
-        // Ignore line terminators, but remember them. This is necessary
-        // for automatic semicolon insertion.
-        has_line_terminator_before_next_ = true;
-      }
-      Advance();
-    }
-
-    // If there is an HTML comment end '-->' at the beginning of a
-    // line (with only whitespace in front of it), we treat the rest
-    // of the line as a comment. This is in line with the way
-    // SpiderMonkey handles it.
-    if (c0_ == '-' && has_line_terminator_before_next_) {
-      Advance();
-      if (c0_ == '-') {
-        Advance();
-        if (c0_ == '>') {
-          // Treat the rest of the line as a comment.
-          SkipSingleLineComment();
-          // Continue skipping white space after the comment.
-          continue;
-        }
-        PushBack('-');  // undo Advance()
-      }
-      PushBack('-');  // undo Advance()
-    }
-    // Return whether or not we skipped any characters.
-    return source_pos() != start_position;
-  }
-}
-
-
-Token::Value JavaScriptScanner::SkipSingleLineComment() {
-  Advance();
-
-  // The line terminator at the end of the line is not considered
-  // to be part of the single-line comment; it is recognized
-  // separately by the lexical grammar and becomes part of the
-  // stream of input elements for the syntactic grammar (see
-  // ECMA-262, section 7.4).
-  while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
-    Advance();
-  }
-
-  return Token::WHITESPACE;
-}
-
-
-Token::Value JavaScriptScanner::SkipMultiLineComment() {
-  ASSERT(c0_ == '*');
-  Advance();
-
-  while (c0_ >= 0) {
-    uc32 ch = c0_;
-    Advance();
-    if (unicode_cache_->IsLineTerminator(ch)) {
-      // Following ECMA-262, section 7.4, a comment containing
-      // a newline will make the comment count as a line-terminator.
-      has_multiline_comment_before_next_ = true;
-    }
-    // If we have reached the end of the multi-line comment, we
-    // consume the '/' and insert a whitespace. This way all
-    // multi-line comments are treated as whitespace.
-    if (ch == '*' && c0_ == '/') {
-      c0_ = ' ';
-      return Token::WHITESPACE;
-    }
-  }
-
-  // Unterminated multi-line comment.
-  return Token::ILLEGAL;
-}
-
-
-Token::Value JavaScriptScanner::ScanHtmlComment() {
-  // Check for <!-- comments.
-  ASSERT(c0_ == '!');
-  Advance();
-  if (c0_ == '-') {
-    Advance();
-    if (c0_ == '-') return SkipSingleLineComment();
-    PushBack('-');  // undo Advance()
-  }
-  PushBack('!');  // undo Advance()
-  ASSERT(c0_ == '!');
-  return Token::LT;
-}
-
-
-void JavaScriptScanner::Scan() {
-  next_.literal_chars = NULL;
-  Token::Value token;
-  do {
-    // Remember the position of the next token
-    next_.location.beg_pos = source_pos();
-
-    switch (c0_) {
-      case ' ':
-      case '\t':
-        Advance();
-        token = Token::WHITESPACE;
-        break;
-
-      case '\n':
-        Advance();
-        has_line_terminator_before_next_ = true;
-        token = Token::WHITESPACE;
-        break;
-
-      case '"': case '\'':
-        token = ScanString();
-        break;
-
-      case '<':
-        // < <= << <<= <!--
-        Advance();
-        if (c0_ == '=') {
-          token = Select(Token::LTE);
-        } else if (c0_ == '<') {
-          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
-        } else if (c0_ == '!') {
-          token = ScanHtmlComment();
-        } else {
-          token = Token::LT;
-        }
-        break;
-
-      case '>':
-        // > >= >> >>= >>> >>>=
-        Advance();
-        if (c0_ == '=') {
-          token = Select(Token::GTE);
-        } else if (c0_ == '>') {
-          // >> >>= >>> >>>=
-          Advance();
-          if (c0_ == '=') {
-            token = Select(Token::ASSIGN_SAR);
-          } else if (c0_ == '>') {
-            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
-          } else {
-            token = Token::SAR;
-          }
-        } else {
-          token = Token::GT;
-        }
-        break;
-
-      case '=':
-        // = == ===
-        Advance();
-        if (c0_ == '=') {
-          token = Select('=', Token::EQ_STRICT, Token::EQ);
-        } else {
-          token = Token::ASSIGN;
-        }
-        break;
-
-      case '!':
-        // ! != !==
-        Advance();
-        if (c0_ == '=') {
-          token = Select('=', Token::NE_STRICT, Token::NE);
-        } else {
-          token = Token::NOT;
-        }
-        break;
-
-      case '+':
-        // + ++ +=
-        Advance();
-        if (c0_ == '+') {
-          token = Select(Token::INC);
-        } else if (c0_ == '=') {
-          token = Select(Token::ASSIGN_ADD);
-        } else {
-          token = Token::ADD;
-        }
-        break;
-
-      case '-':
-        // - -- --> -=
-        Advance();
-        if (c0_ == '-') {
-          Advance();
-          if (c0_ == '>' && has_line_terminator_before_next_) {
-            // For compatibility with SpiderMonkey, we skip lines that
-            // start with an HTML comment end '-->'.
-            token = SkipSingleLineComment();
-          } else {
-            token = Token::DEC;
-          }
-        } else if (c0_ == '=') {
-          token = Select(Token::ASSIGN_SUB);
-        } else {
-          token = Token::SUB;
-        }
-        break;
-
-      case '*':
-        // * *=
-        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
-        break;
-
-      case '%':
-        // % %=
-        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
-        break;
-
-      case '/':
-        // /  // /* /=
-        Advance();
-        if (c0_ == '/') {
-          token = SkipSingleLineComment();
-        } else if (c0_ == '*') {
-          token = SkipMultiLineComment();
-        } else if (c0_ == '=') {
-          token = Select(Token::ASSIGN_DIV);
-        } else {
-          token = Token::DIV;
-        }
-        break;
-
-      case '&':
-        // & && &=
-        Advance();
-        if (c0_ == '&') {
-          token = Select(Token::AND);
-        } else if (c0_ == '=') {
-          token = Select(Token::ASSIGN_BIT_AND);
-        } else {
-          token = Token::BIT_AND;
-        }
-        break;
-
-      case '|':
-        // | || |=
-        Advance();
-        if (c0_ == '|') {
-          token = Select(Token::OR);
-        } else if (c0_ == '=') {
-          token = Select(Token::ASSIGN_BIT_OR);
-        } else {
-          token = Token::BIT_OR;
-        }
-        break;
-
-      case '^':
-        // ^ ^=
-        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
-        break;
-
-      case '.':
-        // . Number
-        Advance();
-        if (IsDecimalDigit(c0_)) {
-          token = ScanNumber(true);
-        } else {
-          token = Token::PERIOD;
-        }
-        break;
-
-      case ':':
-        token = Select(Token::COLON);
-        break;
-
-      case ';':
-        token = Select(Token::SEMICOLON);
-        break;
-
-      case ',':
-        token = Select(Token::COMMA);
-        break;
-
-      case '(':
-        token = Select(Token::LPAREN);
-        break;
-
-      case ')':
-        token = Select(Token::RPAREN);
-        break;
-
-      case '[':
-        token = Select(Token::LBRACK);
-        break;
-
-      case ']':
-        token = Select(Token::RBRACK);
-        break;
-
-      case '{':
-        token = Select(Token::LBRACE);
-        break;
-
-      case '}':
-        token = Select(Token::RBRACE);
-        break;
-
-      case '?':
-        token = Select(Token::CONDITIONAL);
-        break;
-
-      case '~':
-        token = Select(Token::BIT_NOT);
-        break;
-
-      default:
-        if (unicode_cache_->IsIdentifierStart(c0_)) {
-          token = ScanIdentifierOrKeyword();
-        } else if (IsDecimalDigit(c0_)) {
-          token = ScanNumber(false);
-        } else if (SkipWhiteSpace()) {
-          token = Token::WHITESPACE;
-        } else if (c0_ < 0) {
-          token = Token::EOS;
-        } else {
-          token = Select(Token::ILLEGAL);
-        }
-        break;
-    }
-
-    // Continue scanning for tokens as long as we're just skipping
-    // whitespace.
-  } while (token == Token::WHITESPACE);
-
-  next_.location.end_pos = source_pos();
-  next_.token = token;
-}
-
-
-void JavaScriptScanner::SeekForward(int pos) {
-  // After this call, we will have the token at the given position as
-  // the "next" token. The "current" token will be invalid.
-  if (pos == next_.location.beg_pos) return;
-  int current_pos = source_pos();
-  ASSERT_EQ(next_.location.end_pos, current_pos);
-  // Positions inside the lookahead token aren't supported.
-  ASSERT(pos >= current_pos);
-  if (pos != current_pos) {
-    source_->SeekForward(pos - source_->pos());
-    Advance();
-    // This function is only called to seek to the location
-    // of the end of a function (at the "}" token). It doesn't matter
-    // whether there was a line terminator in the part we skip.
-    has_line_terminator_before_next_ = false;
-    has_multiline_comment_before_next_ = false;
-  }
-  Scan();
-}
-
-
-void JavaScriptScanner::ScanEscape() {
-  uc32 c = c0_;
-  Advance();
-
-  // Skip escaped newlines.
-  if (unicode_cache_->IsLineTerminator(c)) {
-    // Allow CR+LF newlines in multiline string literals.
-    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
-    // Allow LF+CR newlines in multiline string literals.
-    if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
-    return;
-  }
-
-  switch (c) {
-    case '\'':  // fall through
-    case '"' :  // fall through
-    case '\\': break;
-    case 'b' : c = '\b'; break;
-    case 'f' : c = '\f'; break;
-    case 'n' : c = '\n'; break;
-    case 'r' : c = '\r'; break;
-    case 't' : c = '\t'; break;
-    case 'u' : {
-      c = ScanHexNumber(4);
-      if (c < 0) c = 'u';
-      break;
-    }
-    case 'v' : c = '\v'; break;
-    case 'x' : {
-      c = ScanHexNumber(2);
-      if (c < 0) c = 'x';
-      break;
-    }
-    case '0' :  // fall through
-    case '1' :  // fall through
-    case '2' :  // fall through
-    case '3' :  // fall through
-    case '4' :  // fall through
-    case '5' :  // fall through
-    case '6' :  // fall through
-    case '7' : c = ScanOctalEscape(c, 2); break;
-  }
-
-  // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
-  // should be illegal, but they are commonly handled
-  // as non-escaped characters by JS VMs.
-  AddLiteralChar(c);
-}
-
-
-// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
-// ECMA-262. Other JS VMs support them.
-uc32 JavaScriptScanner::ScanOctalEscape(uc32 c, int length) {
-  uc32 x = c - '0';
-  int i = 0;
-  for (; i < length; i++) {
-    int d = c0_ - '0';
-    if (d < 0 || d > 7) break;
-    int nx = x * 8 + d;
-    if (nx >= 256) break;
-    x = nx;
-    Advance();
-  }
-  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
-  // Remember the position of octal escape sequences so that an error
-  // can be reported later (in strict mode).
-  // We don't report the error immediately, because the octal escape can
-  // occur before the "use strict" directive.
-  if (c != '0' || i > 0) {
-    octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
-  }
-  return x;
-}
-
-
-Token::Value JavaScriptScanner::ScanString() {
-  uc32 quote = c0_;
-  Advance();  // consume quote
-
-  LiteralScope literal(this);
-  while (c0_ != quote && c0_ >= 0
-         && !unicode_cache_->IsLineTerminator(c0_)) {
-    uc32 c = c0_;
-    Advance();
-    if (c == '\\') {
-      if (c0_ < 0) return Token::ILLEGAL;
-      ScanEscape();
-    } else {
-      AddLiteralChar(c);
-    }
-  }
-  if (c0_ != quote) return Token::ILLEGAL;
-  literal.Complete();
-
-  Advance();  // consume quote
-  return Token::STRING;
-}
-
-
-void JavaScriptScanner::ScanDecimalDigits() {
-  while (IsDecimalDigit(c0_))
-    AddLiteralCharAdvance();
-}
-
-
-Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
-  ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
-
-  enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
-
-  LiteralScope literal(this);
-  if (seen_period) {
-    // we have already seen a decimal point of the float
-    AddLiteralChar('.');
-    ScanDecimalDigits();  // we know we have at least one digit
-
-  } else {
-    // if the first character is '0' we must check for octals and hex
-    if (c0_ == '0') {
-      int start_pos = source_pos();  // For reporting octal positions.
-      AddLiteralCharAdvance();
-
-      // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
-      if (c0_ == 'x' || c0_ == 'X') {
-        // hex number
-        kind = HEX;
-        AddLiteralCharAdvance();
-        if (!IsHexDigit(c0_)) {
-          // we must have at least one hex digit after 'x'/'X'
-          return Token::ILLEGAL;
-        }
-        while (IsHexDigit(c0_)) {
-          AddLiteralCharAdvance();
-        }
-      } else if ('0' <= c0_ && c0_ <= '7') {
-        // (possible) octal number
-        kind = OCTAL;
-        while (true) {
-          if (c0_ == '8' || c0_ == '9') {
-            kind = DECIMAL;
-            break;
-          }
-          if (c0_  < '0' || '7'  < c0_) {
-            // Octal literal finished.
-            octal_pos_ = Location(start_pos, source_pos());
-            break;
-          }
-          AddLiteralCharAdvance();
-        }
-      }
-    }
-
-    // Parse decimal digits and allow trailing fractional part.
-    if (kind == DECIMAL) {
-      ScanDecimalDigits();  // optional
-      if (c0_ == '.') {
-        AddLiteralCharAdvance();
-        ScanDecimalDigits();  // optional
-      }
-    }
-  }
-
-  // scan exponent, if any
-  if (c0_ == 'e' || c0_ == 'E') {
-    ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
-    if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
-    // scan exponent
-    AddLiteralCharAdvance();
-    if (c0_ == '+' || c0_ == '-')
-      AddLiteralCharAdvance();
-    if (!IsDecimalDigit(c0_)) {
-      // we must have at least one decimal digit after 'e'/'E'
-      return Token::ILLEGAL;
-    }
-    ScanDecimalDigits();
-  }
-
-  // The source character immediately following a numeric literal must
-  // not be an identifier start or a decimal digit; see ECMA-262
-  // section 7.8.3, page 17 (note that we read only one decimal digit
-  // if the value is 0).
-  if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
-    return Token::ILLEGAL;
-
-  literal.Complete();
-
-  return Token::NUMBER;
-}
-
-
-uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
-  Advance();
-  if (c0_ != 'u') return -1;
-  Advance();
-  uc32 result = ScanHexNumber(4);
-  if (result < 0) PushBack('u');
-  return result;
-}
-
-
-// ----------------------------------------------------------------------------
-// Keyword Matcher
-
-#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                            \
-  KEYWORD_GROUP('b')                                                \
-  KEYWORD("break", Token::BREAK)                                    \
-  KEYWORD_GROUP('c')                                                \
-  KEYWORD("case", Token::CASE)                                      \
-  KEYWORD("catch", Token::CATCH)                                    \
-  KEYWORD("class", Token::FUTURE_RESERVED_WORD)                     \
-  KEYWORD("const", Token::CONST)                                    \
-  KEYWORD("continue", Token::CONTINUE)                              \
-  KEYWORD_GROUP('d')                                                \
-  KEYWORD("debugger", Token::DEBUGGER)                              \
-  KEYWORD("default", Token::DEFAULT)                                \
-  KEYWORD("delete", Token::DELETE)                                  \
-  KEYWORD("do", Token::DO)                                          \
-  KEYWORD_GROUP('e')                                                \
-  KEYWORD("else", Token::ELSE)                                      \
-  KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                      \
-  KEYWORD("export", Token::FUTURE_RESERVED_WORD)                    \
-  KEYWORD("extends", Token::FUTURE_RESERVED_WORD)                   \
-  KEYWORD_GROUP('f')                                                \
-  KEYWORD("false", Token::FALSE_LITERAL)                            \
-  KEYWORD("finally", Token::FINALLY)                                \
-  KEYWORD("for", Token::FOR)                                        \
-  KEYWORD("function", Token::FUNCTION)                              \
-  KEYWORD_GROUP('i')                                                \
-  KEYWORD("if", Token::IF)                                          \
-  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)         \
-  KEYWORD("import", Token::FUTURE_RESERVED_WORD)                    \
-  KEYWORD("in", Token::IN)                                          \
-  KEYWORD("instanceof", Token::INSTANCEOF)                          \
-  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)          \
-  KEYWORD_GROUP('l')                                                \
-  KEYWORD("let", harmony_block_scoping                              \
-                 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
-  KEYWORD_GROUP('n')                                                \
-  KEYWORD("new", Token::NEW)                                        \
-  KEYWORD("null", Token::NULL_LITERAL)                              \
-  KEYWORD_GROUP('p')                                                \
-  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)            \
-  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)            \
-  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)          \
-  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)             \
-  KEYWORD_GROUP('r')                                                \
-  KEYWORD("return", Token::RETURN)                                  \
-  KEYWORD_GROUP('s')                                                \
-  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD)             \
-  KEYWORD("super", Token::FUTURE_RESERVED_WORD)                     \
-  KEYWORD("switch", Token::SWITCH)                                  \
-  KEYWORD_GROUP('t')                                                \
-  KEYWORD("this", Token::THIS)                                      \
-  KEYWORD("throw", Token::THROW)                                    \
-  KEYWORD("true", Token::TRUE_LITERAL)                              \
-  KEYWORD("try", Token::TRY)                                        \
-  KEYWORD("typeof", Token::TYPEOF)                                  \
-  KEYWORD_GROUP('v')                                                \
-  KEYWORD("var", Token::VAR)                                        \
-  KEYWORD("void", Token::VOID)                                      \
-  KEYWORD_GROUP('w')                                                \
-  KEYWORD("while", Token::WHILE)                                    \
-  KEYWORD("with", Token::WITH)                                      \
-  KEYWORD_GROUP('y')                                                \
-  KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD)
-
-
-static Token::Value KeywordOrIdentifierToken(const char* input,
-                                             int input_length,
-                                             bool harmony_block_scoping) {
-  ASSERT(input_length >= 1);
-  const int kMinLength = 2;
-  const int kMaxLength = 10;
-  if (input_length < kMinLength || input_length > kMaxLength) {
-    return Token::IDENTIFIER;
-  }
-  switch (input[0]) {
-    default:
-#define KEYWORD_GROUP_CASE(ch)                                \
-      break;                                                  \
-    case ch:
-#define KEYWORD(keyword, token)                               \
-    {                                                         \
-      /* 'keyword' is a char array, so sizeof(keyword) is */  \
-      /* strlen(keyword) plus 1 for the NUL char. */          \
-      const int keyword_length = sizeof(keyword) - 1;         \
-      STATIC_ASSERT(keyword_length >= kMinLength);            \
-      STATIC_ASSERT(keyword_length <= kMaxLength);            \
-      if (input_length == keyword_length &&                   \
-          input[1] == keyword[1] &&                           \
-          (keyword_length <= 2 || input[2] == keyword[2]) &&  \
-          (keyword_length <= 3 || input[3] == keyword[3]) &&  \
-          (keyword_length <= 4 || input[4] == keyword[4]) &&  \
-          (keyword_length <= 5 || input[5] == keyword[5]) &&  \
-          (keyword_length <= 6 || input[6] == keyword[6]) &&  \
-          (keyword_length <= 7 || input[7] == keyword[7]) &&  \
-          (keyword_length <= 8 || input[8] == keyword[8]) &&  \
-          (keyword_length <= 9 || input[9] == keyword[9])) {  \
-        return token;                                         \
-      }                                                       \
-    }
-    KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
-  }
-  return Token::IDENTIFIER;
-}
-
-
-Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
-  ASSERT(unicode_cache_->IsIdentifierStart(c0_));
-  LiteralScope literal(this);
-  // Scan identifier start character.
-  if (c0_ == '\\') {
-    uc32 c = ScanIdentifierUnicodeEscape();
-    // Only allow legal identifier start characters.
-    if (c < 0 ||
-        c == '\\' ||  // No recursive escapes.
-        !unicode_cache_->IsIdentifierStart(c)) {
-      return Token::ILLEGAL;
-    }
-    AddLiteralChar(c);
-    return ScanIdentifierSuffix(&literal);
-  }
-
-  uc32 first_char = c0_;
-  Advance();
-  AddLiteralChar(first_char);
-
-  // Scan the rest of the identifier characters.
-  while (unicode_cache_->IsIdentifierPart(c0_)) {
-    if (c0_ != '\\') {
-      uc32 next_char = c0_;
-      Advance();
-      AddLiteralChar(next_char);
-      continue;
-    }
-    // Fallthrough if no longer able to complete keyword.
-    return ScanIdentifierSuffix(&literal);
-  }
-
-  literal.Complete();
-
-  if (next_.literal_chars->is_ascii()) {
-    Vector<const char> chars = next_.literal_chars->ascii_literal();
-    return KeywordOrIdentifierToken(chars.start(),
-                                    chars.length(),
-                                    harmony_block_scoping_);
-  }
-
-  return Token::IDENTIFIER;
-}
-
-
-Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {
-  // Scan the rest of the identifier characters.
-  while (unicode_cache_->IsIdentifierPart(c0_)) {
-    if (c0_ == '\\') {
-      uc32 c = ScanIdentifierUnicodeEscape();
-      // Only allow legal identifier part characters.
-      if (c < 0 ||
-          c == '\\' ||
-          !unicode_cache_->IsIdentifierPart(c)) {
-        return Token::ILLEGAL;
-      }
-      AddLiteralChar(c);
-    } else {
-      AddLiteralChar(c0_);
-      Advance();
-    }
-  }
-  literal->Complete();
-
-  return Token::IDENTIFIER;
-}
-
-
-bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
-  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
-  bool in_character_class = false;
-
-  // Previous token is either '/' or '/=', in the second case, the
-  // pattern starts at =.
-  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
-  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
-
-  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
-  // the scanner should pass uninterpreted bodies to the RegExp
-  // constructor.
-  LiteralScope literal(this);
-  if (seen_equal) {
-    AddLiteralChar('=');
-  }
-
-  while (c0_ != '/' || in_character_class) {
-    if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
-    if (c0_ == '\\') {  // Escape sequence.
-      AddLiteralCharAdvance();
-      if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
-      AddLiteralCharAdvance();
-      // If the escape allows more characters, i.e., \x??, \u????, or \c?,
-      // only "safe" characters are allowed (letters, digits, underscore),
-      // otherwise the escape isn't valid and the invalid character has
-      // its normal meaning. I.e., we can just continue scanning without
-      // worrying whether the following characters are part of the escape
-      // or not, since any '/', '\\' or '[' is guaranteed to not be part
-      // of the escape sequence.
-
-      // TODO(896): At some point, parse RegExps more throughly to capture
-      // octal esacpes in strict mode.
-    } else {  // Unescaped character.
-      if (c0_ == '[') in_character_class = true;
-      if (c0_ == ']') in_character_class = false;
-      AddLiteralCharAdvance();
-    }
-  }
-  Advance();  // consume '/'
-
-  literal.Complete();
-
-  return true;
-}
-
-
-bool JavaScriptScanner::ScanLiteralUnicodeEscape() {
-  ASSERT(c0_ == '\\');
-  uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
-  Advance();
-  int i = 1;
-  if (c0_ == 'u') {
-    i++;
-    while (i < 6) {
-      Advance();
-      if (!IsHexDigit(c0_)) break;
-      chars_read[i] = c0_;
-      i++;
-    }
-  }
-  if (i < 6) {
-    // Incomplete escape. Undo all advances and return false.
-    while (i > 0) {
-      i--;
-      PushBack(chars_read[i]);
-    }
-    return false;
-  }
-  // Complete escape. Add all chars to current literal buffer.
-  for (int i = 0; i < 6; i++) {
-    AddLiteralChar(chars_read[i]);
-  }
-  return true;
-}
-
-
-bool JavaScriptScanner::ScanRegExpFlags() {
-  // Scan regular expression flags.
-  LiteralScope literal(this);
-  while (unicode_cache_->IsIdentifierPart(c0_)) {
-    if (c0_ != '\\') {
-      AddLiteralCharAdvance();
-    } else {
-      if (!ScanLiteralUnicodeEscape()) {
-        break;
-      }
-    }
-  }
-  literal.Complete();
-
-  next_.location.end_pos = source_pos() - 1;
-  return true;
-}
-
-} }  // namespace v8::internal
diff --git a/src/scanner-base.h b/src/scanner-base.h

deleted file mode 100644 (file)

index 2e223ce..0000000
--- a/src/scanner-base.h
+++ /dev/null
@@ -1,564 +0,0 @@
-// Copyright 2011 the V8 project authors. All rights reserved.
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-//       notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-//       copyright notice, this list of conditions and the following
-//       disclaimer in the documentation and/or other materials provided
-//       with the distribution.
-//     * Neither the name of Google Inc. nor the names of its
-//       contributors may be used to endorse or promote products derived
-//       from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Features shared by parsing and pre-parsing scanners.
-
-#ifndef V8_SCANNER_BASE_H_
-#define V8_SCANNER_BASE_H_
-
-#include "allocation.h"
-#include "char-predicates.h"
-#include "checks.h"
-#include "globals.h"
-#include "token.h"
-#include "unicode-inl.h"
-#include "utils.h"
-
-namespace v8 {
-namespace internal {
-
-// Returns the value (0 .. 15) of a hexadecimal character c.
-// If c is not a legal hexadecimal character, returns a value < 0.
-inline int HexValue(uc32 c) {
-  c -= '0';
-  if (static_cast<unsigned>(c) <= 9) return c;
-  c = (c | 0x20) - ('a' - '0');  // detect 0x11..0x16 and 0x31..0x36.
-  if (static_cast<unsigned>(c) <= 5) return c + 10;
-  return -1;
-}
-
-
-// ---------------------------------------------------------------------
-// Buffered stream of characters, using an internal UC16 buffer.
-
-class UC16CharacterStream {
- public:
-  UC16CharacterStream() : pos_(0) { }
-  virtual ~UC16CharacterStream() { }
-
-  // Returns and advances past the next UC16 character in the input
-  // stream. If there are no more characters, it returns a negative
-  // value.
-  inline uc32 Advance() {
-    if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
-      pos_++;
-      return static_cast<uc32>(*(buffer_cursor_++));
-    }
-    // Note: currently the following increment is necessary to avoid a
-    // parser problem! The scanner treats the final kEndOfInput as
-    // a character with a position, and does math relative to that
-    // position.
-    pos_++;
-
-    return kEndOfInput;
-  }
-
-  // Return the current position in the character stream.
-  // Starts at zero.
-  inline unsigned pos() const { return pos_; }
-
-  // Skips forward past the next character_count UC16 characters
-  // in the input, or until the end of input if that comes sooner.
-  // Returns the number of characters actually skipped. If less
-  // than character_count,
-  inline unsigned SeekForward(unsigned character_count) {
-    unsigned buffered_chars =
-        static_cast<unsigned>(buffer_end_ - buffer_cursor_);
-    if (character_count <= buffered_chars) {
-      buffer_cursor_ += character_count;
-      pos_ += character_count;
-      return character_count;
-    }
-    return SlowSeekForward(character_count);
-  }
-
-  // Pushes back the most recently read UC16 character (or negative
-  // value if at end of input), i.e., the value returned by the most recent
-  // call to Advance.
-  // Must not be used right after calling SeekForward.
-  virtual void PushBack(int32_t character) = 0;
-
- protected:
-  static const uc32 kEndOfInput = -1;
-
-  // Ensures that the buffer_cursor_ points to the character at
-  // position pos_ of the input, if possible. If the position
-  // is at or after the end of the input, return false. If there
-  // are more characters available, return true.
-  virtual bool ReadBlock() = 0;
-  virtual unsigned SlowSeekForward(unsigned character_count) = 0;
-
-  const uc16* buffer_cursor_;
-  const uc16* buffer_end_;
-  unsigned pos_;
-};
-
-
-class UnicodeCache {
-// ---------------------------------------------------------------------
-// Caching predicates used by scanners.
- public:
-  UnicodeCache() {}
-  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
-
-  StaticResource<Utf8Decoder>* utf8_decoder() {
-    return &utf8_decoder_;
-  }
-
-  bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
-  bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
-  bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
-  bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
-
- private:
-
-  unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
-  unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
-  unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
-  unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
-  StaticResource<Utf8Decoder> utf8_decoder_;
-
-  DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
-};
-
-
-// ----------------------------------------------------------------------------
-// LiteralBuffer -  Collector of chars of literals.
-
-class LiteralBuffer {
- public:
-  LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
-
-  ~LiteralBuffer() {
-    if (backing_store_.length() > 0) {
-      backing_store_.Dispose();
-    }
-  }
-
-  inline void AddChar(uc16 character) {
-    if (position_ >= backing_store_.length()) ExpandBuffer();
-    if (is_ascii_) {
-      if (character < kMaxAsciiCharCodeU) {
-        backing_store_[position_] = static_cast<byte>(character);
-        position_ += kASCIISize;
-        return;
-      }
-      ConvertToUC16();
-    }
-    *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
-    position_ += kUC16Size;
-  }
-
-  bool is_ascii() { return is_ascii_; }
-
-  Vector<const uc16> uc16_literal() {
-    ASSERT(!is_ascii_);
-    ASSERT((position_ & 0x1) == 0);
-    return Vector<const uc16>(
-        reinterpret_cast<const uc16*>(backing_store_.start()),
-        position_ >> 1);
-  }
-
-  Vector<const char> ascii_literal() {
-    ASSERT(is_ascii_);
-    return Vector<const char>(
-        reinterpret_cast<const char*>(backing_store_.start()),
-        position_);
-  }
-
-  int length() {
-    return is_ascii_ ? position_ : (position_ >> 1);
-  }
-
-  void Reset() {
-    position_ = 0;
-    is_ascii_ = true;
-  }
- private:
-  static const int kInitialCapacity = 16;
-  static const int kGrowthFactory = 4;
-  static const int kMinConversionSlack = 256;
-  static const int kMaxGrowth = 1 * MB;
-  inline int NewCapacity(int min_capacity) {
-    int capacity = Max(min_capacity, backing_store_.length());
-    int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
-    return new_capacity;
-  }
-
-  void ExpandBuffer() {
-    Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
-    memcpy(new_store.start(), backing_store_.start(), position_);
-    backing_store_.Dispose();
-    backing_store_ = new_store;
-  }
-
-  void ConvertToUC16() {
-    ASSERT(is_ascii_);
-    Vector<byte> new_store;
-    int new_content_size = position_ * kUC16Size;
-    if (new_content_size >= backing_store_.length()) {
-      // Ensure room for all currently read characters as UC16 as well
-      // as the character about to be stored.
-      new_store = Vector<byte>::New(NewCapacity(new_content_size));
-    } else {
-      new_store = backing_store_;
-    }
-    char* src = reinterpret_cast<char*>(backing_store_.start());
-    uc16* dst = reinterpret_cast<uc16*>(new_store.start());
-    for (int i = position_ - 1; i >= 0; i--) {
-      dst[i] = src[i];
-    }
-    if (new_store.start() != backing_store_.start()) {
-      backing_store_.Dispose();
-      backing_store_ = new_store;
-    }
-    position_ = new_content_size;
-    is_ascii_ = false;
-  }
-
-  bool is_ascii_;
-  int position_;
-  Vector<byte> backing_store_;
-
-  DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
-};
-
-
-// ----------------------------------------------------------------------------
-// Scanner base-class.
-
-// Generic functionality used by both JSON and JavaScript scanners.
-class Scanner {
- public:
-  // -1 is outside of the range of any real source code.
-  static const int kNoOctalLocation = -1;
-
-  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
-
-  class LiteralScope {
-   public:
-    explicit LiteralScope(Scanner* self);
-    ~LiteralScope();
-    void Complete();
-
-   private:
-    Scanner* scanner_;
-    bool complete_;
-  };
-
-  explicit Scanner(UnicodeCache* scanner_contants);
-
-  // Returns the current token again.
-  Token::Value current_token() { return current_.token; }
-
-  // One token look-ahead (past the token returned by Next()).
-  Token::Value peek() const { return next_.token; }
-
-  struct Location {
-    Location(int b, int e) : beg_pos(b), end_pos(e) { }
-    Location() : beg_pos(0), end_pos(0) { }
-
-    bool IsValid() const {
-      return beg_pos >= 0 && end_pos >= beg_pos;
-    }
-
-    static Location invalid() { return Location(-1, -1); }
-
-    int beg_pos;
-    int end_pos;
-  };
-
-  // Returns the location information for the current token
-  // (the token returned by Next()).
-  Location location() const { return current_.location; }
-  Location peek_location() const { return next_.location; }
-
-  // Returns the literal string, if any, for the current token (the
-  // token returned by Next()). The string is 0-terminated and in
-  // UTF-8 format; they may contain 0-characters. Literal strings are
-  // collected for identifiers, strings, and numbers.
-  // These functions only give the correct result if the literal
-  // was scanned between calls to StartLiteral() and TerminateLiteral().
-  bool is_literal_ascii() {
-    ASSERT_NOT_NULL(current_.literal_chars);
-    return current_.literal_chars->is_ascii();
-  }
-  Vector<const char> literal_ascii_string() {
-    ASSERT_NOT_NULL(current_.literal_chars);
-    return current_.literal_chars->ascii_literal();
-  }
-  Vector<const uc16> literal_uc16_string() {
-    ASSERT_NOT_NULL(current_.literal_chars);
-    return current_.literal_chars->uc16_literal();
-  }
-  int literal_length() const {
-    ASSERT_NOT_NULL(current_.literal_chars);
-    return current_.literal_chars->length();
-  }
-
-  bool literal_contains_escapes() const {
-    Location location = current_.location;
-    int source_length = (location.end_pos - location.beg_pos);
-    if (current_.token == Token::STRING) {
-      // Subtract delimiters.
-      source_length -= 2;
-    }
-    return current_.literal_chars->length() != source_length;
-  }
-
-  // Returns the literal string for the next token (the token that
-  // would be returned if Next() were called).
-  bool is_next_literal_ascii() {
-    ASSERT_NOT_NULL(next_.literal_chars);
-    return next_.literal_chars->is_ascii();
-  }
-  Vector<const char> next_literal_ascii_string() {
-    ASSERT_NOT_NULL(next_.literal_chars);
-    return next_.literal_chars->ascii_literal();
-  }
-  Vector<const uc16> next_literal_uc16_string() {
-    ASSERT_NOT_NULL(next_.literal_chars);
-    return next_.literal_chars->uc16_literal();
-  }
-  int next_literal_length() const {
-    ASSERT_NOT_NULL(next_.literal_chars);
-    return next_.literal_chars->length();
-  }
-
-  UnicodeCache* unicode_cache() { return unicode_cache_; }
-
-  static const int kCharacterLookaheadBufferSize = 1;
-
- protected:
-  // The current and look-ahead token.
-  struct TokenDesc {
-    Token::Value token;
-    Location location;
-    LiteralBuffer* literal_chars;
-  };
-
-  // Call this after setting source_ to the input.
-  void Init() {
-    // Set c0_ (one character ahead)
-    STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
-    Advance();
-    // Initialize current_ to not refer to a literal.
-    current_.literal_chars = NULL;
-  }
-
-  // Literal buffer support
-  inline void StartLiteral() {
-    LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
-            &literal_buffer2_ : &literal_buffer1_;
-    free_buffer->Reset();
-    next_.literal_chars = free_buffer;
-  }
-
-  inline void AddLiteralChar(uc32 c) {
-    ASSERT_NOT_NULL(next_.literal_chars);
-    next_.literal_chars->AddChar(c);
-  }
-
-  // Complete scanning of a literal.
-  inline void TerminateLiteral() {
-    // Does nothing in the current implementation.
-  }
-
-  // Stops scanning of a literal and drop the collected characters,
-  // e.g., due to an encountered error.
-  inline void DropLiteral() {
-    next_.literal_chars = NULL;
-  }
-
-  inline void AddLiteralCharAdvance() {
-    AddLiteralChar(c0_);
-    Advance();
-  }
-
-  // Low-level scanning support.
-  void Advance() { c0_ = source_->Advance(); }
-  void PushBack(uc32 ch) {
-    source_->PushBack(c0_);
-    c0_ = ch;
-  }
-
-  inline Token::Value Select(Token::Value tok) {
-    Advance();
-    return tok;
-  }
-
-  inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
-    Advance();
-    if (c0_ == next) {
-      Advance();
-      return then;
-    } else {
-      return else_;
-    }
-  }
-
-  uc32 ScanHexNumber(int expected_length);
-
-  // Return the current source position.
-  int source_pos() {
-    return source_->pos() - kCharacterLookaheadBufferSize;
-  }
-
-  UnicodeCache* unicode_cache_;
-
-  // Buffers collecting literal strings, numbers, etc.
-  LiteralBuffer literal_buffer1_;
-  LiteralBuffer literal_buffer2_;
-
-  TokenDesc current_;  // desc for current token (as returned by Next())
-  TokenDesc next_;     // desc for next token (one token look-ahead)
-
-  // Input stream. Must be initialized to an UC16CharacterStream.
-  UC16CharacterStream* source_;
-
-  // One Unicode character look-ahead; c0_ < 0 at the end of the input.
-  uc32 c0_;
-};
-
-// ----------------------------------------------------------------------------
-// JavaScriptScanner - base logic for JavaScript scanning.
-
-class JavaScriptScanner : public Scanner {
- public:
-  // A LiteralScope that disables recording of some types of JavaScript
-  // literals. If the scanner is configured to not record the specific
-  // type of literal, the scope will not call StartLiteral.
-  class LiteralScope {
-   public:
-    explicit LiteralScope(JavaScriptScanner* self)
-        : scanner_(self), complete_(false) {
-      scanner_->StartLiteral();
-    }
-     ~LiteralScope() {
-       if (!complete_) scanner_->DropLiteral();
-     }
-    void Complete() {
-      scanner_->TerminateLiteral();
-      complete_ = true;
-    }
-
-   private:
-    JavaScriptScanner* scanner_;
-    bool complete_;
-  };
-
-  explicit JavaScriptScanner(UnicodeCache* scanner_contants);
-
-  void Initialize(UC16CharacterStream* source);
-
-  // Returns the next token.
-  Token::Value Next();
-
-  // Returns true if there was a line terminator before the peek'ed token,
-  // possibly inside a multi-line comment.
-  bool HasAnyLineTerminatorBeforeNext() const {
-    return has_line_terminator_before_next_ ||
-           has_multiline_comment_before_next_;
-  }
-
-  // Scans the input as a regular expression pattern, previous
-  // character(s) must be /(=). Returns true if a pattern is scanned.
-  bool ScanRegExpPattern(bool seen_equal);
-  // Returns true if regexp flags are scanned (always since flags can
-  // be empty).
-  bool ScanRegExpFlags();
-
-  // Tells whether the buffer contains an identifier (no escapes).
-  // Used for checking if a property name is an identifier.
-  static bool IsIdentifier(unibrow::CharacterStream* buffer);
-
-  // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
-  uc32 ScanOctalEscape(uc32 c, int length);
-
-  // Returns the location of the last seen octal literal
-  Location octal_position() const { return octal_pos_; }
-  void clear_octal_position() { octal_pos_ = Location::invalid(); }
-
-  // Seek forward to the given position.  This operation does not
-  // work in general, for instance when there are pushed back
-  // characters, but works for seeking forward until simple delimiter
-  // tokens, which is what it is used for.
-  void SeekForward(int pos);
-
-  bool HarmonyBlockScoping() const {
-    return harmony_block_scoping_;
-  }
-  void SetHarmonyBlockScoping(bool block_scoping) {
-    harmony_block_scoping_ = block_scoping;
-  }
-
-
- protected:
-  bool SkipWhiteSpace();
-  Token::Value SkipSingleLineComment();
-  Token::Value SkipMultiLineComment();
-
-  // Scans a single JavaScript token.
-  void Scan();
-
-  void ScanDecimalDigits();
-  Token::Value ScanNumber(bool seen_period);
-  Token::Value ScanIdentifierOrKeyword();
-  Token::Value ScanIdentifierSuffix(LiteralScope* literal);
-
-  void ScanEscape();
-  Token::Value ScanString();
-
-  // Scans a possible HTML comment -- begins with '<!'.
-  Token::Value ScanHtmlComment();
-
-  // Decodes a unicode escape-sequence which is part of an identifier.
-  // If the escape sequence cannot be decoded the result is kBadChar.
-  uc32 ScanIdentifierUnicodeEscape();
-  // Recognizes a uniocde escape-sequence and adds its characters,
-  // uninterpreted, to the current literal. Used for parsing RegExp
-  // flags.
-  bool ScanLiteralUnicodeEscape();
-
-  // Start position of the octal literal last scanned.
-  Location octal_pos_;
-
-  // Whether there is a line terminator whitespace character after
-  // the current token, and  before the next. Does not count newlines
-  // inside multiline comments.
-  bool has_line_terminator_before_next_;
-  // Whether there is a multi-line comment that contains a
-  // line-terminator after the current token, and before the next.
-  bool has_multiline_comment_before_next_;
-  // Whether we scan 'let' as a keyword for harmony block scoped
-  // let bindings.
-  bool harmony_block_scoping_;
-};
-
-} }  // namespace v8::internal
-
-#endif  // V8_SCANNER_BASE_H_
diff --git a/src/scanner-character-streams.cc b/src/scanner-character-streams.cc

new file mode 100644 (file)

index 0000000..2c1ccea
--- /dev/null
+++ b/src/scanner-character-streams.cc
@@ -0,0 +1,328 @@
+// Copyright 2011 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "v8.h"
+
+#include "scanner-character-streams.h"
+
+#include "ast.h"
+#include "handles.h"
+#include "unicode-inl.h"
+
+namespace v8 {
+namespace internal {
+
+// ----------------------------------------------------------------------------
+// BufferedUC16CharacterStreams
+
+// Creates a stream that is not in pushback mode and whose buffer is empty.
+BufferedUC16CharacterStream::BufferedUC16CharacterStream()
+    : UC16CharacterStream(), pushback_limit_(NULL) {
+  // Cursor and end coincide at the buffer start; the first read fills it.
+  buffer_end_ = buffer_cursor_ = buffer_;
+}
+
+// The buffer is an in-object array, so there is nothing to release.
+BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }
+
+// Un-reads |character|. Pushing back end-of-input only rewinds pos_.
+void BufferedUC16CharacterStream::PushBack(uc32 character) {
+  if (character != kEndOfInput) {
+    if (pushback_limit_ != NULL || buffer_cursor_ <= buffer_) {
+      // Already in pushback mode, or no room in front of the cursor.
+      SlowPushBack(static_cast<uc16>(character));
+      return;
+    }
+    // buffer_cursor_ points to const data, so store through buffer_.
+    buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
+  }
+  pos_--;
+}
+
+
+void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
+  // Slow path of PushBack(): stores |character| just before the cursor.
+  // In pushback mode, the end of the buffer contains pushback,
+  // and the start of the buffer (from buffer start to pushback_limit_)
+  // contains valid data that comes just after the pushback.
+  // We NULL the pushback_limit_ if pushing all the way back to the
+  // start of the buffer.
+  if (pushback_limit_ == NULL) {
+    // Enter pushback mode.
+    pushback_limit_ = buffer_end_;  // Remembers where the valid data ends.
+    buffer_end_ = buffer_ + kBufferSize;
+    buffer_cursor_ = buffer_end_;  // Pushback grows down from the very end.
+  }
+  // Ensure that there is room for at least one pushback.
+  ASSERT(buffer_cursor_ > buffer_);
+  ASSERT(pos_ > 0);
+  buffer_[--buffer_cursor_ - buffer_] = character;
+  if (buffer_cursor_ == buffer_) {
+    pushback_limit_ = NULL;
+  } else if (buffer_cursor_ < pushback_limit_) {
+    pushback_limit_ = buffer_cursor_;  // Pushback overwrote retained data.
+  }
+  pos_--;
+}
+
+
+// Makes more characters available at buffer_cursor_; false if none remain.
+bool BufferedUC16CharacterStream::ReadBlock() {
+  buffer_cursor_ = buffer_;
+  const uc16* const retained_end = pushback_limit_;
+  if (retained_end != NULL) {
+    // Leaving pushback mode: data kept at the buffer start is valid again.
+    pushback_limit_ = NULL;
+    buffer_end_ = retained_end;
+    if (retained_end > buffer_) return true;
+  }
+  // Nothing retained, so fetch a fresh block starting at pos_.
+  const unsigned filled = FillBuffer(pos_, kBufferSize);
+  buffer_end_ = buffer_ + filled;
+  return filled != 0;
+}
+
+
+unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
+  // Seeking discards the pushback state: any still-valid data in front of
+  // pushback_limit_ is deliberately ignored. The subclass does the seek.
+  pushback_limit_ = NULL;
+  return BufferSeekForward(delta);
+}
+
+// ----------------------------------------------------------------------------
+// GenericStringUC16CharacterStream
+
+
+// Streams characters [start_position, end_position) of |data|.
+GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
+    Handle<String> data,
+    unsigned start_position,
+    unsigned end_position)
+    : string_(data),
+      start_position_(start_position),  // Otherwise left indeterminate.
+      length_(end_position) {
+  ASSERT(end_position >= start_position);
+  buffer_cursor_ = buffer_end_ = buffer_;  // Empty until the first read.
+  pos_ = start_position;
+}
+
+GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }
+
+
+unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
+  const unsigned start = pos_;
+  pos_ = Min(start + delta, length_);  // Never seek past the end.
+  ReadBlock();  // Refill the buffer from the new position.
+  return pos_ - start;
+}
+
+
+// Copies up to |length| characters starting at |from_pos| into buffer_.
+unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,
+                                                      unsigned length) {
+  if (from_pos >= length_) return 0;
+  unsigned to_pos = from_pos + length;
+  if (to_pos > length_) to_pos = length_;  // Clamp to the stream end.
+  String::WriteToFlat<uc16>(*string_, buffer_, from_pos, to_pos);
+  return to_pos - from_pos;
+}
+
+
+// ----------------------------------------------------------------------------
+// Utf8ToUC16CharacterStream
+Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
+                                                     unsigned length)
+    : BufferedUC16CharacterStream(),
+      raw_data_(data),
+      raw_data_length_(length),  // In bytes, not characters.
+      raw_data_pos_(0),  // Byte offset of the raw cursor.
+      raw_character_position_(0) {  // Character index at that offset.
+  ReadBlock();  // Decode the first block right away.
+}
+
+
+Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }
+
+
+unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
+  const unsigned start = pos_;
+  // The raw cursor stops early at the end of the data; adopt where it got to.
+  SetRawPosition(start + delta);
+  pos_ = raw_character_position_;
+  ReadBlock();
+  return pos_ - start;
+}
+
+
+// Decodes up to |length| characters, starting at character index
+// |char_position|, into buffer_ and returns how many were produced.
+unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
+                                               unsigned length) {
+  static const unibrow::uchar kMaxUC16Character = 0xffff;
+  SetRawPosition(char_position);
+  if (raw_character_position_ != char_position) {
+    // Spooling ran off the end of the data before reaching char_position,
+    // so it is not a valid position in the stream.
+    return 0u;
+  }
+  unsigned produced = 0;
+  for (; produced < length && raw_data_pos_ != raw_data_length_; produced++) {
+    unibrow::uchar decoded = raw_data_[raw_data_pos_];
+    if (decoded <= unibrow::Utf8::kMaxOneByteChar) {
+      raw_data_pos_++;  // Single-byte character: used as is.
+    } else {
+      decoded = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
+                                              raw_data_length_ - raw_data_pos_,
+                                              &raw_data_pos_);
+      // Don't allow characters outside of the BMP.
+      if (decoded > kMaxUC16Character) decoded = unibrow::Utf8::kBadChar;
+    }
+    buffer_[produced] = static_cast<uc16>(decoded);
+  }
+  // The raw cursor now sits just past the last character decoded above.
+  raw_character_position_ = char_position + produced;
+  return produced;
+}
+
+
+static const byte kUtf8MultiByteMask = 0xC0;  // Selects the top two bits.
+static const byte kUtf8MultiByteCharStart = 0xC0;  // 11xxxxxx: lead byte.
+static const byte kUtf8MultiByteCharFollower = 0x80;  // 10xxxxxx: follower.
+
+
+#ifdef DEBUG  // Only referenced from ASSERTs in this file.
+static bool IsUtf8MultiCharacterStart(byte first_byte) {
+  return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
+}
+#endif  // DEBUG
+
+
+static bool IsUtf8MultiCharacterFollower(byte later_byte) {
+  // True when the top two bits are 10, i.e. the byte has the form 10xxxxxx.
+  return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
+}
+
+// Steps |*cursor| back so that it indexes the first byte of the UTF-8
+// character that ends just before the old cursor position.
+static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
+  const byte last_byte = buffer[--*cursor];
+  // A single-byte character: the cursor is already at its start.
+  if (last_byte <= unibrow::Utf8::kMaxOneByteChar) return;
+  // Otherwise this is the final byte of a multi-byte encoding. Keep
+  // stepping back over follower bytes until reaching the lead byte, which
+  // is recognized by having its top two bits set.
+  ASSERT(IsUtf8MultiCharacterFollower(last_byte));
+  do { --*cursor; } while (IsUtf8MultiCharacterFollower(buffer[*cursor]));
+  ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor]));
+}
+
+
+// Move the cursor forward to point at the next following UTF-8 character
+// start in the buffer. Does no bounds checking of its own.
+static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
+  byte character = buffer[(*cursor)++];
+  if (character > unibrow::Utf8::kMaxOneByteChar) {
+    // First byte of a multi-byte character encoding. The number of
+    // most-significant one-bits determines the length of the encoding:
+    //  110..... - (0xC0 .. 0xDF) one additional byte (minimum).
+    //  1110.... - (0xE0 .. 0xEF) two additional bytes.
+    //  11110... - (0xF0 .. 0xF7) three additional bytes (maximum).
+    ASSERT(IsUtf8MultiCharacterStart(character));
+    // Nibble k of 0x3211 holds the count for lead bytes with
+    // ((character - 0xC0) >> 4) == k; the shift amount below is 4 * k.
+    unsigned additional_bytes =
+        ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
+    *cursor += additional_bytes;
+    // Sanity-check the last byte just skipped, relative to the cursor.
+    // Indexing from the buffer start (buffer[1 + additional_bytes]) would
+    // inspect an unrelated byte unless the character began at offset zero,
+    // and looking one byte further could run past the end of the data.
+    ASSERT(IsUtf8MultiCharacterFollower(buffer[*cursor - 1]));
+  }
+}
+
+
+void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {
+  // Spool backwards in the utf8 buffer if the target lies behind us.
+  while (raw_character_position_ > target_position) {
+    Utf8CharacterBack(raw_data_, &raw_data_pos_);
+    raw_character_position_--;
+  }
+  // Otherwise spool forwards, stopping early at the end of the data.
+  while (raw_character_position_ < target_position) {
+    if (raw_data_pos_ == raw_data_length_) return;
+    Utf8CharacterForward(raw_data_, &raw_data_pos_);
+    // A truncated multi-byte sequence at the end of the data would move
+    // the cursor past raw_data_length_, defeating the check above.
+    if (raw_data_pos_ > raw_data_length_) raw_data_pos_ = raw_data_length_;
+    raw_character_position_++;
+  }
+}
+
+
+// ----------------------------------------------------------------------------
+// ExternalTwoByteStringUC16CharacterStream
+
+ExternalTwoByteStringUC16CharacterStream::
+    ~ExternalTwoByteStringUC16CharacterStream() { }
+
+// Reads [start_position, end_position) of |data| directly, without copying.
+ExternalTwoByteStringUC16CharacterStream
+    ::ExternalTwoByteStringUC16CharacterStream(
+        Handle<ExternalTwoByteString> data,
+        int start_position,
+        int end_position)
+    : UC16CharacterStream(),
+      source_(data),
+      raw_data_(data->GetTwoByteData(start_position)) {
+  buffer_cursor_ = raw_data_;
+  buffer_end_ = raw_data_ + (end_position - start_position);
+  pos_ = start_position;
+}
+
+
+// ----------------------------------------------------------------------------
+// Scanner::LiteralScope
+
+Scanner::LiteralScope::LiteralScope(Scanner* self)
+    : scanner_(self), complete_(false) {
+  self->StartLiteral();  // Dropped in the destructor unless completed.
+}
+
+// Calls DropLiteral() unless Complete() was called on this scope.
+Scanner::LiteralScope::~LiteralScope() {
+  if (!complete_) scanner_->DropLiteral();
+}
+
+// Calls TerminateLiteral() and stops the destructor from dropping it.
+void Scanner::LiteralScope::Complete() {
+  scanner_->TerminateLiteral();
+  complete_ = true;
+}
+
+} }  // namespace v8::internal
diff --git a/src/scanner-character-streams.h b/src/scanner-character-streams.h

new file mode 100644 (file)

index 0000000..5c4ea2c
--- /dev/null
+++ b/src/scanner-character-streams.h
@@ -0,0 +1,129 @@
+// Copyright 2011 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef V8_SCANNER_CHARACTER_STREAMS_H_
+#define V8_SCANNER_CHARACTER_STREAMS_H_
+
+#include "scanner.h"
+
+namespace v8 {
+namespace internal {
+
+// A buffered character stream based on a random access character
+// source (ReadBlock can be called with pos_ pointing to any position,
+// even positions before the current).
+class BufferedUC16CharacterStream: public UC16CharacterStream {
+ public:
+  BufferedUC16CharacterStream();
+  virtual ~BufferedUC16CharacterStream();
+
+  virtual void PushBack(uc32 character);
+
+ protected:
+  static const unsigned kBufferSize = 512;  // Capacity of buffer_ in uc16s.
+  static const unsigned kPushBackStepSize = 16;  // Unused in the .cc file.
+
+  virtual unsigned SlowSeekForward(unsigned delta);
+  virtual bool ReadBlock();
+  virtual void SlowPushBack(uc16 character);
+  // Subclass hooks: seek by |delta| characters / fill buffer_ from |position|.
+  virtual unsigned BufferSeekForward(unsigned delta) = 0;
+  virtual unsigned FillBuffer(unsigned position, unsigned length) = 0;
+  // Non-NULL only in pushback mode: end of the valid data kept in buffer_.
+  const uc16* pushback_limit_;
+  uc16 buffer_[kBufferSize];
+};
+
+
+// Generic string stream: buffers characters copied out of a String handle.
+class GenericStringUC16CharacterStream: public BufferedUC16CharacterStream {
+ public:
+  GenericStringUC16CharacterStream(Handle<String> data,
+                                   unsigned start_position,
+                                   unsigned end_position);
+  virtual ~GenericStringUC16CharacterStream();
+
+ protected:
+  virtual unsigned BufferSeekForward(unsigned delta);
+  virtual unsigned FillBuffer(unsigned position, unsigned length);
+
+  Handle<String> string_;
+  unsigned start_position_;  // Not read by any method in the .cc file.
+  unsigned length_;  // End position in string_ (exclusive), not a count.
+};
+
+
+// UC16 stream based on a literal UTF-8 string.
+class Utf8ToUC16CharacterStream: public BufferedUC16CharacterStream {
+ public:
+  Utf8ToUC16CharacterStream(const byte* data, unsigned length);
+  virtual ~Utf8ToUC16CharacterStream();
+
+ protected:
+  virtual unsigned BufferSeekForward(unsigned delta);
+  virtual unsigned FillBuffer(unsigned char_position, unsigned length);
+  void SetRawPosition(unsigned char_position);  // Spools the raw cursor.
+
+  const byte* raw_data_;  // The UTF-8 bytes; the pointer is stored as given.
+  unsigned raw_data_length_;  // Measured in bytes, not characters.
+  unsigned raw_data_pos_;  // Byte offset of the raw cursor in raw_data_.
+  // The character position of the character at raw_data[raw_data_pos_].
+  // Not necessarily the same as pos_.
+  unsigned raw_character_position_;
+};
+
+
+// UTF16 buffer to read characters from an external string.
+class ExternalTwoByteStringUC16CharacterStream: public UC16CharacterStream {
+ public:
+  ExternalTwoByteStringUC16CharacterStream(Handle<ExternalTwoByteString> data,
+                                           int start_position,
+                                           int end_position);
+  virtual ~ExternalTwoByteStringUC16CharacterStream();
+
+  virtual void PushBack(uc32 character) {  // |character| is not used.
+    ASSERT(buffer_cursor_ > raw_data_);
+    buffer_cursor_--;  // The data is still in place; just step back.
+    pos_--;
+  }
+
+ protected:
+  virtual unsigned SlowSeekForward(unsigned delta) {
+    // Fast case always handles seeking.
+    return 0;
+  }
+  virtual bool ReadBlock() {
+    // Entire string is read at start.
+    return false;
+  }
+  Handle<ExternalTwoByteString> source_;  // String that raw_data_ points into.
+  const uc16* raw_data_;  // Pointer to the actual array of characters.
+};
+
+} }  // namespace v8::internal
+
+#endif  // V8_SCANNER_CHARACTER_STREAMS_H_
diff --git a/src/scanner.cc b/src/scanner.cc

index 5919073cde8c13a74c1fe3fb2cba5e8199787614..3425f4159e571d4bdfa0eb83d3064ac244821a61 100644 (file)
--- a/src/scanner.cc
+++ b/src/scanner.cc
@@ -25,303 +25,1067 @@
  // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  
-#include "v8.h"
+// Features shared by parsing and pre-parsing scanners.
  
-#include "ast.h"
-#include "handles.h"
  #include "scanner.h"
-#include "unicode-inl.h"
+
+#include "../include/v8stdint.h"
+#include "char-predicates-inl.h"
  
  namespace v8 {
  namespace internal {
  
  // ----------------------------------------------------------------------------
-// BufferedUC16CharacterStreams
-
-BufferedUC16CharacterStream::BufferedUC16CharacterStream()
-    : UC16CharacterStream(),
-      pushback_limit_(NULL) {
-  // Initialize buffer as being empty. First read will fill the buffer.
-  buffer_cursor_ = buffer_;
-  buffer_end_ = buffer_;
+// Scanner
+
+Scanner::Scanner(UnicodeCache* unicode_cache)
+    : unicode_cache_(unicode_cache) { }  // Only stores the pointer.
+
+
+uc32 Scanner::ScanHexNumber(int expected_length) {
+  // Reads exactly expected_length hex digits starting at c0_ and returns
+  // their value. At the first non-hex character the digits consumed so far
+  // are pushed back and -1 is returned.
+  ASSERT(expected_length <= 4);  // prevent overflow
+  uc32 digits[4] = { 0, 0, 0, 0 };
+  uc32 x = 0;
+  for (int i = 0; i < expected_length; i++) {
+    digits[i] = c0_;
+    int d = HexValue(c0_);
+    if (d < 0) {
+      // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
+      // should be illegal, but other JS VMs just return the
+      // non-escaped version of the original character.
+      // Push back digits that we have advanced past.
+      for (int j = i-1; j >= 0; j--) {
+        PushBack(digits[j]);
+      }
+      return -1;
+    }
+    x = x * 16 + d;
+    Advance();
+  }
+  return x;
  }
  
-BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }
  
-void BufferedUC16CharacterStream::PushBack(uc32 character) {
-  if (character == kEndOfInput) {
-    pos_--;
-    return;
+
+// ----------------------------------------------------------------------------
+// JavaScriptScanner
+
+JavaScriptScanner::JavaScriptScanner(UnicodeCache* unicode_cache)
+    : Scanner(unicode_cache),
+      octal_pos_(Location::invalid()),  // No octal literal scanned yet.
+      harmony_block_scoping_(false) { }  // Off until explicitly enabled.
+
+
+void JavaScriptScanner::Initialize(UC16CharacterStream* source) {
+  source_ = source;
+  // Capture identifiers so "get" and "set" are recognized in object literals.
+  Init();
+  // Skip initial whitespace allowing HTML comment ends just like
+  // after a newline and scan first token.
+  has_line_terminator_before_next_ = true;
+  has_multiline_comment_before_next_ = false;  // Scan() only ever sets it.
+  SkipWhiteSpace();
+  Scan();
+}
+
+
+// Ensure that tokens can be stored in a byte.
+STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
+
+// Table of one-character tokens, by character (0x00..0x7f only).
+static const byte one_char_tokens[] = {
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::LPAREN,       // 0x28
+  Token::RPAREN,       // 0x29
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::COMMA,        // 0x2c
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::COLON,        // 0x3a
+  Token::SEMICOLON,    // 0x3b
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::CONDITIONAL,  // 0x3f
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::LBRACK,     // 0x5b
+  Token::ILLEGAL,
+  Token::RBRACK,     // 0x5d
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::ILLEGAL,
+  Token::LBRACE,       // 0x7b
+  Token::ILLEGAL,
+  Token::RBRACE,       // 0x7d
+  Token::BIT_NOT,      // 0x7e
+  Token::ILLEGAL
+};
+
+
+Token::Value JavaScriptScanner::Next() {
+  current_ = next_;  // The previously scanned token becomes current.
+  has_line_terminator_before_next_ = false;
+  has_multiline_comment_before_next_ = false;
+  if (static_cast<unsigned>(c0_) <= 0x7f) {  // Also rejects negative c0_.
+    Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
+    if (token != Token::ILLEGAL) {  // Fast path: one-character token.
+      int pos = source_pos();
+      next_.token = token;
+      next_.location.beg_pos = pos;
+      next_.location.end_pos = pos + 1;
+      Advance();
+      return current_.token;
+    }
    }
-  if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {
-    // buffer_ is writable, buffer_cursor_ is const pointer.
-    buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
-    pos_--;
-    return;
+  Scan();  // General path; fills in next_.
+  return current_.token;
+}
+
+
+static inline bool IsByteOrderMark(uc32 c) {
+  // U+FFFE is guaranteed never to be assigned as a Unicode character, so
+  // the byte pattern 0xFF, 0xFE can only be a little-endian U+FEFF and
+  // never a big-endian U+FFFE. We nevertheless accept U+FFFE as well, to
+  // be compatible with Spidermonkey.
+  switch (c) {
+    case 0xFEFF: case 0xFFFE: return true;
+    default: return false;
+  }
+}
+
+
+bool JavaScriptScanner::SkipWhiteSpace() {
+  int start_position = source_pos();
+  // Skips whitespace, byte-order marks and line-initial '-->' comments.
+  while (true) {
+    // We treat byte-order marks (BOMs) as whitespace for better
+    // compatibility with Spidermonkey and other JavaScript engines.
+    while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
+      // IsWhiteSpace() includes line terminators!
+      if (unicode_cache_->IsLineTerminator(c0_)) {
+        // Ignore line terminators, but remember them. This is necessary
+        // for automatic semicolon insertion.
+        has_line_terminator_before_next_ = true;
+      }
+      Advance();
+    }
+
+    // If there is an HTML comment end '-->' at the beginning of a
+    // line (with only whitespace in front of it), we treat the rest
+    // of the line as a comment. This is in line with the way
+    // SpiderMonkey handles it.
+    if (c0_ == '-' && has_line_terminator_before_next_) {
+      Advance();
+      if (c0_ == '-') {
+        Advance();
+        if (c0_ == '>') {
+          // Treat the rest of the line as a comment.
+          SkipSingleLineComment();
+          // Continue skipping white space after the comment.
+          continue;
+        }
+        PushBack('-');  // undo Advance()
+      }
+      PushBack('-');  // undo Advance()
+    }
+    // Return whether or not we skipped any characters.
+    return source_pos() != start_position;
    }
-  SlowPushBack(static_cast<uc16>(character));
  }
  
  
-void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
-  // In pushback mode, the end of the buffer contains pushback,
-  // and the start of the buffer (from buffer start to pushback_limit_)
-  // contains valid data that comes just after the pushback.
-  // We NULL the pushback_limit_ if pushing all the way back to the
-  // start of the buffer.
+Token::Value JavaScriptScanner::SkipSingleLineComment() {
+  Advance();  // Consume the last character of the comment opener.
  
-  if (pushback_limit_ == NULL) {
-    // Enter pushback mode.
-    pushback_limit_ = buffer_end_;
-    buffer_end_ = buffer_ + kBufferSize;
-    buffer_cursor_ = buffer_end_;
+  // The line terminator at the end of the line is not considered
+  // to be part of the single-line comment; it is recognized
+  // separately by the lexical grammar and becomes part of the
+  // stream of input elements for the syntactic grammar (see
+  // ECMA-262, section 7.4).
+  while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
+    Advance();  // A negative c0_ (end of input) also ends the comment.
    }
-  // Ensure that there is room for at least one pushback.
-  ASSERT(buffer_cursor_ > buffer_);
-  ASSERT(pos_ > 0);
-  buffer_[--buffer_cursor_ - buffer_] = character;
-  if (buffer_cursor_ == buffer_) {
-    pushback_limit_ = NULL;
-  } else if (buffer_cursor_ < pushback_limit_) {
-    pushback_limit_ = buffer_cursor_;
+
+  return Token::WHITESPACE;
+}
+
+
+Token::Value JavaScriptScanner::SkipMultiLineComment() {
+  ASSERT(c0_ == '*');
+  Advance();  // Skip the '*' that opened the comment.
+
+  while (c0_ >= 0) {  // A negative c0_ means end of input.
+    uc32 ch = c0_;
+    Advance();
+    if (unicode_cache_->IsLineTerminator(ch)) {
+      // Following ECMA-262, section 7.4, a comment containing
+      // a newline will make the comment count as a line-terminator.
+      has_multiline_comment_before_next_ = true;
+    }
+    // If we have reached the end of the multi-line comment, we
+    // consume the '/' and insert a whitespace. This way all
+    // multi-line comments are treated as whitespace.
+    if (ch == '*' && c0_ == '/') {
+      c0_ = ' ';  // Overwrites the '/' without advancing the stream.
+      return Token::WHITESPACE;
+    }
    }
-  pos_--;
+
+  // Unterminated multi-line comment.
+  return Token::ILLEGAL;
  }
  
  
-bool BufferedUC16CharacterStream::ReadBlock() {
-  buffer_cursor_ = buffer_;
-  if (pushback_limit_ != NULL) {
-    // Leave pushback mode.
-    buffer_end_ = pushback_limit_;
-    pushback_limit_ = NULL;
-    // If there were any valid characters left at the
-    // start of the buffer, use those.
-    if (buffer_cursor_ < buffer_end_) return true;
-    // Otherwise read a new block.
+Token::Value JavaScriptScanner::ScanHtmlComment() {
+  // Check for <!-- comments; the caller has consumed the '<' already.
+  ASSERT(c0_ == '!');
+  Advance();
+  if (c0_ == '-') {
+    Advance();
+    if (c0_ == '-') return SkipSingleLineComment();
+    PushBack('-');  // undo Advance()
    }
-  unsigned length = FillBuffer(pos_, kBufferSize);
-  buffer_end_ = buffer_ + length;
-  return length > 0;
+  PushBack('!');  // undo Advance()
+  ASSERT(c0_ == '!');
+  return Token::LT;  // Not a comment: the '<' stands alone.
  }
  
  
-unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
-  // Leave pushback mode (i.e., ignore that there might be valid data
-  // in the buffer before the pushback_limit_ point).
-  pushback_limit_ = NULL;
-  return BufferSeekForward(delta);
+void JavaScriptScanner::Scan() {
+  next_.literal_chars = NULL;  // Cleared before scanning the next token.
+  Token::Value token;
+  do {
+    // Remember the position of the next token
+    next_.location.beg_pos = source_pos();
+
+    switch (c0_) {
+      case ' ':
+      case '\t':
+        Advance();
+        token = Token::WHITESPACE;
+        break;
+
+      case '\n':
+        Advance();
+        has_line_terminator_before_next_ = true;
+        token = Token::WHITESPACE;
+        break;
+
+      case '"': case '\'':
+        token = ScanString();
+        break;
+
+      case '<':
+        // < <= << <<= <!--
+        Advance();
+        if (c0_ == '=') {
+          token = Select(Token::LTE);
+        } else if (c0_ == '<') {
+          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
+        } else if (c0_ == '!') {
+          token = ScanHtmlComment();
+        } else {
+          token = Token::LT;
+        }
+        break;
+
+      case '>':
+        // > >= >> >>= >>> >>>=
+        Advance();
+        if (c0_ == '=') {
+          token = Select(Token::GTE);
+        } else if (c0_ == '>') {
+          // >> >>= >>> >>>=
+          Advance();
+          if (c0_ == '=') {
+            token = Select(Token::ASSIGN_SAR);
+          } else if (c0_ == '>') {
+            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
+          } else {
+            token = Token::SAR;
+          }
+        } else {
+          token = Token::GT;
+        }
+        break;
+
+      case '=':
+        // = == ===
+        Advance();
+        if (c0_ == '=') {
+          token = Select('=', Token::EQ_STRICT, Token::EQ);
+        } else {
+          token = Token::ASSIGN;
+        }
+        break;
+
+      case '!':
+        // ! != !==
+        Advance();
+        if (c0_ == '=') {
+          token = Select('=', Token::NE_STRICT, Token::NE);
+        } else {
+          token = Token::NOT;
+        }
+        break;
+
+      case '+':
+        // + ++ +=
+        Advance();
+        if (c0_ == '+') {
+          token = Select(Token::INC);
+        } else if (c0_ == '=') {
+          token = Select(Token::ASSIGN_ADD);
+        } else {
+          token = Token::ADD;
+        }
+        break;
+
+      case '-':
+        // - -- --> -=
+        Advance();
+        if (c0_ == '-') {
+          Advance();
+          if (c0_ == '>' && has_line_terminator_before_next_) {
+            // For compatibility with SpiderMonkey, we skip lines that
+            // start with an HTML comment end '-->'.
+            token = SkipSingleLineComment();
+          } else {
+            token = Token::DEC;
+          }
+        } else if (c0_ == '=') {
+          token = Select(Token::ASSIGN_SUB);
+        } else {
+          token = Token::SUB;
+        }
+        break;
+
+      case '*':
+        // * *=
+        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
+        break;
+
+      case '%':
+        // % %=
+        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
+        break;
+
+      case '/':
+        // /  // /* /=
+        Advance();
+        if (c0_ == '/') {
+          token = SkipSingleLineComment();
+        } else if (c0_ == '*') {
+          token = SkipMultiLineComment();
+        } else if (c0_ == '=') {
+          token = Select(Token::ASSIGN_DIV);
+        } else {
+          token = Token::DIV;
+        }
+        break;
+
+      case '&':
+        // & && &=
+        Advance();
+        if (c0_ == '&') {
+          token = Select(Token::AND);
+        } else if (c0_ == '=') {
+          token = Select(Token::ASSIGN_BIT_AND);
+        } else {
+          token = Token::BIT_AND;
+        }
+        break;
+
+      case '|':
+        // | || |=
+        Advance();
+        if (c0_ == '|') {
+          token = Select(Token::OR);
+        } else if (c0_ == '=') {
+          token = Select(Token::ASSIGN_BIT_OR);
+        } else {
+          token = Token::BIT_OR;
+        }
+        break;
+
+      case '^':
+        // ^ ^=
+        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
+        break;
+
+      case '.':
+        // . Number
+        Advance();
+        if (IsDecimalDigit(c0_)) {
+          token = ScanNumber(true);
+        } else {
+          token = Token::PERIOD;
+        }
+        break;
+
+      case ':':
+        token = Select(Token::COLON);
+        break;
+
+      case ';':
+        token = Select(Token::SEMICOLON);
+        break;
+
+      case ',':
+        token = Select(Token::COMMA);
+        break;
+
+      case '(':
+        token = Select(Token::LPAREN);
+        break;
+
+      case ')':
+        token = Select(Token::RPAREN);
+        break;
+
+      case '[':
+        token = Select(Token::LBRACK);
+        break;
+
+      case ']':
+        token = Select(Token::RBRACK);
+        break;
+
+      case '{':
+        token = Select(Token::LBRACE);
+        break;
+
+      case '}':
+        token = Select(Token::RBRACE);
+        break;
+
+      case '?':
+        token = Select(Token::CONDITIONAL);
+        break;
+
+      case '~':
+        token = Select(Token::BIT_NOT);
+        break;
+
+      default:
+        if (unicode_cache_->IsIdentifierStart(c0_)) {
+          token = ScanIdentifierOrKeyword();
+        } else if (IsDecimalDigit(c0_)) {
+          token = ScanNumber(false);
+        } else if (SkipWhiteSpace()) {
+          token = Token::WHITESPACE;
+        } else if (c0_ < 0) {  // A negative c0_ means end of input.
+          token = Token::EOS;
+        } else {
+          token = Select(Token::ILLEGAL);
+        }
+        break;
+    }
+
+    // Continue scanning for tokens as long as we're just skipping
+    // whitespace.
+  } while (token == Token::WHITESPACE);
+
+  next_.location.end_pos = source_pos();  // One past the token's last char.
+  next_.token = token;
  }
  
-// ----------------------------------------------------------------------------
-// GenericStringUC16CharacterStream
-
-
-GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
-    Handle<String> data,
-    unsigned start_position,
-    unsigned end_position)
-    : string_(data),
-      length_(end_position) {
-  ASSERT(end_position >= start_position);
-  buffer_cursor_ = buffer_;
-  buffer_end_ = buffer_;
-  pos_ = start_position;
+
+void JavaScriptScanner::SeekForward(int pos) {
+  // After this call, we will have the token at the given position as
+  // the "next" token. The "current" token will be invalid.
+  if (pos == next_.location.beg_pos) return;
+  int current_pos = source_pos();
+  ASSERT_EQ(next_.location.end_pos, current_pos);
+  // Positions inside the lookahead token aren't supported.
+  ASSERT(pos >= current_pos);
+  if (pos != current_pos) {
+    source_->SeekForward(pos - source_->pos());
+    Advance();
+    // This function is only called to seek to the location
+    // of the end of a function (at the "}" token). It doesn't matter
+    // whether there was a line terminator in the part we skip.
+    has_line_terminator_before_next_ = false;
+    has_multiline_comment_before_next_ = false;
+  }
+  Scan();
  }
  
  
-GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }
+void JavaScriptScanner::ScanEscape() {
+  uc32 c = c0_;
+  Advance();
+
+  // Skip escaped newlines.
+  if (unicode_cache_->IsLineTerminator(c)) {
+    // Allow CR+LF newlines in multiline string literals.
+    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
+    // Allow LF+CR newlines in multiline string literals.
+    if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
+    return;
+  }
  
+  switch (c) {
+    case '\'':  // fall through
+    case '"' :  // fall through
+    case '\\': break;
+    case 'b' : c = '\b'; break;
+    case 'f' : c = '\f'; break;
+    case 'n' : c = '\n'; break;
+    case 'r' : c = '\r'; break;
+    case 't' : c = '\t'; break;
+    case 'u' : {
+      c = ScanHexNumber(4);
+      if (c < 0) c = 'u';
+      break;
+    }
+    case 'v' : c = '\v'; break;
+    case 'x' : {
+      c = ScanHexNumber(2);
+      if (c < 0) c = 'x';
+      break;
+    }
+    case '0' :  // fall through
+    case '1' :  // fall through
+    case '2' :  // fall through
+    case '3' :  // fall through
+    case '4' :  // fall through
+    case '5' :  // fall through
+    case '6' :  // fall through
+    case '7' : c = ScanOctalEscape(c, 2); break;
+  }
  
-unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
-  unsigned old_pos = pos_;
-  pos_ = Min(pos_ + delta, length_);
-  ReadBlock();
-  return pos_ - old_pos;
+  // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
+  // should be illegal, but they are commonly handled
+  // as non-escaped characters by JS VMs.
+  AddLiteralChar(c);
  }
  
  
-unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,
-                                                      unsigned length) {
-  if (from_pos >= length_) return 0;
-  if (from_pos + length > length_) {
-    length = length_ - from_pos;
+// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
+// ECMA-262. Other JS VMs support them.
+uc32 JavaScriptScanner::ScanOctalEscape(uc32 c, int length) {
+  uc32 x = c - '0';
+  int i = 0;
+  for (; i < length; i++) {
+    int d = c0_ - '0';
+    if (d < 0 || d > 7) break;
+    int nx = x * 8 + d;
+    if (nx >= 256) break;
+    x = nx;
+    Advance();
+  }
+  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
+  // Remember the position of octal escape sequences so that an error
+  // can be reported later (in strict mode).
+  // We don't report the error immediately, because the octal escape can
+  // occur before the "use strict" directive.
+  if (c != '0' || i > 0) {
+    octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
    }
-  String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
-  return length;
+  return x;
  }
  
  
-// ----------------------------------------------------------------------------
-// Utf8ToUC16CharacterStream
-Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
-                                                     unsigned length)
-    : BufferedUC16CharacterStream(),
-      raw_data_(data),
-      raw_data_length_(length),
-      raw_data_pos_(0),
-      raw_character_position_(0) {
-  ReadBlock();
-}
+Token::Value JavaScriptScanner::ScanString() {
+  uc32 quote = c0_;
+  Advance();  // consume quote
  
+  LiteralScope literal(this);
+  while (c0_ != quote && c0_ >= 0
+         && !unicode_cache_->IsLineTerminator(c0_)) {
+    uc32 c = c0_;
+    Advance();
+    if (c == '\\') {
+      if (c0_ < 0) return Token::ILLEGAL;
+      ScanEscape();
+    } else {
+      AddLiteralChar(c);
+    }
+  }
+  if (c0_ != quote) return Token::ILLEGAL;
+  literal.Complete();
  
-Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }
+  Advance();  // consume quote
+  return Token::STRING;
+}
  
  
-unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
-  unsigned old_pos = pos_;
-  unsigned target_pos = pos_ + delta;
-  SetRawPosition(target_pos);
-  pos_ = raw_character_position_;
-  ReadBlock();
-  return pos_ - old_pos;
+void JavaScriptScanner::ScanDecimalDigits() {
+  while (IsDecimalDigit(c0_))
+    AddLiteralCharAdvance();
  }
  
  
-unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
-                                               unsigned length) {
-  static const unibrow::uchar kMaxUC16Character = 0xffff;
-  SetRawPosition(char_position);
-  if (raw_character_position_ != char_position) {
-    // char_position was not a valid position in the stream (hit the end
-    // while spooling to it).
-    return 0u;
-  }
-  unsigned i = 0;
-  while (i < length) {
-    if (raw_data_pos_ == raw_data_length_) break;
-    unibrow::uchar c = raw_data_[raw_data_pos_];
-    if (c <= unibrow::Utf8::kMaxOneByteChar) {
-      raw_data_pos_++;
-    } else {
-      c =  unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
-                                         raw_data_length_ - raw_data_pos_,
-                                         &raw_data_pos_);
-      // Don't allow characters outside of the BMP.
-      if (c > kMaxUC16Character) {
-        c = unibrow::Utf8::kBadChar;
+Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
+  ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
+
+  enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
+
+  LiteralScope literal(this);
+  if (seen_period) {
+    // we have already seen a decimal point of the float
+    AddLiteralChar('.');
+    ScanDecimalDigits();  // we know we have at least one digit
+
+  } else {
+    // if the first character is '0' we must check for octals and hex
+    if (c0_ == '0') {
+      int start_pos = source_pos();  // For reporting octal positions.
+      AddLiteralCharAdvance();
+
+      // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
+      if (c0_ == 'x' || c0_ == 'X') {
+        // hex number
+        kind = HEX;
+        AddLiteralCharAdvance();
+        if (!IsHexDigit(c0_)) {
+          // we must have at least one hex digit after 'x'/'X'
+          return Token::ILLEGAL;
+        }
+        while (IsHexDigit(c0_)) {
+          AddLiteralCharAdvance();
+        }
+      } else if ('0' <= c0_ && c0_ <= '7') {
+        // (possible) octal number
+        kind = OCTAL;
+        while (true) {
+          if (c0_ == '8' || c0_ == '9') {
+            kind = DECIMAL;
+            break;
+          }
+          if (c0_  < '0' || '7'  < c0_) {
+            // Octal literal finished.
+            octal_pos_ = Location(start_pos, source_pos());
+            break;
+          }
+          AddLiteralCharAdvance();
+        }
+      }
+    }
+
+    // Parse decimal digits and allow trailing fractional part.
+    if (kind == DECIMAL) {
+      ScanDecimalDigits();  // optional
+      if (c0_ == '.') {
+        AddLiteralCharAdvance();
+        ScanDecimalDigits();  // optional
        }
      }
-    buffer_[i++] = static_cast<uc16>(c);
    }
-  raw_character_position_ = char_position + i;
-  return i;
-}
  
+  // scan exponent, if any
+  if (c0_ == 'e' || c0_ == 'E') {
+    ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
+    if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
+    // scan exponent
+    AddLiteralCharAdvance();
+    if (c0_ == '+' || c0_ == '-')
+      AddLiteralCharAdvance();
+    if (!IsDecimalDigit(c0_)) {
+      // we must have at least one decimal digit after 'e'/'E'
+      return Token::ILLEGAL;
+    }
+    ScanDecimalDigits();
+  }
  
-static const byte kUtf8MultiByteMask = 0xC0;
-static const byte kUtf8MultiByteCharStart = 0xC0;
-static const byte kUtf8MultiByteCharFollower = 0x80;
+  // The source character immediately following a numeric literal must
+  // not be an identifier start or a decimal digit; see ECMA-262
+  // section 7.8.3, page 17 (note that we read only one decimal digit
+  // if the value is 0).
+  if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
+    return Token::ILLEGAL;
  
+  literal.Complete();
  
-#ifdef DEBUG
-static bool IsUtf8MultiCharacterStart(byte first_byte) {
-  return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
+  return Token::NUMBER;
  }
-#endif
  
  
-static bool IsUtf8MultiCharacterFollower(byte later_byte) {
-  return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
+uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
+  Advance();
+  if (c0_ != 'u') return -1;
+  Advance();
+  uc32 result = ScanHexNumber(4);
+  if (result < 0) PushBack('u');
+  return result;
  }
  
  
-// Move the cursor back to point at the preceding UTF-8 character start
-// in the buffer.
-static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
-  byte character = buffer[--*cursor];
-  if (character > unibrow::Utf8::kMaxOneByteChar) {
-    ASSERT(IsUtf8MultiCharacterFollower(character));
-    // Last byte of a multi-byte character encoding. Step backwards until
-    // pointing to the first byte of the encoding, recognized by having the
-    // top two bits set.
-    while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
-    ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor]));
+// ----------------------------------------------------------------------------
+// Keyword Matcher
+
+#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                            \
+  KEYWORD_GROUP('b')                                                \
+  KEYWORD("break", Token::BREAK)                                    \
+  KEYWORD_GROUP('c')                                                \
+  KEYWORD("case", Token::CASE)                                      \
+  KEYWORD("catch", Token::CATCH)                                    \
+  KEYWORD("class", Token::FUTURE_RESERVED_WORD)                     \
+  KEYWORD("const", Token::CONST)                                    \
+  KEYWORD("continue", Token::CONTINUE)                              \
+  KEYWORD_GROUP('d')                                                \
+  KEYWORD("debugger", Token::DEBUGGER)                              \
+  KEYWORD("default", Token::DEFAULT)                                \
+  KEYWORD("delete", Token::DELETE)                                  \
+  KEYWORD("do", Token::DO)                                          \
+  KEYWORD_GROUP('e')                                                \
+  KEYWORD("else", Token::ELSE)                                      \
+  KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                      \
+  KEYWORD("export", Token::FUTURE_RESERVED_WORD)                    \
+  KEYWORD("extends", Token::FUTURE_RESERVED_WORD)                   \
+  KEYWORD_GROUP('f')                                                \
+  KEYWORD("false", Token::FALSE_LITERAL)                            \
+  KEYWORD("finally", Token::FINALLY)                                \
+  KEYWORD("for", Token::FOR)                                        \
+  KEYWORD("function", Token::FUNCTION)                              \
+  KEYWORD_GROUP('i')                                                \
+  KEYWORD("if", Token::IF)                                          \
+  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)         \
+  KEYWORD("import", Token::FUTURE_RESERVED_WORD)                    \
+  KEYWORD("in", Token::IN)                                          \
+  KEYWORD("instanceof", Token::INSTANCEOF)                          \
+  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)          \
+  KEYWORD_GROUP('l')                                                \
+  KEYWORD("let", harmony_block_scoping                              \
+                 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
+  KEYWORD_GROUP('n')                                                \
+  KEYWORD("new", Token::NEW)                                        \
+  KEYWORD("null", Token::NULL_LITERAL)                              \
+  KEYWORD_GROUP('p')                                                \
+  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)            \
+  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)            \
+  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)          \
+  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)             \
+  KEYWORD_GROUP('r')                                                \
+  KEYWORD("return", Token::RETURN)                                  \
+  KEYWORD_GROUP('s')                                                \
+  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD)             \
+  KEYWORD("super", Token::FUTURE_RESERVED_WORD)                     \
+  KEYWORD("switch", Token::SWITCH)                                  \
+  KEYWORD_GROUP('t')                                                \
+  KEYWORD("this", Token::THIS)                                      \
+  KEYWORD("throw", Token::THROW)                                    \
+  KEYWORD("true", Token::TRUE_LITERAL)                              \
+  KEYWORD("try", Token::TRY)                                        \
+  KEYWORD("typeof", Token::TYPEOF)                                  \
+  KEYWORD_GROUP('v')                                                \
+  KEYWORD("var", Token::VAR)                                        \
+  KEYWORD("void", Token::VOID)                                      \
+  KEYWORD_GROUP('w')                                                \
+  KEYWORD("while", Token::WHILE)                                    \
+  KEYWORD("with", Token::WITH)                                      \
+  KEYWORD_GROUP('y')                                                \
+  KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD)
+
+
+static Token::Value KeywordOrIdentifierToken(const char* input,
+                                             int input_length,
+                                             bool harmony_block_scoping) {
+  ASSERT(input_length >= 1);
+  const int kMinLength = 2;
+  const int kMaxLength = 10;
+  if (input_length < kMinLength || input_length > kMaxLength) {
+    return Token::IDENTIFIER;
+  }
+  switch (input[0]) {
+    default:
+#define KEYWORD_GROUP_CASE(ch)                                \
+      break;                                                  \
+    case ch:
+#define KEYWORD(keyword, token)                               \
+    {                                                         \
+      /* 'keyword' is a char array, so sizeof(keyword) is */  \
+      /* strlen(keyword) plus 1 for the NUL char. */          \
+      const int keyword_length = sizeof(keyword) - 1;         \
+      STATIC_ASSERT(keyword_length >= kMinLength);            \
+      STATIC_ASSERT(keyword_length <= kMaxLength);            \
+      if (input_length == keyword_length &&                   \
+          input[1] == keyword[1] &&                           \
+          (keyword_length <= 2 || input[2] == keyword[2]) &&  \
+          (keyword_length <= 3 || input[3] == keyword[3]) &&  \
+          (keyword_length <= 4 || input[4] == keyword[4]) &&  \
+          (keyword_length <= 5 || input[5] == keyword[5]) &&  \
+          (keyword_length <= 6 || input[6] == keyword[6]) &&  \
+          (keyword_length <= 7 || input[7] == keyword[7]) &&  \
+          (keyword_length <= 8 || input[8] == keyword[8]) &&  \
+          (keyword_length <= 9 || input[9] == keyword[9])) {  \
+        return token;                                         \
+      }                                                       \
+    }
+    KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
    }
+  return Token::IDENTIFIER;
  }
  
  
-// Move the cursor forward to point at the next following UTF-8 character start
-// in the buffer.
-static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
-  byte character = buffer[(*cursor)++];
-  if (character > unibrow::Utf8::kMaxOneByteChar) {
-    // First character of a multi-byte character encoding.
-    // The number of most-significant one-bits determines the length of the
-    // encoding:
-    //  110..... - (0xCx, 0xDx) one additional byte (minimum).
-    //  1110.... - (0xEx) two additional bytes.
-    //  11110... - (0xFx) three additional bytes (maximum).
-    ASSERT(IsUtf8MultiCharacterStart(character));
-    // Additional bytes is:
-    // 1 if value in range 0xC0 .. 0xDF.
-    // 2 if value in range 0xE0 .. 0xEF.
-    // 3 if value in range 0xF0 .. 0xF7.
-    // Encode that in a single value.
-    unsigned additional_bytes =
-        ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
-    *cursor += additional_bytes;
-    ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
+Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
+  ASSERT(unicode_cache_->IsIdentifierStart(c0_));
+  LiteralScope literal(this);
+  // Scan identifier start character.
+  if (c0_ == '\\') {
+    uc32 c = ScanIdentifierUnicodeEscape();
+    // Only allow legal identifier start characters.
+    if (c < 0 ||
+        c == '\\' ||  // No recursive escapes.
+        !unicode_cache_->IsIdentifierStart(c)) {
+      return Token::ILLEGAL;
+    }
+    AddLiteralChar(c);
+    return ScanIdentifierSuffix(&literal);
    }
-}
  
+  uc32 first_char = c0_;
+  Advance();
+  AddLiteralChar(first_char);
  
-void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {
-  if (raw_character_position_ > target_position) {
-    // Spool backwards in utf8 buffer.
-    do {
-      Utf8CharacterBack(raw_data_, &raw_data_pos_);
-      raw_character_position_--;
-    } while (raw_character_position_ > target_position);
-    return;
+  // Scan the rest of the identifier characters.
+  while (unicode_cache_->IsIdentifierPart(c0_)) {
+    if (c0_ != '\\') {
+      uc32 next_char = c0_;
+      Advance();
+      AddLiteralChar(next_char);
+      continue;
+    }
+    // Fallthrough if no longer able to complete keyword.
+    return ScanIdentifierSuffix(&literal);
    }
-  // Spool forwards in the utf8 buffer.
-  while (raw_character_position_ < target_position) {
-    if (raw_data_pos_ == raw_data_length_) return;
-    Utf8CharacterForward(raw_data_, &raw_data_pos_);
-    raw_character_position_++;
+
+  literal.Complete();
+
+  if (next_.literal_chars->is_ascii()) {
+    Vector<const char> chars = next_.literal_chars->ascii_literal();
+    return KeywordOrIdentifierToken(chars.start(),
+                                    chars.length(),
+                                    harmony_block_scoping_);
    }
+
+  return Token::IDENTIFIER;
  }
  
  
-// ----------------------------------------------------------------------------
-// ExternalTwoByteStringUC16CharacterStream
-
-ExternalTwoByteStringUC16CharacterStream::
-    ~ExternalTwoByteStringUC16CharacterStream() { }
-
-
-ExternalTwoByteStringUC16CharacterStream
-    ::ExternalTwoByteStringUC16CharacterStream(
-        Handle<ExternalTwoByteString> data,
-        int start_position,
-        int end_position)
-    : UC16CharacterStream(),
-      source_(data),
-      raw_data_(data->GetTwoByteData(start_position)) {
-  buffer_cursor_ = raw_data_,
-  buffer_end_ = raw_data_ + (end_position - start_position);
-  pos_ = start_position;
+Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {
+  // Scan the rest of the identifier characters.
+  while (unicode_cache_->IsIdentifierPart(c0_)) {
+    if (c0_ == '\\') {
+      uc32 c = ScanIdentifierUnicodeEscape();
+      // Only allow legal identifier part characters.
+      if (c < 0 ||
+          c == '\\' ||
+          !unicode_cache_->IsIdentifierPart(c)) {
+        return Token::ILLEGAL;
+      }
+      AddLiteralChar(c);
+    } else {
+      AddLiteralChar(c0_);
+      Advance();
+    }
+  }
+  literal->Complete();
+
+  return Token::IDENTIFIER;
  }
  
  
-// ----------------------------------------------------------------------------
-// Scanner::LiteralScope
+bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
+  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
+  bool in_character_class = false;
+
+  // Previous token is either '/' or '/='; in the second case, the
+  // pattern starts at '='.
+  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
+  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
  
-Scanner::LiteralScope::LiteralScope(Scanner* self)
-    : scanner_(self), complete_(false) {
-  self->StartLiteral();
+  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
+  // the scanner should pass uninterpreted bodies to the RegExp
+  // constructor.
+  LiteralScope literal(this);
+  if (seen_equal) {
+    AddLiteralChar('=');
+  }
+
+  while (c0_ != '/' || in_character_class) {
+    if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
+    if (c0_ == '\\') {  // Escape sequence.
+      AddLiteralCharAdvance();
+      if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
+      AddLiteralCharAdvance();
+      // If the escape allows more characters, i.e., \x??, \u????, or \c?,
+      // only "safe" characters are allowed (letters, digits, underscore),
+      // otherwise the escape isn't valid and the invalid character has
+      // its normal meaning. I.e., we can just continue scanning without
+      // worrying whether the following characters are part of the escape
+      // or not, since any '/', '\\' or '[' is guaranteed to not be part
+      // of the escape sequence.
+
+      // TODO(896): At some point, parse RegExps more thoroughly to capture
+      // octal escapes in strict mode.
+    } else {  // Unescaped character.
+      if (c0_ == '[') in_character_class = true;
+      if (c0_ == ']') in_character_class = false;
+      AddLiteralCharAdvance();
+    }
+  }
+  Advance();  // consume '/'
+
+  literal.Complete();
+
+  return true;
  }
  
  
-Scanner::LiteralScope::~LiteralScope() {
-  if (!complete_) scanner_->DropLiteral();
+bool JavaScriptScanner::ScanLiteralUnicodeEscape() {
+  ASSERT(c0_ == '\\');
+  uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
+  Advance();
+  int i = 1;
+  if (c0_ == 'u') {
+    i++;
+    while (i < 6) {
+      Advance();
+      if (!IsHexDigit(c0_)) break;
+      chars_read[i] = c0_;
+      i++;
+    }
+  }
+  if (i < 6) {
+    // Incomplete escape. Undo all advances and return false.
+    while (i > 0) {
+      i--;
+      PushBack(chars_read[i]);
+    }
+    return false;
+  }
+  // Complete escape. Add all chars to current literal buffer.
+  for (int i = 0; i < 6; i++) {
+    AddLiteralChar(chars_read[i]);
+  }
+  return true;
  }
  
  
-void Scanner::LiteralScope::Complete() {
-  scanner_->TerminateLiteral();
-  complete_ = true;
+bool JavaScriptScanner::ScanRegExpFlags() {
+  // Scan regular expression flags.
+  LiteralScope literal(this);
+  while (unicode_cache_->IsIdentifierPart(c0_)) {
+    if (c0_ != '\\') {
+      AddLiteralCharAdvance();
+    } else {
+      if (!ScanLiteralUnicodeEscape()) {
+        break;
+      }
+    }
+  }
+  literal.Complete();
+
+  next_.location.end_pos = source_pos() - 1;
+  return true;
  }
  
  } }  // namespace v8::internal
diff --git a/src/scanner.h b/src/scanner.h

index 6422ee8cab8ab71868b47f504d37e89fb3a47a4d..73a4e217981c593d3ec5871b4ded1f053c8d4c13 100644 (file)
--- a/src/scanner.h
+++ b/src/scanner.h
@@ -25,103 +25,538 @@
  // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  
+// Features shared by parsing and pre-parsing scanners.
+
  #ifndef V8_SCANNER_H_
  #define V8_SCANNER_H_
  
-#include "scanner-base.h"
+#include "allocation.h"
+#include "char-predicates.h"
+#include "checks.h"
+#include "globals.h"
+#include "token.h"
+#include "unicode-inl.h"
+#include "utils.h"
  
  namespace v8 {
  namespace internal {
  
-// A buffered character stream based on a random access character
-// source (ReadBlock can be called with pos_ pointing to any position,
-// even positions before the current).
-class BufferedUC16CharacterStream: public UC16CharacterStream {
+// Returns the value (0 .. 15) of a hexadecimal character c.
+// If c is not a legal hexadecimal character, returns a value < 0.
+inline int HexValue(uc32 c) {
+  c -= '0';
+  if (static_cast<unsigned>(c) <= 9) return c;
+  c = (c | 0x20) - ('a' - '0');  // detect 0x11..0x16 and 0x31..0x36.
+  if (static_cast<unsigned>(c) <= 5) return c + 10;
+  return -1;
+}
+
+
+// ---------------------------------------------------------------------
+// Buffered stream of characters, using an internal UC16 buffer.
+
+class UC16CharacterStream {
   public:
-  BufferedUC16CharacterStream();
-  virtual ~BufferedUC16CharacterStream();
+  UC16CharacterStream() : pos_(0) { }
+  virtual ~UC16CharacterStream() { }
+
+  // Returns and advances past the next UC16 character in the input
+  // stream. If there are no more characters, it returns a negative
+  // value.
+  inline uc32 Advance() {
+    if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
+      pos_++;
+      return static_cast<uc32>(*(buffer_cursor_++));
+    }
+    // Note: currently the following increment is necessary to avoid a
+    // parser problem! The scanner treats the final kEndOfInput as
+    // a character with a position, and does math relative to that
+    // position.
+    pos_++;
+
+    return kEndOfInput;
+  }
  
-  virtual void PushBack(uc32 character);
+  // Return the current position in the character stream.
+  // Starts at zero.
+  inline unsigned pos() const { return pos_; }
+
+  // Skips forward past the next character_count UC16 characters
+  // in the input, or until the end of input if that comes sooner.
+  // Returns the number of characters actually skipped. If less
+  // than character_count, the end of the input was reached.
+  inline unsigned SeekForward(unsigned character_count) {
+    unsigned buffered_chars =
+        static_cast<unsigned>(buffer_end_ - buffer_cursor_);
+    if (character_count <= buffered_chars) {
+      buffer_cursor_ += character_count;
+      pos_ += character_count;
+      return character_count;
+    }
+    return SlowSeekForward(character_count);
+  }
+
+  // Pushes back the most recently read UC16 character (or negative
+  // value if at end of input), i.e., the value returned by the most recent
+  // call to Advance.
+  // Must not be used right after calling SeekForward.
+  virtual void PushBack(int32_t character) = 0;
  
   protected:
-  static const unsigned kBufferSize = 512;
-  static const unsigned kPushBackStepSize = 16;
+  static const uc32 kEndOfInput = -1;
+
+  // Ensures that the buffer_cursor_ points to the character at
+  // position pos_ of the input, if possible. If the position
+  // is at or after the end of the input, return false. If there
+  // are more characters available, return true.
+  virtual bool ReadBlock() = 0;
+  virtual unsigned SlowSeekForward(unsigned character_count) = 0;
+
+  const uc16* buffer_cursor_;
+  const uc16* buffer_end_;
+  unsigned pos_;
+};
+
  
-  virtual unsigned SlowSeekForward(unsigned delta);
-  virtual bool ReadBlock();
-  virtual void SlowPushBack(uc16 character);
+class UnicodeCache {
+// ---------------------------------------------------------------------
+// Caching predicates used by scanners.
+ public:
+  UnicodeCache() {}
+  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
+
+  StaticResource<Utf8Decoder>* utf8_decoder() {
+    return &utf8_decoder_;
+  }
+
+  bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
+  bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
+  bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
+  bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
  
-  virtual unsigned BufferSeekForward(unsigned delta) = 0;
-  virtual unsigned FillBuffer(unsigned position, unsigned length) = 0;
+ private:
  
-  const uc16* pushback_limit_;
-  uc16 buffer_[kBufferSize];
+  unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
+  unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
+  unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
+  unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
+  StaticResource<Utf8Decoder> utf8_decoder_;
+
+  DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
  };
  
  
-// Generic string stream.
-class GenericStringUC16CharacterStream: public BufferedUC16CharacterStream {
+// ----------------------------------------------------------------------------
+// LiteralBuffer -  Collector of chars of literals.
+
+class LiteralBuffer {
   public:
-  GenericStringUC16CharacterStream(Handle<String> data,
-                                   unsigned start_position,
-                                   unsigned end_position);
-  virtual ~GenericStringUC16CharacterStream();
+  LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
  
- protected:
-  virtual unsigned BufferSeekForward(unsigned delta);
-  virtual unsigned FillBuffer(unsigned position, unsigned length);
+  ~LiteralBuffer() {
+    if (backing_store_.length() > 0) {
+      backing_store_.Dispose();
+    }
+  }
+
+  inline void AddChar(uc16 character) {
+    if (position_ >= backing_store_.length()) ExpandBuffer();
+    if (is_ascii_) {
+      if (character < kMaxAsciiCharCodeU) {
+        backing_store_[position_] = static_cast<byte>(character);
+        position_ += kASCIISize;
+        return;
+      }
+      ConvertToUC16();
+    }
+    *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
+    position_ += kUC16Size;
+  }
+
+  bool is_ascii() { return is_ascii_; }
+
+  Vector<const uc16> uc16_literal() {
+    ASSERT(!is_ascii_);
+    ASSERT((position_ & 0x1) == 0);
+    return Vector<const uc16>(
+        reinterpret_cast<const uc16*>(backing_store_.start()),
+        position_ >> 1);
+  }
+
+  Vector<const char> ascii_literal() {
+    ASSERT(is_ascii_);
+    return Vector<const char>(
+        reinterpret_cast<const char*>(backing_store_.start()),
+        position_);
+  }
+
+  int length() {
+    return is_ascii_ ? position_ : (position_ >> 1);
+  }
+
+  void Reset() {
+    position_ = 0;
+    is_ascii_ = true;
+  }
+ private:
+  static const int kInitialCapacity = 16;
+  static const int kGrowthFactory = 4;
+  static const int kMinConversionSlack = 256;
+  static const int kMaxGrowth = 1 * MB;
+  inline int NewCapacity(int min_capacity) {
+    int capacity = Max(min_capacity, backing_store_.length());
+    int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
+    return new_capacity;
+  }
+
+  void ExpandBuffer() {
+    Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
+    memcpy(new_store.start(), backing_store_.start(), position_);
+    backing_store_.Dispose();
+    backing_store_ = new_store;
+  }
+
+  void ConvertToUC16() {
+    ASSERT(is_ascii_);
+    Vector<byte> new_store;
+    int new_content_size = position_ * kUC16Size;
+    if (new_content_size >= backing_store_.length()) {
+      // Ensure room for all currently read characters as UC16 as well
+      // as the character about to be stored.
+      new_store = Vector<byte>::New(NewCapacity(new_content_size));
+    } else {
+      new_store = backing_store_;
+    }
+    char* src = reinterpret_cast<char*>(backing_store_.start());
+    uc16* dst = reinterpret_cast<uc16*>(new_store.start());
+    for (int i = position_ - 1; i >= 0; i--) {
+      dst[i] = src[i];
+    }
+    if (new_store.start() != backing_store_.start()) {
+      backing_store_.Dispose();
+      backing_store_ = new_store;
+    }
+    position_ = new_content_size;
+    is_ascii_ = false;
+  }
+
+  bool is_ascii_;
+  int position_;
+  Vector<byte> backing_store_;
  
-  Handle<String> string_;
-  unsigned start_position_;
-  unsigned length_;
+  DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
  };
  
  
-// UC16 stream based on a literal UTF-8 string.
-class Utf8ToUC16CharacterStream: public BufferedUC16CharacterStream {
+// ----------------------------------------------------------------------------
+// Scanner base-class.
+
+// Generic functionality used by both JSON and JavaScript scanners.
+class Scanner {
   public:
-  Utf8ToUC16CharacterStream(const byte* data, unsigned length);
-  virtual ~Utf8ToUC16CharacterStream();
+  // -1 is outside of the range of any real source code.
+  static const int kNoOctalLocation = -1;
+
+  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
+
+  class LiteralScope {
+   public:
+    explicit LiteralScope(Scanner* self);
+    ~LiteralScope();
+    void Complete();
+
+   private:
+    Scanner* scanner_;
+    bool complete_;
+  };
+
+  explicit Scanner(UnicodeCache* scanner_contants);
+
+  // Returns the current token again.
+  Token::Value current_token() { return current_.token; }
+
+  // One token look-ahead (past the token returned by Next()).
+  Token::Value peek() const { return next_.token; }
+
+  struct Location {
+    Location(int b, int e) : beg_pos(b), end_pos(e) { }
+    Location() : beg_pos(0), end_pos(0) { }
+
+    bool IsValid() const {
+      return beg_pos >= 0 && end_pos >= beg_pos;
+    }
+
+    static Location invalid() { return Location(-1, -1); }
+
+    int beg_pos;
+    int end_pos;
+  };
+
+  // Returns the location information for the current token
+  // (the token returned by Next()).
+  Location location() const { return current_.location; }
+  Location peek_location() const { return next_.location; }
+
+  // Returns the literal string, if any, for the current token (the
+  // token returned by Next()). The string is 0-terminated and in
+  // UTF-8 format; it may contain 0-characters. Literal strings are
+  // collected for identifiers, strings, and numbers.
+  // These functions only give the correct result if the literal
+  // was scanned between calls to StartLiteral() and TerminateLiteral().
+  bool is_literal_ascii() {
+    ASSERT_NOT_NULL(current_.literal_chars);
+    return current_.literal_chars->is_ascii();
+  }
+  Vector<const char> literal_ascii_string() {
+    ASSERT_NOT_NULL(current_.literal_chars);
+    return current_.literal_chars->ascii_literal();
+  }
+  Vector<const uc16> literal_uc16_string() {
+    ASSERT_NOT_NULL(current_.literal_chars);
+    return current_.literal_chars->uc16_literal();
+  }
+  int literal_length() const {
+    ASSERT_NOT_NULL(current_.literal_chars);
+    return current_.literal_chars->length();
+  }
+
+  bool literal_contains_escapes() const {
+    Location location = current_.location;
+    int source_length = (location.end_pos - location.beg_pos);
+    if (current_.token == Token::STRING) {
+      // Subtract delimiters.
+      source_length -= 2;
+    }
+    return current_.literal_chars->length() != source_length;
+  }
+
+  // Returns the literal string for the next token (the token that
+  // would be returned if Next() were called).
+  bool is_next_literal_ascii() {
+    ASSERT_NOT_NULL(next_.literal_chars);
+    return next_.literal_chars->is_ascii();
+  }
+  Vector<const char> next_literal_ascii_string() {
+    ASSERT_NOT_NULL(next_.literal_chars);
+    return next_.literal_chars->ascii_literal();
+  }
+  Vector<const uc16> next_literal_uc16_string() {
+    ASSERT_NOT_NULL(next_.literal_chars);
+    return next_.literal_chars->uc16_literal();
+  }
+  int next_literal_length() const {
+    ASSERT_NOT_NULL(next_.literal_chars);
+    return next_.literal_chars->length();
+  }
+
+  UnicodeCache* unicode_cache() { return unicode_cache_; }
+
+  static const int kCharacterLookaheadBufferSize = 1;
  
   protected:
-  virtual unsigned BufferSeekForward(unsigned delta);
-  virtual unsigned FillBuffer(unsigned char_position, unsigned length);
-  void SetRawPosition(unsigned char_position);
-
-  const byte* raw_data_;
-  unsigned raw_data_length_;  // Measured in bytes, not characters.
-  unsigned raw_data_pos_;
-  // The character position of the character at raw_data[raw_data_pos_].
-  // Not necessarily the same as pos_.
-  unsigned raw_character_position_;
+  // The current and look-ahead token.
+  struct TokenDesc {
+    Token::Value token;
+    Location location;
+    LiteralBuffer* literal_chars;
+  };
+
+  // Call this after setting source_ to the input.
+  void Init() {
+    // Set c0_ (one character ahead)
+    STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
+    Advance();
+    // Initialize current_ to not refer to a literal.
+    current_.literal_chars = NULL;
+  }
+
+  // Literal buffer support
+  inline void StartLiteral() {
+    LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
+            &literal_buffer2_ : &literal_buffer1_;
+    free_buffer->Reset();
+    next_.literal_chars = free_buffer;
+  }
+
+  inline void AddLiteralChar(uc32 c) {
+    ASSERT_NOT_NULL(next_.literal_chars);
+    next_.literal_chars->AddChar(c);
+  }
+
+  // Complete scanning of a literal.
+  inline void TerminateLiteral() {
+    // Does nothing in the current implementation.
+  }
+
+  // Stops scanning of a literal and drops the collected characters,
+  // e.g., due to an encountered error.
+  inline void DropLiteral() {
+    next_.literal_chars = NULL;
+  }
+
+  inline void AddLiteralCharAdvance() {
+    AddLiteralChar(c0_);
+    Advance();
+  }
+
+  // Low-level scanning support.
+  void Advance() { c0_ = source_->Advance(); }
+  void PushBack(uc32 ch) {
+    source_->PushBack(c0_);
+    c0_ = ch;
+  }
+
+  inline Token::Value Select(Token::Value tok) {
+    Advance();
+    return tok;
+  }
+
+  inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
+    Advance();
+    if (c0_ == next) {
+      Advance();
+      return then;
+    } else {
+      return else_;
+    }
+  }
+
+  uc32 ScanHexNumber(int expected_length);
+
+  // Return the current source position.
+  int source_pos() {
+    return source_->pos() - kCharacterLookaheadBufferSize;
+  }
+
+  UnicodeCache* unicode_cache_;
+
+  // Buffers collecting literal strings, numbers, etc.
+  LiteralBuffer literal_buffer1_;
+  LiteralBuffer literal_buffer2_;
+
+  TokenDesc current_;  // desc for current token (as returned by Next())
+  TokenDesc next_;     // desc for next token (one token look-ahead)
+
+  // Input stream. Must be initialized to a UC16CharacterStream.
+  UC16CharacterStream* source_;
+
+  // One Unicode character look-ahead; c0_ < 0 at the end of the input.
+  uc32 c0_;
  };
  
+// ----------------------------------------------------------------------------
+// JavaScriptScanner - base logic for JavaScript scanning.
  
-// UTF16 buffer to read characters from an external string.
-class ExternalTwoByteStringUC16CharacterStream: public UC16CharacterStream {
+class JavaScriptScanner : public Scanner {
   public:
-  ExternalTwoByteStringUC16CharacterStream(Handle<ExternalTwoByteString> data,
-                                           int start_position,
-                                           int end_position);
-  virtual ~ExternalTwoByteStringUC16CharacterStream();
+  // A LiteralScope that disables recording of some types of JavaScript
+  // literals. If the scanner is configured to not record the specific
+  // type of literal, the scope will not call StartLiteral.
+  class LiteralScope {
+   public:
+    explicit LiteralScope(JavaScriptScanner* self)
+        : scanner_(self), complete_(false) {
+      scanner_->StartLiteral();
+    }
+     ~LiteralScope() {
+       if (!complete_) scanner_->DropLiteral();
+     }
+    void Complete() {
+      scanner_->TerminateLiteral();
+      complete_ = true;
+    }
  
-  virtual void PushBack(uc32 character) {
-    ASSERT(buffer_cursor_ > raw_data_);
-    buffer_cursor_--;
-    pos_--;
+   private:
+    JavaScriptScanner* scanner_;
+    bool complete_;
+  };
+
+  explicit JavaScriptScanner(UnicodeCache* scanner_contants);
+
+  void Initialize(UC16CharacterStream* source);
+
+  // Returns the next token.
+  Token::Value Next();
+
+  // Returns true if there was a line terminator before the peek'ed token,
+  // possibly inside a multi-line comment.
+  bool HasAnyLineTerminatorBeforeNext() const {
+    return has_line_terminator_before_next_ ||
+           has_multiline_comment_before_next_;
    }
  
- protected:
-  virtual unsigned SlowSeekForward(unsigned delta) {
-    // Fast case always handles seeking.
-    return 0;
+  // Scans the input as a regular expression pattern, previous
+  // character(s) must be /(=). Returns true if a pattern is scanned.
+  bool ScanRegExpPattern(bool seen_equal);
+  // Returns true if regexp flags are scanned (always since flags can
+  // be empty).
+  bool ScanRegExpFlags();
+
+  // Tells whether the buffer contains an identifier (no escapes).
+  // Used for checking if a property name is an identifier.
+  static bool IsIdentifier(unibrow::CharacterStream* buffer);
+
+  // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
+  uc32 ScanOctalEscape(uc32 c, int length);
+
+  // Returns the location of the last seen octal literal
+  Location octal_position() const { return octal_pos_; }
+  void clear_octal_position() { octal_pos_ = Location::invalid(); }
+
+  // Seek forward to the given position.  This operation does not
+  // work in general, for instance when there are pushed back
+  // characters, but works for seeking forward until simple delimiter
+  // tokens, which is what it is used for.
+  void SeekForward(int pos);
+
+  bool HarmonyBlockScoping() const {
+    return harmony_block_scoping_;
    }
-  virtual bool ReadBlock() {
-    // Entire string is read at start.
-    return false;
+  void SetHarmonyBlockScoping(bool block_scoping) {
+    harmony_block_scoping_ = block_scoping;
    }
-  Handle<ExternalTwoByteString> source_;
-  const uc16* raw_data_;  // Pointer to the actual array of characters.
+
+
+ protected:
+  bool SkipWhiteSpace();
+  Token::Value SkipSingleLineComment();
+  Token::Value SkipMultiLineComment();
+
+  // Scans a single JavaScript token.
+  void Scan();
+
+  void ScanDecimalDigits();
+  Token::Value ScanNumber(bool seen_period);
+  Token::Value ScanIdentifierOrKeyword();
+  Token::Value ScanIdentifierSuffix(LiteralScope* literal);
+
+  void ScanEscape();
+  Token::Value ScanString();
+
+  // Scans a possible HTML comment -- begins with '<!'.
+  Token::Value ScanHtmlComment();
+
+  // Decodes a unicode escape-sequence which is part of an identifier.
+  // If the escape sequence cannot be decoded the result is kBadChar.
+  uc32 ScanIdentifierUnicodeEscape();
+  // Recognizes a unicode escape-sequence and adds its characters,
+  // uninterpreted, to the current literal. Used for parsing RegExp
+  // flags.
+  bool ScanLiteralUnicodeEscape();
+
+  // Start position of the octal literal last scanned.
+  Location octal_pos_;
+
+  // Whether there is a line terminator whitespace character after
+  // the current token, and before the next. Does not count newlines
+  // inside multiline comments.
+  bool has_line_terminator_before_next_;
+  // Whether there is a multi-line comment that contains a
+  // line-terminator after the current token, and before the next.
+  bool has_multiline_comment_before_next_;
+  // Whether we scan 'let' as a keyword for harmony block scoped
+  // let bindings.
+  bool harmony_block_scoping_;
  };
  
  } }  // namespace v8::internal
diff --git a/src/v8conversions.cc b/src/v8conversions.cc

index 96056ecf44196cb2094df7f53799cd214ba911b9..bf175e50b5f4e566e133b980127d75436e128f99 100644 (file)
--- a/src/v8conversions.cc
+++ b/src/v8conversions.cc
@@ -34,7 +34,6 @@
  #include "v8conversions.h"
  #include "dtoa.h"
  #include "factory.h"
-#include "scanner-base.h"
  #include "strtod.h"
  
  namespace v8 {
diff --git a/test/cctest/test-parsing.cc b/test/cctest/test-parsing.cc

index 8b6afdc59c4be64aac6e286327bb2361efb77e9c..160c9b11bee21f1e780eab65cb3fec82f9c4d7bb 100755 (executable)
--- a/test/cctest/test-parsing.cc
+++ b/test/cctest/test-parsing.cc
@@ -31,14 +31,14 @@
  
  #include "v8.h"
  
+#include "cctest.h"
+#include "execution.h"
  #include "isolate.h"
-#include "token.h"
-#include "scanner.h"
  #include "parser.h"
-#include "utils.h"
-#include "execution.h"
  #include "preparser.h"
-#include "cctest.h"
+#include "scanner-character-streams.h"
+#include "token.h"
+#include "utils.h"
  
  TEST(ScanKeywords) {
    struct KeywordToken {
diff --git a/tools/gyp/v8.gyp b/tools/gyp/v8.gyp

index f76bbd9c57bdb417713ef3bd0f9ad21193eb0aa9..0512bec93f19dbf688a06cb3a058e3ae240e48c2 100644 (file)
--- a/tools/gyp/v8.gyp
+++ b/tools/gyp/v8.gyp
@@ -407,10 +407,10 @@
              '../../src/runtime-profiler.h',
              '../../src/safepoint-table.cc',
              '../../src/safepoint-table.h',
-            '../../src/scanner-base.cc',
-            '../../src/scanner-base.h',
              '../../src/scanner.cc',
              '../../src/scanner.h',
+            '../../src/scanner-character-streams.cc',
+            '../../src/scanner-character-streams.h',
              '../../src/scopeinfo.cc',
              '../../src/scopeinfo.h',
              '../../src/scopes.cc',
@@ -825,8 +825,8 @@
              '../../src/preparser.cc',
              '../../src/preparser.h',
              '../../src/preparser-api.cc',
-            '../../src/scanner-base.cc',
-            '../../src/scanner-base.h',
+            '../../src/scanner.cc',
+            '../../src/scanner.h',
              '../../src/strtod.cc',
              '../../src/strtod.h',
              '../../src/token.cc',
author	lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Thu, 8 Sep 2011 13:06:44 +0000 (13:06 +0000)
committer	lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Thu, 8 Sep 2011 13:06:44 +0000 (13:06 +0000)
src/SConscript		patch \| blob \| history
src/api.cc		patch \| blob \| history
src/compiler.cc		patch \| blob \| history
src/conversions-inl.h		patch \| blob \| history
src/conversions.cc		patch \| blob \| history
src/conversions.h		patch \| blob \| history
src/dateparser.h		patch \| blob \| history
src/heap.cc		patch \| blob \| history
src/isolate.cc		patch \| blob \| history
src/objects.cc		patch \| blob \| history
src/parser.cc		patch \| blob \| history
src/parser.h		patch \| blob \| history
src/preparser-api.cc		patch \| blob \| history
src/preparser.cc		patch \| blob \| history
src/preparser.h		patch \| blob \| history
src/scanner-base.cc	[deleted file]	patch \| blob \| history
src/scanner-base.h	[deleted file]	patch \| blob \| history
src/scanner-character-streams.cc	[new file with mode: 0644]	patch \| blob
src/scanner-character-streams.h	[new file with mode: 0644]	patch \| blob
src/scanner.cc		patch \| blob \| history
src/scanner.h		patch \| blob \| history
src/v8conversions.cc		patch \| blob \| history
test/cctest/test-parsing.cc		patch \| blob \| history
tools/gyp/v8.gyp		patch \| blob \| history