Optimized scanner to avoid virtual calls for every character read.

author lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Tue, 7 Dec 2010 14:03:59 +0000 (14:03 +0000)

committer lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Tue, 7 Dec 2010 14:03:59 +0000 (14:03 +0000)
author lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Tue, 7 Dec 2010 14:03:59 +0000 (14:03 +0000)
committer lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Tue, 7 Dec 2010 14:03:59 +0000 (14:03 +0000)
diff --git a/src/api.cc b/src/api.cc

index 0ec8cf1..4bff5e1 100644 (file)
--- a/src/api.cc
+++ b/src/api.cc
@@ -1165,14 +1165,22 @@ void ObjectTemplate::SetInternalFieldCount(int value) {
  
  
  ScriptData* ScriptData::PreCompile(const char* input, int length) {
-  unibrow::Utf8InputBuffer<> buf(input, length);
-  return i::ParserApi::PreParse(i::Handle<i::String>(), &buf, NULL);
+  i::Utf8ToUC16CharacterStream stream(
+      reinterpret_cast<const unsigned char*>(input), length);
+  return i::ParserApi::PreParse(&stream, NULL);
  }
  
  
  ScriptData* ScriptData::PreCompile(v8::Handle<String> source) {
    i::Handle<i::String> str = Utils::OpenHandle(*source);
-  return i::ParserApi::PreParse(str, NULL, NULL);
+  if (str->IsExternalTwoByteString()) {
+    i::ExternalTwoByteStringUC16CharacterStream stream(
+      i::Handle<i::ExternalTwoByteString>::cast(str), 0, str->length());
+    return i::ParserApi::PreParse(&stream, NULL);
+  } else {
+    i::GenericStringUC16CharacterStream stream(str, 0, str->length());
+    return i::ParserApi::PreParse(&stream, NULL);
+  }
  }
  
  
diff --git a/src/checks.h b/src/checks.h

index aa557f0..8d13d65 100644 (file)
--- a/src/checks.h
+++ b/src/checks.h
@@ -231,6 +231,8 @@ static inline void CheckNonEqualsHelper(const char* file,
  
  #define CHECK_GT(a, b) CHECK((a) > (b))
  #define CHECK_GE(a, b) CHECK((a) >= (b))
+#define CHECK_LT(a, b) CHECK((a) < (b))
+#define CHECK_LE(a, b) CHECK((a) <= (b))
  
  
  // This is inspired by the static assertion facility in boost.  This
diff --git a/src/compiler.cc b/src/compiler.cc

index 59a684c..a52f3c2 100755 (executable)
--- a/src/compiler.cc
+++ b/src/compiler.cc
@@ -461,7 +461,14 @@ Handle<SharedFunctionInfo> Compiler::Compile(Handle<String> source,
      ScriptDataImpl* pre_data = input_pre_data;
      if (pre_data == NULL
          && source_length >= FLAG_min_preparse_length) {
-      pre_data = ParserApi::PartialPreParse(source, NULL, extension);
+      if (source->IsExternalTwoByteString()) {
+        ExternalTwoByteStringUC16CharacterStream stream(
+            Handle<ExternalTwoByteString>::cast(source), 0, source->length());
+        pre_data = ParserApi::PartialPreParse(&stream, extension);
+      } else {
+        GenericStringUC16CharacterStream stream(source, 0, source->length());
+        pre_data = ParserApi::PartialPreParse(&stream, extension);
+      }
      }
  
      // Create a script object describing the script to be compiled.
diff --git a/src/parser.cc b/src/parser.cc

index 056332b..160f0f7 100644 (file)
--- a/src/parser.cc
+++ b/src/parser.cc
@@ -609,7 +609,25 @@ FunctionLiteral* Parser::ParseProgram(Handle<String> source,
  
    // Initialize parser state.
    source->TryFlatten();
-  scanner_.Initialize(source);
+  if (source->IsExternalTwoByteString()) {
+    // Notice that the stream is destroyed at the end of the branch block.
+    // The last line of each block can't be moved outside, even though the
+    // calls are identical.
+    ExternalTwoByteStringUC16CharacterStream stream(
+        Handle<ExternalTwoByteString>::cast(source), 0, source->length());
+    scanner_.Initialize(&stream, JavaScriptScanner::kAllLiterals);
+    return DoParseProgram(source, in_global_context, &zone_scope);
+  } else {
+    GenericStringUC16CharacterStream stream(source, 0, source->length());
+    scanner_.Initialize(&stream, JavaScriptScanner::kAllLiterals);
+    return DoParseProgram(source, in_global_context, &zone_scope);
+  }
+}
+
+
+FunctionLiteral* Parser::DoParseProgram(Handle<String> source,
+                                        bool in_global_context,
+                                        ZoneScope* zone_scope) {
    ASSERT(target_stack_ == NULL);
    if (pre_data_ != NULL) pre_data_->Initialize();
  
@@ -655,25 +673,45 @@ FunctionLiteral* Parser::ParseProgram(Handle<String> source,
  
    // If there was a syntax error we have to get rid of the AST
    // and it is not safe to do so before the scope has been deleted.
-  if (result == NULL) zone_scope.DeleteOnExit();
+  if (result == NULL) zone_scope->DeleteOnExit();
    return result;
  }
  
-
  FunctionLiteral* Parser::ParseLazy(Handle<SharedFunctionInfo> info) {
    CompilationZoneScope zone_scope(DONT_DELETE_ON_EXIT);
    HistogramTimerScope timer(&Counters::parse_lazy);
    Handle<String> source(String::cast(script_->source()));
    Counters::total_parse_size.Increment(source->length());
  
+  // Initialize parser state.
+  source->TryFlatten();
+  if (source->IsExternalTwoByteString()) {
+    ExternalTwoByteStringUC16CharacterStream stream(
+        Handle<ExternalTwoByteString>::cast(source),
+        info->start_position(),
+        info->end_position());
+    FunctionLiteral* result = ParseLazy(info, &stream, &zone_scope);
+    return result;
+  } else {
+    GenericStringUC16CharacterStream stream(source,
+                                            info->start_position(),
+                                            info->end_position());
+    FunctionLiteral* result = ParseLazy(info, &stream, &zone_scope);
+    return result;
+  }
+}
+
+
+FunctionLiteral* Parser::ParseLazy(Handle<SharedFunctionInfo> info,
+                                   UC16CharacterStream* source,
+                                   ZoneScope* zone_scope) {
+  scanner_.Initialize(source, JavaScriptScanner::kAllLiterals);
+  ASSERT(target_stack_ == NULL);
+
    Handle<String> name(String::cast(info->name()));
    fni_ = new FuncNameInferrer();
    fni_->PushEnclosingName(name);
  
-  // Initialize parser state.
-  source->TryFlatten();
-  scanner_.Initialize(source, info->start_position(), info->end_position());
-  ASSERT(target_stack_ == NULL);
    mode_ = PARSE_EAGERLY;
  
    // Place holder for the result.
@@ -705,7 +743,7 @@ FunctionLiteral* Parser::ParseLazy(Handle<SharedFunctionInfo> info) {
    // not safe to do before scope has been deleted.
    if (result == NULL) {
      Top::StackOverflow();
-    zone_scope.DeleteOnExit();
+    zone_scope->DeleteOnExit();
    } else {
      Handle<String> inferred_name(info->inferred_name());
      result->set_inferred_name(inferred_name);
@@ -719,12 +757,12 @@ Handle<String> Parser::GetSymbol(bool* ok) {
    if (pre_data() != NULL) {
      symbol_id = pre_data()->GetSymbolIdentifier();
    }
-  return LookupSymbol(symbol_id, scanner_.literal());
+  return LookupSymbol(symbol_id, scanner().literal());
  }
  
  
  void Parser::ReportMessage(const char* type, Vector<const char*> args) {
-  Scanner::Location source_location = scanner_.location();
+  Scanner::Location source_location = scanner().location();
    ReportMessageAt(source_location, type, args);
  }
  
@@ -1641,7 +1679,7 @@ Statement* Parser::ParseContinueStatement(bool* ok) {
    Expect(Token::CONTINUE, CHECK_OK);
    Handle<String> label = Handle<String>::null();
    Token::Value tok = peek();
-  if (!scanner_.has_line_terminator_before_next() &&
+  if (!scanner().has_line_terminator_before_next() &&
        tok != Token::SEMICOLON && tok != Token::RBRACE && tok != Token::EOS) {
      label = ParseIdentifier(CHECK_OK);
    }
@@ -1667,7 +1705,7 @@ Statement* Parser::ParseBreakStatement(ZoneStringList* labels, bool* ok) {
    Expect(Token::BREAK, CHECK_OK);
    Handle<String> label;
    Token::Value tok = peek();
-  if (!scanner_.has_line_terminator_before_next() &&
+  if (!scanner().has_line_terminator_before_next() &&
        tok != Token::SEMICOLON && tok != Token::RBRACE && tok != Token::EOS) {
      label = ParseIdentifier(CHECK_OK);
    }
@@ -1712,7 +1750,7 @@ Statement* Parser::ParseReturnStatement(bool* ok) {
    }
  
    Token::Value tok = peek();
-  if (scanner_.has_line_terminator_before_next() ||
+  if (scanner().has_line_terminator_before_next() ||
        tok == Token::SEMICOLON ||
        tok == Token::RBRACE ||
        tok == Token::EOS) {
@@ -1844,7 +1882,7 @@ Statement* Parser::ParseThrowStatement(bool* ok) {
  
    Expect(Token::THROW, CHECK_OK);
    int pos = scanner().location().beg_pos;
-  if (scanner_.has_line_terminator_before_next()) {
+  if (scanner().has_line_terminator_before_next()) {
      ReportMessage("newline_after_throw", Vector<const char*>::empty());
      *ok = false;
      return NULL;
@@ -2408,7 +2446,8 @@ Expression* Parser::ParsePostfixExpression(bool* ok) {
    //   LeftHandSideExpression ('++' | '--')?
  
    Expression* expression = ParseLeftHandSideExpression(CHECK_OK);
-  if (!scanner_.has_line_terminator_before_next() && Token::IsCountOp(peek())) {
+  if (!scanner().has_line_terminator_before_next() &&
+      Token::IsCountOp(peek())) {
      // Signal a reference error if the expression is an invalid
      // left-hand side expression.  We could report this as a syntax
      // error here but for compatibility with JSC we choose to report the
@@ -2677,7 +2716,7 @@ Expression* Parser::ParsePrimaryExpression(bool* ok) {
      case Token::NUMBER: {
        Consume(Token::NUMBER);
        double value =
-        StringToDouble(scanner_.literal(), ALLOW_HEX | ALLOW_OCTALS);
+        StringToDouble(scanner().literal(), ALLOW_HEX | ALLOW_OCTALS);
        result = NewNumberLiteral(value);
        break;
      }
@@ -3028,7 +3067,7 @@ Expression* Parser::ParseObjectLiteral(bool* ok) {
        case Token::NUMBER: {
          Consume(Token::NUMBER);
          double value =
-          StringToDouble(scanner_.literal(), ALLOW_HEX | ALLOW_OCTALS);
+          StringToDouble(scanner().literal(), ALLOW_HEX | ALLOW_OCTALS);
          key = NewNumberLiteral(value);
          break;
        }
@@ -3089,7 +3128,7 @@ Expression* Parser::ParseObjectLiteral(bool* ok) {
  
  
  Expression* Parser::ParseRegExpLiteral(bool seen_equal, bool* ok) {
-  if (!scanner_.ScanRegExpPattern(seen_equal)) {
+  if (!scanner().ScanRegExpPattern(seen_equal)) {
      Next();
      ReportMessage("unterminated_regexp", Vector<const char*>::empty());
      *ok = false;
@@ -3099,10 +3138,10 @@ Expression* Parser::ParseRegExpLiteral(bool seen_equal, bool* ok) {
    int literal_index = temp_scope_->NextMaterializedLiteralIndex();
  
    Handle<String> js_pattern =
-      Factory::NewStringFromUtf8(scanner_.next_literal(), TENURED);
-  scanner_.ScanRegExpFlags();
+      Factory::NewStringFromUtf8(scanner().next_literal(), TENURED);
+  scanner().ScanRegExpFlags();
    Handle<String> js_flags =
-      Factory::NewStringFromUtf8(scanner_.next_literal(), TENURED);
+      Factory::NewStringFromUtf8(scanner().next_literal(), TENURED);
    Next();
  
    return new RegExpLiteral(js_pattern, js_flags, literal_index);
@@ -3158,7 +3197,7 @@ FunctionLiteral* Parser::ParseFunctionLiteral(Handle<String> var_name,
      //  FormalParameterList ::
      //    '(' (Identifier)*[','] ')'
      Expect(Token::LPAREN, CHECK_OK);
-    int start_pos = scanner_.location().beg_pos;
+    int start_pos = scanner().location().beg_pos;
      bool done = (peek() == Token::RPAREN);
      while (!done) {
        Handle<String> param_name = ParseIdentifier(CHECK_OK);
@@ -3195,7 +3234,7 @@ FunctionLiteral* Parser::ParseFunctionLiteral(Handle<String> var_name,
      bool is_lazily_compiled =
          mode() == PARSE_LAZILY && top_scope_->HasTrivialOuterContext();
  
-    int function_block_pos = scanner_.location().beg_pos;
+    int function_block_pos = scanner().location().beg_pos;
      int materialized_literal_count;
      int expected_property_count;
      int end_pos;
@@ -3212,7 +3251,8 @@ FunctionLiteral* Parser::ParseFunctionLiteral(Handle<String> var_name,
          ReportInvalidPreparseData(name, CHECK_OK);
        }
        Counters::total_preparse_skipped.Increment(end_pos - function_block_pos);
-      scanner_.SeekForward(end_pos);
+      // Seek to position just before terminal '}'.
+      scanner().SeekForward(end_pos - 1);
        materialized_literal_count = entry.literal_count();
        expected_property_count = entry.property_count();
        only_simple_this_property_assignments = false;
@@ -3228,7 +3268,7 @@ FunctionLiteral* Parser::ParseFunctionLiteral(Handle<String> var_name,
        this_property_assignments = temp_scope.this_property_assignments();
  
        Expect(Token::RBRACE, CHECK_OK);
-      end_pos = scanner_.location().end_pos;
+      end_pos = scanner().location().end_pos;
      }
  
      FunctionLiteral* function_literal =
@@ -3332,7 +3372,7 @@ void Parser::ExpectSemicolon(bool* ok) {
      Next();
      return;
    }
-  if (scanner_.has_line_terminator_before_next() ||
+  if (scanner().has_line_terminator_before_next() ||
        tok == Token::RBRACE ||
        tok == Token::EOS) {
      return;
@@ -3383,8 +3423,8 @@ Handle<String> Parser::ParseIdentifierOrGetOrSet(bool* is_get,
                                                   bool* ok) {
    Expect(Token::IDENTIFIER, ok);
    if (!*ok) return Handle<String>();
-  if (scanner_.literal_length() == 3) {
-    const char* token = scanner_.literal_string();
+  if (scanner().literal_length() == 3) {
+    const char* token = scanner().literal_string();
      *is_get = strcmp(token, "get") == 0;
      *is_set = !*is_get && strcmp(token, "set") == 0;
    }
@@ -3503,8 +3543,8 @@ Expression* Parser::NewThrowError(Handle<String> constructor,
  // ----------------------------------------------------------------------------
  // JSON
  
-Handle<Object> JsonParser::ParseJson(Handle<String> source) {
-  source->TryFlatten();
+Handle<Object> JsonParser::ParseJson(Handle<String> script,
+                                     UC16CharacterStream* source) {
    scanner_.Initialize(source);
    stack_overflow_ = false;
    Handle<Object> result = ParseJsonValue();
@@ -3540,7 +3580,7 @@ Handle<Object> JsonParser::ParseJson(Handle<String> source) {
        }
  
        Scanner::Location source_location = scanner_.location();
-      MessageLocation location(Factory::NewScript(source),
+      MessageLocation location(Factory::NewScript(script),
                                 source_location.beg_pos,
                                 source_location.end_pos);
        int argc = (name_opt == NULL) ? 0 : 1;
@@ -4555,13 +4595,12 @@ int ScriptDataImpl::ReadNumber(byte** source) {
  
  
  // Create a Scanner for the preparser to use as input, and preparse the source.
-static ScriptDataImpl* DoPreParse(Handle<String> source,
-                                  unibrow::CharacterStream* stream,
+static ScriptDataImpl* DoPreParse(UC16CharacterStream* source,
                                    bool allow_lazy,
                                    ParserRecorder* recorder,
                                    int literal_flags) {
    V8JavaScriptScanner scanner;
-  scanner.Initialize(source, stream, literal_flags);
+  scanner.Initialize(source, literal_flags);
    intptr_t stack_limit = StackGuard::real_climit();
    if (!preparser::PreParser::PreParseProgram(&scanner,
                                               recorder,
@@ -4580,8 +4619,7 @@ static ScriptDataImpl* DoPreParse(Handle<String> source,
  
  // Preparse, but only collect data that is immediately useful,
  // even if the preparser data is only used once.
-ScriptDataImpl* ParserApi::PartialPreParse(Handle<String> source,
-                                           unibrow::CharacterStream* stream,
+ScriptDataImpl* ParserApi::PartialPreParse(UC16CharacterStream* source,
                                             v8::Extension* extension) {
    bool allow_lazy = FLAG_lazy && (extension == NULL);
    if (!allow_lazy) {
@@ -4590,22 +4628,19 @@ ScriptDataImpl* ParserApi::PartialPreParse(Handle<String> source,
      return NULL;
    }
    PartialParserRecorder recorder;
-
-  return DoPreParse(source, stream, allow_lazy, &recorder,
+  return DoPreParse(source, allow_lazy, &recorder,
                      JavaScriptScanner::kNoLiterals);
  }
  
  
-ScriptDataImpl* ParserApi::PreParse(Handle<String> source,
-                                    unibrow::CharacterStream* stream,
+ScriptDataImpl* ParserApi::PreParse(UC16CharacterStream* source,
                                      v8::Extension* extension) {
    Handle<Script> no_script;
    bool allow_lazy = FLAG_lazy && (extension == NULL);
    CompleteParserRecorder recorder;
    int kPreParseLiteralsFlags =
        JavaScriptScanner::kLiteralString | JavaScriptScanner::kLiteralIdentifier;
-  return DoPreParse(source, stream, allow_lazy,
-                    &recorder, kPreParseLiteralsFlags);
+  return DoPreParse(source, allow_lazy, &recorder, kPreParseLiteralsFlags);
  }
  
  
diff --git a/src/parser.h b/src/parser.h

index 58cd946..70d0e18 100644 (file)
--- a/src/parser.h
+++ b/src/parser.h
@@ -169,14 +169,12 @@ class ParserApi {
    static bool Parse(CompilationInfo* info);
  
    // Generic preparser generating full preparse data.
-  static ScriptDataImpl* PreParse(Handle<String> source,
-                                  unibrow::CharacterStream* stream,
+  static ScriptDataImpl* PreParse(UC16CharacterStream* source,
                                    v8::Extension* extension);
  
    // Preparser that only does preprocessing that makes sense if only used
    // immediately after.
-  static ScriptDataImpl* PartialPreParse(Handle<String> source,
-                                         unibrow::CharacterStream* stream,
+  static ScriptDataImpl* PartialPreParse(UC16CharacterStream* source,
                                           v8::Extension* extension);
  };
  
@@ -435,18 +433,26 @@ class Parser {
                         Vector<const char*> args);
  
   protected:
+  FunctionLiteral* ParseLazy(Handle<SharedFunctionInfo> info,
+                             UC16CharacterStream* source,
+                             ZoneScope* zone_scope);
    enum Mode {
      PARSE_LAZILY,
      PARSE_EAGERLY
    };
  
+  // Called by ParseProgram after setting up the scanner.
+  FunctionLiteral* DoParseProgram(Handle<String> source,
+                                  bool in_global_context,
+                                  ZoneScope* zone_scope);
+
    // Report syntax error
    void ReportUnexpectedToken(Token::Value token);
    void ReportInvalidPreparseData(Handle<String> name, bool* ok);
    void ReportMessage(const char* message, Vector<const char*> args);
  
    bool inside_with() const { return with_nesting_level_ > 0; }
-  Scanner& scanner()  { return scanner_; }
+  V8JavaScriptScanner& scanner()  { return scanner_; }
    Mode mode() const { return mode_; }
    ScriptDataImpl* pre_data() const { return pre_data_; }
  
@@ -548,7 +554,7 @@ class Parser {
  
    INLINE(Token::Value peek()) {
      if (stack_overflow_) return Token::ILLEGAL;
-    return scanner_.peek();
+    return scanner().peek();
    }
  
    INLINE(Token::Value Next()) {
@@ -560,9 +566,11 @@ class Parser {
      }
      if (StackLimitCheck().HasOverflowed()) {
        // Any further calls to Next or peek will return the illegal token.
+      // The current call must return the next token, which might already
+      // have been peek'ed.
        stack_overflow_ = true;
      }
-    return scanner_.Next();
+    return scanner().Next();
    }
  
    INLINE(void Consume(Token::Value token));
@@ -702,7 +710,14 @@ class JsonParser BASE_EMBEDDED {
    // Parse JSON input as a single JSON value.
    // Returns null handle and sets exception if parsing failed.
    static Handle<Object> Parse(Handle<String> source) {
-    return JsonParser().ParseJson(source);
+    if (source->IsExternalTwoByteString()) {
+      ExternalTwoByteStringUC16CharacterStream stream(
+          Handle<ExternalTwoByteString>::cast(source), 0, source->length());
+      return JsonParser().ParseJson(source, &stream);
+    } else {
+      GenericStringUC16CharacterStream stream(source, 0, source->length());
+      return JsonParser().ParseJson(source, &stream);
+    }
    }
  
   private:
@@ -710,7 +725,7 @@ class JsonParser BASE_EMBEDDED {
    ~JsonParser() { }
  
    // Parse a string containing a single JSON value.
-  Handle<Object> ParseJson(Handle<String>);
+  Handle<Object> ParseJson(Handle<String> script, UC16CharacterStream* source);
    // Parse a single JSON value from input (grammar production JSONValue).
    // A JSON value is either a (double-quoted) string literal, a number literal,
    // one of "true", "false", or "null", or an object or array literal.
diff --git a/src/preparser-api.cc b/src/preparser-api.cc

index f096e94..df2cf2b 100644 (file)
--- a/src/preparser-api.cc
+++ b/src/preparser-api.cc
@@ -39,39 +39,82 @@ namespace v8 {
  namespace internal {
  
  // UTF16Buffer based on a v8::UnicodeInputStream.
-class InputStreamUTF16Buffer : public UTF16Buffer {
+class InputStreamUTF16Buffer : public UC16CharacterStream {
   public:
-  explicit InputStreamUTF16Buffer(UnicodeInputStream* stream)
-      : UTF16Buffer(),
-        stream_(stream) { }
+  explicit InputStreamUTF16Buffer(v8::UnicodeInputStream* stream)
+      : UC16CharacterStream(),
+        stream_(stream),
+        pushback_active_(false) {
+    buffer_cursor_ = buffer_end_ = buffer_ + kPushBackSize;
+  }
  
    virtual ~InputStreamUTF16Buffer() { }
  
-  virtual void PushBack(uc32 ch) {
+  virtual void PushBack(uc16 ch) {
+    ASSERT(pos_ > 0);
+    if (buffer_cursor_ > buffer_) {
+      // While we can stay within the buffer, just do so.
+      *--buffer_cursor_ = ch;
+      pos_--;
+      return;
+    }
+    if (!pushback_active_) {
+      // Push back the entire buffer to the stream and let the
+      // stream handle pushbacks from now.
+      // We leave buffer_cursor_ == buffer_end_, so the next read
+      // will fill the buffer from the current position.
+      // This should happen exceedingly rarely.
+      while (buffer_end_ > buffer_) {
+        stream_->PushBack(*--buffer_end_);
+      }
+      buffer_cursor_ = buffer_end_;
+      pushback_active_ = true;
+    }
      stream_->PushBack(ch);
      pos_--;
    }
  
-  virtual uc32 Advance() {
-    uc32 result = stream_->Next();
-    if (result >= 0) pos_++;
-    return result;
+ protected:
+  virtual bool ReadBlock() {
+    // Reset the buffer and refill the read-ahead area from the stream.
+    pushback_active_ = false;
+    int32_t value;
+    uc16* buffer_start = buffer_ + kPushBackSize;
+    buffer_cursor_ = buffer_end_ = buffer_start;
+    while ((value = stream_->Next()) >= 0) {
+      if (value > static_cast<int32_t>(unibrow::Utf8::kMaxThreeByteChar)) {
+        value = unibrow::Utf8::kBadChar;
+      }
+      // buffer_end_ is a pointer to const, but buffer_ is writable.
+      buffer_start[buffer_end_++ - buffer_start] = static_cast<uc16>(value);
+      if (buffer_end_ == buffer_ + kPushBackSize + kBufferSize) break;
+    }
+    return buffer_end_ > buffer_start;
    }
  
-  virtual void SeekForward(int pos) {
+  virtual unsigned SlowSeekForward(unsigned pos) {
      // Seeking in the input is not used by preparsing.
      // It's only used by the real parser based on preparser data.
      UNIMPLEMENTED();
+    return 0;
    }
  
   private:
+  static const unsigned kBufferSize = 512;
+  static const unsigned kPushBackSize = 16;
    v8::UnicodeInputStream* const stream_;
+  // Buffer holding first kPushBackSize characters of pushback buffer,
+  // then kBufferSize chars of read-ahead.
+  // The pushback buffer is only used if pushing back characters past
+  // the start of a block.
+  uc16 buffer_[kBufferSize + kPushBackSize];
+  bool pushback_active_;
  };
  
  
  class StandAloneJavaScriptScanner : public JavaScriptScanner {
   public:
-  void Initialize(UTF16Buffer* source) {
+  void Initialize(UC16CharacterStream* source) {
      source_ = source;
      literal_flags_ = kLiteralString | kLiteralIdentifier;
      Init();
diff --git a/src/preparser.cc b/src/preparser.cc

index 03fc9dc..7cce685 100644 (file)
--- a/src/preparser.cc
+++ b/src/preparser.cc
@@ -1078,6 +1078,7 @@ PreParser::Expression PreParser::ParseFunctionLiteral(bool* ok) {
  
      Expect(i::Token::RBRACE, CHECK_OK);
  
+    // Position right after terminal '}'.
      int end_pos = scanner_->location().end_pos;
      log_->LogFunction(function_block_pos, end_pos,
                        function_scope.materialized_literal_count(),
diff --git a/src/scanner-base.cc b/src/scanner-base.cc

index 9e58c4e..a01e75e 100644 (file)
--- a/src/scanner-base.cc
+++ b/src/scanner-base.cc
@@ -35,12 +35,6 @@ namespace v8 {
  namespace internal {
  
  // ----------------------------------------------------------------------------
-// UTF16Buffer
-
-UTF16Buffer::UTF16Buffer()
-    : pos_(0), end_(kNoEndPosition) { }
-
-// ----------------------------------------------------------------------------
  // LiteralCollector
  
  LiteralCollector::LiteralCollector()
@@ -92,7 +86,7 @@ bool ScannerConstants::IsIdentifier(unibrow::CharacterStream* buffer) {
  // ----------------------------------------------------------------------------
  // Scanner
  
-Scanner::Scanner() : source_(NULL) {}
+Scanner::Scanner() { }
  
  
  uc32 Scanner::ScanHexEscape(uc32 c, int length) {
@@ -142,8 +136,7 @@ uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
  // ----------------------------------------------------------------------------
  // JavaScriptScanner
  
-JavaScriptScanner::JavaScriptScanner()
-    : has_line_terminator_before_next_(false) {}
+JavaScriptScanner::JavaScriptScanner() : Scanner() {}
  
  
  Token::Value JavaScriptScanner::Next() {
@@ -503,13 +496,23 @@ void JavaScriptScanner::Scan() {
  
  
  void JavaScriptScanner::SeekForward(int pos) {
-  source_->SeekForward(pos - 1);
-  Advance();
-  // This function is only called to seek to the location
-  // of the end of a function (at the "}" token). It doesn't matter
-  // whether there was a line terminator in the part we skip.
-  has_line_terminator_before_next_ = false;
+  // After this call, we will have the token at the given position as
+  // the "next" token. The "current" token will be invalid.
+  if (pos == next_.location.beg_pos) return;
+  int current_pos = source_pos();
+  ASSERT_EQ(next_.location.end_pos, current_pos);
+  // Positions inside the lookahead token aren't supported.
+  ASSERT(pos >= current_pos);
+  if (pos != current_pos) {
+    source_->SeekForward(pos - source_->pos());
+    Advance();
+    // This function is only called to seek to the location
+    // of the end of a function (at the "}" token). It doesn't matter
+    // whether there was a line terminator in the part we skip.
+    has_line_terminator_before_next_ = false;
+  }
    Scan();
+  ASSERT_EQ(Token::RBRACE, next_.token);
  }
  
  
diff --git a/src/scanner-base.h b/src/scanner-base.h

index 3d344f3..c50b8f3 100644 (file)
--- a/src/scanner-base.h
+++ b/src/scanner-base.h
@@ -52,31 +52,75 @@ inline int HexValue(uc32 c) {
    return -1;
  }
  
-// ----------------------------------------------------------------------------
-// UTF16Buffer - scanner input source with pushback.
  
-class UTF16Buffer {
+// ---------------------------------------------------------------------
+// Buffered stream of characters, using an internal UC16 buffer.
+
+class UC16CharacterStream {
   public:
-  UTF16Buffer();
-  virtual ~UTF16Buffer() {}
+  UC16CharacterStream() : pos_(0) { }
+  virtual ~UC16CharacterStream() { }
+
+  // Returns and advances past the next UC16 character in the input
+  // stream. If there are no more characters, it returns a negative
+  // value.
+  inline int32_t Advance() {
+    if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
+      pos_++;
+      return *(buffer_cursor_++);
+    }
+    // Note: currently the following increment is necessary to avoid a
+    // parser problem! The scanner treats the final kEndOfInput as
+    // a character with a position, and does math relative to that
+    // position.
+    pos_++;
  
-  virtual void PushBack(uc32 ch) = 0;
-  // Returns a value < 0 when the buffer end is reached.
-  virtual uc32 Advance() = 0;
-  virtual void SeekForward(int pos) = 0;
+    return kEndOfInput;
+  }
  
-  int pos() const { return pos_; }
+  // Return the current position in the character stream.
+  // Starts at zero.
+  inline unsigned pos() const { return pos_; }
+
+  // Skips forward past the next character_count UC16 characters
+  // in the input, or until the end of input if that comes sooner.
+  // Returns the number of characters actually skipped. If less
+  // than character_count, the end of the input was reached.
+  inline unsigned SeekForward(unsigned character_count) {
+    unsigned buffered_chars =
+        static_cast<unsigned>(buffer_end_ - buffer_cursor_);
+    if (character_count <= buffered_chars) {
+      buffer_cursor_ += character_count;
+      pos_ += character_count;
+      return character_count;
+    }
+    return SlowSeekForward(character_count);
+  }
  
-  static const int kNoEndPosition = 1;
+  // Pushes back the most recently read UC16 character, i.e.,
+  // the value returned by the most recent call to Advance.
+  // Must not be used right after calling SeekForward.
+  virtual void PushBack(uc16 character) = 0;
  
   protected:
-  // Initial value of end_ before the input stream is initialized.
-
-  int pos_;  // Current position in the buffer.
-  int end_;  // Position where scanning should stop (EOF).
+  static const int32_t kEndOfInput = -1;
+
+  // Ensures that the buffer_cursor_ points to the character at
+  // position pos_ of the input, if possible. If the position
+  // is at or after the end of the input, return false. If there
+  // are more characters available, return true.
+  virtual bool ReadBlock() = 0;
+  virtual unsigned SlowSeekForward(unsigned character_count) = 0;
+
+  const uc16* buffer_cursor_;
+  const uc16* buffer_end_;
+  unsigned pos_;
  };
  
  
+// ---------------------------------------------------------------------
+// Constants used by scanners.
+
  class ScannerConstants : AllStatic {
   public:
    typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
@@ -277,7 +321,7 @@ class Scanner {
    // Low-level scanning support.
    void Advance() { c0_ = source_->Advance(); }
    void PushBack(uc32 ch) {
-    source_->PushBack(ch);
+    source_->PushBack(c0_);
      c0_ = ch;
    }
  
@@ -307,8 +351,8 @@ class Scanner {
    TokenDesc current_;  // desc for current token (as returned by Next())
    TokenDesc next_;     // desc for next token (one token look-ahead)
  
-  // Input stream. Must be initialized to an UTF16Buffer.
-  UTF16Buffer* source_;
+  // Input stream. Must be initialized to a UC16CharacterStream.
+  UC16CharacterStream* source_;
  
    // Buffer to hold literal values (identifiers, strings, numbers)
    // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
diff --git a/src/scanner.cc b/src/scanner.cc

index d22ebc7..47e9895 100755 (executable)
--- a/src/scanner.cc
+++ b/src/scanner.cc
@@ -36,63 +36,265 @@ namespace v8 {
  namespace internal {
  
  // ----------------------------------------------------------------------------
-// UTF16Buffer
-
-// CharacterStreamUTF16Buffer
-CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()
-    : pushback_buffer_(0), last_(0), stream_(NULL) { }
+// BufferedUC16CharacterStreams
+
+BufferedUC16CharacterStream::BufferedUC16CharacterStream()
+    : UC16CharacterStream(),
+      pushback_limit_(NULL) {
+  // Initialize buffer as being empty. First read will fill the buffer.
+  buffer_cursor_ = buffer_;
+  buffer_end_ = buffer_;
+}
  
+BufferedUC16CharacterStream::~BufferedUC16CharacterStream() { }
  
-void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,
-                                            unibrow::CharacterStream* input,
-                                            int start_position,
-                                            int end_position) {
-  stream_ = input;
-  if (start_position > 0) {
-    SeekForward(start_position);
+void BufferedUC16CharacterStream::PushBack(uc16 character) {
+  if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {
+    // buffer_ is writable, buffer_cursor_ is const pointer.
+    buffer_[--buffer_cursor_ - buffer_] = character;
+    pos_--;
+    return;
    }
-  end_ = end_position != kNoEndPosition ? end_position : kMaxInt;
+  SlowPushBack(character);
  }
  
  
-void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {
-  pushback_buffer()->Add(last_);
-  last_ = ch;
+void BufferedUC16CharacterStream::SlowPushBack(uc16 character) {
+  // In pushback mode, the end of the buffer contains pushback,
+  // and the start of the buffer (from buffer start to pushback_limit_)
+  // contains valid data that comes just after the pushback.
+  // We NULL the pushback_limit_ if pushing all the way back to the
+  // start of the buffer.
+
+  if (pushback_limit_ == NULL) {
+    // Enter pushback mode.
+    pushback_limit_ = buffer_end_;
+    buffer_end_ = buffer_ + kBufferSize;
+    buffer_cursor_ = buffer_end_;
+  }
+  ASSERT(pushback_limit_ > buffer_);
+  ASSERT(pos_ > 0);
+  buffer_[--buffer_cursor_ - buffer_] = character;
+  if (buffer_cursor_ == buffer_) {
+    pushback_limit_ = NULL;
+  } else if (buffer_cursor_ < pushback_limit_) {
+    pushback_limit_ = buffer_cursor_;
+  }
    pos_--;
  }
  
  
-uc32 CharacterStreamUTF16Buffer::Advance() {
-  ASSERT(end_ != kNoEndPosition);
-  ASSERT(end_ >= 0);
-  // NOTE: It is of importance to Persian / Farsi resources that we do
-  // *not* strip format control characters in the scanner; see
-  //
-  //    https://bugzilla.mozilla.org/show_bug.cgi?id=274152
-  //
-  // So, even though ECMA-262, section 7.1, page 11, dictates that we
-  // must remove Unicode format-control characters, we do not. This is
-  // in line with how IE and SpiderMonkey handles it.
-  if (!pushback_buffer()->is_empty()) {
-    pos_++;
-    return last_ = pushback_buffer()->RemoveLast();
-  } else if (stream_->has_more() && pos_ < end_) {
-    pos_++;
-    uc32 next = stream_->GetNext();
-    return last_ = next;
-  } else {
-    // Note: currently the following increment is necessary to avoid a
-    // test-parser problem!
-    pos_++;
-    return last_ = static_cast<uc32>(-1);
+bool BufferedUC16CharacterStream::ReadBlock() {
+  if (pushback_limit_ != NULL) {
+    buffer_cursor_ = buffer_;
+    buffer_end_ = pushback_limit_;
+    pushback_limit_ = NULL;
+    ASSERT(buffer_cursor_ != buffer_end_);
+    return true;
    }
+  unsigned length = FillBuffer(pos_, kBufferSize);
+  buffer_cursor_ = buffer_;
+  buffer_end_ = buffer_ + length;
+  return length > 0;
  }
  
  
-void CharacterStreamUTF16Buffer::SeekForward(int pos) {
-  pos_ = pos;
-  ASSERT(pushback_buffer()->is_empty());
-  stream_->Seek(pos);
+unsigned BufferedUC16CharacterStream::SlowSeekForward(unsigned delta) {
+  // Leave pushback mode (i.e., ignore that there might be valid data
+  // in the buffer before the pushback_limit_ point).
+  pushback_limit_ = NULL;
+  return BufferSeekForward(delta);
+}
+
+// ----------------------------------------------------------------------------
+// GenericStringUC16CharacterStream
+
+
+// Streams the characters [start_position, end_position) of |data|.
+GenericStringUC16CharacterStream::GenericStringUC16CharacterStream(
+    Handle<String> data,
+    unsigned start_position,
+    unsigned end_position)
+    : string_(data), start_position_(start_position),
+      length_(end_position) {
+  ASSERT(end_position >= start_position);
+  buffer_cursor_ = buffer_end_ = buffer_;  // Empty until the first read.
+  pos_ = start_position;
+}
+
+
+GenericStringUC16CharacterStream::~GenericStringUC16CharacterStream() { }
+
+
+unsigned GenericStringUC16CharacterStream::BufferSeekForward(unsigned delta) {
+  unsigned old_pos = pos_;
+  pos_ = Min(pos_ + delta, length_);
+  ReadBlock();
+  return pos_ - old_pos;
+}
+
+
+// Copies up to |length| chars from |from_pos| into buffer_; returns count.
+unsigned GenericStringUC16CharacterStream::FillBuffer(unsigned from_pos,
+                                                      unsigned length) {
+  if (from_pos >= length_) return 0;
+  // Clamp by subtraction so that from_pos + length cannot wrap around.
+  if (length > length_ - from_pos) length = length_ - from_pos;
+  String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
+  return length;
+}
+
+
+// ----------------------------------------------------------------------------
+// Utf8ToUC16CharacterStream
+Utf8ToUC16CharacterStream::Utf8ToUC16CharacterStream(const byte* data,
+                                                     unsigned length)
+    : BufferedUC16CharacterStream(),
+      raw_data_(data),
+      raw_data_length_(length),
+      raw_data_pos_(0),
+      raw_character_position_(0) {
+  ReadBlock();
+}
+
+
+Utf8ToUC16CharacterStream::~Utf8ToUC16CharacterStream() { }
+
+
+unsigned Utf8ToUC16CharacterStream::BufferSeekForward(unsigned delta) {
+  unsigned old_pos = pos_;
+  unsigned target_pos = pos_ + delta;
+  SetRawPosition(target_pos);
+  pos_ = raw_character_position_;
+  ReadBlock();
+  return pos_ - old_pos;
+}
+
+
+unsigned Utf8ToUC16CharacterStream::FillBuffer(unsigned char_position,
+                                               unsigned length) {
+  static const unibrow::uchar kMaxUC16Character = 0xffff;
+  SetRawPosition(char_position);
+  if (raw_character_position_ != char_position) {
+    // char_position was not a valid position in the stream (hit the end
+    // while spooling to it).
+    return 0u;
+  }
+  unsigned i = 0;
+  while (i < length) {
+    if (raw_data_pos_ == raw_data_length_) break;
+    unibrow::uchar c = raw_data_[raw_data_pos_];
+    if (c <= unibrow::Utf8::kMaxOneByteChar) {
+      raw_data_pos_++;
+    } else {
+      c =  unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_,
+                                         raw_data_length_ - raw_data_pos_,
+                                         &raw_data_pos_);
+      // Don't allow characters outside of the BMP.
+      if (c > kMaxUC16Character) {
+        c = unibrow::Utf8::kBadChar;
+      }
+    }
+    buffer_[i++] = static_cast<uc16>(c);
+  }
+  raw_character_position_ = char_position + i;
+  return i;
+}
+
+
+static const byte kUtf8MultiByteMask = 0xC0;
+static const byte kUtf8MultiByteCharStart = 0xC0;
+static const byte kUtf8MultiByteCharFollower = 0x80;
+
+
+#ifdef DEBUG
+static bool IsUtf8MultiCharacterStart(byte first_byte) {
+  return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
+}
+#endif
+
+
+static bool IsUtf8MultiCharacterFollower(byte later_byte) {
+  return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
+}
+
+
+// Move the cursor back to point at the preceding UTF-8 character start
+// in the buffer.
+static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
+  byte character = buffer[--*cursor];
+  if (character > unibrow::Utf8::kMaxOneByteChar) {
+    ASSERT(IsUtf8MultiCharacterFollower(character));
+    // Last byte of a multi-byte character encoding. Step backwards until
+    // pointing to the first byte of the encoding, recognized by having the
+    // top two bits set.
+    while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
+    ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor]));
+  }
+}
+
+
+// Move the cursor forward to point at the start of the next UTF-8 character
+// in the buffer.
+static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
+  byte character = buffer[(*cursor)++];
+  if (character > unibrow::Utf8::kMaxOneByteChar) {
+    // First byte of a multi-byte character encoding.
+    // The number of leading one-bits determines the encoding length:
+    //  110..... - (0xCx, 0xDx) one additional byte (minimum).
+    //  1110.... - (0xEx) two additional bytes.
+    //  11110... - (0xFx) three additional bytes (maximum).
+    ASSERT(IsUtf8MultiCharacterStart(character));
+    // Additional bytes is:
+    // 1 if value in range 0xC0 .. 0xDF.
+    // 2 if value in range 0xE0 .. 0xEF.
+    // 3 if value in range 0xF0 .. 0xF7.
+    // Look that up in 0x3211: one nibble per value of bits 4-5 of the byte.
+    unsigned additional_bytes =
+        ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
+    *cursor += additional_bytes;
+    // The last byte just skipped must be a continuation byte.
+    ASSERT(IsUtf8MultiCharacterFollower(buffer[*cursor - 1]));
+  }
+}
+
+
+void Utf8ToUC16CharacterStream::SetRawPosition(unsigned target_position) {
+  if (raw_character_position_ > target_position) {
+    // Spool backwards in utf8 buffer.
+    do {
+      Utf8CharacterBack(raw_data_, &raw_data_pos_);
+      raw_character_position_--;
+    } while (raw_character_position_ > target_position);
+    return;
+  }
+  // Spool forwards in the utf8 buffer.
+  while (raw_character_position_ < target_position) {
+    if (raw_data_pos_ == raw_data_length_) return;
+    Utf8CharacterForward(raw_data_, &raw_data_pos_);
+    raw_character_position_++;
+  }
+}
+
+
+// ----------------------------------------------------------------------------
+// ExternalTwoByteStringUC16CharacterStream
+
+ExternalTwoByteStringUC16CharacterStream::
+    ~ExternalTwoByteStringUC16CharacterStream() { }
+
+
+// Reads directly from the string's characters [start_position, end_position).
+ExternalTwoByteStringUC16CharacterStream
+    ::ExternalTwoByteStringUC16CharacterStream(
+        Handle<ExternalTwoByteString> data,
+        int start_position, int end_position)
+    : UC16CharacterStream(),
+      source_(data),
+      raw_data_(data->GetTwoByteData(start_position)) {
+  buffer_cursor_ = raw_data_;
+  buffer_end_ = raw_data_ + (end_position - start_position);
+  pos_ = start_position;
  }
  
  
@@ -115,46 +317,19 @@ void Scanner::LiteralScope::Complete() {
    complete_ = true;
  }
  
+
  // ----------------------------------------------------------------------------
  // V8JavaScriptScanner
  
-void V8JavaScriptScanner::Initialize(Handle<String> source,
-                                     int literal_flags) {
-  source_ = stream_initializer_.Init(source, NULL, 0, source->length());
-  // Need to capture identifiers in order to recognize "get" and "set"
-  // in object literals.
-  literal_flags_ = literal_flags | kLiteralIdentifier;
-  Init();
-  // Skip initial whitespace allowing HTML comment ends just like
-  // after a newline and scan first token.
-  has_line_terminator_before_next_ = true;
-  SkipWhiteSpace();
-  Scan();
-}
-
-
-void V8JavaScriptScanner::Initialize(Handle<String> source,
-                                     unibrow::CharacterStream* stream,
-                                     int literal_flags) {
-  source_ = stream_initializer_.Init(source, stream,
-                                     0, UTF16Buffer::kNoEndPosition);
-  literal_flags_ = literal_flags | kLiteralIdentifier;
-  Init();
-  // Skip initial whitespace allowing HTML comment ends just like
-  // after a newline and scan first token.
-  has_line_terminator_before_next_ = true;
-  SkipWhiteSpace();
-  Scan();
-}
+V8JavaScriptScanner::V8JavaScriptScanner() : JavaScriptScanner() { }
  
  
-void V8JavaScriptScanner::Initialize(Handle<String> source,
-                                     int start_position,
-                                     int end_position,
+void V8JavaScriptScanner::Initialize(UC16CharacterStream* source,
                                       int literal_flags) {
-  source_ = stream_initializer_.Init(source, NULL,
-                                     start_position, end_position);
+  source_ = source;
+  // Need to capture identifiers in order to recognize "get" and "set"
+  // in object literals.
    literal_flags_ = literal_flags | kLiteralIdentifier;
    Init();
    // Skip initial whitespace allowing HTML comment ends just like
    // after a newline and scan first token.
@@ -164,48 +339,14 @@ void V8JavaScriptScanner::Initialize(Handle<String> source,
  }
  
  
-UTF16Buffer* StreamInitializer::Init(Handle<String> source,
-                                     unibrow::CharacterStream* stream,
-                                     int start_position,
-                                     int end_position) {
-  // Either initialize the scanner from a character stream or from a
-  // string.
-  ASSERT(source.is_null() || stream == NULL);
-
-  // Initialize the source buffer.
-  if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {
-    two_byte_string_buffer_.Initialize(
-        Handle<ExternalTwoByteString>::cast(source),
-        start_position,
-        end_position);
-    return &two_byte_string_buffer_;
-  } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) {
-    ascii_string_buffer_.Initialize(
-        Handle<ExternalAsciiString>::cast(source),
-        start_position,
-        end_position);
-    return &ascii_string_buffer_;
-  } else {
-    if (!source.is_null()) {
-      safe_string_input_buffer_.Reset(source.location());
-      stream = &safe_string_input_buffer_;
-    }
-    char_stream_buffer_.Initialize(source,
-                                   stream,
-                                   start_position,
-                                   end_position);
-    return &char_stream_buffer_;
-  }
-}
-
  // ----------------------------------------------------------------------------
  // JsonScanner
  
-JsonScanner::JsonScanner() {}
+JsonScanner::JsonScanner() : Scanner() { }
  
  
-void JsonScanner::Initialize(Handle<String> source) {
-  source_ = stream_initializer_.Init(source, NULL, 0, source->length());
+void JsonScanner::Initialize(UC16CharacterStream* source) {
+  source_ = source;
    Init();
    // Skip initial whitespace.
    SkipJsonWhiteSpace();
diff --git a/src/scanner.h b/src/scanner.h

index adeea9b..572778f 100644 (file)
--- a/src/scanner.h
+++ b/src/scanner.h
@@ -35,67 +35,97 @@
  namespace v8 {
  namespace internal {
  
-// UTF16 buffer to read characters from a character stream.
-class CharacterStreamUTF16Buffer: public UTF16Buffer {
+// A buffered character stream based on a random access character
+// source (ReadBlock can be called with pos_ pointing to any position,
+// even positions before the current).
+class BufferedUC16CharacterStream: public UC16CharacterStream {
   public:
-  CharacterStreamUTF16Buffer();
-  virtual ~CharacterStreamUTF16Buffer() {}
-  void Initialize(Handle<String> data,
-                  unibrow::CharacterStream* stream,
-                  int start_position,
-                  int end_position);
-  virtual void PushBack(uc32 ch);
-  virtual uc32 Advance();
-  virtual void SeekForward(int pos);
-
- private:
-  List<uc32> pushback_buffer_;
-  uc32 last_;
-  unibrow::CharacterStream* stream_;
-
-  List<uc32>* pushback_buffer() { return &pushback_buffer_; }
+  BufferedUC16CharacterStream();
+  virtual ~BufferedUC16CharacterStream();
+
+  virtual void PushBack(uc16 character);
+
+ protected:
+  static const unsigned kBufferSize = 512;
+  static const unsigned kPushBackStepSize = 16;
+
+  virtual unsigned SlowSeekForward(unsigned delta);
+  virtual bool ReadBlock();
+  virtual void SlowPushBack(uc16 character);
+
+  virtual unsigned BufferSeekForward(unsigned delta) = 0;
+  virtual unsigned FillBuffer(unsigned position, unsigned length) = 0;
+
+  const uc16* pushback_limit_;
+  uc16 buffer_[kBufferSize];
  };
  
  
-// UTF16 buffer to read characters from an external string.
-template <typename StringType, typename CharType>
-class ExternalStringUTF16Buffer: public UTF16Buffer {
+// Generic string stream.
+class GenericStringUC16CharacterStream: public BufferedUC16CharacterStream {
   public:
-  ExternalStringUTF16Buffer();
-  virtual ~ExternalStringUTF16Buffer() {}
-  void Initialize(Handle<StringType> data,
-                  int start_position,
-                  int end_position);
-  virtual void PushBack(uc32 ch);
-  virtual uc32 Advance();
-  virtual void SeekForward(int pos);
-
- private:
-  const CharType* raw_data_;  // Pointer to the actual array of characters.
+  GenericStringUC16CharacterStream(Handle<String> data,
+                                   unsigned start_position,
+                                   unsigned end_position);
+  virtual ~GenericStringUC16CharacterStream();
+
+ protected:
+  virtual unsigned BufferSeekForward(unsigned delta);
+  virtual unsigned FillBuffer(unsigned position, unsigned length);
+
+  Handle<String> string_;
+  unsigned start_position_;
+  unsigned length_;
  };
  
  
-// Initializes a UTF16Buffer as input stream, using one of a number
-// of strategies depending on the available character sources.
-class StreamInitializer {
+// UC16 stream based on a literal UTF-8 string.
+class Utf8ToUC16CharacterStream: public BufferedUC16CharacterStream {
   public:
-  UTF16Buffer* Init(Handle<String> source,
-                    unibrow::CharacterStream* stream,
-                    int start_position,
-                    int end_position);
- private:
-  // Different UTF16 buffers used to pull characters from. Based on input one of
-  // these will be initialized as the actual data source.
-  CharacterStreamUTF16Buffer char_stream_buffer_;
-  ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
-      two_byte_string_buffer_;
-  ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
-
-  // Used to convert the source string into a character stream when a stream
-  // is not passed to the scanner.
-  SafeStringInputBuffer safe_string_input_buffer_;
+  Utf8ToUC16CharacterStream(const byte* data, unsigned length);
+  virtual ~Utf8ToUC16CharacterStream();
+
+ protected:
+  virtual unsigned BufferSeekForward(unsigned delta);
+  virtual unsigned FillBuffer(unsigned char_position, unsigned length);
+  void SetRawPosition(unsigned char_position);
+
+  const byte* raw_data_;
+  unsigned raw_data_length_;  // Measured in bytes, not characters.
+  unsigned raw_data_pos_;
+  // The character position of the character at raw_data_[raw_data_pos_].
+  // Not necessarily the same as pos_.
+  unsigned raw_character_position_;
+};
+
+
+// UC16 character stream reading directly from an external two-byte string.
+class ExternalTwoByteStringUC16CharacterStream: public UC16CharacterStream {
+ public:
+  ExternalTwoByteStringUC16CharacterStream(Handle<ExternalTwoByteString> data,
+                                           int start_position,
+                                           int end_position);
+  virtual ~ExternalTwoByteStringUC16CharacterStream();
+
+  virtual void PushBack(uc16 character) {
+    ASSERT(buffer_cursor_ > raw_data_);
+    buffer_cursor_--;
+    pos_--;
+  }
+ protected:
+  virtual unsigned SlowSeekForward(unsigned delta) {
+    // Fast case always handles seeking.
+    return 0;
+  }
+  virtual bool ReadBlock() {
+    // Entire string is read at start.
+    return false;
+  }
+  Handle<ExternalTwoByteString> source_;
+  const uc16* raw_data_;  // Pointer to the actual array of characters.
  };
  
+
  // ----------------------------------------------------------------------------
  // V8JavaScriptScanner
  // JavaScript scanner getting its input from either a V8 String or a unicode
@@ -103,19 +133,9 @@ class StreamInitializer {
  
  class V8JavaScriptScanner : public JavaScriptScanner {
   public:
-  V8JavaScriptScanner() {}
-
-  // Initialize the Scanner to scan source.
-  void Initialize(Handle<String> source, int literal_flags = kAllLiterals);
-  void Initialize(Handle<String> source,
-                  unibrow::CharacterStream* stream,
-                  int literal_flags = kAllLiterals);
-  void Initialize(Handle<String> source,
-                  int start_position, int end_position,
+  V8JavaScriptScanner();
+  void Initialize(UC16CharacterStream* source,
                    int literal_flags = kAllLiterals);
-
- protected:
-  StreamInitializer stream_initializer_;
  };
  
  
@@ -123,8 +143,7 @@ class JsonScanner : public Scanner {
   public:
    JsonScanner();
  
-  // Initialize the Scanner to scan source.
-  void Initialize(Handle<String> source);
+  void Initialize(UC16CharacterStream* source);
  
    // Returns the next token.
    Token::Value Next();
@@ -138,7 +157,7 @@ class JsonScanner : public Scanner {
    // Recognizes all of the single-character tokens directly, or calls a function
    // to scan a number, string or identifier literal.
    // The only allowed whitespace characters between tokens are tab,
-  // carrige-return, newline and space.
+  // carriage-return, newline and space.
    void ScanJson();
  
    // A JSON number (production JSONNumber) is a subset of the valid JavaScript
@@ -159,60 +178,8 @@ class JsonScanner : public Scanner {
    // are the only valid JSON identifiers (productions JSONBooleanLiteral,
    // JSONNullLiteral).
    Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
-
-  StreamInitializer stream_initializer_;
  };
  
-
-// ExternalStringUTF16Buffer
-template <typename StringType, typename CharType>
-ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer()
-    : raw_data_(NULL) { }
-
-
-template <typename StringType, typename CharType>
-void ExternalStringUTF16Buffer<StringType, CharType>::Initialize(
-     Handle<StringType> data,
-     int start_position,
-     int end_position) {
-  ASSERT(!data.is_null());
-  raw_data_ = data->resource()->data();
-
-  ASSERT(end_position <= data->length());
-  if (start_position > 0) {
-    SeekForward(start_position);
-  }
-  end_ =
-      end_position != kNoEndPosition ? end_position : data->length();
-}
-
-
-template <typename StringType, typename CharType>
-uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() {
-  if (pos_ < end_) {
-    return raw_data_[pos_++];
-  } else {
-    // note: currently the following increment is necessary to avoid a
-    // test-parser problem!
-    pos_++;
-    return static_cast<uc32>(-1);
-  }
-}
-
-
-template <typename StringType, typename CharType>
-void ExternalStringUTF16Buffer<StringType, CharType>::PushBack(uc32 ch) {
-  pos_--;
-  ASSERT(pos_ >= Scanner::kCharacterLookaheadBufferSize);
-  ASSERT(raw_data_[pos_ - Scanner::kCharacterLookaheadBufferSize] == ch);
-}
-
-
-template <typename StringType, typename CharType>
-void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) {
-  pos_ = pos;
-}
-
  } }  // namespace v8::internal
  
  #endif  // V8_SCANNER_H_
diff --git a/test/cctest/test-parsing.cc b/test/cctest/test-parsing.cc

index badbab5..e642d1b 100755 (executable)
--- a/test/cctest/test-parsing.cc
+++ b/test/cctest/test-parsing.cc
@@ -260,10 +260,12 @@ TEST(StandAlonePreParser) {
    uintptr_t stack_limit = i::StackGuard::real_climit();
    for (int i = 0; programs[i]; i++) {
      const char* program = programs[i];
-    unibrow::Utf8InputBuffer<256> stream(program, strlen(program));
+    i::Utf8ToUC16CharacterStream stream(
+        reinterpret_cast<const i::byte*>(program),
+        static_cast<unsigned>(strlen(program)));
      i::CompleteParserRecorder log;
      i::V8JavaScriptScanner scanner;
-    scanner.Initialize(i::Handle<i::String>::null(), &stream);
+    scanner.Initialize(&stream);
  
      v8::preparser::PreParser::PreParseResult result =
          v8::preparser::PreParser::PreParseProgram(&scanner,
@@ -289,9 +291,10 @@ TEST(RegressChromium62639) {
    // and then used the invalid currently scanned literal. This always
    // failed in debug mode, and sometimes crashed in release mode.
  
-  unibrow::Utf8InputBuffer<256> stream(program, strlen(program));
+  i::Utf8ToUC16CharacterStream stream(reinterpret_cast<const i::byte*>(program),
+                                      static_cast<unsigned>(strlen(program)));
    i::ScriptDataImpl* data =
-      i::ParserApi::PreParse(i::Handle<i::String>::null(), &stream, NULL);
+      i::ParserApi::PreParse(&stream, NULL);
    CHECK(data->HasError());
    delete data;
  }
@@ -310,10 +313,10 @@ TEST(Regress928) {
        "try { } catch (e) { var foo = function () { /* first */ } }"
        "var bar = function () { /* second */ }";
  
-  unibrow::Utf8InputBuffer<256> stream(program, strlen(program));
+  i::Utf8ToUC16CharacterStream stream(reinterpret_cast<const i::byte*>(program),
+                                      static_cast<unsigned>(strlen(program)));
    i::ScriptDataImpl* data =
-      i::ParserApi::PartialPreParse(i::Handle<i::String>::null(),
-                                    &stream, NULL);
+      i::ParserApi::PartialPreParse(&stream, NULL);
    CHECK(!data->HasError());
  
    data->Initialize();
@@ -347,10 +350,12 @@ TEST(PreParseOverflow) {
  
    uintptr_t stack_limit = i::StackGuard::real_climit();
  
-  unibrow::Utf8InputBuffer<256> stream(*program, strlen(*program));
+  i::Utf8ToUC16CharacterStream stream(
+      reinterpret_cast<const i::byte*>(*program),
+      static_cast<unsigned>(kProgramSize));
    i::CompleteParserRecorder log;
    i::V8JavaScriptScanner scanner;
-  scanner.Initialize(i::Handle<i::String>::null(), &stream);
+  scanner.Initialize(&stream);
  
  
    v8::preparser::PreParser::PreParseResult result =
@@ -360,3 +365,283 @@ TEST(PreParseOverflow) {
                                                  stack_limit);
    CHECK_EQ(v8::preparser::PreParser::kPreParseStackOverflow, result);
  }
+
+
+class TestExternalResource: public v8::String::ExternalStringResource {
+ public:
+  explicit TestExternalResource(uint16_t* data, int length)
+      : data_(data), length_(static_cast<size_t>(length)) { }
+
+  ~TestExternalResource() { }
+
+  const uint16_t* data() const {
+    return data_;
+  }
+
+  size_t length() const {
+    return length_;
+  }
+ private:
+  uint16_t* data_;
+  size_t length_;
+};
+
+
+#define CHECK_EQU(v1, v2) CHECK_EQ(static_cast<int>(v1), static_cast<int>(v2))
+
+void TestCharacterStream(const char* ascii_source,
+                         unsigned length,
+                         unsigned start = 0,
+                         unsigned end = 0) {
+  if (end == 0) end = length;
+  unsigned sub_length = end - start;
+  i::HandleScope test_scope;
+  i::SmartPointer<i::uc16> uc16_buffer(new i::uc16[length]);
+  for (unsigned i = 0; i < length; i++) {
+    uc16_buffer[i] = static_cast<i::uc16>(ascii_source[i]);
+  }
+  i::Vector<const char> ascii_vector(ascii_source, static_cast<int>(length));
+  i::Handle<i::String> ascii_string(
+      i::Factory::NewStringFromAscii(ascii_vector));
+  TestExternalResource resource(*uc16_buffer, length);
+  i::Handle<i::String> uc16_string(
+      i::Factory::NewExternalStringFromTwoByte(&resource));
+
+  i::ExternalTwoByteStringUC16CharacterStream uc16_stream(
+      i::Handle<i::ExternalTwoByteString>::cast(uc16_string), start, end);
+  i::GenericStringUC16CharacterStream string_stream(ascii_string, start, end);
+  i::Utf8ToUC16CharacterStream utf8_stream(
+      reinterpret_cast<const i::byte*>(ascii_source), end);
+  utf8_stream.SeekForward(start);
+
+  unsigned i = start;
+  while (i < end) {
+    // Read streams one char at a time
+    CHECK_EQU(i, uc16_stream.pos());
+    CHECK_EQU(i, string_stream.pos());
+    CHECK_EQU(i, utf8_stream.pos());
+    int32_t c0 = ascii_source[i];
+    int32_t c1 = uc16_stream.Advance();
+    int32_t c2 = string_stream.Advance();
+    int32_t c3 = utf8_stream.Advance();
+    i++;
+    CHECK_EQ(c0, c1);
+    CHECK_EQ(c0, c2);
+    CHECK_EQ(c0, c3);
+    CHECK_EQU(i, uc16_stream.pos());
+    CHECK_EQU(i, string_stream.pos());
+    CHECK_EQU(i, utf8_stream.pos());
+  }
+  while (i > start + sub_length / 4) {
+    // Pushback, re-read, pushback again.
+    int32_t c0 = ascii_source[i - 1];
+    CHECK_EQU(i, uc16_stream.pos());
+    CHECK_EQU(i, string_stream.pos());
+    CHECK_EQU(i, utf8_stream.pos());
+    uc16_stream.PushBack(c0);
+    string_stream.PushBack(c0);
+    utf8_stream.PushBack(c0);
+    i--;
+    CHECK_EQU(i, uc16_stream.pos());
+    CHECK_EQU(i, string_stream.pos());
+    CHECK_EQU(i, utf8_stream.pos());
+    int32_t c1 = uc16_stream.Advance();
+    int32_t c2 = string_stream.Advance();
+    int32_t c3 = utf8_stream.Advance();
+    i++;
+    CHECK_EQU(i, uc16_stream.pos());
+    CHECK_EQU(i, string_stream.pos());
+    CHECK_EQU(i, utf8_stream.pos());
+    CHECK_EQ(c0, c1);
+    CHECK_EQ(c0, c2);
+    CHECK_EQ(c0, c3);
+    uc16_stream.PushBack(c0);
+    string_stream.PushBack(c0);
+    utf8_stream.PushBack(c0);
+    i--;
+    CHECK_EQU(i, uc16_stream.pos());
+    CHECK_EQU(i, string_stream.pos());
+    CHECK_EQU(i, utf8_stream.pos());
+  }
+  unsigned halfway = start + sub_length / 2;
+  uc16_stream.SeekForward(halfway - i);
+  string_stream.SeekForward(halfway - i);
+  utf8_stream.SeekForward(halfway - i);
+  i = halfway;
+  CHECK_EQU(i, uc16_stream.pos());
+  CHECK_EQU(i, string_stream.pos());
+  CHECK_EQU(i, utf8_stream.pos());
+
+  while (i < end) {
+    // Read streams one char at a time
+    CHECK_EQU(i, uc16_stream.pos());
+    CHECK_EQU(i, string_stream.pos());
+    CHECK_EQU(i, utf8_stream.pos());
+    int32_t c0 = ascii_source[i];
+    int32_t c1 = uc16_stream.Advance();
+    int32_t c2 = string_stream.Advance();
+    int32_t c3 = utf8_stream.Advance();
+    i++;
+    CHECK_EQ(c0, c1);
+    CHECK_EQ(c0, c2);
+    CHECK_EQ(c0, c3);
+    CHECK_EQU(i, uc16_stream.pos());
+    CHECK_EQU(i, string_stream.pos());
+    CHECK_EQU(i, utf8_stream.pos());
+  }
+
+  int32_t c1 = uc16_stream.Advance();
+  int32_t c2 = string_stream.Advance();
+  int32_t c3 = utf8_stream.Advance();
+  CHECK_LT(c1, 0);
+  CHECK_LT(c2, 0);
+  CHECK_LT(c3, 0);
+}
+
+
+TEST(CharacterStreams) {
+  v8::HandleScope handles;
+  v8::Persistent<v8::Context> context = v8::Context::New();
+  v8::Context::Scope context_scope(context);
+
+  TestCharacterStream("abc\0\n\r\x7f", 7);
+  static const unsigned kBigStringSize = 4096;
+  char buffer[kBigStringSize + 1];
+  for (unsigned i = 0; i < kBigStringSize; i++) {
+    buffer[i] = static_cast<char>(i & 0x7f);
+  }
+  TestCharacterStream(buffer, kBigStringSize);
+
+  TestCharacterStream(buffer, kBigStringSize, 576, 3298);
+
+  TestCharacterStream("\0", 1);
+  TestCharacterStream("", 0);
+}
+
+
+// Encodes every value up to kMaxUC16Char as UTF-8, then reads, pushes back
+TEST(Utf8CharacterStream) {  // and seeks through the resulting stream.
+  static const unsigned kMaxUC16CharU = unibrow::Utf8::kMaxThreeByteChar;
+  static const int kMaxUC16Char = static_cast<int>(kMaxUC16CharU);
+
+  static const int kAllUtf8CharsSize =
+      (unibrow::Utf8::kMaxOneByteChar + 1) +
+      (unibrow::Utf8::kMaxTwoByteChar - unibrow::Utf8::kMaxOneByteChar) * 2 +
+      (unibrow::Utf8::kMaxThreeByteChar - unibrow::Utf8::kMaxTwoByteChar) * 3;
+  static const unsigned kAllUtf8CharsSizeU =
+      static_cast<unsigned>(kAllUtf8CharsSize);
+  char buffer[kAllUtf8CharsSizeU];
+  unsigned cursor = 0;
+  for (int i = 0; i <= kMaxUC16Char; i++) {
+    cursor += unibrow::Utf8::Encode(buffer + cursor, i);
+  }
+  CHECK_EQU(kAllUtf8CharsSizeU, cursor);  // Enforced in release builds too.
+
+  i::Utf8ToUC16CharacterStream stream(reinterpret_cast<const i::byte*>(buffer),
+                                      kAllUtf8CharsSizeU);
+  for (int i = 0; i <= kMaxUC16Char; i++) {
+    CHECK_EQU(i, stream.pos());
+    int32_t c = stream.Advance();
+    CHECK_EQ(i, c);
+    CHECK_EQU(i + 1, stream.pos());
+  }
+  for (int i = kMaxUC16Char; i >= 0; i--) {
+    CHECK_EQU(i + 1, stream.pos());
+    stream.PushBack(i);
+    CHECK_EQU(i, stream.pos());
+  }
+  int i = 0;
+  while (stream.pos() < kMaxUC16CharU) {
+    CHECK_EQU(i, stream.pos());
+    unsigned progress = stream.SeekForward(12);
+    i += progress;
+    int32_t c = stream.Advance();
+    if (i <= kMaxUC16Char) {
+      CHECK_EQ(i, c);
+    } else {
+      CHECK_EQ(-1, c);
+    }
+    i += 1;
+    CHECK_EQU(i, stream.pos());
+  }
+}
+
+#undef CHECK_EQU
+
+void TestStreamScanner(i::UC16CharacterStream* stream,  // Scans stream, checking tokens in order.
+                       i::Token::Value* expected_tokens,  // Terminated by i::Token::ILLEGAL.
+                       int skip_pos = 0,  // Token end position that triggers a seek; zero means not skipping.
+                       int skip_to = 0) {  // Argument passed to scanner.SeekForward.
+  i::V8JavaScriptScanner scanner;
+  scanner.Initialize(stream, i::JavaScriptScanner::kAllLiterals);
+
+  int i = 0;
+  do {  // The first expected token is always checked before the terminator test.
+    i::Token::Value expected = expected_tokens[i];
+    i::Token::Value actual = scanner.Next();
+    CHECK_EQ(i::Token::String(expected), i::Token::String(actual));  // Compares Token::String results.
+    if (scanner.location().end_pos == skip_pos) {
+      scanner.SeekForward(skip_to);
+    }
+    i++;
+  } while (expected_tokens[i] != i::Token::ILLEGAL);  // Stop at the ILLEGAL terminator.
+}
+
+TEST(StreamScanner) {  // Checks scanned token sequences, without and with SeekForward skips.
+  const char* str1 = "{ foo get for : */ <- \n\n /*foo*/ bib";
+  i::Utf8ToUC16CharacterStream stream1(reinterpret_cast<const i::byte*>(str1),
+                                       static_cast<unsigned>(strlen(str1)));
+  i::Token::Value expectations1[] = {
+      i::Token::LBRACE,
+      i::Token::IDENTIFIER,
+      i::Token::IDENTIFIER,
+      i::Token::FOR,
+      i::Token::COLON,
+      i::Token::MUL,
+      i::Token::DIV,
+      i::Token::LT,
+      i::Token::SUB,
+      i::Token::IDENTIFIER,
+      i::Token::EOS,
+      i::Token::ILLEGAL
+  };
+  TestStreamScanner(&stream1, expectations1, 0, 0);  // skip_pos 0: no skipping.
+
+  const char* str2 = "case default const {THIS\nPART\nSKIPPED} do";
+  i::Utf8ToUC16CharacterStream stream2(reinterpret_cast<const i::byte*>(str2),
+                                       static_cast<unsigned>(strlen(str2)));
+  i::Token::Value expectations2[] = {
+      i::Token::CASE,
+      i::Token::DEFAULT,
+      i::Token::CONST,
+      i::Token::LBRACE,
+      // The text between the braces is skipped, so no tokens are expected here.
+      i::Token::RBRACE,
+      i::Token::DO,
+      i::Token::EOS,
+      i::Token::ILLEGAL
+  };
+  ASSERT_EQ('{', str2[19]);  // '{' is at index 19; skip_pos 20 below is presumably its end position.
+  ASSERT_EQ('}', str2[37]);  // '}' is at index 37, the skip_to value below.
+  TestStreamScanner(&stream2, expectations2, 20, 37);
+
+  const char* str3 = "{}}}}";
+  i::Token::Value expectations3[] = {
+      i::Token::LBRACE,
+      i::Token::RBRACE,
+      i::Token::RBRACE,
+      i::Token::RBRACE,
+      i::Token::RBRACE,
+      i::Token::EOS,
+      i::Token::ILLEGAL
+  };
+  // Skip zero-four RBRACEs.
+  for (int i = 0; i <= 4; i++) {
+     expectations3[6 - i] = i::Token::ILLEGAL;  // Terminator moves one slot earlier each iteration.
+     expectations3[5 - i] = i::Token::EOS;  // EOS overwrites one more RBRACE each iteration (none when i == 0).
+     i::Utf8ToUC16CharacterStream stream3(  // Fresh stream per iteration.
+         reinterpret_cast<const i::byte*>(str3),
+         static_cast<unsigned>(strlen(str3)));
+     TestStreamScanner(&stream3, expectations3, 1, 1 + i);  // Seek to 1 + i once a token ends at 1.
+  }
+}
author	lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Tue, 7 Dec 2010 14:03:59 +0000 (14:03 +0000)
committer	lrn@chromium.org <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Tue, 7 Dec 2010 14:03:59 +0000 (14:03 +0000)
src/api.cc		patch \| blob \| history
src/checks.h		patch \| blob \| history
src/compiler.cc		patch \| blob \| history
src/parser.cc		patch \| blob \| history
src/parser.h		patch \| blob \| history
src/preparser-api.cc		patch \| blob \| history
src/preparser.cc		patch \| blob \| history
src/scanner-base.cc		patch \| blob \| history
src/scanner-base.h		patch \| blob \| history
src/scanner.cc		patch \| blob \| history
src/scanner.h		patch \| blob \| history
test/cctest/test-parsing.cc		patch \| blob \| history