From 3ed8c2f5208468ed8659c73d2088fbb18605f899 Mon Sep 17 00:00:00 2001
From: "sandholm@chromium.org"
 <sandholm@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Date: Wed, 1 Jun 2011 14:06:30 +0000
Subject: [PATCH] Remove scanner abstraction layer from JSON parsing. Review
 URL: http://codereview.chromium.org/7020018

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@8147 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 src/json-parser.cc | 350 +++++++++++++++++++----------------------------------
 src/json-parser.h  |  93 +++++++-------
 2 files changed, 173 insertions(+), 270 deletions(-)
diff --git a/src/json-parser.cc b/src/json-parser.cc
index b7f57c2..7e24310 100644
--- a/src/json-parser.cc
+++ b/src/json-parser.cc
@@ -53,50 +53,50 @@ Handle<Object> JsonParser::ParseJson(Handle<String> source) {
   // Set initial position right before the string.
   position_ = -1;
   // Advance to the first character (posibly EOS)
-  Advance();
-  Next();
+  AdvanceSkipWhitespace();
   Handle<Object> result = ParseJsonValue();
-  if (result.is_null() || Next() != Token::EOS) {
-    // Parse failed. Scanner's current token is the unexpected token.
-    Token::Value token = current_.token;
+  if (result.is_null() || c0_ != kEndOfString) {
+    // Parse failed. Current character is the unexpected token.
 
     const char* message;
-    const char* name_opt = NULL;
+    Factory* factory = isolate()->factory();
+    Handle<JSArray> array;
 
-    switch (token) {
-      case Token::EOS:
+    switch (c0_) {
+      case kEndOfString:
         message = "unexpected_eos";
+        array = factory->NewJSArray(0);
         break;
-      case Token::NUMBER:
+      case '-':
+      case '0':
+      case '1':
+      case '2':
+      case '3':
+      case '4':
+      case '5':
+      case '6':
+      case '7':
+      case '8':
+      case '9':
         message = "unexpected_token_number";
+        array = factory->NewJSArray(0);
         break;
-      case Token::STRING:
+      case '"':
         message = "unexpected_token_string";
-        break;
-      case Token::IDENTIFIER:
-      case Token::FUTURE_RESERVED_WORD:
-        message = "unexpected_token_identifier";
+        array = factory->NewJSArray(0);
         break;
       default:
         message = "unexpected_token";
-        name_opt = Token::String(token);
-        ASSERT(name_opt != NULL);
+        Handle<Object> name = LookupSingleCharacterStringFromCode(c0_);
+        Handle<FixedArray> element = factory->NewFixedArray(1);
+        element->set(0, *name);
+        array = factory->NewJSArrayWithElements(element);
         break;
     }
 
-    Factory* factory = isolate()->factory();
     MessageLocation location(factory->NewScript(source),
-                             current_.beg_pos,
-                             current_.end_pos);
-    Handle<JSArray> array;
-    if (name_opt == NULL) {
-      array = factory->NewJSArray(0);
-    } else {
-      Handle<String> name = factory->NewStringFromUtf8(CStrVector(name_opt));
-      Handle<FixedArray> element = factory->NewFixedArray(1);
-      element->set(0, *name);
-      array = factory->NewJSArrayWithElements(element);
-    }
+                             position_,
+                             position_ + 1);
     Handle<Object> result = factory->NewSyntaxError(message, array);
     isolate()->Throw(*result, &location);
     return Handle<Object>::null();
@@ -107,49 +107,71 @@ Handle<Object> JsonParser::ParseJson(Handle<String> source) {
 
 // Parse any JSON value.
 Handle<Object> JsonParser::ParseJsonValue() {
-  Token::Value token = Next();
-  switch (token) {
-    case Token::STRING:
-      return GetString(false);
-    case Token::NUMBER:
-      return isolate()->factory()->NewNumber(number_);
-    case Token::FALSE_LITERAL:
-      return isolate()->factory()->false_value();
-    case Token::TRUE_LITERAL:
-      return isolate()->factory()->true_value();
-    case Token::NULL_LITERAL:
-      return isolate()->factory()->null_value();
-    case Token::LBRACE:
+  switch (c0_) {
+    case '"':
+      return ParseJsonString();
+    case '-':
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9':
+      return ParseJsonNumber();
+    case 'f':
+      if (AdvanceGetChar() == 'a' && AdvanceGetChar() == 'l' &&
+          AdvanceGetChar() == 's' && AdvanceGetChar() == 'e') {
+        AdvanceSkipWhitespace();
+        return isolate()->factory()->false_value();
+      } else {
+        return ReportUnexpectedCharacter();
+      }
+    case 't':
+      if (AdvanceGetChar() == 'r' && AdvanceGetChar() == 'u' &&
+          AdvanceGetChar() == 'e') {
+        AdvanceSkipWhitespace();
+        return isolate()->factory()->true_value();
+      } else {
+        return ReportUnexpectedCharacter();
+      }
+    case 'n':
+      if (AdvanceGetChar() == 'u' && AdvanceGetChar() == 'l' &&
+          AdvanceGetChar() == 'l') {
+        AdvanceSkipWhitespace();
+        return isolate()->factory()->null_value();
+      } else {
+        return ReportUnexpectedCharacter();
+      }
+    case '{':
       return ParseJsonObject();
-    case Token::LBRACK:
+    case '[':
       return ParseJsonArray();
     default:
-      return ReportUnexpectedToken();
+      return ReportUnexpectedCharacter();
   }
 }
 
 
-// Parse a JSON object. Scanner must be right after '{' token.
+// Parse a JSON object. Position must be right at '{'.
 Handle<Object> JsonParser::ParseJsonObject() {
   Handle<JSFunction> object_constructor(
       isolate()->global_context()->object_function());
   Handle<JSObject> json_object =
       isolate()->factory()->NewJSObject(object_constructor);
+  ASSERT_EQ(c0_, '{');
 
-  if (Peek() == Token::RBRACE) {
-    Next();
-  } else {
+  AdvanceSkipWhitespace();
+  if (c0_ != '}') {
     do {
-      if (Next() != Token::STRING) {
-        return ReportUnexpectedToken();
-      }
-      Handle<String> key = GetString(true);
-      if (Next() != Token::COLON) {
-        return ReportUnexpectedToken();
-      }
-
+      Handle<String> key = ParseJsonSymbol();
+      if (key.is_null() || c0_ != ':') return ReportUnexpectedCharacter();
+      AdvanceSkipWhitespace();
       Handle<Object> value = ParseJsonValue();
-      if (value.is_null()) return Handle<Object>::null();
+      if (value.is_null()) return ReportUnexpectedCharacter();
 
       uint32_t index;
       if (key->AsArrayIndex(&index)) {
@@ -159,150 +181,46 @@ Handle<Object> JsonParser::ParseJsonObject() {
       } else {
         SetLocalPropertyIgnoreAttributes(json_object, key, value, NONE);
       }
-    } while (Next() == Token::COMMA);
-    if (current_.token != Token::RBRACE) {
-      return ReportUnexpectedToken();
+    } while (MatchSkipWhiteSpace(','));
+    if (c0_ != '}') {
+      return ReportUnexpectedCharacter();
     }
   }
+  AdvanceSkipWhitespace();
   return json_object;
 }
 
-// Parse a JSON array. Scanner must be right after '[' token.
+// Parse a JSON array. Position must be right at '['.
 Handle<Object> JsonParser::ParseJsonArray() {
   ZoneScope zone_scope(isolate(), DELETE_ON_EXIT);
   ZoneList<Handle<Object> > elements(4);
+  ASSERT_EQ(c0_, '[');
 
-  Token::Value token = Peek();
-  if (token == Token::RBRACK) {
-    Next();
-  } else {
+  AdvanceSkipWhitespace();
+  if (c0_ != ']') {
     do {
       Handle<Object> element = ParseJsonValue();
-      if (element.is_null()) return Handle<Object>::null();
+      if (element.is_null()) return ReportUnexpectedCharacter();
       elements.Add(element);
-      token = Next();
-    } while (token == Token::COMMA);
-    if (token != Token::RBRACK) {
-      return ReportUnexpectedToken();
+    } while (MatchSkipWhiteSpace(','));
+    if (c0_ != ']') {
+      return ReportUnexpectedCharacter();
     }
   }
-
+  AdvanceSkipWhitespace();
   // Allocate a fixed array with all the elements.
   Handle<FixedArray> fast_elements =
       isolate()->factory()->NewFixedArray(elements.length());
-
   for (int i = 0, n = elements.length(); i < n; i++) {
     fast_elements->set(i, *elements[i]);
   }
-
   return isolate()->factory()->NewJSArrayWithElements(fast_elements);
 }
 
 
-Token::Value JsonParser::Next() {
-  current_ = next_;
-  ScanJson();
-  return current_.token;
-}
-
-void JsonParser::ScanJson() {
-  if (source_->IsSeqAsciiString()) {
-    is_sequential_ascii_ = true;
-  } else {
-    is_sequential_ascii_ = false;
-  }
-
-  Token::Value token;
-  do {
-    // Remember the position of the next token
-    next_.beg_pos = position_;
-    switch (c0_) {
-      case '\t':
-      case '\r':
-      case '\n':
-      case ' ':
-        Advance();
-        token = Token::WHITESPACE;
-        break;
-      case '{':
-        Advance();
-        token = Token::LBRACE;
-        break;
-      case '}':
-        Advance();
-        token = Token::RBRACE;
-        break;
-      case '[':
-        Advance();
-        token = Token::LBRACK;
-        break;
-      case ']':
-        Advance();
-        token = Token::RBRACK;
-        break;
-      case ':':
-        Advance();
-        token = Token::COLON;
-        break;
-      case ',':
-        Advance();
-        token = Token::COMMA;
-        break;
-      case '"':
-        token = ScanJsonString();
-        break;
-      case '-':
-      case '0':
-      case '1':
-      case '2':
-      case '3':
-      case '4':
-      case '5':
-      case '6':
-      case '7':
-      case '8':
-      case '9':
-        token = ScanJsonNumber();
-        break;
-      case 't':
-        token = ScanJsonIdentifier("true", Token::TRUE_LITERAL);
-        break;
-      case 'f':
-        token = ScanJsonIdentifier("false", Token::FALSE_LITERAL);
-        break;
-      case 'n':
-        token = ScanJsonIdentifier("null", Token::NULL_LITERAL);
-        break;
-      default:
-        if (c0_ < 0) {
-          Advance();
-          token = Token::EOS;
-        } else {
-          Advance();
-          token = Token::ILLEGAL;
-        }
-    }
-  } while (token == Token::WHITESPACE);
-
-  next_.end_pos = position_;
-  next_.token = token;
-}
-
-
-Token::Value JsonParser::ScanJsonIdentifier(const char* text,
-                                            Token::Value token) {
-  while (*text != '\0') {
-    if (c0_ != *text) return Token::ILLEGAL;
-    Advance();
-    text++;
-  }
-  return token;
-}
-
-
-Token::Value JsonParser::ScanJsonNumber() {
+Handle<Object> JsonParser::ParseJsonNumber() {
   bool negative = false;
-
+  beg_pos_ = position_;
   if (c0_ == '-') {
     Advance();
     negative = true;
@@ -311,11 +229,11 @@ Token::Value JsonParser::ScanJsonNumber() {
     Advance();
     // Prefix zero is only allowed if it's the only digit before
     // a decimal point or exponent.
-    if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL;
+    if ('0' <= c0_ && c0_ <= '9') return ReportUnexpectedCharacter();
   } else {
     int i = 0;
     int digits = 0;
-    if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL;
+    if (c0_ < '1' || c0_ > '9') return ReportUnexpectedCharacter();
     do {
       i = i * 10 + c0_ - '0';
       digits++;
@@ -323,12 +241,13 @@ Token::Value JsonParser::ScanJsonNumber() {
     } while (c0_ >= '0' && c0_ <= '9');
     if (c0_ != '.' && c0_ != 'e' && c0_ != 'E' && digits < 10) {
       number_ = (negative ? -i : i);
-      return Token::NUMBER;
+      SkipWhitespace();
+      return isolate()->factory()->NewNumber(number_);
     }
   }
   if (c0_ == '.') {
     Advance();
-    if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
+    if (c0_ < '0' || c0_ > '9') return ReportUnexpectedCharacter();
     do {
       Advance();
     } while (c0_ >= '0' && c0_ <= '9');
@@ -336,37 +255,38 @@ Token::Value JsonParser::ScanJsonNumber() {
   if (AsciiAlphaToLower(c0_) == 'e') {
     Advance();
     if (c0_ == '-' || c0_ == '+') Advance();
-    if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
+    if (c0_ < '0' || c0_ > '9') return ReportUnexpectedCharacter();
     do {
       Advance();
     } while (c0_ >= '0' && c0_ <= '9');
   }
+  int length = position_ - beg_pos_;
   if (is_sequential_ascii_) {
-    Vector<const char> chars(seq_source_->GetChars() +  next_.beg_pos,
-                             position_ - next_.beg_pos);
+    Vector<const char> chars(seq_source_->GetChars() +  beg_pos_, length);
     number_ = StringToDouble(isolate()->unicode_cache(),
                              chars,
                              NO_FLAGS,  // Hex, octal or trailing junk.
                              OS::nan_value());
   } else {
-    Vector<char> buffer = Vector<char>::New(position_ - next_.beg_pos);
-    String::WriteToFlat(*source_, buffer.start(), next_.beg_pos, position_);
+    Vector<char> buffer = Vector<char>::New(length);
+    String::WriteToFlat(*source_, buffer.start(), beg_pos_, position_);
     Vector<const char> result =
         Vector<const char>(reinterpret_cast<const char*>(buffer.start()),
-        position_ - next_.beg_pos);
+        length);
     number_ = StringToDouble(isolate()->unicode_cache(),
                              result,
                              NO_FLAGS,  // Hex, octal or trailing junk.
                              0.0);
     buffer.Dispose();
   }
-  return Token::NUMBER;
+  SkipWhitespace();
+  return isolate()->factory()->NewNumber(number_);
 }
 
-Token::Value JsonParser::SlowScanJsonString() {
+Handle<Object> JsonParser::SlowScanJsonString() {
   // The currently scanned ascii characters.
   Handle<String> ascii(isolate()->factory()->NewSubString(source_,
-                                                          next_.beg_pos + 1,
+                                                          beg_pos_,
                                                           position_));
   Handle<String> two_byte =
       isolate()->factory()->NewRawTwoByteString(kInitialSpecialStringSize,
@@ -392,7 +312,7 @@ Token::Value JsonParser::SlowScanJsonString() {
     }
 
     // Check for control character (0x00-0x1f) or unterminated string (<0).
-    if (c0_ < 0x20) return Token::ILLEGAL;
+    if (c0_ < 0x20) return ReportUnexpectedCharacter();
     if (c0_ != '\\') {
       seq_two_byte->SeqTwoByteStringSet(count++, c0_);
       Advance();
@@ -425,7 +345,7 @@ Token::Value JsonParser::SlowScanJsonString() {
             Advance();
             int digit = HexValue(c0_);
             if (digit < 0) {
-              return Token::ILLEGAL;
+              return ReportUnexpectedCharacter();
             }
             value = value * 16 + digit;
           }
@@ -433,14 +353,14 @@ Token::Value JsonParser::SlowScanJsonString() {
           break;
         }
         default:
-          return Token::ILLEGAL;
+          return ReportUnexpectedCharacter();
       }
       Advance();
     }
   }
   // Advance past the last '"'.
   ASSERT_EQ('"', c0_);
-  Advance();
+  AdvanceSkipWhitespace();
 
   // Shrink the the string to our length.
   if (isolate()->heap()->InNewSpace(*seq_two_byte)) {
@@ -456,21 +376,19 @@ Token::Value JsonParser::SlowScanJsonString() {
     seq_two_byte->set_length(count);
     isolate()->heap()->CreateFillerObjectAt(start_filler_object, delta);
   }
-  string_val_ = isolate()->factory()->NewConsString(ascii, seq_two_byte);
-  return Token::STRING;
+  return isolate()->factory()->NewConsString(ascii, seq_two_byte);
 }
 
 
-Token::Value JsonParser::ScanJsonString() {
+template <bool is_symbol>
+Handle<Object> JsonParser::ScanJsonString() {
   ASSERT_EQ('"', c0_);
-  // Set string_val to null. If string_val is not set we assume an
-  // ascii string begining at next_.beg_pos + 1 to next_.end_pos - 1.
-  string_val_ = Handle<String>::null();
   Advance();
+  beg_pos_ = position_;
   // Fast case for ascii only without escape characters.
   while (c0_ != '"') {
     // Check for control character (0x00-0x1f) or unterminated string (<0).
-    if (c0_ < 0x20) return Token::ILLEGAL;
+    if (c0_ < 0x20) return ReportUnexpectedCharacter();
     if (c0_ != '\\' && c0_ < kMaxAsciiCharCode) {
       Advance();
     } else {
@@ -478,36 +396,16 @@ Token::Value JsonParser::ScanJsonString() {
     }
   }
   ASSERT_EQ('"', c0_);
+  end_pos_ = position_;
   // Advance past the last '"'.
-  Advance();
-  return Token::STRING;
-}
-
-Handle<String> JsonParser::GetString() {
-  return GetString(false);
-}
-
-Handle<String> JsonParser::GetSymbol() {
-  Handle<String> result = GetString(true);
-  if (result->IsSymbol()) return result;
-  return isolate()->factory()->LookupSymbol(result);
-}
-
-Handle<String> JsonParser::GetString(bool hint_symbol) {
-  // We have a non ascii string, return that.
-  if (!string_val_.is_null()) return string_val_;
-
-  if (is_sequential_ascii_ && hint_symbol) {
-    Handle<SeqAsciiString> seq = Handle<SeqAsciiString>::cast(source_);
-    // The current token includes the '"' in both ends.
-    int length = current_.end_pos - current_.beg_pos - 2;
+  AdvanceSkipWhitespace();
+  if (is_sequential_ascii_ && is_symbol) {
     return isolate()->factory()->LookupAsciiSymbol(seq_source_,
-                                                   current_.beg_pos + 1,
-                                                   length);
+                                                   beg_pos_,
+                                                   end_pos_ - beg_pos_);
+  } else {
+    return isolate()->factory()->NewSubString(source_, beg_pos_, end_pos_);
   }
-  // The current token includes the '"' in both ends.
-  return  isolate()->factory()->NewSubString(
-      source_, current_.beg_pos + 1, current_.end_pos - 1);
 }
 
 } }  // namespace v8::internal
diff --git a/src/json-parser.h b/src/json-parser.h
index 5903d21..07f00f5 100644
--- a/src/json-parser.h
+++ b/src/json-parser.h
@@ -47,41 +47,62 @@ class JsonParser BASE_EMBEDDED {
   Handle<Object> ParseJson(Handle<String> source);
 
   inline void Advance() {
-    if (position_ >= source_length_) {
-      position_++;
+    position_++;
+    if (position_ > source_length_) {
       c0_ = kEndOfString;
     } else if (is_sequential_ascii_) {
-      position_++;
       c0_ = seq_source_->SeqAsciiStringGet(position_);
     } else {
-      position_++;
       c0_ = source_->Get(position_);
     }
   }
 
-  inline Isolate* isolate() { return isolate_; }
+  // The JSON lexical grammar is specified in the ECMAScript 5 standard,
+  // section 15.12.1.1. The only allowed whitespace characters between tokens
+  // are tab, carriage-return, newline and space.
+
+  inline void AdvanceSkipWhitespace() {
+    do {
+      Advance();
+    } while (c0_ == '\t' || c0_ == '\r' || c0_ == '\n' || c0_ == ' ');
+  }
 
-  // Get the string for the current string token.
-  Handle<String> GetString(bool hint_symbol);
-  Handle<String> GetString();
-  Handle<String> GetSymbol();
+  inline void SkipWhitespace() {
+    while (c0_ == '\t' || c0_ == '\r' || c0_ == '\n' || c0_ == ' ') {
+      Advance();
+    }
+  }
 
-  // Scan a single JSON token. The JSON lexical grammar is specified in the
-  // ECMAScript 5 standard, section 15.12.1.1.
-  // Recognizes all of the single-character tokens directly, or calls a function
-  // to scan a number, string or identifier literal.
-  // The only allowed whitespace characters between tokens are tab,
-  // carriage-return, newline and space.
-  void ScanJson();
+  inline uc32 AdvanceGetChar() {
+    Advance();
+    return c0_;
+  }
+
+  // Checks that the current character is c.
+  // If so, then consume c and skip whitespace.
+  inline bool MatchSkipWhiteSpace(uc32 c) {
+    if (c0_ == c) {
+      AdvanceSkipWhitespace();
+      return true;
+    }
+    return false;
+  }
 
   // A JSON string (production JSONString) is subset of valid JavaScript string
   // literals. The string must only be double-quoted (not single-quoted), and
   // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
   // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
-  Token::Value ScanJsonString();
+  Handle<Object> ParseJsonString() {
+    return ScanJsonString<false>();
+  }
+  Handle<Object> ParseJsonSymbol() {
+    return ScanJsonString<true>();
+  }
+  template <bool is_symbol>
+  Handle<Object> ScanJsonString();
   // Slow version for unicode support, uses the first ascii_count characters,
   // as first part of a ConsString
-  Token::Value SlowScanJsonString();
+  Handle<Object> SlowScanJsonString();
 
   // A JSON number (production JSONNumber) is a subset of the valid JavaScript
   // decimal number literals.
@@ -89,12 +110,7 @@ class JsonParser BASE_EMBEDDED {
   // digit before and after a decimal point, may not have prefixed zeros (unless
   // the integer part is zero), and may include an exponent part (e.g., "e-10").
   // Hexadecimal and octal numbers are not allowed.
-  Token::Value ScanJsonNumber();
-
-  // Used to recognizes one of the literals "true", "false", or "null". These
-  // are the only valid JSON identifiers (productions JSONBooleanLiteral,
-  // JSONNullLiteral).
-  Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
+  Handle<Object> ParseJsonNumber();
 
   // Parse a single JSON value from input (grammar production JSONValue).
   // A JSON value is either a (double-quoted) string literal, a number literal,
@@ -119,21 +135,11 @@ class JsonParser BASE_EMBEDDED {
 
   // Mark that a parsing error has happened at the current token, and
   // return a null handle. Primarily for readability.
-  Handle<Object> ReportUnexpectedToken() { return Handle<Object>::null(); }
-
-  // Peek at the next token.
-  Token::Value Peek() { return next_.token; }
-  // Scan the next token and return the token scanned on the last call.
-  Token::Value Next();
-
-  struct TokenInfo {
-    TokenInfo() : token(Token::ILLEGAL),
-                  beg_pos(0),
-                  end_pos(0) { }
-    Token::Value token;
-    int beg_pos;
-    int end_pos;
-  };
+  inline Handle<Object> ReportUnexpectedCharacter() {
+    return Handle<Object>::null();
+  }
+
+  inline Isolate* isolate() { return isolate_; }
 
   static const int kInitialSpecialStringSize = 1024;
 
@@ -144,15 +150,14 @@ class JsonParser BASE_EMBEDDED {
   Handle<SeqAsciiString> seq_source_;
 
   bool is_sequential_ascii_;
-  // Current and next token
-  TokenInfo current_;
-  TokenInfo next_;
+  // begin and end position of scanned string or number
+  int beg_pos_;
+  int end_pos_;
+
   Isolate* isolate_;
   uc32 c0_;
   int position_;
 
-
-  Handle<String> string_val_;
   double number_;
 };
 
-- 
2.7.4