src/google/protobuf/util/internal/json_stream_parser.cc

   1 // Protocol Buffers - Google's data interchange format
   2 // Copyright 2008 Google Inc.  All rights reserved.
   3 // https://developers.google.com/protocol-buffers/
   4 //
   5 // Redistribution and use in source and binary forms, with or without
   6 // modification, are permitted provided that the following conditions are
   7 // met:
   8 //
   9 //     * Redistributions of source code must retain the above copyright
  10 // notice, this list of conditions and the following disclaimer.
  11 //     * Redistributions in binary form must reproduce the above
  12 // copyright notice, this list of conditions and the following disclaimer
  13 // in the documentation and/or other materials provided with the
  14 // distribution.
  15 //     * Neither the name of Google Inc. nor the names of its
  16 // contributors may be used to endorse or promote products derived from
  17 // this software without specific prior written permission.
  18 //
  19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30
  31 #include <google/protobuf/util/internal/json_stream_parser.h>
  32
  33 #include <algorithm>
  34 #include <cctype>
  35 #include <cerrno>
  36 #include <cstdlib>
  37 #include <cstring>
  38 #include <memory>
  39
  40 #include <google/protobuf/stubs/logging.h>
  41 #include <google/protobuf/stubs/common.h>
  42 #include <google/protobuf/stubs/strutil.h>
  43
  44 #include <google/protobuf/util/internal/object_writer.h>
  45 #include <google/protobuf/util/internal/json_escaping.h>
  46 #include <google/protobuf/stubs/mathlimits.h>
  47
  48
  49 namespace google {
  50 namespace protobuf {
  51 namespace util {
  52
  53 // Allow these symbols to be referenced as util::Status, util::error::* in
  54 // this file.
  55 using util::Status;
  56 namespace error {
  57 using util::error::CANCELLED;
  58 using util::error::INTERNAL;
  59 using util::error::INVALID_ARGUMENT;
  60 }  // namespace error
  61
  62 namespace converter {
  63
  64 // Number of digits in an escaped UTF-16 code unit ('\\' 'u' X X X X)
  65 static const int kUnicodeEscapedLength = 6;
  66
  67 static const int kDefaultMaxRecursionDepth = 100;
  68
  69 // Length of the true, false, and null literals.
  70 static const int true_len = strlen("true");
  71 static const int false_len = strlen("false");
  72 static const int null_len = strlen("null");
  73
  74 inline bool IsLetter(char c) {
  75   return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_') ||
  76          (c == '$');
  77 }
  78
  79 inline bool IsAlphanumeric(char c) {
  80   return IsLetter(c) || ('0' <= c && c <= '9');
  81 }
  82
  83 static bool ConsumeKey(StringPiece* input, StringPiece* key) {
  84   if (input->empty() || !IsLetter((*input)[0])) return false;
  85   int len = 1;
  86   for (; len < input->size(); ++len) {
  87     if (!IsAlphanumeric((*input)[len])) {
  88       break;
  89     }
  90   }
  91   *key = StringPiece(input->data(), len);
  92   *input = StringPiece(input->data() + len, input->size() - len);
  93   return true;
  94 }
  95
  96 static bool MatchKey(StringPiece input) {
  97   return !input.empty() && IsLetter(input[0]);
  98 }
  99
// Constructs a parser that sends its output to `ow`.
//
// All buffers and views start out empty, the option flags (coerce_to_utf8_,
// allow_empty_null_, loose_float_number_conversion_) start out false, no
// string is open, and the recursion limit is kDefaultMaxRecursionDepth.
JsonStreamParser::JsonStreamParser(ObjectWriter* ow)
    : ow_(ow),
      stack_(),
      leftover_(),
      json_(),
      p_(),
      key_(),
      key_storage_(),
      finishing_(false),
      parsed_(),
      parsed_storage_(),
      string_open_(0),
      chunk_storage_(),
      coerce_to_utf8_(false),
      allow_empty_null_(false),
      loose_float_number_conversion_(false),
      recursion_depth_(0),
      max_recursion_depth_(kDefaultMaxRecursionDepth) {
  // Initialize the stack with a single value to be parsed.
  stack_.push(VALUE);
}
 121
// The destructor body is empty; no explicit cleanup is performed here.
JsonStreamParser::~JsonStreamParser() {}
 123
 124
 125 util::Status JsonStreamParser::Parse(StringPiece json) {
 126   StringPiece chunk = json;
 127   // If we have leftovers from a previous chunk, append the new chunk to it
 128   // and create a new StringPiece pointing at the string's data. This could
 129   // be large but we rely on the chunks to be small, assuming they are
 130   // fragments of a Cord.
 131   if (!leftover_.empty()) {
 132     // Don't point chunk to leftover_ because leftover_ will be updated in
 133     // ParseChunk(chunk).
 134     chunk_storage_.swap(leftover_);
 135     StrAppend(&chunk_storage_, json);
 136     chunk = StringPiece(chunk_storage_);
 137   }
 138
 139   // Find the structurally valid UTF8 prefix and parse only that.
 140   int n = internal::UTF8SpnStructurallyValid(chunk);
 141   if (n > 0) {
 142     util::Status status = ParseChunk(chunk.substr(0, n));
 143
 144     // Any leftover characters are stashed in leftover_ for later parsing when
 145     // there is more data available.
 146     StrAppend(&leftover_, chunk.substr(n));
 147     return status;
 148   } else {
 149     leftover_.assign(chunk.data(), chunk.size());
 150     return util::Status();
 151   }
 152 }
 153
 154 util::Status JsonStreamParser::FinishParse() {
 155   // If we do not expect anything and there is nothing left to parse we're all
 156   // done.
 157   if (stack_.empty() && leftover_.empty()) {
 158     return util::Status();
 159   }
 160
 161   // Storage for UTF8-coerced string.
 162   std::unique_ptr<char[]> utf8;
 163   if (coerce_to_utf8_) {
 164     utf8.reset(new char[leftover_.size()]);
 165     char* coerced = internal::UTF8CoerceToStructurallyValid(leftover_, utf8.get(), ' ');
 166     p_ = json_ = StringPiece(coerced, leftover_.size());
 167   } else {
 168     p_ = json_ = leftover_;
 169     if (!internal::IsStructurallyValidUTF8(leftover_)) {
 170       return ReportFailure("Encountered non UTF-8 code points.");
 171     }
 172   }
 173
 174   // Parse the remainder in finishing mode, which reports errors for things like
 175   // unterminated strings or unknown tokens that would normally be retried.
 176   finishing_ = true;
 177   util::Status result = RunParser();
 178   if (result.ok()) {
 179     SkipWhitespace();
 180     if (!p_.empty()) {
 181       result = ReportFailure("Parsing terminated before end of input.");
 182     }
 183   }
 184   return result;
 185 }
 186
 187 util::Status JsonStreamParser::ParseChunk(StringPiece chunk) {
 188   // Do not do any work if the chunk is empty.
 189   if (chunk.empty()) return util::Status();
 190
 191   p_ = json_ = chunk;
 192
 193   finishing_ = false;
 194   util::Status result = RunParser();
 195   if (!result.ok()) return result;
 196
 197   SkipWhitespace();
 198   if (p_.empty()) {
 199     // If we parsed everything we had, clear the leftover.
 200     leftover_.clear();
 201   } else {
 202     // If we do not expect anything i.e. stack is empty, and we have non-empty
 203     // string left to parse, we report an error.
 204     if (stack_.empty()) {
 205       return ReportFailure("Parsing terminated before end of input.");
 206     }
 207     // If we expect future data i.e. stack is non-empty, and we have some
 208     // unparsed data left, we save it for later parse.
 209     leftover_ = std::string(p_);
 210   }
 211   return util::Status();
 212 }
 213
// Drives the parse loop: repeatedly takes the parse type on top of stack_,
// determines the next token, and dispatches to the matching Parse* handler
// until the stack is empty or a handler fails.
//
// A CANCELLED result while not finishing is not treated as an error: the
// parse type is pushed back so the step can be retried later, and OK is
// returned. Any other non-OK status is returned unchanged.
util::Status JsonStreamParser::RunParser() {
  while (!stack_.empty()) {
    ParseType type = stack_.top();
    // While a string is still open (string_open_ != 0) we are resuming inside
    // it, so the token is forced to BEGIN_STRING instead of being re-scanned.
    TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING;
    stack_.pop();
    util::Status result;
    switch (type) {
      case VALUE:
        result = ParseValue(t);
        break;

      case OBJ_MID:
        result = ParseObjectMid(t);
        break;

      case ENTRY:
        result = ParseEntry(t);
        break;

      case ENTRY_MID:
        result = ParseEntryMid(t);
        break;

      case ARRAY_VALUE:
        result = ParseArrayValue(t);
        break;

      case ARRAY_MID:
        result = ParseArrayMid(t);
        break;

      default:
        result = util::Status(util::error::INTERNAL,
                              StrCat("Unknown parse type: ", type));
        break;
    }
    if (!result.ok()) {
      // If we were cancelled, save our state and try again later.
      if (!finishing_ &&
          result == util::Status(util::error::CANCELLED, "")) {
        // Restore the parse type popped above so the step is retried.
        stack_.push(type);
        // If we have a key we still need to render, make sure to save off the
        // contents in our own storage.
        if (!key_.empty() && key_storage_.empty()) {
          StrAppend(&key_storage_, key_);
          key_ = StringPiece(key_storage_);
        }
        result = util::Status();
      }
      return result;
    }
  }
  return util::Status();
}
 268
 269 util::Status JsonStreamParser::ParseValue(TokenType type) {
 270   switch (type) {
 271     case BEGIN_OBJECT:
 272       return HandleBeginObject();
 273     case BEGIN_ARRAY:
 274       return HandleBeginArray();
 275     case BEGIN_STRING:
 276       return ParseString();
 277     case BEGIN_NUMBER:
 278       return ParseNumber();
 279     case BEGIN_TRUE:
 280       return ParseTrue();
 281     case BEGIN_FALSE:
 282       return ParseFalse();
 283     case BEGIN_NULL:
 284       return ParseNull();
 285     case UNKNOWN:
 286       return ReportUnknown("Expected a value.");
 287     default: {
 288       if (allow_empty_null_ && IsEmptyNullAllowed(type)) {
 289         return ParseEmptyNull();
 290       }
 291
 292       // Special case for having been cut off while parsing, wait for more data.
 293       // This handles things like 'fals' being at the end of the string, we
 294       // don't know if the next char would be e, completing it, or something
 295       // else, making it invalid.
 296       if (!finishing_ && p_.length() < false_len) {
 297         return util::Status(util::error::CANCELLED, "");
 298       }
 299       return ReportFailure("Unexpected token.");
 300     }
 301   }
 302 }
 303
 304 util::Status JsonStreamParser::ParseString() {
 305   util::Status result = ParseStringHelper();
 306   if (result.ok()) {
 307     ow_->RenderString(key_, parsed_);
 308     key_ = StringPiece();
 309     parsed_ = StringPiece();
 310     parsed_storage_.clear();
 311   }
 312   return result;
 313 }
 314
// Parses a quoted string starting at p_, or resumes one that was interrupted
// by the end of an earlier chunk (string_open_ != 0).
//
// On success parsed_ views the decoded contents: directly inside the input
// when nothing had to be copied, otherwise inside parsed_storage_. p_ is left
// just past the closing quote and string_open_ is reset to 0.
//
// Returns CANCELLED when the input ends before the closing quote and more
// data is expected; what was scanned so far is kept in parsed_storage_. In
// finishing mode the same situation is reported as an error.
util::Status JsonStreamParser::ParseStringHelper() {
  // If we haven't seen the start quote, grab it and remember it for later.
  if (string_open_ == 0) {
    string_open_ = *p_.data();
    GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\'');
    Advance();
  }
  // Track where we last copied data from so we can minimize copying.
  const char* last = p_.data();
  while (!p_.empty()) {
    const char* data = p_.data();
    if (*data == '\\') {
      // We're about to handle an escape, copy all bytes from last to data.
      if (last < data) {
        parsed_storage_.append(last, data - last);
      }
      // If we ran out of string after the \, cancel or report an error
      // depending on if we expect more data later.
      if (p_.length() == 1) {
        if (!finishing_) {
          return util::Status(util::error::CANCELLED, "");
        }
        return ReportFailure("Closing quote expected in string.");
      }
      // Parse a unicode escape if we found \u in the string.
      if (data[1] == 'u') {
        util::Status result = ParseUnicodeEscape();
        if (!result.ok()) {
          return result;
        }
        // Move last pointer past the unicode escape and continue.
        last = p_.data();
        continue;
      }
      // Handle the standard set of backslash-escaped characters.
      switch (data[1]) {
        case 'b':
          parsed_storage_.push_back('\b');
          break;
        case 'f':
          parsed_storage_.push_back('\f');
          break;
        case 'n':
          parsed_storage_.push_back('\n');
          break;
        case 'r':
          parsed_storage_.push_back('\r');
          break;
        case 't':
          parsed_storage_.push_back('\t');
          break;
        case 'v':
          parsed_storage_.push_back('\v');
          break;
        default:
          // Any other escaped character stands for itself.
          parsed_storage_.push_back(data[1]);
      }
      // We handled two characters, so advance past them and continue.
      p_.remove_prefix(2);
      last = p_.data();
      continue;
    }
    // If we found the closing quote note it, advance past it, and return.
    if (*data == string_open_) {
      // If we didn't copy anything, reuse the input buffer.
      if (parsed_storage_.empty()) {
        parsed_ = StringPiece(last, data - last);
      } else {
        if (last < data) {
          parsed_storage_.append(last, data - last);
        }
        parsed_ = StringPiece(parsed_storage_);
      }
      // Clear the quote char so next time we try to parse a string we'll
      // start fresh.
      string_open_ = 0;
      Advance();
      return util::Status();
    }
    // Normal character, just advance past it.
    Advance();
  }
  // If we ran out of characters, copy over what we have so far.
  if (last < p_.data()) {
    parsed_storage_.append(last, p_.data() - last);
  }
  // If we didn't find the closing quote but we expect more data, cancel for now
  if (!finishing_) {
    return util::Status(util::error::CANCELLED, "");
  }
  // End of string reached without a closing quote, report an error.
  string_open_ = 0;
  return ReportFailure("Closing quote expected in string.");
}
 409
 410 // Converts a unicode escaped character to a decimal value stored in a char32
 411 // for use in UTF8 encoding utility.  We assume that str begins with \uhhhh and
 412 // convert that from the hex number to a decimal value.
 413 //
 414 // There are some security exploits with UTF-8 that we should be careful of:
 415 //   - http://www.unicode.org/reports/tr36/#UTF-8_Exploit
 416 //   - http://sites/intl-eng/design-guide/core-application
 417 util::Status JsonStreamParser::ParseUnicodeEscape() {
 418   if (p_.length() < kUnicodeEscapedLength) {
 419     if (!finishing_) {
 420       return util::Status(util::error::CANCELLED, "");
 421     }
 422     return ReportFailure("Illegal hex string.");
 423   }
 424   GOOGLE_DCHECK_EQ('\\', p_.data()[0]);
 425   GOOGLE_DCHECK_EQ('u', p_.data()[1]);
 426   uint32 code = 0;
 427   for (int i = 2; i < kUnicodeEscapedLength; ++i) {
 428     if (!isxdigit(p_.data()[i])) {
 429       return ReportFailure("Invalid escape sequence.");
 430     }
 431     code = (code << 4) + hex_digit_to_int(p_.data()[i]);
 432   }
 433   if (code >= JsonEscaping::kMinHighSurrogate &&
 434       code <= JsonEscaping::kMaxHighSurrogate) {
 435     if (p_.length() < 2 * kUnicodeEscapedLength) {
 436       if (!finishing_) {
 437         return util::Status(util::error::CANCELLED, "");
 438       }
 439       if (!coerce_to_utf8_) {
 440         return ReportFailure("Missing low surrogate.");
 441       }
 442     } else if (p_.data()[kUnicodeEscapedLength] == '\\' &&
 443                p_.data()[kUnicodeEscapedLength + 1] == 'u') {
 444       uint32 low_code = 0;
 445       for (int i = kUnicodeEscapedLength + 2; i < 2 * kUnicodeEscapedLength;
 446            ++i) {
 447         if (!isxdigit(p_.data()[i])) {
 448           return ReportFailure("Invalid escape sequence.");
 449         }
 450         low_code = (low_code << 4) + hex_digit_to_int(p_.data()[i]);
 451       }
 452       if (low_code >= JsonEscaping::kMinLowSurrogate &&
 453           low_code <= JsonEscaping::kMaxLowSurrogate) {
 454         // Convert UTF-16 surrogate pair to 21-bit Unicode codepoint.
 455         code = (((code & 0x3FF) << 10) | (low_code & 0x3FF)) +
 456                JsonEscaping::kMinSupplementaryCodePoint;
 457         // Advance past the first code unit escape.
 458         p_.remove_prefix(kUnicodeEscapedLength);
 459       } else if (!coerce_to_utf8_) {
 460         return ReportFailure("Invalid low surrogate.");
 461       }
 462     } else if (!coerce_to_utf8_) {
 463       return ReportFailure("Missing low surrogate.");
 464     }
 465   }
 466   if (!coerce_to_utf8_ && !IsValidCodePoint(code)) {
 467     return ReportFailure("Invalid unicode code point.");
 468   }
 469   char buf[UTFmax];
 470   int len = EncodeAsUTF8Char(code, buf);
 471   // Advance past the [final] code unit escape.
 472   p_.remove_prefix(kUnicodeEscapedLength);
 473   parsed_storage_.append(buf, len);
 474   return util::Status();
 475 }
 476
 477 util::Status JsonStreamParser::ParseNumber() {
 478   NumberResult number;
 479   util::Status result = ParseNumberHelper(&number);
 480   if (result.ok()) {
 481     switch (number.type) {
 482       case NumberResult::DOUBLE:
 483         ow_->RenderDouble(key_, number.double_val);
 484         key_ = StringPiece();
 485         break;
 486
 487       case NumberResult::INT:
 488         ow_->RenderInt64(key_, number.int_val);
 489         key_ = StringPiece();
 490         break;
 491
 492       case NumberResult::UINT:
 493         ow_->RenderUint64(key_, number.uint_val);
 494         key_ = StringPiece();
 495         break;
 496
 497       default:
 498         return ReportFailure("Unable to parse number.");
 499     }
 500   }
 501   return result;
 502 }
 503
 504 util::Status JsonStreamParser::ParseDoubleHelper(const std::string& number,
 505                                                  NumberResult* result) {
 506   if (!safe_strtod(number, &result->double_val)) {
 507     return ReportFailure("Unable to parse number.");
 508   }
 509   if (!loose_float_number_conversion_ &&
 510       !MathLimits<double>::IsFinite(result->double_val)) {
 511     return ReportFailure("Number exceeds the range of double.");
 512   }
 513   result->type = NumberResult::DOUBLE;
 514   return util::Status();
 515 }
 516
 517 util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) {
 518   const char* data = p_.data();
 519   int length = p_.length();
 520
 521   // Look for the first non-numeric character, or the end of the string.
 522   int index = 0;
 523   bool floating = false;
 524   bool negative = data[index] == '-';
 525   // Find the first character that cannot be part of the number. Along the way
 526   // detect if the number needs to be parsed as a double.
 527   // Note that this restricts numbers to the JSON specification, so for example
 528   // we do not support hex or octal notations.
 529   for (; index < length; ++index) {
 530     char c = data[index];
 531     if (isdigit(c)) continue;
 532     if (c == '.' || c == 'e' || c == 'E') {
 533       floating = true;
 534       continue;
 535     }
 536     if (c == '+' || c == '-' || c == 'x') continue;
 537     // Not a valid number character, break out.
 538     break;
 539   }
 540
 541   // If the entire input is a valid number, and we may have more content in the
 542   // future, we abort for now and resume when we know more.
 543   if (index == length && !finishing_) {
 544     return util::Status(util::error::CANCELLED, "");
 545   }
 546
 547   // Create a string containing just the number, so we can use safe_strtoX
 548   std::string number = std::string(p_.substr(0, index));
 549
 550   // Floating point number, parse as a double.
 551   if (floating) {
 552     util::Status status = ParseDoubleHelper(number, result);
 553     if (status.ok()) {
 554       p_.remove_prefix(index);
 555     }
 556     return status;
 557   }
 558
 559   // Positive non-floating point number, parse as a uint64.
 560   if (!negative) {
 561     // Octal/Hex numbers are not valid JSON values.
 562     if (number.length() >= 2 && number[0] == '0') {
 563       return ReportFailure("Octal/hex numbers are not valid JSON values.");
 564     }
 565     if (safe_strtou64(number, &result->uint_val)) {
 566       result->type = NumberResult::UINT;
 567       p_.remove_prefix(index);
 568       return util::Status();
 569     } else {
 570       // If the value is too large, parse it as double.
 571       util::Status status = ParseDoubleHelper(number, result);
 572       if (status.ok()) {
 573         p_.remove_prefix(index);
 574       }
 575       return status;
 576     }
 577   }
 578
 579   // Octal/Hex numbers are not valid JSON values.
 580   if (number.length() >= 3 && number[1] == '0') {
 581     return ReportFailure("Octal/hex numbers are not valid JSON values.");
 582   }
 583   // Negative non-floating point number, parse as an int64.
 584   if (safe_strto64(number, &result->int_val)) {
 585     result->type = NumberResult::INT;
 586     p_.remove_prefix(index);
 587     return util::Status();
 588   } else {
 589     // If the value is too large, parse it as double.
 590     util::Status status = ParseDoubleHelper(number, result);
 591     if (status.ok()) {
 592       p_.remove_prefix(index);
 593     }
 594     return status;
 595   }
 596 }
 597
 598 util::Status JsonStreamParser::HandleBeginObject() {
 599   GOOGLE_DCHECK_EQ('{', *p_.data());
 600   Advance();
 601   ow_->StartObject(key_);
 602   auto status = IncrementRecursionDepth(key_);
 603   if (!status.ok()) {
 604     return status;
 605   }
 606   key_ = StringPiece();
 607   stack_.push(ENTRY);
 608   return util::Status();
 609 }
 610
 611 util::Status JsonStreamParser::ParseObjectMid(TokenType type) {
 612   if (type == UNKNOWN) {
 613     return ReportUnknown("Expected , or } after key:value pair.");
 614   }
 615
 616   // Object is complete, advance past the comma and render the EndObject.
 617   if (type == END_OBJECT) {
 618     Advance();
 619     ow_->EndObject();
 620     --recursion_depth_;
 621     return util::Status();
 622   }
 623   // Found a comma, advance past it and get ready for an entry.
 624   if (type == VALUE_SEPARATOR) {
 625     Advance();
 626     stack_.push(ENTRY);
 627     return util::Status();
 628   }
 629   // Illegal token after key:value pair.
 630   return ReportFailure("Expected , or } after key:value pair.");
 631 }
 632
 633 util::Status JsonStreamParser::ParseEntry(TokenType type) {
 634   if (type == UNKNOWN) {
 635     return ReportUnknown("Expected an object key or }.");
 636   }
 637
 638   // Close the object and return. This allows for trailing commas.
 639   if (type == END_OBJECT) {
 640     ow_->EndObject();
 641     Advance();
 642     --recursion_depth_;
 643     return util::Status();
 644   }
 645
 646   util::Status result;
 647   if (type == BEGIN_STRING) {
 648     // Key is a string (standard JSON), parse it and store the string.
 649     result = ParseStringHelper();
 650     if (result.ok()) {
 651       key_storage_.clear();
 652       if (!parsed_storage_.empty()) {
 653         parsed_storage_.swap(key_storage_);
 654         key_ = StringPiece(key_storage_);
 655       } else {
 656         key_ = parsed_;
 657       }
 658       parsed_ = StringPiece();
 659     }
 660   } else if (type == BEGIN_KEY) {
 661     // Key is a bare key (back compat), create a StringPiece pointing to it.
 662     result = ParseKey();
 663   } else {
 664     // Unknown key type, report an error.
 665     result = ReportFailure("Expected an object key or }.");
 666   }
 667   // On success we next expect an entry mid ':' then an object mid ',' or '}'
 668   if (result.ok()) {
 669     stack_.push(OBJ_MID);
 670     stack_.push(ENTRY_MID);
 671   }
 672   return result;
 673 }
 674
 675 util::Status JsonStreamParser::ParseEntryMid(TokenType type) {
 676   if (type == UNKNOWN) {
 677     return ReportUnknown("Expected : between key:value pair.");
 678   }
 679   if (type == ENTRY_SEPARATOR) {
 680     Advance();
 681     stack_.push(VALUE);
 682     return util::Status();
 683   }
 684   return ReportFailure("Expected : between key:value pair.");
 685 }
 686
 687 util::Status JsonStreamParser::HandleBeginArray() {
 688   GOOGLE_DCHECK_EQ('[', *p_.data());
 689   Advance();
 690   ow_->StartList(key_);
 691   key_ = StringPiece();
 692   stack_.push(ARRAY_VALUE);
 693   return util::Status();
 694 }
 695
 696 util::Status JsonStreamParser::ParseArrayValue(TokenType type) {
 697   if (type == UNKNOWN) {
 698     return ReportUnknown("Expected a value or ] within an array.");
 699   }
 700
 701   if (type == END_ARRAY) {
 702     ow_->EndList();
 703     Advance();
 704     return util::Status();
 705   }
 706
 707   // The ParseValue call may push something onto the stack so we need to make
 708   // sure an ARRAY_MID is after it, so we push it on now. Also, the parsing of
 709   // empty-null array value is relying on this ARRAY_MID token.
 710   stack_.push(ARRAY_MID);
 711   util::Status result = ParseValue(type);
 712   if (result == util::Status(util::error::CANCELLED, "")) {
 713     // If we were cancelled, pop back off the ARRAY_MID so we don't try to
 714     // push it on again when we try over.
 715     stack_.pop();
 716   }
 717   return result;
 718 }
 719
 720 util::Status JsonStreamParser::ParseArrayMid(TokenType type) {
 721   if (type == UNKNOWN) {
 722     return ReportUnknown("Expected , or ] after array value.");
 723   }
 724
 725   if (type == END_ARRAY) {
 726     ow_->EndList();
 727     Advance();
 728     return util::Status();
 729   }
 730
 731   // Found a comma, advance past it and expect an array value next.
 732   if (type == VALUE_SEPARATOR) {
 733     Advance();
 734     stack_.push(ARRAY_VALUE);
 735     return util::Status();
 736   }
 737   // Illegal token after array value.
 738   return ReportFailure("Expected , or ] after array value.");
 739 }
 740
 741 util::Status JsonStreamParser::ParseTrue() {
 742   ow_->RenderBool(key_, true);
 743   key_ = StringPiece();
 744   p_.remove_prefix(true_len);
 745   return util::Status();
 746 }
 747
 748 util::Status JsonStreamParser::ParseFalse() {
 749   ow_->RenderBool(key_, false);
 750   key_ = StringPiece();
 751   p_.remove_prefix(false_len);
 752   return util::Status();
 753 }
 754
 755 util::Status JsonStreamParser::ParseNull() {
 756   ow_->RenderNull(key_);
 757   key_ = StringPiece();
 758   p_.remove_prefix(null_len);
 759   return util::Status();
 760 }
 761
 762 util::Status JsonStreamParser::ParseEmptyNull() {
 763   ow_->RenderNull(key_);
 764   key_ = StringPiece();
 765   return util::Status();
 766 }
 767
 768 bool JsonStreamParser::IsEmptyNullAllowed(TokenType type) {
 769   if (stack_.empty()) return false;
 770   return (stack_.top() == ARRAY_MID && type == VALUE_SEPARATOR) ||
 771          stack_.top() == OBJ_MID;
 772 }
 773
 774 util::Status JsonStreamParser::ReportFailure(StringPiece message) {
 775   static const int kContextLength = 20;
 776   const char* p_start = p_.data();
 777   const char* json_start = json_.data();
 778   const char* begin = std::max(p_start - kContextLength, json_start);
 779   const char* end =
 780       std::min(p_start + kContextLength, json_start + json_.size());
 781   StringPiece segment(begin, end - begin);
 782   std::string location(p_start - begin, ' ');
 783   location.push_back('^');
 784   return util::Status(util::error::INVALID_ARGUMENT,
 785                       StrCat(message, "\n", segment, "\n", location));
 786 }
 787
 788 util::Status JsonStreamParser::ReportUnknown(StringPiece message) {
 789   // If we aren't finishing the parse, cancel parsing and try later.
 790   if (!finishing_) {
 791     return util::Status(util::error::CANCELLED, "");
 792   }
 793   if (p_.empty()) {
 794     return ReportFailure(StrCat("Unexpected end of string. ", message));
 795   }
 796   return ReportFailure(message);
 797 }
 798
 799 util::Status JsonStreamParser::IncrementRecursionDepth(
 800     StringPiece key) const {
 801   if (++recursion_depth_ > max_recursion_depth_) {
 802     return Status(
 803         util::error::INVALID_ARGUMENT,
 804         StrCat("Message too deep. Max recursion depth reached for key '",
 805                      key, "'"));
 806   }
 807   return util::Status();
 808 }
 809
 810 void JsonStreamParser::SkipWhitespace() {
 811   while (!p_.empty() && ascii_isspace(*p_.data())) {
 812     Advance();
 813   }
 814 }
 815
 816 void JsonStreamParser::Advance() {
 817   // Advance by moving one UTF8 character while making sure we don't go beyond
 818   // the length of StringPiece.
 819   p_.remove_prefix(std::min<int>(
 820       p_.length(), UTF8FirstLetterNumBytes(p_.data(), p_.length())));
 821 }
 822
 823 util::Status JsonStreamParser::ParseKey() {
 824   StringPiece original = p_;
 825   if (!ConsumeKey(&p_, &key_)) {
 826     return ReportFailure("Invalid key or variable name.");
 827   }
 828   // If we consumed everything but expect more data, reset p_ and cancel since
 829   // we can't know if the key was complete or not.
 830   if (!finishing_ && p_.empty()) {
 831     p_ = original;
 832     return util::Status(util::error::CANCELLED, "");
 833   }
 834   // Since we aren't using the key storage, clear it out.
 835   key_storage_.clear();
 836   return util::Status();
 837 }
 838
 839 JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() {
 840   SkipWhitespace();
 841
 842   int size = p_.size();
 843   if (size == 0) {
 844     // If we ran out of data, report unknown and we'll place the previous parse
 845     // type onto the stack and try again when we have more data.
 846     return UNKNOWN;
 847   }
 848   // TODO(sven): Split this method based on context since different contexts
 849   // support different tokens. Would slightly speed up processing?
 850   const char* data = p_.data();
 851   if (*data == '\"' || *data == '\'') return BEGIN_STRING;
 852   if (*data == '-' || ('0' <= *data && *data <= '9')) {
 853     return BEGIN_NUMBER;
 854   }
 855   if (size >= true_len && !strncmp(data, "true", true_len)) {
 856     return BEGIN_TRUE;
 857   }
 858   if (size >= false_len && !strncmp(data, "false", false_len)) {
 859     return BEGIN_FALSE;
 860   }
 861   if (size >= null_len && !strncmp(data, "null", null_len)) {
 862     return BEGIN_NULL;
 863   }
 864   if (*data == '{') return BEGIN_OBJECT;
 865   if (*data == '}') return END_OBJECT;
 866   if (*data == '[') return BEGIN_ARRAY;
 867   if (*data == ']') return END_ARRAY;
 868   if (*data == ':') return ENTRY_SEPARATOR;
 869   if (*data == ',') return VALUE_SEPARATOR;
 870   if (MatchKey(p_)) {
 871     return BEGIN_KEY;
 872   }
 873
 874   // We don't know that we necessarily have an invalid token here, just that we
 875   // can't parse what we have so far. So we don't report an error and just
 876   // return UNKNOWN so we can try again later when we have more data, or if we
 877   // finish and we have leftovers.
 878   return UNKNOWN;
 879 }
 880
 881 }  // namespace converter
 882 }  // namespace util
 883 }  // namespace protobuf
 884 }  // namespace google