deps/v8/src/scanner.cc

   1 // Copyright 2011 the V8 project authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // Features shared by parsing and pre-parsing scanners.
   6
   7 #include <stdint.h>
   8
   9 #include <cmath>
  10
  11 #include "src/v8.h"
  12
  13 #include "src/ast-value-factory.h"
  14 #include "src/char-predicates-inl.h"
  15 #include "src/conversions-inl.h"
  16 #include "src/list-inl.h"
  17 #include "src/parser.h"
  18 #include "src/scanner.h"
  19
  20 namespace v8 {
  21 namespace internal {
  22
  23
  24 Handle<String> LiteralBuffer::Internalize(Isolate* isolate) const {
  25   if (is_one_byte()) {
  26     return isolate->factory()->InternalizeOneByteString(one_byte_literal());
  27   }
  28   return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
  29 }
  30
  31
  32 // ----------------------------------------------------------------------------
  33 // Scanner
  34
// Constructs a scanner that consults |unicode_cache| for character
// classification. The recorded octal position starts out invalid and every
// harmony feature flag starts out disabled. No input is attached here; that
// happens in Initialize().
Scanner::Scanner(UnicodeCache* unicode_cache)
    : unicode_cache_(unicode_cache),
      octal_pos_(Location::invalid()),
      harmony_scoping_(false),
      harmony_modules_(false),
      harmony_numeric_literals_(false),
      harmony_classes_(false),
      harmony_templates_(false),
      harmony_unicode_(false) {}
  44
  45
// Attaches |source| as the character stream to scan and primes the scanner:
// leading whitespace is skipped and the first token is scanned, so it is
// available as the lookahead token when this function returns.
void Scanner::Initialize(Utf16CharacterStream* source) {
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
  // NOTE(review): Init() is defined outside this file; confirm that the
  // comment above still describes what it does.
  Init();
  // Skip initial whitespace allowing HTML comment ends just like
  // after a newline and scan first token.
  // Pretending that a line terminator precedes the input is what lets
  // SkipWhiteSpace() accept a "-->" at the very start of the source.
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  Scan();
}
  57
  58
  59 template <bool capture_raw>
  60 uc32 Scanner::ScanHexNumber(int expected_length) {
  61   DCHECK(expected_length <= 4);  // prevent overflow
  62
  63   uc32 x = 0;
  64   for (int i = 0; i < expected_length; i++) {
  65     int d = HexValue(c0_);
  66     if (d < 0) {
  67       return -1;
  68     }
  69     x = x * 16 + d;
  70     Advance<capture_raw>();
  71   }
  72
  73   return x;
  74 }
  75
  76
  77 template <bool capture_raw>
  78 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value) {
  79   uc32 x = 0;
  80   int d = HexValue(c0_);
  81   if (d < 0) {
  82     return -1;
  83   }
  84   while (d >= 0) {
  85     x = x * 16 + d;
  86     if (x > max_value) return -1;
  87     Advance<capture_raw>();
  88     d = HexValue(c0_);
  89   }
  90   return x;
  91 }
  92
  93
  94 // Ensure that tokens can be stored in a byte.
  95 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
  96
  97 // Table of one-character tokens, by character (0x00..0x7f only).
  98 static const byte one_char_tokens[] = {
  99   Token::ILLEGAL,
 100   Token::ILLEGAL,
 101   Token::ILLEGAL,
 102   Token::ILLEGAL,
 103   Token::ILLEGAL,
 104   Token::ILLEGAL,
 105   Token::ILLEGAL,
 106   Token::ILLEGAL,
 107   Token::ILLEGAL,
 108   Token::ILLEGAL,
 109   Token::ILLEGAL,
 110   Token::ILLEGAL,
 111   Token::ILLEGAL,
 112   Token::ILLEGAL,
 113   Token::ILLEGAL,
 114   Token::ILLEGAL,
 115   Token::ILLEGAL,
 116   Token::ILLEGAL,
 117   Token::ILLEGAL,
 118   Token::ILLEGAL,
 119   Token::ILLEGAL,
 120   Token::ILLEGAL,
 121   Token::ILLEGAL,
 122   Token::ILLEGAL,
 123   Token::ILLEGAL,
 124   Token::ILLEGAL,
 125   Token::ILLEGAL,
 126   Token::ILLEGAL,
 127   Token::ILLEGAL,
 128   Token::ILLEGAL,
 129   Token::ILLEGAL,
 130   Token::ILLEGAL,
 131   Token::ILLEGAL,
 132   Token::ILLEGAL,
 133   Token::ILLEGAL,
 134   Token::ILLEGAL,
 135   Token::ILLEGAL,
 136   Token::ILLEGAL,
 137   Token::ILLEGAL,
 138   Token::ILLEGAL,
 139   Token::LPAREN,       // 0x28
 140   Token::RPAREN,       // 0x29
 141   Token::ILLEGAL,
 142   Token::ILLEGAL,
 143   Token::COMMA,        // 0x2c
 144   Token::ILLEGAL,
 145   Token::ILLEGAL,
 146   Token::ILLEGAL,
 147   Token::ILLEGAL,
 148   Token::ILLEGAL,
 149   Token::ILLEGAL,
 150   Token::ILLEGAL,
 151   Token::ILLEGAL,
 152   Token::ILLEGAL,
 153   Token::ILLEGAL,
 154   Token::ILLEGAL,
 155   Token::ILLEGAL,
 156   Token::ILLEGAL,
 157   Token::COLON,        // 0x3a
 158   Token::SEMICOLON,    // 0x3b
 159   Token::ILLEGAL,
 160   Token::ILLEGAL,
 161   Token::ILLEGAL,
 162   Token::CONDITIONAL,  // 0x3f
 163   Token::ILLEGAL,
 164   Token::ILLEGAL,
 165   Token::ILLEGAL,
 166   Token::ILLEGAL,
 167   Token::ILLEGAL,
 168   Token::ILLEGAL,
 169   Token::ILLEGAL,
 170   Token::ILLEGAL,
 171   Token::ILLEGAL,
 172   Token::ILLEGAL,
 173   Token::ILLEGAL,
 174   Token::ILLEGAL,
 175   Token::ILLEGAL,
 176   Token::ILLEGAL,
 177   Token::ILLEGAL,
 178   Token::ILLEGAL,
 179   Token::ILLEGAL,
 180   Token::ILLEGAL,
 181   Token::ILLEGAL,
 182   Token::ILLEGAL,
 183   Token::ILLEGAL,
 184   Token::ILLEGAL,
 185   Token::ILLEGAL,
 186   Token::ILLEGAL,
 187   Token::ILLEGAL,
 188   Token::ILLEGAL,
 189   Token::ILLEGAL,
 190   Token::LBRACK,     // 0x5b
 191   Token::ILLEGAL,
 192   Token::RBRACK,     // 0x5d
 193   Token::ILLEGAL,
 194   Token::ILLEGAL,
 195   Token::ILLEGAL,
 196   Token::ILLEGAL,
 197   Token::ILLEGAL,
 198   Token::ILLEGAL,
 199   Token::ILLEGAL,
 200   Token::ILLEGAL,
 201   Token::ILLEGAL,
 202   Token::ILLEGAL,
 203   Token::ILLEGAL,
 204   Token::ILLEGAL,
 205   Token::ILLEGAL,
 206   Token::ILLEGAL,
 207   Token::ILLEGAL,
 208   Token::ILLEGAL,
 209   Token::ILLEGAL,
 210   Token::ILLEGAL,
 211   Token::ILLEGAL,
 212   Token::ILLEGAL,
 213   Token::ILLEGAL,
 214   Token::ILLEGAL,
 215   Token::ILLEGAL,
 216   Token::ILLEGAL,
 217   Token::ILLEGAL,
 218   Token::ILLEGAL,
 219   Token::ILLEGAL,
 220   Token::ILLEGAL,
 221   Token::ILLEGAL,
 222   Token::LBRACE,       // 0x7b
 223   Token::ILLEGAL,
 224   Token::RBRACE,       // 0x7d
 225   Token::BIT_NOT,      // 0x7e
 226   Token::ILLEGAL
 227 };
 228
 229
 230 Token::Value Scanner::Next() {
 231   current_ = next_;
 232   has_line_terminator_before_next_ = false;
 233   has_multiline_comment_before_next_ = false;
 234   if (static_cast<unsigned>(c0_) <= 0x7f) {
 235     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
 236     if (token != Token::ILLEGAL) {
 237       int pos = source_pos();
 238       next_.token = token;
 239       next_.location.beg_pos = pos;
 240       next_.location.end_pos = pos + 1;
 241       Advance();
 242       return current_.token;
 243     }
 244   }
 245   Scan();
 246   return current_.token;
 247 }
 248
 249
 250 // TODO(yangguo): check whether this is actually necessary.
 251 static inline bool IsLittleEndianByteOrderMark(uc32 c) {
 252   // The Unicode value U+FFFE is guaranteed never to be assigned as a
 253   // Unicode character; this implies that in a Unicode context the
 254   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
 255   // character expressed in little-endian byte order (since it could
 256   // not be a U+FFFE character expressed in big-endian byte
 257   // order). Nevertheless, we check for it to be compatible with
 258   // Spidermonkey.
 259   return c == 0xFFFE;
 260 }
 261
 262
 263 bool Scanner::SkipWhiteSpace() {
 264   int start_position = source_pos();
 265
 266   while (true) {
 267     while (true) {
 268       // The unicode cache accepts unsigned inputs.
 269       if (c0_ < 0) break;
 270       // Advance as long as character is a WhiteSpace or LineTerminator.
 271       // Remember if the latter is the case.
 272       if (unicode_cache_->IsLineTerminator(c0_)) {
 273         has_line_terminator_before_next_ = true;
 274       } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
 275                  !IsLittleEndianByteOrderMark(c0_)) {
 276         break;
 277       }
 278       Advance();
 279     }
 280
 281     // If there is an HTML comment end '-->' at the beginning of a
 282     // line (with only whitespace in front of it), we treat the rest
 283     // of the line as a comment. This is in line with the way
 284     // SpiderMonkey handles it.
 285     if (c0_ == '-' && has_line_terminator_before_next_) {
 286       Advance();
 287       if (c0_ == '-') {
 288         Advance();
 289         if (c0_ == '>') {
 290           // Treat the rest of the line as a comment.
 291           SkipSingleLineComment();
 292           // Continue skipping white space after the comment.
 293           continue;
 294         }
 295         PushBack('-');  // undo Advance()
 296       }
 297       PushBack('-');  // undo Advance()
 298     }
 299     // Return whether or not we skipped any characters.
 300     return source_pos() != start_position;
 301   }
 302 }
 303
 304
 305 Token::Value Scanner::SkipSingleLineComment() {
 306   Advance();
 307
 308   // The line terminator at the end of the line is not considered
 309   // to be part of the single-line comment; it is recognized
 310   // separately by the lexical grammar and becomes part of the
 311   // stream of input elements for the syntactic grammar (see
 312   // ECMA-262, section 7.4).
 313   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
 314     Advance();
 315   }
 316
 317   return Token::WHITESPACE;
 318 }
 319
 320
 321 Token::Value Scanner::SkipSourceURLComment() {
 322   TryToParseSourceURLComment();
 323   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
 324     Advance();
 325   }
 326
 327   return Token::WHITESPACE;
 328 }
 329
 330
 331 void Scanner::TryToParseSourceURLComment() {
 332   // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
 333   // function will just return if it cannot parse a magic comment.
 334   if (c0_ < 0 || !unicode_cache_->IsWhiteSpace(c0_)) return;
 335   Advance();
 336   LiteralBuffer name;
 337   while (c0_ >= 0 && !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) &&
 338          c0_ != '=') {
 339     name.AddChar(c0_);
 340     Advance();
 341   }
 342   if (!name.is_one_byte()) return;
 343   Vector<const uint8_t> name_literal = name.one_byte_literal();
 344   LiteralBuffer* value;
 345   if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
 346     value = &source_url_;
 347   } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
 348     value = &source_mapping_url_;
 349   } else {
 350     return;
 351   }
 352   if (c0_ != '=')
 353     return;
 354   Advance();
 355   value->Reset();
 356   while (c0_ >= 0 && unicode_cache_->IsWhiteSpace(c0_)) {
 357     Advance();
 358   }
 359   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
 360     // Disallowed characters.
 361     if (c0_ == '"' || c0_ == '\'') {
 362       value->Reset();
 363       return;
 364     }
 365     if (unicode_cache_->IsWhiteSpace(c0_)) {
 366       break;
 367     }
 368     value->AddChar(c0_);
 369     Advance();
 370   }
 371   // Allow whitespace at the end.
 372   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
 373     if (!unicode_cache_->IsWhiteSpace(c0_)) {
 374       value->Reset();
 375       break;
 376     }
 377     Advance();
 378   }
 379 }
 380
 381
 382 Token::Value Scanner::SkipMultiLineComment() {
 383   DCHECK(c0_ == '*');
 384   Advance();
 385
 386   while (c0_ >= 0) {
 387     uc32 ch = c0_;
 388     Advance();
 389     if (c0_ >= 0 && unicode_cache_->IsLineTerminator(ch)) {
 390       // Following ECMA-262, section 7.4, a comment containing
 391       // a newline will make the comment count as a line-terminator.
 392       has_multiline_comment_before_next_ = true;
 393     }
 394     // If we have reached the end of the multi-line comment, we
 395     // consume the '/' and insert a whitespace. This way all
 396     // multi-line comments are treated as whitespace.
 397     if (ch == '*' && c0_ == '/') {
 398       c0_ = ' ';
 399       return Token::WHITESPACE;
 400     }
 401   }
 402
 403   // Unterminated multi-line comment.
 404   return Token::ILLEGAL;
 405 }
 406
 407
 408 Token::Value Scanner::ScanHtmlComment() {
 409   // Check for <!-- comments.
 410   DCHECK(c0_ == '!');
 411   Advance();
 412   if (c0_ == '-') {
 413     Advance();
 414     if (c0_ == '-') return SkipSingleLineComment();
 415     PushBack('-');  // undo Advance()
 416   }
 417   PushBack('!');  // undo Advance()
 418   DCHECK(c0_ == '!');
 419   return Token::LT;
 420 }
 421
 422
// Scans the next token from the input and records it in next_: its kind and
// its begin/end source positions. Whitespace and comments are reported by
// the individual cases as Token::WHITESPACE; the loop keeps scanning until
// something else (a real token, Token::EOS or Token::ILLEGAL) is produced.
// The literal buffers of next_ are cleared before scanning starts.
void Scanner::Scan() {
  next_.literal_chars = NULL;
  next_.raw_literal_chars = NULL;
  Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();

    switch (c0_) {
      case ' ':
      case '\t':
        Advance();
        token = Token::WHITESPACE;
        break;

      case '\n':
        Advance();
        has_line_terminator_before_next_ = true;
        token = Token::WHITESPACE;
        break;

      case '"': case '\'':
        token = ScanString();
        break;

      case '<':
        // < <= << <<= <!--
        Advance();
        if (c0_ == '=') {
          token = Select(Token::LTE);
        } else if (c0_ == '<') {
          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
        } else if (c0_ == '!') {
          token = ScanHtmlComment();
        } else {
          token = Token::LT;
        }
        break;

      case '>':
        // > >= >> >>= >>> >>>=
        Advance();
        if (c0_ == '=') {
          token = Select(Token::GTE);
        } else if (c0_ == '>') {
          // >> >>= >>> >>>=
          Advance();
          if (c0_ == '=') {
            token = Select(Token::ASSIGN_SAR);
          } else if (c0_ == '>') {
            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
          } else {
            token = Token::SAR;
          }
        } else {
          token = Token::GT;
        }
        break;

      case '=':
        // = == === =>
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::EQ_STRICT, Token::EQ);
        } else if (c0_ == '>') {
          token = Select(Token::ARROW);
        } else {
          token = Token::ASSIGN;
        }
        break;

      case '!':
        // ! != !==
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::NE_STRICT, Token::NE);
        } else {
          token = Token::NOT;
        }
        break;

      case '+':
        // + ++ +=
        Advance();
        if (c0_ == '+') {
          token = Select(Token::INC);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_ADD);
        } else {
          token = Token::ADD;
        }
        break;

      case '-':
        // - -- --> -=
        Advance();
        if (c0_ == '-') {
          Advance();
          if (c0_ == '>' && has_line_terminator_before_next_) {
            // For compatibility with SpiderMonkey, we skip lines that
            // start with an HTML comment end '-->'.
            token = SkipSingleLineComment();
          } else {
            token = Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_SUB);
        } else {
          token = Token::SUB;
        }
        break;

      case '*':
        // * *=
        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
        break;

      case '%':
        // % %=
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
        break;

      case '/':
        // /  // /* /=
        Advance();
        if (c0_ == '/') {
          Advance();
          if (c0_ == '@' || c0_ == '#') {
            // Possibly a magic comment (//# or //@).
            Advance();
            token = SkipSourceURLComment();
          } else {
            // SkipSingleLineComment() begins with an Advance(); push the
            // current character back so that it is not lost.
            PushBack(c0_);
            token = SkipSingleLineComment();
          }
        } else if (c0_ == '*') {
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_DIV);
        } else {
          token = Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(Token::AND);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_AND);
        } else {
          token = Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(Token::OR);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_OR);
        } else {
          token = Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
        break;

      case '.':
        // . Number
        Advance();
        if (IsDecimalDigit(c0_)) {
          token = ScanNumber(true);
        } else {
          token = Token::PERIOD;
          if (c0_ == '.') {
            // Two periods so far: either "..." or a lone period followed by
            // another one, in which case the lookahead is undone.
            Advance();
            if (c0_ == '.') {
              Advance();
              token = Token::ELLIPSIS;
            } else {
              PushBack('.');
            }
          }
        }
        break;

      case ':':
        token = Select(Token::COLON);
        break;

      case ';':
        token = Select(Token::SEMICOLON);
        break;

      case ',':
        token = Select(Token::COMMA);
        break;

      case '(':
        token = Select(Token::LPAREN);
        break;

      case ')':
        token = Select(Token::RPAREN);
        break;

      case '[':
        token = Select(Token::LBRACK);
        break;

      case ']':
        token = Select(Token::RBRACK);
        break;

      case '{':
        token = Select(Token::LBRACE);
        break;

      case '}':
        token = Select(Token::RBRACE);
        break;

      case '?':
        token = Select(Token::CONDITIONAL);
        break;

      case '~':
        token = Select(Token::BIT_NOT);
        break;

      case '`':
        if (HarmonyTemplates()) {
          token = ScanTemplateStart();
          break;
        }
        // Intentional fall through: without harmony templates a backtick is
        // handled by the default case like any other character.

      default:
        if (c0_ < 0) {
          token = Token::EOS;
        } else if (unicode_cache_->IsIdentifierStart(c0_)) {
          token = ScanIdentifierOrKeyword();
        } else if (IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else if (SkipWhiteSpace()) {
          token = Token::WHITESPACE;
        } else {
          token = Select(Token::ILLEGAL);
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
  next_.token = token;
}
 686
 687
 688 void Scanner::SeekForward(int pos) {
 689   // After this call, we will have the token at the given position as
 690   // the "next" token. The "current" token will be invalid.
 691   if (pos == next_.location.beg_pos) return;
 692   int current_pos = source_pos();
 693   DCHECK_EQ(next_.location.end_pos, current_pos);
 694   // Positions inside the lookahead token aren't supported.
 695   DCHECK(pos >= current_pos);
 696   if (pos != current_pos) {
 697     source_->SeekForward(pos - source_->pos());
 698     Advance();
 699     // This function is only called to seek to the location
 700     // of the end of a function (at the "}" token). It doesn't matter
 701     // whether there was a line terminator in the part we skip.
 702     has_line_terminator_before_next_ = false;
 703     has_multiline_comment_before_next_ = false;
 704   }
 705   Scan();
 706 }
 707
 708
 709 template <bool capture_raw, bool in_template_literal>
 710 bool Scanner::ScanEscape() {
 711   uc32 c = c0_;
 712   Advance<capture_raw>();
 713
 714   // Skip escaped newlines.
 715   if (!in_template_literal && c0_ >= 0 && unicode_cache_->IsLineTerminator(c)) {
 716     // Allow CR+LF newlines in multiline string literals.
 717     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
 718     // Allow LF+CR newlines in multiline string literals.
 719     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>();
 720     return true;
 721   }
 722
 723   switch (c) {
 724     case '\'':  // fall through
 725     case '"' :  // fall through
 726     case '\\': break;
 727     case 'b' : c = '\b'; break;
 728     case 'f' : c = '\f'; break;
 729     case 'n' : c = '\n'; break;
 730     case 'r' : c = '\r'; break;
 731     case 't' : c = '\t'; break;
 732     case 'u' : {
 733       c = ScanUnicodeEscape<capture_raw>();
 734       if (c < 0) return false;
 735       break;
 736     }
 737     case 'v':
 738       c = '\v';
 739       break;
 740     case 'x': {
 741       c = ScanHexNumber<capture_raw>(2);
 742       if (c < 0) return false;
 743       break;
 744     }
 745     case '0':  // Fall through.
 746     case '1':  // fall through
 747     case '2':  // fall through
 748     case '3':  // fall through
 749     case '4':  // fall through
 750     case '5':  // fall through
 751     case '6':  // fall through
 752     case '7':
 753       c = ScanOctalEscape<capture_raw>(c, 2);
 754       break;
 755   }
 756
 757   // According to ECMA-262, section 7.8.4, characters not covered by the
 758   // above cases should be illegal, but they are commonly handled as
 759   // non-escaped characters by JS VMs.
 760   AddLiteralChar(c);
 761   return true;
 762 }
 763
 764
 765 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
 766 // ECMA-262. Other JS VMs support them.
 767 template <bool capture_raw>
 768 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
 769   uc32 x = c - '0';
 770   int i = 0;
 771   for (; i < length; i++) {
 772     int d = c0_ - '0';
 773     if (d < 0 || d > 7) break;
 774     int nx = x * 8 + d;
 775     if (nx >= 256) break;
 776     x = nx;
 777     Advance<capture_raw>();
 778   }
 779   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
 780   // Remember the position of octal escape sequences so that an error
 781   // can be reported later (in strict mode).
 782   // We don't report the error immediately, because the octal escape can
 783   // occur before the "use strict" directive.
 784   if (c != '0' || i > 0) {
 785     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
 786   }
 787   return x;
 788 }
 789
 790
 791 Token::Value Scanner::ScanString() {
 792   uc32 quote = c0_;
 793   Advance();  // consume quote
 794
 795   LiteralScope literal(this);
 796   while (c0_ != quote && c0_ >= 0
 797          && !unicode_cache_->IsLineTerminator(c0_)) {
 798     uc32 c = c0_;
 799     Advance();
 800     if (c == '\\') {
 801       if (c0_ < 0 || !ScanEscape<false, false>()) return Token::ILLEGAL;
 802     } else {
 803       AddLiteralChar(c);
 804     }
 805   }
 806   if (c0_ != quote) return Token::ILLEGAL;
 807   literal.Complete();
 808
 809   Advance();  // consume quote
 810   return Token::STRING;
 811 }
 812
 813
// Scans the literal portion of a template literal, starting right after the
// opening '`' or after the '}' that closes a substitution. Both the cooked
// literal and the raw literal are collected. Returns Token::TEMPLATE_TAIL
// when a closing '`' is found, Token::TEMPLATE_SPAN when "${" is found (or
// when the input ends first) and Token::ILLEGAL for a malformed escape
// sequence. Except in the Token::ILLEGAL case, the token and its end
// position are also stored in next_.
Token::Value Scanner::ScanTemplateSpan() {
  // When scanning a TemplateSpan, we are looking for the following construct:
  // TEMPLATE_SPAN ::
  //     ` LiteralChars* ${
  //   | } LiteralChars* ${
  //
  // TEMPLATE_TAIL ::
  //     ` LiteralChars* `
  //   | } LiteralChars* `
  //
  // A TEMPLATE_SPAN should always be followed by an Expression, while a
  // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
  // followed by an Expression.

  Token::Value result = Token::TEMPLATE_SPAN;
  LiteralScope literal(this);
  StartRawLiteral();
  const bool capture_raw = true;
  const bool in_template_literal = true;

  while (true) {
    uc32 c = c0_;
    Advance<capture_raw>();
    if (c == '`') {
      result = Token::TEMPLATE_TAIL;
      // NOTE(review): presumably drops the closing '`' from the raw
      // literal; ReduceRawLiteralLength() is defined elsewhere.
      ReduceRawLiteralLength(1);
      break;
    } else if (c == '$' && c0_ == '{') {
      Advance<capture_raw>();  // Consume '{'
      // NOTE(review): presumably drops the "${" from the raw literal.
      ReduceRawLiteralLength(2);
      break;
    } else if (c == '\\') {
      if (c0_ > 0 && unicode_cache_->IsLineTerminator(c0_)) {
        // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
        // code unit sequence.
        // Nothing is added to the cooked literal here; only the raw literal
        // is adjusted, normalizing <CR> and <CR><LF> to <LF>.
        uc32 lastChar = c0_;
        Advance<capture_raw>();
        if (lastChar == '\r') {
          ReduceRawLiteralLength(1);  // Remove \r
          if (c0_ == '\n') {
            Advance<capture_raw>();  // Adds \n
          } else {
            AddRawLiteralChar('\n');
          }
        }
      } else if (!ScanEscape<capture_raw, in_template_literal>()) {
        return Token::ILLEGAL;
      }
    } else if (c < 0) {
      // Unterminated template literal
      // The result stays Token::TEMPLATE_SPAN in this case.
      PushBack(c);
      break;
    } else {
      // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
      // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
      // consisting of the CV 0x000A.
      if (c == '\r') {
        ReduceRawLiteralLength(1);  // Remove \r
        if (c0_ == '\n') {
          Advance<capture_raw>();  // Adds \n
        } else {
          AddRawLiteralChar('\n');
        }
        // The cooked literal receives a single '\n' as well.
        c = '\n';
      }
      AddLiteralChar(c);
    }
  }
  literal.Complete();
  next_.location.end_pos = source_pos();
  next_.token = result;
  return result;
}
 887
 888
 889 Token::Value Scanner::ScanTemplateStart() {
 890   DCHECK(c0_ == '`');
 891   next_.location.beg_pos = source_pos();
 892   Advance();  // Consume `
 893   return ScanTemplateSpan();
 894 }
 895
 896
 897 Token::Value Scanner::ScanTemplateContinuation() {
 898   DCHECK_EQ(next_.token, Token::RBRACE);
 899   next_.location.beg_pos = source_pos() - 1;  // We already consumed }
 900   return ScanTemplateSpan();
 901 }
 902
 903
 904 void Scanner::ScanDecimalDigits() {
 905   while (IsDecimalDigit(c0_))
 906     AddLiteralCharAdvance();
 907 }
 908
 909
// Scans a numeric literal and collects its characters as the current
// literal. |seen_period| is true when the caller has already consumed a
// leading '.'; the current character is then the first digit of the
// fraction. Otherwise the current character is the first digit of the
// number. Returns Token::NUMBER on success and Token::ILLEGAL for a
// malformed literal. For an implicit octal literal (leading '0' followed by
// octal digits only) its location is recorded in octal_pos_.
Token::Value Scanner::ScanNumber(bool seen_period) {
  DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction

  enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;

  LiteralScope literal(this);
  if (seen_period) {
    // we have already seen a decimal point of the float
    AddLiteralChar('.');
    ScanDecimalDigits();  // we know we have at least one digit

  } else {
    // if the first character is '0' we must check for octals and hex
    if (c0_ == '0') {
      int start_pos = source_pos();  // For reporting octal positions.
      AddLiteralCharAdvance();

      // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
      // an octal number.
      if (c0_ == 'x' || c0_ == 'X') {
        // hex number
        kind = HEX;
        AddLiteralCharAdvance();
        if (!IsHexDigit(c0_)) {
          // we must have at least one hex digit after 'x'/'X'
          return Token::ILLEGAL;
        }
        while (IsHexDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) {
        // explicit octal number (0o / 0O), only with harmony numeric literals
        kind = OCTAL;
        AddLiteralCharAdvance();
        if (!IsOctalDigit(c0_)) {
          // we must have at least one octal digit after 'o'/'O'
          return Token::ILLEGAL;
        }
        while (IsOctalDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) {
        // binary number (0b / 0B), only with harmony numeric literals
        kind = BINARY;
        AddLiteralCharAdvance();
        if (!IsBinaryDigit(c0_)) {
          // we must have at least one binary digit after 'b'/'B'
          return Token::ILLEGAL;
        }
        while (IsBinaryDigit(c0_)) {
          AddLiteralCharAdvance();
        }
      } else if ('0' <= c0_ && c0_ <= '7') {
        // (possible) octal number
        kind = IMPLICIT_OCTAL;
        while (true) {
          if (c0_ == '8' || c0_ == '9') {
            // An 8 or 9 means this is a decimal number after all; the
            // remaining digits are scanned by the DECIMAL branch below.
            kind = DECIMAL;
            break;
          }
          if (c0_  < '0' || '7'  < c0_) {
            // Octal literal finished.
            octal_pos_ = Location(start_pos, source_pos());
            break;
          }
          AddLiteralCharAdvance();
        }
      }
    }

    // Parse decimal digits and allow trailing fractional part.
    if (kind == DECIMAL) {
      ScanDecimalDigits();  // optional
      if (c0_ == '.') {
        AddLiteralCharAdvance();
        ScanDecimalDigits();  // optional
      }
    }
  }

  // scan exponent, if any
  if (c0_ == 'e' || c0_ == 'E') {
    DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
    // An exponent is only valid on a decimal literal.
    if (kind != DECIMAL) return Token::ILLEGAL;
    // scan exponent
    AddLiteralCharAdvance();
    if (c0_ == '+' || c0_ == '-')
      AddLiteralCharAdvance();
    if (!IsDecimalDigit(c0_)) {
      // we must have at least one decimal digit after 'e'/'E'
      return Token::ILLEGAL;
    }
    ScanDecimalDigits();
  }

  // The source character immediately following a numeric literal must
  // not be an identifier start or a decimal digit; see ECMA-262
  // section 7.8.3, page 17 (note that we read only one decimal digit
  // if the value is 0).
  if (IsDecimalDigit(c0_) ||
      (c0_ >= 0 && unicode_cache_->IsIdentifierStart(c0_)))
    return Token::ILLEGAL;

  literal.Complete();

  return Token::NUMBER;
}
1015
1016
1017 uc32 Scanner::ScanIdentifierUnicodeEscape() {
1018   Advance();
1019   if (c0_ != 'u') return -1;
1020   Advance();
1021   return ScanUnicodeEscape<false>();
1022 }
1023
1024
1025 template <bool capture_raw>
1026 uc32 Scanner::ScanUnicodeEscape() {
1027   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1028   // allowed). In the latter case, the number of hex digits between { } is
1029   // arbitrary. \ and u have already been read.
1030   if (c0_ == '{' && HarmonyUnicode()) {
1031     Advance<capture_raw>();
1032     uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff);
1033     if (cp < 0) {
1034       return -1;
1035     }
1036     if (c0_ != '}') {
1037       return -1;
1038     }
1039     Advance<capture_raw>();
1040     return cp;
1041   }
1042   return ScanHexNumber<capture_raw>(4);
1043 }
1044
1045
1046 // ----------------------------------------------------------------------------
1047 // Keyword Matcher
1048
// Table of all keyword spellings, grouped by their first character.
//
// The table is an X-macro: the user supplies two macros.
//   KEYWORD_GROUP(ch)       - invoked once before the entries whose spelling
//                             starts with the character |ch|.
//   KEYWORD(spelling, tok)  - invoked once per keyword with its string literal
//                             and the token expression it maps to.
//
// Some token expressions refer to the names harmony_classes, harmony_modules
// and harmony_scoping; these must be in scope wherever the table is expanded.
// KeywordOrIdentifierToken statically asserts that every spelling is between
// kMinLength and kMaxLength characters long, so new entries must respect
// those bounds (or the bounds must be adjusted).
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                                     \
  KEYWORD_GROUP('b')                                                         \
  KEYWORD("break", Token::BREAK)                                             \
  KEYWORD_GROUP('c')                                                         \
  KEYWORD("case", Token::CASE)                                               \
  KEYWORD("catch", Token::CATCH)                                             \
  KEYWORD("class",                                                           \
          harmony_classes ? Token::CLASS : Token::FUTURE_RESERVED_WORD)      \
  KEYWORD("const", Token::CONST)                                             \
  KEYWORD("continue", Token::CONTINUE)                                       \
  KEYWORD_GROUP('d')                                                         \
  KEYWORD("debugger", Token::DEBUGGER)                                       \
  KEYWORD("default", Token::DEFAULT)                                         \
  KEYWORD("delete", Token::DELETE)                                           \
  KEYWORD("do", Token::DO)                                                   \
  KEYWORD_GROUP('e')                                                         \
  KEYWORD("else", Token::ELSE)                                               \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                               \
  KEYWORD("export",                                                          \
          harmony_modules ? Token::EXPORT : Token::FUTURE_RESERVED_WORD)     \
  KEYWORD("extends",                                                         \
          harmony_classes ? Token::EXTENDS : Token::FUTURE_RESERVED_WORD)    \
  KEYWORD_GROUP('f')                                                         \
  KEYWORD("false", Token::FALSE_LITERAL)                                     \
  KEYWORD("finally", Token::FINALLY)                                         \
  KEYWORD("for", Token::FOR)                                                 \
  KEYWORD("function", Token::FUNCTION)                                       \
  KEYWORD_GROUP('i')                                                         \
  KEYWORD("if", Token::IF)                                                   \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)                  \
  KEYWORD("import",                                                          \
          harmony_modules ? Token::IMPORT : Token::FUTURE_RESERVED_WORD)     \
  KEYWORD("in", Token::IN)                                                   \
  KEYWORD("instanceof", Token::INSTANCEOF)                                   \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)                   \
  KEYWORD_GROUP('l')                                                         \
  KEYWORD("let",                                                             \
          harmony_scoping ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('n')                                                         \
  KEYWORD("new", Token::NEW)                                                 \
  KEYWORD("null", Token::NULL_LITERAL)                                       \
  KEYWORD_GROUP('p')                                                         \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)                     \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)                     \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)                   \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)                      \
  KEYWORD_GROUP('r')                                                         \
  KEYWORD("return", Token::RETURN)                                           \
  KEYWORD_GROUP('s')                                                         \
  KEYWORD("static", harmony_classes ? Token::STATIC                          \
                                    : Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("super",                                                           \
          harmony_classes ? Token::SUPER : Token::FUTURE_RESERVED_WORD)      \
  KEYWORD("switch", Token::SWITCH)                                           \
  KEYWORD_GROUP('t')                                                         \
  KEYWORD("this", Token::THIS)                                               \
  KEYWORD("throw", Token::THROW)                                             \
  KEYWORD("true", Token::TRUE_LITERAL)                                       \
  KEYWORD("try", Token::TRY)                                                 \
  KEYWORD("typeof", Token::TYPEOF)                                           \
  KEYWORD_GROUP('v')                                                         \
  KEYWORD("var", Token::VAR)                                                 \
  KEYWORD("void", Token::VOID)                                               \
  KEYWORD_GROUP('w')                                                         \
  KEYWORD("while", Token::WHILE)                                             \
  KEYWORD("with", Token::WITH)                                               \
  KEYWORD_GROUP('y')                                                         \
  KEYWORD("yield", Token::YIELD)
1117
1118
1119 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
1120                                              int input_length,
1121                                              bool harmony_scoping,
1122                                              bool harmony_modules,
1123                                              bool harmony_classes) {
1124   DCHECK(input_length >= 1);
1125   const int kMinLength = 2;
1126   const int kMaxLength = 10;
1127   if (input_length < kMinLength || input_length > kMaxLength) {
1128     return Token::IDENTIFIER;
1129   }
1130   switch (input[0]) {
1131     default:
1132 #define KEYWORD_GROUP_CASE(ch)                                \
1133       break;                                                  \
1134     case ch:
1135 #define KEYWORD(keyword, token)                               \
1136     {                                                         \
1137       /* 'keyword' is a char array, so sizeof(keyword) is */  \
1138       /* strlen(keyword) plus 1 for the NUL char. */          \
1139       const int keyword_length = sizeof(keyword) - 1;         \
1140       STATIC_ASSERT(keyword_length >= kMinLength);            \
1141       STATIC_ASSERT(keyword_length <= kMaxLength);            \
1142       if (input_length == keyword_length &&                   \
1143           input[1] == keyword[1] &&                           \
1144           (keyword_length <= 2 || input[2] == keyword[2]) &&  \
1145           (keyword_length <= 3 || input[3] == keyword[3]) &&  \
1146           (keyword_length <= 4 || input[4] == keyword[4]) &&  \
1147           (keyword_length <= 5 || input[5] == keyword[5]) &&  \
1148           (keyword_length <= 6 || input[6] == keyword[6]) &&  \
1149           (keyword_length <= 7 || input[7] == keyword[7]) &&  \
1150           (keyword_length <= 8 || input[8] == keyword[8]) &&  \
1151           (keyword_length <= 9 || input[9] == keyword[9])) {  \
1152         return token;                                         \
1153       }                                                       \
1154     }
1155     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
1156   }
1157   return Token::IDENTIFIER;
1158 }
1159
1160
1161 bool Scanner::IdentifierIsFutureStrictReserved(
1162     const AstRawString* string) const {
1163   // Keywords are always 1-byte strings.
1164   if (!string->is_one_byte()) return false;
1165   if (string->IsOneByteEqualTo("let") || string->IsOneByteEqualTo("static") ||
1166       string->IsOneByteEqualTo("yield")) {
1167     return true;
1168   }
1169   return Token::FUTURE_STRICT_RESERVED_WORD ==
1170          KeywordOrIdentifierToken(string->raw_data(), string->length(),
1171                                   harmony_scoping_, harmony_modules_,
1172                                   harmony_classes_);
1173 }
1174
1175
1176 Token::Value Scanner::ScanIdentifierOrKeyword() {
1177   DCHECK(unicode_cache_->IsIdentifierStart(c0_));
1178   LiteralScope literal(this);
1179   // Scan identifier start character.
1180   if (c0_ == '\\') {
1181     uc32 c = ScanIdentifierUnicodeEscape();
1182     // Only allow legal identifier start characters.
1183     if (c < 0 ||
1184         c == '\\' ||  // No recursive escapes.
1185         !unicode_cache_->IsIdentifierStart(c)) {
1186       return Token::ILLEGAL;
1187     }
1188     AddLiteralChar(c);
1189     return ScanIdentifierSuffix(&literal);
1190   }
1191
1192   uc32 first_char = c0_;
1193   Advance();
1194   AddLiteralChar(first_char);
1195
1196   // Scan the rest of the identifier characters.
1197   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
1198     if (c0_ != '\\') {
1199       uc32 next_char = c0_;
1200       Advance();
1201       AddLiteralChar(next_char);
1202       continue;
1203     }
1204     // Fallthrough if no longer able to complete keyword.
1205     return ScanIdentifierSuffix(&literal);
1206   }
1207
1208   literal.Complete();
1209
1210   if (next_.literal_chars->is_one_byte()) {
1211     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1212     return KeywordOrIdentifierToken(chars.start(),
1213                                     chars.length(),
1214                                     harmony_scoping_,
1215                                     harmony_modules_,
1216                                     harmony_classes_);
1217   }
1218
1219   return Token::IDENTIFIER;
1220 }
1221
1222
1223 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
1224   // Scan the rest of the identifier characters.
1225   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
1226     if (c0_ == '\\') {
1227       uc32 c = ScanIdentifierUnicodeEscape();
1228       // Only allow legal identifier part characters.
1229       if (c < 0 ||
1230           c == '\\' ||
1231           !unicode_cache_->IsIdentifierPart(c)) {
1232         return Token::ILLEGAL;
1233       }
1234       AddLiteralChar(c);
1235     } else {
1236       AddLiteralChar(c0_);
1237       Advance();
1238     }
1239   }
1240   literal->Complete();
1241
1242   return Token::IDENTIFIER;
1243 }
1244
1245
1246 bool Scanner::ScanRegExpPattern(bool seen_equal) {
1247   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1248   bool in_character_class = false;
1249
1250   // Previous token is either '/' or '/=', in the second case, the
1251   // pattern starts at =.
1252   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1253   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1254
1255   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1256   // the scanner should pass uninterpreted bodies to the RegExp
1257   // constructor.
1258   LiteralScope literal(this);
1259   if (seen_equal) {
1260     AddLiteralChar('=');
1261   }
1262
1263   while (c0_ != '/' || in_character_class) {
1264     if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false;
1265     if (c0_ == '\\') {  // Escape sequence.
1266       AddLiteralCharAdvance();
1267       if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false;
1268       AddLiteralCharAdvance();
1269       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1270       // only "safe" characters are allowed (letters, digits, underscore),
1271       // otherwise the escape isn't valid and the invalid character has
1272       // its normal meaning. I.e., we can just continue scanning without
1273       // worrying whether the following characters are part of the escape
1274       // or not, since any '/', '\\' or '[' is guaranteed to not be part
1275       // of the escape sequence.
1276
1277       // TODO(896): At some point, parse RegExps more throughly to capture
1278       // octal esacpes in strict mode.
1279     } else {  // Unescaped character.
1280       if (c0_ == '[') in_character_class = true;
1281       if (c0_ == ']') in_character_class = false;
1282       AddLiteralCharAdvance();
1283     }
1284   }
1285   Advance();  // consume '/'
1286
1287   literal.Complete();
1288
1289   return true;
1290 }
1291
1292
1293 bool Scanner::ScanRegExpFlags() {
1294   // Scan regular expression flags.
1295   LiteralScope literal(this);
1296   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
1297     if (c0_ != '\\') {
1298       AddLiteralCharAdvance();
1299     } else {
1300       return false;
1301     }
1302   }
1303   literal.Complete();
1304
1305   next_.location.end_pos = source_pos() - 1;
1306   return true;
1307 }
1308
1309
1310 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
1311   if (is_literal_one_byte()) {
1312     return ast_value_factory->GetOneByteString(literal_one_byte_string());
1313   }
1314   return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1315 }
1316
1317
1318 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
1319   if (is_next_literal_one_byte()) {
1320     return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1321   }
1322   return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1323 }
1324
1325
1326 const AstRawString* Scanner::CurrentRawSymbol(
1327     AstValueFactory* ast_value_factory) {
1328   if (is_raw_literal_one_byte()) {
1329     return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1330   }
1331   return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1332 }
1333
1334
1335 double Scanner::DoubleValue() {
1336   DCHECK(is_literal_one_byte());
1337   return StringToDouble(
1338       unicode_cache_,
1339       literal_one_byte_string(),
1340       ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1341 }
1342
1343
1344 int Scanner::FindNumber(DuplicateFinder* finder, int value) {
1345   return finder->AddNumber(literal_one_byte_string(), value);
1346 }
1347
1348
1349 int Scanner::FindSymbol(DuplicateFinder* finder, int value) {
1350   if (is_literal_one_byte()) {
1351     return finder->AddOneByteSymbol(literal_one_byte_string(), value);
1352   }
1353   return finder->AddTwoByteSymbol(literal_two_byte_string(), value);
1354 }
1355
1356
1357 int DuplicateFinder::AddOneByteSymbol(Vector<const uint8_t> key, int value) {
1358   return AddSymbol(key, true, value);
1359 }
1360
1361
1362 int DuplicateFinder::AddTwoByteSymbol(Vector<const uint16_t> key, int value) {
1363   return AddSymbol(Vector<const uint8_t>::cast(key), false, value);
1364 }
1365
1366
1367 int DuplicateFinder::AddSymbol(Vector<const uint8_t> key,
1368                                bool is_one_byte,
1369                                int value) {
1370   uint32_t hash = Hash(key, is_one_byte);
1371   byte* encoding = BackupKey(key, is_one_byte);
1372   HashMap::Entry* entry = map_.Lookup(encoding, hash, true);
1373   int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value));
1374   entry->value =
1375     reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value));
1376   return old_value;
1377 }
1378
1379
1380 int DuplicateFinder::AddNumber(Vector<const uint8_t> key, int value) {
1381   DCHECK(key.length() > 0);
1382   // Quick check for already being in canonical form.
1383   if (IsNumberCanonical(key)) {
1384     return AddOneByteSymbol(key, value);
1385   }
1386
1387   int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY;
1388   double double_value = StringToDouble(
1389       unicode_constants_, key, flags, 0.0);
1390   int length;
1391   const char* string;
1392   if (!std::isfinite(double_value)) {
1393     string = "Infinity";
1394     length = 8;  // strlen("Infinity");
1395   } else {
1396     string = DoubleToCString(double_value,
1397                              Vector<char>(number_buffer_, kBufferSize));
1398     length = StrLength(string);
1399   }
1400   return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string),
1401                                       length), true, value);
1402 }
1403
1404
1405 bool DuplicateFinder::IsNumberCanonical(Vector<const uint8_t> number) {
1406   // Test for a safe approximation of number literals that are already
1407   // in canonical form: max 15 digits, no leading zeroes, except an
1408   // integer part that is a single zero, and no trailing zeros below
1409   // the decimal point.
1410   int pos = 0;
1411   int length = number.length();
1412   if (number.length() > 15) return false;
1413   if (number[pos] == '0') {
1414     pos++;
1415   } else {
1416     while (pos < length &&
1417            static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++;
1418   }
1419   if (length == pos) return true;
1420   if (number[pos] != '.') return false;
1421   pos++;
1422   bool invalid_last_digit = true;
1423   while (pos < length) {
1424     uint8_t digit = number[pos] - '0';
1425     if (digit > '9' - '0') return false;
1426     invalid_last_digit = (digit == 0);
1427     pos++;
1428   }
1429   return !invalid_last_digit;
1430 }
1431
1432
1433 uint32_t DuplicateFinder::Hash(Vector<const uint8_t> key, bool is_one_byte) {
1434   // Primitive hash function, almost identical to the one used
1435   // for strings (except that it's seeded by the length and representation).
1436   int length = key.length();
1437   uint32_t hash = (length << 1) | (is_one_byte ? 1 : 0) ;
1438   for (int i = 0; i < length; i++) {
1439     uint32_t c = key[i];
1440     hash = (hash + c) * 1025;
1441     hash ^= (hash >> 6);
1442   }
1443   return hash;
1444 }
1445
1446
1447 bool DuplicateFinder::Match(void* first, void* second) {
1448   // Decode lengths.
1449   // Length + representation is encoded as base 128, most significant heptet
1450   // first, with a 8th bit being non-zero while there are more heptets.
1451   // The value encodes the number of bytes following, and whether the original
1452   // was Latin1.
1453   byte* s1 = reinterpret_cast<byte*>(first);
1454   byte* s2 = reinterpret_cast<byte*>(second);
1455   uint32_t length_one_byte_field = 0;
1456   byte c1;
1457   do {
1458     c1 = *s1;
1459     if (c1 != *s2) return false;
1460     length_one_byte_field = (length_one_byte_field << 7) | (c1 & 0x7f);
1461     s1++;
1462     s2++;
1463   } while ((c1 & 0x80) != 0);
1464   int length = static_cast<int>(length_one_byte_field >> 1);
1465   return memcmp(s1, s2, length) == 0;
1466 }
1467
1468
1469 byte* DuplicateFinder::BackupKey(Vector<const uint8_t> bytes,
1470                                  bool is_one_byte) {
1471   uint32_t one_byte_length = (bytes.length() << 1) | (is_one_byte ? 1 : 0);
1472   backing_store_.StartSequence();
1473   // Emit one_byte_length as base-128 encoded number, with the 7th bit set
1474   // on the byte of every heptet except the last, least significant, one.
1475   if (one_byte_length >= (1 << 7)) {
1476     if (one_byte_length >= (1 << 14)) {
1477       if (one_byte_length >= (1 << 21)) {
1478         if (one_byte_length >= (1 << 28)) {
1479           backing_store_.Add(
1480               static_cast<uint8_t>((one_byte_length >> 28) | 0x80));
1481         }
1482         backing_store_.Add(
1483             static_cast<uint8_t>((one_byte_length >> 21) | 0x80u));
1484       }
1485       backing_store_.Add(
1486           static_cast<uint8_t>((one_byte_length >> 14) | 0x80u));
1487     }
1488     backing_store_.Add(static_cast<uint8_t>((one_byte_length >> 7) | 0x80u));
1489   }
1490   backing_store_.Add(static_cast<uint8_t>(one_byte_length & 0x7f));
1491
1492   backing_store_.AddBlock(bytes);
1493   return backing_store_.EndSequence().start();
1494 }
1495
1496 } }  // namespace v8::internal