deps/v8/src/scanner.cc

   1 // Copyright 2011 the V8 project authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // Features shared by parsing and pre-parsing scanners.
   6
   7 #include <stdint.h>
   8
   9 #include <cmath>
  10
  11 #include "src/v8.h"
  12
  13 #include "src/ast-value-factory.h"
  14 #include "src/char-predicates-inl.h"
  15 #include "src/conversions-inl.h"
  16 #include "src/list-inl.h"
  17 #include "src/parser.h"
  18 #include "src/scanner.h"
  19
  20 namespace v8 {
  21 namespace internal {
  22
  23
  24 Handle<String> LiteralBuffer::Internalize(Isolate* isolate) const {
  25   if (is_one_byte()) {
  26     return isolate->factory()->InternalizeOneByteString(one_byte_literal());
  27   }
  28   return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
  29 }
  30
  31
  32 // ----------------------------------------------------------------------------
  33 // Scanner
  34
  35 Scanner::Scanner(UnicodeCache* unicode_cache)
  36     : unicode_cache_(unicode_cache),
  37       octal_pos_(Location::invalid()),
  38       harmony_modules_(false),
  39       harmony_numeric_literals_(false),
  40       harmony_classes_(false),
  41       harmony_unicode_(false) {}
  42
  43
  44 void Scanner::Initialize(Utf16CharacterStream* source) {
  45   source_ = source;
  46   // Need to capture identifiers in order to recognize "get" and "set"
  47   // in object literals.
  48   Init();
  49   // Skip initial whitespace allowing HTML comment ends just like
  50   // after a newline and scan first token.
  51   has_line_terminator_before_next_ = true;
  52   SkipWhiteSpace();
  53   Scan();
  54 }
  55
  56
  57 template <bool capture_raw>
  58 uc32 Scanner::ScanHexNumber(int expected_length) {
  59   DCHECK(expected_length <= 4);  // prevent overflow
  60
  61   uc32 x = 0;
  62   for (int i = 0; i < expected_length; i++) {
  63     int d = HexValue(c0_);
  64     if (d < 0) {
  65       return -1;
  66     }
  67     x = x * 16 + d;
  68     Advance<capture_raw>();
  69   }
  70
  71   return x;
  72 }
  73
  74
  75 template <bool capture_raw>
  76 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value) {
  77   uc32 x = 0;
  78   int d = HexValue(c0_);
  79   if (d < 0) {
  80     return -1;
  81   }
  82   while (d >= 0) {
  83     x = x * 16 + d;
  84     if (x > max_value) return -1;
  85     Advance<capture_raw>();
  86     d = HexValue(c0_);
  87   }
  88   return x;
  89 }
  90
  91
  92 // Ensure that tokens can be stored in a byte.
  93 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
  94
  95 // Table of one-character tokens, by character (0x00..0x7f only).
  96 static const byte one_char_tokens[] = {
  97   Token::ILLEGAL,
  98   Token::ILLEGAL,
  99   Token::ILLEGAL,
 100   Token::ILLEGAL,
 101   Token::ILLEGAL,
 102   Token::ILLEGAL,
 103   Token::ILLEGAL,
 104   Token::ILLEGAL,
 105   Token::ILLEGAL,
 106   Token::ILLEGAL,
 107   Token::ILLEGAL,
 108   Token::ILLEGAL,
 109   Token::ILLEGAL,
 110   Token::ILLEGAL,
 111   Token::ILLEGAL,
 112   Token::ILLEGAL,
 113   Token::ILLEGAL,
 114   Token::ILLEGAL,
 115   Token::ILLEGAL,
 116   Token::ILLEGAL,
 117   Token::ILLEGAL,
 118   Token::ILLEGAL,
 119   Token::ILLEGAL,
 120   Token::ILLEGAL,
 121   Token::ILLEGAL,
 122   Token::ILLEGAL,
 123   Token::ILLEGAL,
 124   Token::ILLEGAL,
 125   Token::ILLEGAL,
 126   Token::ILLEGAL,
 127   Token::ILLEGAL,
 128   Token::ILLEGAL,
 129   Token::ILLEGAL,
 130   Token::ILLEGAL,
 131   Token::ILLEGAL,
 132   Token::ILLEGAL,
 133   Token::ILLEGAL,
 134   Token::ILLEGAL,
 135   Token::ILLEGAL,
 136   Token::ILLEGAL,
 137   Token::LPAREN,       // 0x28
 138   Token::RPAREN,       // 0x29
 139   Token::ILLEGAL,
 140   Token::ILLEGAL,
 141   Token::COMMA,        // 0x2c
 142   Token::ILLEGAL,
 143   Token::ILLEGAL,
 144   Token::ILLEGAL,
 145   Token::ILLEGAL,
 146   Token::ILLEGAL,
 147   Token::ILLEGAL,
 148   Token::ILLEGAL,
 149   Token::ILLEGAL,
 150   Token::ILLEGAL,
 151   Token::ILLEGAL,
 152   Token::ILLEGAL,
 153   Token::ILLEGAL,
 154   Token::ILLEGAL,
 155   Token::COLON,        // 0x3a
 156   Token::SEMICOLON,    // 0x3b
 157   Token::ILLEGAL,
 158   Token::ILLEGAL,
 159   Token::ILLEGAL,
 160   Token::CONDITIONAL,  // 0x3f
 161   Token::ILLEGAL,
 162   Token::ILLEGAL,
 163   Token::ILLEGAL,
 164   Token::ILLEGAL,
 165   Token::ILLEGAL,
 166   Token::ILLEGAL,
 167   Token::ILLEGAL,
 168   Token::ILLEGAL,
 169   Token::ILLEGAL,
 170   Token::ILLEGAL,
 171   Token::ILLEGAL,
 172   Token::ILLEGAL,
 173   Token::ILLEGAL,
 174   Token::ILLEGAL,
 175   Token::ILLEGAL,
 176   Token::ILLEGAL,
 177   Token::ILLEGAL,
 178   Token::ILLEGAL,
 179   Token::ILLEGAL,
 180   Token::ILLEGAL,
 181   Token::ILLEGAL,
 182   Token::ILLEGAL,
 183   Token::ILLEGAL,
 184   Token::ILLEGAL,
 185   Token::ILLEGAL,
 186   Token::ILLEGAL,
 187   Token::ILLEGAL,
 188   Token::LBRACK,     // 0x5b
 189   Token::ILLEGAL,
 190   Token::RBRACK,     // 0x5d
 191   Token::ILLEGAL,
 192   Token::ILLEGAL,
 193   Token::ILLEGAL,
 194   Token::ILLEGAL,
 195   Token::ILLEGAL,
 196   Token::ILLEGAL,
 197   Token::ILLEGAL,
 198   Token::ILLEGAL,
 199   Token::ILLEGAL,
 200   Token::ILLEGAL,
 201   Token::ILLEGAL,
 202   Token::ILLEGAL,
 203   Token::ILLEGAL,
 204   Token::ILLEGAL,
 205   Token::ILLEGAL,
 206   Token::ILLEGAL,
 207   Token::ILLEGAL,
 208   Token::ILLEGAL,
 209   Token::ILLEGAL,
 210   Token::ILLEGAL,
 211   Token::ILLEGAL,
 212   Token::ILLEGAL,
 213   Token::ILLEGAL,
 214   Token::ILLEGAL,
 215   Token::ILLEGAL,
 216   Token::ILLEGAL,
 217   Token::ILLEGAL,
 218   Token::ILLEGAL,
 219   Token::ILLEGAL,
 220   Token::LBRACE,       // 0x7b
 221   Token::ILLEGAL,
 222   Token::RBRACE,       // 0x7d
 223   Token::BIT_NOT,      // 0x7e
 224   Token::ILLEGAL
 225 };
 226
 227
 228 Token::Value Scanner::Next() {
 229   current_ = next_;
 230   has_line_terminator_before_next_ = false;
 231   has_multiline_comment_before_next_ = false;
 232   if (static_cast<unsigned>(c0_) <= 0x7f) {
 233     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
 234     if (token != Token::ILLEGAL) {
 235       int pos = source_pos();
 236       next_.token = token;
 237       next_.location.beg_pos = pos;
 238       next_.location.end_pos = pos + 1;
 239       Advance();
 240       return current_.token;
 241     }
 242   }
 243   Scan();
 244   return current_.token;
 245 }
 246
 247
 248 // TODO(yangguo): check whether this is actually necessary.
 249 static inline bool IsLittleEndianByteOrderMark(uc32 c) {
 250   // The Unicode value U+FFFE is guaranteed never to be assigned as a
 251   // Unicode character; this implies that in a Unicode context the
 252   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
 253   // character expressed in little-endian byte order (since it could
 254   // not be a U+FFFE character expressed in big-endian byte
 255   // order). Nevertheless, we check for it to be compatible with
 256   // Spidermonkey.
 257   return c == 0xFFFE;
 258 }
 259
 260
 261 bool Scanner::SkipWhiteSpace() {
 262   int start_position = source_pos();
 263
 264   while (true) {
 265     while (true) {
 266       // The unicode cache accepts unsigned inputs.
 267       if (c0_ < 0) break;
 268       // Advance as long as character is a WhiteSpace or LineTerminator.
 269       // Remember if the latter is the case.
 270       if (unicode_cache_->IsLineTerminator(c0_)) {
 271         has_line_terminator_before_next_ = true;
 272       } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
 273                  !IsLittleEndianByteOrderMark(c0_)) {
 274         break;
 275       }
 276       Advance();
 277     }
 278
 279     // If there is an HTML comment end '-->' at the beginning of a
 280     // line (with only whitespace in front of it), we treat the rest
 281     // of the line as a comment. This is in line with the way
 282     // SpiderMonkey handles it.
 283     if (c0_ == '-' && has_line_terminator_before_next_) {
 284       Advance();
 285       if (c0_ == '-') {
 286         Advance();
 287         if (c0_ == '>') {
 288           // Treat the rest of the line as a comment.
 289           SkipSingleLineComment();
 290           // Continue skipping white space after the comment.
 291           continue;
 292         }
 293         PushBack('-');  // undo Advance()
 294       }
 295       PushBack('-');  // undo Advance()
 296     }
 297     // Return whether or not we skipped any characters.
 298     return source_pos() != start_position;
 299   }
 300 }
 301
 302
 303 Token::Value Scanner::SkipSingleLineComment() {
 304   Advance();
 305
 306   // The line terminator at the end of the line is not considered
 307   // to be part of the single-line comment; it is recognized
 308   // separately by the lexical grammar and becomes part of the
 309   // stream of input elements for the syntactic grammar (see
 310   // ECMA-262, section 7.4).
 311   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
 312     Advance();
 313   }
 314
 315   return Token::WHITESPACE;
 316 }
 317
 318
 319 Token::Value Scanner::SkipSourceURLComment() {
 320   TryToParseSourceURLComment();
 321   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
 322     Advance();
 323   }
 324
 325   return Token::WHITESPACE;
 326 }
 327
 328
 329 void Scanner::TryToParseSourceURLComment() {
 330   // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
 331   // function will just return if it cannot parse a magic comment.
 332   if (c0_ < 0 || !unicode_cache_->IsWhiteSpace(c0_)) return;
 333   Advance();
 334   LiteralBuffer name;
 335   while (c0_ >= 0 && !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) &&
 336          c0_ != '=') {
 337     name.AddChar(c0_);
 338     Advance();
 339   }
 340   if (!name.is_one_byte()) return;
 341   Vector<const uint8_t> name_literal = name.one_byte_literal();
 342   LiteralBuffer* value;
 343   if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
 344     value = &source_url_;
 345   } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
 346     value = &source_mapping_url_;
 347   } else {
 348     return;
 349   }
 350   if (c0_ != '=')
 351     return;
 352   Advance();
 353   value->Reset();
 354   while (c0_ >= 0 && unicode_cache_->IsWhiteSpace(c0_)) {
 355     Advance();
 356   }
 357   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
 358     // Disallowed characters.
 359     if (c0_ == '"' || c0_ == '\'') {
 360       value->Reset();
 361       return;
 362     }
 363     if (unicode_cache_->IsWhiteSpace(c0_)) {
 364       break;
 365     }
 366     value->AddChar(c0_);
 367     Advance();
 368   }
 369   // Allow whitespace at the end.
 370   while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
 371     if (!unicode_cache_->IsWhiteSpace(c0_)) {
 372       value->Reset();
 373       break;
 374     }
 375     Advance();
 376   }
 377 }
 378
 379
 380 Token::Value Scanner::SkipMultiLineComment() {
 381   DCHECK(c0_ == '*');
 382   Advance();
 383
 384   while (c0_ >= 0) {
 385     uc32 ch = c0_;
 386     Advance();
 387     if (c0_ >= 0 && unicode_cache_->IsLineTerminator(ch)) {
 388       // Following ECMA-262, section 7.4, a comment containing
 389       // a newline will make the comment count as a line-terminator.
 390       has_multiline_comment_before_next_ = true;
 391     }
 392     // If we have reached the end of the multi-line comment, we
 393     // consume the '/' and insert a whitespace. This way all
 394     // multi-line comments are treated as whitespace.
 395     if (ch == '*' && c0_ == '/') {
 396       c0_ = ' ';
 397       return Token::WHITESPACE;
 398     }
 399   }
 400
 401   // Unterminated multi-line comment.
 402   return Token::ILLEGAL;
 403 }
 404
 405
 406 Token::Value Scanner::ScanHtmlComment() {
 407   // Check for <!-- comments.
 408   DCHECK(c0_ == '!');
 409   Advance();
 410   if (c0_ == '-') {
 411     Advance();
 412     if (c0_ == '-') return SkipSingleLineComment();
 413     PushBack('-');  // undo Advance()
 414   }
 415   PushBack('!');  // undo Advance()
 416   DCHECK(c0_ == '!');
 417   return Token::LT;
 418 }
 419
 420
 421 void Scanner::Scan() {
 422   next_.literal_chars = NULL;
 423   next_.raw_literal_chars = NULL;
 424   Token::Value token;
 425   do {
 426     // Remember the position of the next token
 427     next_.location.beg_pos = source_pos();
 428
 429     switch (c0_) {
 430       case ' ':
 431       case '\t':
 432         Advance();
 433         token = Token::WHITESPACE;
 434         break;
 435
 436       case '\n':
 437         Advance();
 438         has_line_terminator_before_next_ = true;
 439         token = Token::WHITESPACE;
 440         break;
 441
 442       case '"': case '\'':
 443         token = ScanString();
 444         break;
 445
 446       case '<':
 447         // < <= << <<= <!--
 448         Advance();
 449         if (c0_ == '=') {
 450           token = Select(Token::LTE);
 451         } else if (c0_ == '<') {
 452           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
 453         } else if (c0_ == '!') {
 454           token = ScanHtmlComment();
 455         } else {
 456           token = Token::LT;
 457         }
 458         break;
 459
 460       case '>':
 461         // > >= >> >>= >>> >>>=
 462         Advance();
 463         if (c0_ == '=') {
 464           token = Select(Token::GTE);
 465         } else if (c0_ == '>') {
 466           // >> >>= >>> >>>=
 467           Advance();
 468           if (c0_ == '=') {
 469             token = Select(Token::ASSIGN_SAR);
 470           } else if (c0_ == '>') {
 471             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
 472           } else {
 473             token = Token::SAR;
 474           }
 475         } else {
 476           token = Token::GT;
 477         }
 478         break;
 479
 480       case '=':
 481         // = == === =>
 482         Advance();
 483         if (c0_ == '=') {
 484           token = Select('=', Token::EQ_STRICT, Token::EQ);
 485         } else if (c0_ == '>') {
 486           token = Select(Token::ARROW);
 487         } else {
 488           token = Token::ASSIGN;
 489         }
 490         break;
 491
 492       case '!':
 493         // ! != !==
 494         Advance();
 495         if (c0_ == '=') {
 496           token = Select('=', Token::NE_STRICT, Token::NE);
 497         } else {
 498           token = Token::NOT;
 499         }
 500         break;
 501
 502       case '+':
 503         // + ++ +=
 504         Advance();
 505         if (c0_ == '+') {
 506           token = Select(Token::INC);
 507         } else if (c0_ == '=') {
 508           token = Select(Token::ASSIGN_ADD);
 509         } else {
 510           token = Token::ADD;
 511         }
 512         break;
 513
 514       case '-':
 515         // - -- --> -=
 516         Advance();
 517         if (c0_ == '-') {
 518           Advance();
 519           if (c0_ == '>' && has_line_terminator_before_next_) {
 520             // For compatibility with SpiderMonkey, we skip lines that
 521             // start with an HTML comment end '-->'.
 522             token = SkipSingleLineComment();
 523           } else {
 524             token = Token::DEC;
 525           }
 526         } else if (c0_ == '=') {
 527           token = Select(Token::ASSIGN_SUB);
 528         } else {
 529           token = Token::SUB;
 530         }
 531         break;
 532
 533       case '*':
 534         // * *=
 535         token = Select('=', Token::ASSIGN_MUL, Token::MUL);
 536         break;
 537
 538       case '%':
 539         // % %=
 540         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
 541         break;
 542
 543       case '/':
 544         // /  // /* /=
 545         Advance();
 546         if (c0_ == '/') {
 547           Advance();
 548           if (c0_ == '@' || c0_ == '#') {
 549             Advance();
 550             token = SkipSourceURLComment();
 551           } else {
 552             PushBack(c0_);
 553             token = SkipSingleLineComment();
 554           }
 555         } else if (c0_ == '*') {
 556           token = SkipMultiLineComment();
 557         } else if (c0_ == '=') {
 558           token = Select(Token::ASSIGN_DIV);
 559         } else {
 560           token = Token::DIV;
 561         }
 562         break;
 563
 564       case '&':
 565         // & && &=
 566         Advance();
 567         if (c0_ == '&') {
 568           token = Select(Token::AND);
 569         } else if (c0_ == '=') {
 570           token = Select(Token::ASSIGN_BIT_AND);
 571         } else {
 572           token = Token::BIT_AND;
 573         }
 574         break;
 575
 576       case '|':
 577         // | || |=
 578         Advance();
 579         if (c0_ == '|') {
 580           token = Select(Token::OR);
 581         } else if (c0_ == '=') {
 582           token = Select(Token::ASSIGN_BIT_OR);
 583         } else {
 584           token = Token::BIT_OR;
 585         }
 586         break;
 587
 588       case '^':
 589         // ^ ^=
 590         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
 591         break;
 592
 593       case '.':
 594         // . Number
 595         Advance();
 596         if (IsDecimalDigit(c0_)) {
 597           token = ScanNumber(true);
 598         } else {
 599           token = Token::PERIOD;
 600           if (c0_ == '.') {
 601             Advance();
 602             if (c0_ == '.') {
 603               Advance();
 604               token = Token::ELLIPSIS;
 605             } else {
 606               PushBack('.');
 607             }
 608           }
 609         }
 610         break;
 611
 612       case ':':
 613         token = Select(Token::COLON);
 614         break;
 615
 616       case ';':
 617         token = Select(Token::SEMICOLON);
 618         break;
 619
 620       case ',':
 621         token = Select(Token::COMMA);
 622         break;
 623
 624       case '(':
 625         token = Select(Token::LPAREN);
 626         break;
 627
 628       case ')':
 629         token = Select(Token::RPAREN);
 630         break;
 631
 632       case '[':
 633         token = Select(Token::LBRACK);
 634         break;
 635
 636       case ']':
 637         token = Select(Token::RBRACK);
 638         break;
 639
 640       case '{':
 641         token = Select(Token::LBRACE);
 642         break;
 643
 644       case '}':
 645         token = Select(Token::RBRACE);
 646         break;
 647
 648       case '?':
 649         token = Select(Token::CONDITIONAL);
 650         break;
 651
 652       case '~':
 653         token = Select(Token::BIT_NOT);
 654         break;
 655
 656       case '`':
 657         token = ScanTemplateStart();
 658         break;
 659
 660       default:
 661         if (c0_ < 0) {
 662           token = Token::EOS;
 663         } else if (unicode_cache_->IsIdentifierStart(c0_)) {
 664           token = ScanIdentifierOrKeyword();
 665         } else if (IsDecimalDigit(c0_)) {
 666           token = ScanNumber(false);
 667         } else if (SkipWhiteSpace()) {
 668           token = Token::WHITESPACE;
 669         } else {
 670           token = Select(Token::ILLEGAL);
 671         }
 672         break;
 673     }
 674
 675     // Continue scanning for tokens as long as we're just skipping
 676     // whitespace.
 677   } while (token == Token::WHITESPACE);
 678
 679   next_.location.end_pos = source_pos();
 680   next_.token = token;
 681 }
 682
 683
 684 void Scanner::SeekForward(int pos) {
 685   // After this call, we will have the token at the given position as
 686   // the "next" token. The "current" token will be invalid.
 687   if (pos == next_.location.beg_pos) return;
 688   int current_pos = source_pos();
 689   DCHECK_EQ(next_.location.end_pos, current_pos);
 690   // Positions inside the lookahead token aren't supported.
 691   DCHECK(pos >= current_pos);
 692   if (pos != current_pos) {
 693     source_->SeekForward(pos - source_->pos());
 694     Advance();
 695     // This function is only called to seek to the location
 696     // of the end of a function (at the "}" token). It doesn't matter
 697     // whether there was a line terminator in the part we skip.
 698     has_line_terminator_before_next_ = false;
 699     has_multiline_comment_before_next_ = false;
 700   }
 701   Scan();
 702 }
 703
 704
 705 template <bool capture_raw, bool in_template_literal>
 706 bool Scanner::ScanEscape() {
 707   uc32 c = c0_;
 708   Advance<capture_raw>();
 709
 710   // Skip escaped newlines.
 711   if (!in_template_literal && c0_ >= 0 && unicode_cache_->IsLineTerminator(c)) {
 712     // Allow CR+LF newlines in multiline string literals.
 713     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
 714     // Allow LF+CR newlines in multiline string literals.
 715     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>();
 716     return true;
 717   }
 718
 719   switch (c) {
 720     case '\'':  // fall through
 721     case '"' :  // fall through
 722     case '\\': break;
 723     case 'b' : c = '\b'; break;
 724     case 'f' : c = '\f'; break;
 725     case 'n' : c = '\n'; break;
 726     case 'r' : c = '\r'; break;
 727     case 't' : c = '\t'; break;
 728     case 'u' : {
 729       c = ScanUnicodeEscape<capture_raw>();
 730       if (c < 0) return false;
 731       break;
 732     }
 733     case 'v':
 734       c = '\v';
 735       break;
 736     case 'x': {
 737       c = ScanHexNumber<capture_raw>(2);
 738       if (c < 0) return false;
 739       break;
 740     }
 741     case '0':  // Fall through.
 742     case '1':  // fall through
 743     case '2':  // fall through
 744     case '3':  // fall through
 745     case '4':  // fall through
 746     case '5':  // fall through
 747     case '6':  // fall through
 748     case '7':
 749       c = ScanOctalEscape<capture_raw>(c, 2);
 750       break;
 751   }
 752
 753   // According to ECMA-262, section 7.8.4, characters not covered by the
 754   // above cases should be illegal, but they are commonly handled as
 755   // non-escaped characters by JS VMs.
 756   AddLiteralChar(c);
 757   return true;
 758 }
 759
 760
 761 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
 762 // ECMA-262. Other JS VMs support them.
 763 template <bool capture_raw>
 764 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
 765   uc32 x = c - '0';
 766   int i = 0;
 767   for (; i < length; i++) {
 768     int d = c0_ - '0';
 769     if (d < 0 || d > 7) break;
 770     int nx = x * 8 + d;
 771     if (nx >= 256) break;
 772     x = nx;
 773     Advance<capture_raw>();
 774   }
 775   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
 776   // Remember the position of octal escape sequences so that an error
 777   // can be reported later (in strict mode).
 778   // We don't report the error immediately, because the octal escape can
 779   // occur before the "use strict" directive.
 780   if (c != '0' || i > 0) {
 781     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
 782   }
 783   return x;
 784 }
 785
 786
 787 const int kMaxAscii = 127;
 788
 789
 790 Token::Value Scanner::ScanString() {
 791   uc32 quote = c0_;
 792   Advance<false, false>();  // consume quote
 793
 794   LiteralScope literal(this);
 795   while (true) {
 796     if (c0_ > kMaxAscii) {
 797       HandleLeadSurrogate();
 798       break;
 799     }
 800     if (c0_ < 0 || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL;
 801     if (c0_ == quote) {
 802       literal.Complete();
 803       Advance<false, false>();
 804       return Token::STRING;
 805     }
 806     uc32 c = c0_;
 807     if (c == '\\') break;
 808     Advance<false, false>();
 809     AddLiteralChar(c);
 810   }
 811
 812   while (c0_ != quote && c0_ >= 0
 813          && !unicode_cache_->IsLineTerminator(c0_)) {
 814     uc32 c = c0_;
 815     Advance();
 816     if (c == '\\') {
 817       if (c0_ < 0 || !ScanEscape<false, false>()) return Token::ILLEGAL;
 818     } else {
 819       AddLiteralChar(c);
 820     }
 821   }
 822   if (c0_ != quote) return Token::ILLEGAL;
 823   literal.Complete();
 824
 825   Advance();  // consume quote
 826   return Token::STRING;
 827 }
 828
 829
 830 Token::Value Scanner::ScanTemplateSpan() {
 831   // When scanning a TemplateSpan, we are looking for the following construct:
 832   // TEMPLATE_SPAN ::
 833   //     ` LiteralChars* ${
 834   //   | } LiteralChars* ${
 835   //
 836   // TEMPLATE_TAIL ::
 837   //     ` LiteralChars* `
 838   //   | } LiteralChar* `
 839   //
 840   // A TEMPLATE_SPAN should always be followed by an Expression, while a
 841   // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
 842   // followed by an Expression.
 843
 844   Token::Value result = Token::TEMPLATE_SPAN;
 845   LiteralScope literal(this);
 846   StartRawLiteral();
 847   const bool capture_raw = true;
 848   const bool in_template_literal = true;
 849
 850   while (true) {
 851     uc32 c = c0_;
 852     Advance<capture_raw>();
 853     if (c == '`') {
 854       result = Token::TEMPLATE_TAIL;
 855       ReduceRawLiteralLength(1);
 856       break;
 857     } else if (c == '$' && c0_ == '{') {
 858       Advance<capture_raw>();  // Consume '{'
 859       ReduceRawLiteralLength(2);
 860       break;
 861     } else if (c == '\\') {
 862       if (c0_ > 0 && unicode_cache_->IsLineTerminator(c0_)) {
 863         // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
 864         // code unit sequence.
 865         uc32 lastChar = c0_;
 866         Advance<capture_raw>();
 867         if (lastChar == '\r') {
 868           ReduceRawLiteralLength(1);  // Remove \r
 869           if (c0_ == '\n') {
 870             Advance<capture_raw>();  // Adds \n
 871           } else {
 872             AddRawLiteralChar('\n');
 873           }
 874         }
 875       } else if (!ScanEscape<capture_raw, in_template_literal>()) {
 876         return Token::ILLEGAL;
 877       }
 878     } else if (c < 0) {
 879       // Unterminated template literal
 880       PushBack(c);
 881       break;
 882     } else {
 883       // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
 884       // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
 885       // consisting of the CV 0x000A.
 886       if (c == '\r') {
 887         ReduceRawLiteralLength(1);  // Remove \r
 888         if (c0_ == '\n') {
 889           Advance<capture_raw>();  // Adds \n
 890         } else {
 891           AddRawLiteralChar('\n');
 892         }
 893         c = '\n';
 894       }
 895       AddLiteralChar(c);
 896     }
 897   }
 898   literal.Complete();
 899   next_.location.end_pos = source_pos();
 900   next_.token = result;
 901   return result;
 902 }
 903
 904
 905 Token::Value Scanner::ScanTemplateStart() {
 906   DCHECK(c0_ == '`');
 907   next_.location.beg_pos = source_pos();
 908   Advance();  // Consume `
 909   return ScanTemplateSpan();
 910 }
 911
 912
 913 Token::Value Scanner::ScanTemplateContinuation() {
 914   DCHECK_EQ(next_.token, Token::RBRACE);
 915   next_.location.beg_pos = source_pos() - 1;  // We already consumed }
 916   return ScanTemplateSpan();
 917 }
 918
 919
 920 void Scanner::ScanDecimalDigits() {
 921   while (IsDecimalDigit(c0_))
 922     AddLiteralCharAdvance();
 923 }
 924
 925
 926 Token::Value Scanner::ScanNumber(bool seen_period) {
 927   DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
 928
 929   enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL;
 930
 931   LiteralScope literal(this);
 932   bool at_start = !seen_period;
 933   if (seen_period) {
 934     // we have already seen a decimal point of the float
 935     AddLiteralChar('.');
 936     ScanDecimalDigits();  // we know we have at least one digit
 937
 938   } else {
 939     // if the first character is '0' we must check for octals and hex
 940     if (c0_ == '0') {
 941       int start_pos = source_pos();  // For reporting octal positions.
 942       AddLiteralCharAdvance();
 943
 944       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
 945       // an octal number.
 946       if (c0_ == 'x' || c0_ == 'X') {
 947         // hex number
 948         kind = HEX;
 949         AddLiteralCharAdvance();
 950         if (!IsHexDigit(c0_)) {
 951           // we must have at least one hex digit after 'x'/'X'
 952           return Token::ILLEGAL;
 953         }
 954         while (IsHexDigit(c0_)) {
 955           AddLiteralCharAdvance();
 956         }
 957       } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) {
 958         kind = OCTAL;
 959         AddLiteralCharAdvance();
 960         if (!IsOctalDigit(c0_)) {
 961           // we must have at least one octal digit after 'o'/'O'
 962           return Token::ILLEGAL;
 963         }
 964         while (IsOctalDigit(c0_)) {
 965           AddLiteralCharAdvance();
 966         }
 967       } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) {
 968         kind = BINARY;
 969         AddLiteralCharAdvance();
 970         if (!IsBinaryDigit(c0_)) {
 971           // we must have at least one binary digit after 'b'/'B'
 972           return Token::ILLEGAL;
 973         }
 974         while (IsBinaryDigit(c0_)) {
 975           AddLiteralCharAdvance();
 976         }
 977       } else if ('0' <= c0_ && c0_ <= '7') {
 978         // (possible) octal number
 979         kind = IMPLICIT_OCTAL;
 980         while (true) {
 981           if (c0_ == '8' || c0_ == '9') {
 982             at_start = false;
 983             kind = DECIMAL;
 984             break;
 985           }
 986           if (c0_  < '0' || '7'  < c0_) {
 987             // Octal literal finished.
 988             octal_pos_ = Location(start_pos, source_pos());
 989             break;
 990           }
 991           AddLiteralCharAdvance();
 992         }
 993       }
 994     }
 995
 996     // Parse decimal digits and allow trailing fractional part.
 997     if (kind == DECIMAL) {
 998       if (at_start) {
 999         uint64_t value = 0;
1000         while (IsDecimalDigit(c0_)) {
1001           value = 10 * value + (c0_ - '0');
1002
1003           uc32 first_char = c0_;
1004           Advance<false, false>();
1005           AddLiteralChar(first_char);
1006         }
1007
1008         if (next_.literal_chars->one_byte_literal().length() <= 10 &&
1009             value <= Smi::kMaxValue && c0_ != '.' && c0_ != 'e' && c0_ != 'E') {
1010           smi_value_ = static_cast<int>(value);
1011           literal.Complete();
1012           HandleLeadSurrogate();
1013
1014           return Token::SMI;
1015         }
1016         HandleLeadSurrogate();
1017       }
1018
1019       ScanDecimalDigits();  // optional
1020       if (c0_ == '.') {
1021         AddLiteralCharAdvance();
1022         ScanDecimalDigits();  // optional
1023       }
1024     }
1025   }
1026
1027   // scan exponent, if any
1028   if (c0_ == 'e' || c0_ == 'E') {
1029     DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
1030     if (kind != DECIMAL) return Token::ILLEGAL;
1031     // scan exponent
1032     AddLiteralCharAdvance();
1033     if (c0_ == '+' || c0_ == '-')
1034       AddLiteralCharAdvance();
1035     if (!IsDecimalDigit(c0_)) {
1036       // we must have at least one decimal digit after 'e'/'E'
1037       return Token::ILLEGAL;
1038     }
1039     ScanDecimalDigits();
1040   }
1041
1042   // The source character immediately following a numeric literal must
1043   // not be an identifier start or a decimal digit; see ECMA-262
1044   // section 7.8.3, page 17 (note that we read only one decimal digit
1045   // if the value is 0).
1046   if (IsDecimalDigit(c0_) ||
1047       (c0_ >= 0 && unicode_cache_->IsIdentifierStart(c0_)))
1048     return Token::ILLEGAL;
1049
1050   literal.Complete();
1051
1052   return Token::NUMBER;
1053 }
1054
1055
1056 uc32 Scanner::ScanIdentifierUnicodeEscape() {
1057   Advance();
1058   if (c0_ != 'u') return -1;
1059   Advance();
1060   return ScanUnicodeEscape<false>();
1061 }
1062
1063
1064 template <bool capture_raw>
1065 uc32 Scanner::ScanUnicodeEscape() {
1066   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1067   // allowed). In the latter case, the number of hex digits between { } is
1068   // arbitrary. \ and u have already been read.
1069   if (c0_ == '{' && HarmonyUnicode()) {
1070     Advance<capture_raw>();
1071     uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff);
1072     if (cp < 0) {
1073       return -1;
1074     }
1075     if (c0_ != '}') {
1076       return -1;
1077     }
1078     Advance<capture_raw>();
1079     return cp;
1080   }
1081   return ScanHexNumber<capture_raw>(4);
1082 }
1083
1084
1085 // ----------------------------------------------------------------------------
1086 // Keyword Matcher
1087
1088 #define KEYWORDS(KEYWORD_GROUP, KEYWORD)                                  \
1089   KEYWORD_GROUP('b')                                                      \
1090   KEYWORD("break", Token::BREAK)                                          \
1091   KEYWORD_GROUP('c')                                                      \
1092   KEYWORD("case", Token::CASE)                                            \
1093   KEYWORD("catch", Token::CATCH)                                          \
1094   KEYWORD("class",                                                        \
1095           harmony_classes ? Token::CLASS : Token::FUTURE_RESERVED_WORD)   \
1096   KEYWORD("const", Token::CONST)                                          \
1097   KEYWORD("continue", Token::CONTINUE)                                    \
1098   KEYWORD_GROUP('d')                                                      \
1099   KEYWORD("debugger", Token::DEBUGGER)                                    \
1100   KEYWORD("default", Token::DEFAULT)                                      \
1101   KEYWORD("delete", Token::DELETE)                                        \
1102   KEYWORD("do", Token::DO)                                                \
1103   KEYWORD_GROUP('e')                                                      \
1104   KEYWORD("else", Token::ELSE)                                            \
1105   KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                            \
1106   KEYWORD("export",                                                       \
1107           harmony_modules ? Token::EXPORT : Token::FUTURE_RESERVED_WORD)  \
1108   KEYWORD("extends",                                                      \
1109           harmony_classes ? Token::EXTENDS : Token::FUTURE_RESERVED_WORD) \
1110   KEYWORD_GROUP('f')                                                      \
1111   KEYWORD("false", Token::FALSE_LITERAL)                                  \
1112   KEYWORD("finally", Token::FINALLY)                                      \
1113   KEYWORD("for", Token::FOR)                                              \
1114   KEYWORD("function", Token::FUNCTION)                                    \
1115   KEYWORD_GROUP('i')                                                      \
1116   KEYWORD("if", Token::IF)                                                \
1117   KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)               \
1118   KEYWORD("import",                                                       \
1119           harmony_modules ? Token::IMPORT : Token::FUTURE_RESERVED_WORD)  \
1120   KEYWORD("in", Token::IN)                                                \
1121   KEYWORD("instanceof", Token::INSTANCEOF)                                \
1122   KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)                \
1123   KEYWORD_GROUP('l')                                                      \
1124   KEYWORD("let", Token::LET)                                              \
1125   KEYWORD_GROUP('n')                                                      \
1126   KEYWORD("new", Token::NEW)                                              \
1127   KEYWORD("null", Token::NULL_LITERAL)                                    \
1128   KEYWORD_GROUP('p')                                                      \
1129   KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)                  \
1130   KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)                  \
1131   KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)                \
1132   KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)                   \
1133   KEYWORD_GROUP('r')                                                      \
1134   KEYWORD("return", Token::RETURN)                                        \
1135   KEYWORD_GROUP('s')                                                      \
1136   KEYWORD("static", harmony_classes ? Token::STATIC                       \
1137                                     : Token::FUTURE_STRICT_RESERVED_WORD) \
1138   KEYWORD("super",                                                        \
1139           harmony_classes ? Token::SUPER : Token::FUTURE_RESERVED_WORD)   \
1140   KEYWORD("switch", Token::SWITCH)                                        \
1141   KEYWORD_GROUP('t')                                                      \
1142   KEYWORD("this", Token::THIS)                                            \
1143   KEYWORD("throw", Token::THROW)                                          \
1144   KEYWORD("true", Token::TRUE_LITERAL)                                    \
1145   KEYWORD("try", Token::TRY)                                              \
1146   KEYWORD("typeof", Token::TYPEOF)                                        \
1147   KEYWORD_GROUP('v')                                                      \
1148   KEYWORD("var", Token::VAR)                                              \
1149   KEYWORD("void", Token::VOID)                                            \
1150   KEYWORD_GROUP('w')                                                      \
1151   KEYWORD("while", Token::WHILE)                                          \
1152   KEYWORD("with", Token::WITH)                                            \
1153   KEYWORD_GROUP('y')                                                      \
1154   KEYWORD("yield", Token::YIELD)
1155
1156
1157 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
1158                                              int input_length,
1159                                              bool harmony_modules,
1160                                              bool harmony_classes) {
1161   DCHECK(input_length >= 1);
1162   const int kMinLength = 2;
1163   const int kMaxLength = 10;
1164   if (input_length < kMinLength || input_length > kMaxLength) {
1165     return Token::IDENTIFIER;
1166   }
1167   switch (input[0]) {
1168     default:
1169 #define KEYWORD_GROUP_CASE(ch)                                \
1170       break;                                                  \
1171     case ch:
1172 #define KEYWORD(keyword, token)                               \
1173     {                                                         \
1174       /* 'keyword' is a char array, so sizeof(keyword) is */  \
1175       /* strlen(keyword) plus 1 for the NUL char. */          \
1176       const int keyword_length = sizeof(keyword) - 1;         \
1177       STATIC_ASSERT(keyword_length >= kMinLength);            \
1178       STATIC_ASSERT(keyword_length <= kMaxLength);            \
1179       if (input_length == keyword_length &&                   \
1180           input[1] == keyword[1] &&                           \
1181           (keyword_length <= 2 || input[2] == keyword[2]) &&  \
1182           (keyword_length <= 3 || input[3] == keyword[3]) &&  \
1183           (keyword_length <= 4 || input[4] == keyword[4]) &&  \
1184           (keyword_length <= 5 || input[5] == keyword[5]) &&  \
1185           (keyword_length <= 6 || input[6] == keyword[6]) &&  \
1186           (keyword_length <= 7 || input[7] == keyword[7]) &&  \
1187           (keyword_length <= 8 || input[8] == keyword[8]) &&  \
1188           (keyword_length <= 9 || input[9] == keyword[9])) {  \
1189         return token;                                         \
1190       }                                                       \
1191     }
1192     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
1193   }
1194   return Token::IDENTIFIER;
1195 }
1196
1197
1198 bool Scanner::IdentifierIsFutureStrictReserved(
1199     const AstRawString* string) const {
1200   // Keywords are always 1-byte strings.
1201   if (!string->is_one_byte()) return false;
1202   if (string->IsOneByteEqualTo("let") || string->IsOneByteEqualTo("static") ||
1203       string->IsOneByteEqualTo("yield")) {
1204     return true;
1205   }
1206   return Token::FUTURE_STRICT_RESERVED_WORD ==
1207          KeywordOrIdentifierToken(string->raw_data(), string->length(),
1208                                   harmony_modules_, harmony_classes_);
1209 }
1210
1211
1212 Token::Value Scanner::ScanIdentifierOrKeyword() {
1213   DCHECK(unicode_cache_->IsIdentifierStart(c0_));
1214   LiteralScope literal(this);
1215   if (IsInRange(c0_, 'a', 'z')) {
1216     do {
1217       uc32 first_char = c0_;
1218       Advance<false, false>();
1219       AddLiteralChar(first_char);
1220     } while (IsInRange(c0_, 'a', 'z'));
1221
1222     if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
1223         c0_ == '$') {
1224       // Identifier starting with lowercase.
1225       uc32 first_char = c0_;
1226       Advance<false, false>();
1227       AddLiteralChar(first_char);
1228       while (IsAsciiIdentifier(c0_)) {
1229         uc32 first_char = c0_;
1230         Advance<false, false>();
1231         AddLiteralChar(first_char);
1232       }
1233       if (c0_ <= kMaxAscii && c0_ != '\\') {
1234         literal.Complete();
1235         return Token::IDENTIFIER;
1236       }
1237     } else if (c0_ <= kMaxAscii && c0_ != '\\') {
1238       // Only a-z+: could be a keyword or identifier.
1239       literal.Complete();
1240       Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1241       return KeywordOrIdentifierToken(chars.start(), chars.length(),
1242                                       harmony_modules_, harmony_classes_);
1243     }
1244
1245     HandleLeadSurrogate();
1246   } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
1247     do {
1248       uc32 first_char = c0_;
1249       Advance<false, false>();
1250       AddLiteralChar(first_char);
1251     } while (IsAsciiIdentifier(c0_));
1252
1253     if (c0_ <= kMaxAscii && c0_ != '\\') {
1254       literal.Complete();
1255       return Token::IDENTIFIER;
1256     }
1257
1258     HandleLeadSurrogate();
1259   } else if (c0_ == '\\') {
1260     // Scan identifier start character.
1261     uc32 c = ScanIdentifierUnicodeEscape();
1262     // Only allow legal identifier start characters.
1263     if (c < 0 ||
1264         c == '\\' ||  // No recursive escapes.
1265         !unicode_cache_->IsIdentifierStart(c)) {
1266       return Token::ILLEGAL;
1267     }
1268     AddLiteralChar(c);
1269     return ScanIdentifierSuffix(&literal);
1270   } else {
1271     uc32 first_char = c0_;
1272     Advance();
1273     AddLiteralChar(first_char);
1274   }
1275
1276   // Scan the rest of the identifier characters.
1277   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
1278     if (c0_ != '\\') {
1279       uc32 next_char = c0_;
1280       Advance();
1281       AddLiteralChar(next_char);
1282       continue;
1283     }
1284     // Fallthrough if no longer able to complete keyword.
1285     return ScanIdentifierSuffix(&literal);
1286   }
1287
1288   literal.Complete();
1289
1290   if (next_.literal_chars->is_one_byte()) {
1291     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1292     return KeywordOrIdentifierToken(chars.start(),
1293                                     chars.length(),
1294                                     harmony_modules_,
1295                                     harmony_classes_);
1296   }
1297   return Token::IDENTIFIER;
1298 }
1299
1300
1301 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
1302   // Scan the rest of the identifier characters.
1303   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
1304     if (c0_ == '\\') {
1305       uc32 c = ScanIdentifierUnicodeEscape();
1306       // Only allow legal identifier part characters.
1307       if (c < 0 ||
1308           c == '\\' ||
1309           !unicode_cache_->IsIdentifierPart(c)) {
1310         return Token::ILLEGAL;
1311       }
1312       AddLiteralChar(c);
1313     } else {
1314       AddLiteralChar(c0_);
1315       Advance();
1316     }
1317   }
1318   literal->Complete();
1319
1320   return Token::IDENTIFIER;
1321 }
1322
1323
1324 bool Scanner::ScanRegExpPattern(bool seen_equal) {
1325   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1326   bool in_character_class = false;
1327
1328   // Previous token is either '/' or '/=', in the second case, the
1329   // pattern starts at =.
1330   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1331   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1332
1333   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1334   // the scanner should pass uninterpreted bodies to the RegExp
1335   // constructor.
1336   LiteralScope literal(this);
1337   if (seen_equal) {
1338     AddLiteralChar('=');
1339   }
1340
1341   while (c0_ != '/' || in_character_class) {
1342     if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false;
1343     if (c0_ == '\\') {  // Escape sequence.
1344       AddLiteralCharAdvance();
1345       if (c0_ < 0 || unicode_cache_->IsLineTerminator(c0_)) return false;
1346       AddLiteralCharAdvance();
1347       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1348       // only "safe" characters are allowed (letters, digits, underscore),
1349       // otherwise the escape isn't valid and the invalid character has
1350       // its normal meaning. I.e., we can just continue scanning without
1351       // worrying whether the following characters are part of the escape
1352       // or not, since any '/', '\\' or '[' is guaranteed to not be part
1353       // of the escape sequence.
1354
1355       // TODO(896): At some point, parse RegExps more throughly to capture
1356       // octal esacpes in strict mode.
1357     } else {  // Unescaped character.
1358       if (c0_ == '[') in_character_class = true;
1359       if (c0_ == ']') in_character_class = false;
1360       AddLiteralCharAdvance();
1361     }
1362   }
1363   Advance();  // consume '/'
1364
1365   literal.Complete();
1366
1367   return true;
1368 }
1369
1370
1371 bool Scanner::ScanRegExpFlags() {
1372   // Scan regular expression flags.
1373   LiteralScope literal(this);
1374   while (c0_ >= 0 && unicode_cache_->IsIdentifierPart(c0_)) {
1375     if (c0_ != '\\') {
1376       AddLiteralCharAdvance();
1377     } else {
1378       return false;
1379     }
1380   }
1381   literal.Complete();
1382
1383   next_.location.end_pos = source_pos() - 1;
1384   return true;
1385 }
1386
1387
1388 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
1389   if (is_literal_one_byte()) {
1390     return ast_value_factory->GetOneByteString(literal_one_byte_string());
1391   }
1392   return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1393 }
1394
1395
1396 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
1397   if (is_next_literal_one_byte()) {
1398     return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1399   }
1400   return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1401 }
1402
1403
1404 const AstRawString* Scanner::CurrentRawSymbol(
1405     AstValueFactory* ast_value_factory) {
1406   if (is_raw_literal_one_byte()) {
1407     return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1408   }
1409   return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1410 }
1411
1412
1413 double Scanner::DoubleValue() {
1414   DCHECK(is_literal_one_byte());
1415   return StringToDouble(
1416       unicode_cache_,
1417       literal_one_byte_string(),
1418       ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1419 }
1420
1421
1422 int Scanner::FindSymbol(DuplicateFinder* finder, int value) {
1423   if (is_literal_one_byte()) {
1424     return finder->AddOneByteSymbol(literal_one_byte_string(), value);
1425   }
1426   return finder->AddTwoByteSymbol(literal_two_byte_string(), value);
1427 }
1428
1429
1430 int DuplicateFinder::AddOneByteSymbol(Vector<const uint8_t> key, int value) {
1431   return AddSymbol(key, true, value);
1432 }
1433
1434
1435 int DuplicateFinder::AddTwoByteSymbol(Vector<const uint16_t> key, int value) {
1436   return AddSymbol(Vector<const uint8_t>::cast(key), false, value);
1437 }
1438
1439
1440 int DuplicateFinder::AddSymbol(Vector<const uint8_t> key,
1441                                bool is_one_byte,
1442                                int value) {
1443   uint32_t hash = Hash(key, is_one_byte);
1444   byte* encoding = BackupKey(key, is_one_byte);
1445   HashMap::Entry* entry = map_.Lookup(encoding, hash, true);
1446   int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value));
1447   entry->value =
1448     reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value));
1449   return old_value;
1450 }
1451
1452
1453 int DuplicateFinder::AddNumber(Vector<const uint8_t> key, int value) {
1454   DCHECK(key.length() > 0);
1455   // Quick check for already being in canonical form.
1456   if (IsNumberCanonical(key)) {
1457     return AddOneByteSymbol(key, value);
1458   }
1459
1460   int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY;
1461   double double_value = StringToDouble(
1462       unicode_constants_, key, flags, 0.0);
1463   int length;
1464   const char* string;
1465   if (!std::isfinite(double_value)) {
1466     string = "Infinity";
1467     length = 8;  // strlen("Infinity");
1468   } else {
1469     string = DoubleToCString(double_value,
1470                              Vector<char>(number_buffer_, kBufferSize));
1471     length = StrLength(string);
1472   }
1473   return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string),
1474                                       length), true, value);
1475 }
1476
1477
1478 bool DuplicateFinder::IsNumberCanonical(Vector<const uint8_t> number) {
1479   // Test for a safe approximation of number literals that are already
1480   // in canonical form: max 15 digits, no leading zeroes, except an
1481   // integer part that is a single zero, and no trailing zeros below
1482   // the decimal point.
1483   int pos = 0;
1484   int length = number.length();
1485   if (number.length() > 15) return false;
1486   if (number[pos] == '0') {
1487     pos++;
1488   } else {
1489     while (pos < length &&
1490            static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++;
1491   }
1492   if (length == pos) return true;
1493   if (number[pos] != '.') return false;
1494   pos++;
1495   bool invalid_last_digit = true;
1496   while (pos < length) {
1497     uint8_t digit = number[pos] - '0';
1498     if (digit > '9' - '0') return false;
1499     invalid_last_digit = (digit == 0);
1500     pos++;
1501   }
1502   return !invalid_last_digit;
1503 }
1504
1505
1506 uint32_t DuplicateFinder::Hash(Vector<const uint8_t> key, bool is_one_byte) {
1507   // Primitive hash function, almost identical to the one used
1508   // for strings (except that it's seeded by the length and representation).
1509   int length = key.length();
1510   uint32_t hash = (length << 1) | (is_one_byte ? 1 : 0) ;
1511   for (int i = 0; i < length; i++) {
1512     uint32_t c = key[i];
1513     hash = (hash + c) * 1025;
1514     hash ^= (hash >> 6);
1515   }
1516   return hash;
1517 }
1518
1519
1520 bool DuplicateFinder::Match(void* first, void* second) {
1521   // Decode lengths.
1522   // Length + representation is encoded as base 128, most significant heptet
1523   // first, with a 8th bit being non-zero while there are more heptets.
1524   // The value encodes the number of bytes following, and whether the original
1525   // was Latin1.
1526   byte* s1 = reinterpret_cast<byte*>(first);
1527   byte* s2 = reinterpret_cast<byte*>(second);
1528   uint32_t length_one_byte_field = 0;
1529   byte c1;
1530   do {
1531     c1 = *s1;
1532     if (c1 != *s2) return false;
1533     length_one_byte_field = (length_one_byte_field << 7) | (c1 & 0x7f);
1534     s1++;
1535     s2++;
1536   } while ((c1 & 0x80) != 0);
1537   int length = static_cast<int>(length_one_byte_field >> 1);
1538   return memcmp(s1, s2, length) == 0;
1539 }
1540
1541
1542 byte* DuplicateFinder::BackupKey(Vector<const uint8_t> bytes,
1543                                  bool is_one_byte) {
1544   uint32_t one_byte_length = (bytes.length() << 1) | (is_one_byte ? 1 : 0);
1545   backing_store_.StartSequence();
1546   // Emit one_byte_length as base-128 encoded number, with the 7th bit set
1547   // on the byte of every heptet except the last, least significant, one.
1548   if (one_byte_length >= (1 << 7)) {
1549     if (one_byte_length >= (1 << 14)) {
1550       if (one_byte_length >= (1 << 21)) {
1551         if (one_byte_length >= (1 << 28)) {
1552           backing_store_.Add(
1553               static_cast<uint8_t>((one_byte_length >> 28) | 0x80));
1554         }
1555         backing_store_.Add(
1556             static_cast<uint8_t>((one_byte_length >> 21) | 0x80u));
1557       }
1558       backing_store_.Add(
1559           static_cast<uint8_t>((one_byte_length >> 14) | 0x80u));
1560     }
1561     backing_store_.Add(static_cast<uint8_t>((one_byte_length >> 7) | 0x80u));
1562   }
1563   backing_store_.Add(static_cast<uint8_t>(one_byte_length & 0x7f));
1564
1565   backing_store_.AddBlock(bytes);
1566   return backing_store_.EndSequence().start();
1567 }
1568
1569 } }  // namespace v8::internal