src/3rdparty/v8/src/scanner.h

   1 // Copyright 2011 the V8 project authors. All rights reserved.
   2 // Redistribution and use in source and binary forms, with or without
   3 // modification, are permitted provided that the following conditions are
   4 // met:
   5 //
   6 //     * Redistributions of source code must retain the above copyright
   7 //       notice, this list of conditions and the following disclaimer.
   8 //     * Redistributions in binary form must reproduce the above
   9 //       copyright notice, this list of conditions and the following
  10 //       disclaimer in the documentation and/or other materials provided
  11 //       with the distribution.
  12 //     * Neither the name of Google Inc. nor the names of its
  13 //       contributors may be used to endorse or promote products derived
  14 //       from this software without specific prior written permission.
  15 //
  16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27
  28 // Features shared by parsing and pre-parsing scanners.
  29
  30 #ifndef V8_SCANNER_H_
  31 #define V8_SCANNER_H_
  32
  33 #include "allocation.h"
  34 #include "char-predicates.h"
  35 #include "checks.h"
  36 #include "globals.h"
  37 #include "token.h"
  38 #include "unicode-inl.h"
  39 #include "utils.h"
  40
  41 namespace v8 {
  42 namespace internal {
  43
  44
  45 // General collection of (multi-)bit-flags that can be passed to scanners and
  46 // parsers to signify their (initial) mode of operation.
  47 enum ParsingFlags {
  48   kNoParsingFlags = 0,
  49   // LanguageMode values are embedded in the low bits of the flags, i.e.,
  50   // CLASSIC_MODE = 0, STRICT_MODE and EXTENDED_MODE can be stored directly
  51   // under kLanguageModeMask; the STATIC_ASSERTs below check that the mask
  52   // covers each of them.
  53   kLanguageModeMask = 0x03,
  54   kAllowLazy = 0x04,
  55   kAllowNativesSyntax = 0x08,
  56   kAllowModules = 0x10
  57 };
  58
  59 STATIC_ASSERT((kLanguageModeMask & CLASSIC_MODE) == CLASSIC_MODE);
  60 STATIC_ASSERT((kLanguageModeMask & STRICT_MODE) == STRICT_MODE);
  61 STATIC_ASSERT((kLanguageModeMask & EXTENDED_MODE) == EXTENDED_MODE);
  62
  63
  64 // Converts the hexadecimal digit character c to its numeric value (0 .. 15).
  65 // Any c that is not a hexadecimal digit produces a negative result.
  66 inline int HexValue(uc32 c) {
  67   const uc32 digit = c - '0';
  68   if (static_cast<unsigned>(digit) < 10u) return digit;
  69   const uc32 letter = (digit | 0x20) - ('a' - '0');  // fold case: 'a'/'A' -> 0.
  70   if (static_cast<unsigned>(letter) < 6u) return letter + 10;
  71   return -1;
  72 }
  73
  74
  75 // ---------------------------------------------------------------------
  76 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
  77 // A code unit is a 16 bit value representing either a 16 bit code point
  78 // or one part of a surrogate pair that make a single 21 bit code point.
  79
  80 class Utf16CharacterStream {
  81  public:
  82   Utf16CharacterStream() : pos_(0) { }
  83   virtual ~Utf16CharacterStream() { }
  84
  85   // Returns and advances past the next UTF-16 code unit in the input
  86   // stream. If there are no more code units, it returns a negative
  87   // value.
  88   inline uc32 Advance() {
  89     if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
  90       pos_++;
  91       return static_cast<uc32>(*(buffer_cursor_++));
  92     }
  93     // Note: currently the following increment is necessary to avoid a
  94     // parser problem! The scanner treats the final kEndOfInput as
  95     // a code unit with a position, and does math relative to that
  96     // position.
  97     pos_++;
  98
  99     return kEndOfInput;
 100   }
 101
 102   // Return the current position in the code unit stream.
 103   // Starts at zero.
 104   inline unsigned pos() const { return pos_; }
 105
 106   // Skips forward past the next code_unit_count UTF-16 code units
 107   // in the input, or until the end of input if that comes sooner.
 108   // Returns the number of code units actually skipped. If less
 109   // than code_unit_count,
 110   inline unsigned SeekForward(unsigned code_unit_count) {
 111     unsigned buffered_chars =
 112         static_cast<unsigned>(buffer_end_ - buffer_cursor_);
 113     if (code_unit_count <= buffered_chars) {
 114       buffer_cursor_ += code_unit_count;
 115       pos_ += code_unit_count;
 116       return code_unit_count;
 117     }
 118     return SlowSeekForward(code_unit_count);
 119   }
 120
 121   // Pushes back the most recently read UTF-16 code unit (or negative
 122   // value if at end of input), i.e., the value returned by the most recent
 123   // call to Advance.
 124   // Must not be used right after calling SeekForward.
 125   virtual void PushBack(int32_t code_unit) = 0;
 126
 127  protected:
 128   static const uc32 kEndOfInput = -1;
 129
 130   // Ensures that the buffer_cursor_ points to the code_unit at
 131   // position pos_ of the input, if possible. If the position
 132   // is at or after the end of the input, return false. If there
 133   // are more code_units available, return true.
 134   virtual bool ReadBlock() = 0;
 135   virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;
 136
 137   const uc16* buffer_cursor_;
 138   const uc16* buffer_end_;
 139   unsigned pos_;
 140 };
 141
 142
 143 class UnicodeCache {
 144 // ---------------------------------------------------------------------
 145 // Caching predicates used by scanners, bundled with a shared UTF-8 decoder.
 146  public:
 147   UnicodeCache() {}
 148   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
 149   // Returns a pointer to the decoder resource owned by this cache.
 150   StaticResource<Utf8Decoder>* utf8_decoder() {
 151     return &utf8_decoder_;
 152   }
 153   // Character class tests; each one forwards to the matching predicate member.
 154   bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
 155   bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
 156   bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
 157   bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
 158
 159  private:
 160   unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
 161   unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
 162   unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
 163   unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
 164   StaticResource<Utf8Decoder> utf8_decoder_;
 165
 166   DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
 167 };
 168
 169
 170 // ----------------------------------------------------------------------------
 171 // LiteralBuffer - Collector of the code units (chars) of a literal.
 172
 173 class LiteralBuffer {
 174  public:
 175   LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
 176   // Releases the backing store if one was allocated.
 177   ~LiteralBuffer() {
 178     if (backing_store_.length() > 0) {
 179       backing_store_.Dispose();
 180     }
 181   }
 182   // Appends one code unit, converting to UTF-16 storage when it is not ASCII.
 183   INLINE(void AddChar(uint32_t code_unit)) {
 184     if (position_ >= backing_store_.length()) ExpandBuffer();
 185     if (is_ascii_) {
 186       if (code_unit < kMaxAsciiCharCodeU) {
 187         backing_store_[position_] = static_cast<byte>(code_unit);
 188         position_ += kASCIISize;
 189         return;
 190       }
 191       ConvertToUtf16();
 192     }
 193     ASSERT(code_unit < 0x10000u);
 194     *reinterpret_cast<uc16*>(&backing_store_[position_]) = code_unit;
 195     position_ += kUC16Size;
 196   }
 197   // True while all code units added so far are stored as single bytes.
 198   bool is_ascii() { return is_ascii_; }
 199   // Contents as UTF-16; position_ is a byte count, hence the shift by one.
 200   Vector<const uc16> utf16_literal() {
 201     ASSERT(!is_ascii_);
 202     ASSERT((position_ & 0x1) == 0);
 203     return Vector<const uc16>(
 204         reinterpret_cast<const uc16*>(backing_store_.start()),
 205         position_ >> 1);
 206   }
 207   // Contents as ASCII chars; only valid while is_ascii_ holds.
 208   Vector<const char> ascii_literal() {
 209     ASSERT(is_ascii_);
 210     return Vector<const char>(
 211         reinterpret_cast<const char*>(backing_store_.start()),
 212         position_);
 213   }
 214   // Number of code units (not bytes) currently stored.
 215   int length() {
 216     return is_ascii_ ? position_ : (position_ >> 1);
 217   }
 218   // Discards the contents; the backing store is kept for reuse.
 219   void Reset() {
 220     position_ = 0;
 221     is_ascii_ = true;
 222   }
 223
 224  private:
 225   static const int kInitialCapacity = 16;
 226   static const int kGrowthFactory = 4;
 227   static const int kMinConversionSlack = 256;
 228   static const int kMaxGrowth = 1 * MB;
 229   inline int NewCapacity(int min_capacity) {
 230     int capacity = Max(min_capacity, backing_store_.length());
 231     int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
 232     return new_capacity;  // Grows by kGrowthFactory, capped at +kMaxGrowth.
 233   }
 234   // Moves the contents into a larger, freshly allocated backing store.
 235   void ExpandBuffer() {
 236     Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
 237     memcpy(new_store.start(), backing_store_.start(), position_);
 238     backing_store_.Dispose();
 239     backing_store_ = new_store;
 240   }
 241   // Rewrites the stored bytes as uc16 units, reallocating if they do not fit.
 242   void ConvertToUtf16() {
 243     ASSERT(is_ascii_);
 244     Vector<byte> new_store;
 245     int new_content_size = position_ * kUC16Size;
 246     if (new_content_size >= backing_store_.length()) {
 247       // Ensure room for all currently read code units as UC16 as well
 248       // as the code unit about to be stored.
 249       new_store = Vector<byte>::New(NewCapacity(new_content_size));
 250     } else {
 251       new_store = backing_store_;
 252     }
 253     char* src = reinterpret_cast<char*>(backing_store_.start());
 254     uc16* dst = reinterpret_cast<uc16*>(new_store.start());
 255     for (int i = position_ - 1; i >= 0; i--) {
 256       dst[i] = src[i];  // Back to front, so in-place widening is safe.
 257     }
 258     if (new_store.start() != backing_store_.start()) {
 259       backing_store_.Dispose();
 260       backing_store_ = new_store;
 261     }
 262     position_ = new_content_size;
 263     is_ascii_ = false;
 264   }
 265
 266   bool is_ascii_;
 267   int position_;
 268   Vector<byte> backing_store_;
 269
 270   DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
 271 };
 272
 273
 274 // ----------------------------------------------------------------------------
 275 // JavaScript Scanner.
 276
 277 class Scanner {
 278  public:
 279   // Scoped helper for literal recording. Automatically drops the literal
 280   // if aborting the scanning before it's complete.
 281   class LiteralScope {
 282    public:
 283     explicit LiteralScope(Scanner* self)
 284         : scanner_(self), complete_(false) {
 285       scanner_->StartLiteral();
 286     }
 287      ~LiteralScope() {
 288        if (!complete_) scanner_->DropLiteral();
 289      }
 290     void Complete() {
 291       scanner_->TerminateLiteral();
 292       complete_ = true;
 293     }
 294
 295    private:
 296     Scanner* scanner_;
 297     bool complete_;
 298   };
 299
 300   // Representation of an interval of source positions.
 301   struct Location {
 302     Location(int b, int e) : beg_pos(b), end_pos(e) { }
 303     Location() : beg_pos(0), end_pos(0) { }
 304
 305     bool IsValid() const {
 306       return beg_pos >= 0 && end_pos >= beg_pos;
 307     }
 308
 309     static Location invalid() { return Location(-1, -1); }
 310
 311     int beg_pos;
 312     int end_pos;
 313   };
 314
 315   // -1 is outside of the range of any real source code.
 316   static const int kNoOctalLocation = -1;
 317
 318   typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
 319
 320   explicit Scanner(UnicodeCache* scanner_contants);
 321
 322   void Initialize(Utf16CharacterStream* source);
 323
 324   // Returns the next token and advances input.
 325   Token::Value Next();
 326   // Returns the current token again.
 327   Token::Value current_token() { return current_.token; }
 328   // Returns the location information for the current token
 329   // (the token last returned by Next()).
 330   Location location() const { return current_.location; }
 331   // Returns the literal string, if any, for the current token (the
 332   // token last returned by Next()). The string is 0-terminated.
 333   // Literal strings are collected for identifiers, strings, and
 334   // numbers.
 335   // These functions only give the correct result if the literal
 336   // was scanned between calls to StartLiteral() and TerminateLiteral().
 337   Vector<const char> literal_ascii_string() {
 338     ASSERT_NOT_NULL(current_.literal_chars);
 339     return current_.literal_chars->ascii_literal();
 340   }
 341   Vector<const uc16> literal_utf16_string() {
 342     ASSERT_NOT_NULL(current_.literal_chars);
 343     return current_.literal_chars->utf16_literal();
 344   }
 345   bool is_literal_ascii() {
 346     ASSERT_NOT_NULL(current_.literal_chars);
 347     return current_.literal_chars->is_ascii();
 348   }
 349   int literal_length() const {
 350     ASSERT_NOT_NULL(current_.literal_chars);
 351     return current_.literal_chars->length();
 352   }
 353
 354   bool literal_contains_escapes() const {
 355     Location location = current_.location;
 356     int source_length = (location.end_pos - location.beg_pos);
 357     if (current_.token == Token::STRING) {
 358       // Subtract delimiters.
 359       source_length -= 2;
 360     }
 361     return current_.literal_chars->length() != source_length;
 362   }
 363
 364   // Similar functions for the upcoming token.
 365
 366   // One token look-ahead (past the token returned by Next()).
 367   Token::Value peek() const { return next_.token; }
 368
 369   Location peek_location() const { return next_.location; }
 370
 371   // Returns the literal string for the next token (the token that
 372   // would be returned if Next() were called).
 373   Vector<const char> next_literal_ascii_string() {
 374     ASSERT_NOT_NULL(next_.literal_chars);
 375     return next_.literal_chars->ascii_literal();
 376   }
 377   Vector<const uc16> next_literal_utf16_string() {
 378     ASSERT_NOT_NULL(next_.literal_chars);
 379     return next_.literal_chars->utf16_literal();
 380   }
 381   bool is_next_literal_ascii() {
 382     ASSERT_NOT_NULL(next_.literal_chars);
 383     return next_.literal_chars->is_ascii();
 384   }
 385   int next_literal_length() const {
 386     ASSERT_NOT_NULL(next_.literal_chars);
 387     return next_.literal_chars->length();
 388   }
 389
 390   UnicodeCache* unicode_cache() { return unicode_cache_; }
 391
 392   static const int kCharacterLookaheadBufferSize = 1;
 393
 394   // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
 395   uc32 ScanOctalEscape(uc32 c, int length);
 396
 397   // Returns the location of the last seen octal literal.
 398   Location octal_position() const { return octal_pos_; }
 399   void clear_octal_position() { octal_pos_ = Location::invalid(); }
 400
 401   // Seek forward to the given position.  This operation does not
 402   // work in general, for instance when there are pushed back
 403   // characters, but works for seeking forward until simple delimiter
 404   // tokens, which is what it is used for.
 405   void SeekForward(int pos);
 406
 407   bool HarmonyScoping() const {
 408     return harmony_scoping_;
 409   }
 410   void SetHarmonyScoping(bool scoping) {
 411     harmony_scoping_ = scoping;
 412   }
 413   bool HarmonyModules() const {
 414     return harmony_modules_;
 415   }
 416   void SetHarmonyModules(bool modules) {
 417     harmony_modules_ = modules;
 418   }
 419
 420
 421   // Returns true if there was a line terminator before the peek'ed token,
 422   // possibly inside a multi-line comment.
 423   bool HasAnyLineTerminatorBeforeNext() const {
 424     return has_line_terminator_before_next_ ||
 425            has_multiline_comment_before_next_;
 426   }
 427
 428   // Scans the input as a regular expression pattern, previous
 429   // character(s) must be /(=). Returns true if a pattern is scanned.
 430   bool ScanRegExpPattern(bool seen_equal);
 431   // Returns true if regexp flags are scanned (always since flags can
 432   // be empty).
 433   bool ScanRegExpFlags();
 434
 435   // Tells whether the buffer contains an identifier (no escapes).
 436   // Used for checking if a property name is an identifier.
 437   static bool IsIdentifier(unibrow::CharacterStream* buffer);
 438
 439  private:
 440   // The current and look-ahead token.
 441   struct TokenDesc {
 442     Token::Value token;
 443     Location location;
 444     LiteralBuffer* literal_chars;
 445   };
 446
 447   // Call this after setting source_ to the input.
 448   void Init() {
 449     // Set c0_ (one character ahead)
 450     STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
 451     Advance();
 452     // Initialize current_ to not refer to a literal.
 453     current_.literal_chars = NULL;
 454   }
 455
 456   // Literal buffer support
 457   inline void StartLiteral() {
 458     LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
 459             &literal_buffer2_ : &literal_buffer1_;
 460     free_buffer->Reset();
 461     next_.literal_chars = free_buffer;
 462   }
 463
 464   INLINE(void AddLiteralChar(uc32 c)) {
 465     ASSERT_NOT_NULL(next_.literal_chars);
 466     next_.literal_chars->AddChar(c);
 467   }
 468
 469   // Complete scanning of a literal.
 470   inline void TerminateLiteral() {
 471     // Does nothing in the current implementation.
 472   }
 473
 474   // Stops scanning of a literal and drop the collected characters,
 475   // e.g., due to an encountered error.
 476   inline void DropLiteral() {
 477     next_.literal_chars = NULL;
 478   }
 479
 480   inline void AddLiteralCharAdvance() {
 481     AddLiteralChar(c0_);
 482     Advance();
 483   }
 484
 485   // Low-level scanning support.
 486   void Advance() { c0_ = source_->Advance(); }
 487   void PushBack(uc32 ch) {
 488     source_->PushBack(c0_);
 489     c0_ = ch;
 490   }
 491
 492   inline Token::Value Select(Token::Value tok) {
 493     Advance();
 494     return tok;
 495   }
 496
 497   inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
 498     Advance();
 499     if (c0_ == next) {
 500       Advance();
 501       return then;
 502     } else {
 503       return else_;
 504     }
 505   }
 506
 507   uc32 ScanHexNumber(int expected_length);
 508
 509   // Scans a single JavaScript token.
 510   void Scan();
 511
 512   bool SkipWhiteSpace();
 513   Token::Value SkipSingleLineComment();
 514   Token::Value SkipMultiLineComment();
 515   // Scans a possible HTML comment -- begins with '<!'.
 516   Token::Value ScanHtmlComment();
 517
 518   void ScanDecimalDigits();
 519   Token::Value ScanNumber(bool seen_period);
 520   Token::Value ScanIdentifierOrKeyword();
 521   Token::Value ScanIdentifierSuffix(LiteralScope* literal);
 522
 523   Token::Value ScanString();
 524
 525   // Scans an escape-sequence which is part of a string and adds the
 526   // decoded character to the current literal. Returns true if a pattern
 527   // is scanned.
 528   bool ScanEscape();
 529   // Decodes a Unicode escape-sequence which is part of an identifier.
 530   // If the escape sequence cannot be decoded the result is kBadChar.
 531   uc32 ScanIdentifierUnicodeEscape();
 532   // Scans a Unicode escape-sequence and adds its characters,
 533   // uninterpreted, to the current literal. Used for parsing RegExp
 534   // flags.
 535   bool ScanLiteralUnicodeEscape();
 536
 537   // Return the current source position.
 538   int source_pos() {
 539     return source_->pos() - kCharacterLookaheadBufferSize;
 540   }
 541
 542   UnicodeCache* unicode_cache_;
 543
 544   // Buffers collecting literal strings, numbers, etc.
 545   LiteralBuffer literal_buffer1_;
 546   LiteralBuffer literal_buffer2_;
 547
 548   TokenDesc current_;  // desc for current token (as returned by Next())
 549   TokenDesc next_;     // desc for next token (one token look-ahead)
 550
 551   // Input stream. Must be initialized to an Utf16CharacterStream.
 552   Utf16CharacterStream* source_;
 553
 554
 555   // Start position of the octal literal last scanned.
 556   Location octal_pos_;
 557
 558   // One Unicode character look-ahead; c0_ < 0 at the end of the input.
 559   uc32 c0_;
 560
 561   // Whether there is a line terminator whitespace character after
 562   // the current token, and  before the next. Does not count newlines
 563   // inside multiline comments.
 564   bool has_line_terminator_before_next_;
 565   // Whether there is a multi-line comment that contains a
 566   // line-terminator after the current token, and before the next.
 567   bool has_multiline_comment_before_next_;
 568   // Whether we scan 'let' as a keyword for harmony block-scoped let bindings.
 569   bool harmony_scoping_;
 570   // Whether we scan 'module', 'import', 'export' as keywords.
 571   bool harmony_modules_;
 572 };
 573
 574 } }  // namespace v8::internal
 575
 576 #endif  // V8_SCANNER_H_