1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Features shared by parsing and pre-parsing scanners.
10 #include "src/allocation.h"
11 #include "src/base/logging.h"
12 #include "src/char-predicates.h"
13 #include "src/globals.h"
14 #include "src/hashmap.h"
16 #include "src/token.h"
17 #include "src/unicode.h"
18 #include "src/unicode-decoder.h"
19 #include "src/utils.h"
26 class AstValueFactory;
31 // Returns the value (0 .. 15) of a hexadecimal character c.
32 // If c is not a legal hexadecimal character, returns a value < 0.
33 inline int HexValue(uc32 c) {
35 if (static_cast<unsigned>(c) <= 9) return c;
36 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
37 if (static_cast<unsigned>(c) <= 5) return c + 10;
42 // ---------------------------------------------------------------------
43 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
44 // A code unit is a 16 bit value representing either a 16 bit code point
45 // or one part of a surrogate pair that make a single 21 bit code point.
47 class Utf16CharacterStream {
// Starts the stream at code unit position 0.
49 Utf16CharacterStream() : pos_(0) { }
50 virtual ~Utf16CharacterStream() { }
52 // Returns and advances past the next UTF-16 code unit in the input
53 // stream. If there are no more code units, it returns a negative value.
55 inline uc32 Advance() {
// Read from the buffered range while it is non-empty; only when it is
// exhausted is the pure-virtual ReadBlock() asked for more input.
56 if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
58 return static_cast<uc32>(*(buffer_cursor_++));
60 // Note: currently the following increment is necessary to avoid a
61 // parser problem! The scanner treats the final kEndOfInput as
62 // a code unit with a position, and does math relative to that position.
69 // Return the current position in the code unit stream.
71 inline size_t pos() const { return pos_; }
73 // Skips forward past the next code_unit_count UTF-16 code units
74 // in the input, or until the end of input if that comes sooner.
75 // Returns the number of code units actually skipped. If less
76 // than code_unit_count, the end of the input was reached.
77 inline size_t SeekForward(size_t code_unit_count) {
78 size_t buffered_chars = buffer_end_ - buffer_cursor_;
// Fast path: the whole skip stays inside the buffered range, so only the
// cursor and the position have to move.
79 if (code_unit_count <= buffered_chars) {
80 buffer_cursor_ += code_unit_count;
81 pos_ += code_unit_count;
82 return code_unit_count;
// Otherwise the skip crosses the buffered range; defer to the pure-virtual
// slow path.
84 return SlowSeekForward(code_unit_count);
87 // Pushes back the most recently read UTF-16 code unit (or negative
88 // value if at end of input), i.e., the value returned by the most recent
// call to Advance().
90 // Must not be used right after calling SeekForward.
91 virtual void PushBack(int32_t code_unit) = 0;
// Bookmark hooks. They are virtual but not pure, so a definition exists
// outside this class body.
93 virtual bool SetBookmark();
94 virtual void ResetToBookmark();
// Negative marker for end of input; real code units are unsigned 16-bit
// values, so they can never compare equal to it.
97 static const uc32 kEndOfInput = -1;
99 // Ensures that the buffer_cursor_ points to the code_unit at
100 // position pos_ of the input, if possible. If the position
101 // is at or after the end of the input, return false. If there
102 // are more code_units available, return true.
103 virtual bool ReadBlock() = 0;
// Called by SeekForward() when the requested skip does not fit inside the
// buffered range.
104 virtual size_t SlowSeekForward(size_t code_unit_count) = 0;
// Next code unit to read, and one past the last buffered code unit.
106 const uint16_t* buffer_cursor_;
107 const uint16_t* buffer_end_;
112 // ---------------------------------------------------------------------
113 // DuplicateFinder discovers duplicate symbols.
115 class DuplicateFinder {
// Stores the given UnicodeCache pointer in unicode_constants_.
117 explicit DuplicateFinder(UnicodeCache* constants)
118 : unicode_constants_(constants),
// Entry points for symbols held as one-byte or two-byte character vectors.
// NOTE(review): the meaning of `value` and of the int result is not visible
// in this header - confirm against the implementation.
122 int AddOneByteSymbol(Vector<const uint8_t> key, int value);
123 int AddTwoByteSymbol(Vector<const uint16_t> key, int value);
124 // Add a number literal by converting it (if necessary)
125 // to the string that ToString(ToNumber(literal)) would generate,
126 // and then adding that string with AddOneByteSymbol.
127 // This string is the actual value used as key in an object literal,
128 // and the one that must be different from the other keys.
129 int AddNumber(Vector<const uint8_t> key, int value);
// Takes the key as raw bytes together with a flag telling whether those
// bytes hold one-byte characters.
132 int AddSymbol(Vector<const uint8_t> key, bool is_one_byte, int value);
133 // Backs up the key and its length in the backing store.
134 // The backup is stored with a base 127 encoding of the
135 // length (plus a bit saying whether the string is one byte),
136 // followed by the bytes of the key.
137 uint8_t* BackupKey(Vector<const uint8_t> key, bool is_one_byte);
139 // Compare two encoded keys (both pointing into the backing store)
140 // for having the same base-127 encoded lengths and representation,
141 // and then having the same 'length' bytes following.
142 static bool Match(void* first, void* second);
143 // Creates a hash from a sequence of bytes.
144 static uint32_t Hash(Vector<const uint8_t> key, bool is_one_byte);
145 // Checks whether a string containing a JS number is its canonical
// form.
147 static bool IsNumberCanonical(Vector<const uint8_t> key);
149 // Size of buffer. Sufficient for using it to call DoubleToCString
150 // from conversions.h.
151 static const int kBufferSize = 100;
// The UnicodeCache pointer handed to the constructor.
153 UnicodeCache* unicode_constants_;
154 // Backing store used to store strings used as hashmap keys.
155 SequenceCollector<unsigned char> backing_store_;
157 // Buffer used for string->number->canonical string conversions.
158 char number_buffer_[kBufferSize];
162 // ----------------------------------------------------------------------------
163 // LiteralBuffer - Collector of chars of literals.
165 class LiteralBuffer {
// A new buffer is empty, in one-byte mode, and owns no storage yet.
167 LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() { }
// Storage is only released when some was actually allocated.
170 if (backing_store_.length() > 0) {
171 backing_store_.Dispose();
// Appends one character. position_ is a byte offset into backing_store_: it
// advances by kOneByteSize for a one-byte store and by kUC16Size for every
// 16-bit code unit stored.
175 INLINE(void AddChar(uint32_t code_unit)) {
176 if (position_ >= backing_store_.length()) ExpandBuffer();
// Characters within the Latin1 range are stored as a single byte.
178 if (code_unit <= unibrow::Latin1::kMaxChar) {
179 backing_store_[position_] = static_cast<byte>(code_unit);
180 position_ += kOneByteSize;
// Values up to kMaxNonSurrogateCharCode fit into one 16-bit code unit.
185 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
186 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
187 position_ += kUC16Size;
// Larger values are stored as a lead/trail surrogate pair. The buffer is
// re-checked between the two stores because the lead surrogate may have
// used up the last free slot.
189 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
190 unibrow::Utf16::LeadSurrogate(code_unit);
191 position_ += kUC16Size;
192 if (position_ >= backing_store_.length()) ExpandBuffer();
193 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
194 unibrow::Utf16::TrailSurrogate(code_unit);
195 position_ += kUC16Size;
199 bool is_one_byte() const { return is_one_byte_; }
// True when the buffer is in one-byte mode and its bytes are exactly the
// bytes of `keyword` (same length, same content).
201 bool is_contextual_keyword(Vector<const char> keyword) const {
202 return is_one_byte() && keyword.length() == position_ &&
203 (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
// Views the stored bytes as 16-bit code units. Only valid in two-byte mode,
// where position_ must be even.
206 Vector<const uint16_t> two_byte_literal() const {
207 DCHECK(!is_one_byte_);
208 DCHECK((position_ & 0x1) == 0);
209 return Vector<const uint16_t>(
210 reinterpret_cast<const uint16_t*>(backing_store_.start()),
// Views the stored bytes as one-byte characters. Only valid in one-byte mode.
214 Vector<const uint8_t> one_byte_literal() const {
215 DCHECK(is_one_byte_);
216 return Vector<const uint8_t>(
217 reinterpret_cast<const uint8_t*>(backing_store_.start()),
// Length in characters/code units: the byte offset itself in one-byte mode,
// half of it in two-byte mode.
222 return is_one_byte_ ? position_ : (position_ >> 1);
// Shrinks the literal by `delta` characters/code units, converted to bytes
// according to the current mode.
225 void ReduceLength(int delta) {
226 position_ -= delta * (is_one_byte_ ? kOneByteSize : kUC16Size);
234 Handle<String> Internalize(Isolate* isolate) const;
// Makes this buffer a copy of `other`: mode and position are copied, the old
// storage is released and replaced by a clone of other's storage.
// NOTE(review): the handling of a null `other` is not visible here.
236 void CopyFrom(const LiteralBuffer* other) {
237 if (other == nullptr) {
240 is_one_byte_ = other->is_one_byte_;
241 position_ = other->position_;
242 backing_store_.Dispose();
243 backing_store_ = other->backing_store_.Clone();
// Growth policy constants used by NewCapacity() and ExpandBuffer().
248 static const int kInitialCapacity = 16;
249 static const int kGrowthFactory = 4;
250 static const int kMinConversionSlack = 256;
251 static const int kMaxGrowth = 1 * MB;
// Starts from the larger of min_capacity and the current length, then grows
// it by a factor of kGrowthFactory, but by no more than kMaxGrowth bytes.
252 inline int NewCapacity(int min_capacity) {
253 int capacity = Max(min_capacity, backing_store_.length());
254 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
// Moves the content into a freshly allocated, larger store and releases the
// old one. Only the first position_ bytes are copied.
258 void ExpandBuffer() {
259 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
260 MemCopy(new_store.start(), backing_store_.start(), position_);
261 backing_store_.Dispose();
262 backing_store_ = new_store;
// Switches the buffer from one-byte to two-byte mode; the content then
// occupies position_ * kUC16Size bytes.
265 void ConvertToTwoByte() {
266 DCHECK(is_one_byte_);
267 Vector<byte> new_store;
268 int new_content_size = position_ * kUC16Size;
269 if (new_content_size >= backing_store_.length()) {
270 // Ensure room for all currently read code units as UC16 as well
271 // as the code unit about to be stored.
272 new_store = Vector<byte>::New(NewCapacity(new_content_size));
// Enough room already: widen in place by aliasing the existing store.
274 new_store = backing_store_;
276 uint8_t* src = backing_store_.start();
277 uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
// Indices are visited from high to low.
// NOTE(review): presumably so that the in-place case never overwrites source
// bytes that have not been read yet - the loop body is not visible here.
278 for (int i = position_ - 1; i >= 0; i--) {
// The old store is only released when a new one was actually allocated.
281 if (new_store.start() != backing_store_.start()) {
282 backing_store_.Dispose();
283 backing_store_ = new_store;
285 position_ = new_content_size;
286 is_one_byte_ = false;
// Raw byte storage for the literal, in either mode.
291 Vector<byte> backing_store_;
293 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
297 // ----------------------------------------------------------------------------
298 // JavaScript Scanner.
302 // Scoped helper for literal recording. Automatically drops the literal
303 // if aborting the scanning before it's complete.
// Construction starts a literal on the scanner right away; the scope begins
// in the "not complete" state.
306 explicit LiteralScope(Scanner* self) : scanner_(self), complete_(false) {
307 scanner_->StartLiteral();
// A scope that was never marked complete discards the literal.
310 if (!complete_) scanner_->DropLiteral();
321 // Scoped helper for a re-settable bookmark.
322 class BookmarkScope {
// The scanner pointer must not be null; it is used by every member below.
324 explicit BookmarkScope(Scanner* scanner) : scanner_(scanner) {
325 DCHECK_NOT_NULL(scanner_);
// Leaving the scope always drops the scanner's bookmark.
327 ~BookmarkScope() { scanner_->DropBookmark(); }
// Thin forwards to the scanner's bookmark functions.
329 bool Set() { return scanner_->SetBookmark(); }
330 void Reset() { scanner_->ResetToBookmark(); }
331 bool HasBeenSet() { return scanner_->BookmarkHasBeenSet(); }
332 bool HasBeenReset() { return scanner_->BookmarkHasBeenReset(); }
337 DISALLOW_COPY_AND_ASSIGN(BookmarkScope);
340 // Representation of an interval of source positions.
// Interval from b (beg_pos) to e (end_pos).
342 Location(int b, int e) : beg_pos(b), end_pos(e) { }
// The default location is the empty interval at position 0.
343 Location() : beg_pos(0), end_pos(0) { }
// Valid means: non-negative start, and the end does not precede the start.
345 bool IsValid() const {
346 return beg_pos >= 0 && end_pos >= beg_pos;
// Returns a location with both positions at -1, for which IsValid() is false.
349 static Location invalid() { return Location(-1, -1); }
355 // -1 is outside of the range of any real source code.
356 static const int kNoOctalLocation = -1;
358 explicit Scanner(UnicodeCache* scanner_contants);
360 void Initialize(Utf16CharacterStream* source);
362 // Returns the next token and advances input.
364 // Returns the token following peek()
365 Token::Value PeekAhead();
366 // Returns the current token again.
367 Token::Value current_token() { return current_.token; }
368 // Returns the location information for the current token
369 // (the token last returned by Next()).
370 Location location() const { return current_.location; }
372 // Similar functions for the upcoming token.
374 // One token look-ahead (past the token returned by Next()).
375 Token::Value peek() const { return next_.token; }
// Location information for the look-ahead token held in next_.
377 Location peek_location() const { return next_.location; }
// Reports whether the current literal differs in length from the source text
// it was scanned from; a mismatch is taken to mean the source used escapes.
379 bool literal_contains_escapes() const {
380 Location location = current_.location;
// Number of source positions covered by the current token.
381 int source_length = (location.end_pos - location.beg_pos);
382 if (current_.token == Token::STRING) {
383 // Subtract delimiters.
386 return current_.literal_chars->length() != source_length;
// True when the current token's literal is exactly `keyword`. Requires the
// current token to have a literal buffer.
388 bool is_literal_contextual_keyword(Vector<const char> keyword) {
389 DCHECK_NOT_NULL(current_.literal_chars);
390 return current_.literal_chars->is_contextual_keyword(keyword);
// True when the look-ahead token's literal is exactly `keyword`. Requires the
// look-ahead token to have a literal buffer.
392 bool is_next_contextual_keyword(Vector<const char> keyword) {
393 DCHECK_NOT_NULL(next_.literal_chars);
394 return next_.literal_chars->is_contextual_keyword(keyword);
397 const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory);
398 const AstRawString* NextSymbol(AstValueFactory* ast_value_factory);
399 const AstRawString* CurrentRawSymbol(AstValueFactory* ast_value_factory);
401 double DoubleValue();
// Compares the current one-byte literal against the first `length` chars of
// `data`. With allow_escapes == false, a literal that contains escapes is
// never compared.
// NOTE(review): the result when the guard below fails is not visible here.
403 bool LiteralMatches(const char* data, int length, bool allow_escapes = true) {
// Only a one-byte literal of exactly `length` characters is compared.
404 if (is_literal_one_byte() &&
405 literal_length() == length &&
406 (allow_escapes || !literal_contains_escapes())) {
408 reinterpret_cast<const char*>(literal_one_byte_string().start());
// strncmp() returns 0 on equality, hence the negation.
409 return !strncmp(token, data, length);
// Same as LiteralMatches(), but with escapes disallowed (allow_escapes is
// passed as false).
413 inline bool UnescapedLiteralMatches(const char* data, int length) {
414 return LiteralMatches(data, length, false);
// Sets *is_get / *is_set according to whether the current literal is the
// escape-free, one-byte, three-character word "get" or "set". At most one of
// the two flags ends up true.
// NOTE(review): no assignment is visible for the case where the guard fails,
// so callers presumably initialise both flags beforehand - confirm.
417 void IsGetOrSet(bool* is_get, bool* is_set) {
418 if (is_literal_one_byte() &&
419 literal_length() == 3 &&
420 !literal_contains_escapes()) {
422 reinterpret_cast<const char*>(literal_one_byte_string().start());
423 *is_get = strncmp(token, "get", 3) == 0;
// "set" is only tested when the literal was not "get".
424 *is_set = !*is_get && strncmp(token, "set", 3) == 0;
428 int FindSymbol(DuplicateFinder* finder, int value);
// Returns the UnicodeCache pointer held by this scanner.
430 UnicodeCache* unicode_cache() { return unicode_cache_; }
432 // Returns the location of the last seen octal literal.
433 Location octal_position() const { return octal_pos_; }
// Resets the recorded octal location to Location::invalid().
434 void clear_octal_position() { octal_pos_ = Location::invalid(); }
436 // Returns the value of the last smi that was scanned.
437 int smi_value() const { return current_.smi_value_; }
439 // Seek forward to the given position. This operation does not
440 // work in general, for instance when there are pushed back
441 // characters, but works for seeking forward until simple delimiter
442 // tokens, which is what it is used for.
443 void SeekForward(int pos);
445 // Returns true if there was a line terminator before the peek'ed token,
446 // possibly inside a multi-line comment.
447 bool HasAnyLineTerminatorBeforeNext() const {
// Either flag is sufficient: a plain line terminator, or one inside a
// multi-line comment.
448 return has_line_terminator_before_next_ ||
449 has_multiline_comment_before_next_;
452 // Scans the input as a regular expression pattern, previous
453 // character(s) must be /(=). Returns true if a pattern is scanned.
454 bool ScanRegExpPattern(bool seen_equal);
455 // Returns true if regexp flags are scanned (always since flags can
// be empty).
457 bool ScanRegExpFlags();
459 // Scans the input as a template literal
460 Token::Value ScanTemplateStart();
461 Token::Value ScanTemplateContinuation();
// Read-only access to the source_url_ and source_mapping_url_ literal
// buffers. The returned pointers refer to members of this scanner.
463 const LiteralBuffer* source_url() const { return &source_url_; }
464 const LiteralBuffer* source_mapping_url() const {
465 return &source_mapping_url_;
468 bool IdentifierIsFutureStrictReserved(const AstRawString* string) const;
471 // The current and look-ahead token.
475 LiteralBuffer* literal_chars;
476 LiteralBuffer* raw_literal_chars;
480 static const int kCharacterLookaheadBufferSize = 1;
482 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
483 template <bool capture_raw>
484 uc32 ScanOctalEscape(uc32 c, int length);
486 // Call this after setting source_ to the input.
488 // Set c0_ (one character ahead)
489 STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
491 // Initialize current_ to not refer to a literal.
492 current_.literal_chars = NULL;
493 current_.raw_literal_chars = NULL;
494 next_next_.token = Token::UNINITIALIZED;
497 // Support BookmarkScope functionality.
499 void ResetToBookmark();
500 bool BookmarkHasBeenSet();
501 bool BookmarkHasBeenReset();
503 static void CopyTokenDesc(TokenDesc* to, TokenDesc* from);
505 // Literal buffer support
506 inline void StartLiteral() {
507 LiteralBuffer* free_buffer =
508 (current_.literal_chars == &literal_buffer0_)
510 : (current_.literal_chars == &literal_buffer1_) ? &literal_buffer2_
512 free_buffer->Reset();
513 next_.literal_chars = free_buffer;
// Selects a raw-literal buffer other than the one current_ refers to, resets
// it, and makes it the raw-literal buffer of next_.
516 inline void StartRawLiteral() {
517 LiteralBuffer* free_buffer =
// current_ on buffer 0 -> use buffer 1; on buffer 1 -> use buffer 2;
// anything else (including no buffer) -> use buffer 0.
518 (current_.raw_literal_chars == &raw_literal_buffer0_)
519 ? &raw_literal_buffer1_
520 : (current_.raw_literal_chars == &raw_literal_buffer1_)
521 ? &raw_literal_buffer2_
522 : &raw_literal_buffer0_;
523 free_buffer->Reset();
524 next_.raw_literal_chars = free_buffer;
// Appends c to the literal being recorded for next_. A literal buffer must
// have been assigned to next_ first.
527 INLINE(void AddLiteralChar(uc32 c)) {
528 DCHECK_NOT_NULL(next_.literal_chars);
529 next_.literal_chars->AddChar(c);
// Appends c to the raw literal being recorded for next_. A raw-literal
// buffer must have been assigned to next_ first.
532 INLINE(void AddRawLiteralChar(uc32 c)) {
533 DCHECK_NOT_NULL(next_.raw_literal_chars);
534 next_.raw_literal_chars->AddChar(c);
// Shortens the raw literal being recorded for next_ by `delta`, forwarding
// to LiteralBuffer::ReduceLength().
537 INLINE(void ReduceRawLiteralLength(int delta)) {
538 DCHECK_NOT_NULL(next_.raw_literal_chars);
539 next_.raw_literal_chars->ReduceLength(delta);
542 // Stops scanning of a literal and drop the collected characters,
543 // e.g., due to an encountered error.
544 inline void DropLiteral() {
// Both the cooked and the raw literal pointer of next_ are cleared; the
// buffers themselves are not touched here.
545 next_.literal_chars = NULL;
546 next_.raw_literal_chars = NULL;
549 inline void AddLiteralCharAdvance() {
554 // Low-level scanning support.
555 template <bool capture_raw = false, bool check_surrogate = true>
558 AddRawLiteralChar(c0_);
560 c0_ = source_->Advance();
561 if (check_surrogate) HandleLeadSurrogate();
// If c0_ is a lead surrogate, reads the following code unit and tries to
// combine the two into a single code point stored in c0_.
564 void HandleLeadSurrogate() {
565 if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
566 uc32 c1 = source_->Advance();
// A following unit that is not a trail surrogate is handed back to the
// stream.
567 if (!unibrow::Utf16::IsTrailSurrogate(c1)) {
568 source_->PushBack(c1);
570 c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
575 void PushBack(uc32 ch) {
576 if (ch > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
577 source_->PushBack(unibrow::Utf16::TrailSurrogate(c0_));
578 source_->PushBack(unibrow::Utf16::LeadSurrogate(c0_));
580 source_->PushBack(c0_);
585 inline Token::Value Select(Token::Value tok) {
590 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
600 // Returns the literal string, if any, for the current token (the
601 // token last returned by Next()). The string is 0-terminated.
602 // Literal strings are collected for identifiers, strings, numbers as well
603 // as for template literals. For template literals we also collect the raw
// form.
605 // These functions only give the correct result if the literal was scanned
606 // when a LiteralScope object is alive.
// All four accessors require current_ to have a literal buffer and forward
// to the corresponding LiteralBuffer member.
607 Vector<const uint8_t> literal_one_byte_string() {
608 DCHECK_NOT_NULL(current_.literal_chars);
609 return current_.literal_chars->one_byte_literal();
611 Vector<const uint16_t> literal_two_byte_string() {
612 DCHECK_NOT_NULL(current_.literal_chars);
613 return current_.literal_chars->two_byte_literal();
615 bool is_literal_one_byte() {
616 DCHECK_NOT_NULL(current_.literal_chars);
617 return current_.literal_chars->is_one_byte();
619 int literal_length() const {
620 DCHECK_NOT_NULL(current_.literal_chars);
621 return current_.literal_chars->length();
623 // Returns the literal string for the next token (the token that
624 // would be returned if Next() were called).
// These accessors require next_ to have a literal buffer and forward to the
// corresponding LiteralBuffer member.
625 Vector<const uint8_t> next_literal_one_byte_string() {
626 DCHECK_NOT_NULL(next_.literal_chars);
627 return next_.literal_chars->one_byte_literal();
629 Vector<const uint16_t> next_literal_two_byte_string() {
630 DCHECK_NOT_NULL(next_.literal_chars);
631 return next_.literal_chars->two_byte_literal();
633 bool is_next_literal_one_byte() {
634 DCHECK_NOT_NULL(next_.literal_chars);
635 return next_.literal_chars->is_one_byte();
// Accessors for the raw literal of the current token. They require current_
// to have a raw-literal buffer and forward to the corresponding
// LiteralBuffer member.
637 Vector<const uint8_t> raw_literal_one_byte_string() {
638 DCHECK_NOT_NULL(current_.raw_literal_chars);
639 return current_.raw_literal_chars->one_byte_literal();
641 Vector<const uint16_t> raw_literal_two_byte_string() {
642 DCHECK_NOT_NULL(current_.raw_literal_chars);
643 return current_.raw_literal_chars->two_byte_literal();
645 bool is_raw_literal_one_byte() {
646 DCHECK_NOT_NULL(current_.raw_literal_chars);
647 return current_.raw_literal_chars->is_one_byte();
650 template <bool capture_raw>
651 uc32 ScanHexNumber(int expected_length);
652 // Scan a number of any length but not bigger than max_value. For example, the
653 // number can be 000000001, so it's very long in characters but its value is
// small.
655 template <bool capture_raw>
656 uc32 ScanUnlimitedLengthHexNumber(int max_value);
658 // Scans a single JavaScript token.
661 bool SkipWhiteSpace();
662 Token::Value SkipSingleLineComment();
663 Token::Value SkipSourceURLComment();
664 void TryToParseSourceURLComment();
665 Token::Value SkipMultiLineComment();
666 // Scans a possible HTML comment -- begins with '<!'.
667 Token::Value ScanHtmlComment();
669 void ScanDecimalDigits();
670 Token::Value ScanNumber(bool seen_period);
671 Token::Value ScanIdentifierOrKeyword();
672 Token::Value ScanIdentifierSuffix(LiteralScope* literal);
674 Token::Value ScanString();
676 // Scans an escape-sequence which is part of a string and adds the
677 // decoded character to the current literal. Returns true if a pattern
679 template <bool capture_raw, bool in_template_literal>
682 // Decodes a Unicode escape-sequence which is part of an identifier.
683 // If the escape sequence cannot be decoded the result is kBadChar.
684 uc32 ScanIdentifierUnicodeEscape();
685 // Helper for the above functions.
686 template <bool capture_raw>
687 uc32 ScanUnicodeEscape();
689 Token::Value ScanTemplateSpan();
691 // Return the current source position.
693 return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize;
696 UnicodeCache* unicode_cache_;
698 // Buffers collecting literal strings, numbers, etc.
699 LiteralBuffer literal_buffer0_;
700 LiteralBuffer literal_buffer1_;
701 LiteralBuffer literal_buffer2_;
703 // Values parsed from magic comments.
704 LiteralBuffer source_url_;
705 LiteralBuffer source_mapping_url_;
707 // Buffer to store raw string values
708 LiteralBuffer raw_literal_buffer0_;
709 LiteralBuffer raw_literal_buffer1_;
710 LiteralBuffer raw_literal_buffer2_;
712 TokenDesc current_; // desc for current token (as returned by Next())
713 TokenDesc next_; // desc for next token (one token look-ahead)
714 TokenDesc next_next_; // desc for the token after next (after PeekAhead())
716 // Variables for Scanner::BookmarkScope and the *Bookmark implementation.
717 // These variables contain the scanner state when a bookmark is set.
719 // We will use bookmark_c0_ as a 'control' variable, where:
720 // - bookmark_c0_ >= 0: A bookmark has been set and this contains c0_.
721 // - bookmark_c0_ == -1: No bookmark has been set.
722 // - bookmark_c0_ == -2: The bookmark has been applied (ResetToBookmark).
724 // Which state is being bookmarked? The parser state is distributed over
725 // several variables, roughly like this:
726 // ... 1234 + 5678 ..... [character stream]
727 // [current_] [next_] c0_ | [scanner state]
728 // So when the scanner is logically at the beginning of an expression
729 // like "1234 + 5678", then:
730 // - current_ contains "1234"
731 // - next_ contains "+"
732 // - c0_ contains ' ' (the space between "+" and "5678"),
733 // - the source_ character stream points to the beginning of "5678".
734 // To be able to restore this state, we will keep copies of current_, next_,
735 // and c0_; we'll ask the stream to bookmark itself, and we'll copy the
736 // contents of current_'s and next_'s literal buffers to bookmark_*_literal_.
737 static const uc32 kNoBookmark = -1;
738 static const uc32 kBookmarkWasApplied = -2;
740 TokenDesc bookmark_current_;
741 TokenDesc bookmark_next_;
742 LiteralBuffer bookmark_current_literal_;
743 LiteralBuffer bookmark_current_raw_literal_;
744 LiteralBuffer bookmark_next_literal_;
745 LiteralBuffer bookmark_next_raw_literal_;
747 // Input stream. Must be initialized to an Utf16CharacterStream.
748 Utf16CharacterStream* source_;
751 // Start position of the octal literal last scanned.
754 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
757 // Whether there is a line terminator whitespace character after
758 // the current token, and before the next. Does not count newlines
759 // inside multiline comments.
760 bool has_line_terminator_before_next_;
761 // Whether there is a multi-line comment that contains a
762 // line-terminator after the current token, and before the next.
763 bool has_multiline_comment_before_next_;
766 } // namespace internal
769 #endif // V8_SCANNER_H_