1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Features shared by parsing and pre-parsing scanners.
10 #include "src/allocation.h"
11 #include "src/base/logging.h"
12 #include "src/char-predicates.h"
13 #include "src/globals.h"
14 #include "src/hashmap.h"
16 #include "src/token.h"
17 #include "src/unicode-inl.h"
18 #include "src/unicode-decoder.h"
19 #include "src/utils.h"
26 class AstValueFactory;
30 // Returns the value (0 .. 15) of a hexadecimal character c.
31 // If c is not a legal hexadecimal character, returns a value < 0.
// Both range checks below cast to unsigned so that a negative c wraps to a
// large value and fails the "<=" test without a separate "c >= 0" comparison.
32 inline int HexValue(uc32 c) {
// NOTE(review): returning c unchanged for 0..9 implies c has already been
// rebased so that '0' maps to 0; that step is not visible in this excerpt.
34 if (static_cast<unsigned>(c) <= 9) return c;
// OR-ing in 0x20 folds upper-case letters onto lower-case ones, so a single
// subtraction and range check covers both 'A'..'F' and 'a'..'f'.
35 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
// Offsets 0..5 stand for the digits a..f, i.e. the values 10..15.
36 if (static_cast<unsigned>(c) <= 5) return c + 10;
41 // ---------------------------------------------------------------------
42 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
43 // A code unit is a 16 bit value representing either a 16 bit code point
44 // or one part of a surrogate pair that makes a single 21 bit code point.
46 class Utf16CharacterStream {
48 Utf16CharacterStream() : pos_(0) { }
49 virtual ~Utf16CharacterStream() { }
51 // Returns and advances past the next UTF-16 code unit in the input
52 // stream. If there are no more code units, it returns a negative
54 inline uc32 Advance() {
// ReadBlock() is consulted only when the buffered block is exhausted; the
// short-circuit || keeps the common case to a single pointer comparison.
55 if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
// Hand out the code unit under the cursor, then step the cursor past it.
57 return static_cast<uc32>(*(buffer_cursor_++));
59 // Note: currently the following increment is necessary to avoid a
60 // parser problem! The scanner treats the final kEndOfInput as
61 // a code unit with a position, and does math relative to that
68 // Return the current position in the code unit stream.
70 inline unsigned pos() const { return pos_; }
72 // Skips forward past the next code_unit_count UTF-16 code units
73 // in the input, or until the end of input if that comes sooner.
74 // Returns the number of code units actually skipped. If less
75 // than code_unit_count, the end of the input was reached first.
76 inline unsigned SeekForward(unsigned code_unit_count) {
// Number of code units still unread in the current buffer block.
77 unsigned buffered_chars =
78 static_cast<unsigned>(buffer_end_ - buffer_cursor_);
// Fast path: the whole skip fits inside the buffered block, so only the
// cursor and the stream position need to move.
79 if (code_unit_count <= buffered_chars) {
80 buffer_cursor_ += code_unit_count;
81 pos_ += code_unit_count;
82 return code_unit_count;
// Otherwise defer to SlowSeekForward(), which subclasses must implement.
84 return SlowSeekForward(code_unit_count);
87 // Pushes back the most recently read UTF-16 code unit (or negative
88 // value if at end of input), i.e., the value returned by the most recent
90 // Must not be used right after calling SeekForward.
91 virtual void PushBack(int32_t code_unit) = 0;
94 static const uc32 kEndOfInput = -1;
96 // Ensures that the buffer_cursor_ points to the code_unit at
97 // position pos_ of the input, if possible. If the position
98 // is at or after the end of the input, return false. If there
99 // are more code_units available, return true.
100 virtual bool ReadBlock() = 0;
101 virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0;
103 const uint16_t* buffer_cursor_;
104 const uint16_t* buffer_end_;
109 // ---------------------------------------------------------------------
110 // Caching predicates used by scanners.
115 typedef unibrow::Utf8Decoder<512> Utf8Decoder;
117 StaticResource<Utf8Decoder>* utf8_decoder() {
118 return &utf8_decoder_;
121 bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
122 bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
123 bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
124 bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
125 bool IsWhiteSpaceOrLineTerminator(unibrow::uchar c) {
126 return kIsWhiteSpaceOrLineTerminator.get(c);
130 unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
131 unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
132 unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
133 unibrow::Predicate<WhiteSpace, 128> kIsWhiteSpace;
134 unibrow::Predicate<WhiteSpaceOrLineTerminator, 128>
135 kIsWhiteSpaceOrLineTerminator;
136 StaticResource<Utf8Decoder> utf8_decoder_;
138 DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
142 // ---------------------------------------------------------------------
143 // DuplicateFinder discovers duplicate symbols.
145 class DuplicateFinder {
147 explicit DuplicateFinder(UnicodeCache* constants)
148 : unicode_constants_(constants),
152 int AddOneByteSymbol(Vector<const uint8_t> key, int value);
153 int AddTwoByteSymbol(Vector<const uint16_t> key, int value);
154 // Add a number literal by converting it (if necessary)
155 // to the string that ToString(ToNumber(literal)) would generate,
156 // and then adding that string with AddOneByteSymbol.
157 // This string is the actual value used as key in an object literal,
158 // and the one that must be different from the other keys.
159 int AddNumber(Vector<const uint8_t> key, int value);
162 int AddSymbol(Vector<const uint8_t> key, bool is_one_byte, int value);
163 // Backs up the key and its length in the backing store.
164 // The backup is stored with a base 127 encoding of the
165 // length (plus a bit saying whether the string is one byte),
166 // followed by the bytes of the key.
167 uint8_t* BackupKey(Vector<const uint8_t> key, bool is_one_byte);
169 // Compare two encoded keys (both pointing into the backing store)
170 // for having the same base-127 encoded lengths and representation,
171 // and then having the same 'length' bytes following.
172 static bool Match(void* first, void* second);
173 // Creates a hash from a sequence of bytes.
174 static uint32_t Hash(Vector<const uint8_t> key, bool is_one_byte);
175 // Checks whether a string containing a JS number is its canonical
177 static bool IsNumberCanonical(Vector<const uint8_t> key);
179 // Size of buffer. Sufficient for using it to call DoubleToCString
180 // from conversions.h.
181 static const int kBufferSize = 100;
183 UnicodeCache* unicode_constants_;
184 // Backing store used to store strings used as hashmap keys.
185 SequenceCollector<unsigned char> backing_store_;
187 // Buffer used for string->number->canonical string conversions.
188 char number_buffer_[kBufferSize];
192 // ----------------------------------------------------------------------------
193 // LiteralBuffer - Collector of chars of literals.
195 class LiteralBuffer {
197 LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() { }
200 if (backing_store_.length() > 0) {
201 backing_store_.Dispose();
// Appends one character to the literal, growing the backing store first when
// position_ has reached its end. position_ advances in bytes: kOneByteSize
// for a single-byte write, kUC16Size for every 16-bit write.
// NOTE(review): the checks that choose between the one-byte and two-byte
// paths (and any switch from one to the other) are not visible in this
// excerpt.
205 INLINE(void AddChar(uint32_t code_unit)) {
206 if (position_ >= backing_store_.length()) ExpandBuffer();
// Values within the Latin1 range are stored as a single byte.
208 if (code_unit <= unibrow::Latin1::kMaxChar) {
209 backing_store_[position_] = static_cast<byte>(code_unit);
210 position_ += kOneByteSize;
// Values up to kMaxNonSurrogateCharCode are written as one 16-bit unit.
215 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
216 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
217 position_ += kUC16Size;
// Anything larger is split into a lead and a trail surrogate; the capacity
// is re-checked between the two 16-bit writes.
219 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
220 unibrow::Utf16::LeadSurrogate(code_unit);
221 position_ += kUC16Size;
222 if (position_ >= backing_store_.length()) ExpandBuffer();
223 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
224 unibrow::Utf16::TrailSurrogate(code_unit);
225 position_ += kUC16Size;
229 bool is_one_byte() const { return is_one_byte_; }
// Returns true when the buffer holds a one-byte literal whose length equals
// keyword.length() and whose bytes compare equal to keyword under memcmp.
231 bool is_contextual_keyword(Vector<const char> keyword) const {
232 return is_one_byte() && keyword.length() == position_ &&
233 (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
236 Vector<const uint16_t> two_byte_literal() const {
237 DCHECK(!is_one_byte_);
238 DCHECK((position_ & 0x1) == 0);
239 return Vector<const uint16_t>(
240 reinterpret_cast<const uint16_t*>(backing_store_.start()),
244 Vector<const uint8_t> one_byte_literal() const {
245 DCHECK(is_one_byte_);
246 return Vector<const uint8_t>(
247 reinterpret_cast<const uint8_t*>(backing_store_.start()),
252 return is_one_byte_ ? position_ : (position_ >> 1);
260 Handle<String> Internalize(Isolate* isolate) const;
263 static const int kInitialCapacity = 16;
264 static const int kGrowthFactory = 4;
265 static const int kMinConversionSlack = 256;
266 static const int kMaxGrowth = 1 * MB;
// Computes the size for a replacement backing store that must hold at least
// min_capacity bytes.
267 inline int NewCapacity(int min_capacity) {
// Never shrink: start from the larger of the request and the current size.
268 int capacity = Max(min_capacity, backing_store_.length());
// Multiply by kGrowthFactory, but cap a single growth step at kMaxGrowth
// additional bytes.
269 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
// Replaces the backing store with a larger one sized by NewCapacity(),
// preserving the position_ bytes written so far.
273 void ExpandBuffer() {
// kInitialCapacity is only a lower bound; NewCapacity() also takes the
// current store length into account.
274 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
275 MemCopy(new_store.start(), backing_store_.start(), position_);
// Release the old storage only after its contents have been copied out.
276 backing_store_.Dispose();
277 backing_store_ = new_store;
// Switches the buffer from one-byte to two-byte representation. Afterwards
// position_ is the byte size of the widened content and is_one_byte_ is
// false.
280 void ConvertToTwoByte() {
281 DCHECK(is_one_byte_);
282 Vector<byte> new_store;
// Every byte written so far becomes one kUC16Size-byte code unit.
283 int new_content_size = position_ * kUC16Size;
284 if (new_content_size >= backing_store_.length()) {
285 // Ensure room for all currently read code units as UC16 as well
286 // as the code unit about to be stored.
287 new_store = Vector<byte>::New(NewCapacity(new_content_size));
// Otherwise the existing store is large enough and is reused in place.
289 new_store = backing_store_;
291 uint8_t* src = backing_store_.start();
292 uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
// The walk runs from the last unit to the first. NOTE(review): presumably so
// that widening in place (new_store aliasing backing_store_) never overwrites
// a source byte before it is read; the loop body is not visible here.
293 for (int i = position_ - 1; i >= 0; i--) {
// Dispose of the old storage only if a new allocation actually replaced it.
296 if (new_store.start() != backing_store_.start()) {
297 backing_store_.Dispose();
298 backing_store_ = new_store;
300 position_ = new_content_size;
301 is_one_byte_ = false;
306 Vector<byte> backing_store_;
308 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
312 // ----------------------------------------------------------------------------
313 // JavaScript Scanner.
317 // Scoped helper for literal recording. Automatically drops the literal
318 // if aborting the scanning before it's complete.
321 explicit LiteralScope(Scanner* self)
322 : scanner_(self), complete_(false) {
323 scanner_->StartLiteral();
326 if (!complete_) scanner_->DropLiteral();
329 scanner_->TerminateLiteral();
338 // Representation of an interval of source positions.
340 Location(int b, int e) : beg_pos(b), end_pos(e) { }
341 Location() : beg_pos(0), end_pos(0) { }
343 bool IsValid() const {
344 return beg_pos >= 0 && end_pos >= beg_pos;
347 static Location invalid() { return Location(-1, -1); }
353 // -1 is outside of the range of any real source code.
354 static const int kNoOctalLocation = -1;
356 explicit Scanner(UnicodeCache* scanner_contants);
358 void Initialize(Utf16CharacterStream* source);
360 // Returns the next token and advances input.
362 // Returns the current token again.
363 Token::Value current_token() { return current_.token; }
364 // Returns the location information for the current token
365 // (the token last returned by Next()).
366 Location location() const { return current_.location; }
368 // Similar functions for the upcoming token.
370 // One token look-ahead (past the token returned by Next()).
371 Token::Value peek() const { return next_.token; }
373 Location peek_location() const { return next_.location; }
// Reports whether the current token's literal differs in length from the
// span of source text it was scanned from, which is taken as the sign that
// the source contained escape sequences.
375 bool literal_contains_escapes() const {
376 Location location = current_.location;
// Length of the token in the source, in position units.
377 int source_length = (location.end_pos - location.beg_pos);
378 if (current_.token == Token::STRING) {
379 // Subtract delimiters.
382 return current_.literal_chars->length() != source_length;
384 bool is_literal_contextual_keyword(Vector<const char> keyword) {
385 DCHECK_NOT_NULL(current_.literal_chars);
386 return current_.literal_chars->is_contextual_keyword(keyword);
388 bool is_next_contextual_keyword(Vector<const char> keyword) {
389 DCHECK_NOT_NULL(next_.literal_chars);
390 return next_.literal_chars->is_contextual_keyword(keyword);
393 const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory);
394 const AstRawString* NextSymbol(AstValueFactory* ast_value_factory);
396 double DoubleValue();
// Compares the current literal against the first `length` characters of
// `data`. The comparison is attempted only when the literal is one-byte, has
// exactly `length` characters and, if allow_escapes is false, contains no
// escapes according to literal_contains_escapes().
397 bool LiteralMatches(const char* data, int length, bool allow_escapes = true) {
398 if (is_literal_one_byte() &&
399 literal_length() == length &&
400 (allow_escapes || !literal_contains_escapes())) {
402 reinterpret_cast<const char*>(literal_one_byte_string().start());
// strncmp returns 0 on equality, hence the negation.
403 return !strncmp(token, data, length);
407 inline bool UnescapedLiteralMatches(const char* data, int length) {
408 return LiteralMatches(data, length, false);
// Classifies the current literal as the accessor keyword "get" or "set".
// Only a one-byte, three-character literal without escapes is examined.
// NOTE(review): in the visible lines *is_get and *is_set are written only
// inside the guard, so callers presumably initialise both beforehand; confirm.
411 void IsGetOrSet(bool* is_get, bool* is_set) {
412 if (is_literal_one_byte() &&
413 literal_length() == 3 &&
414 !literal_contains_escapes()) {
416 reinterpret_cast<const char*>(literal_one_byte_string().start());
417 *is_get = strncmp(token, "get", 3) == 0;
// The second comparison is skipped once "get" has matched, so the two
// outputs can never both be true.
418 *is_set = !*is_get && strncmp(token, "set", 3) == 0;
422 int FindNumber(DuplicateFinder* finder, int value);
423 int FindSymbol(DuplicateFinder* finder, int value);
425 UnicodeCache* unicode_cache() { return unicode_cache_; }
427 // Returns the location of the last seen octal literal.
428 Location octal_position() const { return octal_pos_; }
429 void clear_octal_position() { octal_pos_ = Location::invalid(); }
431 // Seek forward to the given position. This operation does not
432 // work in general, for instance when there are pushed back
433 // characters, but works for seeking forward until simple delimiter
434 // tokens, which is what it is used for.
435 void SeekForward(int pos);
437 bool HarmonyScoping() const {
438 return harmony_scoping_;
440 void SetHarmonyScoping(bool scoping) {
441 harmony_scoping_ = scoping;
443 bool HarmonyModules() const {
444 return harmony_modules_;
446 void SetHarmonyModules(bool modules) {
447 harmony_modules_ = modules;
449 bool HarmonyNumericLiterals() const {
450 return harmony_numeric_literals_;
452 void SetHarmonyNumericLiterals(bool numeric_literals) {
453 harmony_numeric_literals_ = numeric_literals;
455 bool HarmonyClasses() const {
456 return harmony_classes_;
458 void SetHarmonyClasses(bool classes) {
459 harmony_classes_ = classes;
462 // Returns true if there was a line terminator before the peek'ed token,
463 // possibly inside a multi-line comment.
464 bool HasAnyLineTerminatorBeforeNext() const {
465 return has_line_terminator_before_next_ ||
466 has_multiline_comment_before_next_;
469 // Scans the input as a regular expression pattern, previous
470 // character(s) must be /(=). Returns true if a pattern is scanned.
471 bool ScanRegExpPattern(bool seen_equal);
472 // Returns true if regexp flags are scanned (always since flags can
474 bool ScanRegExpFlags();
476 const LiteralBuffer* source_url() const { return &source_url_; }
477 const LiteralBuffer* source_mapping_url() const {
478 return &source_mapping_url_;
481 bool IdentifierIsFutureStrictReserved(const AstRawString* string) const;
484 // The current and look-ahead token.
488 LiteralBuffer* literal_chars;
491 static const int kCharacterLookaheadBufferSize = 1;
493 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
494 uc32 ScanOctalEscape(uc32 c, int length);
496 // Call this after setting source_ to the input.
498 // Set c0_ (one character ahead)
499 STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
501 // Initialize current_ to not refer to a literal.
502 current_.literal_chars = NULL;
505 // Literal buffer support
// Begins recording a literal for the upcoming token. The scanner owns two
// literal buffers; whichever one current_ is not pointing at is reset and
// handed to next_, so the current token's literal stays intact.
506 inline void StartLiteral() {
507 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
508 &literal_buffer2_ : &literal_buffer1_;
509 free_buffer->Reset();
510 next_.literal_chars = free_buffer;
513 INLINE(void AddLiteralChar(uc32 c)) {
514 DCHECK_NOT_NULL(next_.literal_chars);
515 next_.literal_chars->AddChar(c);
518 // Complete scanning of a literal.
519 inline void TerminateLiteral() {
520 // Does nothing in the current implementation.
523 // Stops scanning of a literal and drops the collected characters,
524 // e.g., due to an encountered error.
525 inline void DropLiteral() {
526 next_.literal_chars = NULL;
529 inline void AddLiteralCharAdvance() {
534 // Low-level scanning support.
536 c0_ = source_->Advance();
537 if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
538 uc32 c1 = source_->Advance();
539 if (!unibrow::Utf16::IsTrailSurrogate(c1)) {
540 source_->PushBack(c1);
542 c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
// Returns code unit(s) to source_: two units on the surrogate-pair path
// (trail first, then lead), a single unit otherwise.
// NOTE(review): the branch condition tests the argument `ch`, whereas every
// unit pushed back is derived from the member `c0_`. If `ch` and `c0_` can
// fall on different sides of kMaxNonSurrogateCharCode, the wrong number of
// units is pushed back; confirm whether `c0_` is the intended operand.
547 void PushBack(uc32 ch) {
548 if (ch > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
549 source_->PushBack(unibrow::Utf16::TrailSurrogate(c0_));
550 source_->PushBack(unibrow::Utf16::LeadSurrogate(c0_));
552 source_->PushBack(c0_);
557 inline Token::Value Select(Token::Value tok) {
562 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
572 // Returns the literal string, if any, for the current token (the
573 // token last returned by Next()). The string is 0-terminated.
574 // Literal strings are collected for identifiers, strings, and
576 // These functions only give the correct result if the literal
577 // was scanned between calls to StartLiteral() and TerminateLiteral().
578 Vector<const uint8_t> literal_one_byte_string() {
579 DCHECK_NOT_NULL(current_.literal_chars);
580 return current_.literal_chars->one_byte_literal();
582 Vector<const uint16_t> literal_two_byte_string() {
583 DCHECK_NOT_NULL(current_.literal_chars);
584 return current_.literal_chars->two_byte_literal();
586 bool is_literal_one_byte() {
587 DCHECK_NOT_NULL(current_.literal_chars);
588 return current_.literal_chars->is_one_byte();
590 int literal_length() const {
591 DCHECK_NOT_NULL(current_.literal_chars);
592 return current_.literal_chars->length();
594 // Returns the literal string for the next token (the token that
595 // would be returned if Next() were called).
596 Vector<const uint8_t> next_literal_one_byte_string() {
597 DCHECK_NOT_NULL(next_.literal_chars);
598 return next_.literal_chars->one_byte_literal();
600 Vector<const uint16_t> next_literal_two_byte_string() {
601 DCHECK_NOT_NULL(next_.literal_chars);
602 return next_.literal_chars->two_byte_literal();
604 bool is_next_literal_one_byte() {
605 DCHECK_NOT_NULL(next_.literal_chars);
606 return next_.literal_chars->is_one_byte();
608 int next_literal_length() const {
609 DCHECK_NOT_NULL(next_.literal_chars);
610 return next_.literal_chars->length();
613 uc32 ScanHexNumber(int expected_length);
615 // Scans a single JavaScript token.
618 bool SkipWhiteSpace();
619 Token::Value SkipSingleLineComment();
620 Token::Value SkipSourceURLComment();
621 void TryToParseSourceURLComment();
622 Token::Value SkipMultiLineComment();
623 // Scans a possible HTML comment -- begins with '<!'.
624 Token::Value ScanHtmlComment();
626 void ScanDecimalDigits();
627 Token::Value ScanNumber(bool seen_period);
628 Token::Value ScanIdentifierOrKeyword();
629 Token::Value ScanIdentifierSuffix(LiteralScope* literal);
631 Token::Value ScanString();
633 // Scans an escape-sequence which is part of a string and adds the
634 // decoded character to the current literal. Returns true if a pattern
637 // Decodes a Unicode escape-sequence which is part of an identifier.
638 // If the escape sequence cannot be decoded the result is kBadChar.
639 uc32 ScanIdentifierUnicodeEscape();
640 // Scans a Unicode escape-sequence and adds its characters,
641 // uninterpreted, to the current literal. Used for parsing RegExp
643 bool ScanLiteralUnicodeEscape();
645 // Return the current source position.
647 return source_->pos() - kCharacterLookaheadBufferSize;
650 UnicodeCache* unicode_cache_;
652 // Buffers collecting literal strings, numbers, etc.
653 LiteralBuffer literal_buffer1_;
654 LiteralBuffer literal_buffer2_;
656 // Values parsed from magic comments.
657 LiteralBuffer source_url_;
658 LiteralBuffer source_mapping_url_;
660 TokenDesc current_; // desc for current token (as returned by Next())
661 TokenDesc next_; // desc for next token (one token look-ahead)
663 // Input stream. Must be initialized to a Utf16CharacterStream.
664 Utf16CharacterStream* source_;
667 // Start position of the octal literal last scanned.
670 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
673 // Whether there is a line terminator whitespace character after
674 // the current token, and before the next. Does not count newlines
675 // inside multiline comments.
676 bool has_line_terminator_before_next_;
677 // Whether there is a multi-line comment that contains a
678 // line-terminator after the current token, and before the next.
679 bool has_multiline_comment_before_next_;
680 // Whether we scan 'let' as a keyword for harmony block-scoped let bindings.
681 bool harmony_scoping_;
682 // Whether we scan 'module', 'import', 'export' as keywords.
683 bool harmony_modules_;
684 // Whether we scan 0o777 and 0b111 as numbers.
685 bool harmony_numeric_literals_;
686 // Whether we scan 'class', 'extends', 'static' and 'super' as keywords.
687 bool harmony_classes_;
690 } } // namespace v8::internal
692 #endif // V8_SCANNER_H_