From 0a0e6c8c81f2a59323e5afb449971f71004b17aa Mon Sep 17 00:00:00 2001 From: marja Date: Tue, 2 Dec 2014 02:58:11 -0800 Subject: [PATCH] ES6 unicode extensions, part 1. Allows \u{xxxxx} in variable names and string literals (not yet in regexps). Everything's behind the --harmony-unicode flag. BUG= Review URL: https://codereview.chromium.org/716423002 Cr-Commit-Position: refs/heads/master@{#25603} --- src/bootstrapper.cc | 3 ++ src/flag-definitions.h | 21 +++++++------- src/parser.cc | 2 ++ src/preparser.h | 4 +++ src/scanner.cc | 41 ++++++++++++++++++++++++-- src/scanner.h | 10 +++++++ test/cctest/test-parsing.cc | 51 +++++++++++++++++++++++++++++++-- test/mjsunit/harmony/unicode-escapes.js | 46 +++++++++++++++++++++++++++++ 8 files changed, 164 insertions(+), 14 deletions(-) create mode 100644 test/mjsunit/harmony/unicode-escapes.js diff --git a/src/bootstrapper.cc b/src/bootstrapper.cc index 5fed349..052d9f6 100644 --- a/src/bootstrapper.cc +++ b/src/bootstrapper.cc @@ -1590,6 +1590,7 @@ EMPTY_NATIVE_FUNCTIONS_FOR_FEATURE(harmony_numeric_literals) EMPTY_NATIVE_FUNCTIONS_FOR_FEATURE(harmony_tostring) EMPTY_NATIVE_FUNCTIONS_FOR_FEATURE(harmony_templates) EMPTY_NATIVE_FUNCTIONS_FOR_FEATURE(harmony_sloppy) +EMPTY_NATIVE_FUNCTIONS_FOR_FEATURE(harmony_unicode) void Genesis::InstallNativeFunctions_harmony_proxies() { @@ -1618,6 +1619,7 @@ EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_tostring) EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_proxies) EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_templates) EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_sloppy) +EMPTY_INITIALIZE_GLOBAL_FOR_FEATURE(harmony_unicode) void Genesis::InitializeGlobal_harmony_regexps() { Handle builtins(native_context()->builtins()); @@ -2176,6 +2178,7 @@ bool Genesis::InstallExperimentalNatives() { static const char* harmony_templates_natives[] = { "native harmony-templates.js", NULL}; static const char* harmony_sloppy_natives[] = {NULL}; + static const char* harmony_unicode_natives[] = {NULL}; for (int i 
= ExperimentalNatives::GetDebuggerCount(); i < ExperimentalNatives::GetBuiltinsCount(); i++) { diff --git a/src/flag-definitions.h b/src/flag-definitions.h index d1e74b6..55b4bde 100644 --- a/src/flag-definitions.h +++ b/src/flag-definitions.h @@ -162,17 +162,18 @@ DEFINE_IMPLICATION(harmony, es_staging) DEFINE_IMPLICATION(es_staging, harmony) // Features that are still work in progress (behind individual flags). -#define HARMONY_INPROGRESS(V) \ - V(harmony_modules, "harmony modules (implies block scoping)") \ - V(harmony_arrays, "harmony array methods") \ - V(harmony_classes, \ +#define HARMONY_INPROGRESS(V) \ + V(harmony_modules, "harmony modules (implies block scoping)") \ + V(harmony_arrays, "harmony array methods") \ + V(harmony_classes, \ "harmony classes (implies block scoping & object literal extension)") \ - V(harmony_object_literals, "harmony object literal extensions") \ - V(harmony_regexps, "harmony regular expression extensions") \ - V(harmony_arrow_functions, "harmony arrow functions") \ - V(harmony_proxies, "harmony proxies") \ - V(harmony_templates, "harmony template literals") \ - V(harmony_sloppy, "harmony features in sloppy mode") + V(harmony_object_literals, "harmony object literal extensions") \ + V(harmony_regexps, "harmony regular expression extensions") \ + V(harmony_arrow_functions, "harmony arrow functions") \ + V(harmony_proxies, "harmony proxies") \ + V(harmony_templates, "harmony template literals") \ + V(harmony_sloppy, "harmony features in sloppy mode") \ + V(harmony_unicode, "harmony unicode escapes") // Features that are complete (but still behind --harmony/es-staging flag). 
#define HARMONY_STAGED(V) \ diff --git a/src/parser.cc b/src/parser.cc index 83fb594..6fcca20 100644 --- a/src/parser.cc +++ b/src/parser.cc @@ -805,6 +805,7 @@ Parser::Parser(CompilationInfo* info, ParseInfo* parse_info) set_allow_harmony_object_literals(FLAG_harmony_object_literals); set_allow_harmony_templates(FLAG_harmony_templates); set_allow_harmony_sloppy(FLAG_harmony_sloppy); + set_allow_harmony_unicode(FLAG_harmony_unicode); for (int feature = 0; feature < v8::Isolate::kUseCounterFeatureCount; ++feature) { use_counts_[feature] = 0; @@ -3974,6 +3975,7 @@ PreParser::PreParseResult Parser::ParseLazyFunctionBodyWithPreParser( allow_harmony_object_literals()); reusable_preparser_->set_allow_harmony_templates(allow_harmony_templates()); reusable_preparser_->set_allow_harmony_sloppy(allow_harmony_sloppy()); + reusable_preparser_->set_allow_harmony_unicode(allow_harmony_unicode()); } PreParser::PreParseResult result = reusable_preparser_->PreParseLazyFunction(strict_mode(), diff --git a/src/preparser.h b/src/preparser.h index b50019c..cef5b94 100644 --- a/src/preparser.h +++ b/src/preparser.h @@ -107,6 +107,7 @@ class ParserBase : public Traits { } bool allow_harmony_templates() const { return scanner()->HarmonyTemplates(); } bool allow_harmony_sloppy() const { return allow_harmony_sloppy_; } + bool allow_harmony_unicode() const { return scanner()->HarmonyUnicode(); } // Setters that determine whether certain syntactical constructs are // allowed to be parsed by this instance of the parser. 
@@ -136,6 +137,9 @@ class ParserBase : public Traits { void set_allow_harmony_sloppy(bool allow) { allow_harmony_sloppy_ = allow; } + void set_allow_harmony_unicode(bool allow) { + scanner()->SetHarmonyUnicode(allow); + } protected: enum AllowEvalOrArgumentsAsIdentifier { diff --git a/src/scanner.cc b/src/scanner.cc index 3214c6f..6ce222c 100644 --- a/src/scanner.cc +++ b/src/scanner.cc @@ -39,7 +39,8 @@ Scanner::Scanner(UnicodeCache* unicode_cache) harmony_modules_(false), harmony_numeric_literals_(false), harmony_classes_(false), - harmony_templates_(false) {} + harmony_templates_(false), + harmony_unicode_(false) {} void Scanner::Initialize(Utf16CharacterStream* source) { @@ -72,6 +73,22 @@ uc32 Scanner::ScanHexNumber(int expected_length) { } +uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value) { + uc32 x = 0; + int d = HexValue(c0_); + if (d < 0) { + return -1; + } + while (d >= 0) { + x = x * 16 + d; + if (x > max_value) return -1; + Advance(); + d = HexValue(c0_); + } + return x; +} + + // Ensure that tokens can be stored in a byte. STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); @@ -700,7 +717,7 @@ bool Scanner::ScanEscape() { case 'r' : c = '\r'; break; case 't' : c = '\t'; break; case 'u' : { - c = ScanHexNumber(4); + c = ScanUnicodeEscape(); if (c < 0) return false; break; } @@ -964,6 +981,26 @@ uc32 Scanner::ScanIdentifierUnicodeEscape() { Advance(); if (c0_ != 'u') return -1; Advance(); + return ScanUnicodeEscape(); +} + + +uc32 Scanner::ScanUnicodeEscape() { + // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are + // allowed). In the latter case, the number of hex digits between { } is + // arbitrary. \ and u have already been read. 
+ if (c0_ == '{' && HarmonyUnicode()) { + Advance(); + uc32 cp = ScanUnlimitedLengthHexNumber(0x10ffff); + if (cp < 0) { + return -1; + } + if (c0_ != '}') { + return -1; + } + Advance(); + return cp; + } return ScanHexNumber(4); } diff --git a/src/scanner.h b/src/scanner.h index 46e6d32..446355f 100644 --- a/src/scanner.h +++ b/src/scanner.h @@ -460,6 +460,8 @@ class Scanner { } bool HarmonyTemplates() const { return harmony_templates_; } void SetHarmonyTemplates(bool templates) { harmony_templates_ = templates; } + bool HarmonyUnicode() const { return harmony_unicode_; } + void SetHarmonyUnicode(bool unicode) { harmony_unicode_ = unicode; } // Returns true if there was a line terminator before the peek'ed token, // possibly inside a multi-line comment. @@ -616,6 +618,10 @@ class Scanner { } uc32 ScanHexNumber(int expected_length); + // Scan a number of any length but not bigger than max_value. For example, the + // number can be 000000001, so it's very long in characters but its value is + // small. + uc32 ScanUnlimitedLengthHexNumber(int max_value); // Scans a single JavaScript token. void Scan(); @@ -642,6 +648,8 @@ class Scanner { // Decodes a Unicode escape-sequence which is part of an identifier. // If the escape sequence cannot be decoded the result is kBadChar. uc32 ScanIdentifierUnicodeEscape(); + // Helper for the above functions. + uc32 ScanUnicodeEscape(); // Return the current source position. int source_pos() { @@ -688,6 +696,8 @@ class Scanner { bool harmony_classes_; // Whether we scan TEMPLATE_SPAN and TEMPLATE_TAIL bool harmony_templates_; + // Whether we allow \u{xxxxx}. 
+ bool harmony_unicode_; }; } } // namespace v8::internal diff --git a/test/cctest/test-parsing.cc b/test/cctest/test-parsing.cc index 4e2f828..7da5dac 100644 --- a/test/cctest/test-parsing.cc +++ b/test/cctest/test-parsing.cc @@ -1357,7 +1357,8 @@ enum ParserFlag { kAllowHarmonyClasses, kAllowHarmonyObjectLiterals, kAllowHarmonyTemplates, - kAllowHarmonySloppy + kAllowHarmonySloppy, + kAllowHarmonyUnicode }; @@ -1383,6 +1384,7 @@ void SetParserFlags(i::ParserBase* parser, parser->set_allow_harmony_classes(flags.Contains(kAllowHarmonyClasses)); parser->set_allow_harmony_templates(flags.Contains(kAllowHarmonyTemplates)); parser->set_allow_harmony_sloppy(flags.Contains(kAllowHarmonySloppy)); + parser->set_allow_harmony_unicode(flags.Contains(kAllowHarmonyUnicode)); } @@ -1693,6 +1695,7 @@ void RunParserSyncTest(const char* context_data[][2], kAllowHarmonyModules, kAllowHarmonyTemplates, kAllowHarmonySloppy, + kAllowHarmonyUnicode, kAllowLazy, kAllowNatives, }; @@ -4374,8 +4377,52 @@ TEST(InvalidUnicodeEscapes) { // No escapes allowed in regexp flags "/regex/\\u0069g", "/regex/\\u006g", + // Braces gone wrong + "var foob\\u{c481r = 0;", + "var foob\\uc481}r = 0;", + "var \\u{0052oo = 0;", + "var \\u0052}oo = 0;", + "\"foob\\u{c481r\"", + "var foob\\u{}ar = 0;", + // Too high value for the unicode escape + "\"\\u{110000}\"", + // Not an unicode escape + "var foob\\v1234r = 0;", + "var foob\\U1234r = 0;", + "var foob\\v{1234}r = 0;", + "var foob\\U{1234}r = 0;", NULL}; - RunParserSyncTest(context_data, data, kError); + static const ParserFlag always_flags[] = {kAllowHarmonyUnicode}; + RunParserSyncTest(context_data, data, kError, NULL, 0, always_flags, + arraysize(always_flags)); +} + + +TEST(UnicodeEscapes) { + const char* context_data[][2] = {{"", ""}, + {"'use strict';", ""}, + {NULL, NULL}}; + const char* data[] = { + // Identifier starting with escape + "var \\u0052oo = 0;", + "var \\u{0052}oo = 0;", + "var \\u{52}oo = 0;", + "var \\u{00000000052}oo = 0;", + // 
Identifier with an escape but not starting with an escape + "var foob\\uc481r = 0;", + "var foob\\u{c481}r = 0;", + // String with an escape + "\"foob\\uc481r\"", + "\"foob\\u{c481}r\"", + // This character is a valid unicode character, representable as a surrogate + // pair, not representable as 4 hex digits. + "\"foo\\u{10e6d}\"", + // Max value for the unicode escape + "\"\\u{10ffff}\"", NULL}; + static const ParserFlag always_flags[] = {kAllowHarmonyUnicode}; + RunParserSyncTest(context_data, data, kSuccess, NULL, 0, always_flags, + arraysize(always_flags)); } diff --git a/test/mjsunit/harmony/unicode-escapes.js b/test/mjsunit/harmony/unicode-escapes.js new file mode 100644 index 0000000..b39ee1a --- /dev/null +++ b/test/mjsunit/harmony/unicode-escapes.js @@ -0,0 +1,46 @@ +// Copyright 2014 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// ES6 extends the \uxxxx escape and also allows \u{xxxxx}. + +// Flags: --harmony-unicode + +// Unicode escapes in variable names. + +(function TestVariableNames1() { + var foobar = 1; + assertEquals(foob\u0061r, 1); + assertEquals(foob\u{0061}r, 1); + assertEquals(foob\u{61}r, 1); + assertEquals(foob\u{0000000061}r, 1); +})(); + +(function TestVariableNames2() { + var foobar = 1; + assertEquals(\u0066oobar, 1); + assertEquals(\u{0066}oobar, 1); + assertEquals(\u{66}oobar, 1); + assertEquals(\u{0000000066}oobar, 1); +})(); + +// Unicode escapes in strings. + +(function TestStrings() { + var s1 = "foob\u0061r"; + assertEquals(s1, "foobar"); + var s2 = "foob\u{0061}r"; + assertEquals(s2, "foobar"); + var s3 = "foob\u{61}r"; + assertEquals(s3, "foobar"); + var s4 = "foob\u{0000000061}r"; + assertEquals(s4, "foobar"); +})(); + + +(function TestSurrogates() { + // U+10E6D corresponds to the surrogate pair [U+D803, U+DE6D]. + var s1 = "foo\u{10e6d}"; + var s2 = "foo\u{d803}\u{de6d}"; + assertEquals(s1, s2); +})(); -- 2.7.4