From 0cb5fd3d3e3d376fb8fdf9df8dbfc70fd3d7896d Mon Sep 17 00:00:00 2001 From: "svenpanne@chromium.org" Date: Fri, 31 Jan 2014 08:09:17 +0000 Subject: [PATCH] Implements ES6 String.prototype.normalize method. BUG=v8:2943 LOG=Y TEST=Unit tests for "real life" use cases, edge cases, various types of normalization. ========================== This is identical to the previous CL https://codereview.chromium.org/40133004/ with two differences: * Added a dummy implementation of String.prototype.normalize to be used when v8 is compiled without intl support * Rebased the test files for webkit. That was the only reason for the previous failure (and revert). Thank you, Mihai R=svenpanne@chromium.org Review URL: https://codereview.chromium.org/68133016 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@18972 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/i18n.js | 39 ++++++ src/runtime.cc | 29 +++++ src/runtime.h | 3 + src/string.js | 23 ++++ test/intl/string/normalization.js | 145 +++++++++++++++++++++ .../js/Object-getOwnPropertyNames-expected.txt | 2 +- test/webkit/fast/js/Object-getOwnPropertyNames.js | 2 +- .../js/kde/inbuilt_function_proto-expected.txt | 1 + test/webkit/fast/js/kde/inbuilt_function_proto.js | 1 + 9 files changed, 243 insertions(+), 2 deletions(-) create mode 100644 test/intl/string/normalization.js diff --git a/src/i18n.js b/src/i18n.js index 6b563a0..2e54857 100644 --- a/src/i18n.js +++ b/src/i18n.js @@ -45,6 +45,11 @@ var AVAILABLE_SERVICES = ['collator', 'dateformat', 'breakiterator']; +var NORMALIZATION_FORMS = ['NFC', + 'NFD', + 'NFKC', + 'NFKD']; + /** * Caches available locales for each service. */ @@ -1987,6 +1992,40 @@ $Object.defineProperty($String.prototype, 'localeCompare', { /** + * Unicode normalization. This method is called with one argument that + * specifies the normalization form. + * If none is specified, "NFC" is assumed. 
+ * If the form is not one of "NFC", "NFD", "NFKC", or "NFKD", then throw
+ * a RangeError Exception.
+ */
+$Object.defineProperty($String.prototype, 'normalize', {
+  value: function(that) {
+    if (%_IsConstructCall()) {
+      throw new $TypeError(ORDINARY_FUNCTION_CALLED_AS_CONSTRUCTOR);
+    }
+
+    CHECK_OBJECT_COERCIBLE(this, "String.prototype.normalize");
+    var str = $String(this);  // %StringNormalize requires a String argument
+
+    // Only an undefined form defaults to 'NFC'; '' or null must throw.
+    var form = that === undefined ? 'NFC' : $String(that);
+    var normalizationForm = NORMALIZATION_FORMS.indexOf(form);
+    if (normalizationForm === -1) {
+      throw new $RangeError('The normalization form should be one of '
+          + NORMALIZATION_FORMS.join(', ') + '.');
+    }
+    return %StringNormalize(str, normalizationForm);
+  },
+  writable: true,
+  configurable: true,
+  enumerable: false
+});
+%FunctionSetName($String.prototype.normalize, 'normalize');
+%FunctionRemovePrototype($String.prototype.normalize);
+%SetNativeFlag($String.prototype.normalize);
+
+
+/**
  * Formats a Number object (this) using locale and options values.
  * If locale or options are omitted, defaults are used.
*/
diff --git a/src/runtime.cc b/src/runtime.cc
index 3442351..b3429fa 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -13976,6 +13976,54 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_InternalCompare) {
 }
 
 
+// Runtime support for String.prototype.normalize.
+//   args[0]: the string to normalize.
+//   args[1]: index into normalizationForms (0 = NFC, 1 = NFD, 2 = NFKC,
+//            3 = NFKD), as computed by the JavaScript caller.
+// Returns the normalized string, or undefined if the form index is out of
+// range or ICU reports a failure.
+RUNTIME_FUNCTION(MaybeObject*, Runtime_StringNormalize) {
+  HandleScope scope(isolate);
+  static const UNormalizationMode normalizationForms[] =
+      { UNORM_NFC, UNORM_NFD, UNORM_NFKC, UNORM_NFKD };
+  static const int kFormCount = static_cast<int>(
+      sizeof(normalizationForms) / sizeof(normalizationForms[0]));
+
+  ASSERT(args.length() == 2);
+
+  CONVERT_ARG_HANDLE_CHECKED(String, stringValue, 0);
+  CONVERT_NUMBER_CHECKED(int, form_id, Int32, args[1]);
+
+  // form_id indexes the table above. The JavaScript wrapper validates it,
+  // but never read past the table if this is reached with a bad index.
+  if (form_id < 0 || form_id >= kFormCount) {
+    return isolate->heap()->undefined_value();
+  }
+
+  v8::String::Value string_value(v8::Utils::ToLocal(stringValue));
+  const UChar* u_value = reinterpret_cast<const UChar*>(*string_value);
+
+  // Alias the buffer with its explicit length. Converting the bare pointer
+  // would treat it as NUL-terminated and drop everything after an embedded
+  // U+0000.
+  icu::UnicodeString input(false, u_value, string_value.length());
+
+  // TODO(mnita): check Normalizer2 (not available in ICU 46)
+  UErrorCode status = U_ZERO_ERROR;
+  icu::UnicodeString result;
+  icu::Normalizer::normalize(input, normalizationForms[form_id], 0,
+      result, status);
+  if (U_FAILURE(status)) {
+    return isolate->heap()->undefined_value();
+  }
+
+  return *isolate->factory()->NewStringFromTwoByte(
+      Vector<const uint16_t>(
+          reinterpret_cast<const uint16_t*>(result.getBuffer()),
+          result.length()));
+}
+
+
 RUNTIME_FUNCTION(MaybeObject*, Runtime_CreateBreakIterator) {
   HandleScope scope(isolate);
 
diff --git a/src/runtime.h b/src/runtime.h
index 532066f..354244d 100644
--- a/src/runtime.h
+++ b/src/runtime.h
@@ -568,6 +568,9 @@ namespace internal {
   F(CreateCollator, 3, 1) \
   F(InternalCompare, 3, 1) \
   \
+  /* String.prototype.normalize. */ \
+  F(StringNormalize, 2, 1) \
+  \
   /* Break iterator. */ \
   F(CreateBreakIterator, 3, 1) \
   F(BreakIteratorAdoptText, 2, 1) \
diff --git a/src/string.js b/src/string.js
index 8e4b896..74230c9 100644
--- a/src/string.js
+++ b/src/string.js
@@ -186,6 +186,28 @@ function StringMatch(regexp) {
 }
 
 
+var NORMALIZATION_FORMS = ['NFC', 'NFD', 'NFKC', 'NFKD'];
+
+
+// ECMA-262 v6, section 21.1.3.12
+//
+// For now we do nothing, as proper normalization requires big tables.
+// If Intl is enabled, then i18n.js will override it and provide the
+// proper functionality.
+function StringNormalize(form) {
+  CHECK_OBJECT_COERCIBLE(this, "String.prototype.normalize");
+  var str = TO_STRING_INLINE(this);  // the result must be a primitive string
+  // Only an undefined form defaults to 'NFC'; '' or null must throw.
+  form = IS_UNDEFINED(form) ? 'NFC' : TO_STRING_INLINE(form);
+  var normalizationForm = NORMALIZATION_FORMS.indexOf(form);
+  if (normalizationForm === -1) {
+    throw new $RangeError('The normalization form should be one of '
+        + NORMALIZATION_FORMS.join(', ') + '.');
+  }
+  return str;
+}
+
+
 // This has the same size as the lastMatchInfo array, and can be used for
 // functions that expect that structure to be returned. It is used when the
 // needle is a string rather than a regexp. In this case we can't update
@@ -942,6 +964,7 @@ function SetUpString() {
     "lastIndexOf", StringLastIndexOf,
     "localeCompare", StringLocaleCompare,
     "match", StringMatch,
+    "normalize", StringNormalize,
     "replace", StringReplace,
     "search", StringSearch,
     "slice", StringSlice,
diff --git a/test/intl/string/normalization.js b/test/intl/string/normalization.js
new file mode 100644
index 0000000..446d627
--- /dev/null
+++ b/test/intl/string/normalization.js
@@ -0,0 +1,145 @@
+// Copyright 2013 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Tests the new String.prototype.normalize method.
+
+
+// Common use case: searching for a 'not very exact' match by comparing the
+// NFKD forms. These are examples of data one might encounter in real use.
+var testRealUseCases = function() {
+  // Vietnamese legacy text, old Windows 9x / non-Unicode applications use
+  // windows-1258 code page, which is neither precomposed, nor decomposed.
+  assertEquals('ti\u00ea\u0301ng Vi\u00ea\u0323t'.normalize('NFKD'),
+               'ti\u1ebfng Vi\u1ec7t'.normalize('NFKD')); // all precomposed
+
+  // Various kinds of spaces; each must normalize like a plain U+0020
+  assertEquals('Google\u0020Maps'.normalize('NFKD'), // normal space
+               'Google\u00a0Maps'.normalize('NFKD')); // non-breaking space
+  assertEquals('Google\u0020Maps'.normalize('NFKD'), // normal space
+               'Google\u2002Maps'.normalize('NFKD')); // en-space
+  assertEquals('Google\u0020Maps'.normalize('NFKD'), // normal space
+               'Google\u2003Maps'.normalize('NFKD')); // em-space
+  assertEquals('Google\u0020Maps'.normalize('NFKD'), // normal space
+               'Google\u3000Maps'.normalize('NFKC')); // ideographic space
+
+  // Latin small ligature "fi" (U+FB01) vs. the two letters "f" + "i"
+  assertEquals('fi'.normalize('NFKD'), '\ufb01'.normalize('NFKD'));
+
+  // ŀ, Latin small L with middle dot, used in Catalan and often represented
+  // as decomposed for non-Unicode environments ( l + ·)
+  assertEquals('l\u00b7'.normalize('NFKD'), '\u0140'.normalize('NFKD'));
+
+  // Legacy text, Japanese narrow Kana (MS-DOS & Win 3.x time)
+  assertEquals('\u30d1\u30bd\u30b3\u30f3'.normalize('NFKD'), // パソコン : wide
+               '\uff8a\uff9f\uff7f\uff7a\uff9d'.normalize('NFKD')); // ﾊﾟｿｺﾝ : narrow
+  // Also for Japanese, Latin fullwidth forms vs. ASCII
+  assertEquals('ABCD'.normalize('NFKD'),
+               '\uff21\uff22\uff23\uff24'.normalize('NFKD')); // ＡＢＣＤ, fullwidth
+}();
+
+
+var testEdgeCases = function() {
+  // Make sure we throw RangeError, as the standard requires.
+  assertThrows('"".normalize(1234)', RangeError);
+  assertThrows('"".normalize("BAD")', RangeError);
+
+  // The standard does not say what kind of exceptions we should throw, so we
+  // will not be specific. But we still test that we throw errors.
+  assertThrows('s.normalize()'); // s is not defined
+  assertThrows('var s = null; s.normalize()');
+  assertThrows('var s = undefined; s.normalize()');
+  assertThrows('var s = 1234; s.normalize()'); // no normalize for non-strings
+}();
+
+
+// Several kinds of mappings.
No need to be comprehensive, we don't test
+// the ICU functionality, we only test C - JavaScript 'glue'
+var testData = [
+  // Columns: org, NFC (also the default form), NFD, NFKC, NFKD
+  ['\u00c7', // Ç : Combining sequence, Latin 1
+   '\u00c7', '\u0043\u0327',
+   '\u00c7', '\u0043\u0327'],
+  ['\u0218', // Ș : Combining sequence, non-Latin 1
+   '\u0218', '\u0053\u0326',
+   '\u0218', '\u0053\u0326'],
+  ['\uac00', // 가 : Hangul
+   '\uac00', '\u1100\u1161',
+   '\uac00', '\u1100\u1161'],
+  ['\uff76', // ｶ : Narrow Kana
+   '\uff76', '\uff76',
+   '\u30ab', '\u30ab'],
+  ['\u00bc', // ¼ : Fractions
+   '\u00bc', '\u00bc',
+   '\u0031\u2044\u0034', '\u0031\u2044\u0034'],
+  ['\u01c6', // ǆ : Latin ligature
+   '\u01c6', '\u01c6',
+   '\u0064\u017e', '\u0064\u007a\u030c'],
+  ['s\u0307\u0323', // s + dot above + dot below, ordering of combining marks
+   '\u1e69', 's\u0323\u0307',
+   '\u1e69', 's\u0323\u0307'],
+  ['\u3300', // ㌀ : Squared characters
+   '\u3300', '\u3300',
+   '\u30a2\u30d1\u30fc\u30c8', // アパート
+   '\u30a2\u30cf\u309a\u30fc\u30c8'], // アパート
+  ['\ufe37', // ︷ : Vertical forms
+   '\ufe37', '\ufe37',
+   '{' , '{'],
+  ['\u2079', // ⁹ : superscript 9
+   '\u2079', '\u2079',
+   '9', '9'],
+  ['\ufee5\ufee6\ufee7\ufee8', // Arabic forms
+   '\ufee5\ufee6\ufee7\ufee8', '\ufee5\ufee6\ufee7\ufee8',
+   '\u0646\u0646\u0646\u0646', '\u0646\u0646\u0646\u0646'],
+  ['\u2460', // ① : Circled
+   '\u2460', '\u2460',
+   '1', '1'],
+  ['\u210c', // ℌ : Font variants
+   '\u210c', '\u210c',
+   'H', 'H'],
+  ['\u2126', // Ω : Singleton, OHM sign vs. Greek capital letter OMEGA
+   '\u03a9', '\u03a9',
+   '\u03a9', '\u03a9'],
+  ['\ufdfb', // Long ligature, ARABIC LIGATURE JALLAJALALOUHOU
+   '\ufdfb', '\ufdfb',
+   '\u062C\u0644\u0020\u062C\u0644\u0627\u0644\u0647',
+   '\u062C\u0644\u0020\u062C\u0644\u0627\u0644\u0647']
+];
+
+var testArray = function() {
+  var kNFC = 1, kNFD = 2, kNFKC = 3, kNFKD = 4;  // testData column indexes
+  for (var i = 0; i < testData.length; ++i) {
+    // the original, NFC and NFD should normalize to the same thing
+    for (var column = 0; column < 3; ++column) {
+      var str = testData[i][column];
+      assertEquals(str.normalize(), testData[i][kNFC]); // defaults to NFC
+      assertEquals(str.normalize('NFC'), testData[i][kNFC]);
+      assertEquals(str.normalize('NFD'), testData[i][kNFD]);
+      assertEquals(str.normalize('NFKC'), testData[i][kNFKC]);
+      assertEquals(str.normalize('NFKD'), testData[i][kNFKD]);
+    }
+  }
+}();
diff --git a/test/webkit/fast/js/Object-getOwnPropertyNames-expected.txt b/test/webkit/fast/js/Object-getOwnPropertyNames-expected.txt
index 560be18..b8c4bec 100644
--- a/test/webkit/fast/js/Object-getOwnPropertyNames-expected.txt
+++ b/test/webkit/fast/js/Object-getOwnPropertyNames-expected.txt
@@ -70,7 +70,7 @@ FAIL getSortedOwnPropertyNames(Function.prototype) should be apply,bind,call,con
 FAIL getSortedOwnPropertyNames(Array) should be isArray,length,name,prototype. Was arguments,caller,isArray,length,name,prototype.
 PASS getSortedOwnPropertyNames(Array.prototype) is ['concat', 'constructor', 'every', 'filter', 'forEach', 'indexOf', 'join', 'lastIndexOf', 'length', 'map', 'pop', 'push', 'reduce', 'reduceRight', 'reverse', 'shift', 'slice', 'some', 'sort', 'splice', 'toLocaleString', 'toString', 'unshift']
 FAIL getSortedOwnPropertyNames(String) should be fromCharCode,length,name,prototype. Was arguments,caller,fromCharCode,length,name,prototype.
-PASS getSortedOwnPropertyNames(String.prototype) is ['anchor', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'concat', 'constructor', 'fixed', 'fontcolor', 'fontsize', 'indexOf', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'replace', 'search', 'slice', 'small', 'split', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'trim', 'trimLeft', 'trimRight', 'valueOf'] +PASS getSortedOwnPropertyNames(String.prototype) is ['anchor', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'concat', 'constructor', 'fixed', 'fontcolor', 'fontsize', 'indexOf', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'normalize', 'replace', 'search', 'slice', 'small', 'split', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'trim', 'trimLeft', 'trimRight', 'valueOf'] FAIL getSortedOwnPropertyNames(Boolean) should be length,name,prototype. Was arguments,caller,length,name,prototype. PASS getSortedOwnPropertyNames(Boolean.prototype) is ['constructor', 'toString', 'valueOf'] FAIL getSortedOwnPropertyNames(Number) should be MAX_VALUE,MIN_VALUE,NEGATIVE_INFINITY,NaN,POSITIVE_INFINITY,length,name,prototype. Was EPSILON,MAX_SAFE_INTEGER,MAX_VALUE,MIN_SAFE_INTEGER,MIN_VALUE,NEGATIVE_INFINITY,NaN,POSITIVE_INFINITY,arguments,caller,isFinite,isInteger,isNaN,isSafeInteger,length,name,parseFloat,parseInt,prototype. 
diff --git a/test/webkit/fast/js/Object-getOwnPropertyNames.js b/test/webkit/fast/js/Object-getOwnPropertyNames.js index 97ab6ad..6373cf1 100644 --- a/test/webkit/fast/js/Object-getOwnPropertyNames.js +++ b/test/webkit/fast/js/Object-getOwnPropertyNames.js @@ -78,7 +78,7 @@ var expectedPropertyNamesSet = { "Array": "['isArray', 'length', 'name', 'prototype']", "Array.prototype": "['concat', 'constructor', 'every', 'filter', 'forEach', 'indexOf', 'join', 'lastIndexOf', 'length', 'map', 'pop', 'push', 'reduce', 'reduceRight', 'reverse', 'shift', 'slice', 'some', 'sort', 'splice', 'toLocaleString', 'toString', 'unshift']", "String": "['fromCharCode', 'length', 'name', 'prototype']", - "String.prototype": "['anchor', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'concat', 'constructor', 'fixed', 'fontcolor', 'fontsize', 'indexOf', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'replace', 'search', 'slice', 'small', 'split', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'trim', 'trimLeft', 'trimRight', 'valueOf']", + "String.prototype": "['anchor', 'big', 'blink', 'bold', 'charAt', 'charCodeAt', 'concat', 'constructor', 'fixed', 'fontcolor', 'fontsize', 'indexOf', 'italics', 'lastIndexOf', 'length', 'link', 'localeCompare', 'match', 'normalize', 'replace', 'search', 'slice', 'small', 'split', 'strike', 'sub', 'substr', 'substring', 'sup', 'toLocaleLowerCase', 'toLocaleUpperCase', 'toLowerCase', 'toString', 'toUpperCase', 'trim', 'trimLeft', 'trimRight', 'valueOf']", "Boolean": "['length', 'name', 'prototype']", "Boolean.prototype": "['constructor', 'toString', 'valueOf']", "Number": "['MAX_VALUE', 'MIN_VALUE', 'NEGATIVE_INFINITY', 'NaN', 'POSITIVE_INFINITY', 'length', 'name', 'prototype']", diff --git a/test/webkit/fast/js/kde/inbuilt_function_proto-expected.txt b/test/webkit/fast/js/kde/inbuilt_function_proto-expected.txt index 99818c3..8fc28af 100644 --- 
a/test/webkit/fast/js/kde/inbuilt_function_proto-expected.txt +++ b/test/webkit/fast/js/kde/inbuilt_function_proto-expected.txt @@ -47,6 +47,7 @@ PASS String.prototype.charCodeAt.__proto__ is Function.prototype PASS String.prototype.indexOf.__proto__ is Function.prototype PASS String.prototype.lastIndexOf.__proto__ is Function.prototype PASS String.prototype.match.__proto__ is Function.prototype +PASS String.prototype.normalize.__proto__ is Function.prototype PASS String.prototype.replace.__proto__ is Function.prototype PASS String.prototype.search.__proto__ is Function.prototype PASS String.prototype.slice.__proto__ is Function.prototype diff --git a/test/webkit/fast/js/kde/inbuilt_function_proto.js b/test/webkit/fast/js/kde/inbuilt_function_proto.js index 294e23a..cd2657a 100644 --- a/test/webkit/fast/js/kde/inbuilt_function_proto.js +++ b/test/webkit/fast/js/kde/inbuilt_function_proto.js @@ -43,6 +43,7 @@ shouldBe("String.prototype.charCodeAt.__proto__","Function.prototype"); shouldBe("String.prototype.indexOf.__proto__","Function.prototype"); shouldBe("String.prototype.lastIndexOf.__proto__","Function.prototype"); shouldBe("String.prototype.match.__proto__","Function.prototype"); +shouldBe("String.prototype.normalize.__proto__","Function.prototype"); shouldBe("String.prototype.replace.__proto__","Function.prototype"); shouldBe("String.prototype.search.__proto__","Function.prototype"); shouldBe("String.prototype.slice.__proto__","Function.prototype"); -- 2.7.4