From: yangguo@chromium.org Date: Mon, 21 Jan 2013 16:11:31 +0000 (+0000) Subject: Fix some latin-1 webkit units tests X-Git-Tag: upstream/4.7.83~15245 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0c822b21cbf77b3ec6f3beb5bf78c01e62bb4dad;p=platform%2Fupstream%2Fv8.git Fix some latin-1 webkit units tests R=yangguo@chromium.org BUG= Review URL: https://chromiumcodereview.appspot.com/11962035 Patch from Dan Carney . git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13455 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- diff --git a/src/jsregexp.cc b/src/jsregexp.cc index f6e2e7f..a33df6f 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -2855,6 +2855,29 @@ RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { } +// We need to check for the following characters: 0x39c 0x3bc 0x178. +static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { +#ifdef ENABLE_LATIN_1 + // TODO(dcarney): this could be a lot more efficient. + return range.Contains(0x39c) || + range.Contains(0x3bc) || range.Contains(0x178); +#else + return false; +#endif +} + + +#ifdef ENABLE_LATIN_1 +static bool RangesContainLatin1Equivalents(ZoneList* ranges) { + for (int i = 0; i < ranges->length(); i++) { + // TODO(dcarney): this could be a lot more efficient. + if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; + } + return false; +} +#endif + + RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; @@ -2871,21 +2894,21 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { return set_replacement(NULL); } #else - if (quarks[j] <= String::kMaxOneByteCharCode) continue; + uint16_t c = quarks[j]; + if (c <= String::kMaxOneByteCharCode) continue; if (!ignore_case) return set_replacement(NULL); // Here, we need to check for characters whose upper and lower cases // are outside the Latin-1 range. 
- if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) { - return set_replacement(NULL); - } + uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); + // Character is outside Latin-1 completely + if (converted == 0) return set_replacement(NULL); + // Convert quark to Latin-1 in place. + uint16_t* copy = const_cast(quarks.start()); + copy[j] = converted; #endif } } else { ASSERT(elm.type == TextElement::CHAR_CLASS); -#ifdef ENABLE_LATIN_1 - // TODO(dcarney): Can this be improved? - if (ignore_case) continue; -#endif RegExpCharacterClass* cc = elm.data.u_char_class; ZoneList* ranges = cc->ranges(zone()); if (!CharacterRange::IsCanonical(ranges)) { @@ -2897,11 +2920,19 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { if (range_count != 0 && ranges->at(0).from() == 0 && ranges->at(0).to() >= String::kMaxOneByteCharCode) { +#ifdef ENABLE_LATIN_1 + // This will be handled in a later filter. + if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; +#endif return set_replacement(NULL); } } else { if (range_count == 0 || ranges->at(0).from() > String::kMaxOneByteCharCode) { +#ifdef ENABLE_LATIN_1 + // This will be handled in a later filter. 
+ if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; +#endif return set_replacement(NULL); } } @@ -5354,7 +5385,7 @@ void CharacterRange::AddCaseEquivalents(ZoneList* ranges, Isolate* isolate = Isolate::Current(); uc16 bottom = from(); uc16 top = to(); - if (is_ascii) { + if (is_ascii && !RangeContainsLatin1Equivalents(*this)) { if (bottom > String::kMaxOneByteCharCode) return; if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; } diff --git a/src/regexp-macro-assembler.cc b/src/regexp-macro-assembler.cc index f73726a..3ebf5a8 100644 --- a/src/regexp-macro-assembler.cc +++ b/src/regexp-macro-assembler.cc @@ -210,6 +210,26 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = { 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o' 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w' 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z' + // Latin-1 range + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, + 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, }; diff --git a/src/regexp-macro-assembler.h b/src/regexp-macro-assembler.h index bcf3673..211ab6b 
100644 --- a/src/regexp-macro-assembler.h +++ b/src/regexp-macro-assembler.h @@ -244,10 +244,10 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler { static const byte* StringCharacterPosition(String* subject, int start_index); - // Byte map of ASCII characters with a 0xff if the character is a word + // Byte map of one byte characters with a 0xff if the character is a word // character (digit, letter or underscore) and 0x00 otherwise. // Used by generated RegExp code. - static const byte word_character_map[128]; + static const byte word_character_map[256]; static Address word_character_map_address() { return const_cast
(&word_character_map[0]); diff --git a/src/runtime.cc b/src/runtime.cc index b5f04e3..ef04ed3 100644 --- a/src/runtime.cc +++ b/src/runtime.cc @@ -5051,8 +5051,8 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToNumber) { // Fast check for a junk value. A valid string may start from a // whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or // the 'I' character ('Infinity'). All of that have codes not greater than - // '9' except 'I'. - if (data[start_pos] != 'I') { + // '9' except 'I' and the no-break space (0xa0). + if (data[start_pos] != 'I' && data[start_pos] != 0xa0) { return isolate->heap()->nan_value(); } } else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) { diff --git a/src/unicode-inl.h b/src/unicode-inl.h index b4e2cb5..c80c67e 100644 --- a/src/unicode-inl.h +++ b/src/unicode-inl.h @@ -79,33 +79,19 @@ template int Mapping::CalculateValue(uchar c, uchar n, } -bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) { +uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { ASSERT(c > Latin1::kMaxChar); switch (c) { - case 0x130: - case 0x131: - case 0x149: + // These are equivalent characters in Unicode. + case 0x39c: + case 0x3bc: + return 0xb5; + // This is an uppercase of a Latin-1 character + // outside of Latin-1. 
case 0x178: - case 0x17f: - case 0x1f0: - case 0x1e96: - case 0x1e97: - case 0x1e98: - case 0x1e99: - case 0x1e9a: - case 0x1e9e: - case 0x212a: - case 0x212b: - case 0xfb00: - case 0xfb01: - case 0xfb02: - case 0xfb03: - case 0xfb04: - case 0xfb05: - case 0xfb06: - return true; + return 0xff; } - return false; + return 0; } diff --git a/src/unicode.h b/src/unicode.h index 0278576..f8a1f60 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -140,7 +140,10 @@ class Latin1 { #else static const unsigned kMaxChar = 0xff; #endif - static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t); + // Returns 0 if the character does not convert to a single Latin-1 character + // or if the character does not convert back to Latin-1 via the inverse + // operation (upper to lower, etc.). + static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t); }; class Utf8 { diff --git a/test/cctest/test-strings.cc b/test/cctest/test-strings.cc index 3684b87..950fd59 100644 --- a/test/cctest/test-strings.cc +++ b/test/cctest/test-strings.cc @@ -1277,38 +1277,60 @@ TEST(IsAscii) { } -static bool CanBeConvertedToLatin1(uint16_t c) { - CHECK(c > unibrow::Latin1::kMaxChar); - uint32_t result[4]; + +#ifdef ENABLE_LATIN_1 +template +static uint16_t ConvertLatin1(uint16_t c) { + uint32_t result[Op::kMaxWidth]; int chars; - chars = unibrow::ToLowercase::Convert(c, 0, result, NULL); - if (chars > 0) { - CHECK_LE(chars, static_cast(sizeof(result))); - for (int i = 0; i < chars; i++) { - if (result[i] <= unibrow::Latin1::kMaxChar) { - return true; - } - } - } - chars = unibrow::ToUppercase::Convert(c, 0, result, NULL); - if (chars > 0) { - CHECK_LE(chars, static_cast(sizeof(result))); - for (int i = 0; i < chars; i++) { - if (result[i] <= unibrow::Latin1::kMaxChar) { - return true; - } - } + chars = Op::Convert(c, 0, result, NULL); + if (chars == 0) return 0; + CHECK_LE(chars, static_cast(sizeof(result))); + if (!return_first && chars > 1) { + return 0; } - return false; + return result[0]; } -TEST(Latin1) { 
-#ifndef ENABLE_LATIN_1 - if (true) return; -#endif - for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) { - CHECK_EQ(CanBeConvertedToLatin1(c), - unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c)); +static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) { + uint16_t expect = ConvertLatin1(c); + if (expect > unibrow::Latin1::kMaxChar) expect = 0; + CHECK_EQ(expect, test); +} + + +TEST(Latin1IgnoreCase) { + if (true) return; + using namespace unibrow; + for (uint16_t c = Latin1::kMaxChar + 1; c != 0; c++) { + uint16_t lower = ConvertLatin1(c); + uint16_t upper = ConvertLatin1(c); + uint16_t test = Latin1::ConvertNonLatin1ToLatin1(c); + // Filter out all characters whose upper is not their lower or vice versa. + if (lower == 0 && upper == 0) { + CheckCanonicalEquivalence(c, test); + continue; + } + if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) { + CheckCanonicalEquivalence(c, test); + continue; + } + if (lower == 0 && upper != 0) { + lower = ConvertLatin1(upper); + } + if (upper == 0 && lower != c) { + upper = ConvertLatin1(lower); + } + if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) { + CheckCanonicalEquivalence(c, test); + continue; + } + if (upper != c && lower != c) { + CheckCanonicalEquivalence(c, test); + continue; + } + CHECK_EQ(Min(upper, lower), test); } } +#endif // ENABLE_LATIN_1 diff --git a/test/mjsunit/regress/regress-latin-1.js b/test/mjsunit/regress/regress-latin-1.js index b1f006d..b6cd714 100644 --- a/test/mjsunit/regress/regress-latin-1.js +++ b/test/mjsunit/regress/regress-latin-1.js @@ -57,3 +57,22 @@ for (var i = 0; i < 0xff; i++) { // Should have hit the branch for the following char codes: // [A-Z], [192-222] but not 215 assertEquals((90-65+1)+(222-192-1+1), total_lo); + +// Latin-1 whitespace character +assertEquals( 1, +(String.fromCharCode(0xA0) + '1') ); + +// Latin-1 \W characters +assertEquals(["+\u00a3", "=="], "+\u00a3==".match(/\W\W/g)); + +// Latin-1 character that uppercases out of 
Latin-1. +assertTrue(/\u0178/i.test('\u00ff')); + +// Unicode equivalence +assertTrue(/\u039c/i.test('\u00b5')); +assertTrue(/\u039c/i.test('\u03bc')); +assertTrue(/\u00b5/i.test('\u03bc')); +// Unicode equivalence ranges +assertTrue(/[\u039b-\u039d]/i.test('\u00b5')); +assertFalse(/[^\u039b-\u039d]/i.test('\u00b5')); +assertFalse(/[\u039b-\u039d]/.test('\u00b5')); +assertTrue(/[^\u039b-\u039d]/.test('\u00b5'));