From 57c919e414efef323230c69c473f88bbc249f63e Mon Sep 17 00:00:00 2001
From: "erik.corry@gmail.com"
Date: Fri, 6 Nov 2009 11:15:20 +0000
Subject: [PATCH] Fix bug 486, Cyrillic character ranges in case independent
 regexps.

http://code.google.com/p/v8/issues/detail?id=486
Review URL: http://codereview.chromium.org/361033

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@3236 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 src/jsregexp.cc                     |  47 +++++++-
 test/mjsunit/cyrillic.js            | 169 ++++++++++++++++++++++++++++
 test/mjsunit/regress/regress-486.js |  30 +++++
 3 files changed, 241 insertions(+), 5 deletions(-)
 create mode 100644 test/mjsunit/cyrillic.js
 create mode 100644 test/mjsunit/regress/regress-486.js

diff --git a/src/jsregexp.cc b/src/jsregexp.cc
index c77f32d1e..5fb467d29 100644
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -2440,8 +2440,8 @@ void TextNode::MakeCaseIndependent() {
       RegExpCharacterClass* cc = elm.data.u_char_class;
       ZoneList<CharacterRange>* ranges = cc->ranges();
       int range_count = ranges->length();
-      for (int i = 0; i < range_count; i++) {
-        ranges->at(i).AddCaseEquivalents(ranges);
+      for (int j = 0; j < range_count; j++) {
+        ranges->at(j).AddCaseEquivalents(ranges);
       }
     }
   }
@@ -3961,7 +3961,7 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
     } else {
       start = pos;
     }
-    // Then we add the ranges on at a time, incrementing the current
+    // Then we add the ranges one at a time, incrementing the current
     // position to be after the last block each time.  The position
     // always points to the start of a block.
     while (pos < to()) {
@@ -3987,8 +3987,45 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
       }
       start = pos = block_end + 1;
     }
-  } else {
-    // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
+  } else if (from() > 0 || to() < String::kMaxUC16CharCode) {
+    // Unibrow ranges don't work for high characters due to the "2^11 bug".
+    // Therefore we do something dumber for these ranges: look up every
+    // character and add the case equivalents that fall outside the range.
+    // We don't bother for 0-max (the start of an unanchored regexp).
+    ZoneList<unsigned>* characters = new ZoneList<unsigned>(100);
+    int bottom = from();
+    int top = to();
+    for (int i = bottom; i <= top; i++) {
+      int length = uncanonicalize.get(i, '\0', chars);
+      for (int j = 0; j < length; j++) {
+        uc32 chr = chars[j];
+        if (chr != i && (chr < bottom || chr > top)) {  // Not in this range.
+          characters->Add(chr);
+        }
+      }
+    }
+    if (characters->length() > 0) {
+      int new_from = characters->at(0);
+      int new_to = new_from;
+      for (int i = 1; i < characters->length(); i++) {
+        int chr = characters->at(i);
+        if (chr == new_to + 1) {  // Extends the current run.
+          new_to++;
+        } else {
+          if (new_to == new_from) {
+            ranges->Add(CharacterRange::Singleton(new_from));
+          } else {
+            ranges->Add(CharacterRange(new_from, new_to));
+          }
+          new_from = new_to = chr;
+        }
+      }
+      if (new_to == new_from) {
+        ranges->Add(CharacterRange::Singleton(new_from));
+      } else {
+        ranges->Add(CharacterRange(new_from, new_to));
+      }
+    }
   }
 }
 
diff --git a/test/mjsunit/cyrillic.js b/test/mjsunit/cyrillic.js
new file mode 100644
index 000000000..a7893a0dc
--- /dev/null
+++ b/test/mjsunit/cyrillic.js
@@ -0,0 +1,169 @@
+// Copyright 2009 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Test Unicode character ranges in regexps.
+
+
+// Cyrillic.
+var cyrillic = {
+  FIRST: "\u0410",   // A
+  first: "\u0430",   // a
+  LAST: "\u042f",    // YA
+  last: "\u044f",    // ya
+  MIDDLE: "\u0427",  // CHE
+  middle: "\u0447",  // che
+  // Actually no characters are between the cases in Cyrillic.
+  BetweenCases: false};
+
+var SIGMA = "\u03a3";
+var sigma = "\u03c3";
+var alternative_sigma = "\u03c2";
+
+// Greek.
+var greek = {
+  FIRST: "\u0391",   // ALPHA
+  first: "\u03b1",   // alpha
+  LAST: "\u03a9",    // OMEGA
+  last: "\u03c9",    // omega
+  MIDDLE: SIGMA,     // SIGMA
+  middle: sigma,     // sigma
+  // Epsilon acute is between ALPHA-OMEGA and alpha-omega, ie it
+  // is between OMEGA and alpha.
+  BetweenCases: "\u03ad"};
+
+
+function Range(from, to, flags) {
+  return new RegExp("[" + from + "-" + to + "]", flags);
+}
+
+for (var lang = 0; lang < 2; lang++) {
+  var chars = (lang == 0) ? cyrillic : greek;
+
+  for (var i = 0; i < 2; i++) {
+    var lc = (i == 0);  // Lower case.
+    var first = lc ? chars.first : chars.FIRST;
+    var middle = lc ? chars.middle : chars.MIDDLE;
+    var last = lc ? chars.last : chars.LAST;
+    var first_other_case = lc ? chars.FIRST : chars.first;
+    var middle_other_case = lc ? chars.MIDDLE : chars.middle;
+    var last_other_case = lc ? chars.LAST : chars.last;
+
+    assertTrue(Range(first, last).test(first), 1);
+    assertTrue(Range(first, last).test(middle), 2);
+    assertTrue(Range(first, last).test(last), 3);
+
+    assertFalse(Range(first, last).test(first_other_case), 4);
+    assertFalse(Range(first, last).test(middle_other_case), 5);
+    assertFalse(Range(first, last).test(last_other_case), 6);
+
+    assertTrue(Range(first, last, "i").test(first), 7);
+    assertTrue(Range(first, last, "i").test(middle), 8);
+    assertTrue(Range(first, last, "i").test(last), 9);
+
+    assertTrue(Range(first, last, "i").test(first_other_case), 10);
+    assertTrue(Range(first, last, "i").test(middle_other_case), 11);
+    assertTrue(Range(first, last, "i").test(last_other_case), 12);
+
+    if (chars.BetweenCases) {
+      assertFalse(Range(first, last).test(chars.BetweenCases), 13);
+      assertFalse(Range(first, last, "i").test(chars.BetweenCases), 14);
+    }
+  }
+  if (chars.BetweenCases) {
+    assertTrue(Range(chars.FIRST, chars.last).test(chars.BetweenCases), 15);
+    assertTrue(Range(chars.FIRST, chars.last, "i").test(chars.BetweenCases), 16);
+  }
+}
+
+for (var key in greek) {
+  assertTrue(Range(greek.FIRST, cyrillic.last).test(greek[key]), 17 + key);
+  if (cyrillic[key]) {
+    assertTrue(Range(greek.FIRST, cyrillic.last).test(cyrillic[key]), 18 + key);
+  }
+}
+
+
+for (var i = 0; i < 2; i++) {
+  var ignore_case = (i == 0);
+  var flag = ignore_case ? "i" : "";
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(greek.first), 19);
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(greek.middle), 20);
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(greek.last), 21);
+
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(cyrillic.FIRST), 22);
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(cyrillic.MIDDLE), 23);
+  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(cyrillic.LAST), 24);
+
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(greek.FIRST), 25);
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(greek.MIDDLE), 26);
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(greek.LAST), 27);
+
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(cyrillic.first), 28);
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(cyrillic.middle), 29);
+  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(cyrillic.last), 30);
+}
+
+
+for (var i = 0; i < 2; i++) {
+  var simple = (i != 0);
+  var name = simple ? "" : "[]";
+  var regex = simple ? SIGMA : "[" + SIGMA + "]";
+
+  assertFalse(new RegExp(regex).test(sigma), 31 + name);
+  assertFalse(new RegExp(regex).test(alternative_sigma), 32 + name);
+  assertTrue(new RegExp(regex).test(SIGMA), 33 + name);
+
+  assertTrue(new RegExp(regex, "i").test(sigma), 34 + name);
+  // JSC and Tracemonkey fail this one.
+  assertTrue(new RegExp(regex, "i").test(alternative_sigma), 35 + name);
+  assertTrue(new RegExp(regex, "i").test(SIGMA), 36 + name);
+
+  regex = simple ? sigma : "[" + sigma + "]";
+
+  assertTrue(new RegExp(regex).test(sigma), 41 + name);
+  assertFalse(new RegExp(regex).test(alternative_sigma), 42 + name);
+  assertFalse(new RegExp(regex).test(SIGMA), 43 + name);
+
+  assertTrue(new RegExp(regex, "i").test(sigma), 44 + name);
+  // JSC and Tracemonkey fail this one.
+  assertTrue(new RegExp(regex, "i").test(alternative_sigma), 45 + name);
+  assertTrue(new RegExp(regex, "i").test(SIGMA), 46 + name);
+
+  regex = simple ? alternative_sigma : "[" + alternative_sigma + "]";
+
+  assertFalse(new RegExp(regex).test(sigma), 51 + name);
+  assertTrue(new RegExp(regex).test(alternative_sigma), 52 + name);
+  assertFalse(new RegExp(regex).test(SIGMA), 53 + name);
+
+  // JSC and Tracemonkey fail this one.
+  assertTrue(new RegExp(regex, "i").test(sigma), 54 + name);
+  assertTrue(new RegExp(regex, "i").test(alternative_sigma), 55 + name);
+  // JSC and Tracemonkey fail this one.
+  assertTrue(new RegExp(regex, "i").test(SIGMA), 56 + name);
+}
+
+print("ok");
diff --git a/test/mjsunit/regress/regress-486.js b/test/mjsunit/regress/regress-486.js
new file mode 100644
index 000000000..c1e29a636
--- /dev/null
+++ b/test/mjsunit/regress/regress-486.js
@@ -0,0 +1,30 @@
+// Copyright 2009 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+var st = "\u0422\u0435\u0441\u0442";  // Test in Cyrillic characters.
+var cyrillicMatch = /^[\u0430-\u044fa-z]+$/i.test(st);  // a-ja a-z.
+assertTrue(cyrillicMatch);
-- 
2.34.1