From 94bb378ee558ed52411894754a0e1851fc243ed8 Mon Sep 17 00:00:00 2001 From: "lrn@chromium.org" Date: Mon, 13 Dec 2010 08:33:32 +0000 Subject: [PATCH] Make RegExp character class match JSC. See http://trac.webkit.org/changeset/73594 Review URL: http://codereview.chromium.org/5723002 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@5974 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/parser.cc | 36 +++++++++++++++++++++++++----------- test/mjsunit/regexp.js | 12 ++++++++++++ 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/src/parser.cc b/src/parser.cc index fd93aae..94ad57c 100644 --- a/src/parser.cc +++ b/src/parser.cc @@ -4449,6 +4449,22 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { } +static const uc16 kNoCharClass = 0; + +// Adds range or pre-defined character class to character ranges. +// If char_class is not kNoCharClass, it's interpreted as a class +// escape (e.g., 's' means whitespace, from '\s'). +static inline void AddRangeOrEscape(ZoneList* ranges, + uc16 char_class, + CharacterRange range) { + if (char_class != kNoCharClass) { + CharacterRange::AddClassEscape(char_class, ranges); + } else { + ranges->Add(range); + } +} + + RegExpTree* RegExpParser::ParseCharacterClass() { static const char* kUnterminated = "Unterminated character class"; static const char* kRangeOutOfOrder = "Range out of order in character class"; @@ -4462,12 +4478,8 @@ RegExpTree* RegExpParser::ParseCharacterClass() { } ZoneList* ranges = new ZoneList(2); while (has_more() && current() != ']') { - uc16 char_class = 0; + uc16 char_class = kNoCharClass; CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); - if (char_class) { - CharacterRange::AddClassEscape(char_class, ranges); - continue; - } if (current() == '-') { Advance(); if (current() == kEndMarker) { @@ -4475,15 +4487,17 @@ RegExpTree* RegExpParser::ParseCharacterClass() { // following code report an error. 
break; } else if (current() == ']') { - ranges->Add(first); + AddRangeOrEscape(ranges, char_class, first); ranges->Add(CharacterRange::Singleton('-')); break; } - CharacterRange next = ParseClassAtom(&char_class CHECK_FAILED); - if (char_class) { - ranges->Add(first); + uc16 char_class_2 = kNoCharClass; + CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); + if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { + // Either end is an escaped character class. Treat the '-' verbatim. + AddRangeOrEscape(ranges, char_class, first); ranges->Add(CharacterRange::Singleton('-')); - CharacterRange::AddClassEscape(char_class, ranges); + AddRangeOrEscape(ranges, char_class_2, next); continue; } if (first.from() > next.to()) { @@ -4491,7 +4505,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() { } ranges->Add(CharacterRange::Range(first.from(), next.to())); } else { - ranges->Add(first); + AddRangeOrEscape(ranges, char_class, first); } } if (!has_more()) { diff --git a/test/mjsunit/regexp.js b/test/mjsunit/regexp.js index b57b86d..4c1d2e3 100644 --- a/test/mjsunit/regexp.js +++ b/test/mjsunit/regexp.js @@ -202,6 +202,17 @@ assertFalse(re.test('\n')); assertFalse(re.test('a')); assertFalse(re.test('Z')); +// First - is treated as range operator, second as literal minus. +// This follows the specification in parsing, but doesn't throw on +// the \s at the beginning of the range. +re = /[\s-0-9]/; +assertTrue(re.test(' ')); +assertTrue(re.test('\xA0')); +assertTrue(re.test('-')); +assertTrue(re.test('0')); +assertTrue(re.test('9')); +assertFalse(re.test('1')); + // Test beginning and end of line assertions with or without the // multiline flag. re = /^\d+/; @@ -647,3 +658,4 @@ assertEquals(4, re.exec("zimzamzumba").index); assertEquals(["bc"], re.exec("zimzomzumbc")); assertFalse(re.test("c")); assertFalse(re.test("")); + -- 2.7.4