From 09e3c76137e76aa7359677e7bfe147958e3b44a8 Mon Sep 17 00:00:00 2001 From: "lrn@chromium.org" Date: Fri, 12 Dec 2008 10:22:56 +0000 Subject: [PATCH] Quantified look-aheads are sometimes removed entirely, leaving only a single atom node. A flag was not set in this case, leading the wrapper code to think the pattern was equal to the atom and use the pattern in the indexOf operation. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@971 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/jsregexp.cc | 31 ++++++------ src/jsregexp.h | 4 +- src/parser.cc | 30 ++++++------ test/cctest/test-regexp.cc | 97 ++++++++++++++++++++------------------ 4 files changed, 86 insertions(+), 76 deletions(-) diff --git a/src/jsregexp.cc b/src/jsregexp.cc index b6165c482..c93cae98d 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -270,22 +270,23 @@ Handle RegExpImpl::Compile(Handle re, "malformed_regexp"); return Handle::null(); } - RegExpAtom* atom = parse_result.tree->AsAtom(); - if (atom != NULL && !flags.is_ignore_case()) { - if (parse_result.has_character_escapes) { - Vector atom_pattern = atom->data(); - Handle atom_string = - Factory::NewStringFromTwoByte(atom_pattern); - result = AtomCompile(re, pattern, flags, atom_string); - } else { - result = AtomCompile(re, pattern, flags, pattern); - } + + if (parse_result.simple && !flags.is_ignore_case()) { + // Parse-tree is a single atom that is equal to the pattern. + result = AtomCompile(re, pattern, flags, pattern); + } else if (parse_result.tree->IsAtom() && + !flags.is_ignore_case() && + parse_result.capture_count == 0) { + // TODO(lrn) Accept capture_count > 0 on atoms. + RegExpAtom* atom = parse_result.tree->AsAtom(); + Vector atom_pattern = atom->data(); + Handle atom_string = + Factory::NewStringFromTwoByte(atom_pattern); + result = AtomCompile(re, pattern, flags, atom_string); + } else if (FLAG_irregexp) { + result = IrregexpPrepare(re, pattern, flags); } else { - if (FLAG_irregexp) { - result = IrregexpPrepare(re, pattern, flags); - } else { - result = JscrePrepare(re, pattern, flags); - } + result = JscrePrepare(re, pattern, flags); } Object* data = re->data(); if (data->IsFixedArray()) { diff --git a/src/jsregexp.h b/src/jsregexp.h index dbeb6e23e..32cf288f9 100644 --- a/src/jsregexp.h +++ b/src/jsregexp.h @@ -1117,12 +1117,12 @@ struct RegExpCompileData { : tree(NULL), node(NULL), has_lookbehind(false), - has_character_escapes(false), + simple(true), capture_count(0) { } RegExpTree* tree; RegExpNode* node; bool has_lookbehind; - bool has_character_escapes; + bool simple; Handle error; int capture_count; }; diff --git a/src/parser.cc b/src/parser.cc index 7236cb42c..6c9161858 100644 --- a/src/parser.cc +++ b/src/parser.cc @@ -527,7 +527,9 @@ class RegExpParser { void Advance(int dist); void Reset(int pos); - bool HasCharacterEscapes(); + // Reports whether the pattern might be used as a literal search string. + // Only use if the result of the parse is a single atom node. + bool simple(); int captures_started() { return captures_ == NULL ? 0 : captures_->length(); } int position() { return next_pos_ - 1; } @@ -548,7 +550,7 @@ class RegExpParser { int next_pos_; FlatStringReader* in_; Handle* error_; - bool has_character_escapes_; + bool simple_; ZoneList* captures_; bool is_scanned_for_captures_; // The capture count is only valid after we have scanned for captures. @@ -3502,7 +3504,7 @@ RegExpParser::RegExpParser(FlatStringReader* in, next_pos_(0), in_(in), error_(error), - has_character_escapes_(false), + simple_(true), captures_(NULL), is_scanned_for_captures_(false), capture_count_(0), @@ -3550,11 +3552,8 @@ void RegExpParser::Advance(int dist) { } -// Reports whether the parsed string atoms contain any characters that were -// escaped in the original pattern. If not, all atoms are proper substrings -// of the original pattern. -bool RegExpParser::HasCharacterEscapes() { - return has_character_escapes_; +bool RegExpParser::simple() { + return simple_; } RegExpTree* RegExpParser::ReportError(Vector message) { @@ -3769,7 +3768,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { Advance(2); break; } - has_character_escapes_ = true; + simple_ = false; break; case '{': { int dummy; @@ -3822,6 +3821,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { is_greedy = false; Advance(); } + simple_ = false; // Adding quantifier might *remove* look-ahead. builder.AddQuantifierToAtom(min, max, is_greedy); } } @@ -4307,15 +4307,17 @@ bool ParseRegExp(FlatStringReader* input, // Make sure we have a stack guard. StackGuard guard; RegExpParser parser(input, &result->error, multiline); - result->tree = parser.ParsePattern(); + RegExpTree* tree = parser.ParsePattern(); if (parser.failed()) { - ASSERT(result->tree == NULL); + ASSERT(tree == NULL); ASSERT(!result->error.is_null()); } else { - ASSERT(result->tree != NULL); + ASSERT(tree != NULL); ASSERT(result->error.is_null()); - result->has_character_escapes = parser.HasCharacterEscapes(); - result->capture_count = parser.captures_started(); + result->tree = tree; + int capture_count = parser.captures_started(); + result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; + result->capture_count = capture_count; } return !parser.failed(); } diff --git a/test/cctest/test-regexp.cc b/test/cctest/test-regexp.cc index 782bb1168..998f113d9 100644 --- a/test/cctest/test-regexp.cc +++ b/test/cctest/test-regexp.cc @@ -63,7 +63,7 @@ static SmartPointer Parse(const char* input) { return output; } -static bool ParseEscapes(const char* input) { +static bool CheckSimple(const char* input) { V8::Initialize(NULL); v8::HandleScope scope; unibrow::Utf8InputBuffer<> buffer(input, strlen(input)); @@ -73,13 +73,12 @@ static bool ParseEscapes(const char* input) { CHECK(v8::internal::ParseRegExp(&reader, false, &result)); CHECK(result.tree != NULL); CHECK(result.error.is_null()); - return result.has_character_escapes; + return result.simple; } #define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input)) -#define CHECK_ESCAPES(input, has_escapes) CHECK_EQ(has_escapes, \ - ParseEscapes(input)); +#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input)); TEST(Parser) { V8::Initialize(NULL); @@ -168,6 +167,11 @@ TEST(Parser) { CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))"); CHECK_PARSE_EQ("(a\\1)", "(^ 'a')"); CHECK_PARSE_EQ("(\\1a)", "(^ 'a')"); + CHECK_PARSE_EQ("(?=a)?a", "'a'"); + CHECK_PARSE_EQ("(?=a){0,10}a", "'a'"); + CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')"); + CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')"); + CHECK_PARSE_EQ("(?!a)?a", "'a'"); CHECK_PARSE_EQ("\\1(a)", "(^ 'a')"); CHECK_PARSE_EQ("(?!(a))\\1", "(-> - (^ 'a'))"); CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: (^ 'a') (<- 1)))"); @@ -186,47 +190,50 @@ TEST(Parser) { CHECK_PARSE_EQ("\\u003z", "'u003z'"); CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))"); - CHECK_ESCAPES("a", false); - CHECK_ESCAPES("a|b", false); - CHECK_ESCAPES("a\\n", true); - CHECK_ESCAPES("^a", false); - CHECK_ESCAPES("a$", false); - CHECK_ESCAPES("a\\b!", false); - CHECK_ESCAPES("a\\Bb", false); - CHECK_ESCAPES("a*", false); - CHECK_ESCAPES("a*?", false); - CHECK_ESCAPES("a?", false); - CHECK_ESCAPES("a??", false); - CHECK_ESCAPES("a{0,1}?", false); - CHECK_ESCAPES("a{1,1}?", false); - CHECK_ESCAPES("a{1,2}?", false); - CHECK_ESCAPES("a+?", false); - CHECK_ESCAPES("(a)", false); - CHECK_ESCAPES("(a)\\1", false); - CHECK_ESCAPES("(\\1a)", false); - CHECK_ESCAPES("\\1(a)", false); - CHECK_ESCAPES("a\\s", false); - CHECK_ESCAPES("a\\S", false); - CHECK_ESCAPES("a\\d", false); - CHECK_ESCAPES("a\\D", false); - CHECK_ESCAPES("a\\w", false); - CHECK_ESCAPES("a\\W", false); - CHECK_ESCAPES("a.", false); - CHECK_ESCAPES("a\\q", true); - CHECK_ESCAPES("a[a]", false); - CHECK_ESCAPES("a[^a]", false); - CHECK_ESCAPES("a[a-z]", false); - CHECK_ESCAPES("a[\\q]", false); - CHECK_ESCAPES("a(?:b)", false); - CHECK_ESCAPES("a(?=b)", false); - CHECK_ESCAPES("a(?!b)", false); - CHECK_ESCAPES("\\x60", true); - CHECK_ESCAPES("\\u0060", true); - CHECK_ESCAPES("\\cA", true); - CHECK_ESCAPES("\\q", true); - CHECK_ESCAPES("\\1112", true); - CHECK_ESCAPES("\\0", true); - CHECK_ESCAPES("(a)\\1", false); + CHECK_SIMPLE("a", true); + CHECK_SIMPLE("a|b", false); + CHECK_SIMPLE("a\\n", false); + CHECK_SIMPLE("^a", false); + CHECK_SIMPLE("a$", false); + CHECK_SIMPLE("a\\b!", false); + CHECK_SIMPLE("a\\Bb", false); + CHECK_SIMPLE("a*", false); + CHECK_SIMPLE("a*?", false); + CHECK_SIMPLE("a?", false); + CHECK_SIMPLE("a??", false); + CHECK_SIMPLE("a{0,1}?", false); + CHECK_SIMPLE("a{1,1}?", false); + CHECK_SIMPLE("a{1,2}?", false); + CHECK_SIMPLE("a+?", false); + CHECK_SIMPLE("(a)", false); + CHECK_SIMPLE("(a)\\1", false); + CHECK_SIMPLE("(\\1a)", false); + CHECK_SIMPLE("\\1(a)", false); + CHECK_SIMPLE("a\\s", false); + CHECK_SIMPLE("a\\S", false); + CHECK_SIMPLE("a\\d", false); + CHECK_SIMPLE("a\\D", false); + CHECK_SIMPLE("a\\w", false); + CHECK_SIMPLE("a\\W", false); + CHECK_SIMPLE("a.", false); + CHECK_SIMPLE("a\\q", false); + CHECK_SIMPLE("a[a]", false); + CHECK_SIMPLE("a[^a]", false); + CHECK_SIMPLE("a[a-z]", false); + CHECK_SIMPLE("a[\\q]", false); + CHECK_SIMPLE("a(?:b)", false); + CHECK_SIMPLE("a(?=b)", false); + CHECK_SIMPLE("a(?!b)", false); + CHECK_SIMPLE("\\x60", false); + CHECK_SIMPLE("\\u0060", false); + CHECK_SIMPLE("\\cA", false); + CHECK_SIMPLE("\\q", false); + CHECK_SIMPLE("\\1112", false); + CHECK_SIMPLE("\\0", false); + CHECK_SIMPLE("(a)\\1", false); + CHECK_SIMPLE("(?=a)?a", false); + CHECK_SIMPLE("(?!a)?a\\1", false); + CHECK_SIMPLE("(?:(?=a))a\\1", false); CHECK_PARSE_EQ("a{}", "'a{}'"); CHECK_PARSE_EQ("a{,}", "'a{,}'"); -- 2.34.1