"malformed_regexp");
return Handle<Object>::null();
}
- RegExpAtom* atom = parse_result.tree->AsAtom();
- if (atom != NULL && !flags.is_ignore_case()) {
- if (parse_result.has_character_escapes) {
- Vector<const uc16> atom_pattern = atom->data();
- Handle<String> atom_string =
- Factory::NewStringFromTwoByte(atom_pattern);
- result = AtomCompile(re, pattern, flags, atom_string);
- } else {
- result = AtomCompile(re, pattern, flags, pattern);
- }
+
+ if (parse_result.simple && !flags.is_ignore_case()) {
+ // Parse-tree is a single atom that is equal to the pattern.
+ result = AtomCompile(re, pattern, flags, pattern);
+ } else if (parse_result.tree->IsAtom() &&
+ !flags.is_ignore_case() &&
+ parse_result.capture_count == 0) {
+ // TODO(lrn): Accept capture_count > 0 on atoms.
+ RegExpAtom* atom = parse_result.tree->AsAtom();
+ Vector<const uc16> atom_pattern = atom->data();
+ Handle<String> atom_string =
+ Factory::NewStringFromTwoByte(atom_pattern);
+ result = AtomCompile(re, pattern, flags, atom_string);
+ } else if (FLAG_irregexp) {
+ result = IrregexpPrepare(re, pattern, flags);
} else {
- if (FLAG_irregexp) {
- result = IrregexpPrepare(re, pattern, flags);
- } else {
- result = JscrePrepare(re, pattern, flags);
- }
+ result = JscrePrepare(re, pattern, flags);
}
Object* data = re->data();
if (data->IsFixedArray()) {
void Advance(int dist);
void Reset(int pos);
- bool HasCharacterEscapes();
+ // Reports whether the pattern might be used as a literal search string.
+ // Only use if the result of the parse is a single atom node.
+ bool simple();
int captures_started() { return captures_ == NULL ? 0 : captures_->length(); }
int position() { return next_pos_ - 1; }
int next_pos_;
FlatStringReader* in_;
Handle<String>* error_;
- bool has_character_escapes_;
+ bool simple_;
ZoneList<RegExpCapture*>* captures_;
bool is_scanned_for_captures_;
// The capture count is only valid after we have scanned for captures.
next_pos_(0),
in_(in),
error_(error),
- has_character_escapes_(false),
+ simple_(true),
captures_(NULL),
is_scanned_for_captures_(false),
capture_count_(0),
}
-// Reports whether the parsed string atoms contain any characters that were
-// escaped in the original pattern. If not, all atoms are proper substrings
-// of the original pattern.
-bool RegExpParser::HasCharacterEscapes() {
- return has_character_escapes_;
+bool RegExpParser::simple() {
+ return simple_;
}
RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
Advance(2);
break;
}
- has_character_escapes_ = true;
+ simple_ = false;
break;
case '{': {
int dummy;
is_greedy = false;
Advance();
}
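+ // A quantified pattern is never a literal. In particular, a quantifier with
+ // a minimum of 0 removes a look-ahead from the tree altogether (for example
+ // "(?=a)?a" parses to 'a'), leaving a lone atom that differs from the
+ // pattern text.
+ simple_ = false;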
builder.AddQuantifierToAtom(min, max, is_greedy);
}
}
// Make sure we have a stack guard.
StackGuard guard;
RegExpParser parser(input, &result->error, multiline);
- result->tree = parser.ParsePattern();
+ RegExpTree* tree = parser.ParsePattern();
if (parser.failed()) {
- ASSERT(result->tree == NULL);
+ ASSERT(tree == NULL);
ASSERT(!result->error.is_null());
} else {
- ASSERT(result->tree != NULL);
+ ASSERT(tree != NULL);
ASSERT(result->error.is_null());
- result->has_character_escapes = parser.HasCharacterEscapes();
- result->capture_count = parser.captures_started();
+ result->tree = tree;
+ int capture_count = parser.captures_started();
+ result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
+ result->capture_count = capture_count;
}
return !parser.failed();
}
return output;
}
-static bool ParseEscapes(const char* input) {
+static bool CheckSimple(const char* input) {
V8::Initialize(NULL);
v8::HandleScope scope;
unibrow::Utf8InputBuffer<> buffer(input, strlen(input));
CHECK(v8::internal::ParseRegExp(&reader, false, &result));
CHECK(result.tree != NULL);
CHECK(result.error.is_null());
- return result.has_character_escapes;
+ return result.simple;
}
#define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input))
-#define CHECK_ESCAPES(input, has_escapes) CHECK_EQ(has_escapes, \
- ParseEscapes(input));
+// Checks that CheckSimple(input) yields `simple`. Like CHECK_PARSE_EQ, the
+// expansion carries no trailing semicolon; the call site supplies it.
+#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input))
TEST(Parser) {
V8::Initialize(NULL);
CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))");
CHECK_PARSE_EQ("(a\\1)", "(^ 'a')");
CHECK_PARSE_EQ("(\\1a)", "(^ 'a')");
+ CHECK_PARSE_EQ("(?=a)?a", "'a'");
+ CHECK_PARSE_EQ("(?=a){0,10}a", "'a'");
+ CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')");
+ CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')");
+ CHECK_PARSE_EQ("(?!a)?a", "'a'");
CHECK_PARSE_EQ("\\1(a)", "(^ 'a')");
CHECK_PARSE_EQ("(?!(a))\\1", "(-> - (^ 'a'))");
CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: (^ 'a') (<- 1)))");
CHECK_PARSE_EQ("\\u003z", "'u003z'");
CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))");
- CHECK_ESCAPES("a", false);
- CHECK_ESCAPES("a|b", false);
- CHECK_ESCAPES("a\\n", true);
- CHECK_ESCAPES("^a", false);
- CHECK_ESCAPES("a$", false);
- CHECK_ESCAPES("a\\b!", false);
- CHECK_ESCAPES("a\\Bb", false);
- CHECK_ESCAPES("a*", false);
- CHECK_ESCAPES("a*?", false);
- CHECK_ESCAPES("a?", false);
- CHECK_ESCAPES("a??", false);
- CHECK_ESCAPES("a{0,1}?", false);
- CHECK_ESCAPES("a{1,1}?", false);
- CHECK_ESCAPES("a{1,2}?", false);
- CHECK_ESCAPES("a+?", false);
- CHECK_ESCAPES("(a)", false);
- CHECK_ESCAPES("(a)\\1", false);
- CHECK_ESCAPES("(\\1a)", false);
- CHECK_ESCAPES("\\1(a)", false);
- CHECK_ESCAPES("a\\s", false);
- CHECK_ESCAPES("a\\S", false);
- CHECK_ESCAPES("a\\d", false);
- CHECK_ESCAPES("a\\D", false);
- CHECK_ESCAPES("a\\w", false);
- CHECK_ESCAPES("a\\W", false);
- CHECK_ESCAPES("a.", false);
- CHECK_ESCAPES("a\\q", true);
- CHECK_ESCAPES("a[a]", false);
- CHECK_ESCAPES("a[^a]", false);
- CHECK_ESCAPES("a[a-z]", false);
- CHECK_ESCAPES("a[\\q]", false);
- CHECK_ESCAPES("a(?:b)", false);
- CHECK_ESCAPES("a(?=b)", false);
- CHECK_ESCAPES("a(?!b)", false);
- CHECK_ESCAPES("\\x60", true);
- CHECK_ESCAPES("\\u0060", true);
- CHECK_ESCAPES("\\cA", true);
- CHECK_ESCAPES("\\q", true);
- CHECK_ESCAPES("\\1112", true);
- CHECK_ESCAPES("\\0", true);
- CHECK_ESCAPES("(a)\\1", false);
+ CHECK_SIMPLE("a", true);
+ CHECK_SIMPLE("a|b", false);
+ CHECK_SIMPLE("a\\n", false);
+ CHECK_SIMPLE("^a", false);
+ CHECK_SIMPLE("a$", false);
+ CHECK_SIMPLE("a\\b!", false);
+ CHECK_SIMPLE("a\\Bb", false);
+ CHECK_SIMPLE("a*", false);
+ CHECK_SIMPLE("a*?", false);
+ CHECK_SIMPLE("a?", false);
+ CHECK_SIMPLE("a??", false);
+ CHECK_SIMPLE("a{0,1}?", false);
+ CHECK_SIMPLE("a{1,1}?", false);
+ CHECK_SIMPLE("a{1,2}?", false);
+ CHECK_SIMPLE("a+?", false);
+ CHECK_SIMPLE("(a)", false);
+ CHECK_SIMPLE("(a)\\1", false);
+ CHECK_SIMPLE("(\\1a)", false);
+ CHECK_SIMPLE("\\1(a)", false);
+ CHECK_SIMPLE("a\\s", false);
+ CHECK_SIMPLE("a\\S", false);
+ CHECK_SIMPLE("a\\d", false);
+ CHECK_SIMPLE("a\\D", false);
+ CHECK_SIMPLE("a\\w", false);
+ CHECK_SIMPLE("a\\W", false);
+ CHECK_SIMPLE("a.", false);
+ CHECK_SIMPLE("a\\q", false);
+ CHECK_SIMPLE("a[a]", false);
+ CHECK_SIMPLE("a[^a]", false);
+ CHECK_SIMPLE("a[a-z]", false);
+ CHECK_SIMPLE("a[\\q]", false);
+ CHECK_SIMPLE("a(?:b)", false);
+ CHECK_SIMPLE("a(?=b)", false);
+ CHECK_SIMPLE("a(?!b)", false);
+ CHECK_SIMPLE("\\x60", false);
+ CHECK_SIMPLE("\\u0060", false);
+ CHECK_SIMPLE("\\cA", false);
+ CHECK_SIMPLE("\\q", false);
+ CHECK_SIMPLE("\\1112", false);
+ CHECK_SIMPLE("\\0", false);
+ CHECK_SIMPLE("(a)\\1", false);
+ CHECK_SIMPLE("(?=a)?a", false);
+ CHECK_SIMPLE("(?!a)?a\\1", false);
+ CHECK_SIMPLE("(?:(?=a))a\\1", false);
CHECK_PARSE_EQ("a{}", "'a{}'");
CHECK_PARSE_EQ("a{,}", "'a{,}'");