From 17362b27eae590091552725a46f2adaf6d348a53 Mon Sep 17 00:00:00 2001 From: "lrn@chromium.org" Date: Fri, 3 Jul 2009 08:18:35 +0000 Subject: [PATCH] Changed RegExp parser to use a recursive data structure instead of stack-based recursion. Shouldn't run out of stack space while parsing deeply nested regexps. Might be a little faster. Review URL: http://codereview.chromium.org/149069 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@2345 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/ast.h | 13 +-- src/parser.cc | 275 +++++++++++++++++++++++++-------------------- test/cctest/test-regexp.cc | 4 +- 3 files changed, 157 insertions(+), 135 deletions(-) diff --git a/src/ast.h b/src/ast.h index 15d762f..64d61cc 100644 --- a/src/ast.h +++ b/src/ast.h @@ -1575,16 +1575,10 @@ class RegExpQuantifier: public RegExpTree { }; -enum CaptureAvailability { - CAPTURE_AVAILABLE, - CAPTURE_UNREACHABLE, - CAPTURE_PERMANENTLY_UNREACHABLE -}; - class RegExpCapture: public RegExpTree { public: explicit RegExpCapture(RegExpTree* body, int index) - : body_(body), index_(index), available_(CAPTURE_AVAILABLE) { } + : body_(body), index_(index) { } virtual void* Accept(RegExpVisitor* visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); @@ -1600,16 +1594,11 @@ class RegExpCapture: public RegExpTree { virtual int max_match() { return body_->max_match(); } RegExpTree* body() { return body_; } int index() { return index_; } - inline CaptureAvailability available() { return available_; } - inline void set_available(CaptureAvailability availability) { - available_ = availability; - } static int StartRegister(int index) { return index * 2; } static int EndRegister(int index) { return index * 2 + 1; } private: RegExpTree* body_; int index_; - CaptureAvailability available_; }; diff --git a/src/parser.cc b/src/parser.cc index 2b4be79..8883208 100644 --- a/src/parser.cc +++ b/src/parser.cc @@ -361,7 +361,7 @@ class BufferedZoneList { }; // Accumulates RegExp atoms and assertions into lists of terms and alternatives. -class RegExpBuilder { +class RegExpBuilder: public ZoneObject { public: RegExpBuilder(); void AddCharacter(uc16 character); @@ -392,7 +392,10 @@ class RegExpBuilder { RegExpBuilder::RegExpBuilder() - : pending_empty_(false), characters_(NULL), terms_(), alternatives_() + : pending_empty_(false), + characters_(NULL), + terms_(), + alternatives_() #ifdef DEBUG , last_added_(ADD_NONE) #endif @@ -594,6 +597,44 @@ class RegExpParser { static const int kMaxCaptures = 1 << 16; static const uc32 kEndMarker = (1 << 21); private: + enum SubexpressionType { + INITIAL, + CAPTURE, // All positive values represent captures. + POSITIVE_LOOKAHEAD, + NEGATIVE_LOOKAHEAD, + GROUPING + }; + + class RegExpParserState : public ZoneObject { + public: + RegExpParserState(RegExpParserState* previous_state, + SubexpressionType group_type, + int disjunction_capture_index) + : previous_state_(previous_state), + builder_(new RegExpBuilder()), + group_type_(group_type), + disjunction_capture_index_(disjunction_capture_index) {} + // Parser state of containing expression, if any. + RegExpParserState* previous_state() { return previous_state_; } + bool IsSubexpression() { return previous_state_ != NULL; } + // RegExpBuilder building this regexp's AST. + RegExpBuilder* builder() { return builder_; } + // Type of regexp being parsed (parenthesized group or entire regexp). + SubexpressionType group_type() { return group_type_; } + // Index in captures array of first capture in this sub-expression, if any. + // Also the capture index of this sub-expression itself, if group_type + // is CAPTURE. + int capture_index() { return disjunction_capture_index_; } + private: + // Linked list implementation of stack of states. + RegExpParserState* previous_state_; + // Builder for the stored disjunction. + RegExpBuilder* builder_; + // Stored disjunction type (capture, look-ahead or grouping), if any. + SubexpressionType group_type_; + // Stored disjunction's capture index (if any). + int disjunction_capture_index_; + }; uc32 current() { return current_; } bool has_more() { return has_more_; } @@ -601,7 +642,6 @@ class RegExpParser { uc32 Next(); FlatStringReader* in() { return in_; } void ScanForCaptures(); - bool CaptureAvailable(int index); uc32 current_; bool has_more_; bool multiline_; @@ -3820,14 +3860,6 @@ RegExpTree* RegExpParser::ParsePattern() { } -bool RegExpParser::CaptureAvailable(int index) { - if (captures_ == NULL) return false; - if (index >= captures_->length()) return false; - RegExpCapture* capture = captures_->at(index); - return capture != NULL && capture->available() == CAPTURE_AVAILABLE; -} - - // Disjunction :: // Alternative // Alternative | Disjunction @@ -3839,24 +3871,60 @@ bool RegExpParser::CaptureAvailable(int index) { // Atom // Atom Quantifier RegExpTree* RegExpParser::ParseDisjunction() { - RegExpBuilder builder; - int capture_start_index = captures_started(); + // Used to store current state while parsing subexpressions. + RegExpParserState initial_state(NULL, INITIAL, 0); + RegExpParserState* stored_state = &initial_state; + // Cache the builder in a local variable for quick access. + RegExpBuilder* builder = initial_state.builder(); while (true) { switch (current()) { case kEndMarker: - case ')': - return builder.ToRegExp(); - case '|': { + if (stored_state->IsSubexpression()) { + // Inside a parenthesized group when hitting end of input. + ReportError(CStrVector("Unterminated group") CHECK_FAILED); + } + ASSERT_EQ(INITIAL, stored_state->group_type()); + // Parsing completed successfully. + return builder->ToRegExp(); + case ')': { + if (!stored_state->IsSubexpression()) { + ReportError(CStrVector("Unexpected ')'") CHECK_FAILED); + } + ASSERT_NE(INITIAL, stored_state->group_type()); + Advance(); - builder.NewAlternative(); - int capture_new_alt_start_index = captures_started(); - for (int i = capture_start_index; i < capture_new_alt_start_index; i++) { - RegExpCapture* capture = captures_->at(i); - if (capture->available() == CAPTURE_AVAILABLE) { - capture->set_available(CAPTURE_UNREACHABLE); - } + // End disjunction parsing and convert builder content to new single + // regexp atom. + RegExpTree* body = builder->ToRegExp(); + + int end_capture_index = captures_started(); + + int capture_index = stored_state->capture_index(); + SubexpressionType type = stored_state->group_type(); + + // Restore previous state. + stored_state = stored_state->previous_state(); + builder = stored_state->builder(); + + // Build result of subexpression. + if (type == CAPTURE) { + RegExpCapture* capture = new RegExpCapture(body, capture_index); + captures_->at(capture_index - 1) = capture; + body = capture; + } else if (type != GROUPING) { + ASSERT(type == POSITIVE_LOOKAHEAD || type == NEGATIVE_LOOKAHEAD); + bool is_positive = (type == POSITIVE_LOOKAHEAD); + body = new RegExpLookahead(body, + is_positive, + end_capture_index - capture_index, + capture_index); } - capture_start_index = capture_new_alt_start_index; + builder->AddAtom(body); + break; + } + case '|': { + Advance(); + builder->NewAlternative(); continue; } case '*': @@ -3866,10 +3934,10 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '^': { Advance(); if (multiline_) { - builder.AddAssertion( + builder->AddAssertion( new RegExpAssertion(RegExpAssertion::START_OF_LINE)); } else { - builder.AddAssertion( + builder->AddAssertion( new RegExpAssertion(RegExpAssertion::START_OF_INPUT)); set_contains_anchor(); } @@ -3880,7 +3948,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { RegExpAssertion::Type type = multiline_ ? RegExpAssertion::END_OF_LINE : RegExpAssertion::END_OF_INPUT; - builder.AddAssertion(new RegExpAssertion(type)); + builder->AddAssertion(new RegExpAssertion(type)); continue; } case '.': { @@ -3889,17 +3957,47 @@ RegExpTree* RegExpParser::ParseDisjunction() { ZoneList* ranges = new ZoneList(2); CharacterRange::AddClassEscape('.', ranges); RegExpTree* atom = new RegExpCharacterClass(ranges, false); - builder.AddAtom(atom); + builder->AddAtom(atom); break; } case '(': { - RegExpTree* atom = ParseGroup(CHECK_FAILED); - builder.AddAtom(atom); + SubexpressionType type = CAPTURE; + Advance(); + if (current() == '?') { + switch (Next()) { + case ':': + type = GROUPING; + break; + case '=': + type = POSITIVE_LOOKAHEAD; + break; + case '!': + type = NEGATIVE_LOOKAHEAD; + break; + default: + ReportError(CStrVector("Invalid group") CHECK_FAILED); + break; + } + Advance(2); + } else { + if (captures_ == NULL) { + captures_ = new ZoneList(2); + } + if (captures_started() >= kMaxCaptures) { + ReportError(CStrVector("Too many captures") CHECK_FAILED); + } + captures_->Add(NULL); + } + // Store current state and begin new disjunction parsing. + stored_state = new RegExpParserState(stored_state, + type, + captures_started()); + builder = stored_state->builder(); break; } case '[': { RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); - builder.AddAtom(atom); + builder->AddAtom(atom); break; } // Atom :: @@ -3910,12 +4008,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { ReportError(CStrVector("\\ at end of pattern") CHECK_FAILED); case 'b': Advance(2); - builder.AddAssertion( + builder->AddAssertion( new RegExpAssertion(RegExpAssertion::BOUNDARY)); continue; case 'B': Advance(2); - builder.AddAssertion( + builder->AddAssertion( new RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); continue; // AtomEscape :: @@ -3929,27 +4027,29 @@ RegExpTree* RegExpParser::ParseDisjunction() { ZoneList* ranges = new ZoneList(2); CharacterRange::AddClassEscape(c, ranges); RegExpTree* atom = new RegExpCharacterClass(ranges, false); - builder.AddAtom(atom); + builder->AddAtom(atom); goto has_read_atom; // Avoid setting has_character_escapes_. } case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { int index = 0; if (ParseBackReferenceIndex(&index)) { - if (!CaptureAvailable(index - 1)) { - // Prepare to ignore a following quantifier - builder.AddEmpty(); + RegExpCapture* capture = NULL; + if (captures_ != NULL && index <= captures_->length()) { + capture = captures_->at(index - 1); + } + if (capture == NULL) { + builder->AddEmpty(); goto has_read_atom; } - RegExpCapture* capture = captures_->at(index - 1); RegExpTree* atom = new RegExpBackReference(capture); - builder.AddAtom(atom); + builder->AddAtom(atom); goto has_read_atom; // Avoid setting has_character_escapes_. } uc32 first_digit = Next(); if (first_digit == '8' || first_digit == '9') { // Treat as identity escape - builder.AddCharacter(first_digit); + builder->AddCharacter(first_digit); Advance(2); break; } @@ -3958,44 +4058,44 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '0': { Advance(); uc32 octal = ParseOctalLiteral(); - builder.AddCharacter(octal); + builder->AddCharacter(octal); break; } // ControlEscape :: one of // f n r t v case 'f': Advance(2); - builder.AddCharacter('\f'); + builder->AddCharacter('\f'); break; case 'n': Advance(2); - builder.AddCharacter('\n'); + builder->AddCharacter('\n'); break; case 'r': Advance(2); - builder.AddCharacter('\r'); + builder->AddCharacter('\r'); break; case 't': Advance(2); - builder.AddCharacter('\t'); + builder->AddCharacter('\t'); break; case 'v': Advance(2); - builder.AddCharacter('\v'); + builder->AddCharacter('\v'); break; case 'c': { Advance(2); uc32 control = ParseControlLetterEscape(); - builder.AddCharacter(control); + builder->AddCharacter(control); break; } case 'x': { Advance(2); uc32 value; if (ParseHexEscape(2, &value)) { - builder.AddCharacter(value); + builder->AddCharacter(value); } else { - builder.AddCharacter('x'); + builder->AddCharacter('x'); } break; } @@ -4003,15 +4103,15 @@ RegExpTree* RegExpParser::ParseDisjunction() { Advance(2); uc32 value; if (ParseHexEscape(4, &value)) { - builder.AddCharacter(value); + builder->AddCharacter(value); } else { - builder.AddCharacter('u'); + builder->AddCharacter('u'); } break; } default: // Identity escape. - builder.AddCharacter(Next()); + builder->AddCharacter(Next()); Advance(2); break; } @@ -4024,7 +4124,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { // fallthrough } default: - builder.AddCharacter(current()); + builder->AddCharacter(current()); Advance(); break; } // end switch(current()) @@ -4071,7 +4171,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { is_greedy = false; Advance(); } - builder.AddQuantifierToAtom(min, max, is_greedy); + builder->AddQuantifierToAtom(min, max, is_greedy); } } @@ -4382,73 +4482,6 @@ uc32 RegExpParser::ParseClassCharacterEscape() { } -RegExpTree* RegExpParser::ParseGroup() { - ASSERT_EQ(current(), '('); - char type = '('; - Advance(); - if (current() == '?') { - switch (Next()) { - case ':': case '=': case '!': - type = Next(); - Advance(2); - break; - default: - ReportError(CStrVector("Invalid group") CHECK_FAILED); - break; - } - } else { - if (captures_ == NULL) { - captures_ = new ZoneList(2); - } - if (captures_started() >= kMaxCaptures) { - ReportError(CStrVector("Too many captures") CHECK_FAILED); - } - captures_->Add(NULL); - } - int capture_index = captures_started(); - RegExpTree* body = ParseDisjunction(CHECK_FAILED); - if (current() != ')') { - ReportError(CStrVector("Unterminated group") CHECK_FAILED); - } - Advance(); - - int end_capture_index = captures_started(); - if (type == '!') { - // Captures inside a negative lookahead are never available outside it. - for (int i = capture_index; i < end_capture_index; i++) { - RegExpCapture* capture = captures_->at(i); - ASSERT(capture != NULL); - capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE); - } - } else { - // Captures temporarily unavailable because they are in different - // alternatives are all available after the disjunction. - for (int i = capture_index; i < end_capture_index; i++) { - RegExpCapture* capture = captures_->at(i); - ASSERT(capture != NULL); - if (capture->available() == CAPTURE_UNREACHABLE) { - capture->set_available(CAPTURE_AVAILABLE); - } - } - } - - if (type == '(') { - RegExpCapture* capture = new RegExpCapture(body, capture_index); - captures_->at(capture_index - 1) = capture; - return capture; - } else if (type == ':') { - return body; - } else { - ASSERT(type == '=' || type == '!'); - bool is_positive = (type == '='); - return new RegExpLookahead(body, - is_positive, - end_capture_index - capture_index, - capture_index); - } -} - - CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { ASSERT_EQ(0, *char_class); uc32 first = current(); diff --git a/test/cctest/test-regexp.cc b/test/cctest/test-regexp.cc index 8761cf5..62597fb 100644 --- a/test/cctest/test-regexp.cc +++ b/test/cctest/test-regexp.cc @@ -204,8 +204,8 @@ TEST(Parser) { CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')"); CHECK_PARSE_EQ("(?!a)?a", "'a'"); CHECK_PARSE_EQ("\\1(a)", "(^ 'a')"); - CHECK_PARSE_EQ("(?!(a))\\1", "(-> - (^ 'a'))"); - CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: (^ 'a') (<- 1)))"); + CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))"); + CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))"); CHECK_PARSE_EQ("[\\0]", "[\\x00]"); CHECK_PARSE_EQ("[\\11]", "[\\x09]"); CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]"); -- 2.7.4