From 17362b27eae590091552725a46f2adaf6d348a53 Mon Sep 17 00:00:00 2001
From: "lrn@chromium.org"
 <lrn@chromium.org@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Date: Fri, 3 Jul 2009 08:18:35 +0000
Subject: [PATCH] Changed RegExp parser to use a recursive data structure
 instead of stack-based recursion. Shouldn't run out of stack space while
 parsing deeply nested regexps. Might be a little faster.

Review URL: http://codereview.chromium.org/149069


git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@2345 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
---
 src/ast.h                  |  13 +--
 src/parser.cc              | 275 +++++++++++++++++++++++++--------------------
 test/cctest/test-regexp.cc |   4 +-
 3 files changed, 157 insertions(+), 135 deletions(-)

diff --git a/src/ast.h b/src/ast.h
index 15d762f..64d61cc 100644
--- a/src/ast.h
+++ b/src/ast.h
@@ -1575,16 +1575,10 @@ class RegExpQuantifier: public RegExpTree {
 };
 
 
-enum CaptureAvailability {
-  CAPTURE_AVAILABLE,
-  CAPTURE_UNREACHABLE,
-  CAPTURE_PERMANENTLY_UNREACHABLE
-};
-
 class RegExpCapture: public RegExpTree {
  public:
   explicit RegExpCapture(RegExpTree* body, int index)
-      : body_(body), index_(index), available_(CAPTURE_AVAILABLE) { }
+      : body_(body), index_(index) { }
   virtual void* Accept(RegExpVisitor* visitor, void* data);
   virtual RegExpNode* ToNode(RegExpCompiler* compiler,
                              RegExpNode* on_success);
@@ -1600,16 +1594,11 @@ class RegExpCapture: public RegExpTree {
   virtual int max_match() { return body_->max_match(); }
   RegExpTree* body() { return body_; }
   int index() { return index_; }
-  inline CaptureAvailability available() { return available_; }
-  inline void set_available(CaptureAvailability availability) {
-    available_ = availability;
-  }
   static int StartRegister(int index) { return index * 2; }
   static int EndRegister(int index) { return index * 2 + 1; }
  private:
   RegExpTree* body_;
   int index_;
-  CaptureAvailability available_;
 };
 
 
diff --git a/src/parser.cc b/src/parser.cc
index 2b4be79..8883208 100644
--- a/src/parser.cc
+++ b/src/parser.cc
@@ -361,7 +361,7 @@ class BufferedZoneList {
 };
 
 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
-class RegExpBuilder {
+class RegExpBuilder: public ZoneObject {
  public:
   RegExpBuilder();
   void AddCharacter(uc16 character);
@@ -392,7 +392,10 @@ class RegExpBuilder {
 
 
 RegExpBuilder::RegExpBuilder()
-  : pending_empty_(false), characters_(NULL), terms_(), alternatives_()
+  : pending_empty_(false),
+    characters_(NULL),
+    terms_(),
+    alternatives_()
 #ifdef DEBUG
   , last_added_(ADD_NONE)
 #endif
@@ -594,6 +597,44 @@ class RegExpParser {
   static const int kMaxCaptures = 1 << 16;
   static const uc32 kEndMarker = (1 << 21);
  private:
+  enum SubexpressionType {
+    INITIAL,
+    CAPTURE,  // All positive values represent captures.
+    POSITIVE_LOOKAHEAD,
+    NEGATIVE_LOOKAHEAD,
+    GROUPING
+  };
+
+  class RegExpParserState : public ZoneObject {
+   public:
+    RegExpParserState(RegExpParserState* previous_state,
+                      SubexpressionType group_type,
+                      int disjunction_capture_index)
+        : previous_state_(previous_state),
+          builder_(new RegExpBuilder()),
+          group_type_(group_type),
+          disjunction_capture_index_(disjunction_capture_index) {}
+    // Parser state of containing expression, if any.
+    RegExpParserState* previous_state() { return previous_state_; }
+    bool IsSubexpression() { return previous_state_ != NULL; }
+    // RegExpBuilder building this regexp's AST.
+    RegExpBuilder* builder() { return builder_; }
+    // Type of regexp being parsed (parenthesized group or entire regexp).
+    SubexpressionType group_type() { return group_type_; }
+    // Index in captures array of first capture in this sub-expression, if any.
+    // Also the capture index of this sub-expression itself, if group_type
+    // is CAPTURE.
+    int capture_index() { return disjunction_capture_index_; }
+   private:
+    // Linked list implementation of stack of states.
+    RegExpParserState* previous_state_;
+    // Builder for the stored disjunction.
+    RegExpBuilder* builder_;
+    // Stored disjunction type (capture, look-ahead or grouping), if any.
+    SubexpressionType group_type_;
+    // Stored disjunction's capture index (if any).
+    int disjunction_capture_index_;
+  };
 
   uc32 current() { return current_; }
   bool has_more() { return has_more_; }
@@ -601,7 +642,6 @@ class RegExpParser {
   uc32 Next();
   FlatStringReader* in() { return in_; }
   void ScanForCaptures();
-  bool CaptureAvailable(int index);
   uc32 current_;
   bool has_more_;
   bool multiline_;
@@ -3820,14 +3860,6 @@ RegExpTree* RegExpParser::ParsePattern() {
 }
 
 
-bool RegExpParser::CaptureAvailable(int index) {
-  if (captures_ == NULL) return false;
-  if (index >= captures_->length()) return false;
-  RegExpCapture* capture = captures_->at(index);
-  return capture != NULL && capture->available() == CAPTURE_AVAILABLE;
-}
-
-
 // Disjunction ::
 //   Alternative
 //   Alternative | Disjunction
@@ -3839,24 +3871,60 @@ bool RegExpParser::CaptureAvailable(int index) {
 //   Atom
 //   Atom Quantifier
 RegExpTree* RegExpParser::ParseDisjunction() {
-  RegExpBuilder builder;
-  int capture_start_index = captures_started();
+  // Used to store current state while parsing subexpressions.
+  RegExpParserState initial_state(NULL, INITIAL, 0);
+  RegExpParserState* stored_state = &initial_state;
+  // Cache the builder in a local variable for quick access.
+  RegExpBuilder* builder = initial_state.builder();
   while (true) {
     switch (current()) {
     case kEndMarker:
-    case ')':
-      return builder.ToRegExp();
-    case '|': {
+      if (stored_state->IsSubexpression()) {
+        // Inside a parenthesized group when hitting end of input.
+        ReportError(CStrVector("Unterminated group") CHECK_FAILED);
+      }
+      ASSERT_EQ(INITIAL, stored_state->group_type());
+      // Parsing completed successfully.
+      return builder->ToRegExp();
+    case ')': {
+      if (!stored_state->IsSubexpression()) {
+        ReportError(CStrVector("Unexpected ')'") CHECK_FAILED);
+      }
+      ASSERT_NE(INITIAL, stored_state->group_type());
+
       Advance();
-      builder.NewAlternative();
-      int capture_new_alt_start_index = captures_started();
-      for (int i = capture_start_index; i < capture_new_alt_start_index; i++) {
-        RegExpCapture* capture = captures_->at(i);
-        if (capture->available() == CAPTURE_AVAILABLE) {
-          capture->set_available(CAPTURE_UNREACHABLE);
-        }
+      // End disjunction parsing and convert builder content to new single
+      // regexp atom.
+      RegExpTree* body = builder->ToRegExp();
+
+      int end_capture_index = captures_started();
+
+      int capture_index = stored_state->capture_index();
+      SubexpressionType type = stored_state->group_type();
+
+      // Restore previous state.
+      stored_state = stored_state->previous_state();
+      builder = stored_state->builder();
+
+      // Build result of subexpression.
+      if (type == CAPTURE) {
+        RegExpCapture* capture = new RegExpCapture(body, capture_index);
+        captures_->at(capture_index - 1) = capture;
+        body = capture;
+      } else if (type != GROUPING) {
+        ASSERT(type == POSITIVE_LOOKAHEAD || type == NEGATIVE_LOOKAHEAD);
+        bool is_positive = (type == POSITIVE_LOOKAHEAD);
+        body = new RegExpLookahead(body,
+                                   is_positive,
+                                   end_capture_index - capture_index,
+                                   capture_index);
       }
-      capture_start_index = capture_new_alt_start_index;
+      builder->AddAtom(body);
+      break;
+    }
+    case '|': {
+      Advance();
+      builder->NewAlternative();
       continue;
     }
     case '*':
@@ -3866,10 +3934,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
     case '^': {
       Advance();
       if (multiline_) {
-        builder.AddAssertion(
+        builder->AddAssertion(
             new RegExpAssertion(RegExpAssertion::START_OF_LINE));
       } else {
-        builder.AddAssertion(
+        builder->AddAssertion(
             new RegExpAssertion(RegExpAssertion::START_OF_INPUT));
         set_contains_anchor();
       }
@@ -3880,7 +3948,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       RegExpAssertion::Type type =
           multiline_ ? RegExpAssertion::END_OF_LINE :
                        RegExpAssertion::END_OF_INPUT;
-      builder.AddAssertion(new RegExpAssertion(type));
+      builder->AddAssertion(new RegExpAssertion(type));
       continue;
     }
     case '.': {
@@ -3889,17 +3957,47 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
       CharacterRange::AddClassEscape('.', ranges);
       RegExpTree* atom = new RegExpCharacterClass(ranges, false);
-      builder.AddAtom(atom);
+      builder->AddAtom(atom);
       break;
     }
     case '(': {
-      RegExpTree* atom = ParseGroup(CHECK_FAILED);
-      builder.AddAtom(atom);
+      SubexpressionType type = CAPTURE;
+      Advance();
+      if (current() == '?') {
+        switch (Next()) {
+          case ':':
+            type = GROUPING;
+            break;
+          case '=':
+            type = POSITIVE_LOOKAHEAD;
+            break;
+          case '!':
+            type = NEGATIVE_LOOKAHEAD;
+            break;
+          default:
+            ReportError(CStrVector("Invalid group") CHECK_FAILED);
+            break;
+        }
+        Advance(2);
+      } else {
+        if (captures_ == NULL) {
+          captures_ = new ZoneList<RegExpCapture*>(2);
+        }
+        if (captures_started() >= kMaxCaptures) {
+          ReportError(CStrVector("Too many captures") CHECK_FAILED);
+        }
+        captures_->Add(NULL);
+      }
+      // Store current state and begin new disjunction parsing.
+      stored_state = new RegExpParserState(stored_state,
+                                           type,
+                                           captures_started());
+      builder = stored_state->builder();
       break;
     }
     case '[': {
       RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
-      builder.AddAtom(atom);
+      builder->AddAtom(atom);
       break;
     }
     // Atom ::
@@ -3910,12 +4008,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
         ReportError(CStrVector("\\ at end of pattern") CHECK_FAILED);
       case 'b':
         Advance(2);
-        builder.AddAssertion(
+        builder->AddAssertion(
             new RegExpAssertion(RegExpAssertion::BOUNDARY));
         continue;
       case 'B':
         Advance(2);
-        builder.AddAssertion(
+        builder->AddAssertion(
             new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
         continue;
         // AtomEscape ::
@@ -3929,27 +4027,29 @@ RegExpTree* RegExpParser::ParseDisjunction() {
         ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
         CharacterRange::AddClassEscape(c, ranges);
         RegExpTree* atom = new RegExpCharacterClass(ranges, false);
-        builder.AddAtom(atom);
+        builder->AddAtom(atom);
         goto has_read_atom;  // Avoid setting has_character_escapes_.
       }
       case '1': case '2': case '3': case '4': case '5': case '6':
       case '7': case '8': case '9': {
         int index = 0;
         if (ParseBackReferenceIndex(&index)) {
-          if (!CaptureAvailable(index - 1)) {
-            // Prepare to ignore a following quantifier
-            builder.AddEmpty();
+          RegExpCapture* capture = NULL;
+          if (captures_ != NULL && index <= captures_->length()) {
+            capture = captures_->at(index - 1);
+          }
+          if (capture == NULL) {
+            builder->AddEmpty();
             goto has_read_atom;
           }
-          RegExpCapture* capture = captures_->at(index - 1);
           RegExpTree* atom = new RegExpBackReference(capture);
-          builder.AddAtom(atom);
+          builder->AddAtom(atom);
           goto has_read_atom;  // Avoid setting has_character_escapes_.
         }
         uc32 first_digit = Next();
         if (first_digit == '8' || first_digit == '9') {
           // Treat as identity escape
-          builder.AddCharacter(first_digit);
+          builder->AddCharacter(first_digit);
           Advance(2);
           break;
         }
@@ -3958,44 +4058,44 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       case '0': {
         Advance();
         uc32 octal = ParseOctalLiteral();
-        builder.AddCharacter(octal);
+        builder->AddCharacter(octal);
         break;
       }
       // ControlEscape :: one of
       //   f n r t v
       case 'f':
         Advance(2);
-        builder.AddCharacter('\f');
+        builder->AddCharacter('\f');
         break;
       case 'n':
         Advance(2);
-        builder.AddCharacter('\n');
+        builder->AddCharacter('\n');
         break;
       case 'r':
         Advance(2);
-        builder.AddCharacter('\r');
+        builder->AddCharacter('\r');
         break;
       case 't':
         Advance(2);
-        builder.AddCharacter('\t');
+        builder->AddCharacter('\t');
         break;
       case 'v':
         Advance(2);
-        builder.AddCharacter('\v');
+        builder->AddCharacter('\v');
         break;
       case 'c': {
         Advance(2);
         uc32 control = ParseControlLetterEscape();
-        builder.AddCharacter(control);
+        builder->AddCharacter(control);
         break;
       }
       case 'x': {
         Advance(2);
         uc32 value;
         if (ParseHexEscape(2, &value)) {
-          builder.AddCharacter(value);
+          builder->AddCharacter(value);
         } else {
-          builder.AddCharacter('x');
+          builder->AddCharacter('x');
         }
         break;
       }
@@ -4003,15 +4103,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {
         Advance(2);
         uc32 value;
         if (ParseHexEscape(4, &value)) {
-          builder.AddCharacter(value);
+          builder->AddCharacter(value);
         } else {
-          builder.AddCharacter('u');
+          builder->AddCharacter('u');
         }
         break;
       }
       default:
         // Identity escape.
-        builder.AddCharacter(Next());
+        builder->AddCharacter(Next());
         Advance(2);
         break;
       }
@@ -4024,7 +4124,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       // fallthrough
     }
     default:
-      builder.AddCharacter(current());
+      builder->AddCharacter(current());
       Advance();
       break;
     }  // end switch(current())
@@ -4071,7 +4171,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       is_greedy = false;
       Advance();
     }
-    builder.AddQuantifierToAtom(min, max, is_greedy);
+    builder->AddQuantifierToAtom(min, max, is_greedy);
   }
 }
 
@@ -4382,73 +4482,6 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
 }
 
 
-RegExpTree* RegExpParser::ParseGroup() {
-  ASSERT_EQ(current(), '(');
-  char type = '(';
-  Advance();
-  if (current() == '?') {
-    switch (Next()) {
-      case ':': case '=': case '!':
-        type = Next();
-        Advance(2);
-        break;
-      default:
-        ReportError(CStrVector("Invalid group") CHECK_FAILED);
-        break;
-    }
-  } else {
-    if (captures_ == NULL) {
-      captures_ = new ZoneList<RegExpCapture*>(2);
-    }
-    if (captures_started() >= kMaxCaptures) {
-      ReportError(CStrVector("Too many captures") CHECK_FAILED);
-    }
-    captures_->Add(NULL);
-  }
-  int capture_index = captures_started();
-  RegExpTree* body = ParseDisjunction(CHECK_FAILED);
-  if (current() != ')') {
-    ReportError(CStrVector("Unterminated group") CHECK_FAILED);
-  }
-  Advance();
-
-  int end_capture_index = captures_started();
-  if (type == '!') {
-    // Captures inside a negative lookahead are never available outside it.
-    for (int i = capture_index; i < end_capture_index; i++) {
-      RegExpCapture* capture = captures_->at(i);
-      ASSERT(capture != NULL);
-      capture->set_available(CAPTURE_PERMANENTLY_UNREACHABLE);
-    }
-  } else {
-    // Captures temporarily unavailable because they are in different
-    // alternatives are all available after the disjunction.
-    for (int i = capture_index; i < end_capture_index; i++) {
-      RegExpCapture* capture = captures_->at(i);
-      ASSERT(capture != NULL);
-      if (capture->available() == CAPTURE_UNREACHABLE) {
-        capture->set_available(CAPTURE_AVAILABLE);
-      }
-    }
-  }
-
-  if (type == '(') {
-    RegExpCapture* capture = new RegExpCapture(body, capture_index);
-    captures_->at(capture_index - 1) = capture;
-    return capture;
-  } else if (type == ':') {
-    return body;
-  } else {
-    ASSERT(type == '=' || type == '!');
-    bool is_positive = (type == '=');
-    return new RegExpLookahead(body,
-                               is_positive,
-                               end_capture_index - capture_index,
-                               capture_index);
-  }
-}
-
-
 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
   ASSERT_EQ(0, *char_class);
   uc32 first = current();
diff --git a/test/cctest/test-regexp.cc b/test/cctest/test-regexp.cc
index 8761cf5..62597fb 100644
--- a/test/cctest/test-regexp.cc
+++ b/test/cctest/test-regexp.cc
@@ -204,8 +204,8 @@ TEST(Parser) {
   CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')");
   CHECK_PARSE_EQ("(?!a)?a", "'a'");
   CHECK_PARSE_EQ("\\1(a)", "(^ 'a')");
-  CHECK_PARSE_EQ("(?!(a))\\1", "(-> - (^ 'a'))");
-  CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(-> - (: (^ 'a') (<- 1)))");
+  CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))");
+  CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))");
   CHECK_PARSE_EQ("[\\0]", "[\\x00]");
   CHECK_PARSE_EQ("[\\11]", "[\\x09]");
   CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]");
-- 
2.7.4