Refix issue 1472. The previous fix worked for the example in the bug

author erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Fri, 17 Jun 2011 08:01:12 +0000 (08:01 +0000)

committer erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Fri, 17 Jun 2011 08:01:12 +0000 (08:01 +0000)
author erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Fri, 17 Jun 2011 08:01:12 +0000 (08:01 +0000)
committer erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Fri, 17 Jun 2011 08:01:12 +0000 (08:01 +0000)
diff --git a/src/ast.cc b/src/ast.cc

index 9f83d16..03df229 100644 (file)
--- a/src/ast.cc
+++ b/src/ast.cc
@@ -925,26 +925,6 @@ bool RegExpCapture::IsAnchoredAtEnd() {
  }
  
  
-bool RegExpDisjunction::ContainsExpandedQuantifier() {
-  if (contains_expanded_quantifier_) return true;
-  int len = alternatives_->length();
-  for (int i = 0; i < len; i++) {
-    if (alternatives_->at(i)->ContainsExpandedQuantifier()) return true;
-  }
-  return false;
-}
-
-
-bool RegExpAlternative::ContainsExpandedQuantifier() {
-  if (contains_expanded_quantifier_) return true;
-  int len = nodes_->length();
-  for (int i = 0; i < len; i++) {
-    if (nodes_->at(i)->ContainsExpandedQuantifier()) return true;
-  }
-  return false;
-}
-
-
  // Convert regular expression trees to a simple sexp representation.
  // This representation should be different from the input grammar
  // in as many cases as possible, to make it more difficult for incorrect
diff --git a/src/ast.h b/src/ast.h

index 5823f92..0101249 100644 (file)
--- a/src/ast.h
+++ b/src/ast.h
@@ -1751,7 +1751,6 @@ class RegExpVisitor BASE_EMBEDDED {
  class RegExpTree: public ZoneObject {
   public:
    static const int kInfinity = kMaxInt;
-  RegExpTree() : contains_expanded_quantifier_(false) { }
    virtual ~RegExpTree() { }
    virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
    virtual RegExpNode* ToNode(RegExpCompiler* compiler,
@@ -1761,12 +1760,6 @@ class RegExpTree: public ZoneObject {
    virtual bool IsAnchoredAtEnd() { return false; }
    virtual int min_match() = 0;
    virtual int max_match() = 0;
-  virtual bool ContainsExpandedQuantifier() {
-    return contains_expanded_quantifier_;
-  }
-  void set_contains_expanded_quantifier(bool value) {
-    contains_expanded_quantifier_ = value;
-  }
    // Returns the interval of registers used for captures within this
    // expression.
    virtual Interval CaptureRegisters() { return Interval::Empty(); }
@@ -1777,9 +1770,6 @@ class RegExpTree: public ZoneObject {
    virtual bool Is##Name();
    FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
  #undef MAKE_ASTYPE
-
- protected:
-  bool contains_expanded_quantifier_;
  };
  
  
@@ -1796,7 +1786,6 @@ class RegExpDisjunction: public RegExpTree {
    virtual bool IsAnchoredAtEnd();
    virtual int min_match() { return min_match_; }
    virtual int max_match() { return max_match_; }
-  virtual bool ContainsExpandedQuantifier();
    ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
   private:
    ZoneList<RegExpTree*>* alternatives_;
@@ -1818,7 +1807,6 @@ class RegExpAlternative: public RegExpTree {
    virtual bool IsAnchoredAtEnd();
    virtual int min_match() { return min_match_; }
    virtual int max_match() { return max_match_; }
-  virtual bool ContainsExpandedQuantifier();
    ZoneList<RegExpTree*>* nodes() { return nodes_; }
   private:
    ZoneList<RegExpTree*>* nodes_;
@@ -1968,8 +1956,7 @@ class RegExpQuantifier: public RegExpTree {
          min_(min),
          max_(max),
          min_match_(min * body->min_match()),
-        type_(type),
-        contains_expanded_quantifier_(false) {
+        type_(type) {
      if (max > 0 && body->max_match() > kInfinity / max) {
        max_match_ = kInfinity;
      } else {
@@ -1991,9 +1978,6 @@ class RegExpQuantifier: public RegExpTree {
    virtual bool IsQuantifier();
    virtual int min_match() { return min_match_; }
    virtual int max_match() { return max_match_; }
-  virtual bool ContainsExpandedQuantifier() {
-    return contains_expanded_quantifier_ || body_->ContainsExpandedQuantifier();
-  }
    int min() { return min_; }
    int max() { return max_; }
    bool is_possessive() { return type_ == POSSESSIVE; }
@@ -2008,7 +1992,6 @@ class RegExpQuantifier: public RegExpTree {
    int min_match_;
    int max_match_;
    Type type_;
-  bool contains_expanded_quantifier_;
  };
  
  
@@ -2030,9 +2013,6 @@ class RegExpCapture: public RegExpTree {
    virtual bool IsCapture();
    virtual int min_match() { return body_->min_match(); }
    virtual int max_match() { return body_->max_match(); }
-  virtual bool ContainsExpandedQuantifier() {
-    return contains_expanded_quantifier_ || body_->ContainsExpandedQuantifier();
-  }
    RegExpTree* body() { return body_; }
    int index() { return index_; }
    static int StartRegister(int index) { return index * 2; }
@@ -2064,9 +2044,6 @@ class RegExpLookahead: public RegExpTree {
    virtual bool IsAnchoredAtStart();
    virtual int min_match() { return 0; }
    virtual int max_match() { return 0; }
-  virtual bool ContainsExpandedQuantifier() {
-    return contains_expanded_quantifier_ || body_->ContainsExpandedQuantifier();
-  }
    RegExpTree* body() { return body_; }
    bool is_positive() { return is_positive_; }
    int capture_count() { return capture_count_; }
diff --git a/src/jsregexp.cc b/src/jsregexp.cc

index 1ce6791..73dbdb0 100644 (file)
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -810,6 +810,11 @@ class RegExpCompiler {
    inline bool ignore_case() { return ignore_case_; }
    inline bool ascii() { return ascii_; }
  
+  int current_expansion_factor() { return current_expansion_factor_; }
+  void set_current_expansion_factor(int value) {
+    current_expansion_factor_ = value;
+  }
+
    static const int kNoRegister = -1;
  
   private:
@@ -821,6 +826,7 @@ class RegExpCompiler {
    bool ignore_case_;
    bool ascii_;
    bool reg_exp_too_big_;
+  int current_expansion_factor_;
  };
  
  
@@ -848,7 +854,8 @@ RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
        recursion_depth_(0),
        ignore_case_(ignore_case),
        ascii_(ascii),
-      reg_exp_too_big_(false) {
+      reg_exp_too_big_(false),
+      current_expansion_factor_(1) {
    accept_ = new EndNode(EndNode::ACCEPT);
    ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
  }
@@ -3730,6 +3737,44 @@ RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
  }
  
  
+// Scoped object to keep track of how much we unroll quantifier loops in the
+// regexp graph generator.
+class RegExpExpansionLimiter {
+ public:
+  static const int kMaxExpansionFactor = 6;
+  RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
+      : compiler_(compiler),
+        saved_expansion_factor_(compiler->current_expansion_factor()),
+        ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
+    ASSERT(factor > 0);
+    if (ok_to_expand_) {
+      if (factor > kMaxExpansionFactor) {
+        // Avoid integer overflow of the current expansion factor.
+        ok_to_expand_ = false;
+        compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
+      } else {
+        int new_factor = saved_expansion_factor_ * factor;
+        ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
+        compiler->set_current_expansion_factor(new_factor);
+      }
+    }
+  }
+
+  ~RegExpExpansionLimiter() {
+    compiler_->set_current_expansion_factor(saved_expansion_factor_);
+  }
+
+  bool ok_to_expand() { return ok_to_expand_; }
+
+ private:
+  RegExpCompiler* compiler_;
+  int saved_expansion_factor_;
+  bool ok_to_expand_;
+
+  DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
+};
+
+
  RegExpNode* RegExpQuantifier::ToNode(int min,
                                       int max,
                                       bool is_greedy,
@@ -3766,45 +3811,49 @@ RegExpNode* RegExpQuantifier::ToNode(int min,
    bool needs_capture_clearing = !capture_registers.is_empty();
    if (body_can_be_empty) {
      body_start_reg = compiler->AllocateRegister();
-  } else if (FLAG_regexp_optimization &&
-             !body->ContainsExpandedQuantifier() &&
-             !needs_capture_clearing) {
+  } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
      // Only unroll if there are no captures and the body can't be
      // empty.
-    if (min > 0 && min <= kMaxUnrolledMinMatches) {
-      int new_max = (max == kInfinity) ? max : max - min;
-      // Recurse once to get the loop or optional matches after the fixed ones.
-      RegExpNode* answer = ToNode(
-          0, new_max, is_greedy, body, compiler, on_success, true);
-      // Unroll the forced matches from 0 to min.  This can cause chains of
-      // TextNodes (which the parser does not generate).  These should be
-      // combined if it turns out they hinder good code generation.
-      for (int i = 0; i < min; i++) {
-        answer = body->ToNode(compiler, answer);
+    {
+      RegExpExpansionLimiter limiter(
+          compiler, min + ((max != min) ? 1 : 0));
+      if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
+        int new_max = (max == kInfinity) ? max : max - min;
+        // Recurse once to get the loop or optional matches after the fixed
+        // ones.
+        RegExpNode* answer = ToNode(
+            0, new_max, is_greedy, body, compiler, on_success, true);
+        // Unroll the forced matches from 0 to min.  This can cause chains of
+        // TextNodes (which the parser does not generate).  These should be
+        // combined if it turns out they hinder good code generation.
+        for (int i = 0; i < min; i++) {
+          answer = body->ToNode(compiler, answer);
+        }
+        return answer;
        }
-      if (min > 1) body->set_contains_expanded_quantifier(true);
-      return answer;
      }
-    if (max <= kMaxUnrolledMaxMatches) {
-      ASSERT(min == 0);
-      // Unroll the optional matches up to max.
-      RegExpNode* answer = on_success;
-      for (int i = 0; i < max; i++) {
-        ChoiceNode* alternation = new ChoiceNode(2);
-        if (is_greedy) {
-          alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
-                                                                      answer)));
-          alternation->AddAlternative(GuardedAlternative(on_success));
-        } else {
-          alternation->AddAlternative(GuardedAlternative(on_success));
-          alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
-                                                                      answer)));
+    if (max <= kMaxUnrolledMaxMatches && min == 0) {
+      ASSERT(max > 0);  // Due to the 'if' above.
+      RegExpExpansionLimiter limiter(compiler, max);
+      if (limiter.ok_to_expand()) {
+        // Unroll the optional matches up to max.
+        RegExpNode* answer = on_success;
+        for (int i = 0; i < max; i++) {
+          ChoiceNode* alternation = new ChoiceNode(2);
+          if (is_greedy) {
+            alternation->AddAlternative(
+                GuardedAlternative(body->ToNode(compiler, answer)));
+            alternation->AddAlternative(GuardedAlternative(on_success));
+          } else {
+            alternation->AddAlternative(GuardedAlternative(on_success));
+            alternation->AddAlternative(
+                GuardedAlternative(body->ToNode(compiler, answer)));
+          }
+          answer = alternation;
+          if (not_at_start) alternation->set_not_at_start();
          }
-        answer = alternation;
-        if (not_at_start) alternation->set_not_at_start();
+        return answer;
        }
-      if (max > 1) body->set_contains_expanded_quantifier(true);
-      return answer;
      }
    }
    bool has_min = min > 0;
@@ -4130,12 +4179,6 @@ void CharacterRange::Split(ZoneList<CharacterRange>* base,
  }
  
  
-static void AddUncanonicals(Isolate* isolate,
-                            ZoneList<CharacterRange>* ranges,
-                            int bottom,
-                            int top);
-
-
  void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
                                          bool is_ascii) {
    Isolate* isolate = Isolate::Current();
@@ -4296,101 +4339,6 @@ SetRelation CharacterRange::WordCharacterRelation(
  }
  
  
-static void AddUncanonicals(Isolate* isolate,
-                            ZoneList<CharacterRange>* ranges,
-                            int bottom,
-                            int top) {
-  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
-  // Zones with no case mappings.  There is a DEBUG-mode loop to assert that
-  // this table is correct.
-  // 0x0600 - 0x0fff
-  // 0x1100 - 0x1cff
-  // 0x2000 - 0x20ff
-  // 0x2200 - 0x23ff
-  // 0x2500 - 0x2bff
-  // 0x2e00 - 0xa5ff
-  // 0xa800 - 0xfaff
-  // 0xfc00 - 0xfeff
-  const int boundary_count = 18;
-  int boundaries[] = {
-      0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,
-      0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
-
-  // Special ASCII rule from spec can save us some work here.
-  if (bottom == 0x80 && top == 0xffff) return;
-
-  if (top <= boundaries[0]) {
-    CharacterRange range(bottom, top);
-    range.AddCaseEquivalents(ranges, false);
-    return;
-  }
-
-  // Split up very large ranges.  This helps remove ranges where there are no
-  // case mappings.
-  for (int i = 0; i < boundary_count; i++) {
-    if (bottom < boundaries[i] && top >= boundaries[i]) {
-      AddUncanonicals(isolate, ranges, bottom, boundaries[i] - 1);
-      AddUncanonicals(isolate, ranges, boundaries[i], top);
-      return;
-    }
-  }
-
-  // If we are completely in a zone with no case mappings then we are done.
-  for (int i = 0; i < boundary_count; i += 2) {
-    if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
-#ifdef DEBUG
-      for (int j = bottom; j <= top; j++) {
-        unsigned current_char = j;
-        int length = isolate->jsregexp_uncanonicalize()->get(current_char,
-                                                             '\0', chars);
-        for (int k = 0; k < length; k++) {
-          ASSERT(chars[k] == current_char);
-        }
-      }
-#endif
-      return;
-    }
-  }
-
-  // Step through the range finding equivalent characters.
-  ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
-  for (int i = bottom; i <= top; i++) {
-    int length = isolate->jsregexp_uncanonicalize()->get(i, '\0', chars);
-    for (int j = 0; j < length; j++) {
-      uc32 chr = chars[j];
-      if (chr != i && (chr < bottom || chr > top)) {
-        characters->Add(chr);
-      }
-    }
-  }
-
-  // Step through the equivalent characters finding simple ranges and
-  // adding ranges to the character class.
-  if (characters->length() > 0) {
-    int new_from = characters->at(0);
-    int new_to = new_from;
-    for (int i = 1; i < characters->length(); i++) {
-      int chr = characters->at(i);
-      if (chr == new_to + 1) {
-        new_to++;
-      } else {
-        if (new_to == new_from) {
-          ranges->Add(CharacterRange::Singleton(new_from));
-        } else {
-          ranges->Add(CharacterRange(new_from, new_to));
-        }
-        new_from = new_to = chr;
-      }
-    }
-    if (new_to == new_from) {
-      ranges->Add(CharacterRange::Singleton(new_from));
-    } else {
-      ranges->Add(CharacterRange(new_from, new_to));
-    }
-  }
-}
-
-
  ZoneList<CharacterRange>* CharacterSet::ranges() {
    if (ranges_ == NULL) {
      ranges_ = new ZoneList<CharacterRange>(2);
diff --git a/test/mjsunit/regress/regress-1472.js b/test/mjsunit/regress/regress-1472.js

new file mode 100644 (file)

index 0000000..b2a30d2
--- /dev/null
+++ b/test/mjsunit/regress/regress-1472.js
@@ -0,0 +1,40 @@
+// Copyright 2011 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Avoid excessive expansions of regexp repetitions inside regexp repetitions.
+// Some of these caused stack overflows, others cause out-of-memory.
+var r1 = /(?:a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:m(?:n(?:o(?:p(?:q(?:r(?:s(?:t(?:u(?:v(?:w(?:x(?:y(?:z(?:FooBar)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)+)/;
+"xxx".match(r1);
+
+var r2 = /(?:a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:FooBar){0,2}){0,2}){0,2}){0,2}){0,2}){0,2}){0,2}){0,2}){0,2}){0,2}){0,2}){0,2}){0,2}/;
+"xxx".match(r2);
+
+var r3 = /(?:a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:FooBar){2}){2}){2}){2}){2}){2}){2}){2}){2}){2}){2}){2}){2}/;
+"xxx".match(r3);
+
+var r4 = /(?:a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:FooBar){3,6}){3,6}){3,6}){3,6}){3,6}){3,6}){3,6}){3,6}){3,6}){3,6}/;
+"xxx".match(r4);
author	erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Fri, 17 Jun 2011 08:01:12 +0000 (08:01 +0000)
committer	erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Fri, 17 Jun 2011 08:01:12 +0000 (08:01 +0000)
src/ast.cc		patch \| blob \| history
src/ast.h		patch \| blob \| history
src/jsregexp.cc		patch \| blob \| history
test/mjsunit/regress/regress-1472.js	[new file with mode: 0644]	patch \| blob