Fixed some assertion propagation issues and added non-multiline $
authorchristian.plesner.hansen@gmail.com <christian.plesner.hansen@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Thu, 27 Nov 2008 10:35:06 +0000 (10:35 +0000)
committerchristian.plesner.hansen@gmail.com <christian.plesner.hansen@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Thu, 27 Nov 2008 10:35:06 +0000 (10:35 +0000)
propagation.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@856 ce2b1a6d-e550-0410-aec6-3dcde31c8c00

src/jsregexp.cc
src/jsregexp.h
src/parser.cc
src/parser.h
test/cctest/test-regexp.cc

index 7bb9482..1707f46 100644 (file)
@@ -218,7 +218,7 @@ Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
     FlattenString(pattern);
     RegExpParseResult parse_result;
     FlatStringReader reader(pattern);
-    if (!ParseRegExp(&reader, &parse_result)) {
+    if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
       // Throw an exception if we fail to parse the pattern.
       ThrowRegExpException(re,
                            pattern,
@@ -241,7 +241,8 @@ Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
       Handle<FixedArray> irregexp_data =
           RegExpEngine::Compile(&parse_result,
                                 &node,
-                                flags.is_ignore_case());
+                                flags.is_ignore_case(),
+                                flags.is_multiline());
       if (irregexp_data.is_null()) {
         if (FLAG_disable_jscre) {
           UNIMPLEMENTED();
@@ -980,7 +981,8 @@ bool RegExpNode::GoTo(RegExpCompiler* compiler) {
   // TODO(erikcorry): Implement support.
   if (info_.follows_word_interest ||
       info_.follows_newline_interest ||
-      info_.follows_start_interest) {
+      info_.follows_start_interest ||
+      info_.at_end) {
     return false;
   }
   if (label_.is_bound()) {
@@ -1004,7 +1006,8 @@ bool RegExpNode::GoTo(RegExpCompiler* compiler) {
 bool EndNode::GoTo(RegExpCompiler* compiler) {
   if (info()->follows_word_interest ||
       info()->follows_newline_interest ||
-      info()->follows_start_interest) {
+      info()->follows_start_interest ||
+      info()->at_end) {
     return false;
   }
   if (!label()->is_bound()) {
@@ -1629,14 +1632,21 @@ void DotPrinter::PrintAttributes(RegExpNode* that) {
                 "fontcolor=lightgrey, margin=0.1, fontsize=10, label=\"{",
                 that);
   NodeInfo* info = that->info();
-  stream()->Add("{NI|%i}|{SI|%i}|{WI|%i}",
+  stream()->Add("{NI|%i}|{WI|%i}|{SI|%i}",
                 info->follows_newline_interest,
-                info->follows_start_interest,
-                info->follows_word_interest);
-  stream()->Add("|{DN|%i}|{DS|%i}|{DW|%i}",
+                info->follows_word_interest,
+                info->follows_start_interest);
+  stream()->Add("|{DN|%i}|{DW|%i}|{DS|%i}|{AE|%i}",
                 info->determine_newline,
+                info->determine_word,
                 info->determine_start,
-                info->determine_word);
+                info->at_end);
+  if (info->follows_newline != NodeInfo::UNKNOWN)
+    stream()->Add("|{FN|%i}", info->follows_newline);
+  if (info->follows_word != NodeInfo::UNKNOWN)
+    stream()->Add("|{FW|%i}", info->follows_word);
+  if (info->follows_start != NodeInfo::UNKNOWN)
+    stream()->Add("|{FS|%i}", info->follows_start);
   Label* label = that->label();
   if (label->is_bound())
     stream()->Add("|{@|%x}", label->pos());
@@ -1924,11 +1934,14 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
     case BOUNDARY: case NON_BOUNDARY:
       info.follows_word_interest = true;
       break;
-    case END_OF_LINE: case END_OF_INPUT:
+    case END_OF_INPUT:
+      info.at_end = true;
+      break;
+    case END_OF_LINE:
       // This is wrong but has the effect of making the compiler abort.
-      info.follows_start_interest = true;
+      info.at_end = true;
   }
-  return on_success->PropagateInterest(&info);
+  return on_success->PropagateForward(&info);
 }
 
 
@@ -2216,7 +2229,7 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
 RegExpNode* RegExpNode::GetSibling(NodeInfo* info) {
   for (int i = 0; i < siblings_.length(); i++) {
     RegExpNode* sibling = siblings_.Get(i);
-    if (sibling->info()->SameInterests(info))
+    if (sibling->info()->HasSameForwardInterests(info))
       return sibling;
   }
   return NULL;
@@ -2225,58 +2238,64 @@ RegExpNode* RegExpNode::GetSibling(NodeInfo* info) {
 
 template <class C>
 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
-  RegExpNode* sibling = node->GetSibling(info);
+  NodeInfo full_info(*node->info());
+  full_info.AddFromPreceding(info);
+  RegExpNode* sibling = node->GetSibling(&full_info);
   if (sibling != NULL) return sibling;
   node->EnsureSiblings();
   sibling = new C(*node);
-  sibling->info()->AdoptInterests(info);
+  sibling->info()->AddFromPreceding(&full_info);
   node->AddSibling(sibling);
   return sibling;
 }
 
 
-RegExpNode* ActionNode::PropagateInterest(NodeInfo* info) {
-  RegExpNode* sibling = GetSibling(info);
+RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {
+  NodeInfo full_info(*this->info());
+  full_info.AddFromPreceding(info);
+  RegExpNode* sibling = GetSibling(&full_info);
   if (sibling != NULL) return sibling;
   EnsureSiblings();
   ActionNode* action = new ActionNode(*this);
-  action->info()->AdoptInterests(info);
+  action->info()->AddFromPreceding(&full_info);
   AddSibling(action);
-  action->set_on_success(action->on_success()->PropagateInterest(info));
+  action->set_on_success(action->on_success()->PropagateForward(info));
   return action;
 }
 
 
-RegExpNode* ChoiceNode::PropagateInterest(NodeInfo* info) {
-  RegExpNode* sibling = GetSibling(info);
+RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) {
+  NodeInfo full_info(*this->info());
+  full_info.AddFromPreceding(info);
+  RegExpNode* sibling = GetSibling(&full_info);
   if (sibling != NULL) return sibling;
   EnsureSiblings();
   ChoiceNode* choice = new ChoiceNode(*this);
-  choice->info()->AdoptInterests(info);
+  choice->info()->AddFromPreceding(&full_info);
   AddSibling(choice);
   ZoneList<GuardedAlternative>* old_alternatives = alternatives();
   int count = old_alternatives->length();
   choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
   for (int i = 0; i < count; i++) {
     GuardedAlternative alternative = old_alternatives->at(i);
-    alternative.set_node(alternative.node()->PropagateInterest(info));
+    alternative.set_node(alternative.node()->PropagateForward(info));
     choice->alternatives()->Add(alternative);
   }
   return choice;
 }
 
 
-RegExpNode* EndNode::PropagateInterest(NodeInfo* info) {
+RegExpNode* EndNode::PropagateForward(NodeInfo* info) {
   return PropagateToEndpoint(this, info);
 }
 
 
-RegExpNode* BackReferenceNode::PropagateInterest(NodeInfo* info) {
+RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) {
   return PropagateToEndpoint(this, info);
 }
 
 
-RegExpNode* TextNode::PropagateInterest(NodeInfo* info) {
+RegExpNode* TextNode::PropagateForward(NodeInfo* info) {
   return PropagateToEndpoint(this, info);
 }
 
@@ -2455,15 +2474,21 @@ void Analysis::VisitText(TextNode* that) {
   }
   EnsureAnalyzed(that->on_success());
   EnsureAnalyzed(that->on_failure());
+  NodeInfo* info = that->info();
+  NodeInfo* next_info = that->on_success()->info();
+  // If the following node is interested in what it follows then this
+  // node must determine it.
+  info->determine_newline = next_info->follows_newline_interest;
+  info->determine_word = next_info->follows_word_interest;
+  info->determine_start = next_info->follows_start_interest;
 }
 
 
 void Analysis::VisitAction(ActionNode* that) {
-  RegExpNode* next = that->on_success();
-  EnsureAnalyzed(next);
-  that->info()->determine_newline = next->info()->prev_determine_newline();
-  that->info()->determine_word = next->info()->prev_determine_word();
-  that->info()->determine_start = next->info()->prev_determine_start();
+  EnsureAnalyzed(that->on_success());
+  // If the next node is interested in what it follows then this node
+  // has to be interested too so it can pass the information on.
+  that->info()->AddFromFollowing(that->on_success()->info());
 }
 
 
@@ -2472,9 +2497,9 @@ void Analysis::VisitChoice(ChoiceNode* that) {
   for (int i = 0; i < that->alternatives()->length(); i++) {
     RegExpNode* node = that->alternatives()->at(i).node();
     EnsureAnalyzed(node);
-    info->determine_newline |= node->info()->prev_determine_newline();
-    info->determine_word |= node->info()->prev_determine_word();
-    info->determine_start |= node->info()->prev_determine_start();
+    // Anything the following nodes need to know has to be known by
+    // this node also, so it can pass it on.
+    info->AddFromFollowing(node->info());
   }
   if (!that->table_calculated()) {
     DispatchTableConstructor cons(that->table());
@@ -2607,7 +2632,8 @@ void DispatchTableConstructor::VisitAction(ActionNode* that) {
 
 Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
                                          RegExpNode** node_return,
-                                         bool ignore_case) {
+                                         bool ignore_case,
+                                         bool is_multiline) {
   RegExpCompiler compiler(input->capture_count, ignore_case);
   // Wrap the body of the regexp in capture #0.
   RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,
index b611a0e..2bea0de 100644 (file)
@@ -437,6 +437,10 @@ class TextElement {
 
 
 struct NodeInfo {
+  enum PrecedingInfo {
+    UNKNOWN = -1, FALSE = 0, TRUE = 1
+  };
+
   NodeInfo()
       : being_analyzed(false),
         been_analyzed(false),
@@ -445,34 +449,59 @@ struct NodeInfo {
         determine_start(false),
         follows_word_interest(false),
         follows_newline_interest(false),
-        follows_start_interest(false) { }
-  bool SameInterests(NodeInfo* that) {
-    return (follows_word_interest == that->follows_word_interest)
+        follows_start_interest(false),
+        at_end(false),
+        follows_word(UNKNOWN),
+        follows_newline(UNKNOWN),
+        follows_start(UNKNOWN) { }
+
+  bool HasSameForwardInterests(NodeInfo* that) {
+    return (at_end == that->at_end)
+        && (follows_word_interest == that->follows_word_interest)
         && (follows_newline_interest == that->follows_newline_interest)
         && (follows_start_interest == that->follows_start_interest);
   }
-  void AdoptInterests(NodeInfo* that) {
-    follows_word_interest = that->follows_word_interest;
-    follows_newline_interest = that->follows_newline_interest;
-    follows_start_interest = that->follows_start_interest;
-  }
-  bool prev_determine_word() {
-    return determine_word || follows_word_interest;
-  }
-  bool prev_determine_newline() {
-    return determine_newline || follows_newline_interest;
+
+  // Updates the interests of this node given the interests of the
+  // node preceding it.
+  void AddFromPreceding(NodeInfo* that) {
+    at_end |= that->at_end;
+    follows_word_interest |= that->follows_word_interest;
+    follows_newline_interest |= that->follows_newline_interest;
+    follows_start_interest |= that->follows_start_interest;
   }
-  bool prev_determine_start() {
-    return determine_start || follows_start_interest;
+
+  // Sets the interests of this node to include the interests of the
+  // following node.
+  void AddFromFollowing(NodeInfo* that) {
+    follows_word_interest |= that->follows_word_interest;
+    follows_newline_interest |= that->follows_newline_interest;
+    follows_start_interest |= that->follows_start_interest;
   }
+
   bool being_analyzed: 1;
   bool been_analyzed: 1;
+
+  // These bits are set if this node must propagate forward information
+  // about the last character it consumed (or, in the case of 'start',
+  // if it is at the start of the input).
   bool determine_word: 1;
   bool determine_newline: 1;
   bool determine_start: 1;
+
+  // These bits are set of this node has to know what the preceding
+  // character was.
   bool follows_word_interest: 1;
   bool follows_newline_interest: 1;
   bool follows_start_interest: 1;
+
+  bool at_end: 1;
+
+  // These bits are set if the node can make assumptions about what
+  // the previous character was.
+  PrecedingInfo follows_word: 2;
+  PrecedingInfo follows_newline: 2;
+  PrecedingInfo follows_start: 2;
 };
 
 
@@ -511,7 +540,13 @@ class RegExpNode: public ZoneObject {
   // Until the implementation is complete we will return true for success and
   // false for failure.
   virtual bool Emit(RegExpCompiler* compiler) = 0;
-  virtual RegExpNode* PropagateInterest(NodeInfo* info) = 0;
+
+  // Propagates the given interest information forward.  When seeing
+  // \bfoo for instance, the \b is implemented by propagating forward
+  // to the 'foo' string that it should only succeed if its first
+  // character is a letter xor the previous character was a letter.
+  virtual RegExpNode* PropagateForward(NodeInfo* info) = 0;
+
   NodeInfo* info() { return &info_; }
   virtual bool IsBacktrack() { return false; }
   RegExpNode* GetSibling(NodeInfo* info);
@@ -558,7 +593,7 @@ class ActionNode: public SeqRegExpNode {
   static ActionNode* EscapeSubmatch(int reg, RegExpNode* on_success);
   virtual void Accept(NodeVisitor* visitor);
   virtual bool Emit(RegExpCompiler* compiler);
-  virtual RegExpNode* PropagateInterest(NodeInfo* info);
+  virtual RegExpNode* PropagateForward(NodeInfo* info);
  private:
   union {
     struct {
@@ -592,7 +627,7 @@ class TextNode: public SeqRegExpNode {
         on_failure_(on_failure),
         elms_(elms) { }
   virtual void Accept(NodeVisitor* visitor);
-  virtual RegExpNode* PropagateInterest(NodeInfo* info);
+  virtual RegExpNode* PropagateForward(NodeInfo* info);
   RegExpNode* on_failure() { return on_failure_; }
   virtual bool Emit(RegExpCompiler* compiler);
   ZoneList<TextElement>* elements() { return elms_; }
@@ -618,7 +653,7 @@ class BackReferenceNode: public SeqRegExpNode {
   int start_register() { return start_reg_; }
   int end_register() { return end_reg_; }
   virtual bool Emit(RegExpCompiler* compiler);
-  virtual RegExpNode* PropagateInterest(NodeInfo* info);
+  virtual RegExpNode* PropagateForward(NodeInfo* info);
  private:
   RegExpNode* on_failure_;
   int start_reg_;
@@ -632,7 +667,7 @@ class EndNode: public RegExpNode {
   explicit EndNode(Action action) : action_(action) { }
   virtual void Accept(NodeVisitor* visitor);
   virtual bool Emit(RegExpCompiler* compiler);
-  virtual RegExpNode* PropagateInterest(NodeInfo* info);
+  virtual RegExpNode* PropagateForward(NodeInfo* info);
   virtual bool IsBacktrack() { return action_ == BACKTRACK; }
   virtual bool GoTo(RegExpCompiler* compiler);
  private:
@@ -683,7 +718,7 @@ class ChoiceNode: public RegExpNode {
   DispatchTable* table() { return &table_; }
   RegExpNode* on_failure() { return on_failure_; }
   virtual bool Emit(RegExpCompiler* compiler);
-  virtual RegExpNode* PropagateInterest(NodeInfo* info);
+  virtual RegExpNode* PropagateForward(NodeInfo* info);
   bool table_calculated() { return table_calculated_; }
   void set_table_calculated(bool b) { table_calculated_ = b; }
   bool being_calculated() { return being_calculated_; }
@@ -770,7 +805,8 @@ class RegExpEngine: public AllStatic {
  public:
   static Handle<FixedArray> Compile(RegExpParseResult* input,
                                     RegExpNode** node_return,
-                                    bool ignore_case);
+                                    bool ignore_case,
+                                    bool multiline);
   static void DotPrint(const char* label, RegExpNode* node);
 };
 
index a1a4a6b..6c6712d 100644 (file)
@@ -545,7 +545,7 @@ class RegExpParser {
   bool CaptureAvailable(int index);
   uc32 current_;
   bool has_more_;
-  bool multiline_mode_;
+  bool multiline_;
   int next_pos_;
   FlatStringReader* in_;
   Handle<String>* error_;
@@ -3499,10 +3499,10 @@ Expression* Parser::NewThrowError(Handle<String> constructor,
 
 RegExpParser::RegExpParser(FlatStringReader* in,
                            Handle<String>* error,
-                           bool multiline_mode)
+                           bool multiline)
   : current_(kEndMarker),
     has_more_(true),
-    multiline_mode_(multiline_mode),
+    multiline_(multiline),
     next_pos_(0),
     in_(in),
     error_(error),
@@ -3617,16 +3617,16 @@ RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {
     case '^': {
       Advance();
       RegExpAssertion::Type type =
-          multiline_mode_ ? RegExpAssertion::START_OF_LINE :
-                            RegExpAssertion::START_OF_INPUT;
+          multiline_ ? RegExpAssertion::START_OF_LINE :
+                       RegExpAssertion::START_OF_INPUT;
       builder.AddAssertion(new RegExpAssertion(type));
       continue;
     }
     case '$': {
       Advance();
       RegExpAssertion::Type type =
-          multiline_mode_ ? RegExpAssertion::END_OF_LINE :
-                            RegExpAssertion::END_OF_INPUT;
+          multiline_ ? RegExpAssertion::END_OF_LINE :
+                       RegExpAssertion::END_OF_INPUT;
       builder.AddAssertion(new RegExpAssertion(type));
       continue;
     }
@@ -4294,10 +4294,11 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,
 }
 
 
-bool ParseRegExp(FlatStringReader* input, RegExpParseResult* result) {
+bool ParseRegExp(FlatStringReader* input,
+                 bool multiline,
+                 RegExpParseResult* result) {
   ASSERT(result != NULL);
-  // TODO(plesner): Get multiline flag somehow
-  RegExpParser parser(input, &result->error, false);
+  RegExpParser parser(input, &result->error, multiline);
   bool ok = true;
   result->tree = parser.ParsePattern(&ok);
   if (!ok) {
index b6de9e5..0ebef74 100644 (file)
@@ -145,7 +145,9 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,
                          v8::Extension* extension);
 
 
-bool ParseRegExp(FlatStringReader* input, RegExpParseResult* result);
+bool ParseRegExp(FlatStringReader* input,
+                 bool multiline,
+                 RegExpParseResult* result);
 
 
 // Support for doing lazy compilation. The script is the script containing full
index 6237c61..0e85249 100644 (file)
@@ -56,7 +56,7 @@ static SmartPointer<const char> Parse(const char* input) {
   ZoneScope zone_scope(DELETE_ON_EXIT);
   FlatStringReader reader(CStrVector(input));
   RegExpParseResult result;
-  CHECK(v8::internal::ParseRegExp(&reader, &result));
+  CHECK(v8::internal::ParseRegExp(&reader, false, &result));
   CHECK(result.tree != NULL);
   CHECK(result.error.is_null());
   SmartPointer<const char> output = result.tree->ToString();
@@ -69,7 +69,7 @@ static bool ParseEscapes(const char* input) {
   ZoneScope zone_scope(DELETE_ON_EXIT);
   FlatStringReader reader(CStrVector(input));
   RegExpParseResult result;
-  CHECK(v8::internal::ParseRegExp(&reader, &result));
+  CHECK(v8::internal::ParseRegExp(&reader, false, &result));
   CHECK(result.tree != NULL);
   CHECK(result.error.is_null());
   return result.has_character_escapes;
@@ -257,7 +257,7 @@ static void ExpectError(const char* input,
   ZoneScope zone_scope(DELETE_ON_EXIT);
   FlatStringReader reader(CStrVector(input));
   RegExpParseResult result;
-  CHECK_EQ(false, v8::internal::ParseRegExp(&reader, &result));
+  CHECK_EQ(false, v8::internal::ParseRegExp(&reader, false, &result));
   CHECK(result.tree == NULL);
   CHECK(!result.error.is_null());
   SmartPointer<char> str = result.error->ToCString(ALLOW_NULLS);
@@ -373,23 +373,23 @@ TEST(CharacterClassEscapes) {
 }
 
 
-static RegExpNode* Compile(const char* input) {
+static RegExpNode* Compile(const char* input, bool multiline) {
   FlatStringReader reader(CStrVector(input));
   RegExpParseResult result;
-  if (!v8::internal::ParseRegExp(&reader, &result))
+  if (!v8::internal::ParseRegExp(&reader, multiline, &result))
     return NULL;
   RegExpNode* node = NULL;
-  RegExpEngine::Compile(&result, &node, false);
+  RegExpEngine::Compile(&result, &node, false, multiline);
   return node;
 }
 
 
 static void Execute(const char* input,
-                    const char* str,
+                    bool multiline,
                     bool dot_output = false) {
   v8::HandleScope scope;
   ZoneScope zone_scope(DELETE_ON_EXIT);
-  RegExpNode* node = Compile(input);
+  RegExpNode* node = Compile(input, multiline);
   USE(node);
 #ifdef DEBUG
   if (dot_output) {
@@ -400,14 +400,6 @@ static void Execute(const char* input,
 }
 
 
-TEST(Execution) {
-  V8::Initialize(NULL);
-  Execute(".*?(?:a[bc]d|e[fg]h)", "xxxabbegh");
-  Execute(".*?(?:a[bc]d|e[fg]h)", "xxxabbefh");
-  Execute(".*?(?:a[bc]d|e[fg]h)", "xxxabbefd");
-}
-
-
 class TestConfig {
  public:
   typedef int Key;
@@ -1043,8 +1035,8 @@ TEST(LatinCanonicalize) {
 TEST(SimplePropagation) {
   v8::HandleScope scope;
   ZoneScope zone_scope(DELETE_ON_EXIT);
-  RegExpNode* node = Compile("(a|^b|c)");
-  CHECK(node->info()->determine_start);
+  RegExpNode* node = Compile("(a|^b|c)", false);
+  CHECK(node->info()->follows_start_interest);
 }
 
 
@@ -1061,7 +1053,7 @@ static uc32 CanonRange(uc32 c) {
 
 
 TEST(RangeCanonicalization) {
-  ASSERT((CanonRange(0) & CharacterRange::kStartMarker) != 0);
+  CHECK((CanonRange(0) & CharacterRange::kStartMarker) != 0);
   // Check that we arrive at the same result when using the basic
   // range canonicalization primitives as when using immediate
   // canonicalization.
@@ -1175,5 +1167,5 @@ TEST(CharacterRangeCaseIndependence) {
 
 TEST(Graph) {
   V8::Initialize(NULL);
-  Execute("\\w+", "", true);
+  Execute("(?:foo|bar$)", false, true);
 }