- Added some expansion of assertions.
authorchristian.plesner.hansen@gmail.com <christian.plesner.hansen@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Mon, 1 Dec 2008 15:42:35 +0000 (15:42 +0000)
committerchristian.plesner.hansen@gmail.com <christian.plesner.hansen@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Mon, 1 Dec 2008 15:42:35 +0000 (15:42 +0000)
- Splitting of character classes into word and non-word parts.
- A bunch of refactorings.
- Made dispatch table construction lazy.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@880 ce2b1a6d-e550-0410-aec6-3dcde31c8c00

src/char-predicates-inl.h
src/char-predicates.h
src/jsregexp.cc
src/jsregexp.h
src/parser.cc
test/cctest/test-regexp.cc

index b575bdfe58228008f1f728b42ed29aa74c6e51a4..3ef2f91616904f79b577c4c5eb1558b83aff63b3 100644 (file)
@@ -59,6 +59,25 @@ inline bool IsHexDigit(uc32 c) {
 }
 
 
+inline bool IsRegExpWord(uc16 c) {
+  return ('a' <= c && c <= 'z')
+      || ('A' <= c && c <= 'Z')
+      || ('0' <= c && c <= '9')
+      || (c == '_');
+}
+
+
+inline bool IsRegExpNewline(uc16 c) {
+  switch (c) {
+    //   CR           LF           LS           PS
+    case 0x000A: case 0x000D: case 0x2028: case 0x2029:
+      return false;
+    default:
+      return true;
+  }
+}
+
+
 } }  // namespace v8::internal
 
 #endif  // V8_CHAR_PREDICATES_INL_H_
index 3fcaeb626f457d7397f5a83fc806356bfba166c7..63e83b4d3f05b3b458199934443590048cbfaadf 100644 (file)
@@ -37,6 +37,8 @@ inline bool IsCarriageReturn(uc32 c);
 inline bool IsLineFeed(uc32 c);
 inline bool IsDecimalDigit(uc32 c);
 inline bool IsHexDigit(uc32 c);
+inline bool IsRegExpWord(uc32 c);
+inline bool IsRegExpNewline(uc32 c);
 
 struct IdentifierStart {
   static inline bool Is(uc32 c) {
index 439a8d1098a5b54b857950b9cbe8ca9cb99a28ac..8bafc28d66e0ef26188dfb4d40d3a64284a5763c 100644 (file)
@@ -897,6 +897,16 @@ TextElement TextElement::CharClass(
 }
 
 
+DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
+  if (table_ == NULL) {
+    table_ = new DispatchTable();
+    DispatchTableConstructor cons(table_, ignore_case);
+    cons.BuildTable(this);
+  }
+  return table_;
+}
+
+
 class RegExpCompiler {
  public:
   RegExpCompiler(int capture_count, bool ignore_case);
@@ -1561,7 +1571,9 @@ bool BackReferenceNode::Emit(RegExpCompiler* compiler) {
 
 class DotPrinter: public NodeVisitor {
  public:
-  DotPrinter() : stream_(&alloc_) { }
+  explicit DotPrinter(bool ignore_case)
+      : ignore_case_(ignore_case),
+        stream_(&alloc_) { }
   void PrintNode(const char* label, RegExpNode* node);
   void Visit(RegExpNode* node);
   void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure);
@@ -1572,6 +1584,7 @@ class DotPrinter: public NodeVisitor {
 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
 #undef DECLARE_VISIT
  private:
+  bool ignore_case_;
   HeapStringAllocator alloc_;
   StringStream stream_;
   std::set<RegExpNode*> seen_;
@@ -1668,44 +1681,80 @@ class TableEntryHeaderPrinter {
 };
 
 
+class AttributePrinter {
+ public:
+  explicit AttributePrinter(DotPrinter* out)
+      : out_(out), first_(true) { }
+  void PrintSeparator() {
+    if (first_) {
+      first_ = false;
+    } else {
+      out_->stream()->Add("|");
+    }
+  }
+  void PrintBit(const char* name, bool value) {
+    if (!value) return;
+    PrintSeparator();
+    out_->stream()->Add("{%s}", name);
+  }
+  void PrintPositive(const char* name, int value) {
+    if (value < 0) return;
+    PrintSeparator();
+    out_->stream()->Add("{%s|%x}", name, value);
+  }
+ private:
+  DotPrinter* out_;
+  bool first_;
+};
+
+
 void DotPrinter::PrintAttributes(RegExpNode* that) {
-  stream()->Add("  a%p [shape=Mrecord, style=dashed, color=lightgrey, "
-                "fontcolor=lightgrey, margin=0.1, fontsize=10, label=\"{",
+  stream()->Add("  a%p [shape=Mrecord, color=grey, fontcolor=grey, "
+                "margin=0.1, fontsize=10, label=\"{",
                 that);
+  AttributePrinter printer(this);
   NodeInfo* info = that->info();
-  stream()->Add("{NI|%i}|{WI|%i}|{SI|%i}",
-                info->follows_newline_interest,
-                info->follows_word_interest,
-                info->follows_start_interest);
-  stream()->Add("|{DN|%i}|{DW|%i}|{DS|%i}|{AE|%i}",
-                info->determine_newline,
-                info->determine_word,
-                info->determine_start,
-                info->at_end);
-  if (info->follows_newline != NodeInfo::UNKNOWN)
-    stream()->Add("|{FN|%i}", info->follows_newline);
-  if (info->follows_word != NodeInfo::UNKNOWN)
-    stream()->Add("|{FW|%i}", info->follows_word);
-  if (info->follows_start != NodeInfo::UNKNOWN)
-    stream()->Add("|{FS|%i}", info->follows_start);
+  printer.PrintBit("NI", info->follows_newline_interest);
+  printer.PrintBit("WI", info->follows_word_interest);
+  printer.PrintBit("SI", info->follows_start_interest);
+  printer.PrintBit("DN", info->determine_newline);
+  printer.PrintBit("DW", info->determine_word);
+  printer.PrintBit("DS", info->determine_start);
+  printer.PrintBit("DDN", info->does_determine_newline);
+  printer.PrintBit("DDW", info->does_determine_word);
+  printer.PrintBit("DDS", info->does_determine_start);
+  printer.PrintPositive("IW", info->is_word);
+  printer.PrintPositive("IN", info->is_newline);
+  printer.PrintPositive("FN", info->follows_newline);
+  printer.PrintPositive("FW", info->follows_word);
+  printer.PrintPositive("FS", info->follows_start);
   Label* label = that->label();
   if (label->is_bound())
-    stream()->Add("|{@|%x}", label->pos());
+    printer.PrintPositive("@", label->pos());
   stream()->Add("}\"];\n");
-  stream()->Add("  a%p -> n%p [style=dashed, color=lightgrey, "
+  stream()->Add("  a%p -> n%p [style=dashed, color=grey, "
                 "arrowhead=none];\n", that, that);
 }
 
 
+static const bool kPrintDispatchTable = false;
 void DotPrinter::VisitChoice(ChoiceNode* that) {
-  stream()->Add("  n%p [shape=Mrecord, label=\"", that);
-  TableEntryHeaderPrinter header_printer(stream());
-  that->table()->ForEach(&header_printer);
-  stream()->Add("\"]\n", that);
-  PrintAttributes(that);
-  TableEntryBodyPrinter body_printer(stream(), that);
-  that->table()->ForEach(&body_printer);
-  PrintOnFailure(that, that->on_failure());
+  if (kPrintDispatchTable) {
+    stream()->Add("  n%p [shape=Mrecord, label=\"", that);
+    TableEntryHeaderPrinter header_printer(stream());
+    that->GetTable(ignore_case_)->ForEach(&header_printer);
+    stream()->Add("\"]\n", that);
+    PrintAttributes(that);
+    TableEntryBodyPrinter body_printer(stream(), that);
+    that->GetTable(ignore_case_)->ForEach(&body_printer);
+    PrintOnFailure(that, that->on_failure());
+  } else {
+    stream()->Add("  n%p [shape=Mrecord, label=\"?\"];\n", that);
+    for (int i = 0; i < that->alternatives()->length(); i++) {
+      GuardedAlternative alt = that->alternatives()->at(i);
+      stream()->Add("  n%p -> n%p;\n", that, alt.node());
+    }
+  }
   for (int i = 0; i < that->alternatives()->length(); i++) {
     GuardedAlternative alt = that->alternatives()->at(i);
     alt.node()->Accept(this);
@@ -1837,8 +1886,10 @@ void DispatchTable::Dump() {
 }
 
 
-void RegExpEngine::DotPrint(const char* label, RegExpNode* node) {
-  DotPrinter printer;
+void RegExpEngine::DotPrint(const char* label,
+                            RegExpNode* node,
+                            bool ignore_case) {
+  DotPrinter printer(ignore_case);
   printer.PrintNode(label, node);
 }
 
@@ -2181,6 +2232,56 @@ void CharacterRange::AddClassEscape(uc16 type,
 }
 
 
+Vector<const uc16> CharacterRange::GetWordBounds() {
+  return Vector<const uc16>(kWordRanges, kWordRangeCount);
+}
+
+
+class CharacterRangeSplitter {
+ public:
+  CharacterRangeSplitter(ZoneList<CharacterRange>** included,
+                          ZoneList<CharacterRange>** excluded)
+      : included_(included),
+        excluded_(excluded) { }
+  void Call(uc16 from, DispatchTable::Entry entry);
+
+  static const int kInBase = 0;
+  static const int kInOverlay = 1;
+
+ private:
+  ZoneList<CharacterRange>** included_;
+  ZoneList<CharacterRange>** excluded_;
+};
+
+
+void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
+  if (!entry.out_set()->Get(kInBase)) return;
+  ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
+    ? included_
+    : excluded_;
+  if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
+  (*target)->Add(CharacterRange(entry.from(), entry.to()));
+}
+
+
+void CharacterRange::Split(ZoneList<CharacterRange>* base,
+                           Vector<const uc16> overlay,
+                           ZoneList<CharacterRange>** included,
+                           ZoneList<CharacterRange>** excluded) {
+  ASSERT_EQ(NULL, *included);
+  ASSERT_EQ(NULL, *excluded);
+  DispatchTable table;
+  for (int i = 0; i < base->length(); i++)
+    table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
+  for (int i = 0; i < overlay.length(); i += 2) {
+    table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
+                   CharacterRangeSplitter::kInOverlay);
+  }
+  CharacterRangeSplitter callback(included, excluded);
+  table.ForEach(&callback);
+}
+
+
 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
   if (IsSingleton()) {
@@ -2266,40 +2367,47 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
 // Interest propagation
 
 
-RegExpNode* RegExpNode::GetSibling(NodeInfo* info) {
+RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
   for (int i = 0; i < siblings_.length(); i++) {
     RegExpNode* sibling = siblings_.Get(i);
-    if (sibling->info()->HasSameForwardInterests(info))
+    if (sibling->info()->Matches(info))
       return sibling;
   }
   return NULL;
 }
 
 
+RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
+  ASSERT_EQ(false, *cloned);
+  ASSERT(!info->HasAssertions());
+  siblings_.Ensure(this);
+  RegExpNode* result = TryGetSibling(info);
+  if (result != NULL) return result;
+  result = this->Clone();
+  NodeInfo* new_info = result->info();
+  new_info->ResetCompilationState();
+  new_info->AddFromPreceding(info);
+  AddSibling(result);
+  *cloned = true;
+  return result;
+}
+
+
 template <class C>
 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
   NodeInfo full_info(*node->info());
   full_info.AddFromPreceding(info);
-  RegExpNode* sibling = node->GetSibling(&full_info);
-  if (sibling != NULL) return sibling;
-  node->EnsureSiblings();
-  sibling = new C(*node);
-  sibling->info()->AddFromPreceding(&full_info);
-  node->AddSibling(sibling);
-  return sibling;
+  bool cloned = false;
+  return RegExpNode::EnsureSibling(node, &full_info, &cloned);
 }
 
 
 RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {
   NodeInfo full_info(*this->info());
   full_info.AddFromPreceding(info);
-  RegExpNode* sibling = GetSibling(&full_info);
-  if (sibling != NULL) return sibling;
-  EnsureSiblings();
-  ActionNode* action = new ActionNode(*this);
-  action->info()->AddFromPreceding(&full_info);
-  AddSibling(action);
-  if (type_ != ESCAPE_SUBMATCH) {
+  bool cloned = false;
+  ActionNode* action = EnsureSibling(this, &full_info, &cloned);
+  if (cloned && type_ != ESCAPE_SUBMATCH) {
     action->set_on_success(action->on_success()->PropagateForward(info));
   }
   return action;
@@ -2309,22 +2417,20 @@ RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {
 RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) {
   NodeInfo full_info(*this->info());
   full_info.AddFromPreceding(info);
-  RegExpNode* sibling = GetSibling(&full_info);
-  if (sibling != NULL) return sibling;
-  EnsureSiblings();
-  ChoiceNode* choice = new ChoiceNode(*this);
-  choice->info()->AddFromPreceding(&full_info);
-  AddSibling(choice);
-  ZoneList<GuardedAlternative>* old_alternatives = alternatives();
-  int count = old_alternatives->length();
-  choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
-  for (int i = 0; i < count; i++) {
-    GuardedAlternative alternative = old_alternatives->at(i);
-    alternative.set_node(alternative.node()->PropagateForward(info));
-    choice->alternatives()->Add(alternative);
-  }
-  if (!choice->on_failure_->IsBacktrack()) {
-    choice->on_failure_ = choice->on_failure_->PropagateForward(info);
+  bool cloned = false;
+  ChoiceNode* choice = EnsureSibling(this, &full_info, &cloned);
+  if (cloned) {
+    ZoneList<GuardedAlternative>* old_alternatives = alternatives();
+    int count = old_alternatives->length();
+    choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
+    for (int i = 0; i < count; i++) {
+      GuardedAlternative alternative = old_alternatives->at(i);
+      alternative.set_node(alternative.node()->PropagateForward(info));
+      choice->alternatives()->Add(alternative);
+    }
+    if (!choice->on_failure_->IsBacktrack()) {
+      choice->on_failure_ = choice->on_failure_->PropagateForward(info);
+    }
   }
   return choice;
 }
@@ -2338,18 +2444,16 @@ RegExpNode* EndNode::PropagateForward(NodeInfo* info) {
 RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) {
   NodeInfo full_info(*this->info());
   full_info.AddFromPreceding(info);
-  RegExpNode* sibling = GetSibling(&full_info);
-  if (sibling != NULL) return sibling;
-  EnsureSiblings();
-  BackReferenceNode* back_ref = new BackReferenceNode(*this);
-  back_ref->info()->AddFromPreceding(&full_info);
-  AddSibling(back_ref);
-  // TODO(erikcorry): A back reference has to have two successors (by default
-  // the same node).  The first is used if the back reference matches a non-
-  // empty back reference, the second if it matches an empty one.  This doesn't
-  // matter for at_end, which is the only one implemented right now, but it will
-  // matter for other pieces of info.
-  back_ref->set_on_success(back_ref->on_success()->PropagateForward(info));
+  bool cloned = false;
+  BackReferenceNode* back_ref = EnsureSibling(this, &full_info, &cloned);
+  if (cloned) {
+    // TODO(erikcorry): A back reference has to have two successors (by default
+    // the same node).  The first is used if the back reference matches a non-
+    // empty back reference, the second if it matches an empty one.  This
+    // doesn't matter for at_end, which is the only one implemented right now,
+    // but it will matter for other pieces of info.
+    back_ref->set_on_success(back_ref->on_success()->PropagateForward(info));
+  }
   return back_ref;
 }
 
@@ -2560,10 +2664,6 @@ void Analysis::VisitChoice(ChoiceNode* that) {
     // this node also, so it can pass it on.
     info->AddFromFollowing(node->info());
   }
-  if (!that->table_calculated()) {
-    DispatchTableConstructor cons(that->table());
-    cons.BuildTable(that);
-  }
   EnsureAnalyzed(that->on_failure());
 }
 
@@ -2574,6 +2674,208 @@ void Analysis::VisitBackReference(BackReferenceNode* that) {
 }
 
 
+// -------------------------------------------------------------------
+// Assumption expansion
+
+
+RegExpNode* RegExpNode::EnsureExpanded(NodeInfo* info) {
+  siblings_.Ensure(this);
+  NodeInfo new_info = *this->info();
+  if (new_info.follows_word_interest)
+    new_info.follows_word = info->follows_word;
+  if (new_info.follows_newline_interest)
+    new_info.follows_newline = info->follows_newline;
+  // If the following node should determine something we need to get
+  // a sibling that determines it.
+  new_info.does_determine_newline = new_info.determine_newline;
+  new_info.does_determine_word = new_info.determine_word;
+  new_info.does_determine_start = new_info.determine_start;
+  RegExpNode* sibling = TryGetSibling(&new_info);
+  if (sibling == NULL) {
+    sibling = ExpandLocal(&new_info);
+    siblings_.Add(sibling);
+    sibling->info()->being_expanded = true;
+    sibling->ExpandChildren();
+    sibling->info()->being_expanded = false;
+    sibling->info()->been_expanded = true;
+  } else {
+    NodeInfo* sib_info = sibling->info();
+    if (!sib_info->been_expanded && !sib_info->being_expanded) {
+      sibling->info()->being_expanded = true;
+      sibling->ExpandChildren();
+      sibling->info()->being_expanded = false;
+      sibling->info()->been_expanded = true;
+    }
+  }
+  return sibling;
+}
+
+
+RegExpNode* ChoiceNode::ExpandLocal(NodeInfo* info) {
+  ChoiceNode* clone = this->Clone();
+  clone->info()->ResetCompilationState();
+  clone->info()->AddAssumptions(info);
+  return clone;
+}
+
+
+void ChoiceNode::ExpandChildren() {
+  ZoneList<GuardedAlternative>* alts = alternatives();
+  ZoneList<GuardedAlternative>* new_alts
+      = new ZoneList<GuardedAlternative>(alts->length());
+  for (int i = 0; i < alts->length(); i++) {
+    GuardedAlternative next = alts->at(i);
+    next.set_node(next.node()->EnsureExpanded(info()));
+    new_alts->Add(next);
+  }
+  alternatives_ = new_alts;
+}
+
+
+RegExpNode* TextNode::ExpandLocal(NodeInfo* info) {
+  TextElement last = elements()->last();
+  if (last.type == TextElement::CHAR_CLASS) {
+    RegExpCharacterClass* char_class = last.data.u_char_class;
+    if (info->does_determine_word) {
+      ZoneList<CharacterRange>* word = NULL;
+      ZoneList<CharacterRange>* non_word = NULL;
+      CharacterRange::Split(char_class->ranges(),
+                            CharacterRange::GetWordBounds(),
+                            &word,
+                            &non_word);
+      if (non_word == NULL) {
+        // This node contains no non-word characters so it must be
+        // all word.
+        this->info()->is_word = NodeInfo::TRUE;
+      } else if (word == NULL) {
+        // Vice versa.
+        this->info()->is_word = NodeInfo::FALSE;
+      } else {
+        // If this character class contains both word and non-word
+        // characters we need to split it into two.
+        ChoiceNode* result = new ChoiceNode(2, on_failure());
+        // Welcome to the family, son!
+        result->set_siblings(this->siblings());
+        *result->info() = *this->info();
+        result->info()->ResetCompilationState();
+        result->info()->AddAssumptions(info);
+        RegExpNode* word_node
+            = new TextNode(new RegExpCharacterClass(word, false),
+                           on_success(),
+                           on_failure());
+        word_node->info()->determine_word = true;
+        word_node->info()->does_determine_word = true;
+        word_node->info()->is_word = NodeInfo::TRUE;
+        result->alternatives()->Add(GuardedAlternative(word_node));
+        RegExpNode* non_word_node
+            = new TextNode(new RegExpCharacterClass(non_word, false),
+                           on_success(),
+                           on_failure());
+        non_word_node->info()->determine_word = true;
+        non_word_node->info()->does_determine_word = true;
+        non_word_node->info()->is_word = NodeInfo::FALSE;
+        result->alternatives()->Add(GuardedAlternative(non_word_node));
+        return result;
+      }
+    }
+  }
+  TextNode* clone = this->Clone();
+  clone->info()->ResetCompilationState();
+  clone->info()->AddAssumptions(info);
+  return clone;
+}
+
+
+void TextNode::ExpandAtomChildren(RegExpAtom* that) {
+  NodeInfo new_info = *info();
+  uc16 last = that->data()[that->data().length() - 1];
+  if (info()->determine_word) {
+    new_info.follows_word = IsRegExpWord(last)
+      ? NodeInfo::TRUE : NodeInfo::FALSE;
+  } else {
+    new_info.follows_word = NodeInfo::UNKNOWN;
+  }
+  if (info()->determine_newline) {
+    new_info.follows_newline = IsRegExpNewline(last)
+      ? NodeInfo::TRUE : NodeInfo::FALSE;
+  } else {
+    new_info.follows_newline = NodeInfo::UNKNOWN;
+  }
+  if (info()->determine_start) {
+    new_info.follows_start = NodeInfo::FALSE;
+  } else {
+    new_info.follows_start = NodeInfo::UNKNOWN;
+  }
+  set_on_success(on_success()->EnsureExpanded(&new_info));
+}
+
+
+void TextNode::ExpandCharClassChildren(RegExpCharacterClass* that) {
+  if (info()->does_determine_word) {
+    // ASSERT(info()->is_word != NodeInfo::UNKNOWN);
+    NodeInfo next_info = *on_success()->info();
+    next_info.follows_word = info()->is_word;
+    set_on_success(on_success()->EnsureExpanded(&next_info));
+  } else {
+    set_on_success(on_success()->EnsureExpanded(info()));
+  }
+}
+
+
+void TextNode::ExpandChildren() {
+  TextElement last = elements()->last();
+  switch (last.type) {
+    case TextElement::ATOM:
+      ExpandAtomChildren(last.data.u_atom);
+      break;
+    case TextElement::CHAR_CLASS:
+      ExpandCharClassChildren(last.data.u_char_class);
+      break;
+    default:
+      UNREACHABLE();
+  }
+}
+
+
+RegExpNode* ActionNode::ExpandLocal(NodeInfo* info) {
+  ActionNode* clone = this->Clone();
+  clone->info()->ResetCompilationState();
+  clone->info()->AddAssumptions(info);
+  return clone;
+}
+
+
+void ActionNode::ExpandChildren() {
+  set_on_success(on_success()->EnsureExpanded(info()));
+}
+
+
+RegExpNode* BackReferenceNode::ExpandLocal(NodeInfo* info) {
+  BackReferenceNode* clone = this->Clone();
+  clone->info()->ResetCompilationState();
+  clone->info()->AddAssumptions(info);
+  return clone;
+}
+
+
+void BackReferenceNode::ExpandChildren() {
+  set_on_success(on_success()->EnsureExpanded(info()));
+}
+
+
+RegExpNode* EndNode::ExpandLocal(NodeInfo* info) {
+  EndNode* clone = this->Clone();
+  clone->info()->ResetCompilationState();
+  clone->info()->AddAssumptions(info);
+  return clone;
+}
+
+
+void EndNode::ExpandChildren() {
+  // nothing to do
+}
+
+
 // -------------------------------------------------------------------
 // Dispatch table construction
 
@@ -2584,7 +2886,6 @@ void DispatchTableConstructor::VisitEnd(EndNode* that) {
 
 
 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
-  ASSERT(!node->table_calculated());
   node->set_being_calculated(true);
   ZoneList<GuardedAlternative>* alternatives = node->alternatives();
   for (int i = 0; i < alternatives->length(); i++) {
@@ -2592,7 +2893,6 @@ void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
     alternatives->at(i).node()->Accept(this);
   }
   node->set_being_calculated(false);
-  node->set_table_calculated(true);
 }
 
 
@@ -2615,13 +2915,9 @@ void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
   if (node->being_calculated())
     return;
-  if (!node->table_calculated()) {
-    DispatchTableConstructor constructor(node->table());
-    constructor.BuildTable(node);
-  }
-  ASSERT(node->table_calculated());
+  DispatchTable* table = node->GetTable(ignore_case_);
   AddDispatchRange adder(this);
-  node->table()->ForEach(&adder);
+  table->ForEach(&adder);
 }
 
 
@@ -2716,6 +3012,9 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
   Analysis analysis(ignore_case);
   analysis.EnsureAnalyzed(node);
 
+  NodeInfo info = *node->info();
+  node = node->EnsureExpanded(&info);
+
   if (!FLAG_irregexp) {
     return Handle<FixedArray>::null();
   }
index a691b127aaa8560b256432a826de4ea5d5fae26f..38357d670ce86ce3aee98c27ec81f5db3046af46 100644 (file)
@@ -185,6 +185,7 @@ class CharacterRange {
   CharacterRange(void* null) { ASSERT_EQ(NULL, null); }  //NOLINT
   CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { }
   static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges);
+  static Vector<const uc16> GetWordBounds();
   static inline CharacterRange Singleton(uc16 value) {
     return CharacterRange(value, value);
   }
@@ -203,9 +204,15 @@ class CharacterRange {
   bool is_valid() { return from_ <= to_; }
   bool IsSingleton() { return (from_ == to_); }
   void AddCaseEquivalents(ZoneList<CharacterRange>* ranges);
+  static void Split(ZoneList<CharacterRange>* base,
+                    Vector<const uc16> overlay,
+                    ZoneList<CharacterRange>** included,
+                    ZoneList<CharacterRange>** excluded);
+
   static const int kRangeCanonicalizeMax = 0x346;
   static const int kStartMarker = (1 << 24);
   static const int kPayloadMask = (1 << 24) - 1;
+
  private:
   uc16 from_;
   uc16 to_;
@@ -344,7 +351,7 @@ class OutSet: public ZoneObject {
 
 // A mapping from integers, specified as ranges, to a set of integers.
 // Used for mapping character ranges to choices.
-class DispatchTable {
+class DispatchTable : public ZoneObject {
  public:
   class Entry {
    public:
@@ -437,29 +444,50 @@ class TextElement {
 
 
 struct NodeInfo {
-  enum PrecedingInfo {
+  enum TriBool {
     UNKNOWN = -1, FALSE = 0, TRUE = 1
   };
 
   NodeInfo()
       : being_analyzed(false),
         been_analyzed(false),
+        being_expanded(false),
+        been_expanded(false),
         determine_word(false),
         determine_newline(false),
         determine_start(false),
+        does_determine_word(false),
+        does_determine_newline(false),
+        does_determine_start(false),
         follows_word_interest(false),
         follows_newline_interest(false),
         follows_start_interest(false),
+        is_word(UNKNOWN),
+        is_newline(UNKNOWN),
         at_end(false),
         follows_word(UNKNOWN),
         follows_newline(UNKNOWN),
         follows_start(UNKNOWN) { }
 
-  bool HasSameForwardInterests(NodeInfo* that) {
-    return (at_end == that->at_end)
-        && (follows_word_interest == that->follows_word_interest)
-        && (follows_newline_interest == that->follows_newline_interest)
-        && (follows_start_interest == that->follows_start_interest);
+  // Returns true if the interests and assumptions of this node
+  // matches the given one.
+  bool Matches(NodeInfo* that) {
+    return (at_end == that->at_end) &&
+           (follows_word_interest == that->follows_word_interest) &&
+           (follows_newline_interest == that->follows_newline_interest) &&
+           (follows_start_interest == that->follows_start_interest) &&
+           (follows_word == that->follows_word) &&
+           (follows_newline == that->follows_newline) &&
+           (follows_start == that->follows_start) &&
+           (does_determine_word == that->does_determine_word) &&
+           (does_determine_newline == that->does_determine_newline) &&
+           (does_determine_start == that->does_determine_start);
+  }
+
+  bool HasAssertions() {
+    return (follows_word != UNKNOWN) ||
+           (follows_newline != UNKNOWN) ||
+           (follows_start != UNKNOWN);
   }
 
   // Updates the interests of this node given the interests of the
@@ -471,6 +499,26 @@ struct NodeInfo {
     follows_start_interest |= that->follows_start_interest;
   }
 
+  void AddAssumptions(NodeInfo* that) {
+    if (that->follows_word != UNKNOWN) {
+      ASSERT(follows_word == UNKNOWN || follows_word == that->follows_word);
+      follows_word = that->follows_word;
+    }
+    if (that->follows_newline != UNKNOWN) {
+      ASSERT(follows_newline == UNKNOWN ||
+             follows_newline == that->follows_newline);
+      follows_newline = that->follows_newline;
+    }
+    if (that->follows_start != UNKNOWN) {
+      ASSERT(follows_start == UNKNOWN ||
+             follows_start == that->follows_start);
+      follows_start = that->follows_start;
+    }
+    does_determine_word = that->does_determine_word;
+    does_determine_newline = that->does_determine_newline;
+    does_determine_start = that->does_determine_start;
+  }
+
   // Sets the interests of this node to include the interests of the
   // following node.
   void AddFromFollowing(NodeInfo* that) {
@@ -479,8 +527,17 @@ struct NodeInfo {
     follows_start_interest |= that->follows_start_interest;
   }
 
+  void ResetCompilationState() {
+    being_analyzed = false;
+    been_analyzed = false;
+    being_expanded = false;
+    been_expanded = false;
+  }
+
   bool being_analyzed: 1;
   bool been_analyzed: 1;
+  bool being_expanded: 1;
+  bool been_expanded: 1;
 
   // These bits are set if this node must propagate forward information
   // about the last character it consumed (or, in the case of 'start',
@@ -489,19 +546,40 @@ struct NodeInfo {
   bool determine_newline: 1;
   bool determine_start: 1;
 
+  bool does_determine_word: 1;
+  bool does_determine_newline: 1;
+  bool does_determine_start: 1;
+
   // These bits are set of this node has to know what the preceding
   // character was.
   bool follows_word_interest: 1;
   bool follows_newline_interest: 1;
   bool follows_start_interest: 1;
 
+  TriBool is_word: 2;
+  TriBool is_newline: 2;
+
   bool at_end: 1;
 
   // These bits are set if the node can make assumptions about what
   // the previous character was.
-  PrecedingInfo follows_word: 2;
-  PrecedingInfo follows_newline: 2;
-  PrecedingInfo follows_start: 2;
+  TriBool follows_word: 2;
+  TriBool follows_newline: 2;
+  TriBool follows_start: 2;
+};
+
+
+class ExpansionGuard {
+ public:
+  explicit inline ExpansionGuard(NodeInfo* info) : info_(info) {
+    ASSERT(!info->being_expanded);
+    info->being_expanded = true;
+  }
+  inline ~ExpansionGuard() {
+    info_->being_expanded = false;
+  }
+ private:
+  NodeInfo* info_;
 };
 
 
@@ -538,6 +616,10 @@ class RegExpNode: public ZoneObject {
   // false for failure.
   virtual bool Emit(RegExpCompiler* compiler) = 0;
 
+  RegExpNode* EnsureExpanded(NodeInfo* info);
+  virtual RegExpNode* ExpandLocal(NodeInfo* info) = 0;
+  virtual void ExpandChildren() = 0;
+
   // Propagates the given interest information forward.  When seeing
   // \bfoo for instance, the \b is implemented by propagating forward
   // to the 'foo' string that it should only succeed if its first
@@ -546,11 +628,40 @@ class RegExpNode: public ZoneObject {
 
   NodeInfo* info() { return &info_; }
   virtual bool IsBacktrack() { return false; }
-  RegExpNode* GetSibling(NodeInfo* info);
-  void EnsureSiblings() { siblings_.Ensure(this); }
+
   void AddSibling(RegExpNode* node) { siblings_.Add(node); }
+
+  // Static version of EnsureSibling that expresses the fact that the
+  // result has the same type as the input.
+  template <class C>
+  static C* EnsureSibling(C* node, NodeInfo* info, bool* cloned) {
+    return static_cast<C*>(node->EnsureSibling(info, cloned));
+  }
+
+  SiblingList* siblings() { return &siblings_; }
+  void set_siblings(SiblingList* other) { siblings_ = *other; }
+
  protected:
+
+  // Returns a sibling of this node whose interests and assumptions
+  // match the ones in the given node info.  If no sibling exists NULL
+  // is returned.
+  RegExpNode* TryGetSibling(NodeInfo* info);
+
+  // Returns a sibling of this node whose interests match the ones in
+  // the given node info.  The info must not contain any assertions.
+  // If no node exists a new one will be created by cloning the current
+  // node.  The result will always be an instance of the same concrete
+  // class as this node.
+  RegExpNode* EnsureSibling(NodeInfo* info, bool* cloned);
+
+  // Returns a clone of this node initialized using the copy constructor
+  // of its concrete class.  Note that the node may have to be pre-
+  // processed before it is on a useable state.
+  virtual RegExpNode* Clone() = 0;
+
   inline void Bind(RegExpMacroAssembler* macro);
+
  private:
   Label label_;
   NodeInfo info_;
@@ -593,7 +704,11 @@ class ActionNode: public SeqRegExpNode {
                                     RegExpNode* on_success);
   virtual void Accept(NodeVisitor* visitor);
   virtual bool Emit(RegExpCompiler* compiler);
+  virtual RegExpNode* ExpandLocal(NodeInfo* info);
+  virtual void ExpandChildren();
   virtual RegExpNode* PropagateForward(NodeInfo* info);
+  virtual ActionNode* Clone() { return new ActionNode(*this); }
+
  private:
   union {
     struct {
@@ -627,13 +742,28 @@ class TextNode: public SeqRegExpNode {
       : SeqRegExpNode(on_success),
         on_failure_(on_failure),
         elms_(elms) { }
+  TextNode(RegExpCharacterClass* that,
+           RegExpNode* on_success,
+           RegExpNode* on_failure)
+      : SeqRegExpNode(on_success),
+        on_failure_(on_failure),
+        elms_(new ZoneList<TextElement>(1)) {
+    elms_->Add(TextElement::CharClass(that));
+  }
   virtual void Accept(NodeVisitor* visitor);
   virtual RegExpNode* PropagateForward(NodeInfo* info);
+  virtual RegExpNode* ExpandLocal(NodeInfo* info);
+  virtual void ExpandChildren();
   RegExpNode* on_failure() { return on_failure_; }
   virtual bool Emit(RegExpCompiler* compiler);
   ZoneList<TextElement>* elements() { return elms_; }
   void MakeCaseIndependent();
+  virtual TextNode* Clone() { return new TextNode(*this); }
+
  private:
+  void ExpandAtomChildren(RegExpAtom* that);
+  void ExpandCharClassChildren(RegExpCharacterClass* that);
+
   RegExpNode* on_failure_;
   ZoneList<TextElement>* elms_;
 };
@@ -655,6 +785,10 @@ class BackReferenceNode: public SeqRegExpNode {
   int end_register() { return end_reg_; }
   virtual bool Emit(RegExpCompiler* compiler);
   virtual RegExpNode* PropagateForward(NodeInfo* info);
+  virtual RegExpNode* ExpandLocal(NodeInfo* info);
+  virtual void ExpandChildren();
+  virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
+
  private:
   RegExpNode* on_failure_;
   int start_reg_;
@@ -669,8 +803,12 @@ class EndNode: public RegExpNode {
   virtual void Accept(NodeVisitor* visitor);
   virtual bool Emit(RegExpCompiler* compiler);
   virtual RegExpNode* PropagateForward(NodeInfo* info);
+  virtual RegExpNode* ExpandLocal(NodeInfo* info);
+  virtual void ExpandChildren();
   virtual bool IsBacktrack() { return action_ == BACKTRACK; }
   virtual bool GoTo(RegExpCompiler* compiler);
+  virtual EndNode* Clone() { return new EndNode(*this); }
+
  private:
   Action action_;
 };
@@ -686,6 +824,7 @@ class Guard: public ZoneObject {
   int reg() { return reg_; }
   Relation op() { return op_; }
   int value() { return value_; }
+
  private:
   int reg_;
   Relation op_;
@@ -700,6 +839,7 @@ class GuardedAlternative {
   RegExpNode* node() { return node_; }
   void set_node(RegExpNode* node) { node_ = node; }
   ZoneList<Guard*>* guards() { return guards_; }
+
  private:
   RegExpNode* node_;
   ZoneList<Guard*>* guards_;
@@ -711,27 +851,31 @@ class ChoiceNode: public RegExpNode {
   explicit ChoiceNode(int expected_size, RegExpNode* on_failure)
       : on_failure_(on_failure),
         alternatives_(new ZoneList<GuardedAlternative>(expected_size)),
-        table_calculated_(false),
+        table_(NULL),
         being_calculated_(false) { }
   virtual void Accept(NodeVisitor* visitor);
   void AddAlternative(GuardedAlternative node) { alternatives()->Add(node); }
   ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
-  DispatchTable* table() { return &table_; }
+  DispatchTable* GetTable(bool ignore_case);
   RegExpNode* on_failure() { return on_failure_; }
   virtual bool Emit(RegExpCompiler* compiler);
   virtual RegExpNode* PropagateForward(NodeInfo* info);
-  bool table_calculated() { return table_calculated_; }
-  void set_table_calculated(bool b) { table_calculated_ = b; }
+  virtual RegExpNode* ExpandLocal(NodeInfo* info);
+  virtual void ExpandChildren();
+  virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }
+
   bool being_calculated() { return being_calculated_; }
   void set_being_calculated(bool b) { being_calculated_ = b; }
+
  private:
+  friend class DispatchTableConstructor;
+  friend class Analysis;
   void GenerateGuard(RegExpMacroAssembler* macro_assembler,
                      Guard *guard,
                      Label* on_failure);
   RegExpNode* on_failure_;
   ZoneList<GuardedAlternative>* alternatives_;
-  DispatchTable table_;
-  bool table_calculated_;
+  DispatchTable* table_;
   bool being_calculated_;
 };
 
@@ -750,9 +894,10 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
 // dispatch table of a choice node.
 class DispatchTableConstructor: public NodeVisitor {
  public:
-  explicit DispatchTableConstructor(DispatchTable* table)
+  DispatchTableConstructor(DispatchTable* table, bool ignore_case)
       : table_(table),
-        choice_index_(-1) { }
+        choice_index_(-1),
+        ignore_case_(ignore_case) { }
 
   void BuildTable(ChoiceNode* node);
 
@@ -773,6 +918,7 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
  protected:
   DispatchTable *table_;
   int choice_index_;
+  bool ignore_case_;
 };
 
 
@@ -808,7 +954,7 @@ class RegExpEngine: public AllStatic {
                                     RegExpNode** node_return,
                                     bool ignore_case,
                                     bool multiline);
-  static void DotPrint(const char* label, RegExpNode* node);
+  static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
 };
 
 
index 50412c4d3309656f7a5318d31c1180e64830339c..8217315c8612be8773a6243e42125fb4d2f996ef 100644 (file)
@@ -1028,7 +1028,7 @@ class LexicalScope BASE_EMBEDDED {
 #define DUMMY )  // to make indentation work
 #undef DUMMY
 
-#define CHECK_FAILED  );   \
+#define CHECK_FAILED  /**/);   \
   if (failed_) return NULL; \
   ((void)0
 #define DUMMY )  // to make indentation work
index 8be425e4f43da9b8831acfaf7700dc3d8c3ea409..2212949a337661fd8334cf3f0fda9d774f608639 100644 (file)
@@ -324,27 +324,8 @@ static bool NotWhiteSpace(uc16 c) {
 }
 
 
-static bool IsWord(uc16 c) {
-  return ('a' <= c && c <= 'z')
-      || ('A' <= c && c <= 'Z')
-      || ('0' <= c && c <= '9')
-      || (c == '_');
-}
-
-
 static bool NotWord(uc16 c) {
-  return !IsWord(c);
-}
-
-
-static bool Dot(uc16 c) {
-  switch (c) {
-    //   CR           LF           LS           PS
-    case 0x000A: case 0x000D: case 0x2028: case 0x2029:
-      return false;
-    default:
-      return true;
-  }
+  return !IsRegExpWord(c);
 }
 
 
@@ -364,12 +345,12 @@ static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
 
 
 TEST(CharacterClassEscapes) {
-  TestCharacterClassEscapes('.', Dot);
+  TestCharacterClassEscapes('.', IsRegExpNewline);
   TestCharacterClassEscapes('d', IsDigit);
   TestCharacterClassEscapes('D', NotDigit);
   TestCharacterClassEscapes('s', IsWhiteSpace);
   TestCharacterClassEscapes('S', NotWhiteSpace);
-  TestCharacterClassEscapes('w', IsWord);
+  TestCharacterClassEscapes('w', IsRegExpWord);
   TestCharacterClassEscapes('W', NotWord);
 }
 
@@ -395,7 +376,7 @@ static void Execute(const char* input,
   USE(node);
 #ifdef DEBUG
   if (dot_output) {
-    RegExpEngine::DotPrint(input, node);
+    RegExpEngine::DotPrint(input, node, false);
     exit(0);
   }
 #endif  // DEBUG
@@ -964,7 +945,7 @@ TEST(AddInverseToTable) {
       ranges->Add(CharacterRange(from, to));
     }
     DispatchTable table;
-    DispatchTableConstructor cons(&table);
+    DispatchTableConstructor cons(&table, false);
     cons.set_choice_index(0);
     cons.AddInverse(ranges);
     for (int i = 0; i < kLimit; i++) {
@@ -980,7 +961,7 @@ TEST(AddInverseToTable) {
           new ZoneList<CharacterRange>(1);
   ranges->Add(CharacterRange(0xFFF0, 0xFFFE));
   DispatchTable table;
-  DispatchTableConstructor cons(&table);
+  DispatchTableConstructor cons(&table, false);
   cons.set_choice_index(0);
   cons.AddInverse(ranges);
   CHECK(!table.Get(0xFFFE)->Get(0));
@@ -1162,7 +1143,45 @@ TEST(CharacterRangeCaseIndependence) {
 }
 
 
+static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
+  if (ranges == NULL)
+    return false;
+  for (int i = 0; i < ranges->length(); i++) {
+    CharacterRange range = ranges->at(i);
+    if (range.from() <= c && c <= range.to())
+      return true;
+  }
+  return false;
+}
+
+
+TEST(CharClassDifference) {
+  ZoneScope zone_scope(DELETE_ON_EXIT);
+  ZoneList<CharacterRange>* base = new ZoneList<CharacterRange>(1);
+  base->Add(CharacterRange::Everything());
+  Vector<const uc16> overlay = CharacterRange::GetWordBounds();
+  ZoneList<CharacterRange>* included = NULL;
+  ZoneList<CharacterRange>* excluded = NULL;
+  CharacterRange::Split(base, overlay, &included, &excluded);
+  for (int i = 0; i < (1 << 16); i++) {
+    bool in_base = InClass(i, base);
+    if (in_base) {
+      bool in_overlay = false;
+      for (int j = 0; !in_overlay && j < overlay.length(); j += 2) {
+        if (overlay[j] <= i && i <= overlay[j+1])
+          in_overlay = true;
+      }
+      CHECK_EQ(in_overlay, InClass(i, included));
+      CHECK_EQ(!in_overlay, InClass(i, excluded));
+    } else {
+      CHECK(!InClass(i, included));
+      CHECK(!InClass(i, excluded));
+    }
+  }
+}
+
+
 TEST(Graph) {
   V8::Initialize(NULL);
-  Execute("foo$(?!bar)", false, true);
+  Execute("\\b\\w", false, true);
 }