Add support for \b and ^ and $ in multiline mode, completing Irregexp
author erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Mon, 19 Jan 2009 18:56:47 +0000 (18:56 +0000)
committer erik.corry@gmail.com <erik.corry@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Mon, 19 Jan 2009 18:56:47 +0000 (18:56 +0000)
features.  Switch on Irregexp by default.
Review URL: http://codereview.chromium.org/18193

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@1104 ce2b1a6d-e550-0410-aec6-3dcde31c8c00

15 files changed:
src/ast.h
src/bytecodes-irregexp.h
src/flag-definitions.h
src/interpreter-irregexp.cc
src/jsregexp.cc
src/jsregexp.h
src/regexp-macro-assembler-ia32.cc
src/regexp-macro-assembler-ia32.h
src/regexp-macro-assembler-irregexp.cc
src/regexp-macro-assembler-irregexp.h
src/regexp-macro-assembler-tracer.cc
src/regexp-macro-assembler-tracer.h
src/regexp-macro-assembler.h
src/string.js
test/mozilla/mozilla.status

index 360a054..063e882 100644 (file)
--- a/src/ast.h
+++ b/src/ast.h
@@ -1382,7 +1382,7 @@ class RegExpCharacterClass: public RegExpTree {
   // W : non-ASCII word character
   // d : ASCII digit
   // D : non-ASCII digit
-  // . : non-unicode newline
+  // . : non-unicode non-newline
   // * : All characters
   uc16 standard_type() { return set_.standard_set_type(); }
   ZoneList<CharacterRange>* ranges() { return set_.ranges(); }
index 7ec8635..16d06ef 100644 (file)
@@ -72,8 +72,9 @@ V(LOOKUP_HI_MAP8,    36, 99) /* l_himap8 start8 byte_map_addr32 addr32*     */ \
 V(CHECK_REGISTER_LT, 37, 8) /* check_reg_lt register_index value16 addr32   */ \
 V(CHECK_REGISTER_GE, 38, 8) /* check_reg_ge register_index value16 addr32   */ \
 V(CHECK_REGISTER_EQ_POS, 39, 6) /* check_register_eq_pos index addr32       */ \
-V(CHECK_NOT_AT_START, 40, 5) /* check_not_at_start addr32                   */ \
-V(CHECK_GREEDY,      41, 5) /* check_greedy addr32                          */
+V(CHECK_AT_START,    40, 5) /* check_at_start addr32                        */ \
+V(CHECK_NOT_AT_START, 41, 5) /* check_not_at_start addr32                   */ \
+V(CHECK_GREEDY,      42, 5) /* check_greedy addr32                          */
 
 #define DECLARE_BYTECODES(name, code, length) \
   static const int BC_##name = code;
index 194340b..13efb3f 100644 (file)
@@ -199,12 +199,11 @@ DEFINE_bool(usage_computation, true, "compute variable usage counts")
 DEFINE_bool(preemption, false,
             "activate a 100ms timer that switches between V8 threads")
 
-// irregexp
+// Irregexp
 DEFINE_bool(irregexp, false, "new regular expression code")
 DEFINE_bool(trace_regexps, false, "trace Irregexp execution")
 DEFINE_bool(irregexp_native, false, "use native code Irregexp implementation (IA32 only)")
 DEFINE_bool(disable_jscre, false, "abort if JSCRE is used.  Only useful with --irregexp")
-DEFINE_bool(attempt_multiline_irregexp, false, "attempt to use Irregexp for multiline regexps")
 
 // Testing flags test/cctest/test-{flags,api,serialization}.cc
 DEFINE_bool(testing_bool_flag, true, "testing_bool_flag")
index 8ffc040..0ce0d33 100644 (file)
@@ -490,6 +490,13 @@ static bool RawMatch(const byte* code_base,
         }
         break;
       }
+      BYTECODE(CHECK_AT_START)
+        if (current == 0) {
+          pc = code_base + Load32(pc + 1);
+        } else {
+          pc += BC_CHECK_AT_START_LENGTH;
+        }
+        break;
       BYTECODE(CHECK_NOT_AT_START)
         if (current == 0) {
           pc += BC_CHECK_NOT_AT_START_LENGTH;
index aa11a69..e2d1d02 100644 (file)
@@ -1522,18 +1522,6 @@ bool Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
 }
 
 
-void EndNode::EmitInfoChecks(RegExpMacroAssembler* assembler, Trace* trace) {
-  if (info()->at_end) {
-    Label succeed;
-    // LoadCurrentCharacter will go to the label if we are at the end of the
-    // input string.
-    assembler->LoadCurrentCharacter(0, &succeed);
-    assembler->GoTo(trace->backtrack());
-    assembler->Bind(&succeed);
-  }
-}
-
-
 bool NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
   if (!trace->is_trivial()) {
     return trace->Flush(compiler, this);
@@ -1542,7 +1530,6 @@ bool NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
   if (!label()->is_bound()) {
     assembler->Bind(label());
   }
-  EmitInfoChecks(assembler, trace);
   assembler->ReadCurrentPositionFromRegister(current_position_register_);
   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
   // Now that we have unwound the stack we find at the top of the stack the
@@ -1562,11 +1549,9 @@ bool EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   }
   switch (action_) {
     case ACCEPT:
-      EmitInfoChecks(assembler, trace);
       assembler->Succeed();
       return true;
     case BACKTRACK:
-      ASSERT(!info()->at_end);
       assembler->GoTo(trace->backtrack());
       return true;
     case NEGATIVE_SUBMATCH_SUCCESS:
@@ -1935,13 +1920,6 @@ RegExpNode::~RegExpNode() {
 
 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
                                                   Trace* trace) {
-  // TODO(erikcorry): Implement support.
-  if (info_.follows_word_interest ||
-      info_.follows_newline_interest ||
-      info_.follows_start_interest) {
-    return FAIL;
-  }
-
   // If we are generating a greedy loop then don't stop and don't reuse code.
   if (trace->stop_node() != NULL) {
     return CONTINUE;
@@ -1990,6 +1968,19 @@ int ActionNode::EatsAtLeast(int recursion_depth) {
 }
 
 
+int AssertionNode::EatsAtLeast(int recursion_depth) {
+  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
+  return on_success()->EatsAtLeast(recursion_depth + 1);
+}
+
+
+int BackReferenceNode::EatsAtLeast(int recursion_depth) {
+  if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
+  return on_success()->EatsAtLeast(recursion_depth + 1);
+}
+
+
+
 int TextNode::EatsAtLeast(int recursion_depth) {
   int answer = Length();
   if (answer >= 4) return answer;
@@ -2257,7 +2248,7 @@ void QuickCheckDetails::Clear() {
 
 
 void QuickCheckDetails::Advance(int by, bool ascii) {
-  ASSERT(by > 0);
+  ASSERT(by >= 0);
   if (by >= characters_) {
     Clear();
     return;
@@ -2342,6 +2333,148 @@ void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
 }
 
 
+// Check for [0-9A-Z_a-z].
+static void EmitWordCheck(RegExpMacroAssembler* assembler,
+                          Label* word,
+                          Label* non_word,
+                          bool fall_through_on_word) {
+  assembler->CheckCharacterGT('z', non_word);
+  assembler->CheckCharacterLT('0', non_word);
+  assembler->CheckCharacterGT('a' - 1, word);
+  assembler->CheckCharacterLT('9' + 1, word);
+  assembler->CheckCharacterLT('A', non_word);
+  assembler->CheckCharacterLT('Z' + 1, word);
+  if (fall_through_on_word) {
+    assembler->CheckNotCharacter('_', non_word);
+  } else {
+    assembler->CheckCharacter('_', word);
+  }
+}
+
+
+// Emit the code to check for a ^ in multiline mode (1-character lookbehind
+// that matches newline or the start of input).
+static bool EmitHat(RegExpCompiler* compiler,
+                    RegExpNode* on_success,
+                    Trace* trace) {
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+  // We will be loading the previous character into the current character
+  // register.
+  Trace new_trace(*trace);
+  new_trace.InvalidateCurrentCharacter();
+
+  Label ok;
+  if (new_trace.cp_offset() == 0) {
+    // The start of input counts as a newline in this context, so skip to
+    // ok if we are at the start.
+    assembler->CheckAtStart(&ok);
+  }
+  // We already checked that we are not at the start of input so it must be
+  // OK to load the previous character.
+  assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
+                                  new_trace.backtrack(),
+                                  false);
+  // Newline means \n, \r, 0x2028 or 0x2029.
+  if (!compiler->ascii()) {
+    assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
+  }
+  assembler->CheckCharacter('\n', &ok);
+  assembler->CheckNotCharacter('\r', new_trace.backtrack());
+  assembler->Bind(&ok);
+  return on_success->Emit(compiler, &new_trace);
+}
+
+
+// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
+static bool EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
+                              RegExpCompiler* compiler,
+                              RegExpNode* on_success,
+                              Trace* trace) {
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+  Label before_non_word;
+  Label before_word;
+  if (trace->characters_preloaded() != 1) {
+    assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
+  }
+  // Fall through on non-word.
+  EmitWordCheck(assembler, &before_word, &before_non_word, false);
+
+  // We will be loading the previous character into the current character
+  // register.
+  Trace new_trace(*trace);
+  new_trace.InvalidateCurrentCharacter();
+
+  Label ok;
+  Label* boundary;
+  Label* not_boundary;
+  if (type == AssertionNode::AT_BOUNDARY) {
+    boundary = &ok;
+    not_boundary = new_trace.backtrack();
+  } else {
+    not_boundary = &ok;
+    boundary = new_trace.backtrack();
+  }
+
+  // Next character is not a word character.
+  assembler->Bind(&before_non_word);
+  if (new_trace.cp_offset() == 0) {
+    // The start of input counts as a non-word character, so the question is
+    // decided if we are at the start.
+    assembler->CheckAtStart(not_boundary);
+  }
+  // We already checked that we are not at the start of input so it must be
+  // OK to load the previous character.
+  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
+                                  &ok,  // Unused dummy label in this call.
+                                  false);
+  // Fall through on non-word.
+  EmitWordCheck(assembler, boundary, not_boundary, false);
+  assembler->GoTo(not_boundary);
+
+  // Next character is a word character.
+  assembler->Bind(&before_word);
+  if (new_trace.cp_offset() == 0) {
+    // The start of input counts as a non-word character, so the question is
+    // decided if we are at the start.
+    assembler->CheckAtStart(boundary);
+  }
+  // We already checked that we are not at the start of input so it must be
+  // OK to load the previous character.
+  assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
+                                  &ok,  // Unused dummy label in this call.
+                                  false);
+  bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
+  EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
+
+  assembler->Bind(&ok);
+
+  return on_success->Emit(compiler, &new_trace);
+}
+
+
+bool AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
+  RegExpMacroAssembler* assembler = compiler->macro_assembler();
+  switch (type_) {
+    case AT_END: {
+      Label ok;
+      assembler->LoadCurrentCharacter(trace->cp_offset(), &ok);
+      assembler->GoTo(trace->backtrack());
+      assembler->Bind(&ok);
+      break;
+    }
+    case AT_START:
+      assembler->CheckNotAtStart(trace->backtrack());
+      break;
+    case AFTER_NEWLINE:
+      return EmitHat(compiler, on_success(), trace);
+    case AT_NON_BOUNDARY:
+    case AT_BOUNDARY:
+      return EmitBoundaryCheck(type_, compiler, on_success(), trace);
+  }
+  return on_success()->Emit(compiler, trace);
+}
+
+
 // We call this repeatedly to generate code for each pass over the text node.
 // The passes are in increasing order of difficulty because we hope one
 // of the first passes will fail in which case we are saved the work of the
@@ -2487,17 +2620,6 @@ bool TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   if (limit_result == DONE) return true;
   ASSERT(limit_result == CONTINUE);
 
-  if (info()->follows_word_interest ||
-      info()->follows_newline_interest ||
-      info()->follows_start_interest) {
-    return false;
-  }
-
-  if (info()->at_end) {
-    compiler->macro_assembler()->GoTo(trace->backtrack());
-    return true;
-  }
-
   if (compiler->ascii()) {
     int dummy = 0;
     TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
@@ -2561,6 +2683,11 @@ bool TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
 }
 
 
+void Trace::InvalidateCurrentCharacter() {
+  characters_preloaded_ = 0;
+}
+
+
 void Trace::AdvanceCurrentPositionInTrace(int by, bool ascii) {
   ASSERT(by > 0);
   // We don't have an instruction for shifting the current character register
@@ -2616,12 +2743,6 @@ int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
       return kNodeIsTooComplexForGreedyLoops;
     }
-    NodeInfo* info = node->info();
-    if (info->follows_word_interest ||
-        info->follows_newline_interest ||
-        info->follows_start_interest) {
-      return kNodeIsTooComplexForGreedyLoops;
-    }
     int node_length = node->GreedyLoopTextLength();
     if (node_length == kNodeIsTooComplexForGreedyLoops) {
       return kNodeIsTooComplexForGreedyLoops;
@@ -3096,20 +3217,6 @@ bool ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
     }
     case POSITIVE_SUBMATCH_SUCCESS:
       if (!trace->is_trivial()) return trace->Flush(compiler, this);
-      // TODO(erikcorry): Implement support.
-      if (info()->follows_word_interest ||
-          info()->follows_newline_interest ||
-          info()->follows_start_interest) {
-        return false;
-      }
-      if (info()->at_end) {
-        Label at_end;
-        // Load current character jumps to the label if we are beyond the string
-        // end.
-        assembler->LoadCurrentCharacter(0, &at_end);
-        assembler->GoTo(trace->backtrack());
-        assembler->Bind(&at_end);
-      }
       assembler->ReadCurrentPositionFromRegister(
           data_.u_submatch.current_position_register);
       assembler->ReadStackPointerFromRegister(
@@ -3136,19 +3243,11 @@ bool BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
   RecursionCheck rc(compiler);
 
   ASSERT_EQ(start_reg_ + 1, end_reg_);
-  if (info()->at_end) {
-    // If we are constrained to match at the end of the input then succeed
-    // iff the back reference is empty.
-    assembler->CheckNotRegistersEqual(start_reg_,
-                                      end_reg_,
-                                      trace->backtrack());
+  if (compiler->ignore_case()) {
+    assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
+                                               trace->backtrack());
   } else {
-    if (compiler->ignore_case()) {
-      assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
-                                                 trace->backtrack());
-    } else {
-      assembler->CheckNotBackReference(start_reg_, trace->backtrack());
-    }
+    assembler->CheckNotBackReference(start_reg_, trace->backtrack());
   }
   return on_success()->Emit(compiler, trace);
 }
@@ -3389,6 +3488,33 @@ void DotPrinter::VisitEnd(EndNode* that) {
 }
 
 
+void DotPrinter::VisitAssertion(AssertionNode* that) {
+  stream()->Add("  n%p [", that);
+  switch (that->type()) {
+    case AssertionNode::AT_END:
+      stream()->Add("label=\"$\", shape=septagon");
+      break;
+    case AssertionNode::AT_START:
+      stream()->Add("label=\"^\", shape=septagon");
+      break;
+    case AssertionNode::AT_BOUNDARY:
+      stream()->Add("label=\"\\b\", shape=septagon");
+      break;
+    case AssertionNode::AT_NON_BOUNDARY:
+      stream()->Add("label=\"\\B\", shape=septagon");
+      break;
+    case AssertionNode::AFTER_NEWLINE:
+      stream()->Add("label=\"(?<=\\n)\", shape=septagon");
+      break;
+  }
+  stream()->Add("];\n");
+  PrintAttributes(that);
+  RegExpNode* successor = that->on_success();
+  stream()->Add("  n%p -> n%p;\n", that, successor);
+  Visit(successor);
+}
+
+
 void DotPrinter::VisitAction(ActionNode* that) {
   stream()->Add("  n%p [", that);
   switch (that->type_) {
@@ -3749,22 +3875,49 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
   NodeInfo info;
   switch (type()) {
     case START_OF_LINE:
-      info.follows_newline_interest = true;
-      break;
+      return AssertionNode::AfterNewline(on_success);
     case START_OF_INPUT:
-      info.follows_start_interest = true;
-      break;
-    case BOUNDARY: case NON_BOUNDARY:
-      info.follows_word_interest = true;
-      break;
+      return AssertionNode::AtStart(on_success);
+    case BOUNDARY:
+      return AssertionNode::AtBoundary(on_success);
+    case NON_BOUNDARY:
+      return AssertionNode::AtNonBoundary(on_success);
     case END_OF_INPUT:
-      info.at_end = true;
-      break;
-    case END_OF_LINE:
-      // This is wrong but has the effect of making the compiler abort.
-      info.at_end = true;
+      return AssertionNode::AtEnd(on_success);
+    case END_OF_LINE: {
+      // Compile $ in multiline regexps as an alternation with a positive
+      // lookahead in one side and an end-of-input on the other side.
+      // We need two registers for the lookahead.
+      int stack_pointer_register = compiler->AllocateRegister();
+      int position_register = compiler->AllocateRegister();
+      // The ChoiceNode to distinguish between a newline and end-of-input.
+      ChoiceNode* result = new ChoiceNode(2);
+      // Create a newline atom.
+      ZoneList<CharacterRange>* newline_ranges =
+          new ZoneList<CharacterRange>(3);
+      CharacterRange::AddClassEscape('n', newline_ranges);
+      RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
+      TextNode* newline_matcher = new TextNode(
+         newline_atom,
+         ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
+                                             position_register,
+                                             on_success));
+      // Create an end-of-line matcher (a positive lookahead for a newline).
+      RegExpNode* end_of_line = ActionNode::BeginSubmatch(
+          stack_pointer_register,
+          position_register,
+          newline_matcher);
+      // Add the two alternatives to the ChoiceNode.
+      GuardedAlternative eol_alternative(end_of_line);
+      result->AddAlternative(eol_alternative);
+      GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
+      result->AddAlternative(end_alternative);
+      return result;
+    }
+    default:
+      UNREACHABLE();
   }
-  return on_success->PropagateForward(&info);
+  return on_success;
 }
 
 
@@ -3911,6 +4064,13 @@ void CharacterRange::AddClassEscape(uc16 type,
     case '*':
       ranges->Add(CharacterRange::Everything());
       break;
+    // This is the set of characters matched by the $ and ^ symbols
+    // in multiline mode.
+    case 'n':
+      AddClass(kLineTerminatorRanges,
+               kLineTerminatorRangeCount,
+               ranges);
+      break;
     default:
       UNREACHABLE();
   }
@@ -4096,62 +4256,6 @@ static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
 }
 
 
-RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {
-  NodeInfo full_info(*this->info());
-  full_info.AddFromPreceding(info);
-  bool cloned = false;
-  ActionNode* action = EnsureSibling(this, &full_info, &cloned);
-  action->set_on_success(action->on_success()->PropagateForward(info));
-  return action;
-}
-
-
-RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) {
-  NodeInfo full_info(*this->info());
-  full_info.AddFromPreceding(info);
-  bool cloned = false;
-  ChoiceNode* choice = EnsureSibling(this, &full_info, &cloned);
-  if (cloned) {
-    ZoneList<GuardedAlternative>* old_alternatives = alternatives();
-    int count = old_alternatives->length();
-    choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
-    for (int i = 0; i < count; i++) {
-      GuardedAlternative alternative = old_alternatives->at(i);
-      alternative.set_node(alternative.node()->PropagateForward(info));
-      choice->alternatives()->Add(alternative);
-    }
-  }
-  return choice;
-}
-
-
-RegExpNode* EndNode::PropagateForward(NodeInfo* info) {
-  return PropagateToEndpoint(this, info);
-}
-
-
-RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) {
-  NodeInfo full_info(*this->info());
-  full_info.AddFromPreceding(info);
-  bool cloned = false;
-  BackReferenceNode* back_ref = EnsureSibling(this, &full_info, &cloned);
-  if (cloned) {
-    // TODO(erikcorry): A back reference has to have two successors (by default
-    // the same node).  The first is used if the back reference matches a non-
-    // empty back reference, the second if it matches an empty one.  This
-    // doesn't matter for at_end, which is the only one implemented right now,
-    // but it will matter for other pieces of info.
-    back_ref->set_on_success(back_ref->on_success()->PropagateForward(info));
-  }
-  return back_ref;
-}
-
-
-RegExpNode* TextNode::PropagateForward(NodeInfo* info) {
-  return PropagateToEndpoint(this, info);
-}
-
-
 // -------------------------------------------------------------------
 // Splay tree
 
@@ -4389,6 +4493,11 @@ void Analysis::VisitBackReference(BackReferenceNode* that) {
 }
 
 
+void Analysis::VisitAssertion(AssertionNode* that) {
+  EnsureAnalyzed(that->on_success());
+}
+
+
 // -------------------------------------------------------------------
 // Dispatch table construction
 
@@ -4441,6 +4550,12 @@ void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
 }
 
 
+void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
+  RegExpNode* target = that->on_success();
+  target->Accept(this);
+}
+
+
 
 static int CompareRangeByFrom(const CharacterRange* a,
                               const CharacterRange* b) {
@@ -4527,10 +4642,6 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
 
   NodeInfo info = *node->info();
 
-  if (is_multiline && !FLAG_attempt_multiline_irregexp) {
-    return Handle<FixedArray>::null();
-  }
-
   if (FLAG_irregexp_native) {
 #ifdef ARM
     // Unimplemented, fall-through to bytecode implementation.
index bf3bdb7..1693713 100644 (file)
@@ -410,6 +410,7 @@ class DispatchTable : public ZoneObject {
   VISIT(Action)                                                      \
   VISIT(Choice)                                                      \
   VISIT(BackReference)                                               \
+  VISIT(Assertion)                                                   \
   VISIT(Text)
 
 
@@ -619,12 +620,6 @@ class RegExpNode: public ZoneObject {
   // the deferred actions in the current trace and generating a goto.
   static const int kMaxCopiesCodeGenerated = 10;
 
-  // Propagates the given interest information forward.  When seeing
-  // \bfoo for instance, the \b is implemented by propagating forward
-  // to the 'foo' string that it should only succeed if its first
-  // character is a letter xor the previous character was a letter.
-  virtual RegExpNode* PropagateForward(NodeInfo* info) = 0;
-
   NodeInfo* info() { return &info_; }
 
   void AddSibling(RegExpNode* node) { siblings_.Add(node); }
@@ -744,7 +739,6 @@ class ActionNode: public SeqRegExpNode {
                                     int filled_in) {
     return on_success()->GetQuickCheckDetails(details, compiler, filled_in);
   }
-  virtual RegExpNode* PropagateForward(NodeInfo* info);
   Type type() { return type_; }
   // TODO(erikcorry): We should allow some action nodes in greedy loops.
   virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
@@ -797,7 +791,6 @@ class TextNode: public SeqRegExpNode {
     elms_->Add(TextElement::CharClass(that));
   }
   virtual void Accept(NodeVisitor* visitor);
-  virtual RegExpNode* PropagateForward(NodeInfo* info);
   virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
   virtual int EatsAtLeast(int recursion_depth);
   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
@@ -831,6 +824,47 @@ class TextNode: public SeqRegExpNode {
 };
 
 
+class AssertionNode: public SeqRegExpNode {
+ public:
+  enum AssertionNodeType {
+    AT_END,
+    AT_START,
+    AT_BOUNDARY,
+    AT_NON_BOUNDARY,
+    AFTER_NEWLINE
+  };
+  static AssertionNode* AtEnd(RegExpNode* on_success) {
+    return new AssertionNode(AT_END, on_success);
+  }
+  static AssertionNode* AtStart(RegExpNode* on_success) {
+    return new AssertionNode(AT_START, on_success);
+  }
+  static AssertionNode* AtBoundary(RegExpNode* on_success) {
+    return new AssertionNode(AT_BOUNDARY, on_success);
+  }
+  static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
+    return new AssertionNode(AT_NON_BOUNDARY, on_success);
+  }
+  static AssertionNode* AfterNewline(RegExpNode* on_success) {
+    return new AssertionNode(AFTER_NEWLINE, on_success);
+  }
+  virtual void Accept(NodeVisitor* visitor);
+  virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
+  virtual int EatsAtLeast(int recursion_depth);
+  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
+                                    RegExpCompiler* compiler,
+                                    int filled_in) {
+    return on_success()->GetQuickCheckDetails(details, compiler, filled_in);
+  }
+  virtual AssertionNode* Clone() { return new AssertionNode(*this); }
+  AssertionNodeType type() { return type_; }
+ private:
+  AssertionNode(AssertionNodeType t, RegExpNode* on_success)
+      : SeqRegExpNode(on_success), type_(t) { }
+  AssertionNodeType type_;
+};
+
+
 class BackReferenceNode: public SeqRegExpNode {
  public:
   BackReferenceNode(int start_reg,
@@ -843,13 +877,12 @@ class BackReferenceNode: public SeqRegExpNode {
   int start_register() { return start_reg_; }
   int end_register() { return end_reg_; }
   virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
-  virtual int EatsAtLeast(int recursion_depth) { return 0; }
+  virtual int EatsAtLeast(int recursion_depth);
   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                     RegExpCompiler* compiler,
                                     int characters_filled_in) {
     return;
   }
-  virtual RegExpNode* PropagateForward(NodeInfo* info);
   virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
 
  private:
@@ -871,12 +904,8 @@ class EndNode: public RegExpNode {
     // Returning 0 from EatsAtLeast should ensure we never get here.
     UNREACHABLE();
   }
-  virtual RegExpNode* PropagateForward(NodeInfo* info);
   virtual EndNode* Clone() { return new EndNode(*this); }
 
- protected:
-  void EmitInfoChecks(RegExpMacroAssembler* macro, Trace* trace);
-
  private:
   Action action_;
 };
@@ -947,7 +976,6 @@ class ChoiceNode: public RegExpNode {
   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
                                     RegExpCompiler* compiler,
                                     int characters_filled_in);
-  virtual RegExpNode* PropagateForward(NodeInfo* info);
   virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }
 
   bool being_calculated() { return being_calculated_; }
@@ -1133,8 +1161,7 @@ class Trace {
   void set_quick_check_performed(QuickCheckDetails* d) {
     quick_check_performed_ = *d;
   }
-  void clear_quick_check_performed() {
-  }
+  void InvalidateCurrentCharacter();
   void AdvanceCurrentPositionInTrace(int by, bool ascii);
  private:
   int FindAffectedRegisters(OutSet* affected_registers);
index 824a297..6f11b86 100644 (file)
@@ -174,6 +174,20 @@ void RegExpMacroAssemblerIA32::CheckCharacterGT(uc16 limit, Label* on_greater) {
 }
 
 
+void RegExpMacroAssemblerIA32::CheckAtStart(Label* on_at_start) {
+  Label ok;
+  // Did we start the match at the start of the string at all?
+  __ cmp(Operand(ebp, kAtStart), Immediate(0));
+  BranchOrBacktrack(equal, &ok);
+  // If we did, are we still at the start of the input?
+  __ mov(eax, Operand(ebp, kInputEndOffset));
+  __ add(eax, Operand(edi));
+  __ cmp(eax, Operand(ebp, kInputStartOffset));
+  BranchOrBacktrack(equal, on_at_start);
+  __ bind(&ok);
+}
+
+
 void RegExpMacroAssemblerIA32::CheckNotAtStart(Label* on_not_at_start) {
   // Did we start the match at the start of the string at all?
   __ cmp(Operand(ebp, kAtStart), Immediate(0));
index 8d28beb..b69cf82 100644 (file)
@@ -43,6 +43,7 @@ class RegExpMacroAssemblerIA32: public RegExpMacroAssembler {
   virtual void AdvanceRegister(int reg, int by);
   virtual void Backtrack();
   virtual void Bind(Label* label);
+  virtual void CheckAtStart(Label* on_at_start);
   virtual void CheckBitmap(uc16 start, Label* bitmap, Label* on_zero);
   virtual void CheckCharacter(uint32_t c, Label* on_equal);
   virtual void CheckCharacterAfterAnd(uint32_t c,
index a36f0a7..deb5ada 100644 (file)
@@ -256,6 +256,12 @@ void RegExpMacroAssemblerIrregexp::CheckCharacter(uint32_t c, Label* on_equal) {
 }
 
 
+void RegExpMacroAssemblerIrregexp::CheckAtStart(Label* on_at_start) {
+  Emit(BC_CHECK_AT_START);
+  EmitOrLink(on_at_start);
+}
+
+
 void RegExpMacroAssemblerIrregexp::CheckNotAtStart(Label* on_not_at_start) {
   Emit(BC_CHECK_NOT_AT_START);
   EmitOrLink(on_not_at_start);
index 95b903c..96744c3 100644 (file)
@@ -81,6 +81,7 @@ class RegExpMacroAssemblerIrregexp: public RegExpMacroAssembler {
   virtual void CheckCharacterGT(uc16 limit, Label* on_greater);
   virtual void CheckCharacterLT(uc16 limit, Label* on_less);
   virtual void CheckGreedyLoop(Label* on_tos_equals_current_position);
+  virtual void CheckAtStart(Label* on_at_start);
   virtual void CheckNotAtStart(Label* on_not_at_start);
   virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
   virtual void CheckNotCharacterAfterAnd(uint32_t c,
index 5caf032..541f909 100644 (file)
@@ -210,6 +210,12 @@ void RegExpMacroAssemblerTracer::CheckCharacter(uint32_t c, Label* on_equal) {
 }
 
 
+void RegExpMacroAssemblerTracer::CheckAtStart(Label* on_at_start) {
+  PrintF(" CheckAtStart(label[%08x]);\n", on_at_start);
+  assembler_->CheckAtStart(on_at_start);
+}
+
+
 void RegExpMacroAssemblerTracer::CheckNotAtStart(Label* on_not_at_start) {
   PrintF(" CheckNotAtStart(label[%08x]);\n", on_not_at_start);
   assembler_->CheckNotAtStart(on_not_at_start);
index c8088a9..49576e0 100644 (file)
@@ -41,6 +41,7 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
   virtual void AdvanceRegister(int reg, int by);  // r[reg] += by.
   virtual void Backtrack();
   virtual void Bind(Label* label);
+  virtual void CheckAtStart(Label* on_at_start);
   virtual void CheckBitmap(uc16 start, Label* bitmap, Label* on_zero);
   virtual void CheckCharacter(uint32_t c, Label* on_equal);
   virtual void CheckCharacterAfterAnd(uint32_t c,
index e9f7731..74133d5 100644 (file)
@@ -61,6 +61,7 @@ class RegExpMacroAssembler {
   // stack by an earlier PushBacktrack(Label*).
   virtual void Backtrack() = 0;
   virtual void Bind(Label* label) = 0;
+  virtual void CheckAtStart(Label* on_at_start) = 0;
   // Check the current character against a bitmap.  The range of the current
   // character must be from start to start + length_of_bitmap_in_bits.
   virtual void CheckBitmap(
index 614d541..c90c0b4 100644 (file)
@@ -572,6 +572,9 @@ function splitMatch(separator, subject, current_index, start_index) {
     if (ovector == null) return null;
     var nof_results = ovector.length >> 1;
     var result = new $Array(nof_results + 1);
+    // Section 15.5.4.14 paragraph two says that we do not allow zero length
+    // matches at the end of the string.
+    if (ovector[0] === subject.length) return null;
     result[0] = ovector[1];
     result[1] = subject.slice(current_index, ovector[0]);
     for (var i = 1; i < nof_results; i++) {
index 2e5a823..97182f3 100644 (file)
@@ -240,11 +240,8 @@ ecma_3/RegExp/regress-119909: PASS || FAIL_OK
 # 'minimum repeat count' is reached, the empty string must not match.
 # In this case, we are similar but not identical to JSC.  Hard to
 # support the JS behavior with PCRE, so maybe emulate JSC?
-#
-# Note: We do not support toSource currently so we cannot run this
-# test. We should make an isolated test case for the regexp issue.
-ecma_3/RegExp/regress-209919: FAIL_OK
-js1_5/extensions/regress-459606: FAIL_OK
+ecma_3/RegExp/regress-209919: PASS || FAIL_OK
+js1_5/extensions/regress-459606: PASS || FAIL_OK
 
 
 # PCRE's match limit is reached.  SpiderMonkey hangs on the first one,
@@ -265,11 +262,6 @@ ecma_3/RegExp/regress-307456: PASS || FAIL_OK
 js1_5/Regress/regress-230216-2: FAIL_OK
 
 
-# According to ECMA-262, \b is a 'word' boundary, where words are only
-# ASCII characters.  PCRE supports non-ASCII word characters.
-js1_5/Regress/regress-247179: FAIL_OK
-
-
 # Regexp too long for PCRE.
 js1_5/Regress/regress-280769: PASS || FAIL
 js1_5/Regress/regress-280769-1: PASS || FAIL
@@ -471,7 +463,7 @@ ecma_3/Unicode/uc-001: FAIL_OK
 # A non-breaking space doesn't match \s in a regular expression.  This behaviour
 # matches JSC.  All the VMs have different behaviours in which characters match
 # \s so we do the same as JSC until they change.
-ecma_3/Unicode/uc-002: FAIL_OK
+ecma_3/Unicode/uc-002: PASS || FAIL_OK
 
 
 # String.prototype.split on empty strings always returns an array
@@ -521,10 +513,12 @@ js1_5/Regress/regress-336100: FAIL_OK
 
 # Regular expression test failures due to PCRE. We match JSC (ie, perl)
 # behavior and not the ECMA spec.
-ecma_3/RegExp/15.10.2-1: FAIL_OK
-ecma_3/RegExp/perlstress-001: FAIL_OK
+ecma_3/RegExp/perlstress-001: PASS || FAIL_OK
 ecma_3/RegExp/regress-334158: PASS || FAIL
 
+# This test fails due to http://code.google.com/p/v8/issues/detail?id=187
+# Failure to clear captures when a lookahead is unwound.
+ecma_3/RegExp/15.10.2-1: PASS || FAIL_OK
 
 # This test requires a failure if we try to compile a function with more
 # than 65536 arguments.  This seems to be a Mozilla restriction.