- Added lookbehind propagation for the initial node; now, if the

author christian.plesner.hansen@gmail.com <christian.plesner.hansen@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Thu, 11 Dec 2008 11:13:13 +0000 (11:13 +0000)

committer christian.plesner.hansen@gmail.com <christian.plesner.hansen@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>

Thu, 11 Dec 2008 11:13:13 +0000 (11:13 +0000)
author christian.plesner.hansen@gmail.com <christian.plesner.hansen@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Thu, 11 Dec 2008 11:13:13 +0000 (11:13 +0000)
committer christian.plesner.hansen@gmail.com <christian.plesner.hansen@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
Thu, 11 Dec 2008 11:13:13 +0000 (11:13 +0000)
diff --git a/src/globals.h b/src/globals.h

index e2fb2a9f226439d88f2b96144a8d17a6de12fe66..e174bf303e84400716201eacb6402511f884bc6e 100644 (file)
--- a/src/globals.h
+++ b/src/globals.h
@@ -184,7 +184,7 @@ class OldSpace;
  class Property;
  class Proxy;
  class RegExpNode;
-struct RegExpParseResult;
+struct RegExpCompileData;
  class RegExpTree;
  class RegExpCompiler;
  class RegExpVisitor;
diff --git a/src/jsregexp.cc b/src/jsregexp.cc

index d44b5afceaa9ae46066f7da216b5be3dd526d3a2..5f4f6bda51f2d1870156edb056499cf2f0ac9755 100644 (file)
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -260,7 +260,7 @@ Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
    } else {
      FlattenString(pattern);
      ZoneScope zone_scope(DELETE_ON_EXIT);
-    RegExpParseResult parse_result;
+    RegExpCompileData parse_result;
      FlatStringReader reader(pattern);
      if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
        // Throw an exception if we fail to parse the pattern.
@@ -706,20 +706,19 @@ static Handle<FixedArray> GetCompiledIrregexp(Handle<JSRegExp> re,
      pattern->Flatten(shape);
    }
  
-  RegExpParseResult parse_result;
+  RegExpCompileData compile_data;
    FlatStringReader reader(pattern);
-  if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
+  if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
      // Throw an exception if we fail to parse the pattern.
      // THIS SHOULD NOT HAPPEN. We already parsed it successfully once.
      ThrowRegExpException(re,
                           pattern,
-                         parse_result.error,
+                         compile_data.error,
                           "malformed_regexp");
      return Handle<FixedArray>::null();
    }
    Handle<FixedArray> compiled_entry =
-      RegExpEngine::Compile(&parse_result,
-                            NULL,
+      RegExpEngine::Compile(&compile_data,
                              flags.is_ignore_case(),
                              flags.is_multiline(),
                              pattern,
@@ -2603,9 +2602,7 @@ RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
  
  RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
                                           RegExpNode* on_success) {
-  ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
-  elms->Add(TextElement::CharClass(this));
-  return new TextNode(elms, on_success);
+  return new TextNode(this, on_success);
  }
  
  
@@ -3265,7 +3262,7 @@ OutSet* DispatchTable::Get(uc16 value) {
  // Analysis
  
  
-void Analysis::EnsureAnalyzed(RegExpNode* that) {
+void AssertionPropagation::EnsureAnalyzed(RegExpNode* that) {
    if (that->info()->been_analyzed || that->info()->being_analyzed)
      return;
    that->info()->being_analyzed = true;
@@ -3275,7 +3272,7 @@ void Analysis::EnsureAnalyzed(RegExpNode* that) {
  }
  
  
-void Analysis::VisitEnd(EndNode* that) {
+void AssertionPropagation::VisitEnd(EndNode* that) {
    // nothing to do
  }
  
@@ -3298,7 +3295,7 @@ void TextNode::CalculateOffsets() {
  }
  
  
-void Analysis::VisitText(TextNode* that) {
+void AssertionPropagation::VisitText(TextNode* that) {
    if (ignore_case_) {
      that->MakeCaseIndependent();
    }
@@ -3314,7 +3311,7 @@ void Analysis::VisitText(TextNode* that) {
  }
  
  
-void Analysis::VisitAction(ActionNode* that) {
+void AssertionPropagation::VisitAction(ActionNode* that) {
    RegExpNode* target = that->on_success();
    EnsureAnalyzed(target);
    // If the next node is interested in what it follows then this node
@@ -3323,7 +3320,7 @@ void Analysis::VisitAction(ActionNode* that) {
  }
  
  
-void Analysis::VisitChoice(ChoiceNode* that) {
+void AssertionPropagation::VisitChoice(ChoiceNode* that) {
    NodeInfo* info = that->info();
    for (int i = 0; i < that->alternatives()->length(); i++) {
      RegExpNode* node = that->alternatives()->at(i).node();
@@ -3335,7 +3332,7 @@ void Analysis::VisitChoice(ChoiceNode* that) {
  }
  
  
-void Analysis::VisitBackReference(BackReferenceNode* that) {
+void AssertionPropagation::VisitBackReference(BackReferenceNode* that) {
    EnsureAnalyzed(that->on_success());
  }
  
@@ -3650,15 +3647,118 @@ void DispatchTableConstructor::VisitAction(ActionNode* that) {
  }
  
  
-Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
-                                         RegExpNode** node_return,
+#ifdef DEBUG
+
+
+class VisitNodeScope {
+ public:
+  explicit VisitNodeScope(RegExpNode* node) : node_(node) {
+    ASSERT(!node->info()->visited);
+    node->info()->visited = true;
+  }
+  ~VisitNodeScope() {
+    node_->info()->visited = false;
+  }
+ private:
+  RegExpNode* node_;
+};
+
+
+class NodeValidator : public NodeVisitor {
+ public:
+  virtual void ValidateInfo(NodeInfo* info) = 0;
+#define DECLARE_VISIT(Type)                                          \
+  virtual void Visit##Type(Type##Node* that);
+FOR_EACH_NODE_TYPE(DECLARE_VISIT)
+#undef DECLARE_VISIT
+};
+
+
+class PostAnalysisNodeValidator : public NodeValidator {
+public:
+  virtual void ValidateInfo(NodeInfo* info);
+};
+
+
+class PostExpansionNodeValidator : public NodeValidator {
+public:
+  virtual void ValidateInfo(NodeInfo* info);
+};
+
+
+void PostAnalysisNodeValidator::ValidateInfo(NodeInfo* info) {
+  ASSERT(info->been_analyzed);
+}
+
+
+void PostExpansionNodeValidator::ValidateInfo(NodeInfo* info) {
+  ASSERT_EQ(info->determine_newline, info->does_determine_newline);
+  ASSERT_EQ(info->determine_start, info->does_determine_start);
+  ASSERT_EQ(info->determine_word, info->does_determine_word);
+  ASSERT_EQ(info->follows_word_interest,
+            (info->follows_word != NodeInfo::UNKNOWN));
+  if (false) {
+    // These are still unimplemented.
+    ASSERT_EQ(info->follows_start_interest,
+              (info->follows_start != NodeInfo::UNKNOWN));
+    ASSERT_EQ(info->follows_newline_interest,
+              (info->follows_newline != NodeInfo::UNKNOWN));
+  }
+}
+
+
+void NodeValidator::VisitAction(ActionNode* that) {
+  if (that->info()->visited) return;
+  VisitNodeScope scope(that);
+  ValidateInfo(that->info());
+  that->on_success()->Accept(this);
+}
+
+
+void NodeValidator::VisitBackReference(BackReferenceNode* that) {
+  if (that->info()->visited) return;
+  VisitNodeScope scope(that);
+  ValidateInfo(that->info());
+  that->on_success()->Accept(this);
+}
+
+
+void NodeValidator::VisitChoice(ChoiceNode* that) {
+  if (that->info()->visited) return;
+  VisitNodeScope scope(that);
+  ValidateInfo(that->info());
+  ZoneList<GuardedAlternative>* alts = that->alternatives();
+  for (int i = 0; i < alts->length(); i++)
+    alts->at(i).node()->Accept(this);
+}
+
+
+void NodeValidator::VisitEnd(EndNode* that) {
+  if (that->info()->visited) return;
+  VisitNodeScope scope(that);
+  ValidateInfo(that->info());
+}
+
+
+void NodeValidator::VisitText(TextNode* that) {
+  if (that->info()->visited) return;
+  VisitNodeScope scope(that);
+  ValidateInfo(that->info());
+  that->on_success()->Accept(this);
+}
+
+
+#endif
+
+
+Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
                                           bool ignore_case,
                                           bool is_multiline,
                                           Handle<String> pattern,
                                           bool is_ascii) {
-  RegExpCompiler compiler(input->capture_count, ignore_case, is_ascii);
+  RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
    // Wrap the body of the regexp in capture #0.
-  RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,
+  RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
                                                      0,
                                                      &compiler,
                                                      compiler.accept());
@@ -3673,17 +3773,43 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
                                                new RegExpCharacterClass('*'),
                                                &compiler,
                                                captured_body);
-  if (node_return != NULL) *node_return = node;
-  Analysis analysis(ignore_case);
+  AssertionPropagation analysis(ignore_case);
    analysis.EnsureAnalyzed(node);
  
    NodeInfo info = *node->info();
+  data->has_lookbehind = info.HasLookbehind();
+  if (data->has_lookbehind) {
+    // If this node needs information about the preceding text we let
+    // it start with a character class that consumes a single character
+    // and proceeds to wherever is appropriate.  This means that if
+    // has_lookbehind is set the code generator must start one character
+    // before the start position.
+    node = new TextNode(new RegExpCharacterClass('*'), node);
+    analysis.EnsureAnalyzed(node);
+  }
+
+#ifdef DEBUG
+  PostAnalysisNodeValidator post_analysis_validator;
+  node->Accept(&post_analysis_validator);
+#endif
+
    node = node->EnsureExpanded(&info);
  
+#ifdef DEBUG
+  PostExpansionNodeValidator post_expansion_validator;
+  node->Accept(&post_expansion_validator);
+#endif
+
+  data->node = node;
+
    if (is_multiline && !FLAG_attempt_multiline_irregexp) {
      return Handle<FixedArray>::null();
    }
  
+  if (data->has_lookbehind) {
+    return Handle<FixedArray>::null();
+  }
+
    if (FLAG_irregexp_native) {
  #ifdef ARM
      // Unimplemented, fall-through to bytecode implementation.
@@ -3695,10 +3821,10 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
        mode = RegExpMacroAssemblerIA32::UC16;
      }
      RegExpMacroAssemblerIA32 macro_assembler(mode,
-                                             (input->capture_count + 1) * 2);
+                                             (data->capture_count + 1) * 2);
      return compiler.Assemble(&macro_assembler,
                               node,
-                             input->capture_count,
+                             data->capture_count,
                               pattern);
  #endif
    }
@@ -3706,7 +3832,7 @@ Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
    RegExpMacroAssemblerIrregexp macro_assembler(codes);
    return compiler.Assemble(&macro_assembler,
                             node,
-                           input->capture_count,
+                           data->capture_count,
                             pattern);
  }
  
diff --git a/src/jsregexp.h b/src/jsregexp.h

index 3dc2bc1d115bf34f5dbb1deea42523d6887ad08f..dbeb6e23ebedbef5be48e47bddc766d1b1b9fdb3 100644 (file)
--- a/src/jsregexp.h
+++ b/src/jsregexp.h
@@ -528,6 +528,12 @@ struct NodeInfo {
      does_determine_start = that->does_determine_start;
    }
  
+  bool HasLookbehind() {
+    return follows_word_interest ||
+           follows_newline_interest ||
+           follows_start_interest;
+  }
+
    // Sets the interests of this node to include the interests of the
    // following node.
    void AddFromFollowing(NodeInfo* that) {
@@ -894,7 +900,7 @@ class ChoiceNode: public RegExpNode {
  
   private:
    friend class DispatchTableConstructor;
-  friend class Analysis;
+  friend class AssertionPropagation;
    void GenerateGuard(RegExpMacroAssembler* macro_assembler,
                       Guard *guard,
                       GenerationVariant* variant);
@@ -1052,9 +1058,45 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
  };
  
  
-class Analysis: public NodeVisitor {
+// Assertion propagation moves information about assertions such as
+// \b to the affected nodes.  For instance, in /.\b./ information must
+// be propagated to the first '.' that whatever follows needs to know
+// if it matched a word or a non-word, and to the second '.' that it
+// has to check if it succeeds a word or non-word.  In this case the
+// result will be something like:
+//
+//   +-------+        +------------+
+//   |   .   |        |      .     |
+//   +-------+  --->  +------------+
+//   | word? |        | check word |
+//   +-------+        +------------+
+//
+// At a later phase all nodes that determine information for their
+// following nodes are split into several 'sibling' nodes.  In this
+// case the first '.' is split into one node that only matches words
+// and one that only matches non-words.  The second '.' is also split,
+// into one node that assumes that the previous character was a word
+// character and one that assumes that is was non-word.  In this case
+// the result is
+//
+//         +------------------+        +------------------+
+//   /-->  | intersect(., \w) |  --->  | intersect(., \W) |
+//   |     +------------------+        +------------------+
+//   |                                 |    follows \w    |
+//   |                                 +------------------+
+// --?
+//   |     +------------------+        +------------------+
+//   \-->  | intersect(., \W) |  --->  | intersect(., \w) |
+//         +------------------+        +------------------+
+//                                     |    follows \W    |
+//                                     +------------------+
+//
+// This way we don't need to explicitly check the previous character
+// but can always assume that whoever consumed the previous character
+// has propagated the relevant information forward.
+class AssertionPropagation: public NodeVisitor {
   public:
-  explicit Analysis(bool ignore_case)
+  explicit AssertionPropagation(bool ignore_case)
        : ignore_case_(ignore_case) { }
    void EnsureAnalyzed(RegExpNode* node);
  
@@ -1066,12 +1108,20 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
   private:
    bool ignore_case_;
  
-  DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
+  DISALLOW_IMPLICIT_CONSTRUCTORS(AssertionPropagation);
  };
  
  
-struct RegExpParseResult {
+struct RegExpCompileData {
+  RegExpCompileData()
+    : tree(NULL),
+      node(NULL),
+      has_lookbehind(false),
+      has_character_escapes(false),
+      capture_count(0) { }
    RegExpTree* tree;
+  RegExpNode* node;
+  bool has_lookbehind;
    bool has_character_escapes;
    Handle<String> error;
    int capture_count;
@@ -1080,8 +1130,7 @@ struct RegExpParseResult {
  
  class RegExpEngine: public AllStatic {
   public:
-  static Handle<FixedArray> Compile(RegExpParseResult* input,
-                                    RegExpNode** node_return,
+  static Handle<FixedArray> Compile(RegExpCompileData* input,
                                      bool ignore_case,
                                      bool multiline,
                                      Handle<String> pattern,
diff --git a/src/parser.cc b/src/parser.cc

index 8217315c8612be8773a6243e42125fb4d2f996ef..7236cb42cc01e418d16814efc2789b6f2a2e83aa 100644 (file)
--- a/src/parser.cc
+++ b/src/parser.cc
@@ -4302,7 +4302,7 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,
  
  bool ParseRegExp(FlatStringReader* input,
                   bool multiline,
-                 RegExpParseResult* result) {
+                 RegExpCompileData* result) {
    ASSERT(result != NULL);
    // Make sure we have a stack guard.
    StackGuard guard;
diff --git a/src/parser.h b/src/parser.h

index 0ebef74eee879431d45ddb1ffd150621a6968e40..6e86f0961ab43a8646466e66d184a0d69aa231a1 100644 (file)
--- a/src/parser.h
+++ b/src/parser.h
@@ -147,7 +147,7 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream,
  
  bool ParseRegExp(FlatStringReader* input,
                   bool multiline,
-                 RegExpParseResult* result);
+                 RegExpCompileData* result);
  
  
  // Support for doing lazy compilation. The script is the script containing full
diff --git a/test/cctest/test-regexp.cc b/test/cctest/test-regexp.cc

index b3e54f38f3f19d29abdda1c8516f4b13771bb48b..782bb116885bfbccbd0d12c3a2c18db6e9ae08c2 100644 (file)
--- a/test/cctest/test-regexp.cc
+++ b/test/cctest/test-regexp.cc
@@ -55,7 +55,7 @@ static SmartPointer<const char> Parse(const char* input) {
    v8::HandleScope scope;
    ZoneScope zone_scope(DELETE_ON_EXIT);
    FlatStringReader reader(CStrVector(input));
-  RegExpParseResult result;
+  RegExpCompileData result;
    CHECK(v8::internal::ParseRegExp(&reader, false, &result));
    CHECK(result.tree != NULL);
    CHECK(result.error.is_null());
@@ -69,7 +69,7 @@ static bool ParseEscapes(const char* input) {
    unibrow::Utf8InputBuffer<> buffer(input, strlen(input));
    ZoneScope zone_scope(DELETE_ON_EXIT);
    FlatStringReader reader(CStrVector(input));
-  RegExpParseResult result;
+  RegExpCompileData result;
    CHECK(v8::internal::ParseRegExp(&reader, false, &result));
    CHECK(result.tree != NULL);
    CHECK(result.error.is_null());
@@ -259,7 +259,7 @@ static void ExpectError(const char* input,
    v8::HandleScope scope;
    ZoneScope zone_scope(DELETE_ON_EXIT);
    FlatStringReader reader(CStrVector(input));
-  RegExpParseResult result;
+  RegExpCompileData result;
    CHECK_EQ(false, v8::internal::ParseRegExp(&reader, false, &result));
    CHECK(result.tree == NULL);
    CHECK(!result.error.is_null());
@@ -358,13 +358,12 @@ TEST(CharacterClassEscapes) {
  static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) {
    V8::Initialize(NULL);
    FlatStringReader reader(CStrVector(input));
-  RegExpParseResult result;
-  if (!v8::internal::ParseRegExp(&reader, multiline, &result))
+  RegExpCompileData compile_data;
+  if (!v8::internal::ParseRegExp(&reader, multiline, &compile_data))
      return NULL;
-  RegExpNode* node = NULL;
    Handle<String> pattern = Factory::NewStringFromUtf8(CStrVector(input));
-  RegExpEngine::Compile(&result, &node, false, multiline, pattern, is_ascii);
-  return node;
+  RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii);
+  return compile_data.node;
  }
  
  
@@ -1128,14 +1127,6 @@ TEST(LatinCanonicalize) {
  }
  
  
-TEST(SimplePropagation) {
-  v8::HandleScope scope;
-  ZoneScope zone_scope(DELETE_ON_EXIT);
-  RegExpNode* node = Compile("(a|^b|c)", false, true);
-  CHECK(node->info()->follows_start_interest);
-}
-
-
  static uc32 CanonRange(uc32 c) {
    unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
    int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
@@ -1301,5 +1292,5 @@ TEST(CharClassDifference) {
  
  TEST(Graph) {
    V8::Initialize(NULL);
-  Execute("(?=[d#.])", false, true, true);
+  Execute("\\bboy\\b", false, true, true);
  }
author	christian.plesner.hansen@gmail.com <christian.plesner.hansen@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Thu, 11 Dec 2008 11:13:13 +0000 (11:13 +0000)
committer	christian.plesner.hansen@gmail.com <christian.plesner.hansen@gmail.com@ce2b1a6d-e550-0410-aec6-3dcde31c8c00>
	Thu, 11 Dec 2008 11:13:13 +0000 (11:13 +0000)
src/globals.h		patch \| blob \| history
src/jsregexp.cc		patch \| blob \| history
src/jsregexp.h		patch \| blob \| history
src/parser.cc		patch \| blob \| history
src/parser.h		patch \| blob \| history
test/cctest/test-regexp.cc		patch \| blob \| history