From ff3e30ae11278f4d6e5b76582ef72c0754b2d9b7 Mon Sep 17 00:00:00 2001 From: "christian.plesner.hansen@gmail.com" Date: Thu, 11 Dec 2008 11:13:13 +0000 Subject: [PATCH] - Added lookbehind propagation for the initial node; now, if the initial node is interested in what precedes it the automaton is given an initial all-consuming character class that determines it. - Added verification of some node information invariants. We now check that if a node expresses interest in what precedes it that information is available to it after assertion expansion. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@964 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/globals.h | 2 +- src/jsregexp.cc | 174 ++++++++++++++++++++++++++++++++----- src/jsregexp.h | 63 ++++++++++++-- src/parser.cc | 2 +- src/parser.h | 2 +- test/cctest/test-regexp.cc | 25 ++---- 6 files changed, 217 insertions(+), 51 deletions(-) diff --git a/src/globals.h b/src/globals.h index e2fb2a9f2..e174bf303 100644 --- a/src/globals.h +++ b/src/globals.h @@ -184,7 +184,7 @@ class OldSpace; class Property; class Proxy; class RegExpNode; -struct RegExpParseResult; +struct RegExpCompileData; class RegExpTree; class RegExpCompiler; class RegExpVisitor; diff --git a/src/jsregexp.cc b/src/jsregexp.cc index d44b5afce..5f4f6bda5 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -260,7 +260,7 @@ Handle RegExpImpl::Compile(Handle re, } else { FlattenString(pattern); ZoneScope zone_scope(DELETE_ON_EXIT); - RegExpParseResult parse_result; + RegExpCompileData parse_result; FlatStringReader reader(pattern); if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) { // Throw an exception if we fail to parse the pattern. @@ -706,20 +706,19 @@ static Handle GetCompiledIrregexp(Handle re, pattern->Flatten(shape); } - RegExpParseResult parse_result; + RegExpCompileData compile_data; FlatStringReader reader(pattern); - if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) { + if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) { // Throw an exception if we fail to parse the pattern. // THIS SHOULD NOT HAPPEN. We already parsed it successfully once. ThrowRegExpException(re, pattern, - parse_result.error, + compile_data.error, "malformed_regexp"); return Handle::null(); } Handle compiled_entry = - RegExpEngine::Compile(&parse_result, - NULL, + RegExpEngine::Compile(&compile_data, flags.is_ignore_case(), flags.is_multiline(), pattern, @@ -2603,9 +2602,7 @@ RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { - ZoneList* elms = new ZoneList(1); - elms->Add(TextElement::CharClass(this)); - return new TextNode(elms, on_success); + return new TextNode(this, on_success); } @@ -3265,7 +3262,7 @@ OutSet* DispatchTable::Get(uc16 value) { // Analysis -void Analysis::EnsureAnalyzed(RegExpNode* that) { +void AssertionPropagation::EnsureAnalyzed(RegExpNode* that) { if (that->info()->been_analyzed || that->info()->being_analyzed) return; that->info()->being_analyzed = true; @@ -3275,7 +3272,7 @@ void Analysis::EnsureAnalyzed(RegExpNode* that) { } -void Analysis::VisitEnd(EndNode* that) { +void AssertionPropagation::VisitEnd(EndNode* that) { // nothing to do } @@ -3298,7 +3295,7 @@ void TextNode::CalculateOffsets() { } -void Analysis::VisitText(TextNode* that) { +void AssertionPropagation::VisitText(TextNode* that) { if (ignore_case_) { that->MakeCaseIndependent(); } @@ -3314,7 +3311,7 @@ void Analysis::VisitText(TextNode* that) { } -void Analysis::VisitAction(ActionNode* that) { +void AssertionPropagation::VisitAction(ActionNode* that) { RegExpNode* target = that->on_success(); EnsureAnalyzed(target); // If the next node is interested in what it follows then this node @@ -3323,7 +3320,7 @@ void Analysis::VisitAction(ActionNode* that) { } -void Analysis::VisitChoice(ChoiceNode* that) { +void AssertionPropagation::VisitChoice(ChoiceNode* that) { NodeInfo* info = that->info(); for (int i = 0; i < that->alternatives()->length(); i++) { RegExpNode* node = that->alternatives()->at(i).node(); @@ -3335,7 +3332,7 @@ void Analysis::VisitChoice(ChoiceNode* that) { } -void Analysis::VisitBackReference(BackReferenceNode* that) { +void AssertionPropagation::VisitBackReference(BackReferenceNode* that) { EnsureAnalyzed(that->on_success()); } @@ -3650,15 +3647,118 @@ void DispatchTableConstructor::VisitAction(ActionNode* that) { } -Handle RegExpEngine::Compile(RegExpParseResult* input, - RegExpNode** node_return, +#ifdef DEBUG + + +class VisitNodeScope { + public: + explicit VisitNodeScope(RegExpNode* node) : node_(node) { + ASSERT(!node->info()->visited); + node->info()->visited = true; + } + ~VisitNodeScope() { + node_->info()->visited = false; + } + private: + RegExpNode* node_; +}; + + +class NodeValidator : public NodeVisitor { + public: + virtual void ValidateInfo(NodeInfo* info) = 0; +#define DECLARE_VISIT(Type) \ + virtual void Visit##Type(Type##Node* that); +FOR_EACH_NODE_TYPE(DECLARE_VISIT) +#undef DECLARE_VISIT +}; + + +class PostAnalysisNodeValidator : public NodeValidator { +public: + virtual void ValidateInfo(NodeInfo* info); +}; + + +class PostExpansionNodeValidator : public NodeValidator { +public: + virtual void ValidateInfo(NodeInfo* info); +}; + + +void PostAnalysisNodeValidator::ValidateInfo(NodeInfo* info) { + ASSERT(info->been_analyzed); +} + + +void PostExpansionNodeValidator::ValidateInfo(NodeInfo* info) { + ASSERT_EQ(info->determine_newline, info->does_determine_newline); + ASSERT_EQ(info->determine_start, info->does_determine_start); + ASSERT_EQ(info->determine_word, info->does_determine_word); + ASSERT_EQ(info->follows_word_interest, + (info->follows_word != NodeInfo::UNKNOWN)); + if (false) { + // These are still unimplemented. + ASSERT_EQ(info->follows_start_interest, + (info->follows_start != NodeInfo::UNKNOWN)); + ASSERT_EQ(info->follows_newline_interest, + (info->follows_newline != NodeInfo::UNKNOWN)); + } +} + + +void NodeValidator::VisitAction(ActionNode* that) { + if (that->info()->visited) return; + VisitNodeScope scope(that); + ValidateInfo(that->info()); + that->on_success()->Accept(this); +} + + +void NodeValidator::VisitBackReference(BackReferenceNode* that) { + if (that->info()->visited) return; + VisitNodeScope scope(that); + ValidateInfo(that->info()); + that->on_success()->Accept(this); +} + + +void NodeValidator::VisitChoice(ChoiceNode* that) { + if (that->info()->visited) return; + VisitNodeScope scope(that); + ValidateInfo(that->info()); + ZoneList* alts = that->alternatives(); + for (int i = 0; i < alts->length(); i++) + alts->at(i).node()->Accept(this); +} + + +void NodeValidator::VisitEnd(EndNode* that) { + if (that->info()->visited) return; + VisitNodeScope scope(that); + ValidateInfo(that->info()); +} + + +void NodeValidator::VisitText(TextNode* that) { + if (that->info()->visited) return; + VisitNodeScope scope(that); + ValidateInfo(that->info()); + that->on_success()->Accept(this); +} + + +#endif + + +Handle RegExpEngine::Compile(RegExpCompileData* data, bool ignore_case, bool is_multiline, Handle pattern, bool is_ascii) { - RegExpCompiler compiler(input->capture_count, ignore_case, is_ascii); + RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); // Wrap the body of the regexp in capture #0. - RegExpNode* captured_body = RegExpCapture::ToNode(input->tree, + RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 0, &compiler, compiler.accept()); @@ -3673,17 +3773,43 @@ Handle RegExpEngine::Compile(RegExpParseResult* input, new RegExpCharacterClass('*'), &compiler, captured_body); - if (node_return != NULL) *node_return = node; - Analysis analysis(ignore_case); + AssertionPropagation analysis(ignore_case); analysis.EnsureAnalyzed(node); NodeInfo info = *node->info(); + data->has_lookbehind = info.HasLookbehind(); + if (data->has_lookbehind) { + // If this node needs information about the preceding text we let + // it start with a character class that consumes a single character + // and proceeds to wherever is appropriate. This means that if + // has_lookbehind is set the code generator must start one character + // before the start position. + node = new TextNode(new RegExpCharacterClass('*'), node); + analysis.EnsureAnalyzed(node); + } + +#ifdef DEBUG + PostAnalysisNodeValidator post_analysis_validator; + node->Accept(&post_analysis_validator); +#endif + node = node->EnsureExpanded(&info); +#ifdef DEBUG + PostExpansionNodeValidator post_expansion_validator; + node->Accept(&post_expansion_validator); +#endif + + data->node = node; + if (is_multiline && !FLAG_attempt_multiline_irregexp) { return Handle::null(); } + if (data->has_lookbehind) { + return Handle::null(); + } + if (FLAG_irregexp_native) { #ifdef ARM // Unimplemented, fall-through to bytecode implementation. @@ -3695,10 +3821,10 @@ Handle RegExpEngine::Compile(RegExpParseResult* input, mode = RegExpMacroAssemblerIA32::UC16; } RegExpMacroAssemblerIA32 macro_assembler(mode, - (input->capture_count + 1) * 2); + (data->capture_count + 1) * 2); return compiler.Assemble(¯o_assembler, node, - input->capture_count, + data->capture_count, pattern); #endif } @@ -3706,7 +3832,7 @@ Handle RegExpEngine::Compile(RegExpParseResult* input, RegExpMacroAssemblerIrregexp macro_assembler(codes); return compiler.Assemble(¯o_assembler, node, - input->capture_count, + data->capture_count, pattern); } diff --git a/src/jsregexp.h b/src/jsregexp.h index 3dc2bc1d1..dbeb6e23e 100644 --- a/src/jsregexp.h +++ b/src/jsregexp.h @@ -528,6 +528,12 @@ struct NodeInfo { does_determine_start = that->does_determine_start; } + bool HasLookbehind() { + return follows_word_interest || + follows_newline_interest || + follows_start_interest; + } + // Sets the interests of this node to include the interests of the // following node. void AddFromFollowing(NodeInfo* that) { @@ -894,7 +900,7 @@ class ChoiceNode: public RegExpNode { private: friend class DispatchTableConstructor; - friend class Analysis; + friend class AssertionPropagation; void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard *guard, GenerationVariant* variant); @@ -1052,9 +1058,45 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) }; -class Analysis: public NodeVisitor { +// Assertion propagation moves information about assertions such as +// \b to the affected nodes. For instance, in /.\b./ information must +// be propagated to the first '.' that whatever follows needs to know +// if it matched a word or a non-word, and to the second '.' that it +// has to check if it succeeds a word or non-word. In this case the +// result will be something like: +// +// +-------+ +------------+ +// | . | | . | +// +-------+ ---> +------------+ +// | word? | | check word | +// +-------+ +------------+ +// +// At a later phase all nodes that determine information for their +// following nodes are split into several 'sibling' nodes. In this +// case the first '.' is split into one node that only matches words +// and one that only matches non-words. The second '.' is also split, +// into one node that assumes that the previous character was a word +// character and one that assumes that is was non-word. In this case +// the result is +// +// +------------------+ +------------------+ +// /--> | intersect(., \w) | ---> | intersect(., \W) | +// | +------------------+ +------------------+ +// | | follows \w | +// | +------------------+ +// --? +// | +------------------+ +------------------+ +// \--> | intersect(., \W) | ---> | intersect(., \w) | +// +------------------+ +------------------+ +// | follows \W | +// +------------------+ +// +// This way we don't need to explicitly check the previous character +// but can always assume that whoever consumed the previous character +// has propagated the relevant information forward. +class AssertionPropagation: public NodeVisitor { public: - explicit Analysis(bool ignore_case) + explicit AssertionPropagation(bool ignore_case) : ignore_case_(ignore_case) { } void EnsureAnalyzed(RegExpNode* node); @@ -1066,12 +1108,20 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) private: bool ignore_case_; - DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); + DISALLOW_IMPLICIT_CONSTRUCTORS(AssertionPropagation); }; -struct RegExpParseResult { +struct RegExpCompileData { + RegExpCompileData() + : tree(NULL), + node(NULL), + has_lookbehind(false), + has_character_escapes(false), + capture_count(0) { } RegExpTree* tree; + RegExpNode* node; + bool has_lookbehind; bool has_character_escapes; Handle error; int capture_count; @@ -1080,8 +1130,7 @@ struct RegExpParseResult { class RegExpEngine: public AllStatic { public: - static Handle Compile(RegExpParseResult* input, - RegExpNode** node_return, + static Handle Compile(RegExpCompileData* input, bool ignore_case, bool multiline, Handle pattern, diff --git a/src/parser.cc b/src/parser.cc index 8217315c8..7236cb42c 100644 --- a/src/parser.cc +++ b/src/parser.cc @@ -4302,7 +4302,7 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream, bool ParseRegExp(FlatStringReader* input, bool multiline, - RegExpParseResult* result) { + RegExpCompileData* result) { ASSERT(result != NULL); // Make sure we have a stack guard. StackGuard guard; diff --git a/src/parser.h b/src/parser.h index 0ebef74ee..6e86f0961 100644 --- a/src/parser.h +++ b/src/parser.h @@ -147,7 +147,7 @@ ScriptDataImpl* PreParse(unibrow::CharacterStream* stream, bool ParseRegExp(FlatStringReader* input, bool multiline, - RegExpParseResult* result); + RegExpCompileData* result); // Support for doing lazy compilation. The script is the script containing full diff --git a/test/cctest/test-regexp.cc b/test/cctest/test-regexp.cc index b3e54f38f..782bb1168 100644 --- a/test/cctest/test-regexp.cc +++ b/test/cctest/test-regexp.cc @@ -55,7 +55,7 @@ static SmartPointer Parse(const char* input) { v8::HandleScope scope; ZoneScope zone_scope(DELETE_ON_EXIT); FlatStringReader reader(CStrVector(input)); - RegExpParseResult result; + RegExpCompileData result; CHECK(v8::internal::ParseRegExp(&reader, false, &result)); CHECK(result.tree != NULL); CHECK(result.error.is_null()); @@ -69,7 +69,7 @@ static bool ParseEscapes(const char* input) { unibrow::Utf8InputBuffer<> buffer(input, strlen(input)); ZoneScope zone_scope(DELETE_ON_EXIT); FlatStringReader reader(CStrVector(input)); - RegExpParseResult result; + RegExpCompileData result; CHECK(v8::internal::ParseRegExp(&reader, false, &result)); CHECK(result.tree != NULL); CHECK(result.error.is_null()); @@ -259,7 +259,7 @@ static void ExpectError(const char* input, v8::HandleScope scope; ZoneScope zone_scope(DELETE_ON_EXIT); FlatStringReader reader(CStrVector(input)); - RegExpParseResult result; + RegExpCompileData result; CHECK_EQ(false, v8::internal::ParseRegExp(&reader, false, &result)); CHECK(result.tree == NULL); CHECK(!result.error.is_null()); @@ -358,13 +358,12 @@ TEST(CharacterClassEscapes) { static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) { V8::Initialize(NULL); FlatStringReader reader(CStrVector(input)); - RegExpParseResult result; - if (!v8::internal::ParseRegExp(&reader, multiline, &result)) + RegExpCompileData compile_data; + if (!v8::internal::ParseRegExp(&reader, multiline, &compile_data)) return NULL; - RegExpNode* node = NULL; Handle pattern = Factory::NewStringFromUtf8(CStrVector(input)); - RegExpEngine::Compile(&result, &node, false, multiline, pattern, is_ascii); - return node; + RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii); + return compile_data.node; } @@ -1128,14 +1127,6 @@ TEST(LatinCanonicalize) { } -TEST(SimplePropagation) { - v8::HandleScope scope; - ZoneScope zone_scope(DELETE_ON_EXIT); - RegExpNode* node = Compile("(a|^b|c)", false, true); - CHECK(node->info()->follows_start_interest); -} - - static uc32 CanonRange(uc32 c) { unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth]; int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL); @@ -1301,5 +1292,5 @@ TEST(CharClassDifference) { TEST(Graph) { V8::Initialize(NULL); - Execute("(?=[d#.])", false, true, true); + Execute("\\bboy\\b", false, true, true); } -- 2.34.1