} else {
FlattenString(pattern);
ZoneScope zone_scope(DELETE_ON_EXIT);
- RegExpParseResult parse_result;
+ RegExpCompileData parse_result;
FlatStringReader reader(pattern);
if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
// Throw an exception if we fail to parse the pattern.
pattern->Flatten(shape);
}
- RegExpParseResult parse_result;
+ RegExpCompileData compile_data;
FlatStringReader reader(pattern);
- if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
+ if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
// Throw an exception if we fail to parse the pattern.
// THIS SHOULD NOT HAPPEN. We already parsed it successfully once.
ThrowRegExpException(re,
pattern,
- parse_result.error,
+ compile_data.error,
"malformed_regexp");
return Handle<FixedArray>::null();
}
Handle<FixedArray> compiled_entry =
- RegExpEngine::Compile(&parse_result,
- NULL,
+ RegExpEngine::Compile(&compile_data,
flags.is_ignore_case(),
flags.is_multiline(),
pattern,
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {  // |compiler| is not referenced in this body.
- ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
- elms->Add(TextElement::CharClass(this));
- return new TextNode(elms, on_success);
+ return new TextNode(this, on_success);  // TextNode built directly from this class; continues at on_success.
}
// Analysis
-void Analysis::EnsureAnalyzed(RegExpNode* that) {
+void AssertionPropagation::EnsureAnalyzed(RegExpNode* that) {
if (that->info()->been_analyzed || that->info()->being_analyzed)
return;
that->info()->being_analyzed = true;
}
-void Analysis::VisitEnd(EndNode* that) {
+void AssertionPropagation::VisitEnd(EndNode* that) {
// Nothing to do: this pass leaves end nodes untouched.
}
}
-void Analysis::VisitText(TextNode* that) {
+void AssertionPropagation::VisitText(TextNode* that) {
if (ignore_case_) {
that->MakeCaseIndependent();
}
}
-void Analysis::VisitAction(ActionNode* that) {
+void AssertionPropagation::VisitAction(ActionNode* that) {
RegExpNode* target = that->on_success();
EnsureAnalyzed(target);
// If the next node is interested in what it follows then this node
}
-void Analysis::VisitChoice(ChoiceNode* that) {
+void AssertionPropagation::VisitChoice(ChoiceNode* that) {
NodeInfo* info = that->info();
for (int i = 0; i < that->alternatives()->length(); i++) {
RegExpNode* node = that->alternatives()->at(i).node();
}
-void Analysis::VisitBackReference(BackReferenceNode* that) {
+void AssertionPropagation::VisitBackReference(BackReferenceNode* that) {
EnsureAnalyzed(that->on_success());  // Continue the analysis with the node's successor.
}
}
-Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
- RegExpNode** node_return,
+#ifdef DEBUG
+
+
+class VisitNodeScope {  // Marks a node as visited for the lifetime of the scope object.
+ public:
+ explicit VisitNodeScope(RegExpNode* node) : node_(node) {
+ ASSERT(!node->info()->visited);  // The node must not already be marked when the scope is entered.
+ node->info()->visited = true;
+ }
+ ~VisitNodeScope() {
+ node_->info()->visited = false;  // Flag is cleared on exit, so the node can be visited again later.
+ }
+ private:
+ RegExpNode* node_;  // Node whose visited flag this scope controls.
+};
+
+
+class NodeValidator : public NodeVisitor {  // Visitor that applies ValidateInfo to the nodes it reaches.
+ public:
+ virtual void ValidateInfo(NodeInfo* info) = 0;  // The check itself; supplied by each concrete validator.
+#define DECLARE_VISIT(Type) \
+ virtual void Visit##Type(Type##Node* that);
+FOR_EACH_NODE_TYPE(DECLARE_VISIT)  // Declares one Visit<Type>(<Type>Node*) per node type.
+#undef DECLARE_VISIT
+};
+
+
+class PostAnalysisNodeValidator : public NodeValidator {  // Validator run after the analysis pass.
+public:
+ virtual void ValidateInfo(NodeInfo* info);  // Asserts that the node has been analyzed.
+};
+
+
+class PostExpansionNodeValidator : public NodeValidator {  // Validator run after node expansion.
+public:
+ virtual void ValidateInfo(NodeInfo* info);  // Asserts that related NodeInfo flags agree with each other.
+};
+
+
+void PostAnalysisNodeValidator::ValidateInfo(NodeInfo* info) {
+ ASSERT(info->been_analyzed);  // Every node reached must already carry the been_analyzed flag.
+}
+
+
+void PostExpansionNodeValidator::ValidateInfo(NodeInfo* info) {  // Consistency checks on one node's info flags.
+ ASSERT_EQ(info->determine_newline, info->does_determine_newline);  // determine_* must equal does_determine_*.
+ ASSERT_EQ(info->determine_start, info->does_determine_start);
+ ASSERT_EQ(info->determine_word, info->does_determine_word);
+ ASSERT_EQ(info->follows_word_interest,
+ (info->follows_word != NodeInfo::UNKNOWN));  // Interest flag set exactly when follows_word is not UNKNOWN.
+ if (false) {  // Deliberately disabled: the checks below never run.
+ // These are still unimplemented.
+ ASSERT_EQ(info->follows_start_interest,
+ (info->follows_start != NodeInfo::UNKNOWN));
+ ASSERT_EQ(info->follows_newline_interest,
+ (info->follows_newline != NodeInfo::UNKNOWN));
+ }
+}
+
+
+void NodeValidator::VisitAction(ActionNode* that) {
+ if (that->info()->visited) return;  // Node is already being visited further up the call chain.
+ VisitNodeScope scope(that);  // Keeps the visited flag set until this function returns.
+ ValidateInfo(that->info());
+ that->on_success()->Accept(this);  // Continue with the successor node.
+}
+
+
+void NodeValidator::VisitBackReference(BackReferenceNode* that) {
+ if (that->info()->visited) return;  // Node is already being visited further up the call chain.
+ VisitNodeScope scope(that);  // Keeps the visited flag set until this function returns.
+ ValidateInfo(that->info());
+ that->on_success()->Accept(this);  // Continue with the successor node.
+}
+
+
+void NodeValidator::VisitChoice(ChoiceNode* that) {
+ if (that->info()->visited) return;  // Node is already being visited further up the call chain.
+ VisitNodeScope scope(that);  // Keeps the visited flag set until this function returns.
+ ValidateInfo(that->info());
+ ZoneList<GuardedAlternative>* alts = that->alternatives();
+ for (int i = 0; i < alts->length(); i++)
+ alts->at(i).node()->Accept(this);  // Visit the node of every alternative in turn.
+}
+
+
+void NodeValidator::VisitEnd(EndNode* that) {
+ if (that->info()->visited) return;  // Node is already being visited further up the call chain.
+ VisitNodeScope scope(that);  // Keeps the visited flag set until this function returns.
+ ValidateInfo(that->info());  // No successor is visited for an end node.
+}
+
+
+void NodeValidator::VisitText(TextNode* that) {
+ if (that->info()->visited) return;  // Node is already being visited further up the call chain.
+ VisitNodeScope scope(that);  // Keeps the visited flag set until this function returns.
+ ValidateInfo(that->info());
+ that->on_success()->Accept(this);  // Continue with the successor node.
+}
+
+
+#endif
+
+
+Handle<FixedArray> RegExpEngine::Compile(RegExpCompileData* data,
bool ignore_case,
bool is_multiline,
Handle<String> pattern,
bool is_ascii) {
- RegExpCompiler compiler(input->capture_count, ignore_case, is_ascii);
+ RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
// Wrap the body of the regexp in capture #0.
- RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,
+ RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
0,
&compiler,
compiler.accept());
new RegExpCharacterClass('*'),
&compiler,
captured_body);
- if (node_return != NULL) *node_return = node;
- Analysis analysis(ignore_case);
+ AssertionPropagation analysis(ignore_case);
analysis.EnsureAnalyzed(node);
NodeInfo info = *node->info();
+ data->has_lookbehind = info.HasLookbehind();
+ if (data->has_lookbehind) {
+ // If this node needs information about the preceding text we let
+ // it start with a character class that consumes a single character
+ // and proceeds to wherever is appropriate. This means that if
+ // has_lookbehind is set the code generator must start one character
+ // before the start position.
+ node = new TextNode(new RegExpCharacterClass('*'), node);
+ analysis.EnsureAnalyzed(node);
+ }
+
+#ifdef DEBUG
+ PostAnalysisNodeValidator post_analysis_validator;
+ node->Accept(&post_analysis_validator);
+#endif
+
node = node->EnsureExpanded(&info);
+#ifdef DEBUG
+ PostExpansionNodeValidator post_expansion_validator;
+ node->Accept(&post_expansion_validator);
+#endif
+
+ data->node = node;
+
if (is_multiline && !FLAG_attempt_multiline_irregexp) {
return Handle<FixedArray>::null();
}
+ if (data->has_lookbehind) {
+ return Handle<FixedArray>::null();
+ }
+
if (FLAG_irregexp_native) {
#ifdef ARM
// Unimplemented, fall-through to bytecode implementation.
mode = RegExpMacroAssemblerIA32::UC16;
}
RegExpMacroAssemblerIA32 macro_assembler(mode,
- (input->capture_count + 1) * 2);
+ (data->capture_count + 1) * 2);
return compiler.Assemble(&macro_assembler,
node,
- input->capture_count,
+ data->capture_count,
pattern);
#endif
}
RegExpMacroAssemblerIrregexp macro_assembler(codes);
return compiler.Assemble(&macro_assembler,
node,
- input->capture_count,
+ data->capture_count,
pattern);
}
does_determine_start = that->does_determine_start;
}
+ bool HasLookbehind() {  // True if any of the three follows_*_interest flags is set.
+ return follows_word_interest ||
+ follows_newline_interest ||
+ follows_start_interest;
+ }
+
// Sets the interests of this node to include the interests of the
// following node.
void AddFromFollowing(NodeInfo* that) {
private:
friend class DispatchTableConstructor;
- friend class Analysis;
+ friend class AssertionPropagation;
void GenerateGuard(RegExpMacroAssembler* macro_assembler,
Guard *guard,
GenerationVariant* variant);
};
-class Analysis: public NodeVisitor {
+// Assertion propagation moves information about assertions such as
+// \b to the affected nodes. For instance, in /.\b./ information must
+// be propagated to the first '.' that whatever follows needs to know
+// if it matched a word or a non-word, and to the second '.' that it
+// has to check if it succeeds a word or non-word. In this case the
+// result will be something like:
+//
+// +-------+ +------------+
+// | . | | . |
+// +-------+ ---> +------------+
+// | word? | | check word |
+// +-------+ +------------+
+//
+// At a later phase all nodes that determine information for their
+// following nodes are split into several 'sibling' nodes. In this
+// case the first '.' is split into one node that only matches words
+// and one that only matches non-words. The second '.' is also split,
+// into one node that assumes that the previous character was a word
+// character and one that assumes that is was non-word. In this case
+// the result is
+//
+// +------------------+ +------------------+
+// /--> | intersect(., \w) | ---> | intersect(., \W) |
+// | +------------------+ +------------------+
+// | | follows \w |
+// | +------------------+
+// --?
+// | +------------------+ +------------------+
+// \--> | intersect(., \W) | ---> | intersect(., \w) |
+// +------------------+ +------------------+
+// | follows \W |
+// +------------------+
+//
+// This way we don't need to explicitly check the previous character
+// but can always assume that whoever consumed the previous character
+// has propagated the relevant information forward.
+class AssertionPropagation: public NodeVisitor {
public:
- explicit Analysis(bool ignore_case)
+ explicit AssertionPropagation(bool ignore_case)
: ignore_case_(ignore_case) { }
void EnsureAnalyzed(RegExpNode* node);
private:
bool ignore_case_;
- DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
+ DISALLOW_IMPLICIT_CONSTRUCTORS(AssertionPropagation);
};
-struct RegExpParseResult {
+struct RegExpCompileData {
+ RegExpCompileData()
+ : tree(NULL),
+ node(NULL),
+ has_lookbehind(false),
+ has_character_escapes(false),
+ capture_count(0) { }
RegExpTree* tree;
+ RegExpNode* node;
+ bool has_lookbehind;
bool has_character_escapes;
Handle<String> error;
int capture_count;
class RegExpEngine: public AllStatic {
public:
- static Handle<FixedArray> Compile(RegExpParseResult* input,
- RegExpNode** node_return,
+ static Handle<FixedArray> Compile(RegExpCompileData* input,
bool ignore_case,
bool multiline,
Handle<String> pattern,
v8::HandleScope scope;
ZoneScope zone_scope(DELETE_ON_EXIT);
FlatStringReader reader(CStrVector(input));
- RegExpParseResult result;
+ RegExpCompileData result;
CHECK(v8::internal::ParseRegExp(&reader, false, &result));
CHECK(result.tree != NULL);
CHECK(result.error.is_null());
unibrow::Utf8InputBuffer<> buffer(input, strlen(input));
ZoneScope zone_scope(DELETE_ON_EXIT);
FlatStringReader reader(CStrVector(input));
- RegExpParseResult result;
+ RegExpCompileData result;
CHECK(v8::internal::ParseRegExp(&reader, false, &result));
CHECK(result.tree != NULL);
CHECK(result.error.is_null());
v8::HandleScope scope;
ZoneScope zone_scope(DELETE_ON_EXIT);
FlatStringReader reader(CStrVector(input));
- RegExpParseResult result;
+ RegExpCompileData result;
CHECK_EQ(false, v8::internal::ParseRegExp(&reader, false, &result));
CHECK(result.tree == NULL);
CHECK(!result.error.is_null());
static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) {  // Test helper: parse and compile |input|, return its node graph.
V8::Initialize(NULL);
FlatStringReader reader(CStrVector(input));
- RegExpParseResult result;
- if (!v8::internal::ParseRegExp(&reader, multiline, &result))
+ RegExpCompileData compile_data;
+ if (!v8::internal::ParseRegExp(&reader, multiline, &compile_data))
return NULL;  // A parse failure is reported to the caller as NULL.
- RegExpNode* node = NULL;
Handle<String> pattern = Factory::NewStringFromUtf8(CStrVector(input));
- RegExpEngine::Compile(&result, &node, false, multiline, pattern, is_ascii);
- return node;
+ RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii);  // ignore_case is false; the returned handle is discarded.
+ return compile_data.node;  // Node graph that RegExpEngine::Compile stored in compile_data.
}
}
-TEST(SimplePropagation) {
- v8::HandleScope scope;
- ZoneScope zone_scope(DELETE_ON_EXIT);
- RegExpNode* node = Compile("(a|^b|c)", false, true);
- CHECK(node->info()->follows_start_interest);
-}
-
-
static uc32 CanonRange(uc32 c) {
unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
TEST(Graph) {
V8::Initialize(NULL);
- Execute("(?=[d#.])", false, true, true);
+ Execute("\\bboy\\b", false, true, true);  // Pattern now has a \b assertion on each side of "boy".
}