From 484b9df414b26c5bb29b7ed819bbae3e6af0b860 Mon Sep 17 00:00:00 2001 From: "lrn@chromium.org" Date: Tue, 19 Oct 2010 14:00:01 +0000 Subject: [PATCH] Limit end-anchored regexps to testing end of string where possible. Review URL: http://codereview.chromium.org/3844006 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@5661 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/arm/regexp-macro-assembler-arm.cc | 14 +++++++- src/arm/regexp-macro-assembler-arm.h | 1 + src/ast.cc | 49 +++++++++++++++++++++++----- src/ast.h | 17 ++++++---- src/bytecodes-irregexp.h | 3 +- src/ia32/regexp-macro-assembler-ia32.cc | 12 ++++++- src/ia32/regexp-macro-assembler-ia32.h | 1 + src/interpreter-irregexp.cc | 9 +++++ src/jsregexp.cc | 14 +++++++- src/regexp-macro-assembler-irregexp.cc | 6 ++++ src/regexp-macro-assembler-irregexp.h | 1 + src/regexp-macro-assembler-tracer.cc | 6 ++++ src/regexp-macro-assembler-tracer.h | 1 + src/regexp-macro-assembler.h | 1 + src/x64/regexp-macro-assembler-x64.cc | 14 +++++++- src/x64/regexp-macro-assembler-x64.h | 1 + test/mjsunit/regexp.js | 58 +++++++++++++++++++++++++++++++++ 17 files changed, 188 insertions(+), 20 deletions(-) diff --git a/src/arm/regexp-macro-assembler-arm.cc b/src/arm/regexp-macro-assembler-arm.cc index 8f45886..37bb1f0 100644 --- a/src/arm/regexp-macro-assembler-arm.cc +++ b/src/arm/regexp-macro-assembler-arm.cc @@ -142,7 +142,6 @@ int RegExpMacroAssemblerARM::stack_limit_slack() { void RegExpMacroAssemblerARM::AdvanceCurrentPosition(int by) { if (by != 0) { - Label inside_string; __ add(current_input_offset(), current_input_offset(), Operand(by * char_size())); } @@ -927,6 +926,19 @@ void RegExpMacroAssemblerARM::ReadStackPointerFromRegister(int reg) { } +void RegExpMacroAssemblerARM::SetCurrentPositionFromEnd(int by) { + Label after_position; + __ cmp(current_input_offset(), Operand(-by * char_size())); + __ b(ge, &after_position); + __ mov(current_input_offset(), Operand(-by * char_size())); + // On RegExp code entry (where this operation is used), the character before + // the current position is expected to be already loaded. + // We have advanced the position, so it's safe to read backwards. + LoadCurrentCharacterUnchecked(-1, 1); + __ bind(&after_position); +} + + void RegExpMacroAssemblerARM::SetRegister(int register_index, int to) { ASSERT(register_index >= num_saved_registers_); // Reserved for positions! __ mov(r0, Operand(to)); diff --git a/src/arm/regexp-macro-assembler-arm.h b/src/arm/regexp-macro-assembler-arm.h index 93a74d7..4e09f67 100644 --- a/src/arm/regexp-macro-assembler-arm.h +++ b/src/arm/regexp-macro-assembler-arm.h @@ -100,6 +100,7 @@ class RegExpMacroAssemblerARM: public NativeRegExpMacroAssembler { StackCheckFlag check_stack_limit); virtual void ReadCurrentPositionFromRegister(int reg); virtual void ReadStackPointerFromRegister(int reg); + virtual void SetCurrentPositionFromEnd(int by); virtual void SetRegister(int register_index, int to); virtual void Succeed(); virtual void WriteCurrentPositionToRegister(int reg, int cp_offset); diff --git a/src/ast.cc b/src/ast.cc index f47dffd..92f1496 100644 --- a/src/ast.cc +++ b/src/ast.cc @@ -398,39 +398,70 @@ Interval RegExpQuantifier::CaptureRegisters() { } -bool RegExpAssertion::IsAnchored() { +bool RegExpAssertion::IsAnchoredAtStart() { return type() == RegExpAssertion::START_OF_INPUT; } -bool RegExpAlternative::IsAnchored() { +bool RegExpAssertion::IsAnchoredAtEnd() { + return type() == RegExpAssertion::END_OF_INPUT; +} + + +bool RegExpAlternative::IsAnchoredAtStart() { ZoneList* nodes = this->nodes(); for (int i = 0; i < nodes->length(); i++) { RegExpTree* node = nodes->at(i); - if (node->IsAnchored()) { return true; } + if (node->IsAnchoredAtStart()) { return true; } + if (node->max_match() > 0) { return false; } + } + return false; +} + + +bool RegExpAlternative::IsAnchoredAtEnd() { + ZoneList* nodes = this->nodes(); + for (int i = nodes->length() - 1; i >= 0; i--) { + RegExpTree* node = nodes->at(i); + if (node->IsAnchoredAtEnd()) { return true; } if (node->max_match() > 0) { return false; } } return false; } -bool RegExpDisjunction::IsAnchored() { +bool RegExpDisjunction::IsAnchoredAtStart() { ZoneList* alternatives = this->alternatives(); for (int i = 0; i < alternatives->length(); i++) { - if (!alternatives->at(i)->IsAnchored()) + if (!alternatives->at(i)->IsAnchoredAtStart()) return false; } return true; } -bool RegExpLookahead::IsAnchored() { - return is_positive() && body()->IsAnchored(); +bool RegExpDisjunction::IsAnchoredAtEnd() { + ZoneList* alternatives = this->alternatives(); + for (int i = 0; i < alternatives->length(); i++) { + if (!alternatives->at(i)->IsAnchoredAtEnd()) + return false; + } + return true; +} + + +bool RegExpLookahead::IsAnchoredAtStart() { + return is_positive() && body()->IsAnchoredAtStart(); +} + + +bool RegExpCapture::IsAnchoredAtStart() { + return body()->IsAnchoredAtStart(); } -bool RegExpCapture::IsAnchored() { - return body()->IsAnchored(); +bool RegExpCapture::IsAnchoredAtEnd() { + return body()->IsAnchoredAtEnd(); } diff --git a/src/ast.h b/src/ast.h index e8d54e4..a01e48d 100644 --- a/src/ast.h +++ b/src/ast.h @@ -1523,7 +1523,8 @@ class RegExpTree: public ZoneObject { virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) = 0; virtual bool IsTextElement() { return false; } - virtual bool IsAnchored() { return false; } + virtual bool IsAnchoredAtStart() { return false; } + virtual bool IsAnchoredAtEnd() { return false; } virtual int min_match() = 0; virtual int max_match() = 0; // Returns the interval of registers used for captures within this @@ -1548,7 +1549,8 @@ class RegExpDisjunction: public RegExpTree { virtual RegExpDisjunction* AsDisjunction(); virtual Interval CaptureRegisters(); virtual bool IsDisjunction(); - virtual bool IsAnchored(); + virtual bool IsAnchoredAtStart(); + virtual bool IsAnchoredAtEnd(); virtual int min_match() { return min_match_; } virtual int max_match() { return max_match_; } ZoneList* alternatives() { return alternatives_; } @@ -1568,7 +1570,8 @@ class RegExpAlternative: public RegExpTree { virtual RegExpAlternative* AsAlternative(); virtual Interval CaptureRegisters(); virtual bool IsAlternative(); - virtual bool IsAnchored(); + virtual bool IsAnchoredAtStart(); + virtual bool IsAnchoredAtEnd(); virtual int min_match() { return min_match_; } virtual int max_match() { return max_match_; } ZoneList* nodes() { return nodes_; } @@ -1595,7 +1598,8 @@ class RegExpAssertion: public RegExpTree { RegExpNode* on_success); virtual RegExpAssertion* AsAssertion(); virtual bool IsAssertion(); - virtual bool IsAnchored(); + virtual bool IsAnchoredAtStart(); + virtual bool IsAnchoredAtEnd(); virtual int min_match() { return 0; } virtual int max_match() { return 0; } Type type() { return type_; } @@ -1768,7 +1772,8 @@ class RegExpCapture: public RegExpTree { RegExpCompiler* compiler, RegExpNode* on_success); virtual RegExpCapture* AsCapture(); - virtual bool IsAnchored(); + virtual bool IsAnchoredAtStart(); + virtual bool IsAnchoredAtEnd(); virtual Interval CaptureRegisters(); virtual bool IsCapture(); virtual int min_match() { return body_->min_match(); } @@ -1800,7 +1805,7 @@ class RegExpLookahead: public RegExpTree { virtual RegExpLookahead* AsLookahead(); virtual Interval CaptureRegisters(); virtual bool IsLookahead(); - virtual bool IsAnchored(); + virtual bool IsAnchoredAtStart(); virtual int min_match() { return 0; } virtual int max_match() { return 0; } RegExpTree* body() { return body_; } diff --git a/src/bytecodes-irregexp.h b/src/bytecodes-irregexp.h index bcb34c8..93218ea 100644 --- a/src/bytecodes-irregexp.h +++ b/src/bytecodes-irregexp.h @@ -88,7 +88,8 @@ V(CHECK_REGISTER_EQ_POS, 43, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_AT_START, 44, 8) /* bc8 pad24 addr32 */ \ V(CHECK_NOT_AT_START, 45, 8) /* bc8 pad24 addr32 */ \ V(CHECK_GREEDY, 46, 8) /* bc8 pad24 addr32 */ \ -V(ADVANCE_CP_AND_GOTO, 47, 8) /* bc8 offset24 addr32 */ +V(ADVANCE_CP_AND_GOTO, 47, 8) /* bc8 offset24 addr32 */ \ +V(SET_CURRENT_POSITION_FROM_END, 48, 4) /* bc8 idx24 */ #define DECLARE_BYTECODES(name, code, length) \ static const int BC_##name = code; diff --git a/src/ia32/regexp-macro-assembler-ia32.cc b/src/ia32/regexp-macro-assembler-ia32.cc index 2aab7a8..e2853e8 100644 --- a/src/ia32/regexp-macro-assembler-ia32.cc +++ b/src/ia32/regexp-macro-assembler-ia32.cc @@ -133,7 +133,6 @@ int RegExpMacroAssemblerIA32::stack_limit_slack() { void RegExpMacroAssemblerIA32::AdvanceCurrentPosition(int by) { if (by != 0) { - Label inside_string; __ add(Operand(edi), Immediate(by * char_size())); } } @@ -964,6 +963,17 @@ void RegExpMacroAssemblerIA32::ReadStackPointerFromRegister(int reg) { __ add(backtrack_stackpointer(), Operand(ebp, kStackHighEnd)); } +void RegExpMacroAssemblerIA32::SetCurrentPositionFromEnd(int by) { + NearLabel after_position; + __ cmp(edi, -by * char_size()); + __ j(greater_equal, &after_position); + __ mov(edi, -by * char_size()); + // On RegExp code entry (where this operation is used), the character before + // the current position is expected to be already loaded. + // We have advanced the position, so it's safe to read backwards. + LoadCurrentCharacterUnchecked(-1, 1); + __ bind(&after_position); +} void RegExpMacroAssemblerIA32::SetRegister(int register_index, int to) { ASSERT(register_index >= num_saved_registers_); // Reserved for positions! diff --git a/src/ia32/regexp-macro-assembler-ia32.h b/src/ia32/regexp-macro-assembler-ia32.h index 8b8eeed..51e2cb0 100644 --- a/src/ia32/regexp-macro-assembler-ia32.h +++ b/src/ia32/regexp-macro-assembler-ia32.h @@ -98,6 +98,7 @@ class RegExpMacroAssemblerIA32: public NativeRegExpMacroAssembler { StackCheckFlag check_stack_limit); virtual void ReadCurrentPositionFromRegister(int reg); virtual void ReadStackPointerFromRegister(int reg); + virtual void SetCurrentPositionFromEnd(int by); virtual void SetRegister(int register_index, int to); virtual void Succeed(); virtual void WriteCurrentPositionToRegister(int reg, int cp_offset); diff --git a/src/interpreter-irregexp.cc b/src/interpreter-irregexp.cc index a904447..c9c3cc4 100644 --- a/src/interpreter-irregexp.cc +++ b/src/interpreter-irregexp.cc @@ -607,6 +607,15 @@ static bool RawMatch(const byte* code_base, pc = code_base + Load32Aligned(pc + 4); } break; + BYTECODE(SET_CURRENT_POSITION_FROM_END) { + int by = static_cast(insn) >> BYTECODE_SHIFT; + if (subject.length() - current > by) { + current = subject.length() - by; + current_char = subject[current - 1]; + } + pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH; + break; + } default: UNREACHABLE(); break; diff --git a/src/jsregexp.cc b/src/jsregexp.cc index 82a370f..3c5ddfb 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -5180,7 +5180,10 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, &compiler, compiler.accept()); RegExpNode* node = captured_body; - if (!data->tree->IsAnchored()) { + bool is_end_anchored = data->tree->IsAnchoredAtEnd(); + bool is_start_anchored = data->tree->IsAnchoredAtStart(); + int max_length = data->tree->max_match(); + if (!is_start_anchored) { // Add a .*? at the beginning, outside the body capture, unless // this expression is anchored at the beginning. RegExpNode* loop_node = @@ -5236,6 +5239,15 @@ RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, RegExpMacroAssemblerIrregexp macro_assembler(codes); #endif // V8_INTERPRETED_REGEXP + // Inserted here, instead of in Assembler, because it depends on information + // in the AST that isn't replicated in the Node structure. + static const int kMaxBacksearchLimit = 1024; + if (is_end_anchored && + !is_start_anchored && + max_length < kMaxBacksearchLimit) { + macro_assembler.SetCurrentPositionFromEnd(max_length); + } + return compiler.Assemble(¯o_assembler, node, data->capture_count, diff --git a/src/regexp-macro-assembler-irregexp.cc b/src/regexp-macro-assembler-irregexp.cc index 90abe91..6fbb14a 100644 --- a/src/regexp-macro-assembler-irregexp.cc +++ b/src/regexp-macro-assembler-irregexp.cc @@ -145,6 +145,12 @@ void RegExpMacroAssemblerIrregexp::ReadStackPointerFromRegister( } +void RegExpMacroAssemblerIrregexp::SetCurrentPositionFromEnd(int by) { + ASSERT(is_uint24(by)); + Emit(BC_SET_CURRENT_POSITION_FROM_END, by); +} + + void RegExpMacroAssemblerIrregexp::SetRegister(int register_index, int to) { ASSERT(register_index >= 0); ASSERT(register_index <= kMaxRegister); diff --git a/src/regexp-macro-assembler-irregexp.h b/src/regexp-macro-assembler-irregexp.h index 3ddbc2f..6c9c2eb 100644 --- a/src/regexp-macro-assembler-irregexp.h +++ b/src/regexp-macro-assembler-irregexp.h @@ -65,6 +65,7 @@ class RegExpMacroAssemblerIrregexp: public RegExpMacroAssembler { virtual void PushRegister(int register_index, StackCheckFlag check_stack_limit); virtual void AdvanceRegister(int reg, int by); // r[reg] += by. + virtual void SetCurrentPositionFromEnd(int by); virtual void SetRegister(int register_index, int to); virtual void WriteCurrentPositionToRegister(int reg, int cp_offset); virtual void ClearRegisters(int reg_from, int reg_to); diff --git a/src/regexp-macro-assembler-tracer.cc b/src/regexp-macro-assembler-tracer.cc index 41c674b..463c1a8 100644 --- a/src/regexp-macro-assembler-tracer.cc +++ b/src/regexp-macro-assembler-tracer.cc @@ -136,6 +136,12 @@ void RegExpMacroAssemblerTracer::AdvanceRegister(int reg, int by) { } +void RegExpMacroAssemblerTracer::SetCurrentPositionFromEnd(int by) { + PrintF(" SetCurrentPositionFromEnd(by=%d);\n", by); + assembler_->SetCurrentPositionFromEnd(by); +} + + void RegExpMacroAssemblerTracer::SetRegister(int register_index, int to) { PrintF(" SetRegister(register=%d, to=%d);\n", register_index, to); assembler_->SetRegister(register_index, to); diff --git a/src/regexp-macro-assembler-tracer.h b/src/regexp-macro-assembler-tracer.h index 9608f9e..6a8f4d4 100644 --- a/src/regexp-macro-assembler-tracer.h +++ b/src/regexp-macro-assembler-tracer.h @@ -89,6 +89,7 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler { StackCheckFlag check_stack_limit); virtual void ReadCurrentPositionFromRegister(int reg); virtual void ReadStackPointerFromRegister(int reg); + virtual void SetCurrentPositionFromEnd(int by); virtual void SetRegister(int register_index, int to); virtual void Succeed(); virtual void WriteCurrentPositionToRegister(int reg, int cp_offset); diff --git a/src/regexp-macro-assembler.h b/src/regexp-macro-assembler.h index 652b690..dc3bd82 100644 --- a/src/regexp-macro-assembler.h +++ b/src/regexp-macro-assembler.h @@ -155,6 +155,7 @@ class RegExpMacroAssembler { StackCheckFlag check_stack_limit) = 0; virtual void ReadCurrentPositionFromRegister(int reg) = 0; virtual void ReadStackPointerFromRegister(int reg) = 0; + virtual void SetCurrentPositionFromEnd(int by) = 0; virtual void SetRegister(int register_index, int to) = 0; virtual void Succeed() = 0; virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0; diff --git a/src/x64/regexp-macro-assembler-x64.cc b/src/x64/regexp-macro-assembler-x64.cc index 91e2b44..47c19c7 100644 --- a/src/x64/regexp-macro-assembler-x64.cc +++ b/src/x64/regexp-macro-assembler-x64.cc @@ -145,7 +145,6 @@ int RegExpMacroAssemblerX64::stack_limit_slack() { void RegExpMacroAssemblerX64::AdvanceCurrentPosition(int by) { if (by != 0) { - Label inside_string; __ addq(rdi, Immediate(by * char_size())); } } @@ -1053,6 +1052,19 @@ void RegExpMacroAssemblerX64::ReadStackPointerFromRegister(int reg) { } +void RegExpMacroAssemblerX64::SetCurrentPositionFromEnd(int by) { + NearLabel after_position; + __ cmpq(rdi, Immediate(-by * char_size())); + __ j(greater_equal, &after_position); + __ movq(rdi, Immediate(-by * char_size())); + // On RegExp code entry (where this operation is used), the character before + // the current position is expected to be already loaded. + // We have advanced the position, so it's safe to read backwards. + LoadCurrentCharacterUnchecked(-1, 1); + __ bind(&after_position); +} + + void RegExpMacroAssemblerX64::SetRegister(int register_index, int to) { ASSERT(register_index >= num_saved_registers_); // Reserved for positions! __ movq(register_location(register_index), Immediate(to)); diff --git a/src/x64/regexp-macro-assembler-x64.h b/src/x64/regexp-macro-assembler-x64.h index 3bcc3ac..182bc55 100644 --- a/src/x64/regexp-macro-assembler-x64.h +++ b/src/x64/regexp-macro-assembler-x64.h @@ -93,6 +93,7 @@ class RegExpMacroAssemblerX64: public NativeRegExpMacroAssembler { StackCheckFlag check_stack_limit); virtual void ReadCurrentPositionFromRegister(int reg); virtual void ReadStackPointerFromRegister(int reg); + virtual void SetCurrentPositionFromEnd(int by); virtual void SetRegister(int register_index, int to); virtual void Succeed(); virtual void WriteCurrentPositionToRegister(int reg, int cp_offset); diff --git a/test/mjsunit/regexp.js b/test/mjsunit/regexp.js index 5836ddb..b57b86d 100644 --- a/test/mjsunit/regexp.js +++ b/test/mjsunit/regexp.js @@ -589,3 +589,61 @@ assertEquals(0, desc.value); assertEquals(false, desc.configurable); assertEquals(false, desc.enumerable); assertEquals(true, desc.writable); + + +// Check that end-anchored regexps are optimized correctly. +var re = /(?:a|bc)g$/; +assertTrue(re.test("ag")); +assertTrue(re.test("bcg")); +assertTrue(re.test("abcg")); +assertTrue(re.test("zimbag")); +assertTrue(re.test("zimbcg")); + +assertFalse(re.test("g")); +assertFalse(re.test("")); + +// Global regexp (non-zero start). +var re = /(?:a|bc)g$/g; +assertTrue(re.test("ag")); +re.lastIndex = 1; // Near start of string. +assertTrue(re.test("zimbag")); +re.lastIndex = 6; // At end of string. +assertFalse(re.test("zimbag")); +re.lastIndex = 5; // Near end of string. +assertFalse(re.test("zimbag")); +re.lastIndex = 4; +assertTrue(re.test("zimbag")); + +// Anchored at both ends. +var re = /^(?:a|bc)g$/g; +assertTrue(re.test("ag")); +re.lastIndex = 1; +assertFalse(re.test("ag")); +re.lastIndex = 1; +assertFalse(re.test("zag")); + +// Long max_length of RegExp. +var re = /VeryLongRegExp!{1,1000}$/; +assertTrue(re.test("BahoolaVeryLongRegExp!!!!!!")); +assertFalse(re.test("VeryLongRegExp")); +assertFalse(re.test("!")); + +// End anchor inside disjunction. +var re = /(?:a$|bc$)/; +assertTrue(re.test("a")); +assertTrue(re.test("bc")); +assertTrue(re.test("abc")); +assertTrue(re.test("zimzamzumba")); +assertTrue(re.test("zimzamzumbc")); +assertFalse(re.test("c")); +assertFalse(re.test("")); + +// Only partially anchored. +var re = /(?:a|bc$)/; +assertTrue(re.test("a")); +assertTrue(re.test("bc")); +assertEquals(["a"], re.exec("abc")); +assertEquals(4, re.exec("zimzamzumba").index); +assertEquals(["bc"], re.exec("zimzomzumbc")); +assertFalse(re.test("c")); +assertFalse(re.test("")); -- 2.7.4