From 6a1b70ccbc2bbb0c425423c58d7a86e20c5b3aab Mon Sep 17 00:00:00 2001 From: "erik.corry@gmail.com" Date: Tue, 17 Apr 2012 08:06:53 +0000 Subject: [PATCH] Regexp: Unify the lookahead code for the BoyerMoore-like skipping and the lookahead code for simplifying \b. Also cache lookahead results on the nodes, fix a memory leak and remove some character class code we are no longer using. Review URL: https://chromiumcodereview.appspot.com/9961009 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@11345 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/jsregexp.cc | 832 +++++++++++++-------------------------------- src/jsregexp.h | 355 ++++++++++--------- test/cctest/test-regexp.cc | 75 +--- 3 files changed, 410 insertions(+), 852 deletions(-) diff --git a/src/jsregexp.cc b/src/jsregexp.cc index b7d0d30..1e413d8 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -108,6 +108,30 @@ static inline void ThrowRegExpException(Handle re, } +ContainedInLattice AddRange(ContainedInLattice containment, + const int* ranges, + int ranges_length, + Interval new_range) { + ASSERT((ranges_length & 1) == 1); + ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1); + if (containment == kLatticeUnknown) return containment; + bool inside = false; + int last = 0; + for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { + // Consider the range from last to ranges[i]. + // We haven't got to the new range yet. + if (ranges[i] <= new_range.from()) continue; + // New range is wholly inside last-ranges[i]. Note that new_range.to() is + // inclusive, but the values in ranges are not. + if (last <= new_range.from() && new_range.to() < ranges[i]) { + return Combine(containment, inside ? kLatticeIn : kLatticeOut); + } + return kLatticeUnknown; + } + return containment; +} + + // More makes code generation slower, less makes V8 benchmark score lower. const int kMaxLookaheadForBoyerMoore = 8; // In a 3-character pattern you can maximally step forwards 3 characters @@ -2157,6 +2181,7 @@ void ActionNode::FillInBMInfo(int offset, } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { on_success()->FillInBMInfo(offset, bm, not_at_start); } + SaveBMInfo(bm, not_at_start, offset); } @@ -2181,6 +2206,7 @@ void AssertionNode::FillInBMInfo( // Match the behaviour of EatsAtLeast on this node. if (type() == AT_START && not_at_start) return; on_success()->FillInBMInfo(offset, bm, not_at_start); + SaveBMInfo(bm, not_at_start, offset); } @@ -2617,12 +2643,14 @@ void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, void LoopChoiceNode::FillInBMInfo( - int offset, BoyerMooreLookahead* bm, bool nas) { + int offset, BoyerMooreLookahead* bm, bool not_at_start) { if (body_can_be_zero_length_) { bm->SetRest(offset); + SaveBMInfo(bm, not_at_start, offset); return; } - ChoiceNode::FillInBMInfo(offset, bm, nas); + ChoiceNode::FillInBMInfo(offset, bm, not_at_start); + SaveBMInfo(bm, not_at_start, offset); } @@ -2710,110 +2738,83 @@ static void EmitHat(RegExpCompiler* compiler, } -// Emit the code to handle \b and \B (word-boundary or non-word-boundary) -// when we know whether the next character must be a word character or not. -static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type, - RegExpCompiler* compiler, - RegExpNode* on_success, - Trace* trace) { +// Emit the code to handle \b and \B (word-boundary or non-word-boundary). +void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { RegExpMacroAssembler* assembler = compiler->macro_assembler(); - Label done; - - Trace new_trace(*trace); - - bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER); - Label* on_word = expect_word_character ? &done : new_trace.backtrack(); - Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done; - - // Check whether previous character was a word character. - switch (trace->at_start()) { - case Trace::TRUE: - if (expect_word_character) { - assembler->GoTo(on_non_word); - } - break; - case Trace::UNKNOWN: - ASSERT_EQ(0, trace->cp_offset()); - assembler->CheckAtStart(on_non_word); - // Fall through. - case Trace::FALSE: - int prev_char_offset = trace->cp_offset() - 1; - assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1); - EmitWordCheck(assembler, on_word, on_non_word, expect_word_character); - // We may or may not have loaded the previous character. - new_trace.InvalidateCurrentCharacter(); + Trace::TriBool next_is_word_character = Trace::UNKNOWN; + bool not_at_start = (trace->at_start() == Trace::FALSE); + BoyerMooreLookahead* lookahead = bm_info(not_at_start); + if (lookahead == NULL) { + int eats_at_least = + Min(kMaxLookaheadForBoyerMoore, + EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); + if (eats_at_least >= 1) { + BoyerMooreLookahead* bm = + new BoyerMooreLookahead(eats_at_least, compiler); + FillInBMInfo(0, bm, not_at_start); + if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE; + if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE; + } + } else { + if (lookahead->at(0)->is_non_word()) next_is_word_character = Trace::FALSE; + if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE; + } + bool at_boundary = (type_ == AssertionNode::AT_BOUNDARY); + if (next_is_word_character == Trace::UNKNOWN) { + Label before_non_word; + Label before_word; + if (trace->characters_preloaded() != 1) { + assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); + } + // Fall through on non-word. + EmitWordCheck(assembler, &before_word, &before_non_word, false); + // Next character is not a word character. + assembler->Bind(&before_non_word); + Label ok; + BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); + assembler->GoTo(&ok); + + assembler->Bind(&before_word); + BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); + assembler->Bind(&ok); + } else if (next_is_word_character == Trace::TRUE) { + BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord); + } else { + ASSERT(next_is_word_character == Trace::FALSE); + BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord); } - - assembler->Bind(&done); - - on_success->Emit(compiler, &new_trace); } -// Emit the code to handle \b and \B (word-boundary or non-word-boundary). -static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type, - RegExpCompiler* compiler, - RegExpNode* on_success, - Trace* trace) { +void AssertionNode::BacktrackIfPrevious( + RegExpCompiler* compiler, + Trace* trace, + AssertionNode::IfPrevious backtrack_if_previous) { RegExpMacroAssembler* assembler = compiler->macro_assembler(); - Label before_non_word; - Label before_word; - if (trace->characters_preloaded() != 1) { - assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word); - } - // Fall through on non-word. - EmitWordCheck(assembler, &before_word, &before_non_word, false); - - // We will be loading the previous character into the current character - // register. Trace new_trace(*trace); new_trace.InvalidateCurrentCharacter(); - Label ok; - Label* boundary; - Label* not_boundary; - if (type == AssertionNode::AT_BOUNDARY) { - boundary = &ok; - not_boundary = new_trace.backtrack(); - } else { - not_boundary = &ok; - boundary = new_trace.backtrack(); - } + Label fall_through, dummy; - // Next character is not a word character. - assembler->Bind(&before_non_word); - if (new_trace.cp_offset() == 0) { - // The start of input counts as a non-word character, so the question is - // decided if we are at the start. - assembler->CheckAtStart(not_boundary); - } - // We already checked that we are not at the start of input so it must be - // OK to load the previous character. - assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, - &ok, // Unused dummy label in this call. - false); - // Fall through on non-word. - EmitWordCheck(assembler, boundary, not_boundary, false); - assembler->GoTo(not_boundary); + Label* non_word = backtrack_if_previous == kIsNonWord ? + new_trace.backtrack() : + &fall_through; + Label* word = backtrack_if_previous == kIsNonWord ? + &fall_through : + new_trace.backtrack(); - // Next character is a word character. - assembler->Bind(&before_word); if (new_trace.cp_offset() == 0) { // The start of input counts as a non-word character, so the question is // decided if we are at the start. - assembler->CheckAtStart(boundary); + assembler->CheckAtStart(non_word); } // We already checked that we are not at the start of input so it must be // OK to load the previous character. - assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, - &ok, // Unused dummy label in this call. - false); - bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY); - EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word); + assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false); + EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord); - assembler->Bind(&ok); - - on_success->Emit(compiler, &new_trace); + assembler->Bind(&fall_through); + on_success()->Emit(compiler, &new_trace); } @@ -2861,13 +2862,9 @@ void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { return; case AT_BOUNDARY: case AT_NON_BOUNDARY: { - EmitBoundaryCheck(type_, compiler, on_success(), trace); + EmitBoundaryCheck(compiler, trace); return; } - case AFTER_WORD_CHARACTER: - case AFTER_NONWORD_CHARACTER: { - EmitHalfBoundaryCheck(type_, compiler, on_success(), trace); - } } on_success()->Emit(compiler, trace); } @@ -3277,24 +3274,74 @@ class AlternativeGenerationList { }; +// The '2' variant is has inclusive from and exclusive to. +static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, + 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, + 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 }; +static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); + +static const int kWordRanges[] = { + '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; +static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); +static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; +static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); +static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 }; +static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges); +static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, + 0x2028, 0x202A, 0x10000 }; +static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); + + +void BoyerMoorePositionInfo::Set(int character) { + SetInterval(Interval(character, character)); +} + + +void BoyerMoorePositionInfo::SetInterval(const Interval& interval) { + s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval); + w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval); + d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval); + surrogate_ = + AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval); + if (interval.to() - interval.from() >= kMapSize - 1) { + if (map_count_ != kMapSize) { + map_count_ = kMapSize; + for (int i = 0; i < kMapSize; i++) map_->at(i) = true; + } + return; + } + for (int i = interval.from(); i <= interval.to(); i++) { + int mod_character = (i & kMask); + if (!map_->at(mod_character)) { + map_count_++; + map_->at(mod_character) = true; + } + if (map_count_ == kMapSize) return; + } +} + + +void BoyerMoorePositionInfo::SetAll() { + s_ = w_ = d_ = kLatticeUnknown; + if (map_count_ != kMapSize) { + map_count_ = kMapSize; + for (int i = 0; i < kMapSize; i++) map_->at(i) = true; + } +} + + BoyerMooreLookahead::BoyerMooreLookahead( - int length, int map_length, RegExpCompiler* compiler) + int length, RegExpCompiler* compiler) : length_(length), - map_length_(map_length), compiler_(compiler) { - ASSERT(IsPowerOf2(map_length)); if (compiler->ascii()) { max_char_ = String::kMaxAsciiCharCode; } else { max_char_ = String::kMaxUtf16CodeUnit; } - bitmaps_ = new ZoneList*>(length); + bitmaps_ = new ZoneList(length); for (int i = 0; i < length; i++) { - bitmaps_->Add(new ZoneList(map_length)); - ZoneList* map = bitmaps_->at(i); - for (int i = 0; i < map_length; i++) { - map->Add(false); - } + bitmaps_->Add(new BoyerMoorePositionInfo()); } } @@ -3304,8 +3351,11 @@ BoyerMooreLookahead::BoyerMooreLookahead( // different parameters at once this is a tradeoff. bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { int biggest_points = 0; + // If more than 32 characters out of 128 can occur it is unlikely that we can + // be lucky enough to step forwards much of the time. + const int kMaxMax = 32; for (int max_number_of_chars = 4; - max_number_of_chars < kTooManyCharacters; + max_number_of_chars < kMaxMax; max_number_of_chars *= 2) { biggest_points = FindBestInterval(max_number_of_chars, biggest_points, from, to); @@ -3332,7 +3382,7 @@ int BoyerMooreLookahead::FindBestInterval( bool union_map[kSize]; for (int j = 0; j < kSize; j++) union_map[j] = false; while (i < length_ && Count(i) <= max_number_of_chars) { - ZoneList* map = bitmaps_->at(i); + BoyerMoorePositionInfo* map = bitmaps_->at(i); for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); i++; } @@ -3387,8 +3437,8 @@ int BoyerMooreLookahead::GetSkipTable(int min_lookahead, int skip = max_lookahead + 1 - min_lookahead; for (int i = max_lookahead; i >= min_lookahead; i--) { - ZoneList* map = bitmaps_->at(i); - for (int j = 0; j < map_length_; j++) { + BoyerMoorePositionInfo* map = bitmaps_->at(i); + for (int j = 0; j < kSize; j++) { if (map->at(j)) { boolean_skip_table->set(j, kDontSkipArrayEntry); } @@ -3401,29 +3451,29 @@ int BoyerMooreLookahead::GetSkipTable(int min_lookahead, // See comment above on the implementation of GetSkipTable. bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { + const int kSize = RegExpMacroAssembler::kTableSize; + int min_lookahead = 0; int max_lookahead = 0; if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; bool found_single_character = false; - bool abandoned_search_for_single_character = false; int single_character = 0; for (int i = max_lookahead; i >= min_lookahead; i--) { - ZoneList* map = bitmaps_->at(i); - for (int j = 0; j < map_length_; j++) { + BoyerMoorePositionInfo* map = bitmaps_->at(i); + if (map->map_count() > 1 || + (found_single_character && map->map_count() != 0)) { + found_single_character = false; + break; + } + for (int j = 0; j < kSize; j++) { if (map->at(j)) { - if (found_single_character) { - found_single_character = false; // Found two. - abandoned_search_for_single_character = true; - break; - } else { - found_single_character = true; - single_character = j; - } + found_single_character = true; + single_character = j; + break; } } - if (abandoned_search_for_single_character) break; } int lookahead_width = max_lookahead + 1 - min_lookahead; @@ -3437,8 +3487,7 @@ bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { Label cont, again; masm->Bind(&again); masm->LoadCurrentCharacter(max_lookahead, &cont, true); - if (max_char_ > map_length_) { - ASSERT(map_length_ == RegExpMacroAssembler::kTableSize); + if (max_char_ > kSize) { masm->CheckCharacterAfterAnd(single_character, RegExpMacroAssembler::kTableMask, &cont); @@ -3452,7 +3501,7 @@ bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { } Handle boolean_skip_table = - FACTORY->NewByteArray(map_length_, TENURED); + FACTORY->NewByteArray(kSize, TENURED); int skip_distance = GetSkipTable( min_lookahead, max_lookahead, boolean_skip_table); ASSERT(skip_distance != 0); @@ -3631,16 +3680,20 @@ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { // not be atoms, they can be any reasonably limited character class or // small alternation. ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. - eats_at_least = - Min(kMaxLookaheadForBoyerMoore, - EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); - if (eats_at_least >= 1) { - BoyerMooreLookahead bm(eats_at_least, - RegExpMacroAssembler::kTableSize, - compiler); - GuardedAlternative alt0 = alternatives_->at(0); - alt0.node()->FillInBMInfo(0, &bm, not_at_start); - skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); + BoyerMooreLookahead* lookahead = bm_info(not_at_start); + if (lookahead == NULL) { + eats_at_least = + Min(kMaxLookaheadForBoyerMoore, + EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); + if (eats_at_least >= 1) { + BoyerMooreLookahead* bm = + new BoyerMooreLookahead(eats_at_least, compiler); + GuardedAlternative alt0 = alternatives_->at(0); + alt0.node()->FillInBMInfo(0, bm, not_at_start); + skip_was_emitted = bm->EmitSkipInstructions(macro_assembler); + } + } else { + skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler); } } } @@ -4203,12 +4256,6 @@ void DotPrinter::VisitAssertion(AssertionNode* that) { case AssertionNode::AFTER_NEWLINE: stream()->Add("label=\"(?<=\\n)\", shape=septagon"); break; - case AssertionNode::AFTER_WORD_CHARACTER: - stream()->Add("label=\"(?<=\\w)\", shape=septagon"); - break; - case AssertionNode::AFTER_NONWORD_CHARACTER: - stream()->Add("label=\"(?<=\\W)\", shape=septagon"); - break; } stream()->Add("];\n"); PrintAttributes(that); @@ -4313,21 +4360,6 @@ void RegExpEngine::DotPrint(const char* label, // ------------------------------------------------------------------- // Tree to graph conversion -static const uc16 kSpaceRanges[] = { 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, - 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029, - 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000, 0xFEFF, 0xFEFF }; -static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); - -static const uc16 kWordRanges[] = { '0', '9', 'A', 'Z', '_', '_', 'a', 'z' }; -static const int kWordRangeCount = ARRAY_SIZE(kWordRanges); - -static const uc16 kDigitRanges[] = { '0', '9' }; -static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges); - -static const uc16 kLineTerminatorRanges[] = { 0x000A, 0x000A, 0x000D, 0x000D, - 0x2028, 0x2029 }; -static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); - RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { ZoneList* elms = new ZoneList(1); @@ -4341,9 +4373,12 @@ RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, return new TextNode(elements(), on_success); } + static bool CompareInverseRanges(ZoneList* ranges, - const uc16* special_class, + const int* special_class, int length) { + length--; // Remove final 0x10000. + ASSERT(special_class[length] == 0x10000); ASSERT(ranges->length() != 0); ASSERT(length != 0); ASSERT(special_class[0] != 0); @@ -4359,7 +4394,7 @@ static bool CompareInverseRanges(ZoneList* ranges, return false; } range = ranges->at((i >> 1) + 1); - if (special_class[i+1] != range.from() - 1) { + if (special_class[i+1] != range.from()) { return false; } } @@ -4371,14 +4406,17 @@ static bool CompareInverseRanges(ZoneList* ranges, static bool CompareRanges(ZoneList* ranges, - const uc16* special_class, + const int* special_class, int length) { + length--; // Remove final 0x10000. + ASSERT(special_class[length] == 0x10000); if (ranges->length() * 2 != length) { return false; } for (int i = 0; i < length; i += 2) { CharacterRange range = ranges->at(i >> 1); - if (range.from() != special_class[i] || range.to() != special_class[i+1]) { + if (range.from() != special_class[i] || + range.to() != special_class[i + 1] - 1) { return false; } } @@ -4779,27 +4817,31 @@ RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, } -static void AddClass(const uc16* elmv, +static void AddClass(const int* elmv, int elmc, ZoneList* ranges) { + elmc--; + ASSERT(elmv[elmc] == 0x10000); for (int i = 0; i < elmc; i += 2) { - ASSERT(elmv[i] <= elmv[i + 1]); - ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); + ASSERT(elmv[i] < elmv[i + 1]); + ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1)); } } -static void AddClassNegated(const uc16 *elmv, +static void AddClassNegated(const int *elmv, int elmc, ZoneList* ranges) { + elmc--; + ASSERT(elmv[elmc] == 0x10000); ASSERT(elmv[0] != 0x0000); ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit); uc16 last = 0x0000; for (int i = 0; i < elmc; i += 2) { ASSERT(last <= elmv[i] - 1); - ASSERT(elmv[i] <= elmv[i + 1]); + ASSERT(elmv[i] < elmv[i + 1]); ranges->Add(CharacterRange(last, elmv[i] - 1)); - last = elmv[i + 1] + 1; + last = elmv[i + 1]; } ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit)); } @@ -4850,8 +4892,8 @@ void CharacterRange::AddClassEscape(uc16 type, } -Vector CharacterRange::GetWordBounds() { - return Vector(kWordRanges, kWordRangeCount); +Vector CharacterRange::GetWordBounds() { + return Vector(kWordRanges, kWordRangeCount - 1); } @@ -4883,7 +4925,7 @@ void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) { void CharacterRange::Split(ZoneList* base, - Vector overlay, + Vector overlay, ZoneList** included, ZoneList** excluded) { ASSERT_EQ(NULL, *included); @@ -4892,7 +4934,7 @@ void CharacterRange::Split(ZoneList* base, for (int i = 0; i < base->length(); i++) table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); for (int i = 0; i < overlay.length(); i += 2) { - table.AddRange(CharacterRange(overlay[i], overlay[i+1]), + table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), CharacterRangeSplitter::kInOverlay); } CharacterRangeSplitter callback(included, excluded); @@ -4978,87 +5020,6 @@ bool CharacterRange::IsCanonical(ZoneList* ranges) { return true; } -SetRelation CharacterRange::WordCharacterRelation( - ZoneList* range) { - ASSERT(IsCanonical(range)); - int i = 0; // Word character range index. - int j = 0; // Argument range index. - ASSERT_NE(0, kWordRangeCount); - SetRelation result; - if (range->length() == 0) { - result.SetElementsInSecondSet(); - return result; - } - CharacterRange argument_range = range->at(0); - CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]); - while (i < kWordRangeCount && j < range->length()) { - // Check the two ranges for the five cases: - // - no overlap. - // - partial overlap (there are elements in both ranges that isn't - // in the other, and there are also elements that are in both). - // - argument range entirely inside word range. - // - word range entirely inside argument range. - // - ranges are completely equal. - - // First check for no overlap. The earlier range is not in the other set. - if (argument_range.from() > word_range.to()) { - // Ranges are disjoint. The earlier word range contains elements that - // cannot be in the argument set. - result.SetElementsInSecondSet(); - } else if (word_range.from() > argument_range.to()) { - // Ranges are disjoint. The earlier argument range contains elements that - // cannot be in the word set. - result.SetElementsInFirstSet(); - } else if (word_range.from() <= argument_range.from() && - word_range.to() >= argument_range.from()) { - result.SetElementsInBothSets(); - // argument range completely inside word range. - if (word_range.from() < argument_range.from() || - word_range.to() > argument_range.from()) { - result.SetElementsInSecondSet(); - } - } else if (word_range.from() >= argument_range.from() && - word_range.to() <= argument_range.from()) { - result.SetElementsInBothSets(); - result.SetElementsInFirstSet(); - } else { - // There is overlap, and neither is a subrange of the other - result.SetElementsInFirstSet(); - result.SetElementsInSecondSet(); - result.SetElementsInBothSets(); - } - if (result.NonTrivialIntersection()) { - // The result is as (im)precise as we can possibly make it. - return result; - } - // Progress the range(s) with minimal to-character. - uc16 word_to = word_range.to(); - uc16 argument_to = argument_range.to(); - if (argument_to <= word_to) { - j++; - if (j < range->length()) { - argument_range = range->at(j); - } - } - if (word_to <= argument_to) { - i += 2; - if (i < kWordRangeCount) { - word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]); - } - } - } - // Check if anything wasn't compared in the loop. - if (i < kWordRangeCount) { - // word range contains something not in argument range. - result.SetElementsInSecondSet(); - } else if (j < range->length()) { - // Argument range contains something not in word range. - result.SetElementsInFirstSet(); - } - - return result; -} - ZoneList* CharacterSet::ranges() { if (ranges_ == NULL) { @@ -5191,145 +5152,6 @@ void CharacterRange::Canonicalize(ZoneList* character_ranges) { } -// Utility function for CharacterRange::Merge. Adds a range at the end of -// a canonicalized range list, if necessary merging the range with the last -// range of the list. -static void AddRangeToSet(ZoneList* set, CharacterRange range) { - if (set == NULL) return; - ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from()); - int n = set->length(); - if (n > 0) { - CharacterRange lastRange = set->at(n - 1); - if (lastRange.to() == range.from() - 1) { - set->at(n - 1) = CharacterRange(lastRange.from(), range.to()); - return; - } - } - set->Add(range); -} - - -static void AddRangeToSelectedSet(int selector, - ZoneList* first_set, - ZoneList* second_set, - ZoneList* intersection_set, - CharacterRange range) { - switch (selector) { - case kInsideFirst: - AddRangeToSet(first_set, range); - break; - case kInsideSecond: - AddRangeToSet(second_set, range); - break; - case kInsideBoth: - AddRangeToSet(intersection_set, range); - break; - } -} - - - -void CharacterRange::Merge(ZoneList* first_set, - ZoneList* second_set, - ZoneList* first_set_only_out, - ZoneList* second_set_only_out, - ZoneList* both_sets_out) { - // Inputs are canonicalized. - ASSERT(CharacterRange::IsCanonical(first_set)); - ASSERT(CharacterRange::IsCanonical(second_set)); - // Outputs are empty, if applicable. - ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0); - ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0); - ASSERT(both_sets_out == NULL || both_sets_out->length() == 0); - - // Merge sets by iterating through the lists in order of lowest "from" value, - // and putting intervals into one of three sets. - - if (first_set->length() == 0) { - second_set_only_out->AddAll(*second_set); - return; - } - if (second_set->length() == 0) { - first_set_only_out->AddAll(*first_set); - return; - } - // Indices into input lists. - int i1 = 0; - int i2 = 0; - // Cache length of input lists. - int n1 = first_set->length(); - int n2 = second_set->length(); - // Current range. May be invalid if state is kInsideNone. - int from = 0; - int to = -1; - // Where current range comes from. - int state = kInsideNone; - - while (i1 < n1 || i2 < n2) { - CharacterRange next_range; - int range_source; - if (i2 == n2 || - (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) { - // Next smallest element is in first set. - next_range = first_set->at(i1++); - range_source = kInsideFirst; - } else { - // Next smallest element is in second set. - next_range = second_set->at(i2++); - range_source = kInsideSecond; - } - if (to < next_range.from()) { - // Ranges disjoint: |current| |next| - AddRangeToSelectedSet(state, - first_set_only_out, - second_set_only_out, - both_sets_out, - CharacterRange(from, to)); - from = next_range.from(); - to = next_range.to(); - state = range_source; - } else { - if (from < next_range.from()) { - AddRangeToSelectedSet(state, - first_set_only_out, - second_set_only_out, - both_sets_out, - CharacterRange(from, next_range.from()-1)); - } - if (to < next_range.to()) { - // Ranges overlap: |current| - // |next| - AddRangeToSelectedSet(state | range_source, - first_set_only_out, - second_set_only_out, - both_sets_out, - CharacterRange(next_range.from(), to)); - from = to + 1; - to = next_range.to(); - state = range_source; - } else { - // Range included: |current| , possibly ending at same character. - // |next| - AddRangeToSelectedSet( - state | range_source, - first_set_only_out, - second_set_only_out, - both_sets_out, - CharacterRange(next_range.from(), next_range.to())); - from = next_range.to() + 1; - // If ranges end at same character, both ranges are consumed completely. - if (next_range.to() == to) state = kInsideNone; - } - } - } - AddRangeToSelectedSet(state, - first_set_only_out, - second_set_only_out, - both_sets_out, - CharacterRange(from, to)); -} - - void CharacterRange::Negate(ZoneList* ranges, ZoneList* negated_ranges) { ASSERT(CharacterRange::IsCanonical(ranges)); @@ -5642,169 +5464,20 @@ void Analysis::VisitBackReference(BackReferenceNode* that) { void Analysis::VisitAssertion(AssertionNode* that) { EnsureAnalyzed(that->on_success()); - AssertionNode::AssertionNodeType type = that->type(); - if (type == AssertionNode::AT_BOUNDARY || - type == AssertionNode::AT_NON_BOUNDARY) { - // Check if the following character is known to be a word character - // or known to not be a word character. - ZoneList* following_chars = that->FirstCharacterSet(); - - CharacterRange::Canonicalize(following_chars); - - SetRelation word_relation = - CharacterRange::WordCharacterRelation(following_chars); - if (word_relation.Disjoint()) { - // Includes the case where following_chars is empty (e.g., end-of-input). - // Following character is definitely *not* a word character. - type = (type == AssertionNode::AT_BOUNDARY) ? - AssertionNode::AFTER_WORD_CHARACTER : - AssertionNode::AFTER_NONWORD_CHARACTER; - that->set_type(type); - } else if (word_relation.ContainedIn()) { - // Following character is definitely a word character. - type = (type == AssertionNode::AT_BOUNDARY) ? - AssertionNode::AFTER_NONWORD_CHARACTER : - AssertionNode::AFTER_WORD_CHARACTER; - that->set_type(type); - } - } -} - - -ZoneList* RegExpNode::FirstCharacterSet() { - if (first_character_set_ == NULL) { - if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) { - // If we can't find an exact solution within the budget, we - // set the value to the set of every character, i.e., all characters - // are possible. - ZoneList* all_set = new ZoneList(1); - all_set->Add(CharacterRange::Everything()); - first_character_set_ = all_set; - } - } - return first_character_set_; } -int RegExpNode::ComputeFirstCharacterSet(int budget) { - // Default behavior is to not be able to determine the first character. - return kComputeFirstCharacterSetFail; -} - - -int LoopChoiceNode::ComputeFirstCharacterSet(int budget) { - budget--; - if (budget >= 0) { - // Find loop min-iteration. It's the value of the guarded choice node - // with a GEQ guard, if any. - int min_repetition = 0; - - for (int i = 0; i <= 1; i++) { - GuardedAlternative alternative = alternatives()->at(i); - ZoneList* guards = alternative.guards(); - if (guards != NULL && guards->length() > 0) { - Guard* guard = guards->at(0); - if (guard->op() == Guard::GEQ) { - min_repetition = guard->value(); - break; - } - } - } - - budget = loop_node()->ComputeFirstCharacterSet(budget); - if (budget >= 0) { - ZoneList* character_set = - loop_node()->first_character_set(); - if (body_can_be_zero_length() || min_repetition == 0) { - budget = continue_node()->ComputeFirstCharacterSet(budget); - if (budget < 0) return budget; - ZoneList* body_set = - continue_node()->first_character_set(); - ZoneList* union_set = - new ZoneList(Max(character_set->length(), - body_set->length())); - CharacterRange::Merge(character_set, - body_set, - union_set, - union_set, - union_set); - character_set = union_set; - } - set_first_character_set(character_set); - } - } - return budget; -} - - -int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) { - budget--; - if (budget >= 0) { - GuardedAlternative successor = this->alternatives()->at(1); - RegExpNode* successor_node = successor.node(); - budget = successor_node->ComputeFirstCharacterSet(budget); - if (budget >= 0) { - set_first_character_set(successor_node->first_character_set()); - } - } - return budget; -} - - -// The first character set of an EndNode is unknowable. Just use the -// default implementation that fails and returns all characters as possible. - - -int AssertionNode::ComputeFirstCharacterSet(int budget) { - budget -= 1; - if (budget >= 0) { - switch (type_) { - case AT_END: { - set_first_character_set(new ZoneList(0)); - break; - } - case AT_START: - case AT_BOUNDARY: - case AT_NON_BOUNDARY: - case AFTER_NEWLINE: - case AFTER_NONWORD_CHARACTER: - case AFTER_WORD_CHARACTER: { - ASSERT_NOT_NULL(on_success()); - budget = on_success()->ComputeFirstCharacterSet(budget); - if (budget >= 0) { - set_first_character_set(on_success()->first_character_set()); - } - break; - } - } - } - return budget; -} - - -int ActionNode::ComputeFirstCharacterSet(int budget) { - if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail; - budget--; - if (budget >= 0) { - ASSERT_NOT_NULL(on_success()); - budget = on_success()->ComputeFirstCharacterSet(budget); - if (budget >= 0) { - set_first_character_set(on_success()->first_character_set()); - } - } - return budget; +void BackReferenceNode::FillInBMInfo( + int offset, BoyerMooreLookahead* bm, bool not_at_start) { + // Working out the set of characters that a backreference can match is too + // hard, so we just say that any character can match. + bm->SetRest(offset); + SaveBMInfo(bm, not_at_start, offset); } -int BackReferenceNode::ComputeFirstCharacterSet(int budget) { - // We don't know anything about the first character of a backreference - // at this point. - // The potential first characters are the first characters of the capture, - // and the first characters of the on_success node, depending on whether the - // capture can be empty and whether it is known to be participating or known - // not to be. - return kComputeFirstCharacterSetFail; -} +STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == + RegExpMacroAssembler::kTableSize); void ChoiceNode::FillInBMInfo( @@ -5814,24 +5487,33 @@ void ChoiceNode::FillInBMInfo( GuardedAlternative& alt = alts->at(i); if (alt.guards() != NULL && alt.guards()->length() != 0) { bm->SetRest(offset); // Give up trying to fill in info. + SaveBMInfo(bm, not_at_start, offset); return; } alt.node()->FillInBMInfo(offset, bm, not_at_start); } + SaveBMInfo(bm, not_at_start, offset); } void TextNode::FillInBMInfo( - int offset, BoyerMooreLookahead* bm, bool not_at_start) { - if (offset >= bm->length()) return; + int initial_offset, BoyerMooreLookahead* bm, bool not_at_start) { + if (initial_offset >= bm->length()) return; + int offset = initial_offset; int max_char = bm->max_char(); for (int i = 0; i < elements()->length(); i++) { - if (offset >= bm->length()) return; + if (offset >= bm->length()) { + if (initial_offset == 0) set_bm_info(not_at_start, bm); + return; + } TextElement text = elements()->at(i); if (text.type == TextElement::ATOM) { RegExpAtom* atom = text.data.u_atom; for (int j = 0; j < atom->length(); j++, offset++) { - if (offset >= bm->length()) return; + if (offset >= bm->length()) { + if (initial_offset == 0) set_bm_info(not_at_start, bm); + return; + } uc16 character = atom->data()[j]; if (bm->compiler()->ignore_case()) { unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; @@ -5858,67 +5540,23 @@ void TextNode::FillInBMInfo( CharacterRange& range = ranges->at(k); if (range.from() > max_char) continue; int to = Min(max_char, static_cast(range.to())); - if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { - bm->SetAll(offset); - break; - } - for (int m = range.from(); m <= to; m++) { - bm->Set(offset, m); - } + bm->SetInterval(offset, Interval(range.from(), to)); } } offset++; } } - if (offset >= bm->length()) return; + if (offset >= bm->length()) { + if (initial_offset == 0) set_bm_info(not_at_start, bm); + return; + } on_success()->FillInBMInfo(offset, bm, true); // Not at start after a text node. + if (initial_offset == 0) set_bm_info(not_at_start, bm); } -int TextNode::ComputeFirstCharacterSet(int budget) { - budget--; - if (budget >= 0) { - ASSERT_NE(0, elements()->length()); - TextElement text = elements()->at(0); - if (text.type == TextElement::ATOM) { - RegExpAtom* atom = text.data.u_atom; - ASSERT_NE(0, atom->length()); - uc16 first_char = atom->data()[0]; - ZoneList* range = new ZoneList(1); - range->Add(CharacterRange(first_char, first_char)); - set_first_character_set(range); - } else { - ASSERT(text.type == TextElement::CHAR_CLASS); - RegExpCharacterClass* char_class = text.data.u_char_class; - ZoneList* ranges = char_class->ranges(); - // TODO(lrn): Canonicalize ranges when they are created - // instead of waiting until now. - CharacterRange::Canonicalize(ranges); - if (char_class->is_negated()) { - int length = ranges->length(); - int new_length = length + 1; - if (length > 0) { - if (ranges->at(0).from() == 0) new_length--; - if (ranges->at(length - 1).to() == String::kMaxUtf16CodeUnit) { - new_length--; - } - } - ZoneList* negated_ranges = - new ZoneList(new_length); - CharacterRange::Negate(ranges, negated_ranges); - set_first_character_set(negated_ranges); - } else { - set_first_character_set(ranges); - } - } - } - return budget; -} - - - // ------------------------------------------------------------------- // Dispatch table construction diff --git a/src/jsregexp.h b/src/jsregexp.h index 288e995..4847739 100644 --- a/src/jsregexp.h +++ b/src/jsregexp.h @@ -40,6 +40,7 @@ class RegExpCompiler; class RegExpMacroAssembler; class RegExpNode; class RegExpTree; +class BoyerMooreLookahead; class RegExpImpl { public: @@ -224,48 +225,6 @@ enum ElementInSetsRelation { }; -// Represents the relation of two sets. -// Sets can be either disjoint, partially or fully overlapping, or equal. -class SetRelation BASE_EMBEDDED { - public: - // Relation is represented by a bit saying whether there are elements in - // one set that is not in the other, and a bit saying that there are elements - // that are in both sets. - - // Location of an element. Corresponds to the internal areas of - // a Venn diagram. - enum { - kInFirst = 1 << kInsideFirst, - kInSecond = 1 << kInsideSecond, - kInBoth = 1 << kInsideBoth - }; - SetRelation() : bits_(0) {} - ~SetRelation() {} - // Add the existence of objects in a particular - void SetElementsInFirstSet() { bits_ |= kInFirst; } - void SetElementsInSecondSet() { bits_ |= kInSecond; } - void SetElementsInBothSets() { bits_ |= kInBoth; } - // Check the currently known relation of the sets (common functions only, - // for other combinations, use value() to get the bits and check them - // manually). - // Sets are completely disjoint. - bool Disjoint() { return (bits_ & kInBoth) == 0; } - // Sets are equal. - bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; } - // First set contains second. - bool Contains() { return (bits_ & kInSecond) == 0; } - // Second set contains first. - bool ContainedIn() { return (bits_ & kInFirst) == 0; } - bool NonTrivialIntersection() { - return (bits_ == (kInFirst | kInSecond | kInBoth)); - } - int value() { return bits_; } - - private: - int bits_; -}; - - class CharacterRange { public: CharacterRange() : from_(0), to_(0) { } @@ -273,7 +232,7 @@ class CharacterRange { CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { } static void AddClassEscape(uc16 type, ZoneList* ranges); - static Vector GetWordBounds(); + static Vector GetWordBounds(); static inline CharacterRange Singleton(uc16 value) { return CharacterRange(value, value); } @@ -294,7 +253,7 @@ class CharacterRange { bool IsSingleton() { return (from_ == to_); } void AddCaseEquivalents(ZoneList* ranges, bool is_ascii); static void Split(ZoneList* base, - Vector overlay, + Vector overlay, ZoneList** included, ZoneList** excluded); // Whether a range list is in canonical form: Ranges ordered by from value, @@ -305,28 +264,6 @@ class CharacterRange { // adjacent ranges are merged. The resulting list may be shorter than the // original, but cannot be longer. static void Canonicalize(ZoneList* ranges); - // Check how the set of characters defined by a CharacterRange list relates - // to the set of word characters. List must be in canonical form. - static SetRelation WordCharacterRelation(ZoneList* ranges); - // Takes two character range lists (representing character sets) in canonical - // form and merges them. - // The characters that are only covered by the first set are added to - // first_set_only_out. the characters that are only in the second set are - // added to second_set_only_out, and the characters that are in both are - // added to both_sets_out. - // The pointers to first_set_only_out, second_set_only_out and both_sets_out - // should be to empty lists, but they need not be distinct, and may be NULL. - // If NULL, the characters are dropped, and if two arguments are the same - // pointer, the result is the union of the two sets that would be created - // if the pointers had been distinct. - // This way, the Merge function can compute all the usual set operations: - // union (all three out-sets are equal), intersection (only both_sets_out is - // non-NULL), and set difference (only first_set is non-NULL). - static void Merge(ZoneList* first_set, - ZoneList* second_set, - ZoneList* first_set_only_out, - ZoneList* second_set_only_out, - ZoneList* both_sets_out); // Negate the contents of a character range in canonical form. static void Negate(ZoneList* src, ZoneList* dst); @@ -421,90 +358,6 @@ class DispatchTable : public ZoneObject { }; -// Improve the speed that we scan for an initial point where a non-anchored -// regexp can match by using a Boyer-Moore-like table. This is done by -// identifying non-greedy non-capturing loops in the nodes that eat any -// character one at a time. For example in the middle of the regexp -// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly -// inserted at the start of any non-anchored regexp. -// -// When we have found such a loop we look ahead in the nodes to find the set of -// characters that can come at given distances. For example for the regexp -// /.?foo/ we know that there are at least 3 characters ahead of us, and the -// sets of characters that can occur are [any, [f, o], [o]]. We find a range in -// the lookahead info where the set of characters is reasonably constrained. In -// our example this is from index 1 to 2 (0 is not constrained). We can now -// look 3 characters ahead and if we don't find one of [f, o] (the union of -// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). -// -// For Unicode input strings we do the same, but modulo 128. -// -// We also look at the first string fed to the regexp and use that to get a hint -// of the character frequencies in the inputs. This affects the assessment of -// whether the set of characters is 'reasonably constrained'. -// -// We also have another lookahead mechanism (called quick check in the code), -// which uses a wide load of multiple characters followed by a mask and compare -// to determine whether a match is possible at this point. -class BoyerMooreLookahead { - public: - BoyerMooreLookahead(int length, int map_length, RegExpCompiler* compiler); - - int length() { return length_; } - int max_char() { return max_char_; } - RegExpCompiler* compiler() { return compiler_; } - - static const int kTooManyCharacters = 32; - - int Count(int map_number) { - ZoneList* map = bitmaps_->at(map_number); - if (map == NULL) return map_length_; - int count = 0; - for (int i = 0; i < map_length_; i++) { - if (map->at(i)) count++; - } - return count; - } - - void Set(int map_number, int character) { - if (character > max_char_) return; - ZoneList* map = bitmaps_->at(map_number); - if (map == NULL) return; - map->at(character & (map_length_ - 1)) = true; - } - - void SetAll(int map_number) { - bitmaps_->at(map_number) = NULL; - } - - void SetRest(int from_map) { - for (int i = from_map; i < length_; i++) SetAll(i); - } - bool EmitSkipInstructions(RegExpMacroAssembler* masm); - - private: - // This is the value obtained by EatsAtLeast. If we do not have at least this - // many characters left in the sample string then the match is bound to fail. - // Therefore it is OK to read a character this far ahead of the current match - // point. - int length_; - // We conservatively consider all character values modulo this length. For - // ASCII there is no loss of precision, since this has a value of 128. - int map_length_; - RegExpCompiler* compiler_; - // 0x7f for ASCII, 0xffff for UTF-16. - int max_char_; - ZoneList*>* bitmaps_; - - int GetSkipTable(int min_lookahead, - int max_lookahead, - Handle boolean_skip_table); - bool FindWorthwhileInterval(int* from, int* to); - int FindBestInterval( - int max_number_of_chars, int old_biggest_points, int* from, int* to); -}; - - #define FOR_EACH_NODE_TYPE(VISIT) \ VISIT(End) \ VISIT(Action) \ @@ -687,7 +540,9 @@ class QuickCheckDetails { class RegExpNode: public ZoneObject { public: - RegExpNode() : first_character_set_(NULL), trace_count_(0) { } + RegExpNode() : first_character_set_(NULL), trace_count_(0) { + bm_info_[0] = bm_info_[1] = NULL; + } virtual ~RegExpNode(); virtual void Accept(NodeVisitor* visitor) = 0; // Generates a goto to this node or actually generates the code at this point. @@ -731,11 +586,18 @@ class RegExpNode: public ZoneObject { // Collects information on the possible code units (mod 128) that can match if // we look forward. This is used for a Boyer-Moore-like string searching // implementation. TODO(erikcorry): This should share more code with - // EatsAtLeast, GetQuickCheckDetails and ComputeFirstCharacterSet. + // EatsAtLeast, GetQuickCheckDetails. virtual void FillInBMInfo( int offset, BoyerMooreLookahead* bm, bool not_at_start) { UNREACHABLE(); } + // We want to avoid recalculating the lookahead info, so we store it on the + // node. Only info that is for this node is stored. We can tell that the + // info is for this node when offset == 0, so the information is calculated + // relative to this node. + void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) { + if (offset == 0) set_bm_info(not_at_start, bm); + } Label* label() { return &label_; } // If non-generic code is generated for a node (i.e. the node is not at the @@ -759,17 +621,6 @@ class RegExpNode: public ZoneObject { SiblingList* siblings() { return &siblings_; } void set_siblings(SiblingList* other) { siblings_ = *other; } - // Return the set of possible next characters recognized by the regexp - // (or a safe subset, potentially the set of all characters). - ZoneList* FirstCharacterSet(); - - // Compute (if possible within the budget of traversed nodes) the - // possible first characters of the input matched by this node and - // its continuation. Returns the remaining budget after the computation. - // If the budget is spent, the result is negative, and the cached - // first_character_set_ value isn't set. - virtual int ComputeFirstCharacterSet(int budget); - // Get and set the cached first character set value. ZoneList* first_character_set() { return first_character_set_; @@ -777,6 +628,9 @@ class RegExpNode: public ZoneObject { void set_first_character_set(ZoneList* character_set) { first_character_set_ = character_set; } + BoyerMooreLookahead* bm_info(bool not_at_start) { + return bm_info_[not_at_start ? 1 : 0]; + } protected: enum LimitResult { DONE, CONTINUE }; @@ -801,6 +655,10 @@ class RegExpNode: public ZoneObject { // processed before it is on a usable state. virtual RegExpNode* Clone() = 0; + void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) { + bm_info_[not_at_start ? 1 : 0] = bm; + } + private: static const int kFirstCharBudget = 10; Label label_; @@ -813,6 +671,7 @@ class RegExpNode: public ZoneObject { // a trace, in which case it is generic and can be reused by flushing the // deferred operations in the current trace and generating a goto. int trace_count_; + BoyerMooreLookahead* bm_info_[2]; }; @@ -833,8 +692,8 @@ class Interval { return (from_ <= value) && (value <= to_); } bool is_empty() { return from_ == kNone; } - int from() { return from_; } - int to() { return to_; } + int from() const { return from_; } + int to() const { return to_; } static Interval Empty() { return Interval(); } static const int kNone = -1; private: @@ -852,6 +711,7 @@ class SeqRegExpNode: public RegExpNode { virtual void FillInBMInfo( int offset, BoyerMooreLookahead* bm, bool not_at_start) { on_success_->FillInBMInfo(offset, bm, not_at_start); + if (offset == 0) set_bm_info(not_at_start, bm); } private: RegExpNode* on_success_; @@ -905,7 +765,6 @@ class ActionNode: public SeqRegExpNode { // TODO(erikcorry): We should allow some action nodes in greedy loops. virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } virtual ActionNode* Clone() { return new ActionNode(*this); } - virtual int ComputeFirstCharacterSet(int budget); private: union { @@ -978,7 +837,6 @@ class TextNode: public SeqRegExpNode { return result; } void CalculateOffsets(); - virtual int ComputeFirstCharacterSet(int budget); private: enum TextEmitPassType { @@ -1009,12 +867,7 @@ class AssertionNode: public SeqRegExpNode { AT_START, AT_BOUNDARY, AT_NON_BOUNDARY, - AFTER_NEWLINE, - // Types not directly expressible in regexp syntax. - // Used for modifying a boundary node if its following character is - // known to be word and/or non-word. - AFTER_NONWORD_CHARACTER, - AFTER_WORD_CHARACTER + AFTER_NEWLINE }; static AssertionNode* AtEnd(RegExpNode* on_success) { return new AssertionNode(AT_END, on_success); @@ -1042,12 +895,16 @@ class AssertionNode: public SeqRegExpNode { bool not_at_start); virtual void FillInBMInfo( int offset, BoyerMooreLookahead* bm, bool not_at_start); - virtual int ComputeFirstCharacterSet(int budget); virtual AssertionNode* Clone() { return new AssertionNode(*this); } AssertionNodeType type() { return type_; } void set_type(AssertionNodeType type) { type_ = type; } private: + void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace); + enum IfPrevious { kIsNonWord, kIsWord }; + void BacktrackIfPrevious(RegExpCompiler* compiler, + Trace* trace, + IfPrevious backtrack_if_previous); AssertionNode(AssertionNodeType t, RegExpNode* on_success) : SeqRegExpNode(on_success), type_(t) { } AssertionNodeType type_; @@ -1076,13 +933,8 @@ class BackReferenceNode: public SeqRegExpNode { return; } virtual void FillInBMInfo( - int offset, BoyerMooreLookahead* bm, bool not_at_start) { - // Working out the set of characters that a backreference can match is too - // hard, so we just say that any character can match. - bm->SetRest(offset); - } + int offset, BoyerMooreLookahead* bm, bool not_at_start); virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); } - virtual int ComputeFirstCharacterSet(int budget); private: int start_reg_; @@ -1249,6 +1101,7 @@ class NegativeLookaheadChoiceNode: public ChoiceNode { virtual void FillInBMInfo( int offset, BoyerMooreLookahead* bm, bool not_at_start) { alternatives_->at(1).node()->FillInBMInfo(offset, bm, not_at_start); + if (offset == 0) set_bm_info(not_at_start, bm); } // For a negative lookahead we don't emit the quick check for the // alternative that is expected to fail. This is because quick check code @@ -1256,7 +1109,6 @@ class NegativeLookaheadChoiceNode: public ChoiceNode { // characters, but on a negative lookahead the negative branch did not take // part in that calculation (EatsAtLeast) so the assumptions don't hold. virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; } - virtual int ComputeFirstCharacterSet(int budget); }; @@ -1279,7 +1131,6 @@ class LoopChoiceNode: public ChoiceNode { bool not_at_start); virtual void FillInBMInfo( int offset, BoyerMooreLookahead* bm, bool not_at_start); - virtual int ComputeFirstCharacterSet(int budget); virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); } RegExpNode* loop_node() { return loop_node_; } RegExpNode* continue_node() { return continue_node_; } @@ -1300,6 +1151,146 @@ class LoopChoiceNode: public ChoiceNode { }; +// Improve the speed that we scan for an initial point where a non-anchored +// regexp can match by using a Boyer-Moore-like table. This is done by +// identifying non-greedy non-capturing loops in the nodes that eat any +// character one at a time. For example in the middle of the regexp +// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly +// inserted at the start of any non-anchored regexp. +// +// When we have found such a loop we look ahead in the nodes to find the set of +// characters that can come at given distances. For example for the regexp +// /.?foo/ we know that there are at least 3 characters ahead of us, and the +// sets of characters that can occur are [any, [f, o], [o]]. We find a range in +// the lookahead info where the set of characters is reasonably constrained. In +// our example this is from index 1 to 2 (0 is not constrained). We can now +// look 3 characters ahead and if we don't find one of [f, o] (the union of +// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). +// +// For Unicode input strings we do the same, but modulo 128. +// +// We also look at the first string fed to the regexp and use that to get a hint +// of the character frequencies in the inputs. This affects the assessment of +// whether the set of characters is 'reasonably constrained'. +// +// We also have another lookahead mechanism (called quick check in the code), +// which uses a wide load of multiple characters followed by a mask and compare +// to determine whether a match is possible at this point. +enum ContainedInLattice { + kNotYet = 0, + kLatticeIn = 1, + kLatticeOut = 2, + kLatticeUnknown = 3 // Can also mean both in and out. +}; + + +inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { + return static_cast(a | b); +} + + +ContainedInLattice AddRange(ContainedInLattice a, + const int* ranges, + int ranges_size, + Interval new_range); + + +class BoyerMoorePositionInfo : public ZoneObject { + public: + BoyerMoorePositionInfo() + : map_(new ZoneList(kMapSize)), + map_count_(0), + w_(kNotYet), + s_(kNotYet), + d_(kNotYet), + surrogate_(kNotYet) { + for (int i = 0; i < kMapSize; i++) { + map_->Add(false); + } + } + + bool& at(int i) { return map_->at(i); } + + static const int kMapSize = 128; + static const int kMask = kMapSize - 1; + + int map_count() const { return map_count_; } + + void Set(int character); + void SetInterval(const Interval& interval); + void SetAll(); + bool is_non_word() { return w_ == kLatticeOut; } + bool is_word() { return w_ == kLatticeIn; } + + private: + ZoneList* map_; + int map_count_; // Number of set bits in the map. + ContainedInLattice w_; // The \w character class. + ContainedInLattice s_; // The \s character class. + ContainedInLattice d_; // The \d character class. + ContainedInLattice surrogate_; // Surrogate UTF-16 code units. +}; + + +class BoyerMooreLookahead : public ZoneObject { + public: + BoyerMooreLookahead(int length, RegExpCompiler* compiler); + + int length() { return length_; } + int max_char() { return max_char_; } + RegExpCompiler* compiler() { return compiler_; } + + int Count(int map_number) { + return bitmaps_->at(map_number)->map_count(); + } + + BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); } + + void Set(int map_number, int character) { + if (character > max_char_) return; + BoyerMoorePositionInfo* info = bitmaps_->at(map_number); + info->Set(character); + } + + void SetInterval(int map_number, const Interval& interval) { + if (interval.from() > max_char_) return; + BoyerMoorePositionInfo* info = bitmaps_->at(map_number); + if (interval.to() > max_char_) { + info->SetInterval(Interval(interval.from(), max_char_)); + } else { + info->SetInterval(interval); + } + } + + void SetAll(int map_number) { + bitmaps_->at(map_number)->SetAll(); + } + + void SetRest(int from_map) { + for (int i = from_map; i < length_; i++) SetAll(i); + } + bool EmitSkipInstructions(RegExpMacroAssembler* masm); + + private: + // This is the value obtained by EatsAtLeast. If we do not have at least this + // many characters left in the sample string then the match is bound to fail. + // Therefore it is OK to read a character this far ahead of the current match + // point. + int length_; + RegExpCompiler* compiler_; + // 0x7f for ASCII, 0xffff for UTF-16. + int max_char_; + ZoneList* bitmaps_; + + int GetSkipTable(int min_lookahead, + int max_lookahead, + Handle boolean_skip_table); + bool FindWorthwhileInterval(int* from, int* to); + int FindBestInterval( + int max_number_of_chars, int old_biggest_points, int* from, int* to); +}; + + // There are many ways to generate code for a node. This class encapsulates // the current way we should be generating. In other words it encapsulates // the current state of the code generator. The effect of this is that we diff --git a/test/cctest/test-regexp.cc b/test/cctest/test-regexp.cc index 54898a0..e89e6cd 100644 --- a/test/cctest/test-regexp.cc +++ b/test/cctest/test-regexp.cc @@ -1590,7 +1590,7 @@ TEST(CharClassDifference) { ZoneScope zone_scope(Isolate::Current(), DELETE_ON_EXIT); ZoneList* base = new ZoneList(1); base->Add(CharacterRange::Everything()); - Vector overlay = CharacterRange::GetWordBounds(); + Vector overlay = CharacterRange::GetWordBounds(); ZoneList* included = NULL; ZoneList* excluded = NULL; CharacterRange::Split(base, overlay, &included, &excluded); @@ -1599,7 +1599,7 @@ TEST(CharClassDifference) { if (in_base) { bool in_overlay = false; for (int j = 0; !in_overlay && j < overlay.length(); j += 2) { - if (overlay[j] <= i && i <= overlay[j+1]) + if (overlay[j] <= i && i < overlay[j+1]) in_overlay = true; } CHECK_EQ(in_overlay, InClass(i, included)); @@ -1672,16 +1672,6 @@ TEST(CanonicalizeCharacterSets) { ASSERT_EQ(30, list->at(0).to()); } -// Checks whether a character is in the set represented by a list of ranges. -static bool CharacterInSet(ZoneList* set, uc16 value) { - for (int i = 0; i < set->length(); i++) { - CharacterRange range = set->at(i); - if (range.from() <= value && value <= range.to()) { - return true; - } - } - return false; -} TEST(CharacterRangeMerge) { v8::internal::V8::Initialize(NULL); @@ -1768,67 +1758,6 @@ TEST(CharacterRangeMerge) { ZoneList first_only(4); ZoneList second_only(4); ZoneList both(4); - - // Merge one direction. - CharacterRange::Merge(&l1, &l2, &first_only, &second_only, &both); - - CHECK(CharacterRange::IsCanonical(&first_only)); - CHECK(CharacterRange::IsCanonical(&second_only)); - CHECK(CharacterRange::IsCanonical(&both)); - - for (uc16 i = 0; i < offset; i++) { - bool in_first = CharacterInSet(&l1, i); - bool in_second = CharacterInSet(&l2, i); - CHECK((in_first && !in_second) == CharacterInSet(&first_only, i)); - CHECK((!in_first && in_second) == CharacterInSet(&second_only, i)); - CHECK((in_first && in_second) == CharacterInSet(&both, i)); - } - - first_only.Clear(); - second_only.Clear(); - both.Clear(); - - // Merge other direction. - CharacterRange::Merge(&l2, &l1, &second_only, &first_only, &both); - - CHECK(CharacterRange::IsCanonical(&first_only)); - CHECK(CharacterRange::IsCanonical(&second_only)); - CHECK(CharacterRange::IsCanonical(&both)); - - for (uc16 i = 0; i < offset; i++) { - bool in_first = CharacterInSet(&l1, i); - bool in_second = CharacterInSet(&l2, i); - CHECK((in_first && !in_second) == CharacterInSet(&first_only, i)); - CHECK((!in_first && in_second) == CharacterInSet(&second_only, i)); - CHECK((in_first && in_second) == CharacterInSet(&both, i)); - } - - first_only.Clear(); - second_only.Clear(); - both.Clear(); - - // Merge but don't record all combinations. - CharacterRange::Merge(&l1, &l2, NULL, NULL, &both); - - CHECK(CharacterRange::IsCanonical(&both)); - - for (uc16 i = 0; i < offset; i++) { - bool in_first = CharacterInSet(&l1, i); - bool in_second = CharacterInSet(&l2, i); - CHECK((in_first && in_second) == CharacterInSet(&both, i)); - } - - // Merge into same set. - ZoneList all(4); - CharacterRange::Merge(&l1, &l2, &all, &all, &all); - - CHECK(CharacterRange::IsCanonical(&all)); - - for (uc16 i = 0; i < offset; i++) { - bool in_first = CharacterInSet(&l1, i); - bool in_second = CharacterInSet(&l2, i); - CHECK((in_first || in_second) == CharacterInSet(&all, i)); - } } -- 2.7.4