From 05507cc3ea6e96e9e9df98d276c235eea32374dd Mon Sep 17 00:00:00 2001 From: erikcorry Date: Wed, 10 Jun 2015 02:55:22 -0700 Subject: [PATCH] Reland II of 'Optimize trivial regexp disjunctions' CL 1176453002 This change rewrites regexps like (ab|ac|z|ad|ae|af) into (a[b-f]|z). We can only reorder disjunctions like this for case-dependent regexps. For case-independent regexps, the disjunctions should be pre-sorted for best results. R=yangguo@chromium.org BUG=chromium:482998 LOG=n Review URL: https://codereview.chromium.org/1180433003 Cr-Commit-Position: refs/heads/master@{#28902} --- src/ast.h | 3 + src/jsregexp.cc | 190 +++++++++++++++++++ src/jsregexp.h | 2 +- src/list-inl.h | 35 +++- src/list.h | 5 + src/vector.h | 14 ++ test/cctest/test-heap.cc | 10 +- test/mjsunit/regress/regress-crbug-482998.js | 22 +++ 8 files changed, 273 insertions(+), 8 deletions(-) create mode 100644 test/mjsunit/regress/regress-crbug-482998.js diff --git a/src/ast.h b/src/ast.h index 01154d376..0dc9e6066 100644 --- a/src/ast.h +++ b/src/ast.h @@ -2872,6 +2872,9 @@ class RegExpDisjunction final : public RegExpTree { int max_match() override { return max_match_; } ZoneList* alternatives() { return alternatives_; } private: + bool SortConsecutiveAtoms(RegExpCompiler* compiler); + void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); + void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); ZoneList* alternatives_; int min_match_; int max_match_; diff --git a/src/jsregexp.cc b/src/jsregexp.cc index b410d47b3..e284e8cb1 100644 --- a/src/jsregexp.cc +++ b/src/jsregexp.cc @@ -4817,10 +4817,200 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, } +int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { + RegExpAtom* atom1 = (*a)->AsAtom(); + RegExpAtom* atom2 = (*b)->AsAtom(); + uc16 character1 = atom1->data().at(0); + uc16 character2 = atom2->data().at(0); + if (character1 < character2) return -1; + if (character1 > character2) return 1; + return 0; +} + + +// We can stable sort runs of atoms, since the order does not matter if they +// start with different characters. +// Returns true if any consecutive atoms were found. +bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { + ZoneList* alternatives = this->alternatives(); + int length = alternatives->length(); + bool found_consecutive_atoms = false; + for (int i = 0; i < length; i++) { + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (alternative->IsAtom()) break; + i++; + } + // i is length or it is the index of an atom. + if (i == length) break; + int first_atom = i; + i++; + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (!alternative->IsAtom()) break; + i++; + } + // Sort atoms to get ones with common prefixes together. + // This step is not valid if we are in a case-independent regexp, + // because it would change /is|I/ to /I|is/, and order matters when + // the regexp parts don't match only disjoint starting points. To fix + // this would need a version of CompareFirstChar that uses case- + // independent character classes for comparison. + if (!compiler->ignore_case()) { + DCHECK_LT(first_atom, alternatives->length()); + DCHECK_LE(i, alternatives->length()); + DCHECK_LE(first_atom, i); + alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); + } + if (i - first_atom > 1) found_consecutive_atoms = true; + } + return found_consecutive_atoms; +} + + +// Optimizes ab|ac|az to a(?:b|c|d). +void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { + Zone* zone = compiler->zone(); + ZoneList* alternatives = this->alternatives(); + int length = alternatives->length(); + + int write_posn = 0; + int i = 0; + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (!alternative->IsAtom()) { + alternatives->at(write_posn++) = alternatives->at(i); + i++; + continue; + } + RegExpAtom* atom = alternative->AsAtom(); + uc16 common_prefix = atom->data().at(0); + int first_with_prefix = i; + int prefix_length = atom->length(); + i++; + while (i < length) { + alternative = alternatives->at(i); + if (!alternative->IsAtom()) break; + atom = alternative->AsAtom(); + if (atom->data().at(0) != common_prefix) break; + prefix_length = Min(prefix_length, atom->length()); + i++; + } + if (i > first_with_prefix + 2) { + // Found worthwhile run of alternatives with common prefix of at least one + // character. The sorting function above did not sort on more than one + // character for reasons of correctness, but there may still be a longer + // common prefix if the terms were similar or presorted in the input. + // Find out how long the common prefix is. + int run_length = i - first_with_prefix; + atom = alternatives->at(first_with_prefix)->AsAtom(); + for (int j = 1; j < run_length && prefix_length > 1; j++) { + RegExpAtom* old_atom = + alternatives->at(j + first_with_prefix)->AsAtom(); + for (int k = 1; k < prefix_length; k++) { + if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k; + } + } + RegExpAtom* prefix = + new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); + ZoneList* pair = new (zone) ZoneList(2, zone); + pair->Add(prefix, zone); + ZoneList* suffixes = + new (zone) ZoneList(run_length, zone); + for (int j = 0; j < run_length; j++) { + RegExpAtom* old_atom = + alternatives->at(j + first_with_prefix)->AsAtom(); + int len = old_atom->length(); + if (len == prefix_length) { + suffixes->Add(new (zone) RegExpEmpty(), zone); + } else { + RegExpTree* suffix = new (zone) RegExpAtom( + old_atom->data().SubVector(prefix_length, old_atom->length())); + suffixes->Add(suffix, zone); + } + } + pair->Add(new (zone) RegExpDisjunction(suffixes), zone); + alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair); + } else { + // Just copy any non-worthwhile alternatives. + for (int j = first_with_prefix; j < i; j++) { + alternatives->at(write_posn++) = alternatives->at(j); + } + } + } + alternatives->Rewind(write_posn); // Trim end of array. +} + + +// Optimizes b|c|z to [bcz]. +void RegExpDisjunction::FixSingleCharacterDisjunctions( + RegExpCompiler* compiler) { + Zone* zone = compiler->zone(); + ZoneList* alternatives = this->alternatives(); + int length = alternatives->length(); + + int write_posn = 0; + int i = 0; + while (i < length) { + RegExpTree* alternative = alternatives->at(i); + if (!alternative->IsAtom()) { + alternatives->at(write_posn++) = alternatives->at(i); + i++; + continue; + } + RegExpAtom* atom = alternative->AsAtom(); + if (atom->length() != 1) { + alternatives->at(write_posn++) = alternatives->at(i); + i++; + continue; + } + int first_in_run = i; + i++; + while (i < length) { + alternative = alternatives->at(i); + if (!alternative->IsAtom()) break; + atom = alternative->AsAtom(); + if (atom->length() != 1) break; + i++; + } + if (i > first_in_run + 1) { + // Found non-trivial run of single-character alternatives. + int run_length = i - first_in_run; + ZoneList* ranges = + new (zone) ZoneList(2, zone); + for (int j = 0; j < run_length; j++) { + RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); + DCHECK_EQ(old_atom->length(), 1); + ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); + } + alternatives->at(write_posn++) = + new (zone) RegExpCharacterClass(ranges, false); + } else { + // Just copy any trivial alternatives. + for (int j = first_in_run; j < i; j++) { + alternatives->at(write_posn++) = alternatives->at(j); + } + } + } + alternatives->Rewind(write_posn); // Trim end of array. +} + + RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { ZoneList* alternatives = this->alternatives(); + + if (alternatives->length() > 2) { + bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); + if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); + FixSingleCharacterDisjunctions(compiler); + if (alternatives->length() == 1) { + return alternatives->at(0)->ToNode(compiler, on_success); + } + } + int length = alternatives->length(); + ChoiceNode* result = new(compiler->zone()) ChoiceNode(length, compiler->zone()); for (int i = 0; i < length; i++) { diff --git a/src/jsregexp.h b/src/jsregexp.h index 6e41c9fa5..ff7759bfe 100644 --- a/src/jsregexp.h +++ b/src/jsregexp.h @@ -212,7 +212,7 @@ class RegExpImpl { // and the total executable memory at any point. static const int kRegExpExecutableMemoryLimit = 16 * MB; static const int kRegExpCompiledLimit = 1 * MB; - static const int kRegExpTooLargeToOptimize = 10 * KB; + static const int kRegExpTooLargeToOptimize = 20 * KB; private: static bool CompileIrregexp(Handle re, diff --git a/src/list-inl.h b/src/list-inl.h index 9b122fdba..c09788e9a 100644 --- a/src/list-inl.h +++ b/src/list-inl.h @@ -195,10 +195,15 @@ int List::CountOccurrences(const T& elm, int start, int end) const { template void List::Sort(int (*cmp)(const T* x, const T* y)) { - ToVector().Sort(cmp); + Sort(cmp, 0, length_); +} + + +template +void List::Sort(int (*cmp)(const T* x, const T* y), size_t s, size_t l) { + ToVector().Sort(cmp, s, l); #ifdef DEBUG - for (int i = 1; i < length_; i++) - DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0); + for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0); #endif } @@ -209,7 +214,29 @@ void List::Sort() { } -template +template +void List::StableSort(int (*cmp)(const T* x, const T* y)) { + StableSort(cmp, 0, length_); +} + + +template +void List::StableSort(int (*cmp)(const T* x, const T* y), size_t s, + size_t l) { + ToVector().StableSort(cmp, s, l); +#ifdef DEBUG + for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0); +#endif +} + + +template +void List::StableSort() { + ToVector().StableSort(); +} + + +template void List::Initialize(int capacity, P allocator) { DCHECK(capacity >= 0); data_ = (capacity > 0) ? NewData(capacity, allocator) : NULL; diff --git a/src/list.h b/src/list.h index 021cafe14..00cbd4031 100644 --- a/src/list.h +++ b/src/list.h @@ -149,8 +149,13 @@ class List { void Iterate(Visitor* visitor); // Sort all list entries (using QuickSort) + void Sort(int (*cmp)(const T* x, const T* y), size_t start, size_t length); void Sort(int (*cmp)(const T* x, const T* y)); void Sort(); + void StableSort(int (*cmp)(const T* x, const T* y), size_t start, + size_t length); + void StableSort(int (*cmp)(const T* x, const T* y)); + void StableSort(); INLINE(void Initialize(int capacity, AllocationPolicy allocator = AllocationPolicy())); diff --git a/src/vector.h b/src/vector.h index 895c61b4e..d022fde3a 100644 --- a/src/vector.h +++ b/src/vector.h @@ -69,6 +69,10 @@ class Vector { return Vector(result, length_); } + void Sort(int (*cmp)(const T*, const T*), size_t s, size_t l) { + std::sort(start() + s, start() + s + l, RawComparer(cmp)); + } + void Sort(int (*cmp)(const T*, const T*)) { std::sort(start(), start() + length(), RawComparer(cmp)); } @@ -77,6 +81,16 @@ class Vector { std::sort(start(), start() + length()); } + void StableSort(int (*cmp)(const T*, const T*), size_t s, size_t l) { + std::stable_sort(start() + s, start() + s + l, RawComparer(cmp)); + } + + void StableSort(int (*cmp)(const T*, const T*)) { + std::stable_sort(start(), start() + length(), RawComparer(cmp)); + } + + void StableSort() { std::stable_sort(start(), start() + length()); } + void Truncate(int length) { DCHECK(length <= length_); length_ = length; diff --git a/test/cctest/test-heap.cc b/test/cctest/test-heap.cc index 02cd5608b..51b829bec 100644 --- a/test/cctest/test-heap.cc +++ b/test/cctest/test-heap.cc @@ -1747,13 +1747,13 @@ TEST(TestSizeOfRegExpCode) { // Adjust source below and this check to match // RegExpImple::kRegExpTooLargeToOptimize. - DCHECK_EQ(i::RegExpImpl::kRegExpTooLargeToOptimize, 10 * KB); + DCHECK_EQ(i::RegExpImpl::kRegExpTooLargeToOptimize, 20 * KB); // Compile a regexp that is much larger if we are using regexp optimizations. CompileRun( "var reg_exp_source = '(?:a|bc|def|ghij|klmno|pqrstu)';" "var half_size_reg_exp;" - "while (reg_exp_source.length < 10 * 1024) {" + "while (reg_exp_source.length < 20 * 1024) {" " half_size_reg_exp = reg_exp_source;" " reg_exp_source = reg_exp_source + reg_exp_source;" "}" @@ -1784,7 +1784,11 @@ TEST(TestSizeOfRegExpCode) { int size_of_regexp_code = size_with_regexp - initial_size; - CHECK_LE(size_of_regexp_code, 1 * MB); + // On some platforms the debug-code flag causes huge amounts of regexp code + // to be emitted, breaking this test. + if (!FLAG_debug_code) { + CHECK_LE(size_of_regexp_code, 1 * MB); + } // Small regexp is half the size, but compiles to more than twice the code // due to the optimization steps. diff --git a/test/mjsunit/regress/regress-crbug-482998.js b/test/mjsunit/regress/regress-crbug-482998.js new file mode 100644 index 000000000..94ff5008e --- /dev/null +++ b/test/mjsunit/regress/regress-crbug-482998.js @@ -0,0 +1,22 @@ +// Copyright 2015 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Should not time out. Running time 0.5s vs. 120s before the change. +function collapse() { + var src = "(?:"; + for (var i = 128; i < 0x1000; i++) { + src += "a" + String.fromCharCode(i) + "|"; + } + src += "aa)"; + var collapsible = new RegExp(src); + var subject = "zzzzzzz" + String.fromCharCode(3000); + for (var i = 0; i < 1000; i++) { + subject += "xxxxxxx"; + } + for (var i = 0; i < 2000; i++) { + assertFalse(collapsible.test(subject)); + } +} + +collapse(); -- 2.34.1