Reland II of 'Optimize trivial regexp disjunctions' CL 1176453002

author erikcorry <erikcorry@chromium.org>

Wed, 10 Jun 2015 09:55:22 +0000 (02:55 -0700)

committer Commit bot <commit-bot@chromium.org>

Wed, 10 Jun 2015 09:55:31 +0000 (09:55 +0000)
author erikcorry <erikcorry@chromium.org>
Wed, 10 Jun 2015 09:55:22 +0000 (02:55 -0700)
committer Commit bot <commit-bot@chromium.org>
Wed, 10 Jun 2015 09:55:31 +0000 (09:55 +0000)
diff --git a/src/ast.h b/src/ast.h

index 01154d37628a9f8518497f8428c8523922db54be..0dc9e6066630898e326880092c60b157e4e8e6a7 100644 (file)
--- a/src/ast.h
+++ b/src/ast.h
@@ -2872,6 +2872,9 @@ class RegExpDisjunction final : public RegExpTree {
    int max_match() override { return max_match_; }
    ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
   private:
+  bool SortConsecutiveAtoms(RegExpCompiler* compiler);
+  void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
+  void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
    ZoneList<RegExpTree*>* alternatives_;
    int min_match_;
    int max_match_;
diff --git a/src/jsregexp.cc b/src/jsregexp.cc

index b410d47b3d1fd41d93dd5d26cf8386050375745e..e284e8cb15f3233eee2533bc9efda8b6735bd1e3 100644 (file)
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -4817,10 +4817,200 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
  }
  
  
+int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
+  RegExpAtom* atom1 = (*a)->AsAtom();
+  RegExpAtom* atom2 = (*b)->AsAtom();
+  uc16 character1 = atom1->data().at(0);
+  uc16 character2 = atom2->data().at(0);
+  if (character1 < character2) return -1;
+  if (character1 > character2) return 1;
+  return 0;
+}
+
+
+// We can stable sort runs of atoms, since the order does not matter if they
+// start with different characters.
+// Returns true if any consecutive atoms were found.
+bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
+  ZoneList<RegExpTree*>* alternatives = this->alternatives();
+  int length = alternatives->length();
+  bool found_consecutive_atoms = false;
+  for (int i = 0; i < length; i++) {
+    while (i < length) {
+      RegExpTree* alternative = alternatives->at(i);
+      if (alternative->IsAtom()) break;
+      i++;
+    }
+    // i is length or it is the index of an atom.
+    if (i == length) break;
+    int first_atom = i;
+    i++;
+    while (i < length) {
+      RegExpTree* alternative = alternatives->at(i);
+      if (!alternative->IsAtom()) break;
+      i++;
+    }
+    // Sort atoms to get ones with common prefixes together.
+    // This step is not valid if we are in a case-independent regexp,
+    // because it would change /is|I/ to /I|is/, and order matters when
+    // two alternatives can match at the same starting point.  To fix
+    // this we would need a version of CompareFirstChar that uses case-
+    // independent character classes for comparison.
+    if (!compiler->ignore_case()) {
+      DCHECK_LT(first_atom, alternatives->length());
+      DCHECK_LE(i, alternatives->length());
+      DCHECK_LE(first_atom, i);
+      alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
+    }
+    if (i - first_atom > 1) found_consecutive_atoms = true;
+  }
+  return found_consecutive_atoms;
+}
+
+
+// Optimizes ab|ac|az to a(?:b|c|z).
+void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
+  Zone* zone = compiler->zone();
+  ZoneList<RegExpTree*>* alternatives = this->alternatives();
+  int length = alternatives->length();
+
+  int write_posn = 0;
+  int i = 0;
+  while (i < length) {
+    RegExpTree* alternative = alternatives->at(i);
+    if (!alternative->IsAtom()) {
+      alternatives->at(write_posn++) = alternatives->at(i);
+      i++;
+      continue;
+    }
+    RegExpAtom* atom = alternative->AsAtom();
+    uc16 common_prefix = atom->data().at(0);
+    int first_with_prefix = i;
+    int prefix_length = atom->length();
+    i++;
+    while (i < length) {
+      alternative = alternatives->at(i);
+      if (!alternative->IsAtom()) break;
+      atom = alternative->AsAtom();
+      if (atom->data().at(0) != common_prefix) break;
+      prefix_length = Min(prefix_length, atom->length());
+      i++;
+    }
+    if (i > first_with_prefix + 2) {
+      // Found worthwhile run of alternatives with common prefix of at least one
+      // character.  The sorting function above did not sort on more than one
+      // character for reasons of correctness, but there may still be a longer
+      // common prefix if the terms were similar or presorted in the input.
+      // Find out how long the common prefix is.
+      int run_length = i - first_with_prefix;
+      atom = alternatives->at(first_with_prefix)->AsAtom();
+      for (int j = 1; j < run_length && prefix_length > 1; j++) {
+        RegExpAtom* old_atom =
+            alternatives->at(j + first_with_prefix)->AsAtom();
+        for (int k = 1; k < prefix_length; k++) {
+          if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;
+        }
+      }
+      RegExpAtom* prefix =
+          new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
+      ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
+      pair->Add(prefix, zone);
+      ZoneList<RegExpTree*>* suffixes =
+          new (zone) ZoneList<RegExpTree*>(run_length, zone);
+      for (int j = 0; j < run_length; j++) {
+        RegExpAtom* old_atom =
+            alternatives->at(j + first_with_prefix)->AsAtom();
+        int len = old_atom->length();
+        if (len == prefix_length) {
+          suffixes->Add(new (zone) RegExpEmpty(), zone);
+        } else {
+          RegExpTree* suffix = new (zone) RegExpAtom(
+              old_atom->data().SubVector(prefix_length, old_atom->length()));
+          suffixes->Add(suffix, zone);
+        }
+      }
+      pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
+      alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
+    } else {
+      // Just copy any non-worthwhile alternatives.
+      for (int j = first_with_prefix; j < i; j++) {
+        alternatives->at(write_posn++) = alternatives->at(j);
+      }
+    }
+  }
+  alternatives->Rewind(write_posn);  // Trim end of array.
+}
+
+
+// Optimizes b|c|z to [bcz].
+void RegExpDisjunction::FixSingleCharacterDisjunctions(
+    RegExpCompiler* compiler) {
+  Zone* zone = compiler->zone();
+  ZoneList<RegExpTree*>* alternatives = this->alternatives();
+  int length = alternatives->length();
+
+  int write_posn = 0;
+  int i = 0;
+  while (i < length) {
+    RegExpTree* alternative = alternatives->at(i);
+    if (!alternative->IsAtom()) {
+      alternatives->at(write_posn++) = alternatives->at(i);
+      i++;
+      continue;
+    }
+    RegExpAtom* atom = alternative->AsAtom();
+    if (atom->length() != 1) {
+      alternatives->at(write_posn++) = alternatives->at(i);
+      i++;
+      continue;
+    }
+    int first_in_run = i;
+    i++;
+    while (i < length) {
+      alternative = alternatives->at(i);
+      if (!alternative->IsAtom()) break;
+      atom = alternative->AsAtom();
+      if (atom->length() != 1) break;
+      i++;
+    }
+    if (i > first_in_run + 1) {
+      // Found non-trivial run of single-character alternatives.
+      int run_length = i - first_in_run;
+      ZoneList<CharacterRange>* ranges =
+          new (zone) ZoneList<CharacterRange>(2, zone);
+      for (int j = 0; j < run_length; j++) {
+        RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
+        DCHECK_EQ(old_atom->length(), 1);
+        ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
+      }
+      alternatives->at(write_posn++) =
+          new (zone) RegExpCharacterClass(ranges, false);
+    } else {
+      // Just copy any trivial alternatives.
+      for (int j = first_in_run; j < i; j++) {
+        alternatives->at(write_posn++) = alternatives->at(j);
+      }
+    }
+  }
+  alternatives->Rewind(write_posn);  // Trim end of array.
+}
+
+
  RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
                                        RegExpNode* on_success) {
    ZoneList<RegExpTree*>* alternatives = this->alternatives();
+
+  if (alternatives->length() > 2) {
+    bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
+    if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
+    FixSingleCharacterDisjunctions(compiler);
+    if (alternatives->length() == 1) {
+      return alternatives->at(0)->ToNode(compiler, on_success);
+    }
+  }
+
    int length = alternatives->length();
+
    ChoiceNode* result =
        new(compiler->zone()) ChoiceNode(length, compiler->zone());
    for (int i = 0; i < length; i++) {
diff --git a/src/jsregexp.h b/src/jsregexp.h

index 6e41c9fa56ee7e87236fde0aec7aa274db7fa990..ff7759bfec631bde43a0f59b2f61622bfdd79c62 100644 (file)
--- a/src/jsregexp.h
+++ b/src/jsregexp.h
@@ -212,7 +212,7 @@ class RegExpImpl {
    // and the total executable memory at any point.
    static const int kRegExpExecutableMemoryLimit = 16 * MB;
    static const int kRegExpCompiledLimit = 1 * MB;
-  static const int kRegExpTooLargeToOptimize = 10 * KB;
+  static const int kRegExpTooLargeToOptimize = 20 * KB;
  
   private:
    static bool CompileIrregexp(Handle<JSRegExp> re,
diff --git a/src/list-inl.h b/src/list-inl.h

index 9b122fdbae4417fc710b755cc4be9fe541f4b04b..c09788e9ae0e466e5639673c909215b4e52e8d8f 100644 (file)
--- a/src/list-inl.h
+++ b/src/list-inl.h
@@ -195,10 +195,15 @@ int List<T, P>::CountOccurrences(const T& elm, int start, int end) const {
  
  template<typename T, class P>
  void List<T, P>::Sort(int (*cmp)(const T* x, const T* y)) {
-  ToVector().Sort(cmp);
+  Sort(cmp, 0, length_);
+}
+
+
+template <typename T, class P>
+void List<T, P>::Sort(int (*cmp)(const T* x, const T* y), size_t s, size_t l) {
+  ToVector().Sort(cmp, s, l);
 #ifdef DEBUG
-  for (int i = 1; i < length_; i++)
-    DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
+  for (size_t i = s + 1; i < s + l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
 #endif
 }
  
@@ -209,7 +214,29 @@ void List<T, P>::Sort() {
  }
  
  
-template<typename T, class P>
+template <typename T, class P>
+void List<T, P>::StableSort(int (*cmp)(const T* x, const T* y)) {
+  StableSort(cmp, 0, length_);
+}
+
+
+template <typename T, class P>
+void List<T, P>::StableSort(int (*cmp)(const T* x, const T* y), size_t s,
+                            size_t l) {
+  ToVector().StableSort(cmp, s, l);
+#ifdef DEBUG
+  for (size_t i = s + 1; i < s + l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
+#endif
+}
+
+
+template <typename T, class P>
+void List<T, P>::StableSort() {
+  ToVector().StableSort();
+}
+
+
+template <typename T, class P>
  void List<T, P>::Initialize(int capacity, P allocator) {
    DCHECK(capacity >= 0);
    data_ = (capacity > 0) ? NewData(capacity, allocator) : NULL;
diff --git a/src/list.h b/src/list.h

index 021cafe1460383f266b055735ac3136925c4385e..00cbd40312025ad8ba2fcc40a1f17b2a8409cd22 100644 (file)
--- a/src/list.h
+++ b/src/list.h
@@ -149,8 +149,13 @@ class List {
    void Iterate(Visitor* visitor);
  
    // Sort all list entries (using QuickSort)
+  void Sort(int (*cmp)(const T* x, const T* y), size_t start, size_t length);
    void Sort(int (*cmp)(const T* x, const T* y));
    void Sort();
+  void StableSort(int (*cmp)(const T* x, const T* y), size_t start,
+                  size_t length);
+  void StableSort(int (*cmp)(const T* x, const T* y));
+  void StableSort();
  
    INLINE(void Initialize(int capacity,
                           AllocationPolicy allocator = AllocationPolicy()));
diff --git a/src/vector.h b/src/vector.h

index 895c61b4ece176acfe8820f13779140ff3c552c2..d022fde3a5bb25d88b7fba5e9ca4ae375e94d20b 100644 (file)
--- a/src/vector.h
+++ b/src/vector.h
@@ -69,6 +69,10 @@ class Vector {
      return Vector<T>(result, length_);
    }
  
+  void Sort(int (*cmp)(const T*, const T*), size_t s, size_t l) {
+    std::sort(start() + s, start() + s + l, RawComparer(cmp));
+  }
+
    void Sort(int (*cmp)(const T*, const T*)) {
      std::sort(start(), start() + length(), RawComparer(cmp));
    }
@@ -77,6 +81,16 @@ class Vector {
      std::sort(start(), start() + length());
    }
  
+  void StableSort(int (*cmp)(const T*, const T*), size_t s, size_t l) {
+    std::stable_sort(start() + s, start() + s + l, RawComparer(cmp));
+  }
+
+  void StableSort(int (*cmp)(const T*, const T*)) {
+    std::stable_sort(start(), start() + length(), RawComparer(cmp));
+  }
+
+  void StableSort() { std::stable_sort(start(), start() + length()); }
+
    void Truncate(int length) {
      DCHECK(length <= length_);
      length_ = length;
diff --git a/test/cctest/test-heap.cc b/test/cctest/test-heap.cc

index 02cd5608bef4aef12c7907c0e5ace11f72aa6b0e..51b829bec20845bcbf67fa0522836e8a006a8fcc 100644 (file)
--- a/test/cctest/test-heap.cc
+++ b/test/cctest/test-heap.cc
@@ -1747,13 +1747,13 @@ TEST(TestSizeOfRegExpCode) {
  
    // Adjust source below and this check to match
    // RegExpImple::kRegExpTooLargeToOptimize.
-  DCHECK_EQ(i::RegExpImpl::kRegExpTooLargeToOptimize, 10 * KB);
+  DCHECK_EQ(i::RegExpImpl::kRegExpTooLargeToOptimize, 20 * KB);
  
    // Compile a regexp that is much larger if we are using regexp optimizations.
    CompileRun(
        "var reg_exp_source = '(?:a|bc|def|ghij|klmno|pqrstu)';"
        "var half_size_reg_exp;"
-      "while (reg_exp_source.length < 10 * 1024) {"
+      "while (reg_exp_source.length < 20 * 1024) {"
        "  half_size_reg_exp = reg_exp_source;"
        "  reg_exp_source = reg_exp_source + reg_exp_source;"
        "}"
@@ -1784,7 +1784,11 @@ TEST(TestSizeOfRegExpCode) {
  
    int size_of_regexp_code = size_with_regexp - initial_size;
  
-  CHECK_LE(size_of_regexp_code, 1 * MB);
+  // On some platforms the debug-code flag causes huge amounts of regexp code
+  // to be emitted, breaking this test.
+  if (!FLAG_debug_code) {
+    CHECK_LE(size_of_regexp_code, 1 * MB);
+  }
  
    // Small regexp is half the size, but compiles to more than twice the code
    // due to the optimization steps.
diff --git a/test/mjsunit/regress/regress-crbug-482998.js b/test/mjsunit/regress/regress-crbug-482998.js

new file mode 100644 (file)

index 0000000..94ff500
--- /dev/null
+++ b/test/mjsunit/regress/regress-crbug-482998.js
@@ -0,0 +1,22 @@
+// Copyright 2015 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Should not time out.  Running time 0.5s vs. 120s before the change.
+function collapse() {
+  var src = "(?:";
+  for (var i = 128; i < 0x1000; i++) {
+    src += "a" + String.fromCharCode(i) + "|";
+  }
+  src += "aa)";
+  var collapsible = new RegExp(src);
+  var subject = "zzzzzzz" + String.fromCharCode(3000);
+  for (var i = 0; i < 1000; i++) {
+    subject += "xxxxxxx";
+  }
+  for (var i = 0; i < 2000; i++) {
+    assertFalse(collapsible.test(subject));
+  }
+}
+
+collapse();
author	erikcorry <erikcorry@chromium.org>
	Wed, 10 Jun 2015 09:55:22 +0000 (02:55 -0700)
committer	Commit bot <commit-bot@chromium.org>
	Wed, 10 Jun 2015 09:55:31 +0000 (09:55 +0000)
src/ast.h		patch \| blob \| history
src/jsregexp.cc		patch \| blob \| history
src/jsregexp.h		patch \| blob \| history
src/list-inl.h		patch \| blob \| history
src/list.h		patch \| blob \| history
src/vector.h		patch \| blob \| history
test/cctest/test-heap.cc		patch \| blob \| history
test/mjsunit/regress/regress-crbug-482998.js	[new file with mode: 0644]	patch \| blob