From 05507cc3ea6e96e9e9df98d276c235eea32374dd Mon Sep 17 00:00:00 2001
From: erikcorry <erikcorry@chromium.org>
Date: Wed, 10 Jun 2015 02:55:22 -0700
Subject: [PATCH] Reland II of 'Optimize trivial regexp disjunctions' CL
 1176453002

This change rewrites regexps like (ab|ac|z|ad|ae|af) into (a[b-f]|z).  We can only reorder disjunctions like this for case-dependent regexps.  For case-independent regexps, the disjunctions should be pre-sorted for best results.

R=yangguo@chromium.org
BUG=chromium:482998
LOG=n

Review URL: https://codereview.chromium.org/1180433003

Cr-Commit-Position: refs/heads/master@{#28902}
---
 src/ast.h                                    |   3 +
 src/jsregexp.cc                              | 190 +++++++++++++++++++
 src/jsregexp.h                               |   2 +-
 src/list-inl.h                               |  35 +++-
 src/list.h                                   |   5 +
 src/vector.h                                 |  14 ++
 test/cctest/test-heap.cc                     |  10 +-
 test/mjsunit/regress/regress-crbug-482998.js |  22 +++
 8 files changed, 273 insertions(+), 8 deletions(-)
 create mode 100644 test/mjsunit/regress/regress-crbug-482998.js

diff --git a/src/ast.h b/src/ast.h
index 01154d376..0dc9e6066 100644
--- a/src/ast.h
+++ b/src/ast.h
@@ -2872,6 +2872,9 @@ class RegExpDisjunction final : public RegExpTree {
   int max_match() override { return max_match_; }
   ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
  private:
+  bool SortConsecutiveAtoms(RegExpCompiler* compiler);
+  void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
+  void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
   ZoneList<RegExpTree*>* alternatives_;
   int min_match_;
   int max_match_;
diff --git a/src/jsregexp.cc b/src/jsregexp.cc
index b410d47b3..e284e8cb1 100644
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -4817,10 +4817,200 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
 }
 
 
+int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
+  RegExpAtom* atom1 = (*a)->AsAtom();
+  RegExpAtom* atom2 = (*b)->AsAtom();
+  uc16 character1 = atom1->data().at(0);
+  uc16 character2 = atom2->data().at(0);
+  if (character1 < character2) return -1;
+  if (character1 > character2) return 1;
+  return 0;
+}
+
+
+// We can stable sort runs of atoms, since the order does not matter if they
+// start with different characters.
+// Returns true if any consecutive atoms were found.
+bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
+  ZoneList<RegExpTree*>* alternatives = this->alternatives();
+  int length = alternatives->length();
+  bool found_consecutive_atoms = false;
+  for (int i = 0; i < length; i++) {
+    while (i < length) {
+      RegExpTree* alternative = alternatives->at(i);
+      if (alternative->IsAtom()) break;
+      i++;
+    }
+    // i is length or it is the index of an atom.
+    if (i == length) break;
+    int first_atom = i;
+    i++;
+    while (i < length) {
+      RegExpTree* alternative = alternatives->at(i);
+      if (!alternative->IsAtom()) break;
+      i++;
+    }
+    // Sort atoms to get ones with common prefixes together.
+    // This step is not valid if we are in a case-independent regexp,
+    // because it would change /is|I/ to /I|is/, and order matters when
+    // the regexp parts don't match only disjoint starting points. To fix
+    // this would need a version of CompareFirstChar that uses case-
+    // independent character classes for comparison.
+    if (!compiler->ignore_case()) {
+      DCHECK_LT(first_atom, alternatives->length());
+      DCHECK_LE(i, alternatives->length());
+      DCHECK_LE(first_atom, i);
+      alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
+    }
+    if (i - first_atom > 1) found_consecutive_atoms = true;
+  }
+  return found_consecutive_atoms;
+}
+
+
+// Optimizes ab|ac|az to a(?:b|c|d).
+void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
+  Zone* zone = compiler->zone();
+  ZoneList<RegExpTree*>* alternatives = this->alternatives();
+  int length = alternatives->length();
+
+  int write_posn = 0;
+  int i = 0;
+  while (i < length) {
+    RegExpTree* alternative = alternatives->at(i);
+    if (!alternative->IsAtom()) {
+      alternatives->at(write_posn++) = alternatives->at(i);
+      i++;
+      continue;
+    }
+    RegExpAtom* atom = alternative->AsAtom();
+    uc16 common_prefix = atom->data().at(0);
+    int first_with_prefix = i;
+    int prefix_length = atom->length();
+    i++;
+    while (i < length) {
+      alternative = alternatives->at(i);
+      if (!alternative->IsAtom()) break;
+      atom = alternative->AsAtom();
+      if (atom->data().at(0) != common_prefix) break;
+      prefix_length = Min(prefix_length, atom->length());
+      i++;
+    }
+    if (i > first_with_prefix + 2) {
+      // Found worthwhile run of alternatives with common prefix of at least one
+      // character.  The sorting function above did not sort on more than one
+      // character for reasons of correctness, but there may still be a longer
+      // common prefix if the terms were similar or presorted in the input.
+      // Find out how long the common prefix is.
+      int run_length = i - first_with_prefix;
+      atom = alternatives->at(first_with_prefix)->AsAtom();
+      for (int j = 1; j < run_length && prefix_length > 1; j++) {
+        RegExpAtom* old_atom =
+            alternatives->at(j + first_with_prefix)->AsAtom();
+        for (int k = 1; k < prefix_length; k++) {
+          if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;
+        }
+      }
+      RegExpAtom* prefix =
+          new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
+      ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
+      pair->Add(prefix, zone);
+      ZoneList<RegExpTree*>* suffixes =
+          new (zone) ZoneList<RegExpTree*>(run_length, zone);
+      for (int j = 0; j < run_length; j++) {
+        RegExpAtom* old_atom =
+            alternatives->at(j + first_with_prefix)->AsAtom();
+        int len = old_atom->length();
+        if (len == prefix_length) {
+          suffixes->Add(new (zone) RegExpEmpty(), zone);
+        } else {
+          RegExpTree* suffix = new (zone) RegExpAtom(
+              old_atom->data().SubVector(prefix_length, old_atom->length()));
+          suffixes->Add(suffix, zone);
+        }
+      }
+      pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
+      alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
+    } else {
+      // Just copy any non-worthwhile alternatives.
+      for (int j = first_with_prefix; j < i; j++) {
+        alternatives->at(write_posn++) = alternatives->at(j);
+      }
+    }
+  }
+  alternatives->Rewind(write_posn);  // Trim end of array.
+}
+
+
+// Optimizes b|c|z to [bcz].
+void RegExpDisjunction::FixSingleCharacterDisjunctions(
+    RegExpCompiler* compiler) {
+  Zone* zone = compiler->zone();
+  ZoneList<RegExpTree*>* alternatives = this->alternatives();
+  int length = alternatives->length();
+
+  int write_posn = 0;
+  int i = 0;
+  while (i < length) {
+    RegExpTree* alternative = alternatives->at(i);
+    if (!alternative->IsAtom()) {
+      alternatives->at(write_posn++) = alternatives->at(i);
+      i++;
+      continue;
+    }
+    RegExpAtom* atom = alternative->AsAtom();
+    if (atom->length() != 1) {
+      alternatives->at(write_posn++) = alternatives->at(i);
+      i++;
+      continue;
+    }
+    int first_in_run = i;
+    i++;
+    while (i < length) {
+      alternative = alternatives->at(i);
+      if (!alternative->IsAtom()) break;
+      atom = alternative->AsAtom();
+      if (atom->length() != 1) break;
+      i++;
+    }
+    if (i > first_in_run + 1) {
+      // Found non-trivial run of single-character alternatives.
+      int run_length = i - first_in_run;
+      ZoneList<CharacterRange>* ranges =
+          new (zone) ZoneList<CharacterRange>(2, zone);
+      for (int j = 0; j < run_length; j++) {
+        RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
+        DCHECK_EQ(old_atom->length(), 1);
+        ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
+      }
+      alternatives->at(write_posn++) =
+          new (zone) RegExpCharacterClass(ranges, false);
+    } else {
+      // Just copy any trivial alternatives.
+      for (int j = first_in_run; j < i; j++) {
+        alternatives->at(write_posn++) = alternatives->at(j);
+      }
+    }
+  }
+  alternatives->Rewind(write_posn);  // Trim end of array.
+}
+
+
 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
                                       RegExpNode* on_success) {
   ZoneList<RegExpTree*>* alternatives = this->alternatives();
+
+  if (alternatives->length() > 2) {
+    bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
+    if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
+    FixSingleCharacterDisjunctions(compiler);
+    if (alternatives->length() == 1) {
+      return alternatives->at(0)->ToNode(compiler, on_success);
+    }
+  }
+
   int length = alternatives->length();
+
   ChoiceNode* result =
       new(compiler->zone()) ChoiceNode(length, compiler->zone());
   for (int i = 0; i < length; i++) {
diff --git a/src/jsregexp.h b/src/jsregexp.h
index 6e41c9fa5..ff7759bfe 100644
--- a/src/jsregexp.h
+++ b/src/jsregexp.h
@@ -212,7 +212,7 @@ class RegExpImpl {
   // and the total executable memory at any point.
   static const int kRegExpExecutableMemoryLimit = 16 * MB;
   static const int kRegExpCompiledLimit = 1 * MB;
-  static const int kRegExpTooLargeToOptimize = 10 * KB;
+  static const int kRegExpTooLargeToOptimize = 20 * KB;
 
  private:
   static bool CompileIrregexp(Handle<JSRegExp> re,
diff --git a/src/list-inl.h b/src/list-inl.h
index 9b122fdba..c09788e9a 100644
--- a/src/list-inl.h
+++ b/src/list-inl.h
@@ -195,10 +195,15 @@ int List<T, P>::CountOccurrences(const T& elm, int start, int end) const {
 
 template<typename T, class P>
 void List<T, P>::Sort(int (*cmp)(const T* x, const T* y)) {
-  ToVector().Sort(cmp);
+  Sort(cmp, 0, length_);
+}
+
+
+template <typename T, class P>
+void List<T, P>::Sort(int (*cmp)(const T* x, const T* y), size_t s, size_t l) {
+  ToVector().Sort(cmp, s, l);
 #ifdef DEBUG
-  for (int i = 1; i < length_; i++)
-    DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
+  for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
 #endif
 }
 
@@ -209,7 +214,29 @@ void List<T, P>::Sort() {
 }
 
 
-template<typename T, class P>
+template <typename T, class P>
+void List<T, P>::StableSort(int (*cmp)(const T* x, const T* y)) {
+  StableSort(cmp, 0, length_);
+}
+
+
+template <typename T, class P>
+void List<T, P>::StableSort(int (*cmp)(const T* x, const T* y), size_t s,
+                            size_t l) {
+  ToVector().StableSort(cmp, s, l);
+#ifdef DEBUG
+  for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
+#endif
+}
+
+
+template <typename T, class P>
+void List<T, P>::StableSort() {
+  ToVector().StableSort();
+}
+
+
+template <typename T, class P>
 void List<T, P>::Initialize(int capacity, P allocator) {
   DCHECK(capacity >= 0);
   data_ = (capacity > 0) ? NewData(capacity, allocator) : NULL;
diff --git a/src/list.h b/src/list.h
index 021cafe14..00cbd4031 100644
--- a/src/list.h
+++ b/src/list.h
@@ -149,8 +149,13 @@ class List {
   void Iterate(Visitor* visitor);
 
   // Sort all list entries (using QuickSort)
+  void Sort(int (*cmp)(const T* x, const T* y), size_t start, size_t length);
   void Sort(int (*cmp)(const T* x, const T* y));
   void Sort();
+  void StableSort(int (*cmp)(const T* x, const T* y), size_t start,
+                  size_t length);
+  void StableSort(int (*cmp)(const T* x, const T* y));
+  void StableSort();
 
   INLINE(void Initialize(int capacity,
                          AllocationPolicy allocator = AllocationPolicy()));
diff --git a/src/vector.h b/src/vector.h
index 895c61b4e..d022fde3a 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -69,6 +69,10 @@ class Vector {
     return Vector<T>(result, length_);
   }
 
+  void Sort(int (*cmp)(const T*, const T*), size_t s, size_t l) {
+    std::sort(start() + s, start() + s + l, RawComparer(cmp));
+  }
+
   void Sort(int (*cmp)(const T*, const T*)) {
     std::sort(start(), start() + length(), RawComparer(cmp));
   }
@@ -77,6 +81,16 @@ class Vector {
     std::sort(start(), start() + length());
   }
 
+  void StableSort(int (*cmp)(const T*, const T*), size_t s, size_t l) {
+    std::stable_sort(start() + s, start() + s + l, RawComparer(cmp));
+  }
+
+  void StableSort(int (*cmp)(const T*, const T*)) {
+    std::stable_sort(start(), start() + length(), RawComparer(cmp));
+  }
+
+  void StableSort() { std::stable_sort(start(), start() + length()); }
+
   void Truncate(int length) {
     DCHECK(length <= length_);
     length_ = length;
diff --git a/test/cctest/test-heap.cc b/test/cctest/test-heap.cc
index 02cd5608b..51b829bec 100644
--- a/test/cctest/test-heap.cc
+++ b/test/cctest/test-heap.cc
@@ -1747,13 +1747,13 @@ TEST(TestSizeOfRegExpCode) {
 
   // Adjust source below and this check to match
   // RegExpImple::kRegExpTooLargeToOptimize.
-  DCHECK_EQ(i::RegExpImpl::kRegExpTooLargeToOptimize, 10 * KB);
+  DCHECK_EQ(i::RegExpImpl::kRegExpTooLargeToOptimize, 20 * KB);
 
   // Compile a regexp that is much larger if we are using regexp optimizations.
   CompileRun(
       "var reg_exp_source = '(?:a|bc|def|ghij|klmno|pqrstu)';"
       "var half_size_reg_exp;"
-      "while (reg_exp_source.length < 10 * 1024) {"
+      "while (reg_exp_source.length < 20 * 1024) {"
       "  half_size_reg_exp = reg_exp_source;"
       "  reg_exp_source = reg_exp_source + reg_exp_source;"
       "}"
@@ -1784,7 +1784,11 @@ TEST(TestSizeOfRegExpCode) {
 
   int size_of_regexp_code = size_with_regexp - initial_size;
 
-  CHECK_LE(size_of_regexp_code, 1 * MB);
+  // On some platforms the debug-code flag causes huge amounts of regexp code
+  // to be emitted, breaking this test.
+  if (!FLAG_debug_code) {
+    CHECK_LE(size_of_regexp_code, 1 * MB);
+  }
 
   // Small regexp is half the size, but compiles to more than twice the code
   // due to the optimization steps.
diff --git a/test/mjsunit/regress/regress-crbug-482998.js b/test/mjsunit/regress/regress-crbug-482998.js
new file mode 100644
index 000000000..94ff5008e
--- /dev/null
+++ b/test/mjsunit/regress/regress-crbug-482998.js
@@ -0,0 +1,22 @@
+// Copyright 2015 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Should not time out.  Running time 0.5s vs. 120s before the change.
+function collapse() {
+  var src = "(?:";
+  for (var i = 128; i < 0x1000; i++) {
+    src += "a" + String.fromCharCode(i) + "|";
+  }
+  src += "aa)";
+  var collapsible = new RegExp(src);
+  var subject = "zzzzzzz" + String.fromCharCode(3000);
+  for (var i = 0; i < 1000; i++) {
+    subject += "xxxxxxx";
+  }
+  for (var i = 0; i < 2000; i++) {
+    assertFalse(collapsible.test(subject));
+  }
+}
+
+collapse();
-- 
2.34.1