From d2135603bcf462e15a1284d8ed969f6692610dda Mon Sep 17 00:00:00 2001
From: erikcorry <erikcorry@chromium.org>
Date: Wed, 24 Jun 2015 11:17:33 -0700
Subject: [PATCH] Extend big-disjunction optimization to case-independent
 regexps

R=yangguo@chromium.org
BUG=chromium:482998
LOG=n

Review URL: https://codereview.chromium.org/1182783009

Cr-Commit-Position: refs/heads/master@{#29264}
---
 src/heap-snapshot-generator.cc               |  3 +-
 src/jsregexp.cc                              | 65 +++++++++++++++++---
 src/list-inl.h                               | 15 +++--
 src/list.h                                   | 13 ++--
 src/vector.h                                 | 27 +++++---
 test/mjsunit/regexp-sort.js                  | 48 +++++++++++++++
 test/mjsunit/regress/regress-crbug-482998.js |  9 +--
 7 files changed, 145 insertions(+), 35 deletions(-)
 create mode 100644 test/mjsunit/regexp-sort.js
diff --git a/src/heap-snapshot-generator.cc b/src/heap-snapshot-generator.cc
index a27f41945..f1bdc71cc 100644
--- a/src/heap-snapshot-generator.cc
+++ b/src/heap-snapshot-generator.cc
@@ -323,7 +323,8 @@ List<HeapEntry*>* HeapSnapshot::GetSortedEntriesList() {
     for (int i = 0; i < entries_.length(); ++i) {
       sorted_entries_[i] = &entries_[i];
     }
-    sorted_entries_.Sort(SortByIds);
+    sorted_entries_.Sort<int (*)(HeapEntry* const*, HeapEntry* const*)>(
+        SortByIds);
   }
   return &sorted_entries_;
 }
diff --git a/src/jsregexp.cc b/src/jsregexp.cc
index 92fdc77aa..a02141d77 100644
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -4837,6 +4837,34 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
 }
 
 
+static unibrow::uchar Canonical(
+    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+    unibrow::uchar c) {
+  unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
+  int length = canonicalize->get(c, '\0', chars);
+  DCHECK_LE(length, 1);
+  unibrow::uchar canonical = c;
+  if (length == 1) canonical = chars[0];
+  return canonical;
+}
+
+
+int CompareFirstCharCaseIndependent(
+    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+    RegExpTree* const* a, RegExpTree* const* b) {
+  RegExpAtom* atom1 = (*a)->AsAtom();
+  RegExpAtom* atom2 = (*b)->AsAtom();
+  unibrow::uchar character1 = atom1->data().at(0);
+  unibrow::uchar character2 = atom2->data().at(0);
+  if (character1 == character2) return 0;
+  if (character1 >= 'a' || character2 >= 'a') {
+    character1 = Canonical(canonicalize, character1);
+    character2 = Canonical(canonicalize, character2);
+  }
+  return static_cast<int>(character1) - static_cast<int>(character2);
+}
+
+
 // We can stable sort runs of atoms, since the order does not matter if they
 // start with different characters.
 // Returns true if any consecutive atoms were found.
@@ -4860,15 +4888,23 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
       i++;
     }
     // Sort atoms to get ones with common prefixes together.
-    // This step is not valid if we are in a case-independent regexp,
+    // This step is trickier if we are in a case-independent regexp,
     // because it would change /is|I/ to /I|is/, and order matters when
     // the regexp parts don't match only disjoint starting points. To fix
-    // this would need a version of CompareFirstChar that uses case-
+    // this we have a version of CompareFirstChar that uses case-
     // independent character classes for comparison.
-    if (!compiler->ignore_case()) {
-      DCHECK_LT(first_atom, alternatives->length());
-      DCHECK_LE(i, alternatives->length());
-      DCHECK_LE(first_atom, i);
+    DCHECK_LT(first_atom, alternatives->length());
+    DCHECK_LE(i, alternatives->length());
+    DCHECK_LE(first_atom, i);
+    if (compiler->ignore_case()) {
+      unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
+          compiler->isolate()->regexp_macro_assembler_canonicalize();
+      auto compare_closure =
+          [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
+            return CompareFirstCharCaseIndependent(canonicalize, a, b);
+          };
+      alternatives->StableSort(compare_closure, first_atom, i - first_atom);
+    } else {
       alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
     }
     if (i - first_atom > 1) found_consecutive_atoms = true;
@@ -4893,7 +4929,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
       continue;
     }
     RegExpAtom* atom = alternative->AsAtom();
-    uc16 common_prefix = atom->data().at(0);
+    unibrow::uchar common_prefix = atom->data().at(0);
     int first_with_prefix = i;
     int prefix_length = atom->length();
     i++;
@@ -4901,7 +4937,15 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
       alternative = alternatives->at(i);
       if (!alternative->IsAtom()) break;
       atom = alternative->AsAtom();
-      if (atom->data().at(0) != common_prefix) break;
+      unibrow::uchar new_prefix = atom->data().at(0);
+      if (new_prefix != common_prefix) {
+        if (!compiler->ignore_case()) break;
+        unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
+            compiler->isolate()->regexp_macro_assembler_canonicalize();
+        new_prefix = Canonical(canonicalize, new_prefix);
+        common_prefix = Canonical(canonicalize, common_prefix);
+        if (new_prefix != common_prefix) break;
+      }
       prefix_length = Min(prefix_length, atom->length());
       i++;
     }
@@ -4917,7 +4961,10 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
         RegExpAtom* old_atom =
             alternatives->at(j + first_with_prefix)->AsAtom();
         for (int k = 1; k < prefix_length; k++) {
-          if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;
+          if (atom->data().at(k) != old_atom->data().at(k)) {
+            prefix_length = k;
+            break;
+          }
         }
       }
       RegExpAtom* prefix =
diff --git a/src/list-inl.h b/src/list-inl.h
index c09788e9a..98f0343fa 100644
--- a/src/list-inl.h
+++ b/src/list-inl.h
@@ -193,14 +193,16 @@ int List<T, P>::CountOccurrences(const T& elm, int start, int end) const {
 }
 
 
-template<typename T, class P>
-void List<T, P>::Sort(int (*cmp)(const T* x, const T* y)) {
+template <typename T, class P>
+template <typename CompareFunction>
+void List<T, P>::Sort(CompareFunction cmp) {
   Sort(cmp, 0, length_);
 }
 
 
 template <typename T, class P>
-void List<T, P>::Sort(int (*cmp)(const T* x, const T* y), size_t s, size_t l) {
+template <typename CompareFunction>
+void List<T, P>::Sort(CompareFunction cmp, size_t s, size_t l) {
   ToVector().Sort(cmp, s, l);
 #ifdef DEBUG
   for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
@@ -215,14 +217,15 @@ void List<T, P>::Sort() {
 
 
 template <typename T, class P>
-void List<T, P>::StableSort(int (*cmp)(const T* x, const T* y)) {
+template <typename CompareFunction>
+void List<T, P>::StableSort(CompareFunction cmp) {
   StableSort(cmp, 0, length_);
 }
 
 
 template <typename T, class P>
-void List<T, P>::StableSort(int (*cmp)(const T* x, const T* y), size_t s,
-                            size_t l) {
+template <typename CompareFunction>
+void List<T, P>::StableSort(CompareFunction cmp, size_t s, size_t l) {
   ToVector().StableSort(cmp, s, l);
 #ifdef DEBUG
   for (size_t i = s + 1; i < l; i++) DCHECK(cmp(&data_[i - 1], &data_[i]) <= 0);
diff --git a/src/list.h b/src/list.h
index 00cbd4031..b636449c4 100644
--- a/src/list.h
+++ b/src/list.h
@@ -149,12 +149,15 @@ class List {
   void Iterate(Visitor* visitor);
 
   // Sort all list entries (using QuickSort)
-  void Sort(int (*cmp)(const T* x, const T* y), size_t start, size_t length);
-  void Sort(int (*cmp)(const T* x, const T* y));
+  template <typename CompareFunction>
+  void Sort(CompareFunction cmp, size_t start, size_t length);
+  template <typename CompareFunction>
+  void Sort(CompareFunction cmp);
   void Sort();
-  void StableSort(int (*cmp)(const T* x, const T* y), size_t start,
-                  size_t length);
-  void StableSort(int (*cmp)(const T* x, const T* y));
+  template <typename CompareFunction>
+  void StableSort(CompareFunction cmp, size_t start, size_t length);
+  template <typename CompareFunction>
+  void StableSort(CompareFunction cmp);
   void StableSort();
 
   INLINE(void Initialize(int capacity,
diff --git a/src/vector.h b/src/vector.h
index d022fde3a..4f3128b91 100644
--- a/src/vector.h
+++ b/src/vector.h
@@ -69,24 +69,30 @@ class Vector {
     return Vector<T>(result, length_);
   }
 
-  void Sort(int (*cmp)(const T*, const T*), size_t s, size_t l) {
-    std::sort(start() + s, start() + s + l, RawComparer(cmp));
+  template <typename CompareFunction>
+  void Sort(CompareFunction cmp, size_t s, size_t l) {
+    std::sort(start() + s, start() + s + l, RawComparer<CompareFunction>(cmp));
   }
 
-  void Sort(int (*cmp)(const T*, const T*)) {
-    std::sort(start(), start() + length(), RawComparer(cmp));
+  template <typename CompareFunction>
+  void Sort(CompareFunction cmp) {
+    std::sort(start(), start() + length(), RawComparer<CompareFunction>(cmp));
   }
 
   void Sort() {
     std::sort(start(), start() + length());
   }
 
-  void StableSort(int (*cmp)(const T*, const T*), size_t s, size_t l) {
-    std::stable_sort(start() + s, start() + s + l, RawComparer(cmp));
+  template <typename CompareFunction>
+  void StableSort(CompareFunction cmp, size_t s, size_t l) {
+    std::stable_sort(start() + s, start() + s + l,
+                     RawComparer<CompareFunction>(cmp));
   }
 
-  void StableSort(int (*cmp)(const T*, const T*)) {
-    std::stable_sort(start(), start() + length(), RawComparer(cmp));
+  template <typename CompareFunction>
+  void StableSort(CompareFunction cmp) {
+    std::stable_sort(start(), start() + length(),
+                     RawComparer<CompareFunction>(cmp));
   }
 
   void StableSort() { std::stable_sort(start(), start() + length()); }
@@ -136,15 +142,16 @@ class Vector {
   T* start_;
   int length_;
 
+  template <typename CookedComparer>
   class RawComparer {
    public:
-    explicit RawComparer(int (*cmp)(const T*, const T*)) : cmp_(cmp) {}
+    explicit RawComparer(CookedComparer cmp) : cmp_(cmp) {}
     bool operator()(const T& a, const T& b) {
       return cmp_(&a, &b) < 0;
     }
 
    private:
-    int (*cmp_)(const T*, const T*);
+    CookedComparer cmp_;
   };
 };
 
diff --git a/test/mjsunit/regexp-sort.js b/test/mjsunit/regexp-sort.js
new file mode 100644
index 000000000..57d50701c
--- /dev/null
+++ b/test/mjsunit/regexp-sort.js
@@ -0,0 +1,48 @@
+// Copyright 2015 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+function Test(lower, upper) {
+  var lx = lower + "x";
+  var ux = upper + "x";
+  var lp = lower + "|";
+  var uxp = upper + "x|";
+  assertEquals(lx, new RegExp(uxp + lp + lower + "cat", "i").exec(lx) + "");
+  assertEquals(ux, new RegExp(uxp + lp + lower + "cat", "i").exec(ux) + "");
+  assertEquals(lower, new RegExp(lp + uxp + lower + "cat", "i").exec(lx) + "");
+  assertEquals(upper, new RegExp(lp + uxp + lower + "cat", "i").exec(ux) + "");
+}
+
+function TestFail(lower, upper) {
+  var lx = lower + "x";
+  var ux = upper + "x";
+  var lp = lower + "|";
+  var uxp = upper + "x|";
+  assertEquals(lower, new RegExp(uxp + lp + lower + "cat", "i").exec(lx) + "");
+  assertEquals(ux, new RegExp(uxp + lp + lower + "cat", "i").exec(ux) + "");
+  assertEquals(lower, new RegExp(lp + uxp + lower + "cat", "i").exec(lx) + "");
+  assertEquals(ux, new RegExp(lp + uxp + lower + "cat", "i").exec(ux) + "");
+}
+
+Test("a", "A");
+Test("0", "0");
+TestFail("a", "b");
+// Small and capital o-umlaut
+Test(String.fromCharCode(0xf6), String.fromCharCode(0xd6));
+// Small and capital kha.
+Test(String.fromCharCode(0x445), String.fromCharCode(0x425));
+// Small and capital y-umlaut.
+Test(String.fromCharCode(0xff), String.fromCharCode(0x178));
+// Small and capital Greek mu.
+Test(String.fromCharCode(0x3bc), String.fromCharCode(0x39c));
+// Micro sign and capital Greek mu.
+Test(String.fromCharCode(0xb5), String.fromCharCode(0x39c));
+// Micro sign and small Greek mu.
+Test(String.fromCharCode(0xb5), String.fromCharCode(0x3bc));
+// German sharp s and capital S. Not equivalent: sharp s upper-cases to "SS".
+TestFail(String.fromCharCode(0xdf), "S");
+// Small i and Turkish capital dotted I. These are not equivalent due to
+// 21.2.2.8.2 section 3g.  One is below 128 and the other is above 127.
+TestFail("i", String.fromCharCode(0x130));
+// Small dotless i and I. These are not equivalent either.
+TestFail(String.fromCharCode(0x131), "I");
diff --git a/test/mjsunit/regress/regress-crbug-482998.js b/test/mjsunit/regress/regress-crbug-482998.js
index 94ff5008e..80933a7a6 100644
--- a/test/mjsunit/regress/regress-crbug-482998.js
+++ b/test/mjsunit/regress/regress-crbug-482998.js
@@ -3,13 +3,13 @@
 // found in the LICENSE file.
 
 // Should not time out.  Running time 0.5s vs. 120s before the change.
-function collapse() {
+function collapse(flags) {
   var src = "(?:";
   for (var i = 128; i < 0x1000; i++) {
-    src += "a" + String.fromCharCode(i) + "|";
+    src += String.fromCharCode(96 + i % 26) + String.fromCharCode(i) + "|";
   }
   src += "aa)";
-  var collapsible = new RegExp(src);
+  var collapsible = new RegExp(src, flags);
   var subject = "zzzzzzz" + String.fromCharCode(3000);
   for (var i = 0; i < 1000; i++) {
     subject += "xxxxxxx";
@@ -19,4 +19,5 @@ function collapse() {
   }
 }
 
-collapse();
+collapse("i");
+collapse("");
-- 
2.34.1