Revert of Speedup stringsearch for two byte strings (patchset #3 id:40001 of https...
author machenbach <machenbach@chromium.org>
Fri, 4 Sep 2015 13:00:28 +0000 (06:00 -0700)
committer Commit bot <commit-bot@chromium.org>
Fri, 4 Sep 2015 13:00:35 +0000 (13:00 +0000)
Reason for revert:
[Sheriff] Breaks fuzzer and msan:
http://build.chromium.org/p/client.v8/builders/V8%20Fuzzer/builds/4773

Repro with:
tools/fuzz-harness.sh out/Debug/d8
(in a ninja Debug build)

Msan:
http://build.chromium.org/p/client.v8/builders/V8%20Linux%20-%20arm64%20-%20sim%20-%20MSAN/builds/4097

Original issue's description:
> Speedup stringsearch for two byte strings
>
> Uses the lower byte with memchr which is
> significantly faster than a naive compare
>
> Performance difference with bench (http://hastebin.com/xuxexataso.js):
>
> old                             new
>
> single character                single character
> Κ found at 922                  Κ found at 922
> 3324                            616
> ㎡ found at 13217               ㎡ found at 13217
> 42366                           4931
> က found at 4096                 က found at 4096
> 13369                           9836
> ＀ found at 65280                ＀ found at 65280
> 207472                          36149
> ᆬ found at 65445                ᆬ found at 65445
> 209344                          36666
>   found at 8197                   found at 8197
> 26731                           11757
> 倂 found at 20482               倂 found at 20482
> 66071                           17193
>
> linear search                   linear search
> ΚΛ found at 922                 ΚΛ found at 922
> 4112                            504
> ㎡㎢ found at 13217             ㎡㎢ found at 13217
> 55105                           5119
> ᆬᆭ found at 65445               ᆬᆭ found at 65445
> 268016                          35496
>
> linear + bmh search             linear + bmh search
> ΚΛΜΝΞΟΠΡ found at 922           ΚΛΜΝΞΟΠΡ found at 922
> 2897                            522
> ᆬᆭᄃᄄᄅᆰᆱᆲ found at 65445         ᆬᆭᄃᄄᄅᆰᆱᆲ found at 65445
> 167687                          158465
>
> Committed: https://crrev.com/fced280f37588f8a232a414201276e053117e9ea
> Cr-Commit-Position: refs/heads/master@{#30587}

TBR=danno@chromium.org,mstarzinger@chromium.org,jkummerow@chromium.org,karl@skomski.com
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true

Review URL: https://codereview.chromium.org/1331433002

Cr-Commit-Position: refs/heads/master@{#30588}

AUTHORS
src/string-search.h
test/mjsunit/string-indexof-1.js

diff --git a/AUTHORS b/AUTHORS
index 82a9c16..72c23bc 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -67,7 +67,6 @@ Johan Bergström <johan@bergstroem.nu>
 Jonathan Liu <net147@gmail.com>
 JunHo Seo <sejunho@gmail.com>
 Kang-Hao (Kenny) Lu <kennyluck@csail.mit.edu>
-Karl Skomski <karl@skomski.com>
 Luis Reis <luis.m.reis@gmail.com>
 Luke Zarko <lukezarko@gmail.com>
 Maciej Małecki <me@mmalecki.com>
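diff --git a/src/string-search.h b/src/string-search.h
index aeaa9b3..349d4fd 100644 (file)
--- a/src/string-search.h
+++ b/src/string-search.h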
@@ -190,38 +190,6 @@ class StringSearch : private StringSearchBase {
 };
 
 
-template <typename PatternChar, typename SubjectChar>
-int FindFirstCharacter(Vector<const PatternChar> pattern,
-                       Vector<const SubjectChar> subject, int index) {
-  PatternChar pattern_first_char = pattern[0];
-
-  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
-    const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(memchr(
-        subject.start() + index, pattern_first_char, subject.length() - index));
-    if (char_pos == NULL) return -1;
-    return static_cast<int>(char_pos - subject.start());
-  } else {
-    const uint8_t search_low_byte =
-        static_cast<uint8_t>(pattern_first_char & 0xFF);
-    const SubjectChar search_char =
-        static_cast<SubjectChar>(pattern_first_char);
-    int pos = index;
-    do {
-      const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
-          memchr(subject.start() + pos, search_low_byte,
-                 (subject.length() - pos) * sizeof(SubjectChar)));
-      if (char_pos == NULL) return -1;
-      pos = static_cast<int>(char_pos - subject.start());
-      if (IsAligned(reinterpret_cast<uintptr_t>(char_pos),
-                    sizeof(SubjectChar))) {
-        if (subject[pos] == search_char) return pos;
-      }
-    } while (++pos < subject.length());
-  }
-  return -1;
-}
-
-
 //---------------------------------------------------------------------
 // Single Character Pattern Search Strategy
 //---------------------------------------------------------------------
@@ -233,15 +201,26 @@ int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
     int index) {
   DCHECK_EQ(1, search->pattern_.length());
   PatternChar pattern_first_char = search->pattern_[0];
+  int i = index;
   if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
-    return FindFirstCharacter(search->pattern_, subject, index);
+    const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
+        memchr(subject.start() + i,
+               pattern_first_char,
+               subject.length() - i));
+    if (pos == NULL) return -1;
+    return static_cast<int>(pos - subject.start());
   } else {
     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
       if (exceedsOneByte(pattern_first_char)) {
         return -1;
       }
     }
-    return FindFirstCharacter(search->pattern_, subject, index);
+    SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
+    int n = subject.length();
+    while (i < n) {
+      if (subject[i++] == search_char) return i - 1;
+    }
+    return -1;
   }
 }
 
@@ -275,12 +254,20 @@ int StringSearch<PatternChar, SubjectChar>::LinearSearch(
   Vector<const PatternChar> pattern = search->pattern_;
   DCHECK(pattern.length() > 1);
   int pattern_length = pattern.length();
+  PatternChar pattern_first_char = pattern[0];
   int i = index;
   int n = subject.length() - pattern_length;
   while (i <= n) {
-    i = FindFirstCharacter(pattern, subject, i);
-    if (i == -1) return -1;
-    i++;
+    if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
+      const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
+          memchr(subject.start() + i,
+                 pattern_first_char,
+                 n - i + 1));
+      if (pos == NULL) return -1;
+      i = static_cast<int>(pos - subject.start()) + 1;
+    } else {
+      if (subject[i++] != pattern_first_char) continue;
+    }
     // Loop extracted to separate function to allow using return to do
     // a deeper break.
     if (CharCompare(pattern.start() + 1,
@@ -518,11 +505,22 @@ int StringSearch<PatternChar, SubjectChar>::InitialSearch(
 
   // We know our pattern is at least 2 characters, we cache the first so
   // the common case of the first character not matching is faster.
+  PatternChar pattern_first_char = pattern[0];
   for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
     badness++;
     if (badness <= 0) {
-      i = FindFirstCharacter(pattern, subject, i);
-      if (i == -1) return -1;
+      if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
+        const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
+            memchr(subject.start() + i,
+                   pattern_first_char,
+                   n - i + 1));
+        if (pos == NULL) {
+          return -1;
+        }
+        i = static_cast<int>(pos - subject.start());
+      } else {
+        if (subject[i] != pattern_first_char) continue;
+      }
       int j = 1;
       do {
         if (pattern[j] != subject[i + j]) {
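diff --git a/test/mjsunit/string-indexof-1.js b/test/mjsunit/string-indexof-1.js
index 366437a..db3623f 100644 (file)
--- a/test/mjsunit/string-indexof-1.js
+++ b/test/mjsunit/string-indexof-1.js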
@@ -77,15 +77,6 @@ assertEquals(-1, twoByteString.indexOf("\u0391\u03a3\u0395"),
 //single char pattern
 assertEquals(4, twoByteString.indexOf("\u0395"));
 
-// test string with alignment traps
-var alignmentString = "\u1122\u2211\u2222\uFF00\u00FF\u00FF";
-assertEquals(2, alignmentString.indexOf("\u2222"));
-assertEquals(4, alignmentString.indexOf("\u00FF\u00FF"));
-
-var longAlignmentString = "\uFF00" + "\u00FF".repeat(10);
-assertEquals(1,
-    longAlignmentString.indexOf("\u00FF".repeat(10)));
-
 // Test complex string indexOf algorithms. Only trigger for long strings.
 
 // Long string that isn't a simple repeat of a shorter string.