Reland: Speedup stringsearch for two byte strings

author karl <karl@skomski.com>

Fri, 4 Sep 2015 19:58:35 +0000 (12:58 -0700)

committer Commit bot <commit-bot@chromium.org>

Fri, 4 Sep 2015 19:58:44 +0000 (19:58 +0000)
author karl <karl@skomski.com>
Fri, 4 Sep 2015 19:58:35 +0000 (12:58 -0700)
committer Commit bot <commit-bot@chromium.org>
Fri, 4 Sep 2015 19:58:44 +0000 (19:58 +0000)
diff --git a/AUTHORS b/AUTHORS

index 72c23bc..82a9c16 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -67,6 +67,7 @@ Johan Bergström <johan@bergstroem.nu>
  Jonathan Liu <net147@gmail.com>
  JunHo Seo <sejunho@gmail.com>
  Kang-Hao (Kenny) Lu <kennyluck@csail.mit.edu>
+Karl Skomski <karl@skomski.com>
  Luis Reis <luis.m.reis@gmail.com>
  Luke Zarko <lukezarko@gmail.com>
  Maciej Małecki <me@mmalecki.com>
diff --git a/src/string-search.h b/src/string-search.h

index 349d4fd..e2e540b 100644 (file)
--- a/src/string-search.h
+++ b/src/string-search.h
@@ -190,6 +190,41 @@ class StringSearch : private StringSearchBase {
  };
  
  
+template <typename PatternChar, typename SubjectChar>
+int FindFirstCharacter(Vector<const PatternChar> pattern,
+                       Vector<const SubjectChar> subject, int index) {
+  PatternChar pattern_first_char = pattern[0];
+  const int max_n = (subject.length() - pattern.length() + 1);
+
+  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
+    DCHECK_GE(max_n - index, 0);
+    const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
+        memchr(subject.start() + index, pattern_first_char, max_n - index));
+    if (char_pos == NULL) return -1;
+    return static_cast<int>(char_pos - subject.start());
+  } else {
+    const uint8_t search_low_byte =
+        static_cast<uint8_t>(pattern_first_char & 0xFF);
+    const SubjectChar search_char =
+        static_cast<SubjectChar>(pattern_first_char);
+    int pos = index;
+    do {
+      DCHECK_GE(max_n - pos, 0);
+      const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
+          memchr(subject.start() + pos, search_low_byte,
+                 (max_n - pos) * sizeof(SubjectChar)));
+      if (char_pos == NULL) return -1;
+      pos = static_cast<int>(char_pos - subject.start());
+      if (IsAligned(reinterpret_cast<uintptr_t>(char_pos),
+                    sizeof(SubjectChar))) {
+        if (subject[pos] == search_char) return pos;
+      }
+    } while (++pos < max_n);
+  }
+  return -1;
+}
+
+
  //---------------------------------------------------------------------
  // Single Character Pattern Search Strategy
  //---------------------------------------------------------------------
@@ -201,26 +236,15 @@ int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
      int index) {
    DCHECK_EQ(1, search->pattern_.length());
    PatternChar pattern_first_char = search->pattern_[0];
-  int i = index;
    if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
-    const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
-        memchr(subject.start() + i,
-               pattern_first_char,
-               subject.length() - i));
-    if (pos == NULL) return -1;
-    return static_cast<int>(pos - subject.start());
+    return FindFirstCharacter(search->pattern_, subject, index);
    } else {
      if (sizeof(PatternChar) > sizeof(SubjectChar)) {
        if (exceedsOneByte(pattern_first_char)) {
          return -1;
        }
      }
-    SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
-    int n = subject.length();
-    while (i < n) {
-      if (subject[i++] == search_char) return i - 1;
-    }
-    return -1;
+    return FindFirstCharacter(search->pattern_, subject, index);
    }
  }
  
@@ -254,20 +278,13 @@ int StringSearch<PatternChar, SubjectChar>::LinearSearch(
    Vector<const PatternChar> pattern = search->pattern_;
    DCHECK(pattern.length() > 1);
    int pattern_length = pattern.length();
-  PatternChar pattern_first_char = pattern[0];
    int i = index;
    int n = subject.length() - pattern_length;
    while (i <= n) {
-    if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
-      const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
-          memchr(subject.start() + i,
-                 pattern_first_char,
-                 n - i + 1));
-      if (pos == NULL) return -1;
-      i = static_cast<int>(pos - subject.start()) + 1;
-    } else {
-      if (subject[i++] != pattern_first_char) continue;
-    }
+    i = FindFirstCharacter(pattern, subject, i);
+    if (i == -1) return -1;
+    DCHECK_LE(i, n);
+    i++;
      // Loop extracted to separate function to allow using return to do
      // a deeper break.
      if (CharCompare(pattern.start() + 1,
@@ -505,22 +522,12 @@ int StringSearch<PatternChar, SubjectChar>::InitialSearch(
  
    // We know our pattern is at least 2 characters, we cache the first so
    // the common case of the first character not matching is faster.
-  PatternChar pattern_first_char = pattern[0];
    for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
      badness++;
      if (badness <= 0) {
-      if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
-        const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
-            memchr(subject.start() + i,
-                   pattern_first_char,
-                   n - i + 1));
-        if (pos == NULL) {
-          return -1;
-        }
-        i = static_cast<int>(pos - subject.start());
-      } else {
-        if (subject[i] != pattern_first_char) continue;
-      }
+      i = FindFirstCharacter(pattern, subject, i);
+      if (i == -1) return -1;
+      DCHECK_LE(i, n);
        int j = 1;
        do {
          if (pattern[j] != subject[i + j]) {
diff --git a/test/mjsunit/string-indexof-1.js b/test/mjsunit/string-indexof-1.js

index db3623f..b9dad46 100644 (file)
--- a/test/mjsunit/string-indexof-1.js
+++ b/test/mjsunit/string-indexof-1.js
@@ -77,6 +77,20 @@ assertEquals(-1, twoByteString.indexOf("\u0391\u03a3\u0395"),
  //single char pattern
  assertEquals(4, twoByteString.indexOf("\u0395"));
  
+// Alignment traps: the searched byte also occurs inside neighbouring chars.
+var alignmentString = "\u1122\u2211\u2222\uFF00\u00FF\u00FF";
+assertEquals(2, alignmentString.indexOf("\u2222"));
+assertEquals(4, alignmentString.indexOf("\u00FF\u00FF"));
+
+var longAlignmentString = "\uFF00" + "\u00FF".repeat(10);
+assertEquals(1,
+    longAlignmentString.indexOf("\u00FF".repeat(10)));
+
+// First character matches only near the end, where the pattern cannot fit.
+var boundsString = "112233";
+assertEquals(-1, boundsString.indexOf("334455"));
+assertEquals(-1, boundsString.indexOf("334455".repeat(10)));
+
  // Test complex string indexOf algorithms. Only trigger for long strings.
  
  // Long string that isn't a simple repeat of a shorter string.
author	karl <karl@skomski.com>
	Fri, 4 Sep 2015 19:58:35 +0000 (12:58 -0700)
committer	Commit bot <commit-bot@chromium.org>
	Fri, 4 Sep 2015 19:58:44 +0000 (19:58 +0000)
AUTHORS		patch \| blob \| history
src/string-search.h		patch \| blob \| history
test/mjsunit/string-indexof-1.js		patch \| blob \| history