From: karl Date: Fri, 4 Sep 2015 19:58:35 +0000 (-0700) Subject: Reland: Speedup stringsearch for two byte strings X-Git-Tag: upstream/4.7.83~443 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=24d481165cfda1669c92c06c0a67348f3e4add91;p=platform%2Fupstream%2Fv8.git Reland: Speedup stringsearch for two byte strings Uses the lower byte with memchr which is significantly faster than a naive compare Performance difference with bench (http://hastebin.com/xuxexataso.js): old new single character single character Κ found at 922 Κ found at 922 3324 616 ㎡ found at 13217 ㎡ found at 13217 42366 4931 က found at 4096 က found at 4096 13369 9836 ＀ found at 65280 ＀ found at 65280 207472 36149 ᆬ found at 65445 ᆬ found at 65445 209344 36666   found at 8197   found at 8197 26731 11757 倂 found at 20482 倂 found at 20482 66071 17193 linear search linear search ΚΛ found at 922 ΚΛ found at 922 4112 504 ㎡㎢ found at 13217 ㎡㎢ found at 13217 55105 5119 ᆬᆭ found at 65445 ᆬᆭ found at 65445 268016 35496 linear + bmh search linear + bmh search ΚΛΜΝΞΟΠΡ found at 922 ΚΛΜΝΞΟΠΡ found at 922 2897 522 ᆬᆭᄃᄄᄅᆰᆱᆲ found at 65445 ᆬᆭᄃᄄᄅᆰᆱᆲ found at 65445 167687 35283 BUG= Review URL: https://codereview.chromium.org/1324453007 Cr-Commit-Position: refs/heads/master@{#30597} --- diff --git a/AUTHORS b/AUTHORS index 72c23bc..82a9c16 100644 --- a/AUTHORS +++ b/AUTHORS @@ -67,6 +67,7 @@ Johan Bergström Jonathan Liu JunHo Seo Kang-Hao (Kenny) Lu +Karl Skomski Luis Reis Luke Zarko Maciej Małecki diff --git a/src/string-search.h b/src/string-search.h index 349d4fd..e2e540b 100644 --- a/src/string-search.h +++ b/src/string-search.h @@ -190,6 +190,41 @@ class StringSearch : private StringSearchBase { }; +template +int FindFirstCharacter(Vector pattern, + Vector subject, int index) { + PatternChar pattern_first_char = pattern[0]; + const int max_n = (subject.length() - pattern.length() + 1); + + if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { + DCHECK_GE(max_n - index, 0); + const SubjectChar* char_pos = reinterpret_cast( + memchr(subject.start() + index, pattern_first_char, max_n - index)); + if (char_pos == NULL) return -1; + return static_cast(char_pos - subject.start()); + } else { + const uint8_t search_low_byte = + static_cast(pattern_first_char & 0xFF); + const SubjectChar search_char = + static_cast(pattern_first_char); + int pos = index; + do { + DCHECK_GE(max_n - pos, 0); + const SubjectChar* char_pos = reinterpret_cast( + memchr(subject.start() + pos, search_low_byte, + (max_n - pos) * sizeof(SubjectChar))); + if (char_pos == NULL) return -1; + pos = static_cast(char_pos - subject.start()); + if (IsAligned(reinterpret_cast(char_pos), + sizeof(SubjectChar))) { + if (subject[pos] == search_char) return pos; + } + } while (++pos < max_n); + } + return -1; +} + + //--------------------------------------------------------------------- // Single Character Pattern Search Strategy //--------------------------------------------------------------------- @@ -201,26 +236,15 @@ int StringSearch::SingleCharSearch( int index) { DCHECK_EQ(1, search->pattern_.length()); PatternChar pattern_first_char = search->pattern_[0]; - int i = index; if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { - const SubjectChar* pos = reinterpret_cast( - memchr(subject.start() + i, - pattern_first_char, - subject.length() - i)); - if (pos == NULL) return -1; - return static_cast(pos - subject.start()); + return FindFirstCharacter(search->pattern_, subject, index); } else { if (sizeof(PatternChar) > sizeof(SubjectChar)) { if (exceedsOneByte(pattern_first_char)) { return -1; } } - SubjectChar search_char = static_cast(pattern_first_char); - int n = subject.length(); - while (i < n) { - if (subject[i++] == search_char) return i - 1; - } - return -1; + return FindFirstCharacter(search->pattern_, subject, index); } } @@ -254,20 +278,13 @@ int StringSearch::LinearSearch( Vector pattern = search->pattern_; DCHECK(pattern.length() > 1); int pattern_length = pattern.length(); - PatternChar pattern_first_char = pattern[0]; int i = index; int n = subject.length() - pattern_length; while (i <= n) { - if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { - const SubjectChar* pos = reinterpret_cast( - memchr(subject.start() + i, - pattern_first_char, - n - i + 1)); - if (pos == NULL) return -1; - i = static_cast(pos - subject.start()) + 1; - } else { - if (subject[i++] != pattern_first_char) continue; - } + i = FindFirstCharacter(pattern, subject, i); + if (i == -1) return -1; + DCHECK_LE(i, n); + i++; // Loop extracted to separate function to allow using return to do // a deeper break. if (CharCompare(pattern.start() + 1, @@ -505,22 +522,12 @@ int StringSearch::InitialSearch( // We know our pattern is at least 2 characters, we cache the first so // the common case of the first character not matching is faster. - PatternChar pattern_first_char = pattern[0]; for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { badness++; if (badness <= 0) { - if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { - const SubjectChar* pos = reinterpret_cast( - memchr(subject.start() + i, - pattern_first_char, - n - i + 1)); - if (pos == NULL) { - return -1; - } - i = static_cast(pos - subject.start()); - } else { - if (subject[i] != pattern_first_char) continue; - } + i = FindFirstCharacter(pattern, subject, i); + if (i == -1) return -1; + DCHECK_LE(i, n); int j = 1; do { if (pattern[j] != subject[i + j]) { diff --git a/test/mjsunit/string-indexof-1.js b/test/mjsunit/string-indexof-1.js index db3623f..b9dad46 100644 --- a/test/mjsunit/string-indexof-1.js +++ b/test/mjsunit/string-indexof-1.js @@ -77,6 +77,20 @@ assertEquals(-1, twoByteString.indexOf("\u0391\u03a3\u0395"), //single char pattern assertEquals(4, twoByteString.indexOf("\u0395")); +// test string with alignment traps +var alignmentString = "\u1122\u2211\u2222\uFF00\u00FF\u00FF"; +assertEquals(2, alignmentString.indexOf("\u2222")); +assertEquals(4, alignmentString.indexOf("\u00FF\u00FF")); + +var longAlignmentString = "\uFF00" + "\u00FF".repeat(10); +assertEquals(1, + longAlignmentString.indexOf("\u00FF".repeat(10))); + +// test string with first character match at the end +var boundsString = "112233"; +assertEquals(-1, boundsString.indexOf("334455")); +assertEquals(-1, boundsString.indexOf("334455".repeat(10))); + // Test complex string indexOf algorithms. Only trigger for long strings. // Long string that isn't a simple repeat of a shorter string.