1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef SRC_STRING_SEARCH_H_
6 #define SRC_STRING_SEARCH_H_
12 namespace stringsearch {
15 // Returns the maximum of the two parameters.
22 static const uint32_t kMaxOneByteCharCodeU = 0xff;
25 static inline size_t NonOneByteStart(const uint16_t* chars, size_t length) {
26 const uint16_t* limit = chars + length;
27 const uint16_t* start = chars;
28 while (chars < limit) {
29 if (*chars > kMaxOneByteCharCodeU)
30 return static_cast<size_t>(chars - start);
33 return static_cast<size_t>(chars - start);
37 static inline bool IsOneByte(const uint16_t* chars, size_t length) {
38 return NonOneByteStart(chars, length) >= length;
45 Vector(T* data, size_t length) : start_(data), length_(length) {
46 ASSERT(length > 0 && data != nullptr);
49 // Returns the length of the vector.
50 size_t length() const { return length_; }
52 T* start() const { return start_; }
54 // Access individual vector elements - checks bounds in debug mode.
55 T& operator[](size_t index) const {
56 ASSERT(0 <= index && index < length_);
60 const T& at(size_t index) const { return operator[](index); }
62 bool operator==(const Vector<T>& other) const {
63 if (length_ != other.length_)
65 if (start_ == other.start_)
67 for (size_t i = 0; i < length_; ++i) {
68 if (start_[i] != other.start_[i]) {
81 //---------------------------------------------------------------------
82 // String Search object.
83 //---------------------------------------------------------------------
85 // Class holding constants and methods that apply to all string search variants,
86 // independently of subject and pattern char size.
87 class StringSearchBase {
89 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
90 // limit, we can fix the size of tables. For a needle longer than this limit,
91 // search will not be optimal, since we only build tables for a suffix
92 // of the string, but it is a safe approximation.
93 static const int kBMMaxShift = 250;
95 // Reduce alphabet to this size.
96 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
97 // proportional to the input alphabet. We reduce the alphabet size by
98 // equating input characters modulo a smaller alphabet size. This gives
99 // a potentially less efficient searching, but is a safe approximation.
100 // For needles using only characters in the same Unicode 256-code point page,
101 // there is no search speed degradation.
102 static const int kLatin1AlphabetSize = 256;
103 static const int kUC16AlphabetSize = 256;
105 // Bad-char shift table stored in the state. It's length is the alphabet size.
106 // For patterns below this length, the skip length of Boyer-Moore is too short
107 // to compensate for the algorithmic overhead compared to simple brute force.
108 static const int kBMMinPatternLength = 8;
110 // Store for the BoyerMoore(Horspool) bad char shift table.
111 static int kBadCharShiftTable[kUC16AlphabetSize];
112 // Store for the BoyerMoore good suffix shift table.
113 static int kGoodSuffixShiftTable[kBMMaxShift + 1];
114 // Table used temporarily while building the BoyerMoore good suffix
116 static int kSuffixTable[kBMMaxShift + 1];
118 static inline bool IsOneByteString(Vector<const uint8_t> string) {
122 static inline bool IsOneByteString(Vector<const uint16_t> string) {
123 return IsOneByte(string.start(), string.length());
127 template <typename PatternChar, typename SubjectChar>
128 class StringSearch : private StringSearchBase {
130 explicit StringSearch(Vector<const PatternChar> pattern)
131 : pattern_(pattern), start_(0) {
132 if (pattern.length() >= kBMMaxShift) {
133 start_ = pattern.length() - kBMMaxShift;
136 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
137 if (!IsOneByteString(pattern_)) {
138 strategy_ = &FailSearch;
142 size_t pattern_length = pattern_.length();
143 CHECK_GT(pattern_length, 0);
144 if (pattern_length < kBMMinPatternLength) {
145 if (pattern_length == 1) {
146 strategy_ = &SingleCharSearch;
149 strategy_ = &LinearSearch;
152 strategy_ = &InitialSearch;
155 size_t Search(Vector<const SubjectChar> subject, size_t index) {
156 return strategy_(this, subject, index);
159 static inline int AlphabetSize() {
160 if (sizeof(PatternChar) == 1) {
162 return kLatin1AlphabetSize;
165 return kUC16AlphabetSize;
168 static_assert(sizeof(PatternChar) == sizeof(uint8_t) ||
169 sizeof(PatternChar) == sizeof(uint16_t),
170 "sizeof(PatternChar) == sizeof(uint16_t) || sizeof(uint8_t)");
174 typedef size_t (*SearchFunction)( // NOLINT - it's not a cast!
175 StringSearch<PatternChar, SubjectChar>*,
176 Vector<const SubjectChar>,
179 static size_t FailSearch(StringSearch<PatternChar, SubjectChar>*,
180 Vector<const SubjectChar> subject,
182 return subject.length();
185 static size_t SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
186 Vector<const SubjectChar> subject,
189 static size_t LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
190 Vector<const SubjectChar> subject,
193 static size_t InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
194 Vector<const SubjectChar> subject,
197 static size_t BoyerMooreHorspoolSearch(
198 StringSearch<PatternChar, SubjectChar>* search,
199 Vector<const SubjectChar> subject,
202 static size_t BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
203 Vector<const SubjectChar> subject,
206 void PopulateBoyerMooreHorspoolTable();
208 void PopulateBoyerMooreTable();
210 static inline bool exceedsOneByte(uint8_t c) { return false; }
212 static inline bool exceedsOneByte(uint16_t c) {
213 return c > kMaxOneByteCharCodeU;
216 static inline int CharOccurrence(int* bad_char_occurrence,
217 SubjectChar char_code) {
218 if (sizeof(SubjectChar) == 1) {
219 return bad_char_occurrence[static_cast<int>(char_code)];
221 if (sizeof(PatternChar) == 1) {
222 if (exceedsOneByte(char_code)) {
225 return bad_char_occurrence[static_cast<unsigned int>(char_code)];
227 // Both pattern and subject are UC16. Reduce character to equivalence class.
228 int equiv_class = char_code % kUC16AlphabetSize;
229 return bad_char_occurrence[equiv_class];
232 // Store for the BoyerMoore(Horspool) bad char shift table.
233 // Return a table covering the last kBMMaxShift+1 positions of
235 int* bad_char_table() { return kBadCharShiftTable; }
237 // Store for the BoyerMoore good suffix shift table.
238 int* good_suffix_shift_table() {
239 // Return biased pointer that maps the range [start_..pattern_.length()
240 // to the kGoodSuffixShiftTable array.
241 return kGoodSuffixShiftTable - start_;
244 // Table used temporarily while building the BoyerMoore good suffix
246 int* suffix_table() {
247 // Return biased pointer that maps the range [start_..pattern_.length()
248 // to the kSuffixTable array.
249 return kSuffixTable - start_;
252 // The pattern to search for.
253 Vector<const PatternChar> pattern_;
254 // Pointer to implementation of the search.
255 SearchFunction strategy_;
256 // Cache value of Max(0, pattern_length() - kBMMaxShift)
261 template <typename T, typename U>
262 inline T AlignDown(T value, U alignment) {
263 return reinterpret_cast<T>(
264 (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
268 inline uint8_t GetHighestValueByte(uint16_t character) {
269 return Max(static_cast<uint8_t>(character & 0xFF),
270 static_cast<uint8_t>(character >> 8));
274 inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
277 template <typename PatternChar, typename SubjectChar>
278 inline size_t FindFirstCharacter(Vector<const PatternChar> pattern,
279 Vector<const SubjectChar> subject, size_t index) {
280 const PatternChar pattern_first_char = pattern[0];
281 const size_t max_n = (subject.length() - pattern.length() + 1);
283 const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
284 const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
287 const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
288 memchr(subject.start() + pos, search_byte,
289 (max_n - pos) * sizeof(SubjectChar)));
290 if (char_pos == nullptr)
291 return subject.length();
292 char_pos = AlignDown(char_pos, sizeof(SubjectChar));
293 pos = static_cast<size_t>(char_pos - subject.start());
294 if (subject[pos] == search_char)
296 } while (++pos < max_n);
298 return subject.length();
303 inline size_t FindFirstCharacter(Vector<const uint8_t> pattern,
304 Vector<const uint8_t> subject,
306 const uint8_t pattern_first_char = pattern[0];
307 const size_t max_n = (subject.length() - pattern.length() + 1);
309 const uint8_t* char_pos = reinterpret_cast<const uint8_t*>(
310 memchr(subject.start() + index, pattern_first_char, max_n - index));
311 if (char_pos == nullptr)
312 return subject.length();
313 return static_cast<size_t>(char_pos - subject.start());
316 //---------------------------------------------------------------------
317 // Single Character Pattern Search Strategy
318 //---------------------------------------------------------------------
320 template <typename PatternChar, typename SubjectChar>
321 size_t StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
322 StringSearch<PatternChar, SubjectChar>* search,
323 Vector<const SubjectChar> subject,
325 CHECK_EQ(1, search->pattern_.length());
326 PatternChar pattern_first_char = search->pattern_[0];
328 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
329 return FindFirstCharacter(search->pattern_, subject, index);
331 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
332 if (exceedsOneByte(pattern_first_char)) {
336 return FindFirstCharacter(search->pattern_, subject, index);
340 //---------------------------------------------------------------------
341 // Linear Search Strategy
342 //---------------------------------------------------------------------
344 template <typename PatternChar, typename SubjectChar>
345 inline bool CharCompare(const PatternChar* pattern,
346 const SubjectChar* subject,
348 ASSERT_GT(length, 0);
351 if (pattern[pos] != subject[pos]) {
355 } while (pos < length);
359 // Simple linear search for short patterns. Never bails out.
360 template <typename PatternChar, typename SubjectChar>
361 size_t StringSearch<PatternChar, SubjectChar>::LinearSearch(
362 StringSearch<PatternChar, SubjectChar>* search,
363 Vector<const SubjectChar> subject,
365 Vector<const PatternChar> pattern = search->pattern_;
366 CHECK_GT(pattern.length(), 1);
367 const size_t pattern_length = pattern.length();
369 const size_t n = subject.length() - pattern_length;
371 i = FindFirstCharacter(pattern, subject, i);
372 if (i == subject.length())
373 return subject.length();
377 // Loop extracted to separate function to allow using return to do
379 if (CharCompare(pattern.start() + 1, subject.start() + i,
380 pattern_length - 1)) {
384 return subject.length();
387 //---------------------------------------------------------------------
388 // Boyer-Moore string search
389 //---------------------------------------------------------------------
391 template <typename PatternChar, typename SubjectChar>
392 size_t StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
393 StringSearch<PatternChar, SubjectChar>* search,
394 Vector<const SubjectChar> subject,
395 size_t start_index) {
396 Vector<const PatternChar> pattern = search->pattern_;
397 const size_t subject_length = subject.length();
398 const size_t pattern_length = pattern.length();
399 // Only preprocess at most kBMMaxShift last characters of pattern.
400 size_t start = search->start_;
402 int* bad_char_occurence = search->bad_char_table();
403 int* good_suffix_shift = search->good_suffix_shift_table();
405 PatternChar last_char = pattern[pattern_length - 1];
406 size_t index = start_index;
407 // Continue search from i.
408 while (index <= subject_length - pattern_length) {
409 size_t j = pattern_length - 1;
411 while (last_char != (c = subject[index + j])) {
412 int shift = j - CharOccurrence(bad_char_occurence, c);
414 if (index > subject_length - pattern_length) {
415 return subject.length();
418 while (j >= 0 && pattern[j] == (c = subject[index + j])) {
425 // we have matched more than our tables allow us to be smart about.
426 // Fall back on BMH shift.
427 index += pattern_length - 1 -
428 CharOccurrence(bad_char_occurence,
429 static_cast<SubjectChar>(last_char));
431 int gs_shift = good_suffix_shift[j + 1];
432 int bc_occ = CharOccurrence(bad_char_occurence, c);
433 int shift = j - bc_occ;
434 if (gs_shift > shift) {
441 return subject.length();
444 template <typename PatternChar, typename SubjectChar>
445 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
446 const size_t pattern_length = pattern_.length();
447 const PatternChar* pattern = pattern_.start();
448 // Only look at the last kBMMaxShift characters of pattern (from start_
449 // to pattern_length).
450 const size_t start = start_;
451 const size_t length = pattern_length - start;
453 // Biased tables so that we can use pattern indices as table indices,
454 // even if we only cover the part of the pattern from offset start.
455 int* shift_table = good_suffix_shift_table();
456 int* suffix_table = this->suffix_table();
459 for (size_t i = start; i < pattern_length; i++) {
460 shift_table[i] = length;
462 shift_table[pattern_length] = 1;
463 suffix_table[pattern_length] = pattern_length + 1;
465 if (pattern_length <= start) {
470 PatternChar last_char = pattern[pattern_length - 1];
471 size_t suffix = pattern_length + 1;
473 size_t i = pattern_length;
475 PatternChar c = pattern[i - 1];
476 while (suffix <= pattern_length && c != pattern[suffix - 1]) {
477 if (static_cast<size_t>(shift_table[suffix]) == length) {
478 shift_table[suffix] = suffix - i;
480 suffix = suffix_table[suffix];
482 suffix_table[--i] = --suffix;
483 if (suffix == pattern_length) {
484 // No suffix to extend, so we check against last_char only.
485 while ((i > start) && (pattern[i - 1] != last_char)) {
486 if (static_cast<size_t>(shift_table[pattern_length]) == length) {
487 shift_table[pattern_length] = pattern_length - i;
489 suffix_table[--i] = pattern_length;
492 suffix_table[--i] = --suffix;
497 // Build shift table using suffixes.
498 if (suffix < pattern_length) {
499 for (size_t i = start; i <= pattern_length; i++) {
500 if (static_cast<size_t>(shift_table[i]) == length) {
501 shift_table[i] = suffix - start;
504 suffix = suffix_table[suffix];
510 //---------------------------------------------------------------------
511 // Boyer-Moore-Horspool string search.
512 //---------------------------------------------------------------------
514 template <typename PatternChar, typename SubjectChar>
515 size_t StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
516 StringSearch<PatternChar, SubjectChar>* search,
517 Vector<const SubjectChar> subject,
518 size_t start_index) {
519 Vector<const PatternChar> pattern = search->pattern_;
520 const size_t subject_length = subject.length();
521 const size_t pattern_length = pattern.length();
522 int* char_occurrences = search->bad_char_table();
523 int64_t badness = -pattern_length;
525 // How bad we are doing without a good-suffix table.
526 PatternChar last_char = pattern[pattern_length - 1];
527 int last_char_shift =
529 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
532 size_t index = start_index; // No matches found prior to this index.
533 while (index <= subject_length - pattern_length) {
534 size_t j = pattern_length - 1;
536 while (last_char != (subject_char = subject[index + j])) {
537 int bc_occ = CharOccurrence(char_occurrences, subject_char);
538 int shift = j - bc_occ;
540 badness += 1 - shift; // at most zero, so badness cannot increase.
541 if (index > subject_length - pattern_length) {
542 return subject_length;
546 while (j >= 0 && pattern[j] == (subject[index + j])) {
552 index += last_char_shift;
553 // Badness increases by the number of characters we have
554 // checked, and decreases by the number of characters we
555 // can skip by shifting. It's a measure of how we are doing
556 // compared to reading each character exactly once.
557 badness += (pattern_length - j) - last_char_shift;
559 search->PopulateBoyerMooreTable();
560 search->strategy_ = &BoyerMooreSearch;
561 return BoyerMooreSearch(search, subject, index);
564 return subject.length();
567 template <typename PatternChar, typename SubjectChar>
568 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
569 const size_t pattern_length = pattern_.length();
571 int* bad_char_occurrence = bad_char_table();
573 // Only preprocess at most kBMMaxShift last characters of pattern.
574 const size_t start = start_;
575 // Run forwards to populate bad_char_table, so that *last* instance
576 // of character equivalence class is the one registered.
577 // Notice: Doesn't include the last character.
578 const size_t table_size = AlphabetSize();
580 // All patterns less than kBMMaxShift in length.
581 memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
583 for (size_t i = 0; i < table_size; i++) {
584 bad_char_occurrence[i] = start - 1;
587 for (size_t i = start; i < pattern_length - 1; i++) {
588 PatternChar c = pattern_[i];
589 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
590 bad_char_occurrence[bucket] = i;
594 //---------------------------------------------------------------------
595 // Linear string search with bailout to BMH.
596 //---------------------------------------------------------------------
598 // Simple linear search for short patterns, which bails out if the string
599 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
600 template <typename PatternChar, typename SubjectChar>
601 size_t StringSearch<PatternChar, SubjectChar>::InitialSearch(
602 StringSearch<PatternChar, SubjectChar>* search,
603 Vector<const SubjectChar> subject,
605 Vector<const PatternChar> pattern = search->pattern_;
606 const size_t pattern_length = pattern.length();
607 // Badness is a count of how much work we have done. When we have
608 // done enough work we decide it's probably worth switching to a better
610 int64_t badness = -10 - (pattern_length << 2);
612 // We know our pattern is at least 2 characters, we cache the first so
613 // the common case of the first character not matching is faster.
614 for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) {
617 i = FindFirstCharacter(pattern, subject, i);
618 if (i == subject.length())
619 return subject.length();
623 if (pattern[j] != subject[i + j]) {
627 } while (j < pattern_length);
628 if (j == pattern_length) {
633 search->PopulateBoyerMooreHorspoolTable();
634 search->strategy_ = &BoyerMooreHorspoolSearch;
635 return BoyerMooreHorspoolSearch(search, subject, i);
638 return subject.length();
641 // Perform a a single stand-alone search.
642 // If searching multiple times for the same pattern, a search
643 // object should be constructed once and the Search function then called
645 template <typename SubjectChar, typename PatternChar>
646 size_t SearchString(Vector<const SubjectChar> subject,
647 Vector<const PatternChar> pattern,
648 size_t start_index) {
649 StringSearch<PatternChar, SubjectChar> search(pattern);
650 return search.Search(subject, start_index);
653 } // namespace node::stringsearch
656 using node::stringsearch::Vector;
658 template <typename SubjectChar, typename PatternChar>
659 size_t SearchString(const SubjectChar* haystack,
660 size_t haystack_length,
661 const PatternChar* needle,
662 size_t needle_length,
663 size_t start_index) {
664 return node::stringsearch::SearchString(
665 Vector<const SubjectChar>(haystack, haystack_length),
666 Vector<const PatternChar>(needle, needle_length),
671 #endif // SRC_STRING_SEARCH_H_