1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef V8_STRING_SEARCH_H_
6 #define V8_STRING_SEARCH_H_
12 //---------------------------------------------------------------------
13 // String Search object.
14 //---------------------------------------------------------------------
16 // Class holding constants and methods that apply to all string search variants,
17 // independently of subject and pattern char size.
18 class StringSearchBase {
20 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
21 // limit, we can fix the size of tables. For a needle longer than this limit,
22 // search will not be optimal, since we only build tables for a suffix
23 // of the string, but it is a safe approximation.
24 static const int kBMMaxShift = Isolate::kBMMaxShift;
26 // Reduce alphabet to this size.
27 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
28 // proportional to the input alphabet. We reduce the alphabet size by
29 // equating input characters modulo a smaller alphabet size. This gives
30 // a potentially less efficient searching, but is a safe approximation.
31 // For needles using only characters in the same Unicode 256-code point page,
32 // there is no search speed degradation.
33 static const int kAsciiAlphabetSize = 256;
34 static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
36 // Bad-char shift table stored in the state. It's length is the alphabet size.
37 // For patterns below this length, the skip length of Boyer-Moore is too short
38 // to compensate for the algorithmic overhead compared to simple brute force.
39 static const int kBMMinPatternLength = 7;
41 static inline bool IsOneByteString(Vector<const uint8_t> string) {
45 static inline bool IsOneByteString(Vector<const uc16> string) {
46 return String::IsOneByte(string.start(), string.length());
53 template <typename PatternChar, typename SubjectChar>
54 class StringSearch : private StringSearchBase {
56 StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
59 start_(Max(0, pattern.length() - kBMMaxShift)) {
60 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
61 if (!IsOneByteString(pattern_)) {
62 strategy_ = &FailSearch;
66 int pattern_length = pattern_.length();
67 if (pattern_length < kBMMinPatternLength) {
68 if (pattern_length == 1) {
69 strategy_ = &SingleCharSearch;
72 strategy_ = &LinearSearch;
75 strategy_ = &InitialSearch;
78 int Search(Vector<const SubjectChar> subject, int index) {
79 return strategy_(this, subject, index);
82 static inline int AlphabetSize() {
83 if (sizeof(PatternChar) == 1) {
85 return kAsciiAlphabetSize;
87 DCHECK(sizeof(PatternChar) == 2);
89 return kUC16AlphabetSize;
94 typedef int (*SearchFunction)( // NOLINT - it's not a cast!
95 StringSearch<PatternChar, SubjectChar>*,
96 Vector<const SubjectChar>,
99 static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
100 Vector<const SubjectChar>,
105 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
106 Vector<const SubjectChar> subject,
109 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
110 Vector<const SubjectChar> subject,
113 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
114 Vector<const SubjectChar> subject,
117 static int BoyerMooreHorspoolSearch(
118 StringSearch<PatternChar, SubjectChar>* search,
119 Vector<const SubjectChar> subject,
122 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
123 Vector<const SubjectChar> subject,
126 void PopulateBoyerMooreHorspoolTable();
128 void PopulateBoyerMooreTable();
130 static inline bool exceedsOneByte(uint8_t c) {
134 static inline bool exceedsOneByte(uint16_t c) {
135 return c > String::kMaxOneByteCharCodeU;
138 static inline int CharOccurrence(int* bad_char_occurrence,
139 SubjectChar char_code) {
140 if (sizeof(SubjectChar) == 1) {
141 return bad_char_occurrence[static_cast<int>(char_code)];
143 if (sizeof(PatternChar) == 1) {
144 if (exceedsOneByte(char_code)) {
147 return bad_char_occurrence[static_cast<unsigned int>(char_code)];
149 // Both pattern and subject are UC16. Reduce character to equivalence class.
150 int equiv_class = char_code % kUC16AlphabetSize;
151 return bad_char_occurrence[equiv_class];
154 // The following tables are shared by all searches.
155 // TODO(lrn): Introduce a way for a pattern to keep its tables
156 // between searches (e.g., for an Atom RegExp).
158 // Store for the BoyerMoore(Horspool) bad char shift table.
159 // Return a table covering the last kBMMaxShift+1 positions of
161 int* bad_char_table() {
162 return isolate_->bad_char_shift_table();
165 // Store for the BoyerMoore good suffix shift table.
166 int* good_suffix_shift_table() {
167 // Return biased pointer that maps the range [start_..pattern_.length()
168 // to the kGoodSuffixShiftTable array.
169 return isolate_->good_suffix_shift_table() - start_;
172 // Table used temporarily while building the BoyerMoore good suffix
174 int* suffix_table() {
175 // Return biased pointer that maps the range [start_..pattern_.length()
176 // to the kSuffixTable array.
177 return isolate_->suffix_table() - start_;
181 // The pattern to search for.
182 Vector<const PatternChar> pattern_;
183 // Pointer to implementation of the search.
184 SearchFunction strategy_;
185 // Cache value of Max(0, pattern_length() - kBMMaxShift)
190 //---------------------------------------------------------------------
191 // Single Character Pattern Search Strategy
192 //---------------------------------------------------------------------
194 template <typename PatternChar, typename SubjectChar>
195 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
196 StringSearch<PatternChar, SubjectChar>* search,
197 Vector<const SubjectChar> subject,
199 DCHECK_EQ(1, search->pattern_.length());
200 PatternChar pattern_first_char = search->pattern_[0];
202 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
203 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
204 memchr(subject.start() + i,
206 subject.length() - i));
207 if (pos == NULL) return -1;
208 return static_cast<int>(pos - subject.start());
210 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
211 if (exceedsOneByte(pattern_first_char)) {
215 SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
216 int n = subject.length();
218 if (subject[i++] == search_char) return i - 1;
224 //---------------------------------------------------------------------
225 // Linear Search Strategy
226 //---------------------------------------------------------------------
229 template <typename PatternChar, typename SubjectChar>
230 inline bool CharCompare(const PatternChar* pattern,
231 const SubjectChar* subject,
236 if (pattern[pos] != subject[pos]) {
240 } while (pos < length);
245 // Simple linear search for short patterns. Never bails out.
246 template <typename PatternChar, typename SubjectChar>
247 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
248 StringSearch<PatternChar, SubjectChar>* search,
249 Vector<const SubjectChar> subject,
251 Vector<const PatternChar> pattern = search->pattern_;
252 DCHECK(pattern.length() > 1);
253 int pattern_length = pattern.length();
254 PatternChar pattern_first_char = pattern[0];
256 int n = subject.length() - pattern_length;
258 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
259 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
260 memchr(subject.start() + i,
263 if (pos == NULL) return -1;
264 i = static_cast<int>(pos - subject.start()) + 1;
266 if (subject[i++] != pattern_first_char) continue;
268 // Loop extracted to separate function to allow using return to do
270 if (CharCompare(pattern.start() + 1,
272 pattern_length - 1)) {
279 //---------------------------------------------------------------------
280 // Boyer-Moore string search
281 //---------------------------------------------------------------------
283 template <typename PatternChar, typename SubjectChar>
284 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
285 StringSearch<PatternChar, SubjectChar>* search,
286 Vector<const SubjectChar> subject,
288 Vector<const PatternChar> pattern = search->pattern_;
289 int subject_length = subject.length();
290 int pattern_length = pattern.length();
291 // Only preprocess at most kBMMaxShift last characters of pattern.
292 int start = search->start_;
294 int* bad_char_occurence = search->bad_char_table();
295 int* good_suffix_shift = search->good_suffix_shift_table();
297 PatternChar last_char = pattern[pattern_length - 1];
298 int index = start_index;
299 // Continue search from i.
300 while (index <= subject_length - pattern_length) {
301 int j = pattern_length - 1;
303 while (last_char != (c = subject[index + j])) {
305 j - CharOccurrence(bad_char_occurence, c);
307 if (index > subject_length - pattern_length) {
311 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
314 } else if (j < start) {
315 // we have matched more than our tables allow us to be smart about.
316 // Fall back on BMH shift.
317 index += pattern_length - 1
318 - CharOccurrence(bad_char_occurence,
319 static_cast<SubjectChar>(last_char));
321 int gs_shift = good_suffix_shift[j + 1];
323 CharOccurrence(bad_char_occurence, c);
324 int shift = j - bc_occ;
325 if (gs_shift > shift) {
336 template <typename PatternChar, typename SubjectChar>
337 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
338 int pattern_length = pattern_.length();
339 const PatternChar* pattern = pattern_.start();
340 // Only look at the last kBMMaxShift characters of pattern (from start_
341 // to pattern_length).
343 int length = pattern_length - start;
345 // Biased tables so that we can use pattern indices as table indices,
346 // even if we only cover the part of the pattern from offset start.
347 int* shift_table = good_suffix_shift_table();
348 int* suffix_table = this->suffix_table();
351 for (int i = start; i < pattern_length; i++) {
352 shift_table[i] = length;
354 shift_table[pattern_length] = 1;
355 suffix_table[pattern_length] = pattern_length + 1;
357 if (pattern_length <= start) {
362 PatternChar last_char = pattern[pattern_length - 1];
363 int suffix = pattern_length + 1;
365 int i = pattern_length;
367 PatternChar c = pattern[i - 1];
368 while (suffix <= pattern_length && c != pattern[suffix - 1]) {
369 if (shift_table[suffix] == length) {
370 shift_table[suffix] = suffix - i;
372 suffix = suffix_table[suffix];
374 suffix_table[--i] = --suffix;
375 if (suffix == pattern_length) {
376 // No suffix to extend, so we check against last_char only.
377 while ((i > start) && (pattern[i - 1] != last_char)) {
378 if (shift_table[pattern_length] == length) {
379 shift_table[pattern_length] = pattern_length - i;
381 suffix_table[--i] = pattern_length;
384 suffix_table[--i] = --suffix;
389 // Build shift table using suffixes.
390 if (suffix < pattern_length) {
391 for (int i = start; i <= pattern_length; i++) {
392 if (shift_table[i] == length) {
393 shift_table[i] = suffix - start;
396 suffix = suffix_table[suffix];
402 //---------------------------------------------------------------------
403 // Boyer-Moore-Horspool string search.
404 //---------------------------------------------------------------------
406 template <typename PatternChar, typename SubjectChar>
407 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
408 StringSearch<PatternChar, SubjectChar>* search,
409 Vector<const SubjectChar> subject,
411 Vector<const PatternChar> pattern = search->pattern_;
412 int subject_length = subject.length();
413 int pattern_length = pattern.length();
414 int* char_occurrences = search->bad_char_table();
415 int badness = -pattern_length;
417 // How bad we are doing without a good-suffix table.
418 PatternChar last_char = pattern[pattern_length - 1];
419 int last_char_shift = pattern_length - 1 -
420 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
422 int index = start_index; // No matches found prior to this index.
423 while (index <= subject_length - pattern_length) {
424 int j = pattern_length - 1;
426 while (last_char != (subject_char = subject[index + j])) {
427 int bc_occ = CharOccurrence(char_occurrences, subject_char);
428 int shift = j - bc_occ;
430 badness += 1 - shift; // at most zero, so badness cannot increase.
431 if (index > subject_length - pattern_length) {
436 while (j >= 0 && pattern[j] == (subject[index + j])) j--;
440 index += last_char_shift;
441 // Badness increases by the number of characters we have
442 // checked, and decreases by the number of characters we
443 // can skip by shifting. It's a measure of how we are doing
444 // compared to reading each character exactly once.
445 badness += (pattern_length - j) - last_char_shift;
447 search->PopulateBoyerMooreTable();
448 search->strategy_ = &BoyerMooreSearch;
449 return BoyerMooreSearch(search, subject, index);
457 template <typename PatternChar, typename SubjectChar>
458 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
459 int pattern_length = pattern_.length();
461 int* bad_char_occurrence = bad_char_table();
463 // Only preprocess at most kBMMaxShift last characters of pattern.
465 // Run forwards to populate bad_char_table, so that *last* instance
466 // of character equivalence class is the one registered.
467 // Notice: Doesn't include the last character.
468 int table_size = AlphabetSize();
469 if (start == 0) { // All patterns less than kBMMaxShift in length.
470 memset(bad_char_occurrence,
472 table_size * sizeof(*bad_char_occurrence));
474 for (int i = 0; i < table_size; i++) {
475 bad_char_occurrence[i] = start - 1;
478 for (int i = start; i < pattern_length - 1; i++) {
479 PatternChar c = pattern_[i];
480 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
481 bad_char_occurrence[bucket] = i;
485 //---------------------------------------------------------------------
486 // Linear string search with bailout to BMH.
487 //---------------------------------------------------------------------
489 // Simple linear search for short patterns, which bails out if the string
490 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
491 template <typename PatternChar, typename SubjectChar>
492 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
493 StringSearch<PatternChar, SubjectChar>* search,
494 Vector<const SubjectChar> subject,
496 Vector<const PatternChar> pattern = search->pattern_;
497 int pattern_length = pattern.length();
498 // Badness is a count of how much work we have done. When we have
499 // done enough work we decide it's probably worth switching to a better
501 int badness = -10 - (pattern_length << 2);
503 // We know our pattern is at least 2 characters, we cache the first so
504 // the common case of the first character not matching is faster.
505 PatternChar pattern_first_char = pattern[0];
506 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
509 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
510 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
511 memchr(subject.start() + i,
517 i = static_cast<int>(pos - subject.start());
519 if (subject[i] != pattern_first_char) continue;
523 if (pattern[j] != subject[i + j]) {
527 } while (j < pattern_length);
528 if (j == pattern_length) {
533 search->PopulateBoyerMooreHorspoolTable();
534 search->strategy_ = &BoyerMooreHorspoolSearch;
535 return BoyerMooreHorspoolSearch(search, subject, i);
542 // Perform a a single stand-alone search.
543 // If searching multiple times for the same pattern, a search
544 // object should be constructed once and the Search function then called
546 template <typename SubjectChar, typename PatternChar>
547 int SearchString(Isolate* isolate,
548 Vector<const SubjectChar> subject,
549 Vector<const PatternChar> pattern,
551 StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
552 return search.Search(subject, start_index);
555 }} // namespace v8::internal
557 #endif // V8_STRING_SEARCH_H_