1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/history/url_index_private_data.h"
14 #include "base/basictypes.h"
15 #include "base/files/file_util.h"
16 #include "base/i18n/break_iterator.h"
17 #include "base/i18n/case_conversion.h"
18 #include "base/metrics/histogram.h"
19 #include "base/strings/string_util.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "base/time/time.h"
22 #include "chrome/browser/history/history_database.h"
23 #include "chrome/browser/history/history_db_task.h"
24 #include "chrome/browser/history/history_service.h"
25 #include "chrome/browser/history/in_memory_url_index.h"
26 #include "components/bookmarks/browser/bookmark_utils.h"
27 #include "components/history/core/browser/history_client.h"
28 #include "net/base/net_util.h"
30 #if defined(USE_SYSTEM_PROTOBUF)
31 #include <google/protobuf/repeated_field.h>
33 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
using google::protobuf::RepeatedField;
using google::protobuf::RepeatedPtrField;
using in_memory_url_index::InMemoryURLIndexCacheItem;

// Maximum number of recent visits retained per indexed URL; bounds both the
// in-memory visit data and what is written to the cache file.
// NOTE(review): the opening 'namespace {' matching the brace closed below is
// not visible in this listing — verify against version control.
static const size_t kMaxVisitsToStoreInCache = 10u;
}  // anonymous namespace
// Short aliases for the deeply nested protobuf message types generated for
// the InMemoryURLIndex cache file.
// NOTE(review): several typedef lines in this listing appear truncated
// (missing alias names and/or 'typedef imui::' prefixes) — verify against
// version control before building.
typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
    InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
    WordIDHistoryMapEntry;
typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
    InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
    HistoryInfoMapEntry_VisitInfo;
typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
68 // Algorithm Functions ---------------------------------------------------------
70 // Comparison function for sorting search terms by descending length.
71 bool LengthGreater(const base::string16& string_a,
72 const base::string16& string_b) {
73 return string_a.length() > string_b.length();
77 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
79 // HistoryDBTask used to update the recent visit data for a particular
80 // row from the history database.
81 class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
83 explicit UpdateRecentVisitsFromHistoryDBTask(
84 URLIndexPrivateData* private_data,
87 bool RunOnDBThread(HistoryBackend* backend,
88 history::HistoryDatabase* db) override;
89 void DoneRunOnMainThread() override;
92 ~UpdateRecentVisitsFromHistoryDBTask() override;
94 // The URLIndexPrivateData that gets updated after the historyDB
96 URLIndexPrivateData* private_data_;
97 // The ID of the URL to get visits for and then update.
99 // Whether fetching the recent visits for the URL succeeded.
101 // The awaited data that's shown to private_data_ for it to copy and
103 VisitVector recent_visits_;
105 DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
108 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
109 URLIndexPrivateData* private_data,
111 : private_data_(private_data),
116 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
117 HistoryBackend* backend,
118 HistoryDatabase* db) {
119 // Make sure the private data is going to get as many recent visits as
120 // ScoredHistoryMatch::GetFrequency() hopes to use.
121 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
122 succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
123 kMaxVisitsToStoreInCache,
126 recent_visits_.clear();
127 return true; // Always claim to be done; do not retry failures.
130 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
132 private_data_->UpdateRecentVisits(url_id_, recent_visits_);
135 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
139 // URLIndexPrivateData ---------------------------------------------------------
141 URLIndexPrivateData::URLIndexPrivateData()
142 : restored_cache_version_(0),
143 saved_cache_version_(kCurrentCacheFileVersion),
144 pre_filter_item_count_(0),
145 post_filter_item_count_(0),
146 post_scoring_item_count_(0) {
// Performs the omnibox search: reduces the index to candidate history items
// matching |search_string|, then scores, sorts and returns them.
// NOTE(review): this listing appears to have lost lines (e.g. the
// |max_matches| parameter used below, several closing braces and early
// returns) — verify against version control before relying on it.
ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
    base::string16 search_string,
    size_t cursor_position,
    const std::string& languages,
    HistoryClient* history_client) {
  // If cursor position is set and useful (not at either end of the
  // string), allow the search string to be broken at cursor position.
  // We do this by pretending there's a space where the cursor is.
  if ((cursor_position != base::string16::npos) &&
      (cursor_position < search_string.length()) &&
      (cursor_position > 0)) {
    search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
  pre_filter_item_count_ = 0;
  post_filter_item_count_ = 0;
  post_scoring_item_count_ = 0;
  // The search string we receive may contain escaped characters. For reducing
  // the index we need individual, lower-cased words, ignoring escapings. For
  // the final filtering we need whitespace separated substrings possibly
  // containing escaped characters.
  base::string16 lower_raw_string(base::i18n::ToLower(search_string));
  base::string16 lower_unescaped_string =
      net::UnescapeURLComponent(lower_raw_string,
          net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
  // Extract individual 'words' (as opposed to 'terms'; see below) from the
  // search string. When the user types "colspec=ID%20Mstone Release" we get
  // four 'words': "colspec", "id", "mstone" and "release".
  String16Vector lower_words(
      history::String16VectorFromString16(lower_unescaped_string, false, NULL));
  ScoredHistoryMatches scored_items;

  // Do nothing if we have indexed no words (probably because we've not been
  // initialized yet) or the search string has no words.
  if (word_list_.empty() || lower_words.empty()) {
    search_term_cache_.clear();  // Invalidate the term cache.

  // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
  ResetSearchTermCache();
  HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);

  // Trim the candidate pool if it is large. Note that we do not filter out
  // items that do not contain the search terms as proper substrings -- doing
  // so is the performance-costly operation we are trying to avoid in order
  // to maintain omnibox responsiveness.
  const size_t kItemsToScoreLimit = 500;
  pre_filter_item_count_ = history_id_set.size();
  // If we trim the results set we do not want to cache the results for next
  // time as the user's ultimately desired result could easily be eliminated
  // in this early rough filter.
  bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
  HistoryIDVector history_ids;
  std::copy(history_id_set.begin(), history_id_set.end(),
            std::back_inserter(history_ids));
  // Trim down the set by sorting by typed-count, visit-count, and last
  HistoryItemFactorGreater
      item_factor_functor(history_info_map_);
  std::partial_sort(history_ids.begin(),
                    history_ids.begin() + kItemsToScoreLimit,
                    item_factor_functor);
  history_id_set.clear();
  std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
            std::inserter(history_id_set, history_id_set.end()));
  post_filter_item_count_ = history_id_set.size();

  // Pass over all of the candidates filtering out any without a proper
  // substring match, inserting those which pass in order by score. Note that
  // in this step we are using the raw search string complete with escaped
  // URL elements. When the user has specifically typed something akin to
  // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
  // specific substring appears in the URL or page title.

  // We call these 'terms' (as opposed to 'words'; see above) as in this case
  // we only want to break up the search string on 'true' whitespace rather than
  // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
  // get two 'terms': "colspec=id%20mstone" and "release".
  history::String16Vector lower_raw_terms;
  if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
               &lower_raw_terms) == 0) {
    // Don't score matches when there are no terms to score against. (It's
    // possible that the word break iterater that extracts words to search
    // for in the database allows some whitespace "words" whereas Tokenize
    // excludes a long list of whitespace.) One could write a scoring
    // function that gives a reasonable order to matches when there
    // are no terms (i.e., all the words are some form of whitespace),
    // but this is such a rare edge case that it's not worth the time.
  scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
      AddHistoryMatch(*this, languages, history_client, lower_raw_string,
                      lower_raw_terms, base::Time::Now())).ScoredMatches();

  // Select and sort only the top |max_matches| results.
  if (scored_items.size() > max_matches) {
    std::partial_sort(scored_items.begin(),
                      scored_items.begin() +
                      ScoredHistoryMatch::MatchScoreGreater);
    scored_items.resize(max_matches);
  std::sort(scored_items.begin(), scored_items.end(),
            ScoredHistoryMatch::MatchScoreGreater);
  post_scoring_item_count_ = scored_items.size();
  search_term_cache_.clear();  // Invalidate the term cache.

  // Remove any stale SearchTermCacheItems.
  for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
       cache_iter != search_term_cache_.end(); ) {
    // Sweep entries not marked 'used' during this search.
    if (!cache_iter->second.used_)
      search_term_cache_.erase(cache_iter++);
// Adds, refreshes, or removes the index entry for a history row depending on
// whether it (still) qualifies as significant. Returns true if the index
// was modified.
// NOTE(review): lines appear missing from this listing (the URLRow |row|
// parameter used throughout, an 'else' arm, several closing braces) —
// verify against version control.
bool URLIndexPrivateData::UpdateURL(
    HistoryService* history_service,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  // The row may or may not already be in our index. If it is not already
  // indexed and it qualifies then it gets indexed. If it is already
  // indexed and still qualifies then it gets updated, otherwise it
  // is deleted from the index.
  bool row_was_updated = false;
  URLID row_id = row.id();
  HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
  if (row_pos == history_info_map_.end()) {
    // This new row should be indexed if it qualifies.
    new_row.set_id(row_id);
    row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
  } else if (RowQualifiesAsSignificant(row, base::Time())) {
    // This indexed row still qualifies and will be re-indexed.
    // The url won't have changed but the title, visit count, etc.
    // might have changed.
    URLRow& row_to_update = row_pos->second.url_row;
    bool title_updated = row_to_update.title() != row.title();
    if (row_to_update.visit_count() != row.visit_count() ||
        row_to_update.typed_count() != row.typed_count() ||
        row_to_update.last_visit() != row.last_visit() || title_updated) {
      row_to_update.set_visit_count(row.visit_count());
      row_to_update.set_typed_count(row.typed_count());
      row_to_update.set_last_visit(row.last_visit());
      // If something appears to have changed, update the recent visits
      ScheduleUpdateRecentVisits(history_service, row_id, tracker);
      // While the URL is guaranteed to remain stable, the title may have
      // changed. If so, then update the index with the changed words.
      // Clear all words associated with this row and re-index both the
      RemoveRowWordsFromIndex(row_to_update);
      row_to_update.set_title(row.title());
      RowWordStarts word_starts;
      AddRowWordsToIndex(row_to_update, &word_starts, languages);
      word_starts_map_[row_id] = word_starts;
      row_was_updated = true;
    // This indexed row no longer qualifies and will be de-indexed by
    // clearing all words associated with this row.
    RemoveRowFromIndex(row);
    row_was_updated = true;
  search_term_cache_.clear();  // This invalidates the cache.
  return row_was_updated;
342 void URLIndexPrivateData::UpdateRecentVisits(
344 const VisitVector& recent_visits) {
345 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
346 if (row_pos != history_info_map_.end()) {
347 VisitInfoVector* visits = &row_pos->second.visits;
350 std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
351 visits->reserve(size);
352 for (size_t i = 0; i < size; i++) {
353 // Copy from the VisitVector the only fields visits needs.
354 visits->push_back(std::make_pair(recent_visits[i].visit_time,
355 recent_visits[i].transition));
358 // Else: Oddly, the URL doesn't seem to exist in the private index.
359 // Ignore this update. This can happen if, for instance, the user
360 // removes the URL from URLIndexPrivateData before the historyDB call
364 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
365 HistoryService* history_service,
367 base::CancelableTaskTracker* tracker) {
368 history_service->ScheduleDBTask(
369 scoped_ptr<history::HistoryDBTask>(
370 new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
373 // Helper functor for DeleteURL.
374 class HistoryInfoMapItemHasURL {
376 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
378 bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
379 return item.second.url_row.url() == url_;
386 bool URLIndexPrivateData::DeleteURL(const GURL& url) {
387 // Find the matching entry in the history_info_map_.
388 HistoryInfoMap::iterator pos = std::find_if(
389 history_info_map_.begin(),
390 history_info_map_.end(),
391 HistoryInfoMapItemHasURL(url));
392 if (pos == history_info_map_.end())
394 RemoveRowFromIndex(pos->second.url_row);
395 search_term_cache_.clear(); // This invalidates the cache.
400 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
401 const base::FilePath& file_path,
402 const std::string& languages) {
403 base::TimeTicks beginning_time = base::TimeTicks::Now();
404 if (!base::PathExists(file_path))
407 // If there is no cache file then simply give up. This will cause us to
408 // attempt to rebuild from the history database.
409 if (!base::ReadFileToString(file_path, &data))
412 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
413 InMemoryURLIndexCacheItem index_cache;
414 if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
415 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
416 << file_path.value();
417 return restored_data;
420 if (!restored_data->RestorePrivateData(index_cache, languages))
423 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
424 base::TimeTicks::Now() - beginning_time);
425 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
426 restored_data->history_id_word_map_.size());
427 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
428 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
429 restored_data->word_map_.size());
430 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
431 restored_data->char_word_map_.size());
432 if (restored_data->Empty())
433 return NULL; // 'No data' is the same as a failed reload.
434 return restored_data;
438 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
439 HistoryDatabase* history_db,
440 const std::string& languages,
441 const std::set<std::string>& scheme_whitelist) {
445 base::TimeTicks beginning_time = base::TimeTicks::Now();
447 scoped_refptr<URLIndexPrivateData>
448 rebuilt_data(new URLIndexPrivateData);
449 URLDatabase::URLEnumerator history_enum;
450 if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
452 rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
453 for (URLRow row; history_enum.GetNextURL(&row); ) {
454 rebuilt_data->IndexRow(
455 history_db, NULL, row, languages, scheme_whitelist, NULL);
458 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
459 base::TimeTicks::Now() - beginning_time);
460 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
461 rebuilt_data->history_id_word_map_.size());
462 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
463 rebuilt_data->word_map_.size());
464 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
465 rebuilt_data->char_word_map_.size());
470 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
471 scoped_refptr<URLIndexPrivateData> private_data,
472 const base::FilePath& file_path) {
473 DCHECK(private_data.get());
474 DCHECK(!file_path.empty());
475 return private_data->SaveToFile(file_path);
478 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
479 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
480 data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
481 data_copy->word_list_ = word_list_;
482 data_copy->available_words_ = available_words_;
483 data_copy->word_map_ = word_map_;
484 data_copy->char_word_map_ = char_word_map_;
485 data_copy->word_id_history_map_ = word_id_history_map_;
486 data_copy->history_id_word_map_ = history_id_word_map_;
487 data_copy->history_info_map_ = history_info_map_;
488 data_copy->word_starts_map_ = word_starts_map_;
491 // search_term_cache_
492 // pre_filter_item_count_
493 // post_filter_item_count_
494 // post_scoring_item_count_
497 bool URLIndexPrivateData::Empty() const {
498 return history_info_map_.empty();
501 void URLIndexPrivateData::Clear() {
502 last_time_rebuilt_from_history_ = base::Time();
504 available_words_.clear();
506 char_word_map_.clear();
507 word_id_history_map_.clear();
508 history_id_word_map_.clear();
509 history_info_map_.clear();
510 word_starts_map_.clear();
513 URLIndexPrivateData::~URLIndexPrivateData() {}
// Returns the history IDs matching ALL of |unsorted_words| by intersecting
// each word's candidate set.
// NOTE(review): loop-increment, early-exit and closing-brace lines appear
// missing from this listing — verify against version control.
HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
    const String16Vector& unsorted_words) {
  // Break the terms down into individual terms (words), get the candidate
  // set for each term, and intersect each to get a final candidate list.
  // Note that a single 'term' from the user's perspective might be
  // a string like "http://www.somewebsite.com" which, from our perspective,
  // is four words: 'http', 'www', 'somewebsite', and 'com'.
  HistoryIDSet history_id_set;
  String16Vector words(unsorted_words);
  // Sort the words into the longest first as such are likely to narrow down
  // the results quicker. Also, single character words are the most expensive
  // to process so save them for last.
  std::sort(words.begin(), words.end(), LengthGreater);
  for (String16Vector::iterator iter = words.begin(); iter != words.end();
    base::string16 uni_word = *iter;
    HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
    // An empty set for any word empties the whole intersection.
    if (term_history_set.empty()) {
      history_id_set.clear();
    if (iter == words.begin()) {
      // The first word's candidates seed the running intersection.
      history_id_set.swap(term_history_set);
      HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(
          history_id_set, term_history_set);
      history_id_set.swap(new_history_id_set);
  return history_id_set;
// Returns the history IDs whose indexed words contain |term| as a
// substring, using (and refreshing) the search-term prefix cache.
// NOTE(review): lines appear missing from this listing (the guard before
// the first early return, several 'else' lines and closing braces) —
// verify against version control.
HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
    const base::string16& term) {
    return HistoryIDSet();

  // TODO(mrossetti): Consider optimizing for very common terms such as
  // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
  // occuring words in the user's searches.
  size_t term_length = term.length();
  WordIDSet word_id_set;
  if (term_length > 1) {
    // See if this term or a prefix thereof is present in the cache.
    SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
    for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
         cache_iter != search_term_cache_.end(); ++cache_iter) {
      // Prefer the longest cached prefix of |term|.
      if (StartsWith(term, cache_iter->first, false) &&
          (best_prefix == search_term_cache_.end() ||
           cache_iter->first.length() > best_prefix->first.length()))
        best_prefix = cache_iter;

    // If a prefix was found then determine the leftover characters to be used
    // for further refining the results from that prefix.
    Char16Set prefix_chars;
    base::string16 leftovers(term);
    if (best_prefix != search_term_cache_.end()) {
      // If the prefix is an exact match for the term then grab the cached
      // results and we're done.
      size_t prefix_length = best_prefix->first.length();
      if (prefix_length == term_length) {
        best_prefix->second.used_ = true;
        return best_prefix->second.history_id_set_;

      // Otherwise we have a handy starting point.
      // If there are no history results for this prefix then we can bail early
      // as there will be no history results for the full term.
      if (best_prefix->second.history_id_set_.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      word_id_set = best_prefix->second.word_id_set_;
      prefix_chars = Char16SetFromString16(best_prefix->first);
      leftovers = term.substr(prefix_length);

    // Filter for each remaining, unique character in the term.
    Char16Set leftover_chars = Char16SetFromString16(leftovers);
    Char16Set unique_chars =
        base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);

    // Reduce the word set with any leftover, unprocessed characters.
    if (!unique_chars.empty()) {
      WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
      // We might come up empty on the leftovers.
      if (leftover_set.empty()) {
        search_term_cache_[term] = SearchTermCacheItem();
        return HistoryIDSet();
      // Or there may not have been a prefix from which to start.
      if (prefix_chars.empty()) {
        word_id_set.swap(leftover_set);
        WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
            word_id_set, leftover_set);
        word_id_set.swap(new_word_id_set);

    // We must filter the word list because the resulting word set surely
    // contains words which do not have the search term as a proper subset.
    for (WordIDSet::iterator word_set_iter = word_id_set.begin();
         word_set_iter != word_id_set.end(); ) {
      if (word_list_[*word_set_iter].find(term) == base::string16::npos)
        word_id_set.erase(word_set_iter++);
    word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));

  // If any words resulted then we can compose a set of history IDs by unioning
  // the sets from each word.
  HistoryIDSet history_id_set;
  if (!word_id_set.empty()) {
    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
         word_id_iter != word_id_set.end(); ++word_id_iter) {
      WordID word_id = *word_id_iter;
      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
      if (word_iter != word_id_history_map_.end()) {
        HistoryIDSet& word_history_id_set(word_iter->second);
        history_id_set.insert(word_history_id_set.begin(),
                              word_history_id_set.end());

  // Record a new cache entry for this word if the term is longer than
  // a single character.
  search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);

  return history_id_set;
// Returns the set of WordIDs for words containing every character in
// |term_chars| by intersecting each character's posting set.
// NOTE(review): early-return and closing-brace lines appear missing from
// this listing — verify against version control.
WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
    const Char16Set& term_chars) {
  WordIDSet word_id_set;
  for (Char16Set::const_iterator c_iter = term_chars.begin();
       c_iter != term_chars.end(); ++c_iter) {
    CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
    if (char_iter == char_word_map_.end()) {
      // A character was not found so there are no matching results: bail.
    WordIDSet& char_word_id_set(char_iter->second);
    // It is possible for there to no longer be any words associated with
    // a particular character. Give up in that case.
    if (char_word_id_set.empty()) {
    if (c_iter == term_chars.begin()) {
      // First character results becomes base set of results.
      word_id_set = char_word_id_set;
      // Subsequent character results get intersected in.
      WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
          word_id_set, char_word_id_set);
      word_id_set.swap(new_word_id_set);
// Indexes one history row (when its scheme is whitelisted): stores a
// credential-stripped copy in history_info_map_, indexes its URL and title
// words, and either fetches recent visits directly from |history_db| or
// schedules an async fetch through |history_service|.
// NOTE(review): lines appear missing from this listing (the URLRow |row|
// parameter, early returns, the trailing net::FormatUrl arguments, branch
// and closing braces) — verify against version control.
bool URLIndexPrivateData::IndexRow(
    HistoryDatabase* history_db,
    HistoryService* history_service,
    const std::string& languages,
    const std::set<std::string>& scheme_whitelist,
    base::CancelableTaskTracker* tracker) {
  const GURL& gurl(row.url());

  // Index only URLs with a whitelisted scheme.
  if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))

  URLID row_id = row.id();
  // Strip out username and password before saving and indexing.
  base::string16 url(net::FormatUrl(gurl, languages,
      net::kFormatUrlOmitUsernamePassword,
      net::UnescapeRule::NONE,

  HistoryID history_id = static_cast<HistoryID>(row_id);
  DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());

  // Add the row for quick lookup in the history info store.
  URLRow new_row(GURL(url), row_id);
  new_row.set_visit_count(row.visit_count());
  new_row.set_typed_count(row.typed_count());
  new_row.set_last_visit(row.last_visit());
  new_row.set_title(row.title());
  history_info_map_[history_id].url_row = new_row;

  // Index the words contained in the URL and title of the row.
  RowWordStarts word_starts;
  AddRowWordsToIndex(new_row, &word_starts, languages);
  word_starts_map_[history_id] = word_starts;

  // Update the recent visits information or schedule the update
  // We'd like to check that we're on the history DB thread.
  // However, unittest code actually calls this on the UI thread.
  // So we don't do any thread checks.
  VisitVector recent_visits;
  // Make sure the private data is going to get as many recent visits as
  // ScoredHistoryMatch::GetFrequency() hopes to use.
  DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
  if (history_db->GetMostRecentVisitsForURL(row_id,
                                            kMaxVisitsToStoreInCache,
    UpdateRecentVisits(row_id, recent_visits);
    DCHECK(history_service);
    ScheduleUpdateRecentVisits(history_service, row_id, tracker);
745 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
746 RowWordStarts* word_starts,
747 const std::string& languages) {
748 HistoryID history_id = static_cast<HistoryID>(row.id());
749 // Split URL into individual, unique words then add in the title words.
750 const GURL& gurl(row.url());
751 const base::string16& url =
752 bookmarks::CleanUpUrlForMatching(gurl, languages, NULL);
753 String16Set url_words = String16SetFromString16(url,
754 word_starts ? &word_starts->url_word_starts_ : NULL);
755 const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title());
756 String16Set title_words = String16SetFromString16(title,
757 word_starts ? &word_starts->title_word_starts_ : NULL);
758 String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
759 for (String16Set::iterator word_iter = words.begin();
760 word_iter != words.end(); ++word_iter)
761 AddWordToIndex(*word_iter, history_id);
763 search_term_cache_.clear(); // Invalidate the term cache.
766 void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
767 HistoryID history_id) {
768 WordMap::iterator word_pos = word_map_.find(term);
769 if (word_pos != word_map_.end())
770 UpdateWordHistory(word_pos->second, history_id);
772 AddWordHistory(term, history_id);
// Creates a brand-new index entry for |term| (a word not currently in the
// index), reusing a previously freed slot in word_list_ when one is
// available, and registers the new word under each of its characters.
// NOTE(review): an 'else' line between the push_back and reuse paths and
// several closing braces appear missing from this listing — verify against
// version control.
void URLIndexPrivateData::AddWordHistory(const base::string16& term,
                                         HistoryID history_id) {
  WordID word_id = word_list_.size();
  if (available_words_.empty()) {
    word_list_.push_back(term);
    word_id = *(available_words_.begin());
    word_list_[word_id] = term;
    available_words_.erase(word_id);
  word_map_[term] = word_id;

  HistoryIDSet history_id_set;
  history_id_set.insert(history_id);
  word_id_history_map_[word_id] = history_id_set;
  AddToHistoryIDWordMap(history_id, word_id);

  // For each character in the newly added word (i.e. a word that is not
  // already in the word index), add the word to the character index.
  Char16Set characters = Char16SetFromString16(term);
  for (Char16Set::iterator uni_char_iter = characters.begin();
       uni_char_iter != characters.end(); ++uni_char_iter) {
    base::char16 uni_char = *uni_char_iter;
    CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
    if (char_iter != char_word_map_.end()) {
      // Update existing entry in the char/word index.
      WordIDSet& word_id_set(char_iter->second);
      word_id_set.insert(word_id);
      // Create a new entry in the char/word index.
      WordIDSet word_id_set;
      word_id_set.insert(word_id);
      char_word_map_[uni_char] = word_id_set;
812 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
813 HistoryID history_id) {
814 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
815 DCHECK(history_pos != word_id_history_map_.end());
816 HistoryIDSet& history_id_set(history_pos->second);
817 history_id_set.insert(history_id);
818 AddToHistoryIDWordMap(history_id, word_id);
821 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
823 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
824 if (iter != history_id_word_map_.end()) {
825 WordIDSet& word_id_set(iter->second);
826 word_id_set.insert(word_id);
828 WordIDSet word_id_set;
829 word_id_set.insert(word_id);
830 history_id_word_map_[history_id] = word_id_set;
834 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
835 RemoveRowWordsFromIndex(row);
836 HistoryID history_id = static_cast<HistoryID>(row.id());
837 history_info_map_.erase(history_id);
838 word_starts_map_.erase(history_id);
841 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
842 // Remove the entries in history_id_word_map_ and word_id_history_map_ for
844 HistoryID history_id = static_cast<HistoryID>(row.id());
845 WordIDSet word_id_set = history_id_word_map_[history_id];
846 history_id_word_map_.erase(history_id);
848 // Reconcile any changes to word usage.
849 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
850 word_id_iter != word_id_set.end(); ++word_id_iter) {
851 WordID word_id = *word_id_iter;
852 word_id_history_map_[word_id].erase(history_id);
853 if (!word_id_history_map_[word_id].empty())
854 continue; // The word is still in use.
856 // The word is no longer in use. Reconcile any changes to character usage.
857 base::string16 word = word_list_[word_id];
858 Char16Set characters = Char16SetFromString16(word);
859 for (Char16Set::iterator uni_char_iter = characters.begin();
860 uni_char_iter != characters.end(); ++uni_char_iter) {
861 base::char16 uni_char = *uni_char_iter;
862 char_word_map_[uni_char].erase(word_id);
863 if (char_word_map_[uni_char].empty())
864 char_word_map_.erase(uni_char); // No longer in use.
867 // Complete the removal of references to the word.
868 word_id_history_map_.erase(word_id);
869 word_map_.erase(word);
870 word_list_[word_id] = base::string16();
871 available_words_.insert(word_id);
875 void URLIndexPrivateData::ResetSearchTermCache() {
876 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
877 iter != search_term_cache_.end(); ++iter)
878 iter->second.used_ = false;
881 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
882 base::TimeTicks beginning_time = base::TimeTicks::Now();
883 InMemoryURLIndexCacheItem index_cache;
884 SavePrivateData(&index_cache);
886 if (!index_cache.SerializeToString(&data)) {
887 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
891 int size = data.size();
892 if (base::WriteFile(file_path, data.c_str(), size) != size) {
893 LOG(WARNING) << "Failed to write " << file_path.value();
896 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
897 base::TimeTicks::Now() - beginning_time);
901 void URLIndexPrivateData::SavePrivateData(
902 InMemoryURLIndexCacheItem* cache) const {
904 cache->set_last_rebuild_timestamp(
905 last_time_rebuilt_from_history_.ToInternalValue());
906 cache->set_version(saved_cache_version_);
907 // history_item_count_ is no longer used but rather than change the protobuf
908 // definition use a placeholder. This will go away with the switch to SQLite.
909 cache->set_history_item_count(0);
912 SaveCharWordMap(cache);
913 SaveWordIDHistoryMap(cache);
914 SaveHistoryInfoMap(cache);
915 SaveWordStartsMap(cache);
918 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
919 if (word_list_.empty())
921 WordListItem* list_item = cache->mutable_word_list();
922 list_item->set_word_count(word_list_.size());
923 for (String16Vector::const_iterator iter = word_list_.begin();
924 iter != word_list_.end(); ++iter)
925 list_item->add_word(base::UTF16ToUTF8(*iter));
928 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
929 if (word_map_.empty())
931 WordMapItem* map_item = cache->mutable_word_map();
932 map_item->set_item_count(word_map_.size());
933 for (WordMap::const_iterator iter = word_map_.begin();
934 iter != word_map_.end(); ++iter) {
935 WordMapEntry* map_entry = map_item->add_word_map_entry();
936 map_entry->set_word(base::UTF16ToUTF8(iter->first));
937 map_entry->set_word_id(iter->second);
941 void URLIndexPrivateData::SaveCharWordMap(
942 InMemoryURLIndexCacheItem* cache) const {
943 if (char_word_map_.empty())
945 CharWordMapItem* map_item = cache->mutable_char_word_map();
946 map_item->set_item_count(char_word_map_.size());
947 for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
948 iter != char_word_map_.end(); ++iter) {
949 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
950 map_entry->set_char_16(iter->first);
951 const WordIDSet& word_id_set(iter->second);
952 map_entry->set_item_count(word_id_set.size());
953 for (WordIDSet::const_iterator set_iter = word_id_set.begin();
954 set_iter != word_id_set.end(); ++set_iter)
955 map_entry->add_word_id(*set_iter);
959 void URLIndexPrivateData::SaveWordIDHistoryMap(
960 InMemoryURLIndexCacheItem* cache) const {
961 if (word_id_history_map_.empty())
963 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
964 map_item->set_item_count(word_id_history_map_.size());
965 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
966 iter != word_id_history_map_.end(); ++iter) {
967 WordIDHistoryMapEntry* map_entry =
968 map_item->add_word_id_history_map_entry();
969 map_entry->set_word_id(iter->first);
970 const HistoryIDSet& history_id_set(iter->second);
971 map_entry->set_item_count(history_id_set.size());
972 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
973 set_iter != history_id_set.end(); ++set_iter)
974 map_entry->add_history_id(*set_iter);
978 void URLIndexPrivateData::SaveHistoryInfoMap(
979 InMemoryURLIndexCacheItem* cache) const {
980 if (history_info_map_.empty())
982 HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
983 map_item->set_item_count(history_info_map_.size());
984 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
985 iter != history_info_map_.end(); ++iter) {
986 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
987 map_entry->set_history_id(iter->first);
988 const URLRow& url_row(iter->second.url_row);
989 // Note: We only save information that contributes to the index so there
990 // is no need to save search_term_cache_ (not persistent).
991 map_entry->set_visit_count(url_row.visit_count());
992 map_entry->set_typed_count(url_row.typed_count());
993 map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
994 map_entry->set_url(url_row.url().spec());
995 map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
996 const VisitInfoVector& visits(iter->second.visits);
997 for (VisitInfoVector::const_iterator visit_iter = visits.begin();
998 visit_iter != visits.end(); ++visit_iter) {
999 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
1000 visit_info->set_visit_time(visit_iter->first.ToInternalValue());
1001 visit_info->set_transition_type(visit_iter->second);
1006 void URLIndexPrivateData::SaveWordStartsMap(
1007 InMemoryURLIndexCacheItem* cache) const {
1008 if (word_starts_map_.empty())
1010 // For unit testing: Enable saving of the cache as an earlier version to
1011 // allow testing of cache file upgrading in ReadFromFile().
1012 // TODO(mrossetti): Instead of intruding on production code with this kind of
1013 // test harness, save a copy of an older version cache with known results.
1014 // Implement this when switching the caching over to SQLite.
1015 if (saved_cache_version_ < 1)
1018 WordStartsMapItem* map_item = cache->mutable_word_starts_map();
1019 map_item->set_item_count(word_starts_map_.size());
1020 for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
1021 iter != word_starts_map_.end(); ++iter) {
1022 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
1023 map_entry->set_history_id(iter->first);
1024 const RowWordStarts& word_starts(iter->second);
1025 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
1026 i != word_starts.url_word_starts_.end(); ++i)
1027 map_entry->add_url_word_starts(*i);
1028 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
1029 i != word_starts.title_word_starts_.end(); ++i)
1030 map_entry->add_title_word_starts(*i);
1034 bool URLIndexPrivateData::RestorePrivateData(
1035 const InMemoryURLIndexCacheItem& cache,
1036 const std::string& languages) {
1037 last_time_rebuilt_from_history_ =
1038 base::Time::FromInternalValue(cache.last_rebuild_timestamp());
1039 const base::TimeDelta rebuilt_ago =
1040 base::Time::Now() - last_time_rebuilt_from_history_;
1041 if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
1042 (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
1043 // Cache is more than a week old or, somehow, from some time in the future.
1044 // It's probably a good time to rebuild the index from history to
1045 // allow synced entries to now appear, expired entries to disappear, etc.
1046 // Allow one day in the future to make the cache not rebuild on simple
1047 // system clock changes such as time zone changes.
1050 if (cache.has_version()) {
1051 if (cache.version() < kCurrentCacheFileVersion) {
1052 // Don't try to restore an old format cache file. (This will cause
1053 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1057 restored_cache_version_ = cache.version();
1059 return RestoreWordList(cache) && RestoreWordMap(cache) &&
1060 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
1061 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
1064 bool URLIndexPrivateData::RestoreWordList(
1065 const InMemoryURLIndexCacheItem& cache) {
1066 if (!cache.has_word_list())
1068 const WordListItem& list_item(cache.word_list());
1069 uint32 expected_item_count = list_item.word_count();
1070 uint32 actual_item_count = list_item.word_size();
1071 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1073 const RepeatedPtrField<std::string>& words(list_item.word());
1074 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
1075 iter != words.end(); ++iter)
1076 word_list_.push_back(base::UTF8ToUTF16(*iter));
1080 bool URLIndexPrivateData::RestoreWordMap(
1081 const InMemoryURLIndexCacheItem& cache) {
1082 if (!cache.has_word_map())
1084 const WordMapItem& list_item(cache.word_map());
1085 uint32 expected_item_count = list_item.item_count();
1086 uint32 actual_item_count = list_item.word_map_entry_size();
1087 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1089 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
1090 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
1091 iter != entries.end(); ++iter)
1092 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
1096 bool URLIndexPrivateData::RestoreCharWordMap(
1097 const InMemoryURLIndexCacheItem& cache) {
1098 if (!cache.has_char_word_map())
1100 const CharWordMapItem& list_item(cache.char_word_map());
1101 uint32 expected_item_count = list_item.item_count();
1102 uint32 actual_item_count = list_item.char_word_map_entry_size();
1103 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1105 const RepeatedPtrField<CharWordMapEntry>&
1106 entries(list_item.char_word_map_entry());
1107 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
1108 entries.begin(); iter != entries.end(); ++iter) {
1109 expected_item_count = iter->item_count();
1110 actual_item_count = iter->word_id_size();
1111 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1113 base::char16 uni_char = static_cast<base::char16>(iter->char_16());
1114 WordIDSet word_id_set;
1115 const RepeatedField<int32>& word_ids(iter->word_id());
1116 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
1117 jiter != word_ids.end(); ++jiter)
1118 word_id_set.insert(*jiter);
1119 char_word_map_[uni_char] = word_id_set;
1124 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1125 const InMemoryURLIndexCacheItem& cache) {
1126 if (!cache.has_word_id_history_map())
1128 const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
1129 uint32 expected_item_count = list_item.item_count();
1130 uint32 actual_item_count = list_item.word_id_history_map_entry_size();
1131 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1133 const RepeatedPtrField<WordIDHistoryMapEntry>&
1134 entries(list_item.word_id_history_map_entry());
1135 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1136 entries.begin(); iter != entries.end(); ++iter) {
1137 expected_item_count = iter->item_count();
1138 actual_item_count = iter->history_id_size();
1139 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1141 WordID word_id = iter->word_id();
1142 HistoryIDSet history_id_set;
1143 const RepeatedField<int64>& history_ids(iter->history_id());
1144 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1145 jiter != history_ids.end(); ++jiter) {
1146 history_id_set.insert(*jiter);
1147 AddToHistoryIDWordMap(*jiter, word_id);
1149 word_id_history_map_[word_id] = history_id_set;
1154 bool URLIndexPrivateData::RestoreHistoryInfoMap(
1155 const InMemoryURLIndexCacheItem& cache) {
1156 if (!cache.has_history_info_map())
1158 const HistoryInfoMapItem& list_item(cache.history_info_map());
1159 uint32 expected_item_count = list_item.item_count();
1160 uint32 actual_item_count = list_item.history_info_map_entry_size();
1161 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1163 const RepeatedPtrField<HistoryInfoMapEntry>&
1164 entries(list_item.history_info_map_entry());
1165 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
1166 entries.begin(); iter != entries.end(); ++iter) {
1167 HistoryID history_id = iter->history_id();
1168 GURL url(iter->url());
1169 URLRow url_row(url, history_id);
1170 url_row.set_visit_count(iter->visit_count());
1171 url_row.set_typed_count(iter->typed_count());
1172 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1173 if (iter->has_title()) {
1174 base::string16 title(base::UTF8ToUTF16(iter->title()));
1175 url_row.set_title(title);
1177 history_info_map_[history_id].url_row = url_row;
1179 // Restore visits list.
1180 VisitInfoVector visits;
1181 visits.reserve(iter->visits_size());
1182 for (int i = 0; i < iter->visits_size(); ++i) {
1183 visits.push_back(std::make_pair(
1184 base::Time::FromInternalValue(iter->visits(i).visit_time()),
1185 ui::PageTransitionFromInt(iter->visits(i).transition_type())));
1187 history_info_map_[history_id].visits = visits;
1192 bool URLIndexPrivateData::RestoreWordStartsMap(
1193 const InMemoryURLIndexCacheItem& cache,
1194 const std::string& languages) {
1195 // Note that this function must be called after RestoreHistoryInfoMap() has
1196 // been run as the word starts may have to be recalculated from the urls and
1198 if (cache.has_word_starts_map()) {
1199 const WordStartsMapItem& list_item(cache.word_starts_map());
1200 uint32 expected_item_count = list_item.item_count();
1201 uint32 actual_item_count = list_item.word_starts_map_entry_size();
1202 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1204 const RepeatedPtrField<WordStartsMapEntry>&
1205 entries(list_item.word_starts_map_entry());
1206 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
1207 entries.begin(); iter != entries.end(); ++iter) {
1208 HistoryID history_id = iter->history_id();
1209 RowWordStarts word_starts;
1210 // Restore the URL word starts.
1211 const RepeatedField<int32>& url_starts(iter->url_word_starts());
1212 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
1213 jiter != url_starts.end(); ++jiter)
1214 word_starts.url_word_starts_.push_back(*jiter);
1215 // Restore the page title word starts.
1216 const RepeatedField<int32>& title_starts(iter->title_word_starts());
1217 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
1218 jiter != title_starts.end(); ++jiter)
1219 word_starts.title_word_starts_.push_back(*jiter);
1220 word_starts_map_[history_id] = word_starts;
1223 // Since the cache did not contain any word starts we must rebuild then from
1224 // the URL and page titles.
1225 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1226 iter != history_info_map_.end(); ++iter) {
1227 RowWordStarts word_starts;
1228 const URLRow& row(iter->second.url_row);
1229 const base::string16& url =
1230 bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
1231 String16VectorFromString16(url, false, &word_starts.url_word_starts_);
1232 const base::string16& title =
1233 bookmarks::CleanUpTitleForMatching(row.title());
1234 String16VectorFromString16(title, false, &word_starts.title_word_starts_);
1235 word_starts_map_[iter->first] = word_starts;
1242 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1244 const std::set<std::string>& whitelist) {
1245 return whitelist.find(gurl.scheme()) != whitelist.end();
1249 // SearchTermCacheItem ---------------------------------------------------------
1251 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1252 const WordIDSet& word_id_set,
1253 const HistoryIDSet& history_id_set)
1254 : word_id_set_(word_id_set),
1255 history_id_set_(history_id_set),
1258 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
1261 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
1264 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1266 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1267 const URLIndexPrivateData& private_data,
1268 const std::string& languages,
1269 HistoryClient* history_client,
1270 const base::string16& lower_string,
1271 const String16Vector& lower_terms,
1272 const base::Time now)
1273 : private_data_(private_data),
1274 languages_(languages),
1275 history_client_(history_client),
1276 lower_string_(lower_string),
1277 lower_terms_(lower_terms),
1279 // Calculate offsets for each term. For instance, the offset for
1280 // ".net" should be 1, indicating that the actual word-part of the term
1281 // starts at offset 1.
1282 lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
1283 for (size_t i = 0; i < lower_terms_.size(); ++i) {
1284 base::i18n::BreakIterator iter(lower_terms_[i],
1285 base::i18n::BreakIterator::BREAK_WORD);
1286 // If the iterator doesn't work, assume an offset of 0.
1289 // Find the first word start.
1290 while (iter.Advance() && !iter.IsWord()) {}
1292 lower_terms_to_word_starts_offsets_[i] = iter.prev();
1293 // Else: the iterator didn't find a word break. Assume an offset of 0.
1297 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}
1299 void URLIndexPrivateData::AddHistoryMatch::operator()(
1300 const HistoryID history_id) {
1301 HistoryInfoMap::const_iterator hist_pos =
1302 private_data_.history_info_map_.find(history_id);
1303 if (hist_pos != private_data_.history_info_map_.end()) {
1304 const URLRow& hist_item = hist_pos->second.url_row;
1305 const VisitInfoVector& visits = hist_pos->second.visits;
1306 WordStartsMap::const_iterator starts_pos =
1307 private_data_.word_starts_map_.find(history_id);
1308 DCHECK(starts_pos != private_data_.word_starts_map_.end());
1309 ScoredHistoryMatch match(hist_item, visits, languages_, lower_string_,
1310 lower_terms_, lower_terms_to_word_starts_offsets_,
1311 starts_pos->second, now_, history_client_);
1312 if (match.raw_score() > 0)
1313 scored_matches_.push_back(match);
1318 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1320 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1321 const HistoryInfoMap& history_info_map)
1322 : history_info_map_(history_info_map) {
1325 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
1327 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1329 const HistoryID h2) {
1330 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
1331 if (entry1 == history_info_map_.end())
1333 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
1334 if (entry2 == history_info_map_.end())
1336 const URLRow& r1(entry1->second.url_row);
1337 const URLRow& r2(entry2->second.url_row);
1338 // First cut: typed count, visit count, recency.
1339 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1340 // recently visited (within the last 12/24 hours) as highly important. Get
1341 // input from mpearson.
1342 if (r1.typed_count() != r2.typed_count())
1343 return (r1.typed_count() > r2.typed_count());
1344 if (r1.visit_count() != r2.visit_count())
1345 return (r1.visit_count() > r2.visit_count());
1346 return (r1.last_visit() > r2.last_visit());
1349 } // namespace history