1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/history/url_index_private_data.h"
15 #include "base/basictypes.h"
16 #include "base/file_util.h"
17 #include "base/i18n/case_conversion.h"
18 #include "base/metrics/histogram.h"
19 #include "base/strings/string_util.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "base/time/time.h"
22 #include "chrome/browser/autocomplete/autocomplete_provider.h"
23 #include "chrome/browser/autocomplete/url_prefix.h"
24 #include "chrome/browser/bookmarks/bookmark_service.h"
25 #include "chrome/browser/history/history_database.h"
26 #include "chrome/browser/history/history_db_task.h"
27 #include "chrome/browser/history/history_service.h"
28 #include "chrome/browser/history/in_memory_url_index.h"
29 #include "content/public/browser/browser_thread.h"
30 #include "content/public/browser/notification_details.h"
31 #include "content/public/browser/notification_service.h"
32 #include "content/public/browser/notification_source.h"
33 #include "net/base/net_util.h"
35 #if defined(USE_SYSTEM_PROTOBUF)
36 #include <google/protobuf/repeated_field.h>
38 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
41 using google::protobuf::RepeatedField;
42 using google::protobuf::RepeatedPtrField;
43 using in_memory_url_index::InMemoryURLIndexCacheItem;
46 static const size_t kMaxVisitsToStoreInCache = 10u;
47 } // anonymous namespace
51 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
52 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
53 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
54 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
55 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
57 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
60 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
61 WordIDHistoryMapEntry;
62 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
63 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
66 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
67 HistoryInfoMapEntry_VisitInfo;
68 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
69 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
73 // Algorithm Functions ---------------------------------------------------------
75 // Comparison function for sorting search terms by descending length.
76 bool LengthGreater(const base::string16& string_a,
77 const base::string16& string_b) {
78 return string_a.length() > string_b.length();
82 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
84 // HistoryDBTask used to update the recent visit data for a particular
85 // row from the history database.
86 class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
88 explicit UpdateRecentVisitsFromHistoryDBTask(
89 URLIndexPrivateData* private_data,
92 virtual bool RunOnDBThread(HistoryBackend* backend,
93 history::HistoryDatabase* db) OVERRIDE;
94 virtual void DoneRunOnMainThread() OVERRIDE;
97 virtual ~UpdateRecentVisitsFromHistoryDBTask();
99 // The URLIndexPrivateData that gets updated after the historyDB
101 URLIndexPrivateData* private_data_;
102 // The ID of the URL to get visits for and then update.
104 // Whether fetching the recent visits for the URL succeeded.
106 // The awaited data that's shown to private_data_ for it to copy and
108 VisitVector recent_visits_;
110 DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
113 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
114 URLIndexPrivateData* private_data,
116 : private_data_(private_data),
121 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
122 HistoryBackend* backend,
123 HistoryDatabase* db) {
124 // Make sure the private data is going to get as many recent visits as
125 // ScoredHistoryMatch::GetFrecency() hopes to use.
126 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
127 succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
128 kMaxVisitsToStoreInCache,
131 recent_visits_.clear();
132 return true; // Always claim to be done; do not retry failures.
135 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
137 private_data_->UpdateRecentVisits(url_id_, recent_visits_);
140 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
144 // URLIndexPrivateData ---------------------------------------------------------
146 URLIndexPrivateData::URLIndexPrivateData()
147 : restored_cache_version_(0),
148 saved_cache_version_(kCurrentCacheFileVersion),
149 pre_filter_item_count_(0),
150 post_filter_item_count_(0),
151 post_scoring_item_count_(0) {
154 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
155 base::string16 search_string,
156 size_t cursor_position,
157 const std::string& languages,
158 BookmarkService* bookmark_service) {
159 // If cursor position is set and useful (not at either end of the
160 // string), allow the search string to be broken at cursor position.
161 // We do this by pretending there's a space where the cursor is.
162 if ((cursor_position != base::string16::npos) &&
163 (cursor_position < search_string.length()) &&
164 (cursor_position > 0)) {
165 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
167 pre_filter_item_count_ = 0;
168 post_filter_item_count_ = 0;
169 post_scoring_item_count_ = 0;
170 // The search string we receive may contain escaped characters. For reducing
171 // the index we need individual, lower-cased words, ignoring escapings. For
172 // the final filtering we need whitespace separated substrings possibly
173 // containing escaped characters.
174 base::string16 lower_raw_string(base::i18n::ToLower(search_string));
175 base::string16 lower_unescaped_string =
176 net::UnescapeURLComponent(lower_raw_string,
177 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
178 // Extract individual 'words' (as opposed to 'terms'; see below) from the
179 // search string. When the user types "colspec=ID%20Mstone Release" we get
180 // four 'words': "colspec", "id", "mstone" and "release".
181 String16Vector lower_words(
182 history::String16VectorFromString16(lower_unescaped_string, false, NULL));
183 ScoredHistoryMatches scored_items;
185 // Do nothing if we have indexed no words (probably because we've not been
186 // initialized yet) or the search string has no words.
187 if (word_list_.empty() || lower_words.empty()) {
188 search_term_cache_.clear(); // Invalidate the term cache.
192 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
194 ResetSearchTermCache();
196 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
198 // Trim the candidate pool if it is large. Note that we do not filter out
199 // items that do not contain the search terms as proper substrings -- doing
200 // so is the performance-costly operation we are trying to avoid in order
201 // to maintain omnibox responsiveness.
202 const size_t kItemsToScoreLimit = 500;
203 pre_filter_item_count_ = history_id_set.size();
204 // If we trim the results set we do not want to cache the results for next
205 // time as the user's ultimately desired result could easily be eliminated
206 // in this early rough filter.
207 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
209 HistoryIDVector history_ids;
210 std::copy(history_id_set.begin(), history_id_set.end(),
211 std::back_inserter(history_ids));
212 // Trim down the set by sorting by typed-count, visit-count, and last
214 HistoryItemFactorGreater
215 item_factor_functor(history_info_map_);
216 std::partial_sort(history_ids.begin(),
217 history_ids.begin() + kItemsToScoreLimit,
219 item_factor_functor);
220 history_id_set.clear();
221 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
222 std::inserter(history_id_set, history_id_set.end()));
223 post_filter_item_count_ = history_id_set.size();
226 // Pass over all of the candidates filtering out any without a proper
227 // substring match, inserting those which pass in order by score. Note that
228 // in this step we are using the raw search string complete with escaped
229 // URL elements. When the user has specifically typed something akin to
230 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
231 // specific substring appears in the URL or page title.
233 // We call these 'terms' (as opposed to 'words'; see above) as in this case
234 // we only want to break up the search string on 'true' whitespace rather than
235 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
236 // get two 'terms': "colspec=id%20mstone" and "release".
237 history::String16Vector lower_raw_terms;
238 if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
239 &lower_raw_terms) == 0) {
240 // Don't score matches when there are no terms to score against. (It's
241 // possible that the word break iterater that extracts words to search
242 // for in the database allows some whitespace "words" whereas Tokenize
243 // excludes a long list of whitespace.) One could write a scoring
244 // function that gives a reasonable order to matches when there
245 // are no terms (i.e., all the words are some form of whitespace),
246 // but this is such a rare edge case that it's not worth the time.
249 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
250 AddHistoryMatch(*this, languages, bookmark_service, lower_raw_string,
251 lower_raw_terms, base::Time::Now())).ScoredMatches();
253 // Select and sort only the top kMaxMatches results.
254 if (scored_items.size() > AutocompleteProvider::kMaxMatches) {
255 std::partial_sort(scored_items.begin(),
256 scored_items.begin() +
257 AutocompleteProvider::kMaxMatches,
259 ScoredHistoryMatch::MatchScoreGreater);
260 scored_items.resize(AutocompleteProvider::kMaxMatches);
262 std::sort(scored_items.begin(), scored_items.end(),
263 ScoredHistoryMatch::MatchScoreGreater);
265 post_scoring_item_count_ = scored_items.size();
268 search_term_cache_.clear(); // Invalidate the term cache.
270 // Remove any stale SearchTermCacheItems.
271 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
272 cache_iter != search_term_cache_.end(); ) {
273 if (!cache_iter->second.used_)
274 search_term_cache_.erase(cache_iter++);
283 bool URLIndexPrivateData::UpdateURL(
284 HistoryService* history_service,
286 const std::string& languages,
287 const std::set<std::string>& scheme_whitelist) {
288 // The row may or may not already be in our index. If it is not already
289 // indexed and it qualifies then it gets indexed. If it is already
290 // indexed and still qualifies then it gets updated, otherwise it
291 // is deleted from the index.
292 bool row_was_updated = false;
293 URLID row_id = row.id();
294 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
295 if (row_pos == history_info_map_.end()) {
296 // This new row should be indexed if it qualifies.
298 new_row.set_id(row_id);
299 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
300 IndexRow(NULL, history_service, new_row, languages, scheme_whitelist);
301 } else if (RowQualifiesAsSignificant(row, base::Time())) {
302 // This indexed row still qualifies and will be re-indexed.
303 // The url won't have changed but the title, visit count, etc.
304 // might have changed.
305 URLRow& row_to_update = row_pos->second.url_row;
306 bool title_updated = row_to_update.title() != row.title();
307 if (row_to_update.visit_count() != row.visit_count() ||
308 row_to_update.typed_count() != row.typed_count() ||
309 row_to_update.last_visit() != row.last_visit() || title_updated) {
310 row_to_update.set_visit_count(row.visit_count());
311 row_to_update.set_typed_count(row.typed_count());
312 row_to_update.set_last_visit(row.last_visit());
313 // If something appears to have changed, update the recent visits
315 ScheduleUpdateRecentVisits(history_service, row_id);
316 // While the URL is guaranteed to remain stable, the title may have
317 // changed. If so, then update the index with the changed words.
319 // Clear all words associated with this row and re-index both the
321 RemoveRowWordsFromIndex(row_to_update);
322 row_to_update.set_title(row.title());
323 RowWordStarts word_starts;
324 AddRowWordsToIndex(row_to_update, &word_starts, languages);
325 word_starts_map_[row_id] = word_starts;
327 row_was_updated = true;
330 // This indexed row no longer qualifies and will be de-indexed by
331 // clearing all words associated with this row.
332 RemoveRowFromIndex(row);
333 row_was_updated = true;
336 search_term_cache_.clear(); // This invalidates the cache.
337 return row_was_updated;
340 void URLIndexPrivateData::UpdateRecentVisits(
342 const VisitVector& recent_visits) {
343 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
344 if (row_pos != history_info_map_.end()) {
345 VisitInfoVector* visits = &row_pos->second.visits;
348 std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
349 visits->reserve(size);
350 for (size_t i = 0; i < size; i++) {
351 // Copy from the VisitVector the only fields visits needs.
352 visits->push_back(std::make_pair(recent_visits[i].visit_time,
353 recent_visits[i].transition));
356 // Else: Oddly, the URL doesn't seem to exist in the private index.
357 // Ignore this update. This can happen if, for instance, the user
358 // removes the URL from URLIndexPrivateData before the historyDB call
362 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
363 HistoryService* history_service,
365 history_service->ScheduleDBTask(
366 new UpdateRecentVisitsFromHistoryDBTask(this, url_id),
367 &recent_visits_consumer_);
370 // Helper functor for DeleteURL.
371 class HistoryInfoMapItemHasURL {
373 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
375 bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
376 return item.second.url_row.url() == url_;
383 bool URLIndexPrivateData::DeleteURL(const GURL& url) {
384 // Find the matching entry in the history_info_map_.
385 HistoryInfoMap::iterator pos = std::find_if(
386 history_info_map_.begin(),
387 history_info_map_.end(),
388 HistoryInfoMapItemHasURL(url));
389 if (pos == history_info_map_.end())
391 RemoveRowFromIndex(pos->second.url_row);
392 search_term_cache_.clear(); // This invalidates the cache.
397 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
398 const base::FilePath& file_path,
399 const std::string& languages) {
400 base::TimeTicks beginning_time = base::TimeTicks::Now();
401 if (!base::PathExists(file_path))
404 // If there is no cache file then simply give up. This will cause us to
405 // attempt to rebuild from the history database.
406 if (!base::ReadFileToString(file_path, &data))
409 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
410 InMemoryURLIndexCacheItem index_cache;
411 if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
412 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
413 << file_path.value();
414 return restored_data;
417 if (!restored_data->RestorePrivateData(index_cache, languages))
420 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
421 base::TimeTicks::Now() - beginning_time);
422 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
423 restored_data->history_id_word_map_.size());
424 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
425 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
426 restored_data->word_map_.size());
427 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
428 restored_data->char_word_map_.size());
429 if (restored_data->Empty())
430 return NULL; // 'No data' is the same as a failed reload.
431 return restored_data;
435 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
436 HistoryDatabase* history_db,
437 const std::string& languages,
438 const std::set<std::string>& scheme_whitelist) {
442 base::TimeTicks beginning_time = base::TimeTicks::Now();
444 scoped_refptr<URLIndexPrivateData>
445 rebuilt_data(new URLIndexPrivateData);
446 URLDatabase::URLEnumerator history_enum;
447 if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
449 rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
450 for (URLRow row; history_enum.GetNextURL(&row); ) {
451 rebuilt_data->IndexRow(history_db, NULL, row, languages,
455 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
456 base::TimeTicks::Now() - beginning_time);
457 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
458 rebuilt_data->history_id_word_map_.size());
459 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
460 rebuilt_data->word_map_.size());
461 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
462 rebuilt_data->char_word_map_.size());
467 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
468 scoped_refptr<URLIndexPrivateData> private_data,
469 const base::FilePath& file_path) {
470 DCHECK(private_data.get());
471 DCHECK(!file_path.empty());
472 return private_data->SaveToFile(file_path);
475 void URLIndexPrivateData::CancelPendingUpdates() {
476 recent_visits_consumer_.CancelAllRequests();
479 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
480 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
481 data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
482 data_copy->word_list_ = word_list_;
483 data_copy->available_words_ = available_words_;
484 data_copy->word_map_ = word_map_;
485 data_copy->char_word_map_ = char_word_map_;
486 data_copy->word_id_history_map_ = word_id_history_map_;
487 data_copy->history_id_word_map_ = history_id_word_map_;
488 data_copy->history_info_map_ = history_info_map_;
489 data_copy->word_starts_map_ = word_starts_map_;
492 // search_term_cache_
493 // pre_filter_item_count_
494 // post_filter_item_count_
495 // post_scoring_item_count_
498 bool URLIndexPrivateData::Empty() const {
499 return history_info_map_.empty();
502 void URLIndexPrivateData::Clear() {
503 last_time_rebuilt_from_history_ = base::Time();
505 available_words_.clear();
507 char_word_map_.clear();
508 word_id_history_map_.clear();
509 history_id_word_map_.clear();
510 history_info_map_.clear();
511 word_starts_map_.clear();
514 URLIndexPrivateData::~URLIndexPrivateData() {}
516 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
517 const String16Vector& unsorted_words) {
518 // Break the terms down into individual terms (words), get the candidate
519 // set for each term, and intersect each to get a final candidate list.
520 // Note that a single 'term' from the user's perspective might be
521 // a string like "http://www.somewebsite.com" which, from our perspective,
522 // is four words: 'http', 'www', 'somewebsite', and 'com'.
523 HistoryIDSet history_id_set;
524 String16Vector words(unsorted_words);
525 // Sort the words into the longest first as such are likely to narrow down
526 // the results quicker. Also, single character words are the most expensive
527 // to process so save them for last.
528 std::sort(words.begin(), words.end(), LengthGreater);
529 for (String16Vector::iterator iter = words.begin(); iter != words.end();
531 base::string16 uni_word = *iter;
532 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
533 if (term_history_set.empty()) {
534 history_id_set.clear();
537 if (iter == words.begin()) {
538 history_id_set.swap(term_history_set);
540 HistoryIDSet new_history_id_set;
541 std::set_intersection(history_id_set.begin(), history_id_set.end(),
542 term_history_set.begin(), term_history_set.end(),
543 std::inserter(new_history_id_set,
544 new_history_id_set.begin()));
545 history_id_set.swap(new_history_id_set);
548 return history_id_set;
551 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
552 const base::string16& term) {
554 return HistoryIDSet();
556 // TODO(mrossetti): Consider optimizing for very common terms such as
557 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
558 // occuring words in the user's searches.
560 size_t term_length = term.length();
561 WordIDSet word_id_set;
562 if (term_length > 1) {
563 // See if this term or a prefix thereof is present in the cache.
564 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
565 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
566 cache_iter != search_term_cache_.end(); ++cache_iter) {
567 if (StartsWith(term, cache_iter->first, false) &&
568 (best_prefix == search_term_cache_.end() ||
569 cache_iter->first.length() > best_prefix->first.length()))
570 best_prefix = cache_iter;
573 // If a prefix was found then determine the leftover characters to be used
574 // for further refining the results from that prefix.
575 Char16Set prefix_chars;
576 base::string16 leftovers(term);
577 if (best_prefix != search_term_cache_.end()) {
578 // If the prefix is an exact match for the term then grab the cached
579 // results and we're done.
580 size_t prefix_length = best_prefix->first.length();
581 if (prefix_length == term_length) {
582 best_prefix->second.used_ = true;
583 return best_prefix->second.history_id_set_;
586 // Otherwise we have a handy starting point.
587 // If there are no history results for this prefix then we can bail early
588 // as there will be no history results for the full term.
589 if (best_prefix->second.history_id_set_.empty()) {
590 search_term_cache_[term] = SearchTermCacheItem();
591 return HistoryIDSet();
593 word_id_set = best_prefix->second.word_id_set_;
594 prefix_chars = Char16SetFromString16(best_prefix->first);
595 leftovers = term.substr(prefix_length);
598 // Filter for each remaining, unique character in the term.
599 Char16Set leftover_chars = Char16SetFromString16(leftovers);
600 Char16Set unique_chars =
601 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);
603 // Reduce the word set with any leftover, unprocessed characters.
604 if (!unique_chars.empty()) {
605 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
606 // We might come up empty on the leftovers.
607 if (leftover_set.empty()) {
608 search_term_cache_[term] = SearchTermCacheItem();
609 return HistoryIDSet();
611 // Or there may not have been a prefix from which to start.
612 if (prefix_chars.empty()) {
613 word_id_set.swap(leftover_set);
615 WordIDSet new_word_id_set;
616 std::set_intersection(word_id_set.begin(), word_id_set.end(),
617 leftover_set.begin(), leftover_set.end(),
618 std::inserter(new_word_id_set,
619 new_word_id_set.begin()));
620 word_id_set.swap(new_word_id_set);
624 // We must filter the word list because the resulting word set surely
625 // contains words which do not have the search term as a proper subset.
626 for (WordIDSet::iterator word_set_iter = word_id_set.begin();
627 word_set_iter != word_id_set.end(); ) {
628 if (word_list_[*word_set_iter].find(term) == base::string16::npos)
629 word_id_set.erase(word_set_iter++);
634 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
637 // If any words resulted then we can compose a set of history IDs by unioning
638 // the sets from each word.
639 HistoryIDSet history_id_set;
640 if (!word_id_set.empty()) {
641 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
642 word_id_iter != word_id_set.end(); ++word_id_iter) {
643 WordID word_id = *word_id_iter;
644 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
645 if (word_iter != word_id_history_map_.end()) {
646 HistoryIDSet& word_history_id_set(word_iter->second);
647 history_id_set.insert(word_history_id_set.begin(),
648 word_history_id_set.end());
653 // Record a new cache entry for this word if the term is longer than
654 // a single character.
656 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
658 return history_id_set;
661 WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
662 const Char16Set& term_chars) {
663 WordIDSet word_id_set;
664 for (Char16Set::const_iterator c_iter = term_chars.begin();
665 c_iter != term_chars.end(); ++c_iter) {
666 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
667 if (char_iter == char_word_map_.end()) {
668 // A character was not found so there are no matching results: bail.
672 WordIDSet& char_word_id_set(char_iter->second);
673 // It is possible for there to no longer be any words associated with
674 // a particular character. Give up in that case.
675 if (char_word_id_set.empty()) {
680 if (c_iter == term_chars.begin()) {
681 // First character results becomes base set of results.
682 word_id_set = char_word_id_set;
684 // Subsequent character results get intersected in.
685 WordIDSet new_word_id_set;
686 std::set_intersection(word_id_set.begin(), word_id_set.end(),
687 char_word_id_set.begin(), char_word_id_set.end(),
688 std::inserter(new_word_id_set,
689 new_word_id_set.begin()));
690 word_id_set.swap(new_word_id_set);
696 bool URLIndexPrivateData::IndexRow(
697 HistoryDatabase* history_db,
698 HistoryService* history_service,
700 const std::string& languages,
701 const std::set<std::string>& scheme_whitelist) {
702 const GURL& gurl(row.url());
704 // Index only URLs with a whitelisted scheme.
705 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
708 URLID row_id = row.id();
709 // Strip out username and password before saving and indexing.
710 base::string16 url(net::FormatUrl(gurl, languages,
711 net::kFormatUrlOmitUsernamePassword,
712 net::UnescapeRule::NONE,
715 HistoryID history_id = static_cast<HistoryID>(row_id);
716 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
718 // Add the row for quick lookup in the history info store.
719 URLRow new_row(GURL(url), row_id);
720 new_row.set_visit_count(row.visit_count());
721 new_row.set_typed_count(row.typed_count());
722 new_row.set_last_visit(row.last_visit());
723 new_row.set_title(row.title());
724 history_info_map_[history_id].url_row = new_row;
726 // Index the words contained in the URL and title of the row.
727 RowWordStarts word_starts;
728 AddRowWordsToIndex(new_row, &word_starts, languages);
729 word_starts_map_[history_id] = word_starts;
731 // Update the recent visits information or schedule the update
734 // We'd like to check that we're on the history DB thread.
735 // However, unittest code actually calls this on the UI thread.
736 // So we don't do any thread checks.
737 VisitVector recent_visits;
738 // Make sure the private data is going to get as many recent visits as
739 // ScoredHistoryMatch::GetFrecency() hopes to use.
740 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
741 if (history_db->GetMostRecentVisitsForURL(row_id,
742 kMaxVisitsToStoreInCache,
744 UpdateRecentVisits(row_id, recent_visits);
746 DCHECK(history_service);
747 ScheduleUpdateRecentVisits(history_service, row_id);
753 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
754 RowWordStarts* word_starts,
755 const std::string& languages) {
756 HistoryID history_id = static_cast<HistoryID>(row.id());
757 // Split URL into individual, unique words then add in the title words.
758 const GURL& gurl(row.url());
759 const base::string16& url = CleanUpUrlForMatching(gurl, languages);
760 String16Set url_words = String16SetFromString16(url,
761 word_starts ? &word_starts->url_word_starts_ : NULL);
762 const base::string16& title = CleanUpTitleForMatching(row.title());
763 String16Set title_words = String16SetFromString16(title,
764 word_starts ? &word_starts->title_word_starts_ : NULL);
766 std::set_union(url_words.begin(), url_words.end(),
767 title_words.begin(), title_words.end(),
768 std::insert_iterator<String16Set>(words, words.begin()));
769 for (String16Set::iterator word_iter = words.begin();
770 word_iter != words.end(); ++word_iter)
771 AddWordToIndex(*word_iter, history_id);
773 search_term_cache_.clear(); // Invalidate the term cache.
776 void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
777 HistoryID history_id) {
778 WordMap::iterator word_pos = word_map_.find(term);
779 if (word_pos != word_map_.end())
780 UpdateWordHistory(word_pos->second, history_id);
782 AddWordHistory(term, history_id);
785 void URLIndexPrivateData::AddWordHistory(const base::string16& term,
786 HistoryID history_id) {
787 WordID word_id = word_list_.size();
788 if (available_words_.empty()) {
789 word_list_.push_back(term);
791 word_id = *(available_words_.begin());
792 word_list_[word_id] = term;
793 available_words_.erase(word_id);
795 word_map_[term] = word_id;
797 HistoryIDSet history_id_set;
798 history_id_set.insert(history_id);
799 word_id_history_map_[word_id] = history_id_set;
800 AddToHistoryIDWordMap(history_id, word_id);
802 // For each character in the newly added word (i.e. a word that is not
803 // already in the word index), add the word to the character index.
804 Char16Set characters = Char16SetFromString16(term);
805 for (Char16Set::iterator uni_char_iter = characters.begin();
806 uni_char_iter != characters.end(); ++uni_char_iter) {
807 base::char16 uni_char = *uni_char_iter;
808 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
809 if (char_iter != char_word_map_.end()) {
810 // Update existing entry in the char/word index.
811 WordIDSet& word_id_set(char_iter->second);
812 word_id_set.insert(word_id);
814 // Create a new entry in the char/word index.
815 WordIDSet word_id_set;
816 word_id_set.insert(word_id);
817 char_word_map_[uni_char] = word_id_set;
822 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
823 HistoryID history_id) {
824 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
825 DCHECK(history_pos != word_id_history_map_.end());
826 HistoryIDSet& history_id_set(history_pos->second);
827 history_id_set.insert(history_id);
828 AddToHistoryIDWordMap(history_id, word_id);
831 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
833 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
834 if (iter != history_id_word_map_.end()) {
835 WordIDSet& word_id_set(iter->second);
836 word_id_set.insert(word_id);
838 WordIDSet word_id_set;
839 word_id_set.insert(word_id);
840 history_id_word_map_[history_id] = word_id_set;
844 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
845 RemoveRowWordsFromIndex(row);
846 HistoryID history_id = static_cast<HistoryID>(row.id());
847 history_info_map_.erase(history_id);
848 word_starts_map_.erase(history_id);
851 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
852 // Remove the entries in history_id_word_map_ and word_id_history_map_ for
854 HistoryID history_id = static_cast<HistoryID>(row.id());
855 WordIDSet word_id_set = history_id_word_map_[history_id];
856 history_id_word_map_.erase(history_id);
858 // Reconcile any changes to word usage.
859 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
860 word_id_iter != word_id_set.end(); ++word_id_iter) {
861 WordID word_id = *word_id_iter;
862 word_id_history_map_[word_id].erase(history_id);
863 if (!word_id_history_map_[word_id].empty())
864 continue; // The word is still in use.
866 // The word is no longer in use. Reconcile any changes to character usage.
867 base::string16 word = word_list_[word_id];
868 Char16Set characters = Char16SetFromString16(word);
869 for (Char16Set::iterator uni_char_iter = characters.begin();
870 uni_char_iter != characters.end(); ++uni_char_iter) {
871 base::char16 uni_char = *uni_char_iter;
872 char_word_map_[uni_char].erase(word_id);
873 if (char_word_map_[uni_char].empty())
874 char_word_map_.erase(uni_char); // No longer in use.
877 // Complete the removal of references to the word.
878 word_id_history_map_.erase(word_id);
879 word_map_.erase(word);
880 word_list_[word_id] = base::string16();
881 available_words_.insert(word_id);
885 void URLIndexPrivateData::ResetSearchTermCache() {
886 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
887 iter != search_term_cache_.end(); ++iter)
888 iter->second.used_ = false;
891 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
892 base::TimeTicks beginning_time = base::TimeTicks::Now();
893 InMemoryURLIndexCacheItem index_cache;
894 SavePrivateData(&index_cache);
896 if (!index_cache.SerializeToString(&data)) {
897 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
901 int size = data.size();
902 if (file_util::WriteFile(file_path, data.c_str(), size) != size) {
903 LOG(WARNING) << "Failed to write " << file_path.value();
906 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
907 base::TimeTicks::Now() - beginning_time);
911 void URLIndexPrivateData::SavePrivateData(
912 InMemoryURLIndexCacheItem* cache) const {
914 cache->set_last_rebuild_timestamp(
915 last_time_rebuilt_from_history_.ToInternalValue());
916 cache->set_version(saved_cache_version_);
917 // history_item_count_ is no longer used but rather than change the protobuf
918 // definition use a placeholder. This will go away with the switch to SQLite.
919 cache->set_history_item_count(0);
922 SaveCharWordMap(cache);
923 SaveWordIDHistoryMap(cache);
924 SaveHistoryInfoMap(cache);
925 SaveWordStartsMap(cache);
928 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
929 if (word_list_.empty())
931 WordListItem* list_item = cache->mutable_word_list();
932 list_item->set_word_count(word_list_.size());
933 for (String16Vector::const_iterator iter = word_list_.begin();
934 iter != word_list_.end(); ++iter)
935 list_item->add_word(base::UTF16ToUTF8(*iter));
938 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
939 if (word_map_.empty())
941 WordMapItem* map_item = cache->mutable_word_map();
942 map_item->set_item_count(word_map_.size());
943 for (WordMap::const_iterator iter = word_map_.begin();
944 iter != word_map_.end(); ++iter) {
945 WordMapEntry* map_entry = map_item->add_word_map_entry();
946 map_entry->set_word(base::UTF16ToUTF8(iter->first));
947 map_entry->set_word_id(iter->second);
951 void URLIndexPrivateData::SaveCharWordMap(
952 InMemoryURLIndexCacheItem* cache) const {
953 if (char_word_map_.empty())
955 CharWordMapItem* map_item = cache->mutable_char_word_map();
956 map_item->set_item_count(char_word_map_.size());
957 for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
958 iter != char_word_map_.end(); ++iter) {
959 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
960 map_entry->set_char_16(iter->first);
961 const WordIDSet& word_id_set(iter->second);
962 map_entry->set_item_count(word_id_set.size());
963 for (WordIDSet::const_iterator set_iter = word_id_set.begin();
964 set_iter != word_id_set.end(); ++set_iter)
965 map_entry->add_word_id(*set_iter);
969 void URLIndexPrivateData::SaveWordIDHistoryMap(
970 InMemoryURLIndexCacheItem* cache) const {
971 if (word_id_history_map_.empty())
973 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
974 map_item->set_item_count(word_id_history_map_.size());
975 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
976 iter != word_id_history_map_.end(); ++iter) {
977 WordIDHistoryMapEntry* map_entry =
978 map_item->add_word_id_history_map_entry();
979 map_entry->set_word_id(iter->first);
980 const HistoryIDSet& history_id_set(iter->second);
981 map_entry->set_item_count(history_id_set.size());
982 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
983 set_iter != history_id_set.end(); ++set_iter)
984 map_entry->add_history_id(*set_iter);
988 void URLIndexPrivateData::SaveHistoryInfoMap(
989 InMemoryURLIndexCacheItem* cache) const {
990 if (history_info_map_.empty())
992 HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
993 map_item->set_item_count(history_info_map_.size());
994 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
995 iter != history_info_map_.end(); ++iter) {
996 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
997 map_entry->set_history_id(iter->first);
998 const URLRow& url_row(iter->second.url_row);
999 // Note: We only save information that contributes to the index so there
1000 // is no need to save search_term_cache_ (not persistent).
1001 map_entry->set_visit_count(url_row.visit_count());
1002 map_entry->set_typed_count(url_row.typed_count());
1003 map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
1004 map_entry->set_url(url_row.url().spec());
1005 map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
1006 const VisitInfoVector& visits(iter->second.visits);
1007 for (VisitInfoVector::const_iterator visit_iter = visits.begin();
1008 visit_iter != visits.end(); ++visit_iter) {
1009 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
1010 visit_info->set_visit_time(visit_iter->first.ToInternalValue());
1011 visit_info->set_transition_type(visit_iter->second);
1016 void URLIndexPrivateData::SaveWordStartsMap(
1017 InMemoryURLIndexCacheItem* cache) const {
1018 if (word_starts_map_.empty())
1020 // For unit testing: Enable saving of the cache as an earlier version to
1021 // allow testing of cache file upgrading in ReadFromFile().
1022 // TODO(mrossetti): Instead of intruding on production code with this kind of
1023 // test harness, save a copy of an older version cache with known results.
1024 // Implement this when switching the caching over to SQLite.
1025 if (saved_cache_version_ < 1)
1028 WordStartsMapItem* map_item = cache->mutable_word_starts_map();
1029 map_item->set_item_count(word_starts_map_.size());
1030 for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
1031 iter != word_starts_map_.end(); ++iter) {
1032 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
1033 map_entry->set_history_id(iter->first);
1034 const RowWordStarts& word_starts(iter->second);
1035 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
1036 i != word_starts.url_word_starts_.end(); ++i)
1037 map_entry->add_url_word_starts(*i);
1038 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
1039 i != word_starts.title_word_starts_.end(); ++i)
1040 map_entry->add_title_word_starts(*i);
1044 bool URLIndexPrivateData::RestorePrivateData(
1045 const InMemoryURLIndexCacheItem& cache,
1046 const std::string& languages) {
1047 last_time_rebuilt_from_history_ =
1048 base::Time::FromInternalValue(cache.last_rebuild_timestamp());
1049 const base::TimeDelta rebuilt_ago =
1050 base::Time::Now() - last_time_rebuilt_from_history_;
1051 if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
1052 (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
1053 // Cache is more than a week old or, somehow, from some time in the future.
1054 // It's probably a good time to rebuild the index from history to
1055 // allow synced entries to now appear, expired entries to disappear, etc.
1056 // Allow one day in the future to make the cache not rebuild on simple
1057 // system clock changes such as time zone changes.
1060 if (cache.has_version()) {
1061 if (cache.version() < kCurrentCacheFileVersion) {
1062 // Don't try to restore an old format cache file. (This will cause
1063 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1067 restored_cache_version_ = cache.version();
1069 return RestoreWordList(cache) && RestoreWordMap(cache) &&
1070 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
1071 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
1074 bool URLIndexPrivateData::RestoreWordList(
1075 const InMemoryURLIndexCacheItem& cache) {
1076 if (!cache.has_word_list())
1078 const WordListItem& list_item(cache.word_list());
1079 uint32 expected_item_count = list_item.word_count();
1080 uint32 actual_item_count = list_item.word_size();
1081 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1083 const RepeatedPtrField<std::string>& words(list_item.word());
1084 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
1085 iter != words.end(); ++iter)
1086 word_list_.push_back(base::UTF8ToUTF16(*iter));
1090 bool URLIndexPrivateData::RestoreWordMap(
1091 const InMemoryURLIndexCacheItem& cache) {
1092 if (!cache.has_word_map())
1094 const WordMapItem& list_item(cache.word_map());
1095 uint32 expected_item_count = list_item.item_count();
1096 uint32 actual_item_count = list_item.word_map_entry_size();
1097 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1099 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
1100 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
1101 iter != entries.end(); ++iter)
1102 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
1106 bool URLIndexPrivateData::RestoreCharWordMap(
1107 const InMemoryURLIndexCacheItem& cache) {
1108 if (!cache.has_char_word_map())
1110 const CharWordMapItem& list_item(cache.char_word_map());
1111 uint32 expected_item_count = list_item.item_count();
1112 uint32 actual_item_count = list_item.char_word_map_entry_size();
1113 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1115 const RepeatedPtrField<CharWordMapEntry>&
1116 entries(list_item.char_word_map_entry());
1117 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
1118 entries.begin(); iter != entries.end(); ++iter) {
1119 expected_item_count = iter->item_count();
1120 actual_item_count = iter->word_id_size();
1121 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1123 base::char16 uni_char = static_cast<base::char16>(iter->char_16());
1124 WordIDSet word_id_set;
1125 const RepeatedField<int32>& word_ids(iter->word_id());
1126 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
1127 jiter != word_ids.end(); ++jiter)
1128 word_id_set.insert(*jiter);
1129 char_word_map_[uni_char] = word_id_set;
1134 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1135 const InMemoryURLIndexCacheItem& cache) {
1136 if (!cache.has_word_id_history_map())
1138 const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
1139 uint32 expected_item_count = list_item.item_count();
1140 uint32 actual_item_count = list_item.word_id_history_map_entry_size();
1141 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1143 const RepeatedPtrField<WordIDHistoryMapEntry>&
1144 entries(list_item.word_id_history_map_entry());
1145 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1146 entries.begin(); iter != entries.end(); ++iter) {
1147 expected_item_count = iter->item_count();
1148 actual_item_count = iter->history_id_size();
1149 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1151 WordID word_id = iter->word_id();
1152 HistoryIDSet history_id_set;
1153 const RepeatedField<int64>& history_ids(iter->history_id());
1154 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1155 jiter != history_ids.end(); ++jiter) {
1156 history_id_set.insert(*jiter);
1157 AddToHistoryIDWordMap(*jiter, word_id);
1159 word_id_history_map_[word_id] = history_id_set;
1164 bool URLIndexPrivateData::RestoreHistoryInfoMap(
1165 const InMemoryURLIndexCacheItem& cache) {
1166 if (!cache.has_history_info_map())
1168 const HistoryInfoMapItem& list_item(cache.history_info_map());
1169 uint32 expected_item_count = list_item.item_count();
1170 uint32 actual_item_count = list_item.history_info_map_entry_size();
1171 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1173 const RepeatedPtrField<HistoryInfoMapEntry>&
1174 entries(list_item.history_info_map_entry());
1175 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
1176 entries.begin(); iter != entries.end(); ++iter) {
1177 HistoryID history_id = iter->history_id();
1178 GURL url(iter->url());
1179 URLRow url_row(url, history_id);
1180 url_row.set_visit_count(iter->visit_count());
1181 url_row.set_typed_count(iter->typed_count());
1182 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1183 if (iter->has_title()) {
1184 base::string16 title(base::UTF8ToUTF16(iter->title()));
1185 url_row.set_title(title);
1187 history_info_map_[history_id].url_row = url_row;
1189 // Restore visits list.
1190 VisitInfoVector visits;
1191 visits.reserve(iter->visits_size());
1192 for (int i = 0; i < iter->visits_size(); ++i) {
1193 visits.push_back(std::make_pair(
1194 base::Time::FromInternalValue(iter->visits(i).visit_time()),
1195 static_cast<content::PageTransition>(iter->visits(i).
1196 transition_type())));
1198 history_info_map_[history_id].visits = visits;
1203 bool URLIndexPrivateData::RestoreWordStartsMap(
1204 const InMemoryURLIndexCacheItem& cache,
1205 const std::string& languages) {
1206 // Note that this function must be called after RestoreHistoryInfoMap() has
1207 // been run as the word starts may have to be recalculated from the urls and
1209 if (cache.has_word_starts_map()) {
1210 const WordStartsMapItem& list_item(cache.word_starts_map());
1211 uint32 expected_item_count = list_item.item_count();
1212 uint32 actual_item_count = list_item.word_starts_map_entry_size();
1213 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1215 const RepeatedPtrField<WordStartsMapEntry>&
1216 entries(list_item.word_starts_map_entry());
1217 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
1218 entries.begin(); iter != entries.end(); ++iter) {
1219 HistoryID history_id = iter->history_id();
1220 RowWordStarts word_starts;
1221 // Restore the URL word starts.
1222 const RepeatedField<int32>& url_starts(iter->url_word_starts());
1223 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
1224 jiter != url_starts.end(); ++jiter)
1225 word_starts.url_word_starts_.push_back(*jiter);
1226 // Restore the page title word starts.
1227 const RepeatedField<int32>& title_starts(iter->title_word_starts());
1228 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
1229 jiter != title_starts.end(); ++jiter)
1230 word_starts.title_word_starts_.push_back(*jiter);
1231 word_starts_map_[history_id] = word_starts;
1234 // Since the cache did not contain any word starts we must rebuild then from
1235 // the URL and page titles.
1236 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1237 iter != history_info_map_.end(); ++iter) {
1238 RowWordStarts word_starts;
1239 const URLRow& row(iter->second.url_row);
1240 const base::string16& url = CleanUpUrlForMatching(row.url(), languages);
1241 String16VectorFromString16(url, false, &word_starts.url_word_starts_);
1242 const base::string16& title = CleanUpTitleForMatching(row.title());
1243 String16VectorFromString16(title, false, &word_starts.title_word_starts_);
1244 word_starts_map_[iter->first] = word_starts;
1251 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1253 const std::set<std::string>& whitelist) {
1254 return whitelist.find(gurl.scheme()) != whitelist.end();
1258 // SearchTermCacheItem ---------------------------------------------------------
1260 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1261 const WordIDSet& word_id_set,
1262 const HistoryIDSet& history_id_set)
1263 : word_id_set_(word_id_set),
1264 history_id_set_(history_id_set),
1267 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
1270 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
1273 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1275 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1276 const URLIndexPrivateData& private_data,
1277 const std::string& languages,
1278 BookmarkService* bookmark_service,
1279 const base::string16& lower_string,
1280 const String16Vector& lower_terms,
1281 const base::Time now)
1282 : private_data_(private_data),
1283 languages_(languages),
1284 bookmark_service_(bookmark_service),
1285 lower_string_(lower_string),
1286 lower_terms_(lower_terms),
1289 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}
1291 void URLIndexPrivateData::AddHistoryMatch::operator()(
1292 const HistoryID history_id) {
1293 HistoryInfoMap::const_iterator hist_pos =
1294 private_data_.history_info_map_.find(history_id);
1295 if (hist_pos != private_data_.history_info_map_.end()) {
1296 const URLRow& hist_item = hist_pos->second.url_row;
1297 const VisitInfoVector& visits = hist_pos->second.visits;
1298 WordStartsMap::const_iterator starts_pos =
1299 private_data_.word_starts_map_.find(history_id);
1300 DCHECK(starts_pos != private_data_.word_starts_map_.end());
1301 ScoredHistoryMatch match(hist_item, visits, languages_, lower_string_,
1302 lower_terms_, starts_pos->second, now_,
1304 if (match.raw_score() > 0)
1305 scored_matches_.push_back(match);
1310 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1312 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1313 const HistoryInfoMap& history_info_map)
1314 : history_info_map_(history_info_map) {
1317 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
1319 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1321 const HistoryID h2) {
1322 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
1323 if (entry1 == history_info_map_.end())
1325 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
1326 if (entry2 == history_info_map_.end())
1328 const URLRow& r1(entry1->second.url_row);
1329 const URLRow& r2(entry2->second.url_row);
1330 // First cut: typed count, visit count, recency.
1331 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1332 // recently visited (within the last 12/24 hours) as highly important. Get
1333 // input from mpearson.
1334 if (r1.typed_count() != r2.typed_count())
1335 return (r1.typed_count() > r2.typed_count());
1336 if (r1.visit_count() != r2.visit_count())
1337 return (r1.visit_count() > r2.visit_count());
1338 return (r1.last_visit() > r2.last_visit());
1341 } // namespace history