Update to 11.40.268.0
src/chrome/browser/history/url_index_private_data.cc (platform/framework/web/crosswalk.git)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/history/url_index_private_data.h"
6
7 #include <functional>
8 #include <iterator>
9 #include <limits>
10 #include <numeric>
11 #include <string>
12 #include <vector>
13
14 #include "base/basictypes.h"
15 #include "base/files/file_util.h"
16 #include "base/i18n/break_iterator.h"
17 #include "base/i18n/case_conversion.h"
18 #include "base/metrics/histogram.h"
19 #include "base/strings/string_util.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "base/time/time.h"
22 #include "chrome/browser/history/history_database.h"
23 #include "chrome/browser/history/history_db_task.h"
24 #include "chrome/browser/history/history_service.h"
25 #include "chrome/browser/history/in_memory_url_index.h"
26 #include "components/bookmarks/browser/bookmark_utils.h"
27 #include "components/history/core/browser/history_client.h"
28 #include "net/base/net_util.h"
29
30 #if defined(USE_SYSTEM_PROTOBUF)
31 #include <google/protobuf/repeated_field.h>
32 #else
33 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
34 #endif
35
36 using google::protobuf::RepeatedField;
37 using google::protobuf::RepeatedPtrField;
38 using in_memory_url_index::InMemoryURLIndexCacheItem;
39
40 namespace {
41 static const size_t kMaxVisitsToStoreInCache = 10u;
42 }  // anonymous namespace
43
44 namespace history {
45
46 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
47 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
48 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
49 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
50 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
51     CharWordMapEntry;
52 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
53     WordIDHistoryMapItem;
54 typedef imui::
55     InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
56     WordIDHistoryMapEntry;
57 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
58 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
59     HistoryInfoMapEntry;
60 typedef imui::
61     InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
62     HistoryInfoMapEntry_VisitInfo;
63 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
64 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
65     WordStartsMapEntry;
66
67
68 // Algorithm Functions ---------------------------------------------------------
69
70 // Comparison function for sorting search terms by descending length.
71 bool LengthGreater(const base::string16& string_a,
72                    const base::string16& string_b) {
73   return string_a.length() > string_b.length();
74 }
75
76
77 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
78
79 // HistoryDBTask used to update the recent visit data for a particular
80 // row from the history database.
81 class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
82  public:
83   explicit UpdateRecentVisitsFromHistoryDBTask(
84       URLIndexPrivateData* private_data,
85       URLID url_id);
86
87   bool RunOnDBThread(HistoryBackend* backend,
88                      history::HistoryDatabase* db) override;
89   void DoneRunOnMainThread() override;
90
91  private:
92   ~UpdateRecentVisitsFromHistoryDBTask() override;
93
94   // The URLIndexPrivateData that gets updated after the historyDB
95   // task returns.
96   URLIndexPrivateData* private_data_;
97   // The ID of the URL to get visits for and then update.
98   URLID url_id_;
99   // Whether fetching the recent visits for the URL succeeded.
100   bool succeeded_;
101   // The fetched visit data, which private_data_ copies and stores once the
102   // task completes.
103   VisitVector recent_visits_;
104
105   DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
106 };
107
108 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
109     URLIndexPrivateData* private_data,
110     URLID url_id)
111     : private_data_(private_data),
112       url_id_(url_id),
113       succeeded_(false) {
114 }
115
116 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
117     HistoryBackend* backend,
118     HistoryDatabase* db) {
119   // Make sure the private data is going to get as many recent visits as
120   // ScoredHistoryMatch::GetFrequency() hopes to use.
121   DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
122   succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
123                                              kMaxVisitsToStoreInCache,
124                                              &recent_visits_);
125   if (!succeeded_)
126     recent_visits_.clear();
127   return true;  // Always claim to be done; do not retry failures.
128 }
129
130 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
131   if (succeeded_)
132     private_data_->UpdateRecentVisits(url_id_, recent_visits_);
133 }
134
135 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
136 }
137
138
139 // URLIndexPrivateData ---------------------------------------------------------
140
141 URLIndexPrivateData::URLIndexPrivateData()
142     : restored_cache_version_(0),
143       saved_cache_version_(kCurrentCacheFileVersion),
144       pre_filter_item_count_(0),
145       post_filter_item_count_(0),
146       post_scoring_item_count_(0) {
147 }
148
149 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
150     base::string16 search_string,
151     size_t cursor_position,
152     size_t max_matches,
153     const std::string& languages,
154     HistoryClient* history_client) {
155   // If cursor position is set and useful (not at either end of the
156   // string), allow the search string to be broken at cursor position.
157   // We do this by pretending there's a space where the cursor is.
158   if ((cursor_position != base::string16::npos) &&
159       (cursor_position < search_string.length()) &&
160       (cursor_position > 0)) {
161     search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
162   }
163   pre_filter_item_count_ = 0;
164   post_filter_item_count_ = 0;
165   post_scoring_item_count_ = 0;
166   // The search string we receive may contain escaped characters. To narrow the
167   // index we need individual, lower-cased words with any escaping removed. For
168   // the final filtering we need the whitespace-separated substrings with their
169   // escaped characters left intact.
170   base::string16 lower_raw_string(base::i18n::ToLower(search_string));
171   base::string16 lower_unescaped_string =
172       net::UnescapeURLComponent(lower_raw_string,
173           net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
174   // Extract individual 'words' (as opposed to 'terms'; see below) from the
175   // search string. When the user types "colspec=ID%20Mstone Release" we get
176   // four 'words': "colspec", "id", "mstone" and "release".
177   String16Vector lower_words(
178       history::String16VectorFromString16(lower_unescaped_string, false, NULL));
179   ScoredHistoryMatches scored_items;
180
181   // Do nothing if we have indexed no words (probably because we've not been
182   // initialized yet) or the search string has no words.
183   if (word_list_.empty() || lower_words.empty()) {
184     search_term_cache_.clear();  // Invalidate the term cache.
185     return scored_items;
186   }
187
188   // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
189   // approach.
190   ResetSearchTermCache();
191
192   HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
193
194   // Trim the candidate pool if it is large. Note that we do not filter out
195   // items that do not contain the search terms as proper substrings -- doing
196   // so is the performance-costly operation we are trying to avoid in order
197   // to maintain omnibox responsiveness.
198   const size_t kItemsToScoreLimit = 500;
199   pre_filter_item_count_ = history_id_set.size();
200   // If we trim the results set we do not want to cache the results for next
201   // time as the user's ultimately desired result could easily be eliminated
202   // in this early rough filter.
203   bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
204   if (was_trimmed) {
205     HistoryIDVector history_ids;
206     std::copy(history_id_set.begin(), history_id_set.end(),
207               std::back_inserter(history_ids));
208     // Trim down the set by sorting by typed-count, visit-count, and last
209     // visit.
210     HistoryItemFactorGreater
211         item_factor_functor(history_info_map_);
212     std::partial_sort(history_ids.begin(),
213                       history_ids.begin() + kItemsToScoreLimit,
214                       history_ids.end(),
215                       item_factor_functor);
216     history_id_set.clear();
217     std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
218               std::inserter(history_id_set, history_id_set.end()));
219     post_filter_item_count_ = history_id_set.size();
220   }
221
222   // Pass over all of the candidates filtering out any without a proper
223   // substring match, inserting those which pass in order by score. Note that
224   // in this step we are using the raw search string complete with escaped
225   // URL elements. When the user has specifically typed something akin to
226   // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
227   // specific substring appears in the URL or page title.
228
229   // We call these 'terms' (as opposed to 'words'; see above) as in this case
230   // we only want to break up the search string on 'true' whitespace rather than
231   // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
232   // get two 'terms': "colspec=id%20mstone" and "release".
233   history::String16Vector lower_raw_terms;
234   if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
235                &lower_raw_terms) == 0) {
236     // Don't score matches when there are no terms to score against.  (It's
237     // possible that the word break iterator that extracts words to search
238     // for in the database allows some whitespace "words" whereas Tokenize
239     // excludes a long list of whitespace.)  One could write a scoring
240     // function that gives a reasonable order to matches when there
241     // are no terms (i.e., all the words are some form of whitespace),
242     // but this is such a rare edge case that it's not worth the time.
243     return scored_items;
244   }
245   scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
246       AddHistoryMatch(*this, languages, history_client, lower_raw_string,
247                       lower_raw_terms, base::Time::Now())).ScoredMatches();
248
249   // Select and sort only the top |max_matches| results.
250   if (scored_items.size() > max_matches) {
251     std::partial_sort(scored_items.begin(),
252                       scored_items.begin() +
253                           max_matches,
254                       scored_items.end(),
255                       ScoredHistoryMatch::MatchScoreGreater);
256     scored_items.resize(max_matches);
257   } else {
258     std::sort(scored_items.begin(), scored_items.end(),
259               ScoredHistoryMatch::MatchScoreGreater);
260   }
261   post_scoring_item_count_ = scored_items.size();
262
263   if (was_trimmed) {
264     search_term_cache_.clear();  // Invalidate the term cache.
265   } else {
266     // Remove any stale SearchTermCacheItems.
267     for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
268          cache_iter != search_term_cache_.end(); ) {
269       if (!cache_iter->second.used_)
270         search_term_cache_.erase(cache_iter++);
271       else
272         ++cache_iter;
273     }
274   }
275
276   return scored_items;
277 }
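// Editor's note (illustrative sketch, not part of the original file): a caller
// such as InMemoryURLIndex would typically forward an omnibox query straight
// into HistoryItemsForTerms(); the argument values below are hypothetical.
//
//   ScoredHistoryMatches matches = private_data->HistoryItemsForTerms(
//       base::ASCIIToUTF16("colspec=ID%20Mstone Release"),  // Raw query text.
//       base::string16::npos,  // No useful cursor position to split on.
//       3u,                    // Keep only the top three scored matches.
//       languages, history_client);
//
// The returned matches are already sorted by descending score and capped at
// |max_matches|, so a caller can turn them directly into suggestions.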
278
279 bool URLIndexPrivateData::UpdateURL(
280     HistoryService* history_service,
281     const URLRow& row,
282     const std::string& languages,
283     const std::set<std::string>& scheme_whitelist,
284     base::CancelableTaskTracker* tracker) {
285   // The row may or may not already be in our index. If it is not already
286   // indexed and it qualifies then it gets indexed. If it is already
287   // indexed and still qualifies then it gets updated, otherwise it
288   // is deleted from the index.
289   bool row_was_updated = false;
290   URLID row_id = row.id();
291   HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
292   if (row_pos == history_info_map_.end()) {
293     // This new row should be indexed if it qualifies.
294     URLRow new_row(row);
295     new_row.set_id(row_id);
296     row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
297                       IndexRow(NULL,
298                                history_service,
299                                new_row,
300                                languages,
301                                scheme_whitelist,
302                                tracker);
303   } else if (RowQualifiesAsSignificant(row, base::Time())) {
304     // This indexed row still qualifies and will be re-indexed.
305     // The url won't have changed but the title, visit count, etc.
306     // might have changed.
307     URLRow& row_to_update = row_pos->second.url_row;
308     bool title_updated = row_to_update.title() != row.title();
309     if (row_to_update.visit_count() != row.visit_count() ||
310         row_to_update.typed_count() != row.typed_count() ||
311         row_to_update.last_visit() != row.last_visit() || title_updated) {
312       row_to_update.set_visit_count(row.visit_count());
313       row_to_update.set_typed_count(row.typed_count());
314       row_to_update.set_last_visit(row.last_visit());
315       // If something appears to have changed, update the recent visits
316       // information.
317       ScheduleUpdateRecentVisits(history_service, row_id, tracker);
318       // While the URL is guaranteed to remain stable, the title may have
319       // changed. If so, then update the index with the changed words.
320       if (title_updated) {
321         // Clear all words associated with this row and re-index both the
322         // URL and title.
323         RemoveRowWordsFromIndex(row_to_update);
324         row_to_update.set_title(row.title());
325         RowWordStarts word_starts;
326         AddRowWordsToIndex(row_to_update, &word_starts, languages);
327         word_starts_map_[row_id] = word_starts;
328       }
329       row_was_updated = true;
330     }
331   } else {
332     // This indexed row no longer qualifies and will be de-indexed by
333     // clearing all words associated with this row.
334     RemoveRowFromIndex(row);
335     row_was_updated = true;
336   }
337   if (row_was_updated)
338     search_term_cache_.clear();  // This invalidates the cache.
339   return row_was_updated;
340 }
341
342 void URLIndexPrivateData::UpdateRecentVisits(
343     URLID url_id,
344     const VisitVector& recent_visits) {
345   HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
346   if (row_pos != history_info_map_.end()) {
347     VisitInfoVector* visits = &row_pos->second.visits;
348     visits->clear();
349     const size_t size =
350         std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
351     visits->reserve(size);
352     for (size_t i = 0; i < size; i++) {
353       // Copy only the fields that visits needs from the VisitVector.
354       visits->push_back(std::make_pair(recent_visits[i].visit_time,
355                                        recent_visits[i].transition));
356     }
357   }
358   // Else: Oddly, the URL doesn't seem to exist in the private index.
359   // Ignore this update.  This can happen if, for instance, the user
360   // removes the URL from URLIndexPrivateData before the historyDB call
361   // returns.
362 }
363
364 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
365     HistoryService* history_service,
366     URLID url_id,
367     base::CancelableTaskTracker* tracker) {
368   history_service->ScheduleDBTask(
369       scoped_ptr<history::HistoryDBTask>(
370           new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
371 }
372
373 // Helper functor for DeleteURL.
374 class HistoryInfoMapItemHasURL {
375  public:
376   explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
377
378   bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
379     return item.second.url_row.url() == url_;
380   }
381
382  private:
383   const GURL& url_;
384 };
385
386 bool URLIndexPrivateData::DeleteURL(const GURL& url) {
387   // Find the matching entry in the history_info_map_.
388   HistoryInfoMap::iterator pos = std::find_if(
389       history_info_map_.begin(),
390       history_info_map_.end(),
391       HistoryInfoMapItemHasURL(url));
392   if (pos == history_info_map_.end())
393     return false;
394   RemoveRowFromIndex(pos->second.url_row);
395   search_term_cache_.clear();  // This invalidates the cache.
396   return true;
397 }
398
399 // static
400 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
401     const base::FilePath& file_path,
402     const std::string& languages) {
403   base::TimeTicks beginning_time = base::TimeTicks::Now();
404   if (!base::PathExists(file_path))
405     return NULL;
406   std::string data;
407   // If the cache file cannot be read then simply give up. This will cause us
408   // to attempt to rebuild the index from the history database.
409   if (!base::ReadFileToString(file_path, &data))
410     return NULL;
411
412   scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
413   InMemoryURLIndexCacheItem index_cache;
414   if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
415     LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
416                  << file_path.value();
417     return restored_data;
418   }
419
420   if (!restored_data->RestorePrivateData(index_cache, languages))
421     return NULL;
422
423   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
424                       base::TimeTicks::Now() - beginning_time);
425   UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
426                        restored_data->history_id_word_map_.size());
427   UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
428   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
429                              restored_data->word_map_.size());
430   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
431                              restored_data->char_word_map_.size());
432   if (restored_data->Empty())
433     return NULL;  // 'No data' is the same as a failed reload.
434   return restored_data;
435 }
436
437 // static
438 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
439     HistoryDatabase* history_db,
440     const std::string& languages,
441     const std::set<std::string>& scheme_whitelist) {
442   if (!history_db)
443     return NULL;
444
445   base::TimeTicks beginning_time = base::TimeTicks::Now();
446
447   scoped_refptr<URLIndexPrivateData>
448       rebuilt_data(new URLIndexPrivateData);
449   URLDatabase::URLEnumerator history_enum;
450   if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
451     return NULL;
452   rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
453   for (URLRow row; history_enum.GetNextURL(&row); ) {
454     rebuilt_data->IndexRow(
455         history_db, NULL, row, languages, scheme_whitelist, NULL);
456   }
457
458   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
459                       base::TimeTicks::Now() - beginning_time);
460   UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
461                        rebuilt_data->history_id_word_map_.size());
462   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
463                              rebuilt_data->word_map_.size());
464   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
465                              rebuilt_data->char_word_map_.size());
466   return rebuilt_data;
467 }
468
469 // static
470 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
471     scoped_refptr<URLIndexPrivateData> private_data,
472     const base::FilePath& file_path) {
473   DCHECK(private_data.get());
474   DCHECK(!file_path.empty());
475   return private_data->SaveToFile(file_path);
476 }
477
478 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
479   scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
480   data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
481   data_copy->word_list_ = word_list_;
482   data_copy->available_words_ = available_words_;
483   data_copy->word_map_ = word_map_;
484   data_copy->char_word_map_ = char_word_map_;
485   data_copy->word_id_history_map_ = word_id_history_map_;
486   data_copy->history_id_word_map_ = history_id_word_map_;
487   data_copy->history_info_map_ = history_info_map_;
488   data_copy->word_starts_map_ = word_starts_map_;
489   return data_copy;
490   // Not copied:
491   //    search_term_cache_
492   //    pre_filter_item_count_
493   //    post_filter_item_count_
494   //    post_scoring_item_count_
495 }
496
497 bool URLIndexPrivateData::Empty() const {
498   return history_info_map_.empty();
499 }
500
501 void URLIndexPrivateData::Clear() {
502   last_time_rebuilt_from_history_ = base::Time();
503   word_list_.clear();
504   available_words_.clear();
505   word_map_.clear();
506   char_word_map_.clear();
507   word_id_history_map_.clear();
508   history_id_word_map_.clear();
509   history_info_map_.clear();
510   word_starts_map_.clear();
511 }
512
513 URLIndexPrivateData::~URLIndexPrivateData() {}
514
515 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
516     const String16Vector& unsorted_words) {
517   // Break the terms down into individual terms (words), get the candidate
518   // set for each term, and intersect each to get a final candidate list.
519   // Note that a single 'term' from the user's perspective might be
520   // a string like "http://www.somewebsite.com" which, from our perspective,
521   // is four words: 'http', 'www', 'somewebsite', and 'com'.
522   HistoryIDSet history_id_set;
523   String16Vector words(unsorted_words);
524   // Sort the words into the longest first as such are likely to narrow down
525   // the results quicker. Also, single character words are the most expensive
526   // to process so save them for last.
527   std::sort(words.begin(), words.end(), LengthGreater);
528   for (String16Vector::iterator iter = words.begin(); iter != words.end();
529        ++iter) {
530     base::string16 uni_word = *iter;
531     HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
532     if (term_history_set.empty()) {
533       history_id_set.clear();
534       break;
535     }
536     if (iter == words.begin()) {
537       history_id_set.swap(term_history_set);
538     } else {
539       HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(
540           history_id_set, term_history_set);
541       history_id_set.swap(new_history_id_set);
542     }
543   }
544   return history_id_set;
545 }
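// Editor's note (illustrative, not part of the original file): for a query
// that breaks into the words {"http", "www", "somewebsite", "com"}, the loop
// above visits them longest-first ("somewebsite", then "http", "www", "com")
// and keeps only the history IDs common to every word's candidate set,
// bailing out with an empty result as soon as any word has no matches at all.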
546
547 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
548     const base::string16& term) {
549   if (term.empty())
550     return HistoryIDSet();
551
552   // TODO(mrossetti): Consider optimizing for very common terms such as
553   // 'http[s]', 'www', 'com', etc. Or collect the top 100 most frequently
554   // occurring words in the user's searches.
555
556   size_t term_length = term.length();
557   WordIDSet word_id_set;
558   if (term_length > 1) {
559     // See if this term or a prefix thereof is present in the cache.
560     SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
561     for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
562          cache_iter != search_term_cache_.end(); ++cache_iter) {
563       if (StartsWith(term, cache_iter->first, false) &&
564           (best_prefix == search_term_cache_.end() ||
565            cache_iter->first.length() > best_prefix->first.length()))
566         best_prefix = cache_iter;
567     }
568
569     // If a prefix was found then determine the leftover characters to be used
570     // for further refining the results from that prefix.
571     Char16Set prefix_chars;
572     base::string16 leftovers(term);
573     if (best_prefix != search_term_cache_.end()) {
574       // If the prefix is an exact match for the term then grab the cached
575       // results and we're done.
576       size_t prefix_length = best_prefix->first.length();
577       if (prefix_length == term_length) {
578         best_prefix->second.used_ = true;
579         return best_prefix->second.history_id_set_;
580       }
581
582       // Otherwise we have a handy starting point.
583       // If there are no history results for this prefix then we can bail early
584       // as there will be no history results for the full term.
585       if (best_prefix->second.history_id_set_.empty()) {
586         search_term_cache_[term] = SearchTermCacheItem();
587         return HistoryIDSet();
588       }
589       word_id_set = best_prefix->second.word_id_set_;
590       prefix_chars = Char16SetFromString16(best_prefix->first);
591       leftovers = term.substr(prefix_length);
592     }
593
594     // Filter for each remaining, unique character in the term.
595     Char16Set leftover_chars = Char16SetFromString16(leftovers);
596     Char16Set unique_chars =
597         base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);
598
599     // Reduce the word set with any leftover, unprocessed characters.
600     if (!unique_chars.empty()) {
601       WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
602       // We might come up empty on the leftovers.
603       if (leftover_set.empty()) {
604         search_term_cache_[term] = SearchTermCacheItem();
605         return HistoryIDSet();
606       }
607       // Or there may not have been a prefix from which to start.
608       if (prefix_chars.empty()) {
609         word_id_set.swap(leftover_set);
610       } else {
611         WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
612             word_id_set, leftover_set);
613         word_id_set.swap(new_word_id_set);
614       }
615     }
616
617     // We must filter the word list because the resulting word set surely
618     // includes words which do not contain the search term as a substring.
619     for (WordIDSet::iterator word_set_iter = word_id_set.begin();
620          word_set_iter != word_id_set.end(); ) {
621       if (word_list_[*word_set_iter].find(term) == base::string16::npos)
622         word_id_set.erase(word_set_iter++);
623       else
624         ++word_set_iter;
625     }
626   } else {
627     word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
628   }
629
630   // If any words resulted then we can compose a set of history IDs by unioning
631   // the sets from each word.
632   HistoryIDSet history_id_set;
633   if (!word_id_set.empty()) {
634     for (WordIDSet::iterator word_id_iter = word_id_set.begin();
635          word_id_iter != word_id_set.end(); ++word_id_iter) {
636       WordID word_id = *word_id_iter;
637       WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
638       if (word_iter != word_id_history_map_.end()) {
639         HistoryIDSet& word_history_id_set(word_iter->second);
640         history_id_set.insert(word_history_id_set.begin(),
641                               word_history_id_set.end());
642       }
643     }
644   }
645
646   // Record a new cache entry for this word if the term is longer than
647   // a single character.
648   if (term_length > 1)
649     search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
650
651   return history_id_set;
652 }
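// Editor's note (illustrative, not part of the original file): the cache above
// makes incremental typing cheap. If "chro" was searched a moment ago and the
// user extends it to "chrom", the best cached prefix is "chro"; its cached
// word ID set is reused, only the leftover character 'm' is used to narrow it
// further, and a final substring check against "chrom" removes any stragglers
// before the new term's result is itself cached.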
653
654 WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
655     const Char16Set& term_chars) {
656   WordIDSet word_id_set;
657   for (Char16Set::const_iterator c_iter = term_chars.begin();
658        c_iter != term_chars.end(); ++c_iter) {
659     CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
660     if (char_iter == char_word_map_.end()) {
661       // A character was not found so there are no matching results: bail.
662       word_id_set.clear();
663       break;
664     }
665     WordIDSet& char_word_id_set(char_iter->second);
666     // It is possible for there to no longer be any words associated with
667     // a particular character. Give up in that case.
668     if (char_word_id_set.empty()) {
669       word_id_set.clear();
670       break;
671     }
672
673     if (c_iter == term_chars.begin()) {
674       // The first character's results become the base set of results.
675       word_id_set = char_word_id_set;
676     } else {
677       // Subsequent character results get intersected in.
678       WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
679           word_id_set, char_word_id_set);
680       word_id_set.swap(new_word_id_set);
681     }
682   }
683   return word_id_set;
684 }
685
686 bool URLIndexPrivateData::IndexRow(
687     HistoryDatabase* history_db,
688     HistoryService* history_service,
689     const URLRow& row,
690     const std::string& languages,
691     const std::set<std::string>& scheme_whitelist,
692     base::CancelableTaskTracker* tracker) {
693   const GURL& gurl(row.url());
694
695   // Index only URLs with a whitelisted scheme.
696   if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
697     return false;
698
699   URLID row_id = row.id();
700   // Strip out username and password before saving and indexing.
701   base::string16 url(net::FormatUrl(gurl, languages,
702       net::kFormatUrlOmitUsernamePassword,
703       net::UnescapeRule::NONE,
704       NULL, NULL, NULL));
705
706   HistoryID history_id = static_cast<HistoryID>(row_id);
707   DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
708
709   // Add the row for quick lookup in the history info store.
710   URLRow new_row(GURL(url), row_id);
711   new_row.set_visit_count(row.visit_count());
712   new_row.set_typed_count(row.typed_count());
713   new_row.set_last_visit(row.last_visit());
714   new_row.set_title(row.title());
715   history_info_map_[history_id].url_row = new_row;
716
717   // Index the words contained in the URL and title of the row.
718   RowWordStarts word_starts;
719   AddRowWordsToIndex(new_row, &word_starts, languages);
720   word_starts_map_[history_id] = word_starts;
721
722   // Update the recent visits information or schedule the update
723   // as appropriate.
724   if (history_db) {
725     // We'd like to check that we're on the history DB thread.
726     // However, unittest code actually calls this on the UI thread.
727     // So we don't do any thread checks.
728     VisitVector recent_visits;
729     // Make sure the private data is going to get as many recent visits as
730     // ScoredHistoryMatch::GetFrequency() hopes to use.
731     DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
732     if (history_db->GetMostRecentVisitsForURL(row_id,
733                                               kMaxVisitsToStoreInCache,
734                                               &recent_visits))
735       UpdateRecentVisits(row_id, recent_visits);
736   } else {
737     DCHECK(tracker);
738     DCHECK(history_service);
739     ScheduleUpdateRecentVisits(history_service, row_id, tracker);
740   }
741
742   return true;
743 }
744
745 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
746                                              RowWordStarts* word_starts,
747                                              const std::string& languages) {
748   HistoryID history_id = static_cast<HistoryID>(row.id());
749   // Split URL into individual, unique words then add in the title words.
750   const GURL& gurl(row.url());
751   const base::string16& url =
752       bookmarks::CleanUpUrlForMatching(gurl, languages, NULL);
753   String16Set url_words = String16SetFromString16(url,
754       word_starts ? &word_starts->url_word_starts_ : NULL);
755   const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title());
756   String16Set title_words = String16SetFromString16(title,
757       word_starts ? &word_starts->title_word_starts_ : NULL);
758   String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
759   for (String16Set::iterator word_iter = words.begin();
760        word_iter != words.end(); ++word_iter)
761     AddWordToIndex(*word_iter, history_id);
762
763   search_term_cache_.clear();  // Invalidate the term cache.
764 }
765
766 void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
767                                          HistoryID history_id) {
768   WordMap::iterator word_pos = word_map_.find(term);
769   if (word_pos != word_map_.end())
770     UpdateWordHistory(word_pos->second, history_id);
771   else
772     AddWordHistory(term, history_id);
773 }
774
775 void URLIndexPrivateData::AddWordHistory(const base::string16& term,
776                                          HistoryID history_id) {
777   WordID word_id = word_list_.size();
778   if (available_words_.empty()) {
779     word_list_.push_back(term);
780   } else {
781     word_id = *(available_words_.begin());
782     word_list_[word_id] = term;
783     available_words_.erase(word_id);
784   }
785   word_map_[term] = word_id;
786
787   HistoryIDSet history_id_set;
788   history_id_set.insert(history_id);
789   word_id_history_map_[word_id] = history_id_set;
790   AddToHistoryIDWordMap(history_id, word_id);
791
792   // For each character in the newly added word (i.e. a word that is not
793   // already in the word index), add the word to the character index.
794   Char16Set characters = Char16SetFromString16(term);
795   for (Char16Set::iterator uni_char_iter = characters.begin();
796        uni_char_iter != characters.end(); ++uni_char_iter) {
797     base::char16 uni_char = *uni_char_iter;
798     CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
799     if (char_iter != char_word_map_.end()) {
800       // Update existing entry in the char/word index.
801       WordIDSet& word_id_set(char_iter->second);
802       word_id_set.insert(word_id);
803     } else {
804       // Create a new entry in the char/word index.
805       WordIDSet word_id_set;
806       word_id_set.insert(word_id);
807       char_word_map_[uni_char] = word_id_set;
808     }
809   }
810 }
811
812 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
813                                             HistoryID history_id) {
814   WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
815   DCHECK(history_pos != word_id_history_map_.end());
816   HistoryIDSet& history_id_set(history_pos->second);
817   history_id_set.insert(history_id);
818   AddToHistoryIDWordMap(history_id, word_id);
819 }
820
821 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
822                                                 WordID word_id) {
823   HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
824   if (iter != history_id_word_map_.end()) {
825     WordIDSet& word_id_set(iter->second);
826     word_id_set.insert(word_id);
827   } else {
828     WordIDSet word_id_set;
829     word_id_set.insert(word_id);
830     history_id_word_map_[history_id] = word_id_set;
831   }
832 }
833
834 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
835   RemoveRowWordsFromIndex(row);
836   HistoryID history_id = static_cast<HistoryID>(row.id());
837   history_info_map_.erase(history_id);
838   word_starts_map_.erase(history_id);
839 }
840
841 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
842   // Remove the entries in history_id_word_map_ and word_id_history_map_ for
843   // this row.
844   HistoryID history_id = static_cast<HistoryID>(row.id());
845   WordIDSet word_id_set = history_id_word_map_[history_id];
846   history_id_word_map_.erase(history_id);
847
848   // Reconcile any changes to word usage.
849   for (WordIDSet::iterator word_id_iter = word_id_set.begin();
850        word_id_iter != word_id_set.end(); ++word_id_iter) {
851     WordID word_id = *word_id_iter;
852     word_id_history_map_[word_id].erase(history_id);
853     if (!word_id_history_map_[word_id].empty())
854       continue;  // The word is still in use.
855
856     // The word is no longer in use. Reconcile any changes to character usage.
857     base::string16 word = word_list_[word_id];
858     Char16Set characters = Char16SetFromString16(word);
859     for (Char16Set::iterator uni_char_iter = characters.begin();
860          uni_char_iter != characters.end(); ++uni_char_iter) {
861       base::char16 uni_char = *uni_char_iter;
862       char_word_map_[uni_char].erase(word_id);
863       if (char_word_map_[uni_char].empty())
864         char_word_map_.erase(uni_char);  // No longer in use.
865     }
866
867     // Complete the removal of references to the word.
868     word_id_history_map_.erase(word_id);
869     word_map_.erase(word);
870     word_list_[word_id] = base::string16();
871     available_words_.insert(word_id);
872   }
873 }
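// Editor's note (illustrative, not part of the original file): word IDs are
// recycled rather than compacted. If the last history row containing the word
// "example" is removed above, that word's slot in word_list_ is blanked and
// its ID is parked in available_words_; the next call to AddWordHistory()
// reuses that ID for whatever new word comes along.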
874
875 void URLIndexPrivateData::ResetSearchTermCache() {
876   for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
877        iter != search_term_cache_.end(); ++iter)
878     iter->second.used_ = false;
879 }
880
881 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
882   base::TimeTicks beginning_time = base::TimeTicks::Now();
883   InMemoryURLIndexCacheItem index_cache;
884   SavePrivateData(&index_cache);
885   std::string data;
886   if (!index_cache.SerializeToString(&data)) {
887     LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
888     return false;
889   }
890
891   int size = data.size();
892   if (base::WriteFile(file_path, data.c_str(), size) != size) {
893     LOG(WARNING) << "Failed to write " << file_path.value();
894     return false;
895   }
896   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
897                       base::TimeTicks::Now() - beginning_time);
898   return true;
899 }
900
901 void URLIndexPrivateData::SavePrivateData(
902     InMemoryURLIndexCacheItem* cache) const {
903   DCHECK(cache);
904   cache->set_last_rebuild_timestamp(
905       last_time_rebuilt_from_history_.ToInternalValue());
906   cache->set_version(saved_cache_version_);
907   // history_item_count_ is no longer used; rather than change the protobuf
908   // definition, write a placeholder. This will go away with the SQLite switch.
909   cache->set_history_item_count(0);
910   SaveWordList(cache);
911   SaveWordMap(cache);
912   SaveCharWordMap(cache);
913   SaveWordIDHistoryMap(cache);
914   SaveHistoryInfoMap(cache);
915   SaveWordStartsMap(cache);
916 }
917
918 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
919   if (word_list_.empty())
920     return;
921   WordListItem* list_item = cache->mutable_word_list();
922   list_item->set_word_count(word_list_.size());
923   for (String16Vector::const_iterator iter = word_list_.begin();
924        iter != word_list_.end(); ++iter)
925     list_item->add_word(base::UTF16ToUTF8(*iter));
926 }
927
928 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
929   if (word_map_.empty())
930     return;
931   WordMapItem* map_item = cache->mutable_word_map();
932   map_item->set_item_count(word_map_.size());
933   for (WordMap::const_iterator iter = word_map_.begin();
934        iter != word_map_.end(); ++iter) {
935     WordMapEntry* map_entry = map_item->add_word_map_entry();
936     map_entry->set_word(base::UTF16ToUTF8(iter->first));
937     map_entry->set_word_id(iter->second);
938   }
939 }
940
941 void URLIndexPrivateData::SaveCharWordMap(
942     InMemoryURLIndexCacheItem* cache) const {
943   if (char_word_map_.empty())
944     return;
945   CharWordMapItem* map_item = cache->mutable_char_word_map();
946   map_item->set_item_count(char_word_map_.size());
947   for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
948        iter != char_word_map_.end(); ++iter) {
949     CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
950     map_entry->set_char_16(iter->first);
951     const WordIDSet& word_id_set(iter->second);
952     map_entry->set_item_count(word_id_set.size());
953     for (WordIDSet::const_iterator set_iter = word_id_set.begin();
954          set_iter != word_id_set.end(); ++set_iter)
955       map_entry->add_word_id(*set_iter);
956   }
957 }
958
959 void URLIndexPrivateData::SaveWordIDHistoryMap(
960     InMemoryURLIndexCacheItem* cache) const {
961   if (word_id_history_map_.empty())
962     return;
963   WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
964   map_item->set_item_count(word_id_history_map_.size());
965   for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
966        iter != word_id_history_map_.end(); ++iter) {
967     WordIDHistoryMapEntry* map_entry =
968         map_item->add_word_id_history_map_entry();
969     map_entry->set_word_id(iter->first);
970     const HistoryIDSet& history_id_set(iter->second);
971     map_entry->set_item_count(history_id_set.size());
972     for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
973          set_iter != history_id_set.end(); ++set_iter)
974       map_entry->add_history_id(*set_iter);
975   }
976 }
977
978 void URLIndexPrivateData::SaveHistoryInfoMap(
979     InMemoryURLIndexCacheItem* cache) const {
980   if (history_info_map_.empty())
981     return;
982   HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
983   map_item->set_item_count(history_info_map_.size());
984   for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
985        iter != history_info_map_.end(); ++iter) {
986     HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
987     map_entry->set_history_id(iter->first);
988     const URLRow& url_row(iter->second.url_row);
989     // Note: We only save information that contributes to the index so there
990     // is no need to save search_term_cache_ (not persistent).
991     map_entry->set_visit_count(url_row.visit_count());
992     map_entry->set_typed_count(url_row.typed_count());
993     map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
994     map_entry->set_url(url_row.url().spec());
995     map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
996     const VisitInfoVector& visits(iter->second.visits);
997     for (VisitInfoVector::const_iterator visit_iter = visits.begin();
998          visit_iter != visits.end(); ++visit_iter) {
999       HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
1000       visit_info->set_visit_time(visit_iter->first.ToInternalValue());
1001       visit_info->set_transition_type(visit_iter->second);
1002     }
1003   }
1004 }
1005
1006 void URLIndexPrivateData::SaveWordStartsMap(
1007     InMemoryURLIndexCacheItem* cache) const {
1008   if (word_starts_map_.empty())
1009     return;
1010   // For unit testing: Enable saving of the cache as an earlier version to
1011   // allow testing of cache file upgrading in ReadFromFile().
1012   // TODO(mrossetti): Instead of intruding on production code with this kind of
1013   // test harness, save a copy of an older version cache with known results.
1014   // Implement this when switching the caching over to SQLite.
1015   if (saved_cache_version_ < 1)
1016     return;
1017
1018   WordStartsMapItem* map_item = cache->mutable_word_starts_map();
1019   map_item->set_item_count(word_starts_map_.size());
1020   for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
1021        iter != word_starts_map_.end(); ++iter) {
1022     WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
1023     map_entry->set_history_id(iter->first);
1024     const RowWordStarts& word_starts(iter->second);
1025     for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
1026          i != word_starts.url_word_starts_.end(); ++i)
1027       map_entry->add_url_word_starts(*i);
1028     for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
1029          i != word_starts.title_word_starts_.end(); ++i)
1030       map_entry->add_title_word_starts(*i);
1031   }
1032 }
1033
1034 bool URLIndexPrivateData::RestorePrivateData(
1035     const InMemoryURLIndexCacheItem& cache,
1036     const std::string& languages) {
1037   last_time_rebuilt_from_history_ =
1038       base::Time::FromInternalValue(cache.last_rebuild_timestamp());
1039   const base::TimeDelta rebuilt_ago =
1040       base::Time::Now() - last_time_rebuilt_from_history_;
1041   if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
1042       (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
1043     // Cache is more than a week old or, somehow, from some time in the future.
1044     // It's probably a good time to rebuild the index from history to
1045     // allow synced entries to now appear, expired entries to disappear, etc.
1046     // Allow one day in the future to make the cache not rebuild on simple
1047     // system clock changes such as time zone changes.
1048     return false;
1049   }
1050   if (cache.has_version()) {
1051     if (cache.version() < kCurrentCacheFileVersion) {
1052       // Don't try to restore an old format cache file.  (This will cause
1053       // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1054       // from history.)
1055       return false;
1056     }
1057     restored_cache_version_ = cache.version();
1058   }
1059   return RestoreWordList(cache) && RestoreWordMap(cache) &&
1060       RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
1061       RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
1062 }
1063
1064 bool URLIndexPrivateData::RestoreWordList(
1065     const InMemoryURLIndexCacheItem& cache) {
1066   if (!cache.has_word_list())
1067     return false;
1068   const WordListItem& list_item(cache.word_list());
1069   uint32 expected_item_count = list_item.word_count();
1070   uint32 actual_item_count = list_item.word_size();
1071   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1072     return false;
1073   const RepeatedPtrField<std::string>& words(list_item.word());
1074   for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
1075        iter != words.end(); ++iter)
1076     word_list_.push_back(base::UTF8ToUTF16(*iter));
1077   return true;
1078 }
1079
1080 bool URLIndexPrivateData::RestoreWordMap(
1081     const InMemoryURLIndexCacheItem& cache) {
1082   if (!cache.has_word_map())
1083     return false;
1084   const WordMapItem& list_item(cache.word_map());
1085   uint32 expected_item_count = list_item.item_count();
1086   uint32 actual_item_count = list_item.word_map_entry_size();
1087   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1088     return false;
1089   const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
1090   for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
1091        iter != entries.end(); ++iter)
1092     word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
1093   return true;
1094 }
1095
1096 bool URLIndexPrivateData::RestoreCharWordMap(
1097     const InMemoryURLIndexCacheItem& cache) {
1098   if (!cache.has_char_word_map())
1099     return false;
1100   const CharWordMapItem& list_item(cache.char_word_map());
1101   uint32 expected_item_count = list_item.item_count();
1102   uint32 actual_item_count = list_item.char_word_map_entry_size();
1103   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1104     return false;
1105   const RepeatedPtrField<CharWordMapEntry>&
1106       entries(list_item.char_word_map_entry());
1107   for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
1108        entries.begin(); iter != entries.end(); ++iter) {
1109     expected_item_count = iter->item_count();
1110     actual_item_count = iter->word_id_size();
1111     if (actual_item_count == 0 || actual_item_count != expected_item_count)
1112       return false;
1113     base::char16 uni_char = static_cast<base::char16>(iter->char_16());
1114     WordIDSet word_id_set;
1115     const RepeatedField<int32>& word_ids(iter->word_id());
1116     for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
1117          jiter != word_ids.end(); ++jiter)
1118       word_id_set.insert(*jiter);
1119     char_word_map_[uni_char] = word_id_set;
1120   }
1121   return true;
1122 }
1123
1124 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1125     const InMemoryURLIndexCacheItem& cache) {
1126   if (!cache.has_word_id_history_map())
1127     return false;
1128   const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
1129   uint32 expected_item_count = list_item.item_count();
1130   uint32 actual_item_count = list_item.word_id_history_map_entry_size();
1131   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1132     return false;
1133   const RepeatedPtrField<WordIDHistoryMapEntry>&
1134       entries(list_item.word_id_history_map_entry());
1135   for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1136        entries.begin(); iter != entries.end(); ++iter) {
1137     expected_item_count = iter->item_count();
1138     actual_item_count = iter->history_id_size();
1139     if (actual_item_count == 0 || actual_item_count != expected_item_count)
1140       return false;
1141     WordID word_id = iter->word_id();
1142     HistoryIDSet history_id_set;
1143     const RepeatedField<int64>& history_ids(iter->history_id());
1144     for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1145          jiter != history_ids.end(); ++jiter) {
1146       history_id_set.insert(*jiter);
1147       AddToHistoryIDWordMap(*jiter, word_id);
1148     }
1149     word_id_history_map_[word_id] = history_id_set;
1150   }
1151   return true;
1152 }
1153
1154 bool URLIndexPrivateData::RestoreHistoryInfoMap(
1155     const InMemoryURLIndexCacheItem& cache) {
1156   if (!cache.has_history_info_map())
1157     return false;
1158   const HistoryInfoMapItem& list_item(cache.history_info_map());
1159   uint32 expected_item_count = list_item.item_count();
1160   uint32 actual_item_count = list_item.history_info_map_entry_size();
1161   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1162     return false;
1163   const RepeatedPtrField<HistoryInfoMapEntry>&
1164       entries(list_item.history_info_map_entry());
1165   for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
1166        entries.begin(); iter != entries.end(); ++iter) {
1167     HistoryID history_id = iter->history_id();
1168     GURL url(iter->url());
1169     URLRow url_row(url, history_id);
1170     url_row.set_visit_count(iter->visit_count());
1171     url_row.set_typed_count(iter->typed_count());
1172     url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1173     if (iter->has_title()) {
1174       base::string16 title(base::UTF8ToUTF16(iter->title()));
1175       url_row.set_title(title);
1176     }
1177     history_info_map_[history_id].url_row = url_row;
1178
1179     // Restore visits list.
1180     VisitInfoVector visits;
1181     visits.reserve(iter->visits_size());
1182     for (int i = 0; i < iter->visits_size(); ++i) {
1183       visits.push_back(std::make_pair(
1184           base::Time::FromInternalValue(iter->visits(i).visit_time()),
1185           ui::PageTransitionFromInt(iter->visits(i).transition_type())));
1186     }
1187     history_info_map_[history_id].visits = visits;
1188   }
1189   return true;
1190 }
1191
1192 bool URLIndexPrivateData::RestoreWordStartsMap(
1193     const InMemoryURLIndexCacheItem& cache,
1194     const std::string& languages) {
1195   // Note that this function must be called after RestoreHistoryInfoMap() has
1196   // been run, as the word starts may have to be recalculated from the URLs and
1197   // page titles.
1198   if (cache.has_word_starts_map()) {
1199     const WordStartsMapItem& list_item(cache.word_starts_map());
1200     uint32 expected_item_count = list_item.item_count();
1201     uint32 actual_item_count = list_item.word_starts_map_entry_size();
1202     if (actual_item_count == 0 || actual_item_count != expected_item_count)
1203       return false;
1204     const RepeatedPtrField<WordStartsMapEntry>&
1205         entries(list_item.word_starts_map_entry());
1206     for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
1207          entries.begin(); iter != entries.end(); ++iter) {
1208       HistoryID history_id = iter->history_id();
1209       RowWordStarts word_starts;
1210       // Restore the URL word starts.
1211       const RepeatedField<int32>& url_starts(iter->url_word_starts());
1212       for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
1213            jiter != url_starts.end(); ++jiter)
1214         word_starts.url_word_starts_.push_back(*jiter);
1215       // Restore the page title word starts.
1216       const RepeatedField<int32>& title_starts(iter->title_word_starts());
1217       for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
1218            jiter != title_starts.end(); ++jiter)
1219         word_starts.title_word_starts_.push_back(*jiter);
1220       word_starts_map_[history_id] = word_starts;
1221     }
1222   } else {
1223     // Since the cache did not contain any word starts we must rebuild them from
1224     // the URLs and page titles.
1225     for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1226          iter != history_info_map_.end(); ++iter) {
1227       RowWordStarts word_starts;
1228       const URLRow& row(iter->second.url_row);
1229       const base::string16& url =
1230           bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
1231       String16VectorFromString16(url, false, &word_starts.url_word_starts_);
1232       const base::string16& title =
1233           bookmarks::CleanUpTitleForMatching(row.title());
1234       String16VectorFromString16(title, false, &word_starts.title_word_starts_);
1235       word_starts_map_[iter->first] = word_starts;
1236     }
1237   }
1238   return true;
1239 }
1240
1241 // static
1242 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1243     const GURL& gurl,
1244     const std::set<std::string>& whitelist) {
1245   return whitelist.find(gurl.scheme()) != whitelist.end();
1246 }
1247
1248
1249 // SearchTermCacheItem ---------------------------------------------------------
1250
1251 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1252     const WordIDSet& word_id_set,
1253     const HistoryIDSet& history_id_set)
1254     : word_id_set_(word_id_set),
1255       history_id_set_(history_id_set),
1256       used_(true) {}
1257
1258 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
1259     : used_(true) {}
1260
1261 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
1262
1263
1264 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1265
1266 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1267     const URLIndexPrivateData& private_data,
1268     const std::string& languages,
1269     HistoryClient* history_client,
1270     const base::string16& lower_string,
1271     const String16Vector& lower_terms,
1272     const base::Time now)
1273     : private_data_(private_data),
1274       languages_(languages),
1275       history_client_(history_client),
1276       lower_string_(lower_string),
1277       lower_terms_(lower_terms),
1278       now_(now) {
1279   // Calculate offsets for each term.  For instance, the offset for
1280   // ".net" should be 1, indicating that the actual word-part of the term
1281   // starts at offset 1.
1282   lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
1283   for (size_t i = 0; i < lower_terms_.size(); ++i) {
1284     base::i18n::BreakIterator iter(lower_terms_[i],
1285                                    base::i18n::BreakIterator::BREAK_WORD);
1286     // If the iterator doesn't work, assume an offset of 0.
1287     if (!iter.Init())
1288       continue;
1289     // Find the first word start.
1290     while (iter.Advance() && !iter.IsWord()) {}
1291     if (iter.IsWord())
1292       lower_terms_to_word_starts_offsets_[i] = iter.prev();
1293     // Else: the iterator didn't find a word break.  Assume an offset of 0.
1294   }
1295 }
1296
1297 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}
1298
1299 void URLIndexPrivateData::AddHistoryMatch::operator()(
1300     const HistoryID history_id) {
1301   HistoryInfoMap::const_iterator hist_pos =
1302       private_data_.history_info_map_.find(history_id);
1303   if (hist_pos != private_data_.history_info_map_.end()) {
1304     const URLRow& hist_item = hist_pos->second.url_row;
1305     const VisitInfoVector& visits = hist_pos->second.visits;
1306     WordStartsMap::const_iterator starts_pos =
1307         private_data_.word_starts_map_.find(history_id);
1308     DCHECK(starts_pos != private_data_.word_starts_map_.end());
1309     ScoredHistoryMatch match(hist_item, visits, languages_, lower_string_,
1310                              lower_terms_, lower_terms_to_word_starts_offsets_,
1311                              starts_pos->second, now_, history_client_);
1312     if (match.raw_score() > 0)
1313       scored_matches_.push_back(match);
1314   }
1315 }
1316
1317
1318 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1319
1320 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1321     const HistoryInfoMap& history_info_map)
1322     : history_info_map_(history_info_map) {
1323 }
1324
1325 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
1326
1327 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1328     const HistoryID h1,
1329     const HistoryID h2) {
1330   HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
1331   if (entry1 == history_info_map_.end())
1332     return false;
1333   HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
1334   if (entry2 == history_info_map_.end())
1335     return true;
1336   const URLRow& r1(entry1->second.url_row);
1337   const URLRow& r2(entry2->second.url_row);
1338   // First cut: typed count, visit count, recency.
1339   // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1340   // recently visited (within the last 12/24 hours) as highly important. Get
1341   // input from mpearson.
1342   if (r1.typed_count() != r2.typed_count())
1343     return (r1.typed_count() > r2.typed_count());
1344   if (r1.visit_count() != r2.visit_count())
1345     return (r1.visit_count() > r2.visit_count());
1346   return (r1.last_visit() > r2.last_visit());
1347 }
1348
1349 }  // namespace history