src/chrome/browser/history/url_index_private_data.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/browser/history/url_index_private_data.h"
   6
   7 #include <algorithm>
   8 #include <functional>
   9 #include <iterator>
  10 #include <limits>
  11 #include <numeric>
  12 #include <string>
  13 #include <vector>
  14
  15 #include "base/basictypes.h"
  16 #include "base/file_util.h"
  17 #include "base/i18n/break_iterator.h"
  18 #include "base/i18n/case_conversion.h"
  19 #include "base/metrics/histogram.h"
  20 #include "base/strings/string_util.h"
  21 #include "base/strings/utf_string_conversions.h"
  22 #include "base/time/time.h"
  23 #include "chrome/browser/autocomplete/autocomplete_provider.h"
  24 #include "chrome/browser/autocomplete/url_prefix.h"
  25 #include "chrome/browser/history/history_database.h"
  26 #include "chrome/browser/history/history_db_task.h"
  27 #include "chrome/browser/history/history_service.h"
  28 #include "chrome/browser/history/in_memory_url_index.h"
  29 #include "components/bookmarks/core/browser/bookmark_service.h"
  30 #include "components/bookmarks/core/browser/bookmark_utils.h"
  31 #include "net/base/net_util.h"
  32
  33 #if defined(USE_SYSTEM_PROTOBUF)
  34 #include <google/protobuf/repeated_field.h>
  35 #else
  36 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
  37 #endif
  38
  39 using google::protobuf::RepeatedField;
  40 using google::protobuf::RepeatedPtrField;
  41 using in_memory_url_index::InMemoryURLIndexCacheItem;
  42
  43 namespace {
  44 static const size_t kMaxVisitsToStoreInCache = 10u;
  45 }  // anonymous namespace
  46
  47 namespace history {
  48
  49 typedef imui::InMemoryURLIndexCacheItem_WordListItem WordListItem;
  50 typedef imui::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry;
  51 typedef imui::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
  52 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem;
  53 typedef imui::InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry
  54     CharWordMapEntry;
  55 typedef imui::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
  56     WordIDHistoryMapItem;
  57 typedef imui::
  58     InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
  59     WordIDHistoryMapEntry;
  60 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem;
  61 typedef imui::InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
  62     HistoryInfoMapEntry;
  63 typedef imui::
  64     InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
  65     HistoryInfoMapEntry_VisitInfo;
  66 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem;
  67 typedef imui::InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
  68     WordStartsMapEntry;
  69
  70
  71 // Algorithm Functions ---------------------------------------------------------
  72
  73 // Comparison function for sorting search terms by descending length.
  74 bool LengthGreater(const base::string16& string_a,
  75                    const base::string16& string_b) {
  76   return string_a.length() > string_b.length();
  77 }
  78
  79
  80 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
  81
  82 // HistoryDBTask used to update the recent visit data for a particular
  83 // row from the history database.
  84 class UpdateRecentVisitsFromHistoryDBTask : public HistoryDBTask {
  85  public:
  86   explicit UpdateRecentVisitsFromHistoryDBTask(
  87       URLIndexPrivateData* private_data,
  88       URLID url_id);
  89
  90   virtual bool RunOnDBThread(HistoryBackend* backend,
  91                              history::HistoryDatabase* db) OVERRIDE;
  92   virtual void DoneRunOnMainThread() OVERRIDE;
  93
  94  private:
  95   virtual ~UpdateRecentVisitsFromHistoryDBTask();
  96
  97   // The URLIndexPrivateData that gets updated after the historyDB
  98   // task returns.
  99   URLIndexPrivateData* private_data_;
 100   // The ID of the URL to get visits for and then update.
 101   URLID url_id_;
 102   // Whether fetching the recent visits for the URL succeeded.
 103   bool succeeded_;
 104   // The awaited data that's shown to private_data_ for it to copy and
 105   // store.
 106   VisitVector recent_visits_;
 107
 108   DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
 109 };
 110
 111 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
 112     URLIndexPrivateData* private_data,
 113     URLID url_id)
 114     : private_data_(private_data),
 115       url_id_(url_id),
 116       succeeded_(false) {
 117 }
 118
 119 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
 120     HistoryBackend* backend,
 121     HistoryDatabase* db) {
 122   // Make sure the private data is going to get as many recent visits as
 123   // ScoredHistoryMatch::GetFrecency() hopes to use.
 124   DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
 125   succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
 126                                              kMaxVisitsToStoreInCache,
 127                                              &recent_visits_);
 128   if (!succeeded_)
 129     recent_visits_.clear();
 130   return true;  // Always claim to be done; do not retry failures.
 131 }
 132
 133 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
 134   if (succeeded_)
 135     private_data_->UpdateRecentVisits(url_id_, recent_visits_);
 136 }
 137
 138 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
 139 }
 140
 141
 142 // URLIndexPrivateData ---------------------------------------------------------
 143
 144 URLIndexPrivateData::URLIndexPrivateData()
 145     : restored_cache_version_(0),
 146       saved_cache_version_(kCurrentCacheFileVersion),
 147       pre_filter_item_count_(0),
 148       post_filter_item_count_(0),
 149       post_scoring_item_count_(0) {
 150 }
 151
 152 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
 153     base::string16 search_string,
 154     size_t cursor_position,
 155     const std::string& languages,
 156     BookmarkService* bookmark_service) {
 157   // If cursor position is set and useful (not at either end of the
 158   // string), allow the search string to be broken at cursor position.
 159   // We do this by pretending there's a space where the cursor is.
 160   if ((cursor_position != base::string16::npos) &&
 161       (cursor_position < search_string.length()) &&
 162       (cursor_position > 0)) {
 163     search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
 164   }
 165   pre_filter_item_count_ = 0;
 166   post_filter_item_count_ = 0;
 167   post_scoring_item_count_ = 0;
 168   // The search string we receive may contain escaped characters. For reducing
 169   // the index we need individual, lower-cased words, ignoring escapings. For
 170   // the final filtering we need whitespace separated substrings possibly
 171   // containing escaped characters.
 172   base::string16 lower_raw_string(base::i18n::ToLower(search_string));
 173   base::string16 lower_unescaped_string =
 174       net::UnescapeURLComponent(lower_raw_string,
 175           net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
 176   // Extract individual 'words' (as opposed to 'terms'; see below) from the
 177   // search string. When the user types "colspec=ID%20Mstone Release" we get
 178   // four 'words': "colspec", "id", "mstone" and "release".
 179   String16Vector lower_words(
 180       history::String16VectorFromString16(lower_unescaped_string, false, NULL));
 181   ScoredHistoryMatches scored_items;
 182
 183   // Do nothing if we have indexed no words (probably because we've not been
 184   // initialized yet) or the search string has no words.
 185   if (word_list_.empty() || lower_words.empty()) {
 186     search_term_cache_.clear();  // Invalidate the term cache.
 187     return scored_items;
 188   }
 189
 190   // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
 191   // approach.
 192   ResetSearchTermCache();
 193
 194   HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
 195
 196   // Trim the candidate pool if it is large. Note that we do not filter out
 197   // items that do not contain the search terms as proper substrings -- doing
 198   // so is the performance-costly operation we are trying to avoid in order
 199   // to maintain omnibox responsiveness.
 200   const size_t kItemsToScoreLimit = 500;
 201   pre_filter_item_count_ = history_id_set.size();
 202   // If we trim the results set we do not want to cache the results for next
 203   // time as the user's ultimately desired result could easily be eliminated
 204   // in this early rough filter.
 205   bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
 206   if (was_trimmed) {
 207     HistoryIDVector history_ids;
 208     std::copy(history_id_set.begin(), history_id_set.end(),
 209               std::back_inserter(history_ids));
 210     // Trim down the set by sorting by typed-count, visit-count, and last
 211     // visit.
 212     HistoryItemFactorGreater
 213         item_factor_functor(history_info_map_);
 214     std::partial_sort(history_ids.begin(),
 215                       history_ids.begin() + kItemsToScoreLimit,
 216                       history_ids.end(),
 217                       item_factor_functor);
 218     history_id_set.clear();
 219     std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
 220               std::inserter(history_id_set, history_id_set.end()));
 221     post_filter_item_count_ = history_id_set.size();
 222   }
 223
 224   // Pass over all of the candidates filtering out any without a proper
 225   // substring match, inserting those which pass in order by score. Note that
 226   // in this step we are using the raw search string complete with escaped
 227   // URL elements. When the user has specifically typed something akin to
 228   // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
 229   // specific substring appears in the URL or page title.
 230
 231   // We call these 'terms' (as opposed to 'words'; see above) as in this case
 232   // we only want to break up the search string on 'true' whitespace rather than
 233   // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
 234   // get two 'terms': "colspec=id%20mstone" and "release".
 235   history::String16Vector lower_raw_terms;
 236   if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
 237                &lower_raw_terms) == 0) {
 238     // Don't score matches when there are no terms to score against.  (It's
 239     // possible that the word break iterater that extracts words to search
 240     // for in the database allows some whitespace "words" whereas Tokenize
 241     // excludes a long list of whitespace.)  One could write a scoring
 242     // function that gives a reasonable order to matches when there
 243     // are no terms (i.e., all the words are some form of whitespace),
 244     // but this is such a rare edge case that it's not worth the time.
 245     return scored_items;
 246   }
 247   scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
 248       AddHistoryMatch(*this, languages, bookmark_service, lower_raw_string,
 249                       lower_raw_terms, base::Time::Now())).ScoredMatches();
 250
 251   // Select and sort only the top kMaxMatches results.
 252   if (scored_items.size() > AutocompleteProvider::kMaxMatches) {
 253     std::partial_sort(scored_items.begin(),
 254                       scored_items.begin() +
 255                           AutocompleteProvider::kMaxMatches,
 256                       scored_items.end(),
 257                       ScoredHistoryMatch::MatchScoreGreater);
 258       scored_items.resize(AutocompleteProvider::kMaxMatches);
 259   } else {
 260     std::sort(scored_items.begin(), scored_items.end(),
 261               ScoredHistoryMatch::MatchScoreGreater);
 262   }
 263   post_scoring_item_count_ = scored_items.size();
 264
 265   if (was_trimmed) {
 266     search_term_cache_.clear();  // Invalidate the term cache.
 267   } else {
 268     // Remove any stale SearchTermCacheItems.
 269     for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
 270          cache_iter != search_term_cache_.end(); ) {
 271       if (!cache_iter->second.used_)
 272         search_term_cache_.erase(cache_iter++);
 273       else
 274         ++cache_iter;
 275     }
 276   }
 277
 278   return scored_items;
 279 }
 280
 281 bool URLIndexPrivateData::UpdateURL(
 282     HistoryService* history_service,
 283     const URLRow& row,
 284     const std::string& languages,
 285     const std::set<std::string>& scheme_whitelist) {
 286   // The row may or may not already be in our index. If it is not already
 287   // indexed and it qualifies then it gets indexed. If it is already
 288   // indexed and still qualifies then it gets updated, otherwise it
 289   // is deleted from the index.
 290   bool row_was_updated = false;
 291   URLID row_id = row.id();
 292   HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
 293   if (row_pos == history_info_map_.end()) {
 294     // This new row should be indexed if it qualifies.
 295     URLRow new_row(row);
 296     new_row.set_id(row_id);
 297     row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
 298         IndexRow(NULL, history_service, new_row, languages, scheme_whitelist);
 299   } else if (RowQualifiesAsSignificant(row, base::Time())) {
 300     // This indexed row still qualifies and will be re-indexed.
 301     // The url won't have changed but the title, visit count, etc.
 302     // might have changed.
 303     URLRow& row_to_update = row_pos->second.url_row;
 304     bool title_updated = row_to_update.title() != row.title();
 305     if (row_to_update.visit_count() != row.visit_count() ||
 306         row_to_update.typed_count() != row.typed_count() ||
 307         row_to_update.last_visit() != row.last_visit() || title_updated) {
 308       row_to_update.set_visit_count(row.visit_count());
 309       row_to_update.set_typed_count(row.typed_count());
 310       row_to_update.set_last_visit(row.last_visit());
 311       // If something appears to have changed, update the recent visits
 312       // information.
 313       ScheduleUpdateRecentVisits(history_service, row_id);
 314       // While the URL is guaranteed to remain stable, the title may have
 315       // changed. If so, then update the index with the changed words.
 316       if (title_updated) {
 317         // Clear all words associated with this row and re-index both the
 318         // URL and title.
 319         RemoveRowWordsFromIndex(row_to_update);
 320         row_to_update.set_title(row.title());
 321         RowWordStarts word_starts;
 322         AddRowWordsToIndex(row_to_update, &word_starts, languages);
 323         word_starts_map_[row_id] = word_starts;
 324       }
 325       row_was_updated = true;
 326     }
 327   } else {
 328     // This indexed row no longer qualifies and will be de-indexed by
 329     // clearing all words associated with this row.
 330     RemoveRowFromIndex(row);
 331     row_was_updated = true;
 332   }
 333   if (row_was_updated)
 334     search_term_cache_.clear();  // This invalidates the cache.
 335   return row_was_updated;
 336 }
 337
 338 void URLIndexPrivateData::UpdateRecentVisits(
 339     URLID url_id,
 340     const VisitVector& recent_visits) {
 341   HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
 342   if (row_pos != history_info_map_.end()) {
 343     VisitInfoVector* visits = &row_pos->second.visits;
 344     visits->clear();
 345     const size_t size =
 346         std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
 347     visits->reserve(size);
 348     for (size_t i = 0; i < size; i++) {
 349       // Copy from the VisitVector the only fields visits needs.
 350       visits->push_back(std::make_pair(recent_visits[i].visit_time,
 351                                        recent_visits[i].transition));
 352     }
 353   }
 354   // Else: Oddly, the URL doesn't seem to exist in the private index.
 355   // Ignore this update.  This can happen if, for instance, the user
 356   // removes the URL from URLIndexPrivateData before the historyDB call
 357   // returns.
 358 }
 359
 360 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
 361     HistoryService* history_service,
 362     URLID url_id) {
 363   history_service->ScheduleDBTask(
 364       new UpdateRecentVisitsFromHistoryDBTask(this, url_id),
 365       &recent_visits_consumer_);
 366 }
 367
 368 // Helper functor for DeleteURL.
 369 class HistoryInfoMapItemHasURL {
 370  public:
 371   explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
 372
 373   bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
 374     return item.second.url_row.url() == url_;
 375   }
 376
 377  private:
 378   const GURL& url_;
 379 };
 380
 381 bool URLIndexPrivateData::DeleteURL(const GURL& url) {
 382   // Find the matching entry in the history_info_map_.
 383   HistoryInfoMap::iterator pos = std::find_if(
 384       history_info_map_.begin(),
 385       history_info_map_.end(),
 386       HistoryInfoMapItemHasURL(url));
 387   if (pos == history_info_map_.end())
 388     return false;
 389   RemoveRowFromIndex(pos->second.url_row);
 390   search_term_cache_.clear();  // This invalidates the cache.
 391   return true;
 392 }
 393
 394 // static
 395 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
 396     const base::FilePath& file_path,
 397     const std::string& languages) {
 398   base::TimeTicks beginning_time = base::TimeTicks::Now();
 399   if (!base::PathExists(file_path))
 400     return NULL;
 401   std::string data;
 402   // If there is no cache file then simply give up. This will cause us to
 403   // attempt to rebuild from the history database.
 404   if (!base::ReadFileToString(file_path, &data))
 405     return NULL;
 406
 407   scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
 408   InMemoryURLIndexCacheItem index_cache;
 409   if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
 410     LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
 411                  << file_path.value();
 412     return restored_data;
 413   }
 414
 415   if (!restored_data->RestorePrivateData(index_cache, languages))
 416     return NULL;
 417
 418   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
 419                       base::TimeTicks::Now() - beginning_time);
 420   UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
 421                        restored_data->history_id_word_map_.size());
 422   UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
 423   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
 424                              restored_data->word_map_.size());
 425   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
 426                              restored_data->char_word_map_.size());
 427   if (restored_data->Empty())
 428     return NULL;  // 'No data' is the same as a failed reload.
 429   return restored_data;
 430 }
 431
 432 // static
 433 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
 434     HistoryDatabase* history_db,
 435     const std::string& languages,
 436     const std::set<std::string>& scheme_whitelist) {
 437   if (!history_db)
 438     return NULL;
 439
 440   base::TimeTicks beginning_time = base::TimeTicks::Now();
 441
 442   scoped_refptr<URLIndexPrivateData>
 443       rebuilt_data(new URLIndexPrivateData);
 444   URLDatabase::URLEnumerator history_enum;
 445   if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
 446     return NULL;
 447   rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
 448   for (URLRow row; history_enum.GetNextURL(&row); ) {
 449     rebuilt_data->IndexRow(history_db, NULL, row, languages,
 450                            scheme_whitelist);
 451   }
 452
 453   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
 454                       base::TimeTicks::Now() - beginning_time);
 455   UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
 456                        rebuilt_data->history_id_word_map_.size());
 457   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
 458                              rebuilt_data->word_map_.size());
 459   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
 460                              rebuilt_data->char_word_map_.size());
 461   return rebuilt_data;
 462 }
 463
 464 // static
 465 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
 466     scoped_refptr<URLIndexPrivateData> private_data,
 467     const base::FilePath& file_path) {
 468   DCHECK(private_data.get());
 469   DCHECK(!file_path.empty());
 470   return private_data->SaveToFile(file_path);
 471 }
 472
 473 void URLIndexPrivateData::CancelPendingUpdates() {
 474   recent_visits_consumer_.CancelAllRequests();
 475 }
 476
 477 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
 478   scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
 479   data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
 480   data_copy->word_list_ = word_list_;
 481   data_copy->available_words_ = available_words_;
 482   data_copy->word_map_ = word_map_;
 483   data_copy->char_word_map_ = char_word_map_;
 484   data_copy->word_id_history_map_ = word_id_history_map_;
 485   data_copy->history_id_word_map_ = history_id_word_map_;
 486   data_copy->history_info_map_ = history_info_map_;
 487   data_copy->word_starts_map_ = word_starts_map_;
 488   return data_copy;
 489   // Not copied:
 490   //    search_term_cache_
 491   //    pre_filter_item_count_
 492   //    post_filter_item_count_
 493   //    post_scoring_item_count_
 494 };
 495
 496 bool URLIndexPrivateData::Empty() const {
 497   return history_info_map_.empty();
 498 }
 499
 500 void URLIndexPrivateData::Clear() {
 501   last_time_rebuilt_from_history_ = base::Time();
 502   word_list_.clear();
 503   available_words_.clear();
 504   word_map_.clear();
 505   char_word_map_.clear();
 506   word_id_history_map_.clear();
 507   history_id_word_map_.clear();
 508   history_info_map_.clear();
 509   word_starts_map_.clear();
 510 }
 511
 512 URLIndexPrivateData::~URLIndexPrivateData() {}
 513
 514 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
 515     const String16Vector& unsorted_words) {
 516   // Break the terms down into individual terms (words), get the candidate
 517   // set for each term, and intersect each to get a final candidate list.
 518   // Note that a single 'term' from the user's perspective might be
 519   // a string like "http://www.somewebsite.com" which, from our perspective,
 520   // is four words: 'http', 'www', 'somewebsite', and 'com'.
 521   HistoryIDSet history_id_set;
 522   String16Vector words(unsorted_words);
 523   // Sort the words into the longest first as such are likely to narrow down
 524   // the results quicker. Also, single character words are the most expensive
 525   // to process so save them for last.
 526   std::sort(words.begin(), words.end(), LengthGreater);
 527   for (String16Vector::iterator iter = words.begin(); iter != words.end();
 528        ++iter) {
 529     base::string16 uni_word = *iter;
 530     HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
 531     if (term_history_set.empty()) {
 532       history_id_set.clear();
 533       break;
 534     }
 535     if (iter == words.begin()) {
 536       history_id_set.swap(term_history_set);
 537     } else {
 538       HistoryIDSet new_history_id_set;
 539       std::set_intersection(history_id_set.begin(), history_id_set.end(),
 540                             term_history_set.begin(), term_history_set.end(),
 541                             std::inserter(new_history_id_set,
 542                                           new_history_id_set.begin()));
 543       history_id_set.swap(new_history_id_set);
 544     }
 545   }
 546   return history_id_set;
 547 }
 548
 549 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
 550     const base::string16& term) {
 551   if (term.empty())
 552     return HistoryIDSet();
 553
 554   // TODO(mrossetti): Consider optimizing for very common terms such as
 555   // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
 556   // occuring words in the user's searches.
 557
 558   size_t term_length = term.length();
 559   WordIDSet word_id_set;
 560   if (term_length > 1) {
 561     // See if this term or a prefix thereof is present in the cache.
 562     SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
 563     for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
 564          cache_iter != search_term_cache_.end(); ++cache_iter) {
 565       if (StartsWith(term, cache_iter->first, false) &&
 566           (best_prefix == search_term_cache_.end() ||
 567            cache_iter->first.length() > best_prefix->first.length()))
 568         best_prefix = cache_iter;
 569     }
 570
 571     // If a prefix was found then determine the leftover characters to be used
 572     // for further refining the results from that prefix.
 573     Char16Set prefix_chars;
 574     base::string16 leftovers(term);
 575     if (best_prefix != search_term_cache_.end()) {
 576       // If the prefix is an exact match for the term then grab the cached
 577       // results and we're done.
 578       size_t prefix_length = best_prefix->first.length();
 579       if (prefix_length == term_length) {
 580         best_prefix->second.used_ = true;
 581         return best_prefix->second.history_id_set_;
 582       }
 583
 584       // Otherwise we have a handy starting point.
 585       // If there are no history results for this prefix then we can bail early
 586       // as there will be no history results for the full term.
 587       if (best_prefix->second.history_id_set_.empty()) {
 588         search_term_cache_[term] = SearchTermCacheItem();
 589         return HistoryIDSet();
 590       }
 591       word_id_set = best_prefix->second.word_id_set_;
 592       prefix_chars = Char16SetFromString16(best_prefix->first);
 593       leftovers = term.substr(prefix_length);
 594     }
 595
 596     // Filter for each remaining, unique character in the term.
 597     Char16Set leftover_chars = Char16SetFromString16(leftovers);
 598     Char16Set unique_chars =
 599         base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);
 600
 601     // Reduce the word set with any leftover, unprocessed characters.
 602     if (!unique_chars.empty()) {
 603       WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
 604       // We might come up empty on the leftovers.
 605       if (leftover_set.empty()) {
 606         search_term_cache_[term] = SearchTermCacheItem();
 607         return HistoryIDSet();
 608       }
 609       // Or there may not have been a prefix from which to start.
 610       if (prefix_chars.empty()) {
 611         word_id_set.swap(leftover_set);
 612       } else {
 613         WordIDSet new_word_id_set;
 614         std::set_intersection(word_id_set.begin(), word_id_set.end(),
 615                               leftover_set.begin(), leftover_set.end(),
 616                               std::inserter(new_word_id_set,
 617                                             new_word_id_set.begin()));
 618         word_id_set.swap(new_word_id_set);
 619       }
 620     }
 621
 622     // We must filter the word list because the resulting word set surely
 623     // contains words which do not have the search term as a proper subset.
 624     for (WordIDSet::iterator word_set_iter = word_id_set.begin();
 625          word_set_iter != word_id_set.end(); ) {
 626       if (word_list_[*word_set_iter].find(term) == base::string16::npos)
 627         word_id_set.erase(word_set_iter++);
 628       else
 629         ++word_set_iter;
 630     }
 631   } else {
 632     word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
 633   }
 634
 635   // If any words resulted then we can compose a set of history IDs by unioning
 636   // the sets from each word.
 637   HistoryIDSet history_id_set;
 638   if (!word_id_set.empty()) {
 639     for (WordIDSet::iterator word_id_iter = word_id_set.begin();
 640          word_id_iter != word_id_set.end(); ++word_id_iter) {
 641       WordID word_id = *word_id_iter;
 642       WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
 643       if (word_iter != word_id_history_map_.end()) {
 644         HistoryIDSet& word_history_id_set(word_iter->second);
 645         history_id_set.insert(word_history_id_set.begin(),
 646                               word_history_id_set.end());
 647       }
 648     }
 649   }
 650
 651   // Record a new cache entry for this word if the term is longer than
 652   // a single character.
 653   if (term_length > 1)
 654     search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
 655
 656   return history_id_set;
 657 }
 658
 659 WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
 660     const Char16Set& term_chars) {
 661   WordIDSet word_id_set;
 662   for (Char16Set::const_iterator c_iter = term_chars.begin();
 663        c_iter != term_chars.end(); ++c_iter) {
 664     CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
 665     if (char_iter == char_word_map_.end()) {
 666       // A character was not found so there are no matching results: bail.
 667       word_id_set.clear();
 668       break;
 669     }
 670     WordIDSet& char_word_id_set(char_iter->second);
 671     // It is possible for there to no longer be any words associated with
 672     // a particular character. Give up in that case.
 673     if (char_word_id_set.empty()) {
 674       word_id_set.clear();
 675       break;
 676     }
 677
 678     if (c_iter == term_chars.begin()) {
 679       // First character results becomes base set of results.
 680       word_id_set = char_word_id_set;
 681     } else {
 682       // Subsequent character results get intersected in.
 683       WordIDSet new_word_id_set;
 684       std::set_intersection(word_id_set.begin(), word_id_set.end(),
 685                             char_word_id_set.begin(), char_word_id_set.end(),
 686                             std::inserter(new_word_id_set,
 687                                           new_word_id_set.begin()));
 688       word_id_set.swap(new_word_id_set);
 689     }
 690   }
 691   return word_id_set;
 692 }
 693
 694 bool URLIndexPrivateData::IndexRow(
 695     HistoryDatabase* history_db,
 696     HistoryService* history_service,
 697     const URLRow& row,
 698     const std::string& languages,
 699     const std::set<std::string>& scheme_whitelist) {
 700   const GURL& gurl(row.url());
 701
 702   // Index only URLs with a whitelisted scheme.
 703   if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
 704     return false;
 705
 706   URLID row_id = row.id();
 707   // Strip out username and password before saving and indexing.
 708   base::string16 url(net::FormatUrl(gurl, languages,
 709       net::kFormatUrlOmitUsernamePassword,
 710       net::UnescapeRule::NONE,
 711       NULL, NULL, NULL));
 712
 713   HistoryID history_id = static_cast<HistoryID>(row_id);
 714   DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
 715
 716   // Add the row for quick lookup in the history info store.
 717   URLRow new_row(GURL(url), row_id);
 718   new_row.set_visit_count(row.visit_count());
 719   new_row.set_typed_count(row.typed_count());
 720   new_row.set_last_visit(row.last_visit());
 721   new_row.set_title(row.title());
 722   history_info_map_[history_id].url_row = new_row;
 723
 724   // Index the words contained in the URL and title of the row.
 725   RowWordStarts word_starts;
 726   AddRowWordsToIndex(new_row, &word_starts, languages);
 727   word_starts_map_[history_id] = word_starts;
 728
 729   // Update the recent visits information or schedule the update
 730   // as appropriate.
 731   if (history_db) {
 732     // We'd like to check that we're on the history DB thread.
 733     // However, unittest code actually calls this on the UI thread.
 734     // So we don't do any thread checks.
 735     VisitVector recent_visits;
 736     // Make sure the private data is going to get as many recent visits as
 737     // ScoredHistoryMatch::GetFrecency() hopes to use.
 738     DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
 739     if (history_db->GetMostRecentVisitsForURL(row_id,
 740                                               kMaxVisitsToStoreInCache,
 741                                               &recent_visits))
 742       UpdateRecentVisits(row_id, recent_visits);
 743   } else {
 744     DCHECK(history_service);
 745     ScheduleUpdateRecentVisits(history_service, row_id);
 746   }
 747
 748   return true;
 749 }
 750
 751 void URLIndexPrivateData::AddRowWordsToIndex(const URLRow& row,
 752                                              RowWordStarts* word_starts,
 753                                              const std::string& languages) {
 754   HistoryID history_id = static_cast<HistoryID>(row.id());
 755   // Split URL into individual, unique words then add in the title words.
 756   const GURL& gurl(row.url());
 757   const base::string16& url =
 758       bookmark_utils::CleanUpUrlForMatching(gurl, languages, NULL);
 759   String16Set url_words = String16SetFromString16(url,
 760       word_starts ? &word_starts->url_word_starts_ : NULL);
 761   const base::string16& title =
 762       bookmark_utils::CleanUpTitleForMatching(row.title());
 763   String16Set title_words = String16SetFromString16(title,
 764       word_starts ? &word_starts->title_word_starts_ : NULL);
 765   String16Set words;
 766   std::set_union(url_words.begin(), url_words.end(),
 767                  title_words.begin(), title_words.end(),
 768                  std::insert_iterator<String16Set>(words, words.begin()));
 769   for (String16Set::iterator word_iter = words.begin();
 770        word_iter != words.end(); ++word_iter)
 771     AddWordToIndex(*word_iter, history_id);
 772
 773   search_term_cache_.clear();  // Invalidate the term cache.
 774 }
 775
 776 void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
 777                                          HistoryID history_id) {
 778   WordMap::iterator word_pos = word_map_.find(term);
 779   if (word_pos != word_map_.end())
 780     UpdateWordHistory(word_pos->second, history_id);
 781   else
 782     AddWordHistory(term, history_id);
 783 }
 784
 785 void URLIndexPrivateData::AddWordHistory(const base::string16& term,
 786                                          HistoryID history_id) {
 787   WordID word_id = word_list_.size();
 788   if (available_words_.empty()) {
 789     word_list_.push_back(term);
 790   } else {
 791     word_id = *(available_words_.begin());
 792     word_list_[word_id] = term;
 793     available_words_.erase(word_id);
 794   }
 795   word_map_[term] = word_id;
 796
 797   HistoryIDSet history_id_set;
 798   history_id_set.insert(history_id);
 799   word_id_history_map_[word_id] = history_id_set;
 800   AddToHistoryIDWordMap(history_id, word_id);
 801
 802   // For each character in the newly added word (i.e. a word that is not
 803   // already in the word index), add the word to the character index.
 804   Char16Set characters = Char16SetFromString16(term);
 805   for (Char16Set::iterator uni_char_iter = characters.begin();
 806        uni_char_iter != characters.end(); ++uni_char_iter) {
 807     base::char16 uni_char = *uni_char_iter;
 808     CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
 809     if (char_iter != char_word_map_.end()) {
 810       // Update existing entry in the char/word index.
 811       WordIDSet& word_id_set(char_iter->second);
 812       word_id_set.insert(word_id);
 813     } else {
 814       // Create a new entry in the char/word index.
 815       WordIDSet word_id_set;
 816       word_id_set.insert(word_id);
 817       char_word_map_[uni_char] = word_id_set;
 818     }
 819   }
 820 }
 821
 822 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
 823                                             HistoryID history_id) {
 824   WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
 825   DCHECK(history_pos != word_id_history_map_.end());
 826   HistoryIDSet& history_id_set(history_pos->second);
 827   history_id_set.insert(history_id);
 828   AddToHistoryIDWordMap(history_id, word_id);
 829 }
 830
 831 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
 832                                                 WordID word_id) {
 833   HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
 834   if (iter != history_id_word_map_.end()) {
 835     WordIDSet& word_id_set(iter->second);
 836     word_id_set.insert(word_id);
 837   } else {
 838     WordIDSet word_id_set;
 839     word_id_set.insert(word_id);
 840     history_id_word_map_[history_id] = word_id_set;
 841   }
 842 }
 843
 844 void URLIndexPrivateData::RemoveRowFromIndex(const URLRow& row) {
 845   RemoveRowWordsFromIndex(row);
 846   HistoryID history_id = static_cast<HistoryID>(row.id());
 847   history_info_map_.erase(history_id);
 848   word_starts_map_.erase(history_id);
 849 }
 850
 851 void URLIndexPrivateData::RemoveRowWordsFromIndex(const URLRow& row) {
 852   // Remove the entries in history_id_word_map_ and word_id_history_map_ for
 853   // this row.
 854   HistoryID history_id = static_cast<HistoryID>(row.id());
 855   WordIDSet word_id_set = history_id_word_map_[history_id];
 856   history_id_word_map_.erase(history_id);
 857
 858   // Reconcile any changes to word usage.
 859   for (WordIDSet::iterator word_id_iter = word_id_set.begin();
 860        word_id_iter != word_id_set.end(); ++word_id_iter) {
 861     WordID word_id = *word_id_iter;
 862     word_id_history_map_[word_id].erase(history_id);
 863     if (!word_id_history_map_[word_id].empty())
 864       continue;  // The word is still in use.
 865
 866     // The word is no longer in use. Reconcile any changes to character usage.
 867     base::string16 word = word_list_[word_id];
 868     Char16Set characters = Char16SetFromString16(word);
 869     for (Char16Set::iterator uni_char_iter = characters.begin();
 870          uni_char_iter != characters.end(); ++uni_char_iter) {
 871       base::char16 uni_char = *uni_char_iter;
 872       char_word_map_[uni_char].erase(word_id);
 873       if (char_word_map_[uni_char].empty())
 874         char_word_map_.erase(uni_char);  // No longer in use.
 875     }
 876
 877     // Complete the removal of references to the word.
 878     word_id_history_map_.erase(word_id);
 879     word_map_.erase(word);
 880     word_list_[word_id] = base::string16();
 881     available_words_.insert(word_id);
 882   }
 883 }
 884
 885 void URLIndexPrivateData::ResetSearchTermCache() {
 886   for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
 887        iter != search_term_cache_.end(); ++iter)
 888     iter->second.used_ = false;
 889 }
 890
 891 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
 892   base::TimeTicks beginning_time = base::TimeTicks::Now();
 893   InMemoryURLIndexCacheItem index_cache;
 894   SavePrivateData(&index_cache);
 895   std::string data;
 896   if (!index_cache.SerializeToString(&data)) {
 897     LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
 898     return false;
 899   }
 900
 901   int size = data.size();
 902   if (base::WriteFile(file_path, data.c_str(), size) != size) {
 903     LOG(WARNING) << "Failed to write " << file_path.value();
 904     return false;
 905   }
 906   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
 907                       base::TimeTicks::Now() - beginning_time);
 908   return true;
 909 }
 910
 911 void URLIndexPrivateData::SavePrivateData(
 912     InMemoryURLIndexCacheItem* cache) const {
 913   DCHECK(cache);
 914   cache->set_last_rebuild_timestamp(
 915       last_time_rebuilt_from_history_.ToInternalValue());
 916   cache->set_version(saved_cache_version_);
 917   // history_item_count_ is no longer used but rather than change the protobuf
 918   // definition use a placeholder. This will go away with the switch to SQLite.
 919   cache->set_history_item_count(0);
 920   SaveWordList(cache);
 921   SaveWordMap(cache);
 922   SaveCharWordMap(cache);
 923   SaveWordIDHistoryMap(cache);
 924   SaveHistoryInfoMap(cache);
 925   SaveWordStartsMap(cache);
 926 }
 927
 928 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
 929   if (word_list_.empty())
 930     return;
 931   WordListItem* list_item = cache->mutable_word_list();
 932   list_item->set_word_count(word_list_.size());
 933   for (String16Vector::const_iterator iter = word_list_.begin();
 934        iter != word_list_.end(); ++iter)
 935     list_item->add_word(base::UTF16ToUTF8(*iter));
 936 }
 937
 938 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
 939   if (word_map_.empty())
 940     return;
 941   WordMapItem* map_item = cache->mutable_word_map();
 942   map_item->set_item_count(word_map_.size());
 943   for (WordMap::const_iterator iter = word_map_.begin();
 944        iter != word_map_.end(); ++iter) {
 945     WordMapEntry* map_entry = map_item->add_word_map_entry();
 946     map_entry->set_word(base::UTF16ToUTF8(iter->first));
 947     map_entry->set_word_id(iter->second);
 948   }
 949 }
 950
 951 void URLIndexPrivateData::SaveCharWordMap(
 952     InMemoryURLIndexCacheItem* cache) const {
 953   if (char_word_map_.empty())
 954     return;
 955   CharWordMapItem* map_item = cache->mutable_char_word_map();
 956   map_item->set_item_count(char_word_map_.size());
 957   for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
 958        iter != char_word_map_.end(); ++iter) {
 959     CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
 960     map_entry->set_char_16(iter->first);
 961     const WordIDSet& word_id_set(iter->second);
 962     map_entry->set_item_count(word_id_set.size());
 963     for (WordIDSet::const_iterator set_iter = word_id_set.begin();
 964          set_iter != word_id_set.end(); ++set_iter)
 965       map_entry->add_word_id(*set_iter);
 966   }
 967 }
 968
 969 void URLIndexPrivateData::SaveWordIDHistoryMap(
 970     InMemoryURLIndexCacheItem* cache) const {
 971   if (word_id_history_map_.empty())
 972     return;
 973   WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
 974   map_item->set_item_count(word_id_history_map_.size());
 975   for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
 976        iter != word_id_history_map_.end(); ++iter) {
 977     WordIDHistoryMapEntry* map_entry =
 978         map_item->add_word_id_history_map_entry();
 979     map_entry->set_word_id(iter->first);
 980     const HistoryIDSet& history_id_set(iter->second);
 981     map_entry->set_item_count(history_id_set.size());
 982     for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
 983          set_iter != history_id_set.end(); ++set_iter)
 984       map_entry->add_history_id(*set_iter);
 985   }
 986 }
 987
 988 void URLIndexPrivateData::SaveHistoryInfoMap(
 989     InMemoryURLIndexCacheItem* cache) const {
 990   if (history_info_map_.empty())
 991     return;
 992   HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
 993   map_item->set_item_count(history_info_map_.size());
 994   for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
 995        iter != history_info_map_.end(); ++iter) {
 996     HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
 997     map_entry->set_history_id(iter->first);
 998     const URLRow& url_row(iter->second.url_row);
 999     // Note: We only save information that contributes to the index so there
1000     // is no need to save search_term_cache_ (not persistent).
1001     map_entry->set_visit_count(url_row.visit_count());
1002     map_entry->set_typed_count(url_row.typed_count());
1003     map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
1004     map_entry->set_url(url_row.url().spec());
1005     map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
1006     const VisitInfoVector& visits(iter->second.visits);
1007     for (VisitInfoVector::const_iterator visit_iter = visits.begin();
1008          visit_iter != visits.end(); ++visit_iter) {
1009       HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
1010       visit_info->set_visit_time(visit_iter->first.ToInternalValue());
1011       visit_info->set_transition_type(visit_iter->second);
1012     }
1013   }
1014 }
1015
1016 void URLIndexPrivateData::SaveWordStartsMap(
1017     InMemoryURLIndexCacheItem* cache) const {
1018   if (word_starts_map_.empty())
1019     return;
1020   // For unit testing: Enable saving of the cache as an earlier version to
1021   // allow testing of cache file upgrading in ReadFromFile().
1022   // TODO(mrossetti): Instead of intruding on production code with this kind of
1023   // test harness, save a copy of an older version cache with known results.
1024   // Implement this when switching the caching over to SQLite.
1025   if (saved_cache_version_ < 1)
1026     return;
1027
1028   WordStartsMapItem* map_item = cache->mutable_word_starts_map();
1029   map_item->set_item_count(word_starts_map_.size());
1030   for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
1031        iter != word_starts_map_.end(); ++iter) {
1032     WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
1033     map_entry->set_history_id(iter->first);
1034     const RowWordStarts& word_starts(iter->second);
1035     for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
1036          i != word_starts.url_word_starts_.end(); ++i)
1037       map_entry->add_url_word_starts(*i);
1038     for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
1039          i != word_starts.title_word_starts_.end(); ++i)
1040       map_entry->add_title_word_starts(*i);
1041   }
1042 }
1043
1044 bool URLIndexPrivateData::RestorePrivateData(
1045     const InMemoryURLIndexCacheItem& cache,
1046     const std::string& languages) {
1047   last_time_rebuilt_from_history_ =
1048       base::Time::FromInternalValue(cache.last_rebuild_timestamp());
1049   const base::TimeDelta rebuilt_ago =
1050       base::Time::Now() - last_time_rebuilt_from_history_;
1051   if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
1052       (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
1053     // Cache is more than a week old or, somehow, from some time in the future.
1054     // It's probably a good time to rebuild the index from history to
1055     // allow synced entries to now appear, expired entries to disappear, etc.
1056     // Allow one day in the future to make the cache not rebuild on simple
1057     // system clock changes such as time zone changes.
1058     return false;
1059   }
1060   if (cache.has_version()) {
1061     if (cache.version() < kCurrentCacheFileVersion) {
1062       // Don't try to restore an old format cache file.  (This will cause
1063       // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1064       // from history.)
1065       return false;
1066     }
1067     restored_cache_version_ = cache.version();
1068   }
1069   return RestoreWordList(cache) && RestoreWordMap(cache) &&
1070       RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
1071       RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
1072 }
1073
1074 bool URLIndexPrivateData::RestoreWordList(
1075     const InMemoryURLIndexCacheItem& cache) {
1076   if (!cache.has_word_list())
1077     return false;
1078   const WordListItem& list_item(cache.word_list());
1079   uint32 expected_item_count = list_item.word_count();
1080   uint32 actual_item_count = list_item.word_size();
1081   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1082     return false;
1083   const RepeatedPtrField<std::string>& words(list_item.word());
1084   for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
1085        iter != words.end(); ++iter)
1086     word_list_.push_back(base::UTF8ToUTF16(*iter));
1087   return true;
1088 }
1089
1090 bool URLIndexPrivateData::RestoreWordMap(
1091     const InMemoryURLIndexCacheItem& cache) {
1092   if (!cache.has_word_map())
1093     return false;
1094   const WordMapItem& list_item(cache.word_map());
1095   uint32 expected_item_count = list_item.item_count();
1096   uint32 actual_item_count = list_item.word_map_entry_size();
1097   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1098     return false;
1099   const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
1100   for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
1101        iter != entries.end(); ++iter)
1102     word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
1103   return true;
1104 }
1105
1106 bool URLIndexPrivateData::RestoreCharWordMap(
1107     const InMemoryURLIndexCacheItem& cache) {
1108   if (!cache.has_char_word_map())
1109     return false;
1110   const CharWordMapItem& list_item(cache.char_word_map());
1111   uint32 expected_item_count = list_item.item_count();
1112   uint32 actual_item_count = list_item.char_word_map_entry_size();
1113   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1114     return false;
1115   const RepeatedPtrField<CharWordMapEntry>&
1116       entries(list_item.char_word_map_entry());
1117   for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
1118        entries.begin(); iter != entries.end(); ++iter) {
1119     expected_item_count = iter->item_count();
1120     actual_item_count = iter->word_id_size();
1121     if (actual_item_count == 0 || actual_item_count != expected_item_count)
1122       return false;
1123     base::char16 uni_char = static_cast<base::char16>(iter->char_16());
1124     WordIDSet word_id_set;
1125     const RepeatedField<int32>& word_ids(iter->word_id());
1126     for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
1127          jiter != word_ids.end(); ++jiter)
1128       word_id_set.insert(*jiter);
1129     char_word_map_[uni_char] = word_id_set;
1130   }
1131   return true;
1132 }
1133
1134 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1135     const InMemoryURLIndexCacheItem& cache) {
1136   if (!cache.has_word_id_history_map())
1137     return false;
1138   const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
1139   uint32 expected_item_count = list_item.item_count();
1140   uint32 actual_item_count = list_item.word_id_history_map_entry_size();
1141   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1142     return false;
1143   const RepeatedPtrField<WordIDHistoryMapEntry>&
1144       entries(list_item.word_id_history_map_entry());
1145   for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1146        entries.begin(); iter != entries.end(); ++iter) {
1147     expected_item_count = iter->item_count();
1148     actual_item_count = iter->history_id_size();
1149     if (actual_item_count == 0 || actual_item_count != expected_item_count)
1150       return false;
1151     WordID word_id = iter->word_id();
1152     HistoryIDSet history_id_set;
1153     const RepeatedField<int64>& history_ids(iter->history_id());
1154     for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1155          jiter != history_ids.end(); ++jiter) {
1156       history_id_set.insert(*jiter);
1157       AddToHistoryIDWordMap(*jiter, word_id);
1158     }
1159     word_id_history_map_[word_id] = history_id_set;
1160   }
1161   return true;
1162 }
1163
1164 bool URLIndexPrivateData::RestoreHistoryInfoMap(
1165     const InMemoryURLIndexCacheItem& cache) {
1166   if (!cache.has_history_info_map())
1167     return false;
1168   const HistoryInfoMapItem& list_item(cache.history_info_map());
1169   uint32 expected_item_count = list_item.item_count();
1170   uint32 actual_item_count = list_item.history_info_map_entry_size();
1171   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1172     return false;
1173   const RepeatedPtrField<HistoryInfoMapEntry>&
1174       entries(list_item.history_info_map_entry());
1175   for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
1176        entries.begin(); iter != entries.end(); ++iter) {
1177     HistoryID history_id = iter->history_id();
1178     GURL url(iter->url());
1179     URLRow url_row(url, history_id);
1180     url_row.set_visit_count(iter->visit_count());
1181     url_row.set_typed_count(iter->typed_count());
1182     url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1183     if (iter->has_title()) {
1184       base::string16 title(base::UTF8ToUTF16(iter->title()));
1185       url_row.set_title(title);
1186     }
1187     history_info_map_[history_id].url_row = url_row;
1188
1189     // Restore visits list.
1190     VisitInfoVector visits;
1191     visits.reserve(iter->visits_size());
1192     for (int i = 0; i < iter->visits_size(); ++i) {
1193       visits.push_back(std::make_pair(
1194           base::Time::FromInternalValue(iter->visits(i).visit_time()),
1195           static_cast<content::PageTransition>(iter->visits(i).
1196                                                transition_type())));
1197     }
1198     history_info_map_[history_id].visits = visits;
1199   }
1200   return true;
1201 }
1202
1203 bool URLIndexPrivateData::RestoreWordStartsMap(
1204     const InMemoryURLIndexCacheItem& cache,
1205     const std::string& languages) {
1206   // Note that this function must be called after RestoreHistoryInfoMap() has
1207   // been run as the word starts may have to be recalculated from the urls and
1208   // page titles.
1209   if (cache.has_word_starts_map()) {
1210     const WordStartsMapItem& list_item(cache.word_starts_map());
1211     uint32 expected_item_count = list_item.item_count();
1212     uint32 actual_item_count = list_item.word_starts_map_entry_size();
1213     if (actual_item_count == 0 || actual_item_count != expected_item_count)
1214       return false;
1215     const RepeatedPtrField<WordStartsMapEntry>&
1216         entries(list_item.word_starts_map_entry());
1217     for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
1218          entries.begin(); iter != entries.end(); ++iter) {
1219       HistoryID history_id = iter->history_id();
1220       RowWordStarts word_starts;
1221       // Restore the URL word starts.
1222       const RepeatedField<int32>& url_starts(iter->url_word_starts());
1223       for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
1224            jiter != url_starts.end(); ++jiter)
1225         word_starts.url_word_starts_.push_back(*jiter);
1226       // Restore the page title word starts.
1227       const RepeatedField<int32>& title_starts(iter->title_word_starts());
1228       for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
1229            jiter != title_starts.end(); ++jiter)
1230         word_starts.title_word_starts_.push_back(*jiter);
1231       word_starts_map_[history_id] = word_starts;
1232     }
1233   } else {
1234     // Since the cache did not contain any word starts we must rebuild then from
1235     // the URL and page titles.
1236     for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1237          iter != history_info_map_.end(); ++iter) {
1238       RowWordStarts word_starts;
1239       const URLRow& row(iter->second.url_row);
1240       const base::string16& url = bookmark_utils::CleanUpUrlForMatching(
1241           row.url(), languages, NULL);
1242       String16VectorFromString16(url, false, &word_starts.url_word_starts_);
1243       const base::string16& title =
1244           bookmark_utils::CleanUpTitleForMatching(row.title());
1245       String16VectorFromString16(title, false, &word_starts.title_word_starts_);
1246       word_starts_map_[iter->first] = word_starts;
1247     }
1248   }
1249   return true;
1250 }
1251
1252 // static
1253 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1254     const GURL& gurl,
1255     const std::set<std::string>& whitelist) {
1256   return whitelist.find(gurl.scheme()) != whitelist.end();
1257 }
1258
1259
1260 // SearchTermCacheItem ---------------------------------------------------------
1261
1262 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1263     const WordIDSet& word_id_set,
1264     const HistoryIDSet& history_id_set)
1265     : word_id_set_(word_id_set),
1266       history_id_set_(history_id_set),
1267       used_(true) {}
1268
1269 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem()
1270     : used_(true) {}
1271
1272 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {}
1273
1274
1275 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1276
1277 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1278     const URLIndexPrivateData& private_data,
1279     const std::string& languages,
1280     BookmarkService* bookmark_service,
1281     const base::string16& lower_string,
1282     const String16Vector& lower_terms,
1283     const base::Time now)
1284   : private_data_(private_data),
1285     languages_(languages),
1286     bookmark_service_(bookmark_service),
1287     lower_string_(lower_string),
1288     lower_terms_(lower_terms),
1289     now_(now) {
1290   // Calculate offsets for each term.  For instance, the offset for
1291   // ".net" should be 1, indicating that the actual word-part of the term
1292   // starts at offset 1.
1293   lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
1294   for (size_t i = 0; i < lower_terms_.size(); ++i) {
1295     base::i18n::BreakIterator iter(lower_terms_[i],
1296                                    base::i18n::BreakIterator::BREAK_WORD);
1297     // If the iterator doesn't work, assume an offset of 0.
1298     if (!iter.Init())
1299       continue;
1300     // Find the first word start.
1301     while (iter.Advance() && !iter.IsWord()) {}
1302     if (iter.IsWord())
1303       lower_terms_to_word_starts_offsets_[i] = iter.prev();
1304     // Else: the iterator didn't find a word break.  Assume an offset of 0.
1305   }
1306 }
1307
1308 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {}
1309
1310 void URLIndexPrivateData::AddHistoryMatch::operator()(
1311     const HistoryID history_id) {
1312   HistoryInfoMap::const_iterator hist_pos =
1313       private_data_.history_info_map_.find(history_id);
1314   if (hist_pos != private_data_.history_info_map_.end()) {
1315     const URLRow& hist_item = hist_pos->second.url_row;
1316     const VisitInfoVector& visits = hist_pos->second.visits;
1317     WordStartsMap::const_iterator starts_pos =
1318         private_data_.word_starts_map_.find(history_id);
1319     DCHECK(starts_pos != private_data_.word_starts_map_.end());
1320     ScoredHistoryMatch match(hist_item, visits, languages_, lower_string_,
1321                              lower_terms_, lower_terms_to_word_starts_offsets_,
1322                              starts_pos->second, now_, bookmark_service_);
1323     if (match.raw_score() > 0)
1324       scored_matches_.push_back(match);
1325   }
1326 }
1327
1328
1329 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1330
1331 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1332     const HistoryInfoMap& history_info_map)
1333     : history_info_map_(history_info_map) {
1334 }
1335
1336 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {}
1337
1338 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1339     const HistoryID h1,
1340     const HistoryID h2) {
1341   HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
1342   if (entry1 == history_info_map_.end())
1343     return false;
1344   HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
1345   if (entry2 == history_info_map_.end())
1346     return true;
1347   const URLRow& r1(entry1->second.url_row);
1348   const URLRow& r2(entry2->second.url_row);
1349   // First cut: typed count, visit count, recency.
1350   // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1351   // recently visited (within the last 12/24 hours) as highly important. Get
1352   // input from mpearson.
1353   if (r1.typed_count() != r2.typed_count())
1354     return (r1.typed_count() > r2.typed_count());
1355   if (r1.visit_count() != r2.visit_count())
1356     return (r1.visit_count() > r2.visit_count());
1357   return (r1.last_visit() > r2.last_visit());
1358 }
1359
1360 }  // namespace history