1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/history/scored_history_match.h"
15 #include "base/metrics/histogram.h"
16 #include "base/strings/string_util.h"
17 #include "base/strings/utf_string_conversions.h"
18 #include "chrome/browser/autocomplete/history_url_provider.h"
19 #include "chrome/browser/autocomplete/url_prefix.h"
20 #include "chrome/browser/bookmarks/bookmark_service.h"
21 #include "chrome/common/chrome_switches.h"
22 #include "content/public/browser/browser_thread.h"
26 // ScoredHistoryMatch ----------------------------------------------------------
28 bool ScoredHistoryMatch::initialized_ = false;
29 const size_t ScoredHistoryMatch::kMaxVisitsToScore = 10u;
30 bool ScoredHistoryMatch::also_do_hup_like_scoring = false;
31 int ScoredHistoryMatch::max_assigned_score_for_non_inlineable_matches = -1;
33 ScoredHistoryMatch::ScoredHistoryMatch()
37 InitializeAlsoDoHUPLikeScoringFieldAndMaxScoreField();
42 ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& row,
43 const VisitInfoVector& visits,
44 const std::string& languages,
45 const string16& lower_string,
46 const String16Vector& terms,
47 const RowWordStarts& word_starts,
49 BookmarkService* bookmark_service)
50 : HistoryMatch(row, 0, false, false),
54 InitializeAlsoDoHUPLikeScoringFieldAndMaxScoreField();
58 GURL gurl = row.url();
62 // Figure out where each search term appears in the URL and/or page title
63 // so that we can score as well as provide autocomplete highlighting.
64 string16 url = CleanUpUrlForMatching(gurl, languages);
65 string16 title = CleanUpTitleForMatching(row.title());
67 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
69 string16 term = *iter;
70 TermMatches url_term_matches = MatchTermInString(term, url, term_num);
71 TermMatches title_term_matches = MatchTermInString(term, title, term_num);
72 if (url_term_matches.empty() && title_term_matches.empty())
73 return; // A term was not found in either URL or title - reject.
74 url_matches.insert(url_matches.end(), url_term_matches.begin(),
75 url_term_matches.end());
76 title_matches.insert(title_matches.end(), title_term_matches.begin(),
77 title_term_matches.end());
80 // Sort matches by offset and eliminate any which overlap.
81 // TODO(mpearson): Investigate whether this has any meaningful
82 // effect on scoring. (It's necessary at some point: removing
83 // overlaps and sorting is needed to decide what to highlight in the
84 // suggestion string. But this sort and de-overlap doesn't have to
85 // be done before scoring.)
86 url_matches = SortAndDeoverlapMatches(url_matches);
87 title_matches = SortAndDeoverlapMatches(title_matches);
89 // We can inline autocomplete a match if:
90 // 1) there is only one search term
91 // 2) AND the match begins immediately after one of the prefixes in
92 // URLPrefix such as http://www and https:// (note that one of these
93 // is the empty prefix, for cases where the user has typed the scheme)
94 // 3) AND the search string does not end in whitespace (making it look to
95 // the IMUI as though there is a single search term when actually there
96 // is a second, empty term).
97 // |best_inlineable_prefix| stores the inlineable prefix computed in
98 // clause (2) or NULL if no such prefix exists. (The URL is not inlineable.)
99 // Note that using the best prefix here means that when multiple
100 // prefixes match, we'll choose to inline following the longest one.
101 // For a URL like "http://www.washingtonmutual.com", this means
102 // typing "w" will inline "ashington..." instead of "ww.washington...".
103 const URLPrefix* best_inlineable_prefix =
104 (!url_matches.empty() && (terms.size() == 1)) ?
105 URLPrefix::BestURLPrefix(UTF8ToUTF16(gurl.spec()), terms[0]) :
107 can_inline = (best_inlineable_prefix != NULL) &&
108 !IsWhitespace(*(lower_string.rbegin()));
109 match_in_scheme = can_inline && best_inlineable_prefix->prefix.empty();
111 // Initialize innermost_match.
112 // The idea here is that matches that occur in the scheme or
113 // "www." are worse than matches which don't. For the URLs
114 // "http://www.google.com" and "http://wellsfargo.com", we want
115 // the omnibox input "w" to cause the latter URL to rank higher
116 // than the former. Note that this is not the same as checking
117 // whether one match's inlinable prefix has more components than
118 // the other match's, since in this example, both matches would
119 // have an inlinable prefix of "http://", which is one component.
121 // Instead, we look for the overall best (i.e., most components)
122 // prefix of the current URL, and then check whether the inlinable
123 // prefix has that many components. If it does, this is an
124 // "innermost" match, and should be boosted. In the example
125 // above, the best prefixes for the two URLs have two and one
126 // components respectively, while the inlinable prefixes each
127 // have one component; this means the first match is not innermost
128 // and the second match is innermost, resulting in us boosting the
131 // Now, the code that implements this.
132 // The deepest prefix for this URL regardless of where the match is.
133 const URLPrefix* best_prefix =
134 URLPrefix::BestURLPrefix(UTF8ToUTF16(gurl.spec()), string16());
135 DCHECK(best_prefix != NULL);
136 const int num_components_in_best_prefix = best_prefix->num_components;
137 // If the URL is inlineable, we must have a match. Note the prefix that
138 // makes it inlineable may be empty.
139 DCHECK(best_inlineable_prefix != NULL);
140 const int num_components_in_best_inlineable_prefix =
141 best_inlineable_prefix->num_components;
142 innermost_match = (num_components_in_best_inlineable_prefix ==
143 num_components_in_best_prefix);
146 const float topicality_score = GetTopicalityScore(
147 terms.size(), url, url_matches, title_matches, word_starts);
148 const float frecency_score = GetFrecency(now, visits);
149 raw_score = GetFinalRelevancyScore(topicality_score, frecency_score);
151 (raw_score <= kint32max) ? static_cast<int>(raw_score) : kint32max;
153 if (also_do_hup_like_scoring && can_inline) {
154 // HistoryURL-provider-like scoring gives any match that is
155 // capable of being inlined a certain minimum score. Some of these
156 // are given a higher score that lets them be shown in inline.
157 // This test here derives from the test in
158 // HistoryURLProvider::PromoteMatchForInlineAutocomplete().
159 const bool promote_to_inline = (row.typed_count() > 1) ||
160 (IsHostOnly() && (row.typed_count() == 1));
161 int hup_like_score = promote_to_inline ?
162 HistoryURLProvider::kScoreForBestInlineableResult :
163 HistoryURLProvider::kBaseScoreForNonInlineableResult;
165 // Also, if the user types the hostname of a host with a typed
166 // visit, then everything from that host get given inlineable scores
167 // (because the URL-that-you-typed will go first and everything
168 // else will be assigned one minus the previous score, as coded
169 // at the end of HistoryURLProvider::DoAutocomplete().
170 if (UTF8ToUTF16(gurl.host()) == terms[0])
171 hup_like_score = HistoryURLProvider::kScoreForBestInlineableResult;
173 // HistoryURLProvider has the function PromoteOrCreateShorterSuggestion()
174 // that's meant to promote prefixes of the best match (if they've
175 // been visited enough related to the best match) or
176 // create/promote host-only suggestions (even if they've never
177 // been typed). The code is complicated and we don't try to
178 // duplicate the logic here. Instead, we handle a simple case: in
179 // low-typed-count ranges, give host-only matches (i.e.,
180 // http://www.foo.com/ vs. http://www.foo.com/bar.html) a boost so
181 // that the host-only match outscores all the other matches that
182 // would normally have the same base score. This behavior is not
183 // identical to what happens in HistoryURLProvider even in these
184 // low typed count ranges--sometimes it will create/promote when
185 // this test does not (indeed, we cannot create matches like HUP
186 // can) and vice versa--but the underlying philosophy is similar.
187 if (!promote_to_inline && IsHostOnly())
190 // All the other logic to goes into hup-like-scoring happens in
191 // the tie-breaker case of MatchScoreGreater().
193 // Incorporate hup_like_score into raw_score.
194 raw_score = std::max(raw_score, hup_like_score);
197 // If this match is not inlineable and there's a cap on the maximum
198 // score that can be given to non-inlineable matches, apply the cap.
199 if (!can_inline && (max_assigned_score_for_non_inlineable_matches != -1)) {
200 raw_score = std::min(max_assigned_score_for_non_inlineable_matches,
205 ScoredHistoryMatch::~ScoredHistoryMatch() {}
207 // Comparison function for sorting ScoredMatches by their scores with
208 // intelligent tie-breaking.
209 bool ScoredHistoryMatch::MatchScoreGreater(const ScoredHistoryMatch& m1,
210 const ScoredHistoryMatch& m2) {
211 if (m1.raw_score != m2.raw_score)
212 return m1.raw_score > m2.raw_score;
214 // This tie-breaking logic is inspired by / largely copied from the
215 // ordering logic in history_url_provider.cc CompareHistoryMatch().
217 // A URL that has been typed at all is better than one that has never been
218 // typed. (Note "!"s on each side.)
219 if (!m1.url_info.typed_count() != !m2.url_info.typed_count())
220 return m1.url_info.typed_count() > m2.url_info.typed_count();
222 // Innermost matches (matches after any scheme or "www.") are better than
223 // non-innermost matches.
224 if (m1.innermost_match != m2.innermost_match)
225 return m1.innermost_match;
227 // URLs that have been typed more often are better.
228 if (m1.url_info.typed_count() != m2.url_info.typed_count())
229 return m1.url_info.typed_count() > m2.url_info.typed_count();
231 // For URLs that have each been typed once, a host (alone) is better
232 // than a page inside.
233 if (m1.url_info.typed_count() == 1) {
234 if (m1.IsHostOnly() != m2.IsHostOnly())
235 return m1.IsHostOnly();
238 // URLs that have been visited more often are better.
239 if (m1.url_info.visit_count() != m2.url_info.visit_count())
240 return m1.url_info.visit_count() > m2.url_info.visit_count();
242 // URLs that have been visited more recently are better.
243 return m1.url_info.last_visit() > m2.url_info.last_visit();
247 float ScoredHistoryMatch::GetTopicalityScore(
250 const TermMatches& url_matches,
251 const TermMatches& title_matches,
252 const RowWordStarts& word_starts) {
253 // Because the below thread is not thread safe, we check that we're
254 // only calling it from one thread: the UI thread. Specifically,
255 // we check "if we've heard of the UI thread then we'd better
256 // be on it." The first part is necessary so unit tests pass. (Many
257 // unit tests don't set up the threading naming system; hence
258 // CurrentlyOn(UI thread) will fail.)
259 DCHECK(!content::BrowserThread::IsThreadInitialized(
260 content::BrowserThread::UI) ||
261 content::BrowserThread::CurrentlyOn(content::BrowserThread::UI));
262 if (raw_term_score_to_topicality_score == NULL) {
263 raw_term_score_to_topicality_score = new float[kMaxRawTermScore];
264 FillInTermScoreToTopicalityScoreArray();
266 // A vector that accumulates per-term scores. The strongest match--a
267 // match in the hostname at a word boundary--is worth 10 points.
268 // Everything else is less. In general, a match that's not at a word
269 // boundary is worth about 1/4th or 1/5th of a match at the word boundary
270 // in the same part of the URL/title.
271 DCHECK_GT(num_terms, 0);
272 std::vector<int> term_scores(num_terms, 0);
273 std::vector<size_t>::const_iterator next_word_starts =
274 word_starts.url_word_starts_.begin();
275 std::vector<size_t>::const_iterator end_word_starts =
276 word_starts.url_word_starts_.end();
277 const size_t question_mark_pos = url.find('?');
278 const size_t colon_pos = url.find(':');
279 // The + 3 skips the // that probably appears in the protocol
280 // after the colon. If the protocol doesn't have two slashes after
281 // the colon, that's okay--all this ends up doing is starting our
282 // search for the next / a few characters into the hostname. The
283 // only times this can cause problems is if we have a protocol without
284 // a // after the colon and the hostname is only one or two characters.
285 // This isn't worth worrying about.
286 const size_t end_of_hostname_pos = (colon_pos != std::string::npos) ?
287 url.find('/', colon_pos + 3) : url.find('/');
288 size_t last_part_of_hostname_pos =
289 (end_of_hostname_pos != std::string::npos) ?
290 url.rfind('.', end_of_hostname_pos) :
292 // Loop through all URL matches and score them appropriately.
293 for (TermMatches::const_iterator iter = url_matches.begin();
294 iter != url_matches.end(); ++iter) {
295 // Advance next_word_starts until it's >= the position of the term
296 // we're considering.
297 while ((next_word_starts != end_word_starts) &&
298 (*next_word_starts < iter->offset)) {
301 const bool at_word_boundary = (next_word_starts != end_word_starts) &&
302 (*next_word_starts == iter->offset);
303 if ((question_mark_pos != std::string::npos) &&
304 (iter->offset > question_mark_pos)) {
305 // match in CGI ?... fragment
306 term_scores[iter->term_num] += at_word_boundary ? 5 : 0;
307 } else if ((end_of_hostname_pos != std::string::npos) &&
308 (iter->offset > end_of_hostname_pos)) {
310 term_scores[iter->term_num] += at_word_boundary ? 8 : 0;
311 } else if ((colon_pos == std::string::npos) ||
312 (iter->offset > colon_pos)) {
314 if ((last_part_of_hostname_pos == std::string::npos) ||
315 (iter->offset < last_part_of_hostname_pos)) {
316 // Either there are no dots in the hostname or this match isn't
317 // the last dotted component.
318 term_scores[iter->term_num] += at_word_boundary ? 10 : 2;
319 } // else: match in the last part of a dotted hostname (usually
320 // this is the top-level domain .com, .net, etc.). Do not
321 // count this match for scoring.
322 } // else: match in protocol. Do not count this match for scoring.
324 // Now do the analogous loop over all matches in the title.
325 next_word_starts = word_starts.title_word_starts_.begin();
326 end_word_starts = word_starts.title_word_starts_.end();
328 for (TermMatches::const_iterator iter = title_matches.begin();
329 iter != title_matches.end(); ++iter) {
330 // Advance next_word_starts until it's >= the position of the term
331 // we're considering.
332 while ((next_word_starts != end_word_starts) &&
333 (*next_word_starts < iter->offset)) {
337 if (word_num >= 10) break; // only count the first ten words
338 const bool at_word_boundary = (next_word_starts != end_word_starts) &&
339 (*next_word_starts == iter->offset);
340 term_scores[iter->term_num] += at_word_boundary ? 8 : 0;
342 // TODO(mpearson): Restore logic for penalizing out-of-order matches.
343 // (Perhaps discount them by 0.8?)
344 // TODO(mpearson): Consider: if the earliest match occurs late in the string,
345 // should we discount it?
346 // TODO(mpearson): Consider: do we want to score based on how much of the
347 // input string the input covers? (I'm leaning toward no.)
349 // Compute the topicality_score as the sum of transformed term_scores.
350 float topicality_score = 0;
351 for (size_t i = 0; i < term_scores.size(); ++i) {
352 // Drop this URL if it seems like a term didn't appear or, more precisely,
353 // didn't appear in a part of the URL or title that we trust enough
354 // to give it credit for. For instance, terms that appear in the middle
355 // of a CGI parameter get no credit. Almost all the matches dropped
356 // due to this test would look stupid if shown to the user.
357 if (term_scores[i] == 0)
359 topicality_score += raw_term_score_to_topicality_score[
360 (term_scores[i] >= kMaxRawTermScore) ? (kMaxRawTermScore - 1) :
363 // TODO(mpearson): If there are multiple terms, consider taking the
364 // geometric mean of per-term scores rather than the arithmetic mean.
366 return topicality_score / num_terms;
370 float* ScoredHistoryMatch::raw_term_score_to_topicality_score = NULL;
373 void ScoredHistoryMatch::FillInTermScoreToTopicalityScoreArray() {
374 for (int term_score = 0; term_score < kMaxRawTermScore; ++term_score) {
375 float topicality_score;
376 if (term_score < 10) {
377 // If the term scores less than 10 points (no full-credit hit, or
378 // no combination of hits that score that well), then the topicality
379 // score is linear in the term score.
380 topicality_score = 0.1 * term_score;
382 // For term scores of at least ten points, pass them through a log
383 // function so a score of 10 points gets a 1.0 (to meet up exactly
384 // with the linear component) and increases logarithmically until
385 // maxing out at 30 points, with computes to a score around 2.1.
386 topicality_score = (1.0 + 2.25 * log10(0.1 * term_score));
388 raw_term_score_to_topicality_score[term_score] = topicality_score;
393 float* ScoredHistoryMatch::days_ago_to_recency_score = NULL;
396 float ScoredHistoryMatch::GetRecencyScore(int last_visit_days_ago) {
397 // Because the below thread is not thread safe, we check that we're
398 // only calling it from one thread: the UI thread. Specifically,
399 // we check "if we've heard of the UI thread then we'd better
400 // be on it." The first part is necessary so unit tests pass. (Many
401 // unit tests don't set up the threading naming system; hence
402 // CurrentlyOn(UI thread) will fail.)
403 DCHECK(!content::BrowserThread::IsThreadInitialized(
404 content::BrowserThread::UI) ||
405 content::BrowserThread::CurrentlyOn(content::BrowserThread::UI));
406 if (days_ago_to_recency_score == NULL) {
407 days_ago_to_recency_score = new float[kDaysToPrecomputeRecencyScoresFor];
408 FillInDaysAgoToRecencyScoreArray();
410 // Lookup the score in days_ago_to_recency_score, treating
411 // everything older than what we've precomputed as the oldest thing
412 // we've precomputed. The std::max is to protect against corruption
413 // in the database (in case last_visit_days_ago is negative).
414 return days_ago_to_recency_score[
416 std::min(last_visit_days_ago, kDaysToPrecomputeRecencyScoresFor - 1),
420 void ScoredHistoryMatch::FillInDaysAgoToRecencyScoreArray() {
421 for (int days_ago = 0; days_ago < kDaysToPrecomputeRecencyScoresFor;
423 int unnormalized_recency_score;
425 unnormalized_recency_score = 100;
426 } else if (days_ago <= 14) {
427 // Linearly extrapolate between 4 and 14 days so 14 days has a score
429 unnormalized_recency_score = 70 + (14 - days_ago) * (100 - 70) / (14 - 4);
430 } else if (days_ago <= 31) {
431 // Linearly extrapolate between 14 and 31 days so 31 days has a score
433 unnormalized_recency_score = 50 + (31 - days_ago) * (70 - 50) / (31 - 14);
434 } else if (days_ago <= 90) {
435 // Linearly extrapolate between 30 and 90 days so 90 days has a score
437 unnormalized_recency_score = 30 + (90 - days_ago) * (50 - 30) / (90 - 30);
439 // Linearly extrapolate between 90 and 365 days so 365 days has a score
441 unnormalized_recency_score =
442 10 + (365 - days_ago) * (20 - 10) / (365 - 90);
444 days_ago_to_recency_score[days_ago] = unnormalized_recency_score / 100.0;
446 DCHECK_LE(days_ago_to_recency_score[days_ago],
447 days_ago_to_recency_score[days_ago - 1]);
453 float ScoredHistoryMatch::GetFrecency(const base::Time& now,
454 const VisitInfoVector& visits) {
455 // Compute the weighted average |value_of_transition| over the last at
456 // most kMaxVisitsToScore visits, where each visit is weighted using
457 // GetRecencyScore() based on how many days ago it happened. Use
458 // kMaxVisitsToScore as the denominator for the average regardless of
459 // how many visits there were in order to penalize a match that has
460 // fewer visits than kMaxVisitsToScore.
461 const int total_sampled_visits = std::min(visits.size(), kMaxVisitsToScore);
462 if (total_sampled_visits == 0)
464 float summed_visit_points = 0;
465 for (int i = 0; i < total_sampled_visits; ++i) {
466 const int value_of_transition =
467 (visits[i].second == content::PAGE_TRANSITION_TYPED) ? 20 : 1;
468 const float bucket_weight =
469 GetRecencyScore((now - visits[i].first).InDays());
470 summed_visit_points += (value_of_transition * bucket_weight);
472 return visits.size() * summed_visit_points / total_sampled_visits;
476 float ScoredHistoryMatch::GetFinalRelevancyScore(float topicality_score,
477 float frecency_score) {
478 if (topicality_score == 0)
480 // Here's how to interpret intermediate_score: Suppose the omnibox
481 // has one input term. Suppose we have a URL for which the omnibox
482 // input term has a single URL hostname hit at a word boundary. (This
483 // implies topicality_score = 1.0.). Then the intermediate_score for
484 // this URL will depend entirely on the frecency_score with
485 // this interpretation:
486 // - a single typed visit more than three months ago, no other visits -> 0.2
487 // - a visit every three days, no typed visits -> 0.706
488 // - a visit every day, no typed visits -> 0.916
489 // - a single typed visit yesterday, no other visits -> 2.0
490 // - a typed visit once a week -> 11.77
491 // - a typed visit every three days -> 14.12
492 // - at least ten typed visits today -> 20.0 (maximum score)
493 const float intermediate_score = topicality_score * frecency_score;
494 // The below code maps intermediate_score to the range [0, 1399].
495 // The score maxes out at 1400 (i.e., cannot beat a good inline result).
496 if (intermediate_score <= 1) {
497 // Linearly extrapolate between 0 and 1.5 so 0 has a score of 400
498 // and 1.5 has a score of 600.
499 const float slope = (600 - 400) / (1.5f - 0.0f);
500 return 400 + slope * intermediate_score;
502 if (intermediate_score <= 12.0) {
503 // Linearly extrapolate up to 12 so 12 has a score of 1300.
504 const float slope = (1300 - 600) / (12.0f - 1.5f);
505 return 600 + slope * (intermediate_score - 1.5);
507 // Linearly extrapolate so a score of 20 (or more) has a score of 1399.
508 // (Scores above 20 are possible for URLs that have multiple term hits
509 // in the URL and/or title and that are visited practically all
510 // the time using typed visits. We don't attempt to distinguish
511 // between these very good results.)
512 const float slope = (1399 - 1300) / (20.0f - 12.0f);
513 return std::min(1399.0, 1300 + slope * (intermediate_score - 12.0));
516 void ScoredHistoryMatch::InitializeAlsoDoHUPLikeScoringFieldAndMaxScoreField() {
517 also_do_hup_like_scoring = false;
518 // When doing HUP-like scoring, don't allow a non-inlineable match
519 // to beat the score of good inlineable matches. This is a problem
520 // because if a non-inlineable match ends up with the highest score
521 // from HistoryQuick provider, all HistoryQuick matches get demoted
522 // to non-inlineable scores (scores less than 1200). Without
523 // HUP-like-scoring, these results would actually come from the HUP
524 // and not be demoted, thus outscoring the demoted HQP results.
525 // When the HQP provides these, we need to clamp the non-inlineable
526 // results to preserve this behavior.
527 if (also_do_hup_like_scoring) {
528 max_assigned_score_for_non_inlineable_matches =
529 HistoryURLProvider::kScoreForBestInlineableResult - 1;
533 } // namespace history