1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
7 #include "base/auto_reset.h"
8 #include "base/strings/string16.h"
9 #include "base/strings/utf_string_conversions.h"
10 #include "chrome/browser/history/scored_history_match.h"
11 #include "components/history/core/test/history_client_fake_bookmarks.h"
12 #include "testing/gtest/include/gtest/gtest.h"
14 using base::ASCIIToUTF16;
18 // Returns a VisitInfoVector that includes |num_visits| spread over the
19 // last |frequency|*|num_visits| days (relative to |now|). A frequency of
20 // one means one visit each day, two means every other day, etc.
// The i-th entry is stamped |now| minus i*|frequency| days, so index 0 is the
// most recent visit. Every entry is created with ui::PAGE_TRANSITION_LINK;
// the tests in this file overwrite |.second| on individual entries when they
// need a typed visit.
21 VisitInfoVector CreateVisitInfoVector(int num_visits,
24 VisitInfoVector visits;
25 for (int i = 0; i < num_visits; ++i) {
// Each visit is a (timestamp, page transition) pair.
27 std::make_pair(now - base::TimeDelta::FromDays(i * frequency),
28 ui::PAGE_TRANSITION_LINK));
// Test fixture for the ScoredHistoryMatch tests. It only supplies helpers for
// building URLRows, word-start data and search-term vectors, plus a wrapper
// that scores a single term against a URL and title.
33 class ScoredHistoryMatchTest : public testing::Test {
35 // Convenience function to create a URLRow with basic data for |url|, |title|,
36 // |visit_count|, and |typed_count|. |days_since_last_visit| gives the number
37 // of days ago to which to set the URL's last_visit.
38 URLRow MakeURLRow(const char* url,
41 int days_since_last_visit,
44 // Convenience function to set the word starts information from a URLRow's
// URL and title.
46 void PopulateWordStarts(const URLRow& url_row, RowWordStarts* word_starts);
48 // Convenience functions for easily creating vectors of search terms.
// The terms are passed as ASCII C strings and stored as UTF-16.
49 String16Vector Make1Term(const char* term) const;
50 String16Vector Make2Terms(const char* term_1, const char* term_2) const;
52 // Convenience function for GetTopicalityScore() that builds the
53 // term match and word break information automatically that are needed
54 // to call GetTopicalityScore(). It only works for scoring a single term,
55 // not multiple terms.
56 float GetTopicalityScoreOfTermAgainstURLAndTitle(const base::string16& term,
57 const base::string16& url,
58 const base::string16& title);
// Builds a URLRow for |url| and fills in its title, visit count, typed count
// and last-visit time. The last visit is set |days_since_last_visit| days
// before base::Time::NowFromSystemTime(), so tests that compare against "now"
// should take their reference time from the same function.
61 URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url,
64 int days_since_last_visit,
// NOTE(review): the second constructor argument (0) is presumably the row
// id -- confirm against URLRow.
66 URLRow row(GURL(url), 0);
// |title| arrives as ASCII and is stored as UTF-16.
67 row.set_title(ASCIIToUTF16(title));
68 row.set_visit_count(visit_count);
69 row.set_typed_count(typed_count);
70 row.set_last_visit(base::Time::NowFromSystemTime() -
71 base::TimeDelta::FromDays(days_since_last_visit));
// Fills |word_starts| from |url_row|: url_word_starts_ is computed from the
// row's URL spec (converted to UTF-16) and title_word_starts_ from the row's
// title, both via String16SetFromString16().
75 void ScoredHistoryMatchTest::PopulateWordStarts(
76 const URLRow& url_row, RowWordStarts* word_starts) {
77 String16SetFromString16(ASCIIToUTF16(url_row.url().spec()),
78 &word_starts->url_word_starts_);
79 String16SetFromString16(url_row.title(), &word_starts->title_word_starts_);
// Returns a one-element term vector holding |term| converted from ASCII to
// UTF-16.
83 String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const {
84 String16Vector original_terms;
85 original_terms.push_back(ASCIIToUTF16(term));
86 return original_terms;
// Returns a two-element term vector holding |term_1| followed by |term_2|,
// each converted from ASCII to UTF-16.
89 String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1,
90 const char* term_2) const {
91 String16Vector original_terms;
92 original_terms.push_back(ASCIIToUTF16(term_1));
93 original_terms.push_back(ASCIIToUTF16(term_2));
94 return original_terms;
// Scores a single |term| against |url| and |title| by calling
// GetTopicalityScore() on a default-constructed ScoredHistoryMatch whose
// match lists and word-start data are filled in here. Returns the value
// GetTopicalityScore() returns.
97 float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
98 const base::string16& term,
99 const base::string16& url,
100 const base::string16& title) {
101 // Make an empty match and simply populate the fields we need in order
102 // to call GetTopicalityScore().
103 ScoredHistoryMatch scored_match;
// NOTE(review): the trailing 0 is presumably the term number to record in
// each match -- confirm against MatchTermInString().
104 scored_match.url_matches_ = MatchTermInString(term, url, 0);
105 scored_match.title_matches_ = MatchTermInString(term, title, 0);
106 RowWordStarts word_starts;
107 String16SetFromString16(url, &word_starts.url_word_starts_);
108 String16SetFromString16(title, &word_starts.title_word_starts_);
// Word-start offsets for the search terms: one term, at offset 0.
109 WordStarts one_word_no_offset(1, 0u);
110 return scored_match.GetTopicalityScore(1, url, one_word_no_offset,
// Checks how raw_score() reacts to visit count, recency of the last visit,
// typed count, a term that occurs many times, and a term that only matches
// mid-word. Rows B, C and D are each compared against baseline row A.
// NOTE(review): row A uses the URL "http://fedcba" while rows B-D use
// "http://abcdef", so "abc" appears in the URL text of B-D but not of A. The
// comparisons against A therefore vary more than the single factor each
// section names.
114 TEST_F(ScoredHistoryMatchTest, Scoring) {
115 // We use NowFromSystemTime() because MakeURLRow uses the same function
116 // to calculate last visit time when building a row.
117 base::Time now = base::Time::NowFromSystemTime();
// Baseline row A.
119 URLRow row_a(MakeURLRow("http://fedcba", "abcd bcd", 3, 30, 1));
120 RowWordStarts word_starts_a;
121 PopulateWordStarts(row_a, &word_starts_a);
122 WordStarts one_word_no_offset(1, 0u);
123 VisitInfoVector visits_a = CreateVisitInfoVector(3, 30, now);
124 // Mark one visit as typed.
125 visits_a[0].second = ui::PAGE_TRANSITION_TYPED;
126 ScoredHistoryMatch scored_a(row_a, visits_a, std::string(),
127 ASCIIToUTF16("abc"), Make1Term("abc"),
128 one_word_no_offset, word_starts_a, now, NULL);
130 // Test scores based on visit_count.
131 URLRow row_b(MakeURLRow("http://abcdef", "abcd bcd", 10, 30, 1));
132 RowWordStarts word_starts_b;
133 PopulateWordStarts(row_b, &word_starts_b);
134 VisitInfoVector visits_b = CreateVisitInfoVector(10, 30, now);
135 visits_b[0].second = ui::PAGE_TRANSITION_TYPED;
136 ScoredHistoryMatch scored_b(row_b, visits_b, std::string(),
137 ASCIIToUTF16("abc"), Make1Term("abc"),
138 one_word_no_offset, word_starts_b, now, NULL);
139 EXPECT_GT(scored_b.raw_score(), scored_a.raw_score());
141 // Test scores based on last_visit.
142 URLRow row_c(MakeURLRow("http://abcdef", "abcd bcd", 3, 10, 1));
143 RowWordStarts word_starts_c;
144 PopulateWordStarts(row_c, &word_starts_c);
145 VisitInfoVector visits_c = CreateVisitInfoVector(3, 10, now);
146 visits_c[0].second = ui::PAGE_TRANSITION_TYPED;
147 ScoredHistoryMatch scored_c(row_c, visits_c, std::string(),
148 ASCIIToUTF16("abc"), Make1Term("abc"),
149 one_word_no_offset, word_starts_c, now, NULL);
150 EXPECT_GT(scored_c.raw_score(), scored_a.raw_score());
152 // Test scores based on typed_count.
153 URLRow row_d(MakeURLRow("http://abcdef", "abcd bcd", 3, 30, 3));
154 RowWordStarts word_starts_d;
155 PopulateWordStarts(row_d, &word_starts_d);
156 VisitInfoVector visits_d = CreateVisitInfoVector(3, 30, now);
// All three visits are typed here, versus one typed visit for row A.
157 visits_d[0].second = ui::PAGE_TRANSITION_TYPED;
158 visits_d[1].second = ui::PAGE_TRANSITION_TYPED;
159 visits_d[2].second = ui::PAGE_TRANSITION_TYPED;
160 ScoredHistoryMatch scored_d(row_d, visits_d, std::string(),
161 ASCIIToUTF16("abc"), Make1Term("abc"),
162 one_word_no_offset, word_starts_d, now, NULL);
163 EXPECT_GT(scored_d.raw_score(), scored_a.raw_score());
165 // Test scores based on a term appearing multiple times.
166 URLRow row_e(MakeURLRow("http://csi.csi.csi/csi_csi",
167 "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 3));
168 RowWordStarts word_starts_e;
169 PopulateWordStarts(row_e, &word_starts_e);
// Reuses row D's three typed visits.
170 const VisitInfoVector visits_e = visits_d;
171 ScoredHistoryMatch scored_e(row_e, visits_e, std::string(),
172 ASCIIToUTF16("csi"), Make1Term("csi"),
173 one_word_no_offset, word_starts_e, now, NULL);
// NOTE(review): 1400 is presumably a ceiling that repeated matches must
// not push the score past -- confirm where the threshold comes from.
174 EXPECT_LT(scored_e.raw_score(), 1400);
176 // Test that a result with only a mid-term match (i.e., not at a word
177 // boundary) scores 0.
178 ScoredHistoryMatch scored_f(row_a, visits_a, std::string(),
179 ASCIIToUTF16("cd"), Make1Term("cd"),
180 one_word_no_offset, word_starts_a, now, NULL);
181 EXPECT_EQ(scored_f.raw_score(), 0);
// Checks that the same row, visits and query score higher when the URL is
// bookmarked in the history client passed to ScoredHistoryMatch than when no
// client is passed.
184 TEST_F(ScoredHistoryMatchTest, ScoringBookmarks) {
185 // We use NowFromSystemTime() because MakeURLRow uses the same function
186 // to calculate last visit time when building a row.
187 base::Time now = base::Time::NowFromSystemTime();
189 std::string url_string("http://fedcba");
190 const GURL url(url_string);
191 URLRow row(MakeURLRow(url_string.c_str(), "abcd bcd", 8, 3, 1));
192 RowWordStarts word_starts;
193 PopulateWordStarts(row, &word_starts);
194 WordStarts one_word_no_offset(1, 0u);
195 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
// Baseline score: no history client is supplied (NULL).
196 ScoredHistoryMatch scored(row, visits, std::string(),
197 ASCIIToUTF16("abc"), Make1Term("abc"),
198 one_word_no_offset, word_starts, now, NULL);
199 // Now bookmark that URL and make sure its score increases.
// AutoReset sets bookmark_value_ to 5 for the rest of this scope and
// restores the previous value on exit.
200 base::AutoReset<int> reset(&ScoredHistoryMatch::bookmark_value_, 5);
201 history::HistoryClientFakeBookmarks history_client;
202 history_client.AddBookmark(url);
203 ScoredHistoryMatch scored_with_bookmark(
204 row, visits, std::string(), ASCIIToUTF16("abc"), Make1Term("abc"),
205 one_word_no_offset, word_starts, now, &history_client);
206 EXPECT_GT(scored_with_bookmark.raw_score(), scored.raw_score());
// Checks that the two-term query "fed com" scores 0 against
// "http://fedcba.com/" by default, and scores above 0 once
// ScoredHistoryMatch::allow_tld_matches_ is set to true.
209 TEST_F(ScoredHistoryMatchTest, ScoringTLD) {
210 // We use NowFromSystemTime() because MakeURLRow uses the same function
211 // to calculate last visit time when building a row.
212 base::Time now = base::Time::NowFromSystemTime();
214 // By default the URL should not be returned for a query that includes "com".
215 std::string url_string("http://fedcba.com/");
216 const GURL url(url_string);
// The title is empty, so only the URL can match.
217 URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1));
218 RowWordStarts word_starts;
219 PopulateWordStarts(row, &word_starts);
// Word-start offsets for the two search terms, both 0.
220 WordStarts two_words_no_offsets(2, 0u);
221 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
222 ScoredHistoryMatch scored(row, visits, std::string(),
223 ASCIIToUTF16("fed com"), Make2Terms("fed", "com"),
224 two_words_no_offsets, word_starts, now, NULL);
225 EXPECT_EQ(0, scored.raw_score());
227 // Now allow credit for the match in the TLD.
// AutoReset restores the previous flag value when this scope exits.
228 base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_tld_matches_, true);
229 ScoredHistoryMatch scored_with_tld(
230 row, visits, std::string(), ASCIIToUTF16("fed com"),
231 Make2Terms("fed", "com"), two_words_no_offsets, word_starts, now, NULL);
232 EXPECT_GT(scored_with_tld.raw_score(), 0);
// Checks that the two-term query "fed http" scores 0 against
// "http://fedcba/" by default, and scores above 0 once
// ScoredHistoryMatch::allow_scheme_matches_ is set to true.
235 TEST_F(ScoredHistoryMatchTest, ScoringScheme) {
236 // We use NowFromSystemTime() because MakeURLRow uses the same function
237 // to calculate last visit time when building a row.
238 base::Time now = base::Time::NowFromSystemTime();
240 // By default the URL should not be returned for a query that includes "http".
241 std::string url_string("http://fedcba/");
242 const GURL url(url_string);
// The title is empty, so only the URL can match.
243 URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1));
244 RowWordStarts word_starts;
245 PopulateWordStarts(row, &word_starts);
// Word-start offsets for the two search terms, both 0.
246 WordStarts two_words_no_offsets(2, 0u);
247 VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
248 ScoredHistoryMatch scored(row, visits, std::string(),
249 ASCIIToUTF16("fed http"), Make2Terms("fed", "http"),
250 two_words_no_offsets, word_starts, now, NULL);
251 EXPECT_EQ(0, scored.raw_score());
253 // Now allow credit for the match in the scheme.
// AutoReset restores the previous flag value when this scope exits.
254 base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_scheme_matches_, true);
255 ScoredHistoryMatch scored_with_scheme(
256 row, visits, std::string(), ASCIIToUTF16("fed http"),
257 Make2Terms("fed", "http"), two_words_no_offsets, word_starts, now, NULL);
258 EXPECT_GT(scored_with_scheme.raw_score(), 0);
// Checks can_inline() and match_in_scheme for one-letter queries against
// three URLs: "http://www.google.com", "http://teams.foo.com" and
// "https://www.testing.com".
// NOTE(review): |visits| is left empty for every match built here.
// NOTE(review): |word_starts| is reused across the three URLs without being
// reset in the visible code; whether PopulateWordStarts() replaces or appends
// to earlier contents is not visible here -- confirm.
261 TEST_F(ScoredHistoryMatchTest, Inlining) {
262 // We use NowFromSystemTime() because MakeURLRow uses the same function
263 // to calculate last visit time when building a row.
264 base::Time now = base::Time::NowFromSystemTime();
265 RowWordStarts word_starts;
266 WordStarts one_word_no_offset(1, 0u);
267 VisitInfoVector visits;
// Case 1: http://www.google.com. "g", "w" and "h" are expected to be
// inlineable and "o" is not; only "h" is expected to match in the scheme.
270 URLRow row(MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1));
271 PopulateWordStarts(row, &word_starts);
272 ScoredHistoryMatch scored_a(row, visits, std::string(),
273 ASCIIToUTF16("g"), Make1Term("g"),
274 one_word_no_offset, word_starts, now, NULL);
275 EXPECT_TRUE(scored_a.can_inline());
276 EXPECT_FALSE(scored_a.match_in_scheme);
277 ScoredHistoryMatch scored_b(row, visits, std::string(),
278 ASCIIToUTF16("w"), Make1Term("w"),
279 one_word_no_offset, word_starts, now, NULL);
280 EXPECT_TRUE(scored_b.can_inline());
281 EXPECT_FALSE(scored_b.match_in_scheme);
282 ScoredHistoryMatch scored_c(row, visits, std::string(),
283 ASCIIToUTF16("h"), Make1Term("h"),
284 one_word_no_offset, word_starts, now, NULL);
285 EXPECT_TRUE(scored_c.can_inline());
286 EXPECT_TRUE(scored_c.match_in_scheme);
287 ScoredHistoryMatch scored_d(row, visits, std::string(),
288 ASCIIToUTF16("o"), Make1Term("o"),
289 one_word_no_offset, word_starts, now, NULL);
290 EXPECT_FALSE(scored_d.can_inline());
291 EXPECT_FALSE(scored_d.match_in_scheme);
// Case 2: http://teams.foo.com. Only "t" is expected to be inlineable;
// "f" and "o" are not, and none is expected to match in the scheme.
295 URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1));
296 PopulateWordStarts(row, &word_starts);
297 ScoredHistoryMatch scored_a(row, visits, std::string(),
298 ASCIIToUTF16("t"), Make1Term("t"),
299 one_word_no_offset, word_starts, now, NULL);
300 EXPECT_TRUE(scored_a.can_inline());
301 EXPECT_FALSE(scored_a.match_in_scheme);
302 ScoredHistoryMatch scored_b(row, visits, std::string(),
303 ASCIIToUTF16("f"), Make1Term("f"),
304 one_word_no_offset, word_starts, now, NULL);
305 EXPECT_FALSE(scored_b.can_inline());
306 EXPECT_FALSE(scored_b.match_in_scheme);
307 ScoredHistoryMatch scored_c(row, visits, std::string(),
308 ASCIIToUTF16("o"), Make1Term("o"),
309 one_word_no_offset, word_starts, now, NULL);
310 EXPECT_FALSE(scored_c.can_inline());
311 EXPECT_FALSE(scored_c.match_in_scheme);
// Case 3: https://www.testing.com. "t", "h" and "w" are all expected to be
// inlineable; only "h" is expected to match in the scheme.
315 URLRow row(MakeURLRow("https://www.testing.com", "abcdef", 3, 30, 1));
316 PopulateWordStarts(row, &word_starts);
317 ScoredHistoryMatch scored_a(row, visits, std::string(),
318 ASCIIToUTF16("t"), Make1Term("t"),
319 one_word_no_offset, word_starts, now, NULL);
320 EXPECT_TRUE(scored_a.can_inline());
321 EXPECT_FALSE(scored_a.match_in_scheme);
322 ScoredHistoryMatch scored_b(row, visits, std::string(),
323 ASCIIToUTF16("h"), Make1Term("h"),
324 one_word_no_offset, word_starts, now, NULL);
325 EXPECT_TRUE(scored_b.can_inline());
326 EXPECT_TRUE(scored_b.match_in_scheme);
327 ScoredHistoryMatch scored_c(row, visits, std::string(),
328 ASCIIToUTF16("w"), Make1Term("w"),
329 one_word_no_offset, word_starts, now, NULL);
330 EXPECT_TRUE(scored_c.can_inline());
331 EXPECT_FALSE(scored_c.match_in_scheme);
// Checks that the topicality score is the same for "http://abc.def.com/" and
// "http://abc.def.com", i.e. with and without the trailing slash. The title
// is the same in both calls.
335 TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) {
336 const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle(
338 ASCIIToUTF16("http://abc.def.com/"),
339 ASCIIToUTF16("Non-Matching Title"));
340 const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
342 ASCIIToUTF16("http://abc.def.com"),
343 ASCIIToUTF16("Non-Matching Title"));
344 EXPECT_EQ(hostname_no_slash, hostname);
347 // This function only tests scoring of single terms that match exactly
348 // once somewhere in the URL or title.
// Each URL component (scheme, hostname, domain name, TLD, path, query
// argument) and the title is scored twice: once with a term that starts the
// word and once with the same term minus its first letter (the "mid_word"
// variants). The expectations below check only the relative ordering of
// those scores.
349 TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) {
350 base::string16 url = ASCIIToUTF16("http://abc.def.com/path1/path2?"
351 "arg1=val1&arg2=val2#hash_component");
352 base::string16 title = ASCIIToUTF16("here is a title");
353 const float hostname_score =
354 GetTopicalityScoreOfTermAgainstURLAndTitle(
355 ASCIIToUTF16("abc"), url, title);
356 const float hostname_mid_word_score =
357 GetTopicalityScoreOfTermAgainstURLAndTitle(
358 ASCIIToUTF16("bc"), url, title);
359 const float domain_name_score =
360 GetTopicalityScoreOfTermAgainstURLAndTitle(
361 ASCIIToUTF16("def"), url, title);
362 const float domain_name_mid_word_score =
363 GetTopicalityScoreOfTermAgainstURLAndTitle(
364 ASCIIToUTF16("ef"), url, title);
365 const float tld_score =
366 GetTopicalityScoreOfTermAgainstURLAndTitle(
367 ASCIIToUTF16("com"), url, title);
368 const float tld_mid_word_score =
369 GetTopicalityScoreOfTermAgainstURLAndTitle(
370 ASCIIToUTF16("om"), url, title);
371 const float path_score =
372 GetTopicalityScoreOfTermAgainstURLAndTitle(
373 ASCIIToUTF16("path1"), url, title);
374 const float path_mid_word_score =
375 GetTopicalityScoreOfTermAgainstURLAndTitle(
376 ASCIIToUTF16("ath1"), url, title);
377 const float arg_score =
378 GetTopicalityScoreOfTermAgainstURLAndTitle(
379 ASCIIToUTF16("arg2"), url, title);
380 const float arg_mid_word_score =
381 GetTopicalityScoreOfTermAgainstURLAndTitle(
382 ASCIIToUTF16("rg2"), url, title);
383 const float protocol_score =
384 GetTopicalityScoreOfTermAgainstURLAndTitle(
385 ASCIIToUTF16("htt"), url, title);
386 const float protocol_mid_word_score =
387 GetTopicalityScoreOfTermAgainstURLAndTitle(
388 ASCIIToUTF16("tt"), url, title);
389 const float title_score =
390 GetTopicalityScoreOfTermAgainstURLAndTitle(
391 ASCIIToUTF16("her"), url, title);
392 const float title_mid_word_score =
393 GetTopicalityScoreOfTermAgainstURLAndTitle(
394 ASCIIToUTF16("er"), url, title);
395 // Verify hostname and domain name > path > arg.
396 EXPECT_GT(hostname_score, path_score);
397 EXPECT_GT(domain_name_score, path_score);
398 EXPECT_GT(path_score, arg_score);
399 // Verify that domain name > path and domain name > arg for non-word
// boundary (mid-word) matches; the same is checked for the hostname.
401 EXPECT_GT(hostname_mid_word_score, path_mid_word_score);
402 EXPECT_GT(domain_name_mid_word_score, path_mid_word_score);
403 EXPECT_GT(domain_name_mid_word_score, arg_mid_word_score);
404 EXPECT_GT(hostname_mid_word_score, arg_mid_word_score);
405 // Also verify that the matches at non-word-boundaries all score
406 // worse than the matches at word boundaries. These three sets suffice.
407 EXPECT_GT(arg_score, hostname_mid_word_score);
408 EXPECT_GT(arg_score, domain_name_mid_word_score);
409 EXPECT_GT(title_score, title_mid_word_score);
410 // Check that title matches fit somewhere reasonable compared to the
411 // various types of URL matches.
412 EXPECT_GT(title_score, arg_score);
413 EXPECT_GT(arg_score, title_mid_word_score);
414 // Finally, verify that protocol matches and top level domain name
415 // matches (.com, .net, etc.) score worse than some of the mid-word
416 // matches that actually count.
417 EXPECT_GT(hostname_mid_word_score, protocol_score);
418 EXPECT_GT(hostname_mid_word_score, protocol_mid_word_score);
419 EXPECT_GT(hostname_mid_word_score, tld_score);
420 EXPECT_GT(hostname_mid_word_score, tld_mid_word_score);
423 } // namespace history