Upstream version 10.39.225.0
[platform/framework/web/crosswalk.git] / src / chrome / browser / history / scored_history_match_unittest.cc
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <algorithm>
6
7 #include "base/auto_reset.h"
8 #include "base/strings/string16.h"
9 #include "base/strings/utf_string_conversions.h"
10 #include "chrome/browser/history/scored_history_match.h"
11 #include "components/history/core/test/history_client_fake_bookmarks.h"
12 #include "testing/gtest/include/gtest/gtest.h"
13
14 using base::ASCIIToUTF16;
15
16 namespace history {
17
18 // Returns a VisitInfoVector that includes |num_visits| spread over the
19 // last |frequency|*|num_visits| days (relative to |now|).  A frequency of
20 // one means one visit each day, two means every other day, etc.
21 VisitInfoVector CreateVisitInfoVector(int num_visits,
22                                       int frequency,
23                                       base::Time now) {
24   VisitInfoVector visits;
25   for (int i = 0; i < num_visits; ++i) {
26     visits.push_back(
27         std::make_pair(now - base::TimeDelta::FromDays(i * frequency),
28                        ui::PAGE_TRANSITION_LINK));
29   }
30   return visits;
31 }
32
33 class ScoredHistoryMatchTest : public testing::Test {
34  protected:
35   // Convenience function to create a URLRow with basic data for |url|, |title|,
36   // |visit_count|, and |typed_count|. |days_since_last_visit| gives the number
37   // of days ago to which to set the URL's last_visit.
38   URLRow MakeURLRow(const char* url,
39                     const char* title,
40                     int visit_count,
41                     int days_since_last_visit,
42                     int typed_count);
43
44   // Convenience function to set the word starts information from a URLRow's
45   // URL and title.
46   void PopulateWordStarts(const URLRow& url_row, RowWordStarts* word_starts);
47
48   // Convenience functions for easily creating vectors of search terms.
49   String16Vector Make1Term(const char* term) const;
50   String16Vector Make2Terms(const char* term_1, const char* term_2) const;
51
52   // Convenience function for GetTopicalityScore() that builds the
53   // term match and word break information automatically that are needed
54   // to call GetTopicalityScore().  It only works for scoring a single term,
55   // not multiple terms.
56   float GetTopicalityScoreOfTermAgainstURLAndTitle(const base::string16& term,
57                                                    const base::string16& url,
58                                                    const base::string16& title);
59 };
60
61 URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url,
62                                           const char* title,
63                                           int visit_count,
64                                           int days_since_last_visit,
65                                           int typed_count) {
66   URLRow row(GURL(url), 0);
67   row.set_title(ASCIIToUTF16(title));
68   row.set_visit_count(visit_count);
69   row.set_typed_count(typed_count);
70   row.set_last_visit(base::Time::NowFromSystemTime() -
71                      base::TimeDelta::FromDays(days_since_last_visit));
72   return row;
73 }
74
75 void ScoredHistoryMatchTest::PopulateWordStarts(
76     const URLRow& url_row, RowWordStarts* word_starts) {
77   String16SetFromString16(ASCIIToUTF16(url_row.url().spec()),
78                           &word_starts->url_word_starts_);
79   String16SetFromString16(url_row.title(), &word_starts->title_word_starts_);
80 }
81
82
83 String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const {
84   String16Vector original_terms;
85   original_terms.push_back(ASCIIToUTF16(term));
86   return original_terms;
87 }
88
89 String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1,
90                                                   const char* term_2) const {
91   String16Vector original_terms;
92   original_terms.push_back(ASCIIToUTF16(term_1));
93   original_terms.push_back(ASCIIToUTF16(term_2));
94   return original_terms;
95 }
96
97 float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
98     const base::string16& term,
99     const base::string16& url,
100     const base::string16& title) {
101   // Make an empty match and simply populate the fields we need in order
102   // to call GetTopicalityScore().
103   ScoredHistoryMatch scored_match;
104   scored_match.url_matches_ = MatchTermInString(term, url, 0);
105   scored_match.title_matches_ = MatchTermInString(term, title, 0);
106   RowWordStarts word_starts;
107   String16SetFromString16(url, &word_starts.url_word_starts_);
108   String16SetFromString16(title, &word_starts.title_word_starts_);
109   WordStarts one_word_no_offset(1, 0u);
110   return scored_match.GetTopicalityScore(1, url, one_word_no_offset,
111                                          word_starts);
112 }
113
114 TEST_F(ScoredHistoryMatchTest, Scoring) {
115   // We use NowFromSystemTime() because MakeURLRow uses the same function
116   // to calculate last visit time when building a row.
117   base::Time now = base::Time::NowFromSystemTime();
118
119   URLRow row_a(MakeURLRow("http://fedcba", "abcd bcd", 3, 30, 1));
120   RowWordStarts word_starts_a;
121   PopulateWordStarts(row_a, &word_starts_a);
122   WordStarts one_word_no_offset(1, 0u);
123   VisitInfoVector visits_a = CreateVisitInfoVector(3, 30, now);
124   // Mark one visit as typed.
125   visits_a[0].second = ui::PAGE_TRANSITION_TYPED;
126   ScoredHistoryMatch scored_a(row_a, visits_a, std::string(),
127                               ASCIIToUTF16("abc"), Make1Term("abc"),
128                               one_word_no_offset, word_starts_a, now, NULL);
129
130   // Test scores based on visit_count.
131   URLRow row_b(MakeURLRow("http://abcdef", "abcd bcd", 10, 30, 1));
132   RowWordStarts word_starts_b;
133   PopulateWordStarts(row_b, &word_starts_b);
134   VisitInfoVector visits_b = CreateVisitInfoVector(10, 30, now);
135   visits_b[0].second = ui::PAGE_TRANSITION_TYPED;
136   ScoredHistoryMatch scored_b(row_b, visits_b, std::string(),
137                               ASCIIToUTF16("abc"), Make1Term("abc"),
138                               one_word_no_offset, word_starts_b, now, NULL);
139   EXPECT_GT(scored_b.raw_score(), scored_a.raw_score());
140
141   // Test scores based on last_visit.
142   URLRow row_c(MakeURLRow("http://abcdef", "abcd bcd", 3, 10, 1));
143   RowWordStarts word_starts_c;
144   PopulateWordStarts(row_c, &word_starts_c);
145   VisitInfoVector visits_c = CreateVisitInfoVector(3, 10, now);
146   visits_c[0].second = ui::PAGE_TRANSITION_TYPED;
147   ScoredHistoryMatch scored_c(row_c, visits_c, std::string(),
148                               ASCIIToUTF16("abc"), Make1Term("abc"),
149                               one_word_no_offset, word_starts_c, now, NULL);
150   EXPECT_GT(scored_c.raw_score(), scored_a.raw_score());
151
152   // Test scores based on typed_count.
153   URLRow row_d(MakeURLRow("http://abcdef", "abcd bcd", 3, 30, 3));
154   RowWordStarts word_starts_d;
155   PopulateWordStarts(row_d, &word_starts_d);
156   VisitInfoVector visits_d = CreateVisitInfoVector(3, 30, now);
157   visits_d[0].second = ui::PAGE_TRANSITION_TYPED;
158   visits_d[1].second = ui::PAGE_TRANSITION_TYPED;
159   visits_d[2].second = ui::PAGE_TRANSITION_TYPED;
160   ScoredHistoryMatch scored_d(row_d, visits_d, std::string(),
161                               ASCIIToUTF16("abc"), Make1Term("abc"),
162                               one_word_no_offset, word_starts_d, now, NULL);
163   EXPECT_GT(scored_d.raw_score(), scored_a.raw_score());
164
165   // Test scores based on a terms appearing multiple times.
166   URLRow row_e(MakeURLRow("http://csi.csi.csi/csi_csi",
167       "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 3));
168   RowWordStarts word_starts_e;
169   PopulateWordStarts(row_e, &word_starts_e);
170   const VisitInfoVector visits_e = visits_d;
171   ScoredHistoryMatch scored_e(row_e, visits_e, std::string(),
172                               ASCIIToUTF16("csi"), Make1Term("csi"),
173                               one_word_no_offset, word_starts_e, now, NULL);
174   EXPECT_LT(scored_e.raw_score(), 1400);
175
176   // Test that a result with only a mid-term match (i.e., not at a word
177   // boundary) scores 0.
178   ScoredHistoryMatch scored_f(row_a, visits_a, std::string(),
179                               ASCIIToUTF16("cd"), Make1Term("cd"),
180                               one_word_no_offset, word_starts_a, now, NULL);
181   EXPECT_EQ(scored_f.raw_score(), 0);
182 }
183
184 TEST_F(ScoredHistoryMatchTest, ScoringBookmarks) {
185   // We use NowFromSystemTime() because MakeURLRow uses the same function
186   // to calculate last visit time when building a row.
187   base::Time now = base::Time::NowFromSystemTime();
188
189   std::string url_string("http://fedcba");
190   const GURL url(url_string);
191   URLRow row(MakeURLRow(url_string.c_str(), "abcd bcd", 8, 3, 1));
192   RowWordStarts word_starts;
193   PopulateWordStarts(row, &word_starts);
194   WordStarts one_word_no_offset(1, 0u);
195   VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
196   ScoredHistoryMatch scored(row, visits, std::string(),
197                             ASCIIToUTF16("abc"), Make1Term("abc"),
198                             one_word_no_offset, word_starts, now, NULL);
199   // Now bookmark that URL and make sure its score increases.
200   base::AutoReset<int> reset(&ScoredHistoryMatch::bookmark_value_, 5);
201   history::HistoryClientFakeBookmarks history_client;
202   history_client.AddBookmark(url);
203   ScoredHistoryMatch scored_with_bookmark(
204       row, visits, std::string(), ASCIIToUTF16("abc"), Make1Term("abc"),
205       one_word_no_offset, word_starts, now, &history_client);
206   EXPECT_GT(scored_with_bookmark.raw_score(), scored.raw_score());
207 }
208
209 TEST_F(ScoredHistoryMatchTest, ScoringTLD) {
210   // We use NowFromSystemTime() because MakeURLRow uses the same function
211   // to calculate last visit time when building a row.
212   base::Time now = base::Time::NowFromSystemTime();
213
214   // By default the URL should not be returned for a query that includes "com".
215   std::string url_string("http://fedcba.com/");
216   const GURL url(url_string);
217   URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1));
218   RowWordStarts word_starts;
219   PopulateWordStarts(row, &word_starts);
220   WordStarts two_words_no_offsets(2, 0u);
221   VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
222   ScoredHistoryMatch scored(row, visits, std::string(),
223                             ASCIIToUTF16("fed com"), Make2Terms("fed", "com"),
224                             two_words_no_offsets, word_starts, now, NULL);
225   EXPECT_EQ(0, scored.raw_score());
226
227   // Now allow credit for the match in the TLD.
228   base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_tld_matches_, true);
229   ScoredHistoryMatch scored_with_tld(
230       row, visits, std::string(), ASCIIToUTF16("fed com"),
231       Make2Terms("fed", "com"), two_words_no_offsets, word_starts, now, NULL);
232   EXPECT_GT(scored_with_tld.raw_score(), 0);
233 }
234
235 TEST_F(ScoredHistoryMatchTest, ScoringScheme) {
236   // We use NowFromSystemTime() because MakeURLRow uses the same function
237   // to calculate last visit time when building a row.
238   base::Time now = base::Time::NowFromSystemTime();
239
240   // By default the URL should not be returned for a query that includes "http".
241   std::string url_string("http://fedcba/");
242   const GURL url(url_string);
243   URLRow row(MakeURLRow(url_string.c_str(), "", 8, 3, 1));
244   RowWordStarts word_starts;
245   PopulateWordStarts(row, &word_starts);
246   WordStarts two_words_no_offsets(2, 0u);
247   VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
248   ScoredHistoryMatch scored(row, visits, std::string(),
249                             ASCIIToUTF16("fed http"), Make2Terms("fed", "http"),
250                             two_words_no_offsets, word_starts, now, NULL);
251   EXPECT_EQ(0, scored.raw_score());
252
253   // Now allow credit for the match in the scheme.
254   base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_scheme_matches_, true);
255   ScoredHistoryMatch scored_with_scheme(
256       row, visits, std::string(), ASCIIToUTF16("fed http"),
257       Make2Terms("fed", "http"), two_words_no_offsets, word_starts, now, NULL);
258   EXPECT_GT(scored_with_scheme.raw_score(), 0);
259 }
260
261 TEST_F(ScoredHistoryMatchTest, Inlining) {
262   // We use NowFromSystemTime() because MakeURLRow uses the same function
263   // to calculate last visit time when building a row.
264   base::Time now = base::Time::NowFromSystemTime();
265   RowWordStarts word_starts;
266   WordStarts one_word_no_offset(1, 0u);
267   VisitInfoVector visits;
268
269   {
270     URLRow row(MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1));
271     PopulateWordStarts(row, &word_starts);
272     ScoredHistoryMatch scored_a(row, visits, std::string(),
273                                 ASCIIToUTF16("g"), Make1Term("g"),
274                                 one_word_no_offset, word_starts, now, NULL);
275     EXPECT_TRUE(scored_a.can_inline());
276     EXPECT_FALSE(scored_a.match_in_scheme);
277     ScoredHistoryMatch scored_b(row, visits, std::string(),
278                                 ASCIIToUTF16("w"), Make1Term("w"),
279                                 one_word_no_offset, word_starts, now, NULL);
280     EXPECT_TRUE(scored_b.can_inline());
281     EXPECT_FALSE(scored_b.match_in_scheme);
282     ScoredHistoryMatch scored_c(row, visits, std::string(),
283                                 ASCIIToUTF16("h"), Make1Term("h"),
284                                 one_word_no_offset, word_starts, now, NULL);
285     EXPECT_TRUE(scored_c.can_inline());
286     EXPECT_TRUE(scored_c.match_in_scheme);
287     ScoredHistoryMatch scored_d(row, visits, std::string(),
288                                 ASCIIToUTF16("o"), Make1Term("o"),
289                                 one_word_no_offset, word_starts, now, NULL);
290     EXPECT_FALSE(scored_d.can_inline());
291     EXPECT_FALSE(scored_d.match_in_scheme);
292   }
293
294   {
295     URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1));
296     PopulateWordStarts(row, &word_starts);
297     ScoredHistoryMatch scored_a(row, visits, std::string(),
298                                 ASCIIToUTF16("t"), Make1Term("t"),
299                                 one_word_no_offset, word_starts, now, NULL);
300     EXPECT_TRUE(scored_a.can_inline());
301     EXPECT_FALSE(scored_a.match_in_scheme);
302     ScoredHistoryMatch scored_b(row, visits, std::string(),
303                                 ASCIIToUTF16("f"), Make1Term("f"),
304                                 one_word_no_offset, word_starts, now, NULL);
305     EXPECT_FALSE(scored_b.can_inline());
306     EXPECT_FALSE(scored_b.match_in_scheme);
307     ScoredHistoryMatch scored_c(row, visits, std::string(),
308                                 ASCIIToUTF16("o"), Make1Term("o"),
309                                 one_word_no_offset, word_starts, now, NULL);
310     EXPECT_FALSE(scored_c.can_inline());
311     EXPECT_FALSE(scored_c.match_in_scheme);
312   }
313
314   {
315     URLRow row(MakeURLRow("https://www.testing.com", "abcdef", 3, 30, 1));
316     PopulateWordStarts(row, &word_starts);
317     ScoredHistoryMatch scored_a(row, visits, std::string(),
318                                 ASCIIToUTF16("t"), Make1Term("t"),
319                                 one_word_no_offset, word_starts, now, NULL);
320     EXPECT_TRUE(scored_a.can_inline());
321     EXPECT_FALSE(scored_a.match_in_scheme);
322     ScoredHistoryMatch scored_b(row, visits, std::string(),
323                                 ASCIIToUTF16("h"), Make1Term("h"),
324                                 one_word_no_offset, word_starts, now, NULL);
325     EXPECT_TRUE(scored_b.can_inline());
326     EXPECT_TRUE(scored_b.match_in_scheme);
327     ScoredHistoryMatch scored_c(row, visits, std::string(),
328                                 ASCIIToUTF16("w"), Make1Term("w"),
329                                 one_word_no_offset, word_starts, now, NULL);
330     EXPECT_TRUE(scored_c.can_inline());
331     EXPECT_FALSE(scored_c.match_in_scheme);
332   }
333 }
334
335 TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) {
336   const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle(
337       ASCIIToUTF16("def"),
338       ASCIIToUTF16("http://abc.def.com/"),
339       ASCIIToUTF16("Non-Matching Title"));
340   const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
341       ASCIIToUTF16("def"),
342       ASCIIToUTF16("http://abc.def.com"),
343       ASCIIToUTF16("Non-Matching Title"));
344   EXPECT_EQ(hostname_no_slash, hostname);
345 }
346
347 // This function only tests scoring of single terms that match exactly
348 // once somewhere in the URL or title.
349 TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) {
350   base::string16 url = ASCIIToUTF16("http://abc.def.com/path1/path2?"
351       "arg1=val1&arg2=val2#hash_component");
352   base::string16 title = ASCIIToUTF16("here is a title");
353   const float hostname_score =
354       GetTopicalityScoreOfTermAgainstURLAndTitle(
355           ASCIIToUTF16("abc"), url, title);
356   const float hostname_mid_word_score =
357       GetTopicalityScoreOfTermAgainstURLAndTitle(
358           ASCIIToUTF16("bc"), url, title);
359   const float domain_name_score =
360       GetTopicalityScoreOfTermAgainstURLAndTitle(
361           ASCIIToUTF16("def"), url, title);
362   const float domain_name_mid_word_score =
363       GetTopicalityScoreOfTermAgainstURLAndTitle(
364           ASCIIToUTF16("ef"), url, title);
365   const float tld_score =
366       GetTopicalityScoreOfTermAgainstURLAndTitle(
367           ASCIIToUTF16("com"), url, title);
368   const float tld_mid_word_score =
369       GetTopicalityScoreOfTermAgainstURLAndTitle(
370           ASCIIToUTF16("om"), url, title);
371   const float path_score =
372       GetTopicalityScoreOfTermAgainstURLAndTitle(
373           ASCIIToUTF16("path1"), url, title);
374   const float path_mid_word_score =
375       GetTopicalityScoreOfTermAgainstURLAndTitle(
376           ASCIIToUTF16("ath1"), url, title);
377   const float arg_score =
378       GetTopicalityScoreOfTermAgainstURLAndTitle(
379           ASCIIToUTF16("arg2"), url, title);
380   const float arg_mid_word_score =
381       GetTopicalityScoreOfTermAgainstURLAndTitle(
382           ASCIIToUTF16("rg2"), url, title);
383   const float protocol_score =
384       GetTopicalityScoreOfTermAgainstURLAndTitle(
385           ASCIIToUTF16("htt"), url, title);
386   const float protocol_mid_word_score =
387       GetTopicalityScoreOfTermAgainstURLAndTitle(
388           ASCIIToUTF16("tt"), url, title);
389   const float title_score =
390       GetTopicalityScoreOfTermAgainstURLAndTitle(
391           ASCIIToUTF16("her"), url, title);
392   const float title_mid_word_score =
393       GetTopicalityScoreOfTermAgainstURLAndTitle(
394           ASCIIToUTF16("er"), url, title);
395   // Verify hostname and domain name > path > arg.
396   EXPECT_GT(hostname_score, path_score);
397   EXPECT_GT(domain_name_score, path_score);
398   EXPECT_GT(path_score, arg_score);
399   // Verify that domain name > path and domain name > arg for non-word
400   // boundaries.
401   EXPECT_GT(hostname_mid_word_score, path_mid_word_score);
402   EXPECT_GT(domain_name_mid_word_score, path_mid_word_score);
403   EXPECT_GT(domain_name_mid_word_score, arg_mid_word_score);
404   EXPECT_GT(hostname_mid_word_score, arg_mid_word_score);
405   // Also verify that the matches at non-word-boundaries all score
406   // worse than the matches at word boundaries.  These three sets suffice.
407   EXPECT_GT(arg_score, hostname_mid_word_score);
408   EXPECT_GT(arg_score, domain_name_mid_word_score);
409   EXPECT_GT(title_score, title_mid_word_score);
410   // Check that title matches fit somewhere reasonable compared to the
411   // various types of URL matches.
412   EXPECT_GT(title_score, arg_score);
413   EXPECT_GT(arg_score, title_mid_word_score);
414   // Finally, verify that protocol matches and top level domain name
415   // matches (.com, .net, etc.) score worse than some of the mid-word
416   // matches that actually count.
417   EXPECT_GT(hostname_mid_word_score, protocol_score);
418   EXPECT_GT(hostname_mid_word_score, protocol_mid_word_score);
419   EXPECT_GT(hostname_mid_word_score, tld_score);
420   EXPECT_GT(hostname_mid_word_score, tld_mid_word_score);
421 }
422
423 }  // namespace history