Upstream version 5.34.104.0
[platform/framework/web/crosswalk.git] / src / chrome / renderer / safe_browsing / phishing_term_feature_extractor_unittest.cc
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6
7 #include <string>
8
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/containers/hash_tables.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/message_loop/message_loop.h"
14 #include "base/strings/string16.h"
15 #include "base/strings/stringprintf.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/time/time.h"
18 #include "chrome/renderer/safe_browsing/features.h"
19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
21 #include "chrome/renderer/safe_browsing/test_utils.h"
22 #include "crypto/sha2.h"
23 #include "testing/gmock/include/gmock/gmock.h"
24 #include "testing/gtest/include/gtest/gtest.h"
25
26 using base::ASCIIToUTF16;
27 using ::testing::Return;
28
29 namespace safe_browsing {
30
31 class PhishingTermFeatureExtractorTest : public ::testing::Test {
32  protected:
33   virtual void SetUp() {
34     base::hash_set<std::string> terms;
35     terms.insert("one");
36     terms.insert("one one");
37     terms.insert("two");
38     terms.insert("multi word test");
39     terms.insert("capitalization");
40     terms.insert("space");
41     terms.insert("separator");
42     terms.insert("punctuation");
43     // Chinese (translation of "hello")
44     terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
45     // Chinese (translation of "goodbye")
46     terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
47
48     for (base::hash_set<std::string>::iterator it = terms.begin();
49          it != terms.end(); ++it) {
50       term_hashes_.insert(crypto::SHA256HashString(*it));
51     }
52
53     base::hash_set<std::string> words;
54     words.insert("one");
55     words.insert("two");
56     words.insert("multi");
57     words.insert("word");
58     words.insert("test");
59     words.insert("capitalization");
60     words.insert("space");
61     words.insert("separator");
62     words.insert("punctuation");
63     words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
64     words.insert("\xe5\x86\x8d\xe8\xa7\x81");
65
66     static const uint32 kMurmurHash3Seed = 2777808611U;
67     for (base::hash_set<std::string>::iterator it = words.begin();
68          it != words.end(); ++it) {
69       word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
70     }
71
72     extractor_.reset(new PhishingTermFeatureExtractor(
73         &term_hashes_,
74         &word_hashes_,
75         3 /* max_words_per_term */,
76         kMurmurHash3Seed,
77         &clock_));
78   }
79
80   // Runs the TermFeatureExtractor on |page_text|, waiting for the
81   // completion callback.  Returns the success boolean from the callback.
82   bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) {
83     success_ = false;
84     extractor_->ExtractFeatures(
85         page_text,
86         features,
87         base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
88                    base::Unretained(this)));
89     msg_loop_.Run();
90     return success_;
91   }
92
93   void PartialExtractFeatures(const base::string16* page_text,
94                               FeatureMap* features) {
95     extractor_->ExtractFeatures(
96         page_text,
97         features,
98         base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
99                    base::Unretained(this)));
100     msg_loop_.PostTask(
101         FROM_HERE,
102         base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction,
103                    base::Unretained(this)));
104     msg_loop_.RunUntilIdle();
105   }
106
107   // Completion callback for feature extraction.
108   void ExtractionDone(bool success) {
109     success_ = success;
110     msg_loop_.Quit();
111   }
112
113   void QuitExtraction() {
114     extractor_->CancelPendingExtraction();
115     msg_loop_.Quit();
116   }
117
118   base::MessageLoop msg_loop_;
119   MockFeatureExtractorClock clock_;
120   scoped_ptr<PhishingTermFeatureExtractor> extractor_;
121   base::hash_set<std::string> term_hashes_;
122   base::hash_set<uint32> word_hashes_;
123   bool success_;  // holds the success value from ExtractFeatures
124 };
125
126 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
127   // This test doesn't exercise the extraction timing.
128   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
129
130   base::string16 page_text = ASCIIToUTF16("blah");
131   FeatureMap expected_features;  // initially empty
132
133   FeatureMap features;
134   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
135   ExpectFeatureMapsAreEqual(features, expected_features);
136
137   page_text = ASCIIToUTF16("one one");
138   expected_features.Clear();
139   expected_features.AddBooleanFeature(features::kPageTerm +
140                                       std::string("one"));
141   expected_features.AddBooleanFeature(features::kPageTerm +
142                                       std::string("one one"));
143
144   features.Clear();
145   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
146   ExpectFeatureMapsAreEqual(features, expected_features);
147
148   page_text = ASCIIToUTF16("bla bla multi word test bla");
149   expected_features.Clear();
150   expected_features.AddBooleanFeature(features::kPageTerm +
151                                       std::string("multi word test"));
152
153   features.Clear();
154   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
155   ExpectFeatureMapsAreEqual(features, expected_features);
156
157   // This text has all of the words for one of the terms, but they are
158   // not in the correct order.
159   page_text = ASCIIToUTF16("bla bla test word multi bla");
160   expected_features.Clear();
161
162   features.Clear();
163   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
164   ExpectFeatureMapsAreEqual(features, expected_features);
165
166   page_text = ASCIIToUTF16("Capitalization plus non-space\n"
167                            "separator... punctuation!");
168   expected_features.Clear();
169   expected_features.AddBooleanFeature(features::kPageTerm +
170                                       std::string("capitalization"));
171   expected_features.AddBooleanFeature(features::kPageTerm +
172                                       std::string("space"));
173   expected_features.AddBooleanFeature(features::kPageTerm +
174                                       std::string("separator"));
175   expected_features.AddBooleanFeature(features::kPageTerm +
176                                       std::string("punctuation"));
177
178   features.Clear();
179   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
180   ExpectFeatureMapsAreEqual(features, expected_features);
181
182   // Test with empty page text.
183   page_text = base::string16();
184   expected_features.Clear();
185   features.Clear();
186   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
187   ExpectFeatureMapsAreEqual(features, expected_features);
188
189   // Chinese translation of the phrase "hello goodbye". This tests that
190   // we can correctly separate terms in languages that don't use spaces.
191   page_text =
192       base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
193   expected_features.Clear();
194   expected_features.AddBooleanFeature(
195       features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
196   expected_features.AddBooleanFeature(
197       features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
198
199   features.Clear();
200   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
201   ExpectFeatureMapsAreEqual(features, expected_features);
202 }
203
204 TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
205   // For this test, we'll cause the feature extraction to run multiple
206   // iterations by incrementing the clock.
207
208   // This page has a total of 30 words.  For the features to be computed
209   // correctly, the extractor has to process the entire string of text.
210   base::string16 page_text(ASCIIToUTF16("one "));
211   for (int i = 0; i < 28; ++i) {
212     page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
213   }
214   page_text.append(ASCIIToUTF16("two"));
215
216   // Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
217   // Note that this assumes kClockCheckGranularity = 5 and
218   // kMaxTimePerChunkMs = 10.
219   base::TimeTicks now = base::TimeTicks::Now();
220   EXPECT_CALL(clock_, Now())
221       // Time check at the start of extraction.
222       .WillOnce(Return(now))
223       // Time check at the start of the first chunk of work.
224       .WillOnce(Return(now))
225       // Time check after the first 5 words.
226       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3)))
227       // Time check after the next 5 words.
228       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
229       // Time check after the next 5 words.
230       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9)))
231       // Time check after the next 5 words.  This is over the chunk
232       // time limit, so a continuation task will be posted.
233       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
234       // Time check at the start of the second chunk of work.
235       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
236       // Time check after the next 5 words.
237       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
238       // Time check after the next 5 words.
239       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28)))
240       // A final check for the histograms.
241       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)));
242
243   FeatureMap expected_features;
244   expected_features.AddBooleanFeature(features::kPageTerm +
245                                       std::string("one"));
246   expected_features.AddBooleanFeature(features::kPageTerm +
247                                       std::string("two"));
248
249   FeatureMap features;
250   ASSERT_TRUE(ExtractFeatures(&page_text, &features));
251   ExpectFeatureMapsAreEqual(features, expected_features);
252   // Make sure none of the mock expectations carry over to the next test.
253   ::testing::Mock::VerifyAndClearExpectations(&clock_);
254
255   // Now repeat the test with the same text, but advance the clock faster so
256   // that the extraction time exceeds the maximum total time for the feature
257   // extractor.  Extraction should fail.  Note that this assumes
258   // kMaxTotalTimeMs = 500.
259   EXPECT_CALL(clock_, Now())
260       // Time check at the start of extraction.
261       .WillOnce(Return(now))
262       // Time check at the start of the first chunk of work.
263       .WillOnce(Return(now))
264       // Time check after the first 5 words,
265       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
266       // Time check at the start of the second chunk of work.
267       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
268       // Time check after the next 5 words.  This is over the limit.
269       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
270       // A final time check for the histograms.
271       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
272
273   features.Clear();
274   EXPECT_FALSE(ExtractFeatures(&page_text, &features));
275 }
276
277 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
278   scoped_ptr<base::string16> page_text(
279       new base::string16(ASCIIToUTF16("one ")));
280   for (int i = 0; i < 28; ++i) {
281     page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
282   }
283
284   base::TimeTicks now = base::TimeTicks::Now();
285   EXPECT_CALL(clock_, Now())
286       // Time check at the start of extraction.
287       .WillOnce(Return(now))
288       // Time check at the start of the first chunk of work.
289       .WillOnce(Return(now))
290       // Time check after the first 5 words.
291       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7)))
292       // Time check after the next 5 words. This should be greater than
293       // kMaxTimePerChunkMs so that we stop and schedule extraction for later.
294       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
295
296   FeatureMap features;
297   // Extract first 10 words then stop.
298   PartialExtractFeatures(page_text.get(), &features);
299
300   page_text.reset(new base::string16());
301   for (int i = 30; i < 58; ++i) {
302     page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
303   }
304   page_text->append(ASCIIToUTF16("multi word test "));
305   features.Clear();
306
307   // This part doesn't exercise the extraction timing.
308   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
309
310   // Now extract normally and make sure nothing breaks.
311   EXPECT_TRUE(ExtractFeatures(page_text.get(), &features));
312
313   FeatureMap expected_features;
314   expected_features.AddBooleanFeature(features::kPageTerm +
315                                       std::string("multi word test"));
316   ExpectFeatureMapsAreEqual(features, expected_features);
317 }
318
319 }  // namespace safe_browsing