Update To 11.40.268.0
[platform/framework/web/crosswalk.git] / src / chrome / renderer / safe_browsing / phishing_term_feature_extractor_unittest.cc
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
6
7 #include <string>
8
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/containers/hash_tables.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/message_loop/message_loop.h"
14 #include "base/strings/string16.h"
15 #include "base/strings/stringprintf.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/time/time.h"
18 #include "chrome/renderer/safe_browsing/features.h"
19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
21 #include "chrome/renderer/safe_browsing/test_utils.h"
22 #include "crypto/sha2.h"
23 #include "testing/gmock/include/gmock/gmock.h"
24 #include "testing/gtest/include/gtest/gtest.h"
25
26 using base::ASCIIToUTF16;
27 using ::testing::Return;
28
29
30 static const uint32 kMurmurHash3Seed = 2777808611U;
31
32 namespace safe_browsing {
33
34 class PhishingTermFeatureExtractorTest : public ::testing::Test {
35  protected:
36   void SetUp() override {
37     base::hash_set<std::string> terms;
38     terms.insert("one");
39     terms.insert("one one");
40     terms.insert("two");
41     terms.insert("multi word test");
42     terms.insert("capitalization");
43     terms.insert("space");
44     terms.insert("separator");
45     terms.insert("punctuation");
46     // Chinese (translation of "hello")
47     terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
48     // Chinese (translation of "goodbye")
49     terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
50
51     for (base::hash_set<std::string>::iterator it = terms.begin();
52          it != terms.end(); ++it) {
53       term_hashes_.insert(crypto::SHA256HashString(*it));
54     }
55
56     base::hash_set<std::string> words;
57     words.insert("one");
58     words.insert("two");
59     words.insert("multi");
60     words.insert("word");
61     words.insert("test");
62     words.insert("capitalization");
63     words.insert("space");
64     words.insert("separator");
65     words.insert("punctuation");
66     words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
67     words.insert("\xe5\x86\x8d\xe8\xa7\x81");
68
69     for (base::hash_set<std::string>::iterator it = words.begin();
70          it != words.end(); ++it) {
71       word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
72     }
73
74     ResetExtractor(3 /* max shingles per page */);
75   }
76
77   void ResetExtractor(size_t max_shingles_per_page) {
78     extractor_.reset(new PhishingTermFeatureExtractor(
79         &term_hashes_,
80         &word_hashes_,
81         3 /* max_words_per_term */,
82         kMurmurHash3Seed,
83         max_shingles_per_page,
84         4 /* shingle_size */,
85         &clock_));
86   }
87
88   // Runs the TermFeatureExtractor on |page_text|, waiting for the
89   // completion callback.  Returns the success boolean from the callback.
90   bool ExtractFeatures(const base::string16* page_text,
91                        FeatureMap* features,
92                        std::set<uint32>* shingle_hashes) {
93     success_ = false;
94     extractor_->ExtractFeatures(
95         page_text,
96         features,
97         shingle_hashes,
98         base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
99                    base::Unretained(this)));
100     msg_loop_.Run();
101     return success_;
102   }
103
104   void PartialExtractFeatures(const base::string16* page_text,
105                               FeatureMap* features,
106                               std::set<uint32>* shingle_hashes) {
107     extractor_->ExtractFeatures(
108         page_text,
109         features,
110         shingle_hashes,
111         base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
112                    base::Unretained(this)));
113     msg_loop_.PostTask(
114         FROM_HERE,
115         base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction,
116                    base::Unretained(this)));
117     msg_loop_.RunUntilIdle();
118   }
119
120   // Completion callback for feature extraction.
121   void ExtractionDone(bool success) {
122     success_ = success;
123     msg_loop_.Quit();
124   }
125
126   void QuitExtraction() {
127     extractor_->CancelPendingExtraction();
128     msg_loop_.Quit();
129   }
130
131   base::MessageLoop msg_loop_;
132   MockFeatureExtractorClock clock_;
133   scoped_ptr<PhishingTermFeatureExtractor> extractor_;
134   base::hash_set<std::string> term_hashes_;
135   base::hash_set<uint32> word_hashes_;
136   bool success_;  // holds the success value from ExtractFeatures
137 };
138
139 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
140   // This test doesn't exercise the extraction timing.
141   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
142
143   base::string16 page_text = ASCIIToUTF16("blah");
144   FeatureMap expected_features;  // initially empty
145   std::set<uint32> expected_shingle_hashes;
146
147   FeatureMap features;
148   std::set<uint32> shingle_hashes;
149   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
150   ExpectFeatureMapsAreEqual(features, expected_features);
151   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
152
153   page_text = ASCIIToUTF16("one one");
154   expected_features.Clear();
155   expected_features.AddBooleanFeature(features::kPageTerm +
156                                       std::string("one"));
157   expected_features.AddBooleanFeature(features::kPageTerm +
158                                       std::string("one one"));
159   expected_shingle_hashes.clear();
160
161   features.Clear();
162   shingle_hashes.clear();
163   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
164   ExpectFeatureMapsAreEqual(features, expected_features);
165   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
166
167   page_text = ASCIIToUTF16("bla bla multi word test bla");
168   expected_features.Clear();
169   expected_features.AddBooleanFeature(features::kPageTerm +
170                                       std::string("multi word test"));
171   expected_shingle_hashes.clear();
172   expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
173                                                    kMurmurHash3Seed));
174   expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
175                                                    kMurmurHash3Seed));
176   expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
177                                                    kMurmurHash3Seed));
178
179   features.Clear();
180   shingle_hashes.clear();
181   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
182   ExpectFeatureMapsAreEqual(features, expected_features);
183   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
184
185   // This text has all of the words for one of the terms, but they are
186   // not in the correct order.
187   page_text = ASCIIToUTF16("bla bla test word multi bla");
188   expected_features.Clear();
189   expected_shingle_hashes.clear();
190   expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
191                                                    kMurmurHash3Seed));
192   expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
193                                                    kMurmurHash3Seed));
194   expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
195                                                    kMurmurHash3Seed));
196
197   features.Clear();
198   shingle_hashes.clear();
199   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
200   ExpectFeatureMapsAreEqual(features, expected_features);
201   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
202
203   // Test various separators.
204   page_text = ASCIIToUTF16("Capitalization plus non-space\n"
205                            "separator... punctuation!");
206   expected_features.Clear();
207   expected_features.AddBooleanFeature(features::kPageTerm +
208                                       std::string("capitalization"));
209   expected_features.AddBooleanFeature(features::kPageTerm +
210                                       std::string("space"));
211   expected_features.AddBooleanFeature(features::kPageTerm +
212                                       std::string("separator"));
213   expected_features.AddBooleanFeature(features::kPageTerm +
214                                       std::string("punctuation"));
215   expected_shingle_hashes.clear();
216   expected_shingle_hashes.insert(
217       MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
218   expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
219                                                    kMurmurHash3Seed));
220   expected_shingle_hashes.insert(
221       MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
222
223   features.Clear();
224   shingle_hashes.clear();
225   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
226   ExpectFeatureMapsAreEqual(features, expected_features);
227   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
228
229   // Test a page with too many words and we should only 3 minimum hashes.
230   page_text = ASCIIToUTF16("This page has way too many words.");
231   expected_features.Clear();
232   expected_shingle_hashes.clear();
233   expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
234                                                    kMurmurHash3Seed));
235   expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
236                                                    kMurmurHash3Seed));
237   expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
238                                                    kMurmurHash3Seed));
239   expected_shingle_hashes.insert(MurmurHash3String("way too many words ",
240                                                    kMurmurHash3Seed));
241   std::set<uint32>::iterator it = expected_shingle_hashes.end();
242   expected_shingle_hashes.erase(--it);
243
244   features.Clear();
245   shingle_hashes.clear();
246   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
247   ExpectFeatureMapsAreEqual(features, expected_features);
248   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
249
250   // Test with empty page text.
251   page_text = base::string16();
252   expected_features.Clear();
253   expected_shingle_hashes.clear();
254   features.Clear();
255   shingle_hashes.clear();
256   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
257   ExpectFeatureMapsAreEqual(features, expected_features);
258   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
259
260 #if !defined(OS_ANDROID)
261   // The test code is disabled due to http://crbug.com/392234
262   // The client-side detection feature is not enabled on Android yet.
263   // If we decided to enable the feature, we need to fix the bug first.
264
265   // Chinese translation of the phrase "hello goodbye hello goodbye". This tests
266   // that we can correctly separate terms in languages that don't use spaces.
267   page_text =
268       base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"
269                         "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
270   expected_features.Clear();
271   expected_features.AddBooleanFeature(
272       features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
273   expected_features.AddBooleanFeature(
274       features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
275   expected_shingle_hashes.clear();
276   expected_shingle_hashes.insert(MurmurHash3String(
277       "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 "
278       "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed));
279
280   features.Clear();
281   shingle_hashes.clear();
282   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
283   ExpectFeatureMapsAreEqual(features, expected_features);
284   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
285 #endif
286 }
287
288 TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
289   // For this test, we'll cause the feature extraction to run multiple
290   // iterations by incrementing the clock.
291   ResetExtractor(200 /* max shingles per page */);
292
293   // This page has a total of 30 words.  For the features to be computed
294   // correctly, the extractor has to process the entire string of text.
295   base::string16 page_text(ASCIIToUTF16("one "));
296   for (int i = 0; i < 28; ++i) {
297     page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
298   }
299   page_text.append(ASCIIToUTF16("two"));
300
301   // Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
302   // Note that this assumes kClockCheckGranularity = 5 and
303   // kMaxTimePerChunkMs = 10.
304   base::TimeTicks now = base::TimeTicks::Now();
305   EXPECT_CALL(clock_, Now())
306       // Time check at the start of extraction.
307       .WillOnce(Return(now))
308       // Time check at the start of the first chunk of work.
309       .WillOnce(Return(now))
310       // Time check after the first 5 words.
311       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3)))
312       // Time check after the next 5 words.
313       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
314       // Time check after the next 5 words.
315       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9)))
316       // Time check after the next 5 words.  This is over the chunk
317       // time limit, so a continuation task will be posted.
318       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
319       // Time check at the start of the second chunk of work.
320       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
321       // Time check after the next 5 words.
322       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
323       // Time check after the next 5 words.
324       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28)))
325       // A final check for the histograms.
326       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)));
327
328   FeatureMap expected_features;
329   expected_features.AddBooleanFeature(features::kPageTerm +
330                                       std::string("one"));
331   expected_features.AddBooleanFeature(features::kPageTerm +
332                                       std::string("two"));
333   std::set<uint32> expected_shingle_hashes;
334   expected_shingle_hashes.insert(
335       MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed));
336   expected_shingle_hashes.insert(
337       MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed));
338   expected_shingle_hashes.insert(
339       MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed));
340   expected_shingle_hashes.insert(
341       MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed));
342   expected_shingle_hashes.insert(
343       MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed));
344   expected_shingle_hashes.insert(
345       MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed));
346   expected_shingle_hashes.insert(
347       MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed));
348   expected_shingle_hashes.insert(
349       MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed));
350   expected_shingle_hashes.insert(
351       MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed));
352   expected_shingle_hashes.insert(
353       MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed));
354   expected_shingle_hashes.insert(
355       MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed));
356   expected_shingle_hashes.insert(
357       MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed));
358   expected_shingle_hashes.insert(
359       MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed));
360   expected_shingle_hashes.insert(
361       MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed));
362   expected_shingle_hashes.insert(
363       MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed));
364   expected_shingle_hashes.insert(
365       MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed));
366   expected_shingle_hashes.insert(
367       MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed));
368   expected_shingle_hashes.insert(
369       MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed));
370   expected_shingle_hashes.insert(
371       MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed));
372   expected_shingle_hashes.insert(
373       MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed));
374   expected_shingle_hashes.insert(
375       MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed));
376   expected_shingle_hashes.insert(
377       MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed));
378   expected_shingle_hashes.insert(
379       MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed));
380   expected_shingle_hashes.insert(
381       MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed));
382   expected_shingle_hashes.insert(
383       MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed));
384   expected_shingle_hashes.insert(
385       MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed));
386   expected_shingle_hashes.insert(
387       MurmurHash3String("25 26 27 two ", kMurmurHash3Seed));
388
389   FeatureMap features;
390   std::set<uint32> shingle_hashes;
391   ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
392   ExpectFeatureMapsAreEqual(features, expected_features);
393   EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
394   // Make sure none of the mock expectations carry over to the next test.
395   ::testing::Mock::VerifyAndClearExpectations(&clock_);
396
397   // Now repeat the test with the same text, but advance the clock faster so
398   // that the extraction time exceeds the maximum total time for the feature
399   // extractor.  Extraction should fail.  Note that this assumes
400   // kMaxTotalTimeMs = 500.
401   EXPECT_CALL(clock_, Now())
402       // Time check at the start of extraction.
403       .WillOnce(Return(now))
404       // Time check at the start of the first chunk of work.
405       .WillOnce(Return(now))
406       // Time check after the first 5 words,
407       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
408       // Time check at the start of the second chunk of work.
409       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
410       // Time check after the next 5 words.  This is over the limit.
411       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
412       // A final time check for the histograms.
413       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
414
415   features.Clear();
416   shingle_hashes.clear();
417   EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
418 }
419
420 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
421   scoped_ptr<base::string16> page_text(
422       new base::string16(ASCIIToUTF16("one ")));
423   for (int i = 0; i < 28; ++i) {
424     page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
425   }
426
427   base::TimeTicks now = base::TimeTicks::Now();
428   EXPECT_CALL(clock_, Now())
429       // Time check at the start of extraction.
430       .WillOnce(Return(now))
431       // Time check at the start of the first chunk of work.
432       .WillOnce(Return(now))
433       // Time check after the first 5 words.
434       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7)))
435       // Time check after the next 5 words. This should be greater than
436       // kMaxTimePerChunkMs so that we stop and schedule extraction for later.
437       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
438
439   FeatureMap features;
440   std::set<uint32> shingle_hashes;
441   // Extract first 10 words then stop.
442   PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
443
444   page_text.reset(new base::string16());
445   for (int i = 30; i < 58; ++i) {
446     page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
447   }
448   page_text->append(ASCIIToUTF16("multi word test "));
449   features.Clear();
450   shingle_hashes.clear();
451
452   // This part doesn't exercise the extraction timing.
453   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
454
455   // Now extract normally and make sure nothing breaks.
456   EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
457
458   FeatureMap expected_features;
459   expected_features.AddBooleanFeature(features::kPageTerm +
460                                       std::string("multi word test"));
461   ExpectFeatureMapsAreEqual(features, expected_features);
462 }
463
464 }  // namespace safe_browsing