Update To 11.40.268.0
[platform/framework/web/crosswalk.git] / src / chrome / renderer / safe_browsing / phishing_term_feature_extractor_unittest.cc
index ec5e253..4c15605 100644 (file)
 #include "testing/gmock/include/gmock/gmock.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
+using base::ASCIIToUTF16;
 using ::testing::Return;
 
+
+static const uint32 kMurmurHash3Seed = 2777808611U;
+
 namespace safe_browsing {
 
 class PhishingTermFeatureExtractorTest : public ::testing::Test {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     base::hash_set<std::string> terms;
     terms.insert("one");
     terms.insert("one one");
@@ -62,37 +66,48 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
     words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
     words.insert("\xe5\x86\x8d\xe8\xa7\x81");
 
-    static const uint32 kMurmurHash3Seed = 2777808611U;
     for (base::hash_set<std::string>::iterator it = words.begin();
          it != words.end(); ++it) {
       word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
     }
 
+    ResetExtractor(3 /* max shingles per page */);
+  }
+
+  void ResetExtractor(size_t max_shingles_per_page) {
     extractor_.reset(new PhishingTermFeatureExtractor(
         &term_hashes_,
         &word_hashes_,
         3 /* max_words_per_term */,
         kMurmurHash3Seed,
+        max_shingles_per_page,
+        4 /* shingle_size */,
         &clock_));
   }
 
   // Runs the TermFeatureExtractor on |page_text|, waiting for the
   // completion callback.  Returns the success boolean from the callback.
-  bool ExtractFeatures(const string16* page_text, FeatureMap* features) {
+  bool ExtractFeatures(const base::string16* page_text,
+                       FeatureMap* features,
+                       std::set<uint32>* shingle_hashes) {
     success_ = false;
     extractor_->ExtractFeatures(
         page_text,
         features,
+        shingle_hashes,
         base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
                    base::Unretained(this)));
     msg_loop_.Run();
     return success_;
   }
 
-  void PartialExtractFeatures(const string16* page_text, FeatureMap* features) {
+  void PartialExtractFeatures(const base::string16* page_text,
+                              FeatureMap* features,
+                              std::set<uint32>* shingle_hashes) {
     extractor_->ExtractFeatures(
         page_text,
         features,
+        shingle_hashes,
         base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
                    base::Unretained(this)));
     msg_loop_.PostTask(
@@ -125,12 +140,15 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
   // This test doesn't exercise the extraction timing.
   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
 
-  string16 page_text = ASCIIToUTF16("blah");
+  base::string16 page_text = ASCIIToUTF16("blah");
   FeatureMap expected_features;  // initially empty
+  std::set<uint32> expected_shingle_hashes;
 
   FeatureMap features;
-  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+  std::set<uint32> shingle_hashes;
+  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
   ExpectFeatureMapsAreEqual(features, expected_features);
+  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
 
   page_text = ASCIIToUTF16("one one");
   expected_features.Clear();
@@ -138,29 +156,51 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
                                       std::string("one"));
   expected_features.AddBooleanFeature(features::kPageTerm +
                                       std::string("one one"));
+  expected_shingle_hashes.clear();
 
   features.Clear();
-  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+  shingle_hashes.clear();
+  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
   ExpectFeatureMapsAreEqual(features, expected_features);
+  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
 
   page_text = ASCIIToUTF16("bla bla multi word test bla");
   expected_features.Clear();
   expected_features.AddBooleanFeature(features::kPageTerm +
                                       std::string("multi word test"));
+  expected_shingle_hashes.clear();
+  expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
+                                                   kMurmurHash3Seed));
+  expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
+                                                   kMurmurHash3Seed));
+  expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
+                                                   kMurmurHash3Seed));
 
   features.Clear();
-  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+  shingle_hashes.clear();
+  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
   ExpectFeatureMapsAreEqual(features, expected_features);
+  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
 
   // This text has all of the words for one of the terms, but they are
   // not in the correct order.
   page_text = ASCIIToUTF16("bla bla test word multi bla");
   expected_features.Clear();
+  expected_shingle_hashes.clear();
+  expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
+                                                   kMurmurHash3Seed));
+  expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
+                                                   kMurmurHash3Seed));
+  expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
+                                                   kMurmurHash3Seed));
 
   features.Clear();
-  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+  shingle_hashes.clear();
+  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
   ExpectFeatureMapsAreEqual(features, expected_features);
+  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
 
+  // Test various separators.
   page_text = ASCIIToUTF16("Capitalization plus non-space\n"
                            "separator... punctuation!");
   expected_features.Clear();
@@ -172,39 +212,87 @@ TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
                                       std::string("separator"));
   expected_features.AddBooleanFeature(features::kPageTerm +
                                       std::string("punctuation"));
+  expected_shingle_hashes.clear();
+  expected_shingle_hashes.insert(
+      MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
+                                                   kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
 
   features.Clear();
-  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+  shingle_hashes.clear();
+  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
   ExpectFeatureMapsAreEqual(features, expected_features);
+  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
 
-  // Test with empty page text.
-  page_text = string16();
+  // Test a page with too many words; we should keep only the 3 minimum hashes.
+  page_text = ASCIIToUTF16("This page has way too many words.");
   expected_features.Clear();
+  expected_shingle_hashes.clear();
+  expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
+                                                   kMurmurHash3Seed));
+  expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
+                                                   kMurmurHash3Seed));
+  expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
+                                                   kMurmurHash3Seed));
+  expected_shingle_hashes.insert(MurmurHash3String("way too many words ",
+                                                   kMurmurHash3Seed));
+  std::set<uint32>::iterator it = expected_shingle_hashes.end();
+  expected_shingle_hashes.erase(--it);
+
   features.Clear();
-  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+  shingle_hashes.clear();
+  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
   ExpectFeatureMapsAreEqual(features, expected_features);
+  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
 
-  // Chinese translation of the phrase "hello goodbye". This tests that
-  // we can correctly separate terms in languages that don't use spaces.
-  page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
+  // Test with empty page text.
+  page_text = base::string16();
+  expected_features.Clear();
+  expected_shingle_hashes.clear();
+  features.Clear();
+  shingle_hashes.clear();
+  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
+  ExpectFeatureMapsAreEqual(features, expected_features);
+  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
+
+#if !defined(OS_ANDROID)
+  // The test code is disabled due to http://crbug.com/392234
+  // The client-side detection feature is not enabled on Android yet.
+  // If we decide to enable the feature, we need to fix the bug first.
+
+  // Chinese translation of the phrase "hello goodbye hello goodbye". This tests
+  // that we can correctly separate terms in languages that don't use spaces.
+  page_text =
+      base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"
+                        "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
   expected_features.Clear();
   expected_features.AddBooleanFeature(
       features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
   expected_features.AddBooleanFeature(
       features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
+  expected_shingle_hashes.clear();
+  expected_shingle_hashes.insert(MurmurHash3String(
+      "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 "
+      "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed));
 
   features.Clear();
-  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+  shingle_hashes.clear();
+  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
   ExpectFeatureMapsAreEqual(features, expected_features);
+  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
+#endif
 }
 
 TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
   // For this test, we'll cause the feature extraction to run multiple
   // iterations by incrementing the clock.
+  ResetExtractor(200 /* max shingles per page */);
 
   // This page has a total of 30 words.  For the features to be computed
   // correctly, the extractor has to process the entire string of text.
-  string16 page_text(ASCIIToUTF16("one "));
+  base::string16 page_text(ASCIIToUTF16("one "));
   for (int i = 0; i < 28; ++i) {
     page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
   }
@@ -242,10 +330,67 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
                                       std::string("one"));
   expected_features.AddBooleanFeature(features::kPageTerm +
                                       std::string("two"));
+  std::set<uint32> expected_shingle_hashes;
+  expected_shingle_hashes.insert(
+      MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed));
+  expected_shingle_hashes.insert(
+      MurmurHash3String("25 26 27 two ", kMurmurHash3Seed));
 
   FeatureMap features;
-  ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+  std::set<uint32> shingle_hashes;
+  ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
   ExpectFeatureMapsAreEqual(features, expected_features);
+  EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
   // Make sure none of the mock expectations carry over to the next test.
   ::testing::Mock::VerifyAndClearExpectations(&clock_);
 
@@ -268,11 +413,13 @@ TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
 
   features.Clear();
-  EXPECT_FALSE(ExtractFeatures(&page_text, &features));
+  shingle_hashes.clear();
+  EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
 }
 
 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
-  scoped_ptr<string16> page_text(new string16(ASCIIToUTF16("one ")));
+  scoped_ptr<base::string16> page_text(
+      new base::string16(ASCIIToUTF16("one ")));
   for (int i = 0; i < 28; ++i) {
     page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
   }
@@ -290,21 +437,23 @@ TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
 
   FeatureMap features;
+  std::set<uint32> shingle_hashes;
   // Extract first 10 words then stop.
-  PartialExtractFeatures(page_text.get(), &features);
+  PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
 
-  page_text.reset(new string16());
+  page_text.reset(new base::string16());
   for (int i = 30; i < 58; ++i) {
     page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
   }
   page_text->append(ASCIIToUTF16("multi word test "));
   features.Clear();
+  shingle_hashes.clear();
 
   // This part doesn't exercise the extraction timing.
   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
 
   // Now extract normally and make sure nothing breaks.
-  EXPECT_TRUE(ExtractFeatures(page_text.get(), &features));
+  EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
 
   FeatureMap expected_features;
   expected_features.AddBooleanFeature(features::kPageTerm +