#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
+using base::ASCIIToUTF16;
using ::testing::Return;
+
+static const uint32 kMurmurHash3Seed = 2777808611U;
+
namespace safe_browsing {
class PhishingTermFeatureExtractorTest : public ::testing::Test {
protected:
- virtual void SetUp() {
+ void SetUp() override {
// Test fixture setup: fills the term and word sets used by the tests,
// hashes every word into word_hashes_, and then builds the extractor.
base::hash_set<std::string> terms;
terms.insert("one");
terms.insert("one one");
// The two words below are UTF-8 encoded, non-ASCII strings.
words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
words.insert("\xe5\x86\x8d\xe8\xa7\x81");
- static const uint32 kMurmurHash3Seed = 2777808611U;
// Hash each word with MurmurHash3String using kMurmurHash3Seed, the same
// seed that is handed to the extractor.
for (base::hash_set<std::string>::iterator it = words.begin();
it != words.end(); ++it) {
word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
}
// Tests that need a different shingle limit call ResetExtractor() again.
+ ResetExtractor(3 /* max shingles per page */);
+ }
+
+ void ResetExtractor(size_t max_shingles_per_page) {
// Replaces extractor_ with a freshly constructed
// PhishingTermFeatureExtractor wired to the fixture's term/word hash sets
// and mock clock. |max_shingles_per_page| is passed straight through to the
// extractor's constructor; the remaining arguments are fixed for all tests.
extractor_.reset(new PhishingTermFeatureExtractor(
&term_hashes_,
&word_hashes_,
3 /* max_words_per_term */,
kMurmurHash3Seed,
+ max_shingles_per_page,
+ 4 /* shingle_size */,
&clock_));
}
// Runs the TermFeatureExtractor on |page_text|, waiting for the
// completion callback. Returns the success boolean from the callback.
// |features| and |shingle_hashes| are forwarded unchanged to the extractor;
// the tests inspect them after this helper returns.
- bool ExtractFeatures(const string16* page_text, FeatureMap* features) {
+ bool ExtractFeatures(const base::string16* page_text,
+ FeatureMap* features,
+ std::set<uint32>* shingle_hashes) {
// Start from "failed"; ExtractionDone (not shown here) is presumably what
// updates success_ when the extractor reports its result.
success_ = false;
extractor_->ExtractFeatures(
page_text,
features,
+ shingle_hashes,
base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
base::Unretained(this)));
// Spin the message loop until it is quit, so the extraction callback has
// been delivered before success_ is read.
msg_loop_.Run();
return success_;
}
- void PartialExtractFeatures(const string16* page_text, FeatureMap* features) {
+ void PartialExtractFeatures(const base::string16* page_text,
+ FeatureMap* features,
+ std::set<uint32>* shingle_hashes) {
extractor_->ExtractFeatures(
page_text,
features,
+ shingle_hashes,
base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
base::Unretained(this)));
msg_loop_.PostTask(
// This test doesn't exercise the extraction timing.
EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
- string16 page_text = ASCIIToUTF16("blah");
+ base::string16 page_text = ASCIIToUTF16("blah");
FeatureMap expected_features; // initially empty
+ std::set<uint32> expected_shingle_hashes;
FeatureMap features;
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ std::set<uint32> shingle_hashes;
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
page_text = ASCIIToUTF16("one one");
expected_features.Clear();
std::string("one"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("one one"));
+ expected_shingle_hashes.clear();
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
page_text = ASCIIToUTF16("bla bla multi word test bla");
expected_features.Clear();
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("multi word test"));
+ expected_shingle_hashes.clear();
+ expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
+ kMurmurHash3Seed));
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
// This text has all of the words for one of the terms, but they are
// not in the correct order.
page_text = ASCIIToUTF16("bla bla test word multi bla");
expected_features.Clear();
+ expected_shingle_hashes.clear();
+ expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
+ kMurmurHash3Seed));
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
+ // Test various separators.
page_text = ASCIIToUTF16("Capitalization plus non-space\n"
"separator... punctuation!");
expected_features.Clear();
std::string("separator"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("punctuation"));
+ expected_shingle_hashes.clear();
+ expected_shingle_hashes.insert(
+ MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
- // Test with empty page text.
- page_text = string16();
+ // Test a page with too many words; only the 3 smallest shingle hashes
+ // should be kept.
+ page_text = ASCIIToUTF16("This page has way too many words.");
expected_features.Clear();
+ expected_shingle_hashes.clear();
+ expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
+ kMurmurHash3Seed));
+ expected_shingle_hashes.insert(MurmurHash3String("way too many words ",
+ kMurmurHash3Seed));
+ std::set<uint32>::iterator it = expected_shingle_hashes.end();
+ expected_shingle_hashes.erase(--it);
+
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
- // Chinese translation of the phrase "hello goodbye". This tests that
- // we can correctly separate terms in languages that don't use spaces.
- page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
+ // Test with empty page text.
+ page_text = base::string16();
+ expected_features.Clear();
+ expected_shingle_hashes.clear();
+ features.Clear();
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
+ ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
+
+#if !defined(OS_ANDROID)
+ // This test code is disabled on Android due to http://crbug.com/392234.
+ // The client-side detection feature is not enabled on Android yet.
+ // If we decide to enable the feature there, we need to fix the bug first.
+
+ // Chinese translation of the phrase "hello goodbye hello goodbye". This tests
+ // that we can correctly separate terms in languages that don't use spaces.
+ page_text =
+ base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"
+ "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
expected_features.Clear();
expected_features.AddBooleanFeature(
features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
expected_features.AddBooleanFeature(
features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
+ expected_shingle_hashes.clear();
+ expected_shingle_hashes.insert(MurmurHash3String(
+ "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 "
+ "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed));
features.Clear();
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
+#endif
}
TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
// For this test, we'll cause the feature extraction to run multiple
// iterations by incrementing the clock.
+ ResetExtractor(200 /* max shingles per page */);
// This page has a total of 30 words. For the features to be computed
// correctly, the extractor has to process the entire string of text.
- string16 page_text(ASCIIToUTF16("one "));
+ base::string16 page_text(ASCIIToUTF16("one "));
for (int i = 0; i < 28; ++i) {
page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
}
std::string("one"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("two"));
+ std::set<uint32> expected_shingle_hashes;
+ expected_shingle_hashes.insert(
+ MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed));
+ expected_shingle_hashes.insert(
+ MurmurHash3String("25 26 27 two ", kMurmurHash3Seed));
FeatureMap features;
- ASSERT_TRUE(ExtractFeatures(&page_text, &features));
+ std::set<uint32> shingle_hashes;
+ ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
ExpectFeatureMapsAreEqual(features, expected_features);
+ EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
// Make sure none of the mock expectations carry over to the next test.
::testing::Mock::VerifyAndClearExpectations(&clock_);
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
features.Clear();
- EXPECT_FALSE(ExtractFeatures(&page_text, &features));
+ shingle_hashes.clear();
+ EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
}
TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
- scoped_ptr<string16> page_text(new string16(ASCIIToUTF16("one ")));
+ scoped_ptr<base::string16> page_text(
+ new base::string16(ASCIIToUTF16("one ")));
for (int i = 0; i < 28; ++i) {
page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
}
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
FeatureMap features;
+ std::set<uint32> shingle_hashes;
// Extract first 10 words then stop.
- PartialExtractFeatures(page_text.get(), &features);
+ PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
- page_text.reset(new string16());
+ page_text.reset(new base::string16());
for (int i = 30; i < 58; ++i) {
page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
}
page_text->append(ASCIIToUTF16("multi word test "));
features.Clear();
+ shingle_hashes.clear();
// This part doesn't exercise the extraction timing.
EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
// Now extract normally and make sure nothing breaks.
- EXPECT_TRUE(ExtractFeatures(page_text.get(), &features));
+ EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
FeatureMap expected_features;
expected_features.AddBooleanFeature(features::kPageTerm +