1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
11 #include "base/logging.h"
12 #include "base/metrics/histogram.h"
13 #include "base/strings/string_split.h"
14 #include "base/strings/string_util.h"
15 #include "base/timer/elapsed_timer.h"
16 #include "chrome/renderer/safe_browsing/features.h"
17 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
20 namespace safe_browsing {
22 PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {}
24 PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {}
26 bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url,
27 FeatureMap* features) {
28 base::ElapsedTimer timer;
29 if (url.HostIsIPAddress()) {
30 if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress))
34 TrimString(url.host(), ".", &host); // Remove any leading/trailing dots.
36 // TODO(bryner): Ensure that the url encoding is consistent with
37 // the features in the model.
39 // Disallow unknown registries so that we don't classify
40 // partial hostnames (e.g. "www.subdomain").
41 size_t registry_length =
42 net::registry_controlled_domains::GetRegistryLength(
44 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
45 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
47 if (registry_length == 0 || registry_length == std::string::npos) {
48 DVLOG(1) << "Could not find TLD for host: " << host;
51 DCHECK_LT(registry_length, host.size()) << "Non-zero registry length, but "
52 "host is only a TLD: " << host;
53 size_t tld_start = host.size() - registry_length;
54 if (!features->AddBooleanFeature(features::kUrlTldToken +
55 host.substr(tld_start)))
58 // Pull off the TLD and the preceeding dot.
59 host.erase(tld_start - 1);
60 std::vector<std::string> host_tokens;
61 base::SplitStringDontTrim(host, '.', &host_tokens);
62 // Get rid of any empty components.
63 std::vector<std::string>::iterator new_end =
64 std::remove(host_tokens.begin(), host_tokens.end(), "");
65 host_tokens.erase(new_end, host_tokens.end());
66 if (host_tokens.empty()) {
67 DVLOG(1) << "Could not find domain for host: " << host;
70 if (!features->AddBooleanFeature(features::kUrlDomainToken +
73 host_tokens.pop_back();
75 // Now we're just left with the "other" host tokens.
76 for (std::vector<std::string>::iterator it = host_tokens.begin();
77 it != host_tokens.end(); ++it) {
78 if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it))
82 if (host_tokens.size() > 1) {
83 if (!features->AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne))
85 if (host_tokens.size() > 3) {
86 if (!features->AddBooleanFeature(
87 features::kUrlNumOtherHostTokensGTThree))
93 std::vector<std::string> long_tokens;
94 SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens);
95 for (std::vector<std::string>::iterator it = long_tokens.begin();
96 it != long_tokens.end(); ++it) {
97 if (!features->AddBooleanFeature(features::kUrlPathToken + *it))
101 UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed());
106 void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(
107 const std::string& full,
108 std::vector<std::string>* tokens) {
109 // Split on common non-alphanumerics.
110 // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly.
111 static const char kTokenSeparators[] = ".,\\/_-|=%:!&";
112 std::vector<std::string> raw_splits;
113 Tokenize(full, kTokenSeparators, &raw_splits);
115 // Copy over only the splits that are 3 or more chars long.
116 // TODO(bryner): Determine a meaningful min size.
117 for (std::vector<std::string>::iterator it = raw_splits.begin();
118 it != raw_splits.end(); ++it) {
119 if (it->length() >= kMinPathComponentLength)
120 tokens->push_back(*it);
124 } // namespace safe_browsing