- add sources.
[platform/framework/web/crosswalk.git] / src / chrome / renderer / safe_browsing / phishing_url_feature_extractor.cc
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
6
7 #include <algorithm>
8 #include <string>
9 #include <vector>
10
11 #include "base/logging.h"
12 #include "base/metrics/histogram.h"
13 #include "base/strings/string_split.h"
14 #include "base/strings/string_util.h"
15 #include "base/timer/elapsed_timer.h"
16 #include "chrome/renderer/safe_browsing/features.h"
17 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
18 #include "url/gurl.h"
19
20 namespace safe_browsing {
21
22 PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {}
23
24 PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {}
25
26 bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url,
27                                                   FeatureMap* features) {
28   base::ElapsedTimer timer;
29   if (url.HostIsIPAddress()) {
30     if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress))
31       return false;
32   } else {
33     std::string host;
34     TrimString(url.host(), ".", &host);  // Remove any leading/trailing dots.
35
36     // TODO(bryner): Ensure that the url encoding is consistent with
37     // the features in the model.
38
39     // Disallow unknown registries so that we don't classify
40     // partial hostnames (e.g. "www.subdomain").
41     size_t registry_length =
42         net::registry_controlled_domains::GetRegistryLength(
43             host,
44             net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
45             net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
46
47     if (registry_length == 0 || registry_length == std::string::npos) {
48       DVLOG(1) << "Could not find TLD for host: " << host;
49       return false;
50     }
51     DCHECK_LT(registry_length, host.size()) << "Non-zero registry length, but "
52         "host is only a TLD: " << host;
53     size_t tld_start = host.size() - registry_length;
54     if (!features->AddBooleanFeature(features::kUrlTldToken +
55                                      host.substr(tld_start)))
56       return false;
57
58     // Pull off the TLD and the preceeding dot.
59     host.erase(tld_start - 1);
60     std::vector<std::string> host_tokens;
61     base::SplitStringDontTrim(host, '.', &host_tokens);
62     // Get rid of any empty components.
63     std::vector<std::string>::iterator new_end =
64         std::remove(host_tokens.begin(), host_tokens.end(), "");
65     host_tokens.erase(new_end, host_tokens.end());
66     if (host_tokens.empty()) {
67       DVLOG(1) << "Could not find domain for host: " << host;
68       return false;
69     }
70     if (!features->AddBooleanFeature(features::kUrlDomainToken +
71                                      host_tokens.back()))
72       return false;
73     host_tokens.pop_back();
74
75     // Now we're just left with the "other" host tokens.
76     for (std::vector<std::string>::iterator it = host_tokens.begin();
77          it != host_tokens.end(); ++it) {
78       if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it))
79         return false;
80     }
81
82     if (host_tokens.size() > 1) {
83       if (!features->AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne))
84         return false;
85       if (host_tokens.size() > 3) {
86         if (!features->AddBooleanFeature(
87                 features::kUrlNumOtherHostTokensGTThree))
88           return false;
89       }
90     }
91   }
92
93   std::vector<std::string> long_tokens;
94   SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens);
95   for (std::vector<std::string>::iterator it = long_tokens.begin();
96        it != long_tokens.end(); ++it) {
97     if (!features->AddBooleanFeature(features::kUrlPathToken + *it))
98       return false;
99   }
100
101   UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed());
102   return true;
103 }
104
105 // static
106 void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(
107     const std::string& full,
108     std::vector<std::string>* tokens) {
109   // Split on common non-alphanumerics.
110   // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly.
111   static const char kTokenSeparators[] = ".,\\/_-|=%:!&";
112   std::vector<std::string> raw_splits;
113   Tokenize(full, kTokenSeparators, &raw_splits);
114
115   // Copy over only the splits that are 3 or more chars long.
116   // TODO(bryner): Determine a meaningful min size.
117   for (std::vector<std::string>::iterator it = raw_splits.begin();
118        it != raw_splits.end(); ++it) {
119     if (it->length() >= kMinPathComponentLength)
120       tokens->push_back(*it);
121   }
122 }
123
124 }  // namespace safe_browsing