Upstream version 9.38.198.0
[platform/framework/web/crosswalk.git] / src / chrome / browser / safe_browsing / browser_feature_extractor.cc
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/safe_browsing/browser_feature_extractor.h"
6
7 #include <map>
8 #include <utility>
9
10 #include "base/bind.h"
11 #include "base/bind_helpers.h"
12 #include "base/format_macros.h"
13 #include "base/stl_util.h"
14 #include "base/strings/stringprintf.h"
15 #include "base/time/time.h"
16 #include "chrome/browser/history/history_service.h"
17 #include "chrome/browser/history/history_service_factory.h"
18 #include "chrome/browser/history/history_types.h"
19 #include "chrome/browser/profiles/profile.h"
20 #include "chrome/browser/safe_browsing/browser_features.h"
21 #include "chrome/browser/safe_browsing/client_side_detection_host.h"
22 #include "chrome/browser/safe_browsing/database_manager.h"
23 #include "chrome/common/safe_browsing/csd.pb.h"
24 #include "content/public/browser/browser_thread.h"
25 #include "content/public/browser/navigation_controller.h"
26 #include "content/public/browser/navigation_entry.h"
27 #include "content/public/browser/web_contents.h"
28 #include "content/public/common/page_transition_types.h"
29 #include "url/gurl.h"
30
31 using content::BrowserThread;
32 using content::NavigationController;
33 using content::NavigationEntry;
34 using content::ResourceType;
35 using content::WebContents;
36
37 namespace safe_browsing {
38
39 namespace {
40
41 const int kMaxMalwareIPPerRequest = 5;
42
43 void FilterBenignIpsOnIOThread(
44     scoped_refptr<SafeBrowsingDatabaseManager> database_manager,
45     IPUrlMap* ips) {
46   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
47   for (IPUrlMap::iterator it = ips->begin(); it != ips->end();) {
48     if (!database_manager.get() ||
49         !database_manager->MatchMalwareIP(it->first)) {
50       // it++ here returns a copy of the old iterator and passes it to erase.
51       ips->erase(it++);
52     } else {
53       ++it;
54     }
55   }
56 }
57 }  // namespace
58
59 IPUrlInfo::IPUrlInfo(const std::string& url,
60                      const std::string& method,
61                      const std::string& referrer,
62                      const ResourceType& resource_type)
63       : url(url),
64         method(method),
65         referrer(referrer),
66         resource_type(resource_type) {
67 }
68
69 IPUrlInfo::~IPUrlInfo() {}
70
71 BrowseInfo::BrowseInfo() : http_status_code(0) {}
72
73 BrowseInfo::~BrowseInfo() {}
74
75 static void AddFeature(const std::string& feature_name,
76                        double feature_value,
77                        ClientPhishingRequest* request) {
78   DCHECK(request);
79   ClientPhishingRequest::Feature* feature =
80       request->add_non_model_feature_map();
81   feature->set_name(feature_name);
82   feature->set_value(feature_value);
83   VLOG(2) << "Browser feature: " << feature->name() << " " << feature->value();
84 }
85
86 static void AddMalwareIpUrlInfo(const std::string& ip,
87                                 const std::vector<IPUrlInfo>& meta_infos,
88                                 ClientMalwareRequest* request) {
89   DCHECK(request);
90   for (std::vector<IPUrlInfo>::const_iterator it = meta_infos.begin();
91        it != meta_infos.end(); ++it) {
92     ClientMalwareRequest::UrlInfo* urlinfo =
93         request->add_bad_ip_url_info();
94     // We add the information about url on the bad ip.
95     urlinfo->set_ip(ip);
96     urlinfo->set_url(it->url);
97     urlinfo->set_method(it->method);
98     urlinfo->set_referrer(it->referrer);
99     urlinfo->set_resource_type(static_cast<int>(it->resource_type));
100   }
101   DVLOG(2) << "Added url info for bad ip: " << ip;
102 }
103
104 static void AddNavigationFeatures(
105     const std::string& feature_prefix,
106     const NavigationController& controller,
107     int index,
108     const std::vector<GURL>& redirect_chain,
109     ClientPhishingRequest* request) {
110   NavigationEntry* entry = controller.GetEntryAtIndex(index);
111   bool is_secure_referrer = entry->GetReferrer().url.SchemeIsSecure();
112   if (!is_secure_referrer) {
113     AddFeature(base::StringPrintf("%s%s=%s",
114                                   feature_prefix.c_str(),
115                                   features::kReferrer,
116                                   entry->GetReferrer().url.spec().c_str()),
117                1.0,
118                request);
119   }
120   AddFeature(feature_prefix + features::kHasSSLReferrer,
121              is_secure_referrer ? 1.0 : 0.0,
122              request);
123   AddFeature(feature_prefix + features::kPageTransitionType,
124              static_cast<double>(
125                  content::PageTransitionStripQualifier(
126                     entry->GetTransitionType())),
127              request);
128   AddFeature(feature_prefix + features::kIsFirstNavigation,
129              index == 0 ? 1.0 : 0.0,
130              request);
131   // Redirect chain should always be at least of size one, as the rendered
132   // url is the last element in the chain.
133   if (redirect_chain.empty()) {
134     NOTREACHED();
135     return;
136   }
137   if (redirect_chain.back() != entry->GetURL()) {
138     // I originally had this as a DCHECK but I saw a failure once that I
139     // can't reproduce. It looks like it might be related to the
140     // navigation controller only keeping a limited number of navigation
141     // events. For now we'll just attach a feature specifying that this is
142     // a mismatch and try and figure out what to do with it on the server.
143     DLOG(WARNING) << "Expected:" << entry->GetURL()
144                  << " Actual:" << redirect_chain.back();
145     AddFeature(feature_prefix + features::kRedirectUrlMismatch,
146                1.0,
147                request);
148     return;
149   }
150   // We skip the last element since it should just be the current url.
151   for (size_t i = 0; i < redirect_chain.size() - 1; i++) {
152     std::string printable_redirect = redirect_chain[i].spec();
153     if (redirect_chain[i].SchemeIsSecure()) {
154       printable_redirect = features::kSecureRedirectValue;
155     }
156     AddFeature(base::StringPrintf("%s%s[%" PRIuS "]=%s",
157                                   feature_prefix.c_str(),
158                                   features::kRedirect,
159                                   i,
160                                   printable_redirect.c_str()),
161                1.0,
162                request);
163   }
164 }
165
166 BrowserFeatureExtractor::BrowserFeatureExtractor(
167     WebContents* tab,
168     ClientSideDetectionHost* host)
169     : tab_(tab),
170       host_(host),
171       weak_factory_(this) {
172   DCHECK(tab);
173 }
174
175 BrowserFeatureExtractor::~BrowserFeatureExtractor() {
176   weak_factory_.InvalidateWeakPtrs();
177 }
178
179 void BrowserFeatureExtractor::ExtractFeatures(const BrowseInfo* info,
180                                               ClientPhishingRequest* request,
181                                               const DoneCallback& callback) {
182   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
183   DCHECK(request);
184   DCHECK(info);
185   DCHECK_EQ(0U, request->url().find("http:"));
186   DCHECK(!callback.is_null());
187   // Extract features pertaining to this navigation.
188   const NavigationController& controller = tab_->GetController();
189   int url_index = -1;
190   int first_host_index = -1;
191
192   GURL request_url(request->url());
193   int index = controller.GetCurrentEntryIndex();
194   // The url that we are extracting features for should already be commited.
195   DCHECK_NE(index, -1);
196   for (; index >= 0; index--) {
197     NavigationEntry* entry = controller.GetEntryAtIndex(index);
198     if (url_index == -1 && entry->GetURL() == request_url) {
199       // It's possible that we've been on the on the possibly phishy url before
200       // in this tab, so make sure that we use the latest navigation for
201       // features.
202       // Note that it's possible that the url_index should always be the
203       // latest entry, but I'm worried about possible races during a navigation
204       // and transient entries (i.e. interstiatials) so for now we will just
205       // be cautious.
206       url_index = index;
207     } else if (index < url_index) {
208       if (entry->GetURL().host() == request_url.host()) {
209         first_host_index = index;
210       } else {
211         // We have found the possibly phishing url, but we are no longer on the
212         // host. No reason to look back any further.
213         break;
214       }
215     }
216   }
217
218   // Add features pertaining to how we got to
219   //   1) The candidate url
220   //   2) The first url on the same host as the candidate url (assuming that
221   //      it's different from the candidate url).
222   if (url_index != -1) {
223     AddNavigationFeatures(
224         std::string(), controller, url_index, info->url_redirects, request);
225   }
226   if (first_host_index != -1) {
227     AddNavigationFeatures(features::kHostPrefix,
228                           controller,
229                           first_host_index,
230                           info->host_redirects,
231                           request);
232   }
233
234   // The API doesn't take a scoped_ptr because the API gets mocked and we
235   // cannot mock an API that takes scoped_ptr as arguments.
236   scoped_ptr<ClientPhishingRequest> req(request);
237
238   ExtractBrowseInfoFeatures(*info, request);
239   base::MessageLoop::current()->PostTask(
240       FROM_HERE,
241       base::Bind(&BrowserFeatureExtractor::StartExtractFeatures,
242                  weak_factory_.GetWeakPtr(),
243                  base::Passed(&req),
244                  callback));
245 }
246
247 void BrowserFeatureExtractor::ExtractMalwareFeatures(
248     BrowseInfo* info,
249     ClientMalwareRequest* request,
250     const MalwareDoneCallback& callback) {
251   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
252   DCHECK(!callback.is_null());
253
254   // Grab the IPs because they might go away before we're done
255   // checking them against the IP blacklist on the IO thread.
256   scoped_ptr<IPUrlMap> ips(new IPUrlMap);
257   ips->swap(info->ips);
258
259   IPUrlMap* ips_ptr = ips.get();
260
261   // The API doesn't take a scoped_ptr because the API gets mocked and we
262   // cannot mock an API that takes scoped_ptr as arguments.
263   scoped_ptr<ClientMalwareRequest> req(request);
264
265   // IP blacklist lookups have to happen on the IO thread.
266   BrowserThread::PostTaskAndReply(
267       BrowserThread::IO,
268       FROM_HERE,
269       base::Bind(&FilterBenignIpsOnIOThread,
270                  host_->database_manager(),
271                  ips_ptr),
272       base::Bind(&BrowserFeatureExtractor::FinishExtractMalwareFeatures,
273                  weak_factory_.GetWeakPtr(),
274                  base::Passed(&ips), callback, base::Passed(&req)));
275 }
276
277 void BrowserFeatureExtractor::ExtractBrowseInfoFeatures(
278     const BrowseInfo& info,
279     ClientPhishingRequest* request) {
280   if (info.unsafe_resource.get()) {
281     // A SafeBrowsing interstitial was shown for the current URL.
282     AddFeature(features::kSafeBrowsingMaliciousUrl +
283                info.unsafe_resource->url.spec(),
284                1.0,
285                request);
286     AddFeature(features::kSafeBrowsingOriginalUrl +
287                info.unsafe_resource->original_url.spec(),
288                1.0,
289                request);
290     AddFeature(features::kSafeBrowsingIsSubresource,
291                info.unsafe_resource->is_subresource ? 1.0 : 0.0,
292                request);
293     AddFeature(features::kSafeBrowsingThreatType,
294                static_cast<double>(info.unsafe_resource->threat_type),
295                request);
296   }
297   if (info.http_status_code != 0) {
298     AddFeature(features::kHttpStatusCode, info.http_status_code, request);
299   }
300 }
301
302 void BrowserFeatureExtractor::StartExtractFeatures(
303     scoped_ptr<ClientPhishingRequest> request,
304     const DoneCallback& callback) {
305   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
306   HistoryService* history;
307   if (!request || !request->IsInitialized() || !GetHistoryService(&history)) {
308     callback.Run(false, request.Pass());
309     return;
310   }
311   GURL request_url(request->url());
312   history->QueryURL(request_url,
313                     true /* wants_visits */,
314                     base::Bind(&BrowserFeatureExtractor::QueryUrlHistoryDone,
315                                base::Unretained(this),
316                                base::Passed(&request),
317                                callback),
318                     &cancelable_task_tracker_);
319 }
320
321 void BrowserFeatureExtractor::QueryUrlHistoryDone(
322     scoped_ptr<ClientPhishingRequest> request,
323     const DoneCallback& callback,
324     bool success,
325     const history::URLRow& row,
326     const history::VisitVector& visits) {
327   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
328   DCHECK(request);
329   DCHECK(!callback.is_null());
330   if (!success) {
331     // URL is not found in the history.  In practice this should not
332     // happen (unless there is a real error) because we just visited
333     // that URL.
334     callback.Run(false, request.Pass());
335     return;
336   }
337   AddFeature(features::kUrlHistoryVisitCount,
338              static_cast<double>(row.visit_count()),
339              request.get());
340
341   base::Time threshold = base::Time::Now() - base::TimeDelta::FromDays(1);
342   int num_visits_24h_ago = 0;
343   int num_visits_typed = 0;
344   int num_visits_link = 0;
345   for (history::VisitVector::const_iterator it = visits.begin();
346        it != visits.end();
347        ++it) {
348     if (!content::PageTransitionIsMainFrame(it->transition)) {
349       continue;
350     }
351     if (it->visit_time < threshold) {
352       ++num_visits_24h_ago;
353     }
354     content::PageTransition transition = content::PageTransitionStripQualifier(
355         it->transition);
356     if (transition == content::PAGE_TRANSITION_TYPED) {
357       ++num_visits_typed;
358     } else if (transition == content::PAGE_TRANSITION_LINK) {
359       ++num_visits_link;
360     }
361   }
362   AddFeature(features::kUrlHistoryVisitCountMoreThan24hAgo,
363              static_cast<double>(num_visits_24h_ago),
364              request.get());
365   AddFeature(features::kUrlHistoryTypedCount,
366              static_cast<double>(num_visits_typed),
367              request.get());
368   AddFeature(features::kUrlHistoryLinkCount,
369              static_cast<double>(num_visits_link),
370              request.get());
371
372   // Issue next history lookup for host visits.
373   HistoryService* history;
374   if (!GetHistoryService(&history)) {
375     callback.Run(false, request.Pass());
376     return;
377   }
378   GURL request_url(request->url());
379   history->GetVisibleVisitCountToHost(
380       request_url,
381       base::Bind(&BrowserFeatureExtractor::QueryHttpHostVisitsDone,
382                  base::Unretained(this),
383                  base::Passed(&request),
384                  callback),
385       &cancelable_task_tracker_);
386 }
387
388 void BrowserFeatureExtractor::QueryHttpHostVisitsDone(
389     scoped_ptr<ClientPhishingRequest> request,
390     const DoneCallback& callback,
391     bool success,
392     int num_visits,
393     base::Time first_visit) {
394   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
395   DCHECK(request);
396   DCHECK(!callback.is_null());
397   if (!success) {
398     callback.Run(false, request.Pass());
399     return;
400   }
401   SetHostVisitsFeatures(num_visits, first_visit, true, request.get());
402
403   // Same lookup but for the HTTPS URL.
404   HistoryService* history;
405   if (!GetHistoryService(&history)) {
406     callback.Run(false, request.Pass());
407     return;
408   }
409   std::string https_url = request->url();
410   history->GetVisibleVisitCountToHost(
411       GURL(https_url.replace(0, 5, "https:")),
412       base::Bind(&BrowserFeatureExtractor::QueryHttpsHostVisitsDone,
413                  base::Unretained(this),
414                  base::Passed(&request),
415                  callback),
416       &cancelable_task_tracker_);
417 }
418
419 void BrowserFeatureExtractor::QueryHttpsHostVisitsDone(
420     scoped_ptr<ClientPhishingRequest> request,
421     const DoneCallback& callback,
422     bool success,
423     int num_visits,
424     base::Time first_visit) {
425   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
426   DCHECK(request);
427   DCHECK(!callback.is_null());
428   if (!success) {
429     callback.Run(false, request.Pass());
430     return;
431   }
432   SetHostVisitsFeatures(num_visits, first_visit, false, request.get());
433   callback.Run(true, request.Pass());
434 }
435
436 void BrowserFeatureExtractor::SetHostVisitsFeatures(
437     int num_visits,
438     base::Time first_visit,
439     bool is_http_query,
440     ClientPhishingRequest* request) {
441   DCHECK(request);
442   AddFeature(is_http_query ?
443              features::kHttpHostVisitCount : features::kHttpsHostVisitCount,
444              static_cast<double>(num_visits),
445              request);
446   if (num_visits > 0) {
447     AddFeature(
448         is_http_query ?
449         features::kFirstHttpHostVisitMoreThan24hAgo :
450         features::kFirstHttpsHostVisitMoreThan24hAgo,
451         (first_visit < (base::Time::Now() - base::TimeDelta::FromDays(1))) ?
452         1.0 : 0.0,
453         request);
454   }
455 }
456
457 bool BrowserFeatureExtractor::GetHistoryService(HistoryService** history) {
458   *history = NULL;
459   if (tab_ && tab_->GetBrowserContext()) {
460     Profile* profile = Profile::FromBrowserContext(tab_->GetBrowserContext());
461     *history = HistoryServiceFactory::GetForProfile(profile,
462                                                     Profile::EXPLICIT_ACCESS);
463     if (*history) {
464       return true;
465     }
466   }
467   VLOG(2) << "Unable to query history.  No history service available.";
468   return false;
469 }
470
471 void BrowserFeatureExtractor::FinishExtractMalwareFeatures(
472     scoped_ptr<IPUrlMap> bad_ips,
473     MalwareDoneCallback callback,
474     scoped_ptr<ClientMalwareRequest> request) {
475   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
476   int matched_bad_ips = 0;
477   for (IPUrlMap::const_iterator it = bad_ips->begin();
478        it != bad_ips->end(); ++it) {
479     AddMalwareIpUrlInfo(it->first, it->second, request.get());
480     ++matched_bad_ips;
481     // Limit the number of matched bad IPs in one request to control
482     // the request's size
483     if (matched_bad_ips >= kMaxMalwareIPPerRequest) {
484       break;
485     }
486   }
487   callback.Run(true, request.Pass());
488 }
489
490 }  // namespace safe_browsing