Upstream version 11.40.277.0
[platform/framework/web/crosswalk.git] / src / chrome / renderer / safe_browsing / phishing_classifier.cc
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
6
7 #include <string>
8
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/compiler_specific.h"
12 #include "base/logging.h"
13 #include "base/message_loop/message_loop.h"
14 #include "base/metrics/histogram.h"
15 #include "base/strings/string_util.h"
16 #include "chrome/common/safe_browsing/csd.pb.h"
17 #include "chrome/common/url_constants.h"
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
19 #include "chrome/renderer/safe_browsing/features.h"
20 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
21 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
22 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
23 #include "chrome/renderer/safe_browsing/scorer.h"
24 #include "content/public/renderer/render_view.h"
25 #include "crypto/sha2.h"
26 #include "third_party/WebKit/public/platform/WebURL.h"
27 #include "third_party/WebKit/public/platform/WebURLRequest.h"
28 #include "third_party/WebKit/public/web/WebDataSource.h"
29 #include "third_party/WebKit/public/web/WebDocument.h"
30 #include "third_party/WebKit/public/web/WebFrame.h"
31 #include "third_party/WebKit/public/web/WebView.h"
32 #include "url/gurl.h"
33
34 namespace safe_browsing {
35
36 const float PhishingClassifier::kInvalidScore = -1.0;
37 const float PhishingClassifier::kPhishyThreshold = 0.5;
38
39 PhishingClassifier::PhishingClassifier(content::RenderView* render_view,
40                                        FeatureExtractorClock* clock)
41     : render_view_(render_view),
42       scorer_(NULL),
43       clock_(clock),
44       weak_factory_(this) {
45   Clear();
46 }
47
48 PhishingClassifier::~PhishingClassifier() {
49   // The RenderView should have called CancelPendingClassification() before
50   // we are destroyed.
51   CheckNoPendingClassification();
52 }
53
54 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
55   CheckNoPendingClassification();
56   scorer_ = scorer;
57   if (scorer_) {
58     url_extractor_.reset(new PhishingUrlFeatureExtractor);
59     dom_extractor_.reset(
60         new PhishingDOMFeatureExtractor(render_view_, clock_.get()));
61     term_extractor_.reset(new PhishingTermFeatureExtractor(
62         &scorer_->page_terms(),
63         &scorer_->page_words(),
64         scorer_->max_words_per_term(),
65         scorer_->murmurhash3_seed(),
66         scorer_->max_shingles_per_page(),
67         scorer_->shingle_size(),
68         clock_.get()));
69   } else {
70     // We're disabling client-side phishing detection, so tear down all
71     // of the relevant objects.
72     url_extractor_.reset();
73     dom_extractor_.reset();
74     term_extractor_.reset();
75   }
76 }
77
78 bool PhishingClassifier::is_ready() const {
79   return scorer_ != NULL;
80 }
81
82 void PhishingClassifier::BeginClassification(
83     const base::string16* page_text,
84     const DoneCallback& done_callback) {
85   DCHECK(is_ready());
86
87   // The RenderView should have called CancelPendingClassification() before
88   // starting a new classification, so DCHECK this.
89   CheckNoPendingClassification();
90   // However, in an opt build, we will go ahead and clean up the pending
91   // classification so that we can start in a known state.
92   CancelPendingClassification();
93
94   page_text_ = page_text;
95   done_callback_ = done_callback;
96
97   // For consistency, we always want to invoke the DoneCallback
98   // asynchronously, rather than directly from this method.  To ensure that
99   // this is the case, post a task to begin feature extraction on the next
100   // iteration of the message loop.
101   base::MessageLoop::current()->PostTask(
102       FROM_HERE,
103       base::Bind(&PhishingClassifier::BeginFeatureExtraction,
104                  weak_factory_.GetWeakPtr()));
105 }
106
107 void PhishingClassifier::BeginFeatureExtraction() {
108   blink::WebView* web_view = render_view_->GetWebView();
109   if (!web_view) {
110     RunFailureCallback();
111     return;
112   }
113
114   blink::WebFrame* frame = web_view->mainFrame();
115   if (!frame) {
116     RunFailureCallback();
117     return;
118   }
119
120   // Check whether the URL is one that we should classify.
121   // Currently, we only classify http: URLs that are GET requests.
122   GURL url(frame->document().url());
123   if (!url.SchemeIs(url::kHttpScheme)) {
124     RunFailureCallback();
125     return;
126   }
127
128   blink::WebDataSource* ds = frame->dataSource();
129   if (!ds || !EqualsASCII(ds->request().httpMethod(), "GET")) {
130     RunFailureCallback();
131     return;
132   }
133
134   features_.reset(new FeatureMap);
135   if (!url_extractor_->ExtractFeatures(url, features_.get())) {
136     RunFailureCallback();
137     return;
138   }
139
140   // DOM feature extraction can take awhile, so it runs asynchronously
141   // in several chunks of work and invokes the callback when finished.
142   dom_extractor_->ExtractFeatures(
143       features_.get(),
144       base::Bind(&PhishingClassifier::DOMExtractionFinished,
145                  base::Unretained(this)));
146 }
147
148 void PhishingClassifier::CancelPendingClassification() {
149   // Note that cancelling the feature extractors is simply a no-op if they
150   // were not running.
151   DCHECK(is_ready());
152   dom_extractor_->CancelPendingExtraction();
153   term_extractor_->CancelPendingExtraction();
154   weak_factory_.InvalidateWeakPtrs();
155   Clear();
156 }
157
158 void PhishingClassifier::DOMExtractionFinished(bool success) {
159   shingle_hashes_.reset(new std::set<uint32>);
160   if (success) {
161     // Term feature extraction can take awhile, so it runs asynchronously
162     // in several chunks of work and invokes the callback when finished.
163     term_extractor_->ExtractFeatures(
164         page_text_,
165         features_.get(),
166         shingle_hashes_.get(),
167         base::Bind(&PhishingClassifier::TermExtractionFinished,
168                    base::Unretained(this)));
169   } else {
170     RunFailureCallback();
171   }
172 }
173
174 void PhishingClassifier::TermExtractionFinished(bool success) {
175   if (success) {
176     blink::WebView* web_view = render_view_->GetWebView();
177     if (!web_view) {
178       RunFailureCallback();
179       return;
180     }
181     blink::WebFrame* main_frame = web_view->mainFrame();
182     if (!main_frame) {
183       RunFailureCallback();
184       return;
185     }
186
187     // Hash all of the features so that they match the model, then compute
188     // the score.
189     FeatureMap hashed_features;
190     ClientPhishingRequest verdict;
191     verdict.set_model_version(scorer_->model_version());
192     verdict.set_url(main_frame->document().url().spec());
193     for (base::hash_map<std::string, double>::const_iterator it =
194              features_->features().begin();
195          it != features_->features().end(); ++it) {
196       VLOG(2) << "Feature: " << it->first << " = " << it->second;
197       bool result = hashed_features.AddRealFeature(
198           crypto::SHA256HashString(it->first), it->second);
199       DCHECK(result);
200       ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
201       feature->set_name(it->first);
202       feature->set_value(it->second);
203     }
204     for (std::set<uint32>::const_iterator it = shingle_hashes_->begin();
205          it != shingle_hashes_->end(); ++it) {
206       verdict.add_shingle_hashes(*it);
207     }
208     float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
209     verdict.set_client_score(score);
210     verdict.set_is_phishing(score >= kPhishyThreshold);
211     RunCallback(verdict);
212   } else {
213     RunFailureCallback();
214   }
215 }
216
217 void PhishingClassifier::CheckNoPendingClassification() {
218   DCHECK(done_callback_.is_null());
219   DCHECK(!page_text_);
220   if (!done_callback_.is_null() || page_text_) {
221     LOG(ERROR) << "Classification in progress, missing call to "
222                << "CancelPendingClassification";
223     UMA_HISTOGRAM_COUNTS("SBClientPhishing.CheckNoPendingClassificationFailed",
224                          1);
225   }
226 }
227
228 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
229   done_callback_.Run(verdict);
230   Clear();
231 }
232
233 void PhishingClassifier::RunFailureCallback() {
234   ClientPhishingRequest verdict;
235   // In this case we're not guaranteed to have a valid URL.  Just set it
236   // to the empty string to make sure we have a valid protocol buffer.
237   verdict.set_url("");
238   verdict.set_client_score(kInvalidScore);
239   verdict.set_is_phishing(false);
240   RunCallback(verdict);
241 }
242
243 void PhishingClassifier::Clear() {
244   page_text_ = NULL;
245   done_callback_.Reset();
246   features_.reset(NULL);
247   shingle_hashes_.reset(NULL);
248 }
249
250 }  // namespace safe_browsing