- add sources.
[platform/framework/web/crosswalk.git] / src / chrome / renderer / safe_browsing / phishing_classifier_delegate.cc
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_classifier_delegate.h"
6
7 #include <set>
8
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/lazy_instance.h"
12 #include "base/logging.h"
13 #include "base/metrics/histogram.h"
14 #include "chrome/common/safe_browsing/csd.pb.h"
15 #include "chrome/common/safe_browsing/safebrowsing_messages.h"
16 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
17 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
18 #include "chrome/renderer/safe_browsing/scorer.h"
19 #include "content/public/renderer/document_state.h"
20 #include "content/public/renderer/navigation_state.h"
21 #include "content/public/renderer/render_thread.h"
22 #include "content/public/renderer/render_view.h"
23 #include "third_party/WebKit/public/platform/WebURL.h"
24 #include "third_party/WebKit/public/web/WebDocument.h"
25 #include "third_party/WebKit/public/web/WebFrame.h"
26 #include "third_party/WebKit/public/web/WebView.h"
27
28 using content::DocumentState;
29 using content::NavigationState;
30 using content::RenderThread;
31
32 namespace safe_browsing {
33
34 static GURL StripRef(const GURL& url) {
35   GURL::Replacements replacements;
36   replacements.ClearRef();
37   return url.ReplaceComponents(replacements);
38 }
39
40 typedef std::set<PhishingClassifierDelegate*> PhishingClassifierDelegates;
41 static base::LazyInstance<PhishingClassifierDelegates>
42     g_delegates = LAZY_INSTANCE_INITIALIZER;
43
44 static base::LazyInstance<scoped_ptr<const safe_browsing::Scorer> >
45     g_phishing_scorer = LAZY_INSTANCE_INITIALIZER;
46
47 // static
48 PhishingClassifierFilter* PhishingClassifierFilter::Create() {
49   // Private constructor and public static Create() method to facilitate
50   // stubbing out this class for binary-size reduction purposes.
51   return new PhishingClassifierFilter();
52 }
53
54 PhishingClassifierFilter::PhishingClassifierFilter()
55     : RenderProcessObserver() {}
56
57 PhishingClassifierFilter::~PhishingClassifierFilter() {}
58
59 bool PhishingClassifierFilter::OnControlMessageReceived(
60     const IPC::Message& message) {
61   bool handled = true;
62   IPC_BEGIN_MESSAGE_MAP(PhishingClassifierFilter, message)
63     IPC_MESSAGE_HANDLER(SafeBrowsingMsg_SetPhishingModel, OnSetPhishingModel)
64     IPC_MESSAGE_UNHANDLED(handled = false)
65   IPC_END_MESSAGE_MAP()
66   return handled;
67 }
68
69 void PhishingClassifierFilter::OnSetPhishingModel(const std::string& model) {
70   safe_browsing::Scorer* scorer = NULL;
71   // An empty model string means we should disable client-side phishing
72   // detection.
73   if (!model.empty()) {
74     scorer = safe_browsing::Scorer::Create(model);
75     if (!scorer) {
76       DLOG(ERROR) << "Unable to create a PhishingScorer - corrupt model?";
77       return;
78     }
79   }
80   PhishingClassifierDelegates::iterator i;
81   for (i = g_delegates.Get().begin(); i != g_delegates.Get().end(); ++i) {
82     (*i)->SetPhishingScorer(scorer);
83   }
84   g_phishing_scorer.Get().reset(scorer);
85 }
86
87 // static
88 PhishingClassifierDelegate* PhishingClassifierDelegate::Create(
89     content::RenderView* render_view, PhishingClassifier* classifier) {
90   // Private constructor and public static Create() method to facilitate
91   // stubbing out this class for binary-size reduction purposes.
92   return new PhishingClassifierDelegate(render_view, classifier);
93 }
94
95 PhishingClassifierDelegate::PhishingClassifierDelegate(
96     content::RenderView* render_view,
97     PhishingClassifier* classifier)
98     : content::RenderViewObserver(render_view),
99       last_main_frame_transition_(content::PAGE_TRANSITION_LINK),
100       have_page_text_(false),
101       is_classifying_(false) {
102   g_delegates.Get().insert(this);
103   if (!classifier) {
104     classifier = new PhishingClassifier(render_view,
105                                         new FeatureExtractorClock());
106   }
107
108   classifier_.reset(classifier);
109
110   if (g_phishing_scorer.Get().get())
111     SetPhishingScorer(g_phishing_scorer.Get().get());
112 }
113
114 PhishingClassifierDelegate::~PhishingClassifierDelegate() {
115   CancelPendingClassification(SHUTDOWN);
116   g_delegates.Get().erase(this);
117 }
118
119 void PhishingClassifierDelegate::SetPhishingScorer(
120     const safe_browsing::Scorer* scorer) {
121   if (!render_view()->GetWebView())
122     return;  // RenderView is tearing down.
123   if (is_classifying_) {
124     // If there is a classification going on right now it means we're
125     // actually replacing an existing scorer with a new model.  In
126     // this case we simply cancel the current classification.
127     // TODO(noelutz): if this happens too frequently we could also
128     // replace the old scorer with the new one once classification is done
129     // but this would complicate the code somewhat.
130     CancelPendingClassification(NEW_PHISHING_SCORER);
131   }
132   classifier_->set_phishing_scorer(scorer);
133   // Start classifying the current page if all conditions are met.
134   // See MaybeStartClassification() for details.
135   MaybeStartClassification();
136 }
137
138 void PhishingClassifierDelegate::OnStartPhishingDetection(const GURL& url) {
139   last_url_received_from_browser_ = StripRef(url);
140   // Start classifying the current page if all conditions are met.
141   // See MaybeStartClassification() for details.
142   MaybeStartClassification();
143 }
144
145 void PhishingClassifierDelegate::DidCommitProvisionalLoad(
146     WebKit::WebFrame* frame, bool is_new_navigation) {
147   // A new page is starting to load, so cancel classificaiton.
148   //
149   // TODO(bryner): We shouldn't need to cancel classification if the navigation
150   // is within the same page.  However, if we let classification continue in
151   // this case, we need to properly deal with the fact that PageCaptured will
152   // be called again for the in-page navigation.  We need to be sure not to
153   // swap out the page text while the term feature extractor is still running.
154   DocumentState* document_state = DocumentState::FromDataSource(
155       frame->dataSource());
156   NavigationState* navigation_state = document_state->navigation_state();
157   CancelPendingClassification(navigation_state->was_within_same_page() ?
158                               NAVIGATE_WITHIN_PAGE : NAVIGATE_AWAY);
159   if (frame == render_view()->GetWebView()->mainFrame()) {
160     last_main_frame_transition_ = navigation_state->transition_type();
161   }
162 }
163
164 void PhishingClassifierDelegate::PageCaptured(string16* page_text,
165                                               bool preliminary_capture) {
166   if (preliminary_capture) {
167     return;
168   }
169   // Make sure there's no classification in progress.  We don't want to swap
170   // out the page text string from underneath the term feature extractor.
171   //
172   // Note: Currently, if the url hasn't changed, we won't restart
173   // classification in this case.  We may want to adjust this.
174   CancelPendingClassification(PAGE_RECAPTURED);
175   last_finished_load_url_ = GetToplevelUrl();
176   classifier_page_text_.swap(*page_text);
177   have_page_text_ = true;
178   MaybeStartClassification();
179 }
180
181 void PhishingClassifierDelegate::CancelPendingClassification(
182     CancelClassificationReason reason) {
183   if (is_classifying_) {
184     UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.CancelClassificationReason",
185                               reason,
186                               CANCEL_CLASSIFICATION_MAX);
187     is_classifying_ = false;
188   }
189   if (classifier_->is_ready()) {
190     classifier_->CancelPendingClassification();
191   }
192   classifier_page_text_.clear();
193   have_page_text_ = false;
194 }
195
196 bool PhishingClassifierDelegate::OnMessageReceived(
197     const IPC::Message& message) {
198   bool handled = true;
199   IPC_BEGIN_MESSAGE_MAP(PhishingClassifierDelegate, message)
200     IPC_MESSAGE_HANDLER(SafeBrowsingMsg_StartPhishingDetection,
201                         OnStartPhishingDetection)
202     IPC_MESSAGE_UNHANDLED(handled = false)
203   IPC_END_MESSAGE_MAP()
204   return handled;
205 }
206
207 void PhishingClassifierDelegate::ClassificationDone(
208     const ClientPhishingRequest& verdict) {
209   // We no longer need the page text.
210   classifier_page_text_.clear();
211   VLOG(2) << "Phishy verdict = " << verdict.is_phishing()
212           << " score = " << verdict.client_score();
213   if (verdict.client_score() != PhishingClassifier::kInvalidScore) {
214     DCHECK_EQ(last_url_sent_to_classifier_.spec(), verdict.url());
215     RenderThread::Get()->Send(new SafeBrowsingHostMsg_PhishingDetectionDone(
216         routing_id(), verdict.SerializeAsString()));
217   }
218 }
219
220 GURL PhishingClassifierDelegate::GetToplevelUrl() {
221   return render_view()->GetWebView()->mainFrame()->document().url();
222 }
223
224 void PhishingClassifierDelegate::MaybeStartClassification() {
225   // We can begin phishing classification when the following conditions are
226   // met:
227   //  1. A Scorer has been created
228   //  2. The browser has sent a StartPhishingDetection message for the current
229   //     toplevel URL.
230   //  3. The page has finished loading and the page text has been extracted.
231   //  4. The load is a new navigation (not a session history navigation).
232   //  5. The toplevel URL has not already been classified.
233   //
234   // Note that if we determine that this particular navigation should not be
235   // classified at all (as opposed to deferring it until we get an IPC or the
236   // load completes), we discard the page text since it won't be needed.
237   if (!classifier_->is_ready()) {
238     VLOG(2) << "Not starting classification, no Scorer created.";
239     // Keep classifier_page_text_, in case a Scorer is set later.
240     return;
241   }
242
243   if (last_main_frame_transition_ & content::PAGE_TRANSITION_FORWARD_BACK) {
244     // Skip loads from session history navigation.  However, update the
245     // last URL sent to the classifier, so that we'll properly detect
246     // in-page navigations.
247     VLOG(2) << "Not starting classification for back/forward navigation";
248     last_url_sent_to_classifier_ = last_finished_load_url_;
249     classifier_page_text_.clear();  // we won't need this.
250     have_page_text_ = false;
251     return;
252   }
253
254   GURL stripped_last_load_url(StripRef(last_finished_load_url_));
255   if (stripped_last_load_url == StripRef(last_url_sent_to_classifier_)) {
256     // We've already classified this toplevel URL, so this was likely an
257     // in-page navigation or a subframe navigation.  The browser should not
258     // send a StartPhishingDetection IPC in this case.
259     VLOG(2) << "Toplevel URL is unchanged, not starting classification.";
260     classifier_page_text_.clear();  // we won't need this.
261     have_page_text_ = false;
262     return;
263   }
264
265   if (!have_page_text_) {
266     VLOG(2) << "Not starting classification, there is no page text ready.";
267     return;
268   }
269
270   if (last_url_received_from_browser_ != stripped_last_load_url) {
271     // The browser has not yet confirmed that this URL should be classified,
272     // so defer classification for now.  Note: the ref does not affect
273     // any of the browser's preclassification checks, so we don't require it
274     // to match.
275     VLOG(2) << "Not starting classification, last url from browser is "
276             << last_url_received_from_browser_ << ", last finished load is "
277             << last_finished_load_url_;
278     // Keep classifier_page_text_, in case the browser notifies us later that
279     // we should classify the URL.
280     return;
281   }
282
283   VLOG(2) << "Starting classification for " << last_finished_load_url_;
284   last_url_sent_to_classifier_ = last_finished_load_url_;
285   is_classifying_ = true;
286   classifier_->BeginClassification(
287       &classifier_page_text_,
288       base::Bind(&PhishingClassifierDelegate::ClassificationDone,
289                  base::Unretained(this)));
290 }
291
292 }  // namespace safe_browsing