src/chrome/renderer/safe_browsing/phishing_classifier_delegate.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/renderer/safe_browsing/phishing_classifier_delegate.h"
   6
   7 #include <set>
   8
   9 #include "base/bind.h"
  10 #include "base/callback.h"
  11 #include "base/lazy_instance.h"
  12 #include "base/logging.h"
  13 #include "base/metrics/histogram.h"
  14 #include "chrome/common/safe_browsing/csd.pb.h"
  15 #include "chrome/common/safe_browsing/safebrowsing_messages.h"
  16 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
  17 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
  18 #include "chrome/renderer/safe_browsing/scorer.h"
  19 #include "content/public/renderer/document_state.h"
  20 #include "content/public/renderer/navigation_state.h"
  21 #include "content/public/renderer/render_thread.h"
  22 #include "content/public/renderer/render_view.h"
  23 #include "third_party/WebKit/public/platform/WebURL.h"
  24 #include "third_party/WebKit/public/web/WebDocument.h"
  25 #include "third_party/WebKit/public/web/WebFrame.h"
  26 #include "third_party/WebKit/public/web/WebView.h"
  27
  28 using content::DocumentState;
  29 using content::NavigationState;
  30 using content::RenderThread;
  31
  32 namespace safe_browsing {
  33
  34 static GURL StripRef(const GURL& url) {
  35   GURL::Replacements replacements;
  36   replacements.ClearRef();
  37   return url.ReplaceComponents(replacements);
  38 }
  39
  40 typedef std::set<PhishingClassifierDelegate*> PhishingClassifierDelegates;
  41 static base::LazyInstance<PhishingClassifierDelegates>
  42     g_delegates = LAZY_INSTANCE_INITIALIZER;
  43
  44 static base::LazyInstance<scoped_ptr<const safe_browsing::Scorer> >
  45     g_phishing_scorer = LAZY_INSTANCE_INITIALIZER;
  46
  47 // static
  48 PhishingClassifierFilter* PhishingClassifierFilter::Create() {
  49   // Private constructor and public static Create() method to facilitate
  50   // stubbing out this class for binary-size reduction purposes.
  51   return new PhishingClassifierFilter();
  52 }
  53
  54 PhishingClassifierFilter::PhishingClassifierFilter()
  55     : RenderProcessObserver() {}
  56
  57 PhishingClassifierFilter::~PhishingClassifierFilter() {}
  58
  59 bool PhishingClassifierFilter::OnControlMessageReceived(
  60     const IPC::Message& message) {
  61   bool handled = true;
  62   IPC_BEGIN_MESSAGE_MAP(PhishingClassifierFilter, message)
  63     IPC_MESSAGE_HANDLER(SafeBrowsingMsg_SetPhishingModel, OnSetPhishingModel)
  64     IPC_MESSAGE_UNHANDLED(handled = false)
  65   IPC_END_MESSAGE_MAP()
  66   return handled;
  67 }
  68
  69 void PhishingClassifierFilter::OnSetPhishingModel(const std::string& model) {
  70   safe_browsing::Scorer* scorer = NULL;
  71   // An empty model string means we should disable client-side phishing
  72   // detection.
  73   if (!model.empty()) {
  74     scorer = safe_browsing::Scorer::Create(model);
  75     if (!scorer) {
  76       DLOG(ERROR) << "Unable to create a PhishingScorer - corrupt model?";
  77       return;
  78     }
  79   }
  80   PhishingClassifierDelegates::iterator i;
  81   for (i = g_delegates.Get().begin(); i != g_delegates.Get().end(); ++i) {
  82     (*i)->SetPhishingScorer(scorer);
  83   }
  84   g_phishing_scorer.Get().reset(scorer);
  85 }
  86
  87 // static
  88 PhishingClassifierDelegate* PhishingClassifierDelegate::Create(
  89     content::RenderView* render_view, PhishingClassifier* classifier) {
  90   // Private constructor and public static Create() method to facilitate
  91   // stubbing out this class for binary-size reduction purposes.
  92   return new PhishingClassifierDelegate(render_view, classifier);
  93 }
  94
  95 PhishingClassifierDelegate::PhishingClassifierDelegate(
  96     content::RenderView* render_view,
  97     PhishingClassifier* classifier)
  98     : content::RenderViewObserver(render_view),
  99       last_main_frame_transition_(content::PAGE_TRANSITION_LINK),
 100       have_page_text_(false),
 101       is_classifying_(false) {
 102   g_delegates.Get().insert(this);
 103   if (!classifier) {
 104     classifier = new PhishingClassifier(render_view,
 105                                         new FeatureExtractorClock());
 106   }
 107
 108   classifier_.reset(classifier);
 109
 110   if (g_phishing_scorer.Get().get())
 111     SetPhishingScorer(g_phishing_scorer.Get().get());
 112 }
 113
 114 PhishingClassifierDelegate::~PhishingClassifierDelegate() {
 115   CancelPendingClassification(SHUTDOWN);
 116   g_delegates.Get().erase(this);
 117 }
 118
 119 void PhishingClassifierDelegate::SetPhishingScorer(
 120     const safe_browsing::Scorer* scorer) {
 121   if (!render_view()->GetWebView())
 122     return;  // RenderView is tearing down.
 123   if (is_classifying_) {
 124     // If there is a classification going on right now it means we're
 125     // actually replacing an existing scorer with a new model.  In
 126     // this case we simply cancel the current classification.
 127     // TODO(noelutz): if this happens too frequently we could also
 128     // replace the old scorer with the new one once classification is done
 129     // but this would complicate the code somewhat.
 130     CancelPendingClassification(NEW_PHISHING_SCORER);
 131   }
 132   classifier_->set_phishing_scorer(scorer);
 133   // Start classifying the current page if all conditions are met.
 134   // See MaybeStartClassification() for details.
 135   MaybeStartClassification();
 136 }
 137
 138 void PhishingClassifierDelegate::OnStartPhishingDetection(const GURL& url) {
 139   last_url_received_from_browser_ = StripRef(url);
 140   // Start classifying the current page if all conditions are met.
 141   // See MaybeStartClassification() for details.
 142   MaybeStartClassification();
 143 }
 144
 145 void PhishingClassifierDelegate::DidCommitProvisionalLoad(
 146     WebKit::WebFrame* frame, bool is_new_navigation) {
 147   // A new page is starting to load, so cancel classificaiton.
 148   //
 149   // TODO(bryner): We shouldn't need to cancel classification if the navigation
 150   // is within the same page.  However, if we let classification continue in
 151   // this case, we need to properly deal with the fact that PageCaptured will
 152   // be called again for the in-page navigation.  We need to be sure not to
 153   // swap out the page text while the term feature extractor is still running.
 154   DocumentState* document_state = DocumentState::FromDataSource(
 155       frame->dataSource());
 156   NavigationState* navigation_state = document_state->navigation_state();
 157   CancelPendingClassification(navigation_state->was_within_same_page() ?
 158                               NAVIGATE_WITHIN_PAGE : NAVIGATE_AWAY);
 159   if (frame == render_view()->GetWebView()->mainFrame()) {
 160     last_main_frame_transition_ = navigation_state->transition_type();
 161   }
 162 }
 163
 164 void PhishingClassifierDelegate::PageCaptured(string16* page_text,
 165                                               bool preliminary_capture) {
 166   if (preliminary_capture) {
 167     return;
 168   }
 169   // Make sure there's no classification in progress.  We don't want to swap
 170   // out the page text string from underneath the term feature extractor.
 171   //
 172   // Note: Currently, if the url hasn't changed, we won't restart
 173   // classification in this case.  We may want to adjust this.
 174   CancelPendingClassification(PAGE_RECAPTURED);
 175   last_finished_load_url_ = GetToplevelUrl();
 176   classifier_page_text_.swap(*page_text);
 177   have_page_text_ = true;
 178   MaybeStartClassification();
 179 }
 180
 181 void PhishingClassifierDelegate::CancelPendingClassification(
 182     CancelClassificationReason reason) {
 183   if (is_classifying_) {
 184     UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.CancelClassificationReason",
 185                               reason,
 186                               CANCEL_CLASSIFICATION_MAX);
 187     is_classifying_ = false;
 188   }
 189   if (classifier_->is_ready()) {
 190     classifier_->CancelPendingClassification();
 191   }
 192   classifier_page_text_.clear();
 193   have_page_text_ = false;
 194 }
 195
 196 bool PhishingClassifierDelegate::OnMessageReceived(
 197     const IPC::Message& message) {
 198   bool handled = true;
 199   IPC_BEGIN_MESSAGE_MAP(PhishingClassifierDelegate, message)
 200     IPC_MESSAGE_HANDLER(SafeBrowsingMsg_StartPhishingDetection,
 201                         OnStartPhishingDetection)
 202     IPC_MESSAGE_UNHANDLED(handled = false)
 203   IPC_END_MESSAGE_MAP()
 204   return handled;
 205 }
 206
 207 void PhishingClassifierDelegate::ClassificationDone(
 208     const ClientPhishingRequest& verdict) {
 209   // We no longer need the page text.
 210   classifier_page_text_.clear();
 211   VLOG(2) << "Phishy verdict = " << verdict.is_phishing()
 212           << " score = " << verdict.client_score();
 213   if (verdict.client_score() != PhishingClassifier::kInvalidScore) {
 214     DCHECK_EQ(last_url_sent_to_classifier_.spec(), verdict.url());
 215     RenderThread::Get()->Send(new SafeBrowsingHostMsg_PhishingDetectionDone(
 216         routing_id(), verdict.SerializeAsString()));
 217   }
 218 }
 219
 220 GURL PhishingClassifierDelegate::GetToplevelUrl() {
 221   return render_view()->GetWebView()->mainFrame()->document().url();
 222 }
 223
 224 void PhishingClassifierDelegate::MaybeStartClassification() {
 225   // We can begin phishing classification when the following conditions are
 226   // met:
 227   //  1. A Scorer has been created
 228   //  2. The browser has sent a StartPhishingDetection message for the current
 229   //     toplevel URL.
 230   //  3. The page has finished loading and the page text has been extracted.
 231   //  4. The load is a new navigation (not a session history navigation).
 232   //  5. The toplevel URL has not already been classified.
 233   //
 234   // Note that if we determine that this particular navigation should not be
 235   // classified at all (as opposed to deferring it until we get an IPC or the
 236   // load completes), we discard the page text since it won't be needed.
 237   if (!classifier_->is_ready()) {
 238     VLOG(2) << "Not starting classification, no Scorer created.";
 239     // Keep classifier_page_text_, in case a Scorer is set later.
 240     return;
 241   }
 242
 243   if (last_main_frame_transition_ & content::PAGE_TRANSITION_FORWARD_BACK) {
 244     // Skip loads from session history navigation.  However, update the
 245     // last URL sent to the classifier, so that we'll properly detect
 246     // in-page navigations.
 247     VLOG(2) << "Not starting classification for back/forward navigation";
 248     last_url_sent_to_classifier_ = last_finished_load_url_;
 249     classifier_page_text_.clear();  // we won't need this.
 250     have_page_text_ = false;
 251     return;
 252   }
 253
 254   GURL stripped_last_load_url(StripRef(last_finished_load_url_));
 255   if (stripped_last_load_url == StripRef(last_url_sent_to_classifier_)) {
 256     // We've already classified this toplevel URL, so this was likely an
 257     // in-page navigation or a subframe navigation.  The browser should not
 258     // send a StartPhishingDetection IPC in this case.
 259     VLOG(2) << "Toplevel URL is unchanged, not starting classification.";
 260     classifier_page_text_.clear();  // we won't need this.
 261     have_page_text_ = false;
 262     return;
 263   }
 264
 265   if (!have_page_text_) {
 266     VLOG(2) << "Not starting classification, there is no page text ready.";
 267     return;
 268   }
 269
 270   if (last_url_received_from_browser_ != stripped_last_load_url) {
 271     // The browser has not yet confirmed that this URL should be classified,
 272     // so defer classification for now.  Note: the ref does not affect
 273     // any of the browser's preclassification checks, so we don't require it
 274     // to match.
 275     VLOG(2) << "Not starting classification, last url from browser is "
 276             << last_url_received_from_browser_ << ", last finished load is "
 277             << last_finished_load_url_;
 278     // Keep classifier_page_text_, in case the browser notifies us later that
 279     // we should classify the URL.
 280     return;
 281   }
 282
 283   VLOG(2) << "Starting classification for " << last_finished_load_url_;
 284   last_url_sent_to_classifier_ = last_finished_load_url_;
 285   is_classifying_ = true;
 286   classifier_->BeginClassification(
 287       &classifier_page_text_,
 288       base::Bind(&PhishingClassifierDelegate::ClassificationDone,
 289                  base::Unretained(this)));
 290 }
 291
 292 }  // namespace safe_browsing