Upstream version 5.34.104.0
[platform/framework/web/crosswalk.git] / src / chrome / renderer / safe_browsing / phishing_dom_feature_extractor.cc
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
6
7 #include "base/bind.h"
8 #include "base/compiler_specific.h"
9 #include "base/containers/hash_tables.h"
10 #include "base/logging.h"
11 #include "base/message_loop/message_loop.h"
12 #include "base/metrics/histogram.h"
13 #include "base/strings/string_util.h"
14 #include "base/time/time.h"
15 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
16 #include "chrome/renderer/safe_browsing/features.h"
17 #include "content/public/renderer/render_view.h"
18 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
19 #include "third_party/WebKit/public/platform/WebString.h"
20 #include "third_party/WebKit/public/web/WebElement.h"
21 #include "third_party/WebKit/public/web/WebElementCollection.h"
22 #include "third_party/WebKit/public/web/WebFrame.h"
23 #include "third_party/WebKit/public/web/WebView.h"
24
25 namespace safe_browsing {
26
// Time budget, in milliseconds, for one chunk of extraction work.  Once a
// chunk has run for at least this long (the clock is only consulted every
// kClockCheckGranularity elements), ExtractFeaturesWithTimeout() posts a task
// to continue later and returns.
// This time should be short enough that it doesn't noticeably disrupt the
// user's interaction with the page.
const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;
30
// Number of DOM elements processed between consecutive clock reads in
// ExtractFeaturesWithTimeout().
// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
35
// Total time, in milliseconds, measured from the start of an extraction,
// after which ExtractFeaturesWithTimeout() gives up and reports failure.
// This should be longer than we expect feature extraction to take on any
// actual phishing page.
const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
39
40 // Intermediate state used for computing features.  See features.h for
41 // descriptions of the DOM features that are computed.
42 struct PhishingDOMFeatureExtractor::PageFeatureState {
43   // Link related features
44   int external_links;
45   base::hash_set<std::string> external_domains;
46   int secure_links;
47   int total_links;
48
49   // Form related features
50   int num_forms;
51   int num_text_inputs;
52   int num_pswd_inputs;
53   int num_radio_inputs;
54   int num_check_inputs;
55   int action_other_domain;
56   int total_actions;
57
58   // Image related features
59   int img_other_domain;
60   int total_imgs;
61
62   // How many script tags
63   int num_script_tags;
64
65   // The time at which we started feature extraction for the current page.
66   base::TimeTicks start_time;
67
68   // The number of iterations we've done for the current extraction.
69   int num_iterations;
70
71   explicit PageFeatureState(base::TimeTicks start_time_ticks)
72       : external_links(0),
73         secure_links(0),
74         total_links(0),
75         num_forms(0),
76         num_text_inputs(0),
77         num_pswd_inputs(0),
78         num_radio_inputs(0),
79         num_check_inputs(0),
80         action_other_domain(0),
81         total_actions(0),
82         img_other_domain(0),
83         total_imgs(0),
84         num_script_tags(0),
85         start_time(start_time_ticks),
86         num_iterations(0) {}
87
88   ~PageFeatureState() {}
89 };
90
// Per-frame state.  A fresh instance is created by ResetFrameData() each time
// traversal moves on to a new document, and it is discarded once that
// document's elements have all been visited.
struct PhishingDOMFeatureExtractor::FrameData {
  // This is our reference to document.all, which is an iterator over all
  // of the elements in the document.  It keeps track of our current position.
  blink::WebElementCollection elements;
  // The domain of the document URL, stored here so that we don't need to
  // recompute it every time it's needed.
  std::string domain;
};
100
// Stores the supplied RenderView and clock, binds the weak pointer factory to
// this object, and starts out with all per-extraction state cleared.
PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
    content::RenderView* render_view,
    FeatureExtractorClock* clock)
    : render_view_(render_view),
      clock_(clock),
      weak_factory_(this) {
  Clear();
}
109
// Verifies (DCHECK plus an error log) that no extraction is still in flight
// at destruction time.
PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.
  CheckNoPendingExtraction();
}
115
116 void PhishingDOMFeatureExtractor::ExtractFeatures(
117     FeatureMap* features,
118     const DoneCallback& done_callback) {
119   // The RenderView should have called CancelPendingExtraction() before
120   // starting a new extraction, so DCHECK this.
121   CheckNoPendingExtraction();
122   // However, in an opt build, we will go ahead and clean up the pending
123   // extraction so that we can start in a known state.
124   CancelPendingExtraction();
125
126   features_ = features;
127   done_callback_ = done_callback;
128
129   page_feature_state_.reset(new PageFeatureState(clock_->Now()));
130   blink::WebView* web_view = render_view_->GetWebView();
131   if (web_view && web_view->mainFrame()) {
132     cur_document_ = web_view->mainFrame()->document();
133   }
134
135   base::MessageLoop::current()->PostTask(
136       FROM_HERE,
137       base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
138                  weak_factory_.GetWeakPtr()));
139 }
140
// Abandons any extraction in progress.  The done callback is reset without
// being run.
void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}
146
147 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
148   DCHECK(page_feature_state_.get());
149   ++page_feature_state_->num_iterations;
150   base::TimeTicks current_chunk_start_time = clock_->Now();
151
152   if (cur_document_.isNull()) {
153     // This will only happen if we weren't able to get the document for the
154     // main frame.  We'll treat this as an extraction failure.
155     RunCallback(false);
156     return;
157   }
158
159   int num_elements = 0;
160   for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
161     blink::WebElement cur_element;
162     if (cur_frame_data_.get()) {
163       // We're resuming traversal of a frame, so just advance to the next
164       // element.
165       cur_element = cur_frame_data_->elements.nextItem();
166       // When we resume the traversal, the first call to nextItem() potentially
167       // has to walk through the document again from the beginning, if it was
168       // modified between our chunks of work.  Log how long this takes, so we
169       // can tell if it's too slow.
170       UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
171                           clock_->Now() - current_chunk_start_time);
172     } else {
173       // We just moved to a new frame, so update our frame state
174       // and advance to the first element.
175       ResetFrameData();
176       cur_element = cur_frame_data_->elements.firstItem();
177     }
178
179     for (; !cur_element.isNull();
180          cur_element = cur_frame_data_->elements.nextItem()) {
181       if (cur_element.hasTagName("a")) {
182         HandleLink(cur_element);
183       } else if (cur_element.hasTagName("form")) {
184         HandleForm(cur_element);
185       } else if (cur_element.hasTagName("img")) {
186         HandleImage(cur_element);
187       } else if (cur_element.hasTagName("input")) {
188         HandleInput(cur_element);
189       } else if (cur_element.hasTagName("script")) {
190         HandleScript(cur_element);
191       }
192
193       if (++num_elements >= kClockCheckGranularity) {
194         num_elements = 0;
195         base::TimeTicks now = clock_->Now();
196         if (now - page_feature_state_->start_time >=
197             base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
198           DLOG(ERROR) << "Feature extraction took too long, giving up";
199           // We expect this to happen infrequently, so record when it does.
200           UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
201           RunCallback(false);
202           return;
203         }
204         base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
205         if (chunk_elapsed >=
206             base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
207           // The time limit for the current chunk is up, so post a task to
208           // continue extraction.
209           //
210           // Record how much time we actually spent on the chunk. If this is
211           // much higher than kMaxTimePerChunkMs, we may need to adjust the
212           // clock granularity.
213           UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
214                               chunk_elapsed);
215           base::MessageLoop::current()->PostTask(
216               FROM_HERE,
217               base::Bind(
218                   &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
219                   weak_factory_.GetWeakPtr()));
220           return;
221         }
222         // Otherwise, continue.
223       }
224     }
225
226     // We're done with this frame, recalculate the FrameData when we
227     // advance to the next frame.
228     cur_frame_data_.reset();
229   }
230
231   InsertFeatures();
232   RunCallback(true);
233 }
234
235 void PhishingDOMFeatureExtractor::HandleLink(
236     const blink::WebElement& element) {
237   // Count the number of times we link to a different host.
238   if (!element.hasAttribute("href")) {
239     DVLOG(1) << "Skipping anchor tag with no href";
240     return;
241   }
242
243   // Retrieve the link and resolve the link in case it's relative.
244   blink::WebURL full_url = element.document().completeURL(
245       element.getAttribute("href"));
246
247   std::string domain;
248   bool is_external = IsExternalDomain(full_url, &domain);
249   if (domain.empty()) {
250     DVLOG(1) << "Could not extract domain from link: " << full_url;
251     return;
252   }
253
254   if (is_external) {
255     ++page_feature_state_->external_links;
256
257     // Record each unique domain that we link to.
258     page_feature_state_->external_domains.insert(domain);
259   }
260
261   // Check how many are https links.
262   if (GURL(full_url).SchemeIs("https")) {
263     ++page_feature_state_->secure_links;
264   }
265
266   ++page_feature_state_->total_links;
267 }
268
269 void PhishingDOMFeatureExtractor::HandleForm(
270     const blink::WebElement& element) {
271   // Increment the number of forms on this page.
272   ++page_feature_state_->num_forms;
273
274   // Record whether the action points to a different domain.
275   if (!element.hasAttribute("action")) {
276     return;
277   }
278
279   blink::WebURL full_url = element.document().completeURL(
280       element.getAttribute("action"));
281
282   std::string domain;
283   bool is_external = IsExternalDomain(full_url, &domain);
284   if (domain.empty()) {
285     DVLOG(1) << "Could not extract domain from form action: " << full_url;
286     return;
287   }
288
289   if (is_external) {
290     ++page_feature_state_->action_other_domain;
291   }
292   ++page_feature_state_->total_actions;
293 }
294
295 void PhishingDOMFeatureExtractor::HandleImage(
296     const blink::WebElement& element) {
297   if (!element.hasAttribute("src")) {
298     DVLOG(1) << "Skipping img tag with no src";
299   }
300
301   // Record whether the image points to a different domain.
302   blink::WebURL full_url = element.document().completeURL(
303       element.getAttribute("src"));
304   std::string domain;
305   bool is_external = IsExternalDomain(full_url, &domain);
306   if (domain.empty()) {
307     DVLOG(1) << "Could not extract domain from image src: " << full_url;
308     return;
309   }
310
311   if (is_external) {
312     ++page_feature_state_->img_other_domain;
313   }
314   ++page_feature_state_->total_imgs;
315 }
316
317 void PhishingDOMFeatureExtractor::HandleInput(
318     const blink::WebElement& element) {
319   // The HTML spec says that if the type is unspecified, it defaults to text.
320   // In addition, any unrecognized type will be treated as a text input.
321   //
322   // Note that we use the attribute value rather than
323   // WebFormControlElement::formControlType() for consistency with the
324   // way the phishing classification model is created.
325   std::string type = element.getAttribute("type").utf8();
326   StringToLowerASCII(&type);
327   if (type == "password") {
328     ++page_feature_state_->num_pswd_inputs;
329   } else if (type == "radio") {
330     ++page_feature_state_->num_radio_inputs;
331   } else if (type == "checkbox") {
332     ++page_feature_state_->num_check_inputs;
333   } else if (type != "submit" && type != "reset" && type != "file" &&
334              type != "hidden" && type != "image" && type != "button") {
335     // Note that there are a number of new input types in HTML5 that are not
336     // handled above.  For now, we will consider these as text inputs since
337     // they could be used to capture user input.
338     ++page_feature_state_->num_text_inputs;
339   }
340 }
341
// Counts one more <script> tag.  |element| itself is not inspected.
void PhishingDOMFeatureExtractor::HandleScript(
    const blink::WebElement& element) {
  ++page_feature_state_->num_script_tags;
}
346
347 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
348   DCHECK(done_callback_.is_null());
349   DCHECK(!cur_frame_data_.get());
350   DCHECK(cur_document_.isNull());
351   if (!done_callback_.is_null() || cur_frame_data_.get() ||
352       !cur_document_.isNull()) {
353     LOG(ERROR) << "Extraction in progress, missing call to "
354                << "CancelPendingExtraction";
355   }
356 }
357
358 void PhishingDOMFeatureExtractor::RunCallback(bool success) {
359   // Record some timing stats that we can use to evaluate feature extraction
360   // performance.  These include both successful and failed extractions.
361   DCHECK(page_feature_state_.get());
362   UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
363                        page_feature_state_->num_iterations);
364   UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
365                       clock_->Now() - page_feature_state_->start_time);
366
367   DCHECK(!done_callback_.is_null());
368   done_callback_.Run(success);
369   Clear();
370 }
371
// Resets the per-extraction members: the output feature map pointer, the
// done callback, the per-frame state and the current document.
// |page_feature_state_| is not touched here.
void PhishingDOMFeatureExtractor::Clear() {
  features_ = NULL;
  done_callback_.Reset();
  cur_frame_data_.reset(NULL);
  cur_document_.reset();
}
378
379 void PhishingDOMFeatureExtractor::ResetFrameData() {
380   DCHECK(!cur_document_.isNull());
381   DCHECK(!cur_frame_data_.get());
382
383   cur_frame_data_.reset(new FrameData());
384   cur_frame_data_->elements = cur_document_.all();
385   cur_frame_data_->domain =
386       net::registry_controlled_domains::GetDomainAndRegistry(
387           cur_document_.url(),
388           net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
389 }
390
391 blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
392   DCHECK(!cur_document_.isNull());
393   blink::WebFrame* frame = cur_document_.frame();
394   // Advance to the next frame that contains a document, with no wrapping.
395   if (frame) {
396     while ((frame = frame->traverseNext(false))) {
397       if (!frame->document().isNull()) {
398         return frame->document();
399       }
400     }
401   } else {
402     // Keep track of how often frame traversal got "stuck" due to the
403     // current subdocument getting removed from the frame tree.
404     UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
405   }
406   return blink::WebDocument();
407 }
408
409 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
410                                                    std::string* domain) const {
411   DCHECK(domain);
412   DCHECK(cur_frame_data_.get());
413
414   if (cur_frame_data_->domain.empty()) {
415     return false;
416   }
417
418   // TODO(bryner): Ensure that the url encoding is consistent with the features
419   // in the model.
420   if (url.HostIsIPAddress()) {
421     domain->assign(url.host());
422   } else {
423     domain->assign(net::registry_controlled_domains::GetDomainAndRegistry(
424         url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES));
425   }
426
427   return !domain->empty() && *domain != cur_frame_data_->domain;
428 }
429
430 void PhishingDOMFeatureExtractor::InsertFeatures() {
431   DCHECK(page_feature_state_.get());
432
433   if (page_feature_state_->total_links > 0) {
434     // Add a feature for the fraction of times the page links to an external
435     // domain vs. an internal domain.
436     double link_freq = static_cast<double>(
437         page_feature_state_->external_links) /
438         page_feature_state_->total_links;
439     features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
440
441     // Add a feature for each unique domain that we're linking to
442     for (base::hash_set<std::string>::iterator it =
443              page_feature_state_->external_domains.begin();
444          it != page_feature_state_->external_domains.end(); ++it) {
445       features_->AddBooleanFeature(features::kPageLinkDomain + *it);
446     }
447
448     // Fraction of links that use https.
449     double secure_freq = static_cast<double>(
450         page_feature_state_->secure_links) / page_feature_state_->total_links;
451     features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
452   }
453
454   // Record whether forms appear and whether various form elements appear.
455   if (page_feature_state_->num_forms > 0) {
456     features_->AddBooleanFeature(features::kPageHasForms);
457   }
458   if (page_feature_state_->num_text_inputs > 0) {
459     features_->AddBooleanFeature(features::kPageHasTextInputs);
460   }
461   if (page_feature_state_->num_pswd_inputs > 0) {
462     features_->AddBooleanFeature(features::kPageHasPswdInputs);
463   }
464   if (page_feature_state_->num_radio_inputs > 0) {
465     features_->AddBooleanFeature(features::kPageHasRadioInputs);
466   }
467   if (page_feature_state_->num_check_inputs > 0) {
468     features_->AddBooleanFeature(features::kPageHasCheckInputs);
469   }
470
471   // Record fraction of form actions that point to a different domain.
472   if (page_feature_state_->total_actions > 0) {
473     double action_freq = static_cast<double>(
474         page_feature_state_->action_other_domain) /
475         page_feature_state_->total_actions;
476     features_->AddRealFeature(features::kPageActionOtherDomainFreq,
477                               action_freq);
478   }
479
480   // Record how many image src attributes point to a different domain.
481   if (page_feature_state_->total_imgs > 0) {
482     double img_freq = static_cast<double>(
483         page_feature_state_->img_other_domain) /
484         page_feature_state_->total_imgs;
485     features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
486   }
487
488   // Record number of script tags (discretized for numerical stability.)
489   if (page_feature_state_->num_script_tags > 1) {
490     features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
491     if (page_feature_state_->num_script_tags > 6) {
492       features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
493     }
494   }
495 }
496
497 }  // namespace safe_browsing