b55232f5c471c3e4eca9221b84e38d46230d832d
[platform/framework/web/crosswalk.git] / src / chrome / renderer / safe_browsing / phishing_dom_feature_extractor_browsertest.cc
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // Note that although this is not a "browser" test, it runs as part of
6 // browser_tests.  This is because WebKit does not work properly if it is
7 // shutdown and re-initialized.  Since browser_tests runs each test in a
8 // new process, this avoids the problem.
9
10 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
11
12 #include "base/bind.h"
13 #include "base/callback.h"
14 #include "base/command_line.h"
15 #include "base/compiler_specific.h"
16 #include "base/memory/weak_ptr.h"
17 #include "base/message_loop/message_loop.h"
18 #include "base/strings/string_number_conversions.h"
19 #include "base/time/time.h"
20 #include "chrome/browser/ui/browser.h"
21 #include "chrome/browser/ui/tabs/tab_strip_model.h"
22 #include "chrome/common/chrome_switches.h"
23 #include "chrome/renderer/safe_browsing/features.h"
24 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
25 #include "chrome/renderer/safe_browsing/test_utils.h"
26 #include "chrome/test/base/in_process_browser_test.h"
27 #include "chrome/test/base/ui_test_utils.h"
28 #include "content/public/browser/interstitial_page.h"
29 #include "content/public/browser/web_contents.h"
30 #include "content/public/renderer/render_view.h"
31 #include "content/public/test/browser_test_utils.h"
32 #include "content/public/test/test_utils.h"
33 #include "net/dns/mock_host_resolver.h"
34 #include "net/test/embedded_test_server/embedded_test_server.h"
35 #include "net/test/embedded_test_server/http_request.h"
36 #include "net/test/embedded_test_server/http_response.h"
37 #include "testing/gmock/include/gmock/gmock.h"
38 #include "third_party/WebKit/public/platform/WebString.h"
39 #include "third_party/WebKit/public/web/WebFrame.h"
40 #include "third_party/WebKit/public/web/WebScriptSource.h"
41 #include "third_party/WebKit/public/web/WebView.h"
42
43 using ::testing::DoAll;
44 using ::testing::Invoke;
45 using ::testing::Return;
46
namespace {

// Routing ID of the RenderView these tests operate on.  Routing ID 1 belongs
// to the first RenderFrame, which makes the first RenderView routing ID 2.
const int kRenderViewRoutingId = 2;

}  // namespace
53
54 namespace safe_browsing {
55
56 class PhishingDOMFeatureExtractorTest : public InProcessBrowserTest {
57  public:
58   content::WebContents* GetWebContents() {
59     return browser()->tab_strip_model()->GetActiveWebContents();
60   }
61
62   // Helper for the SubframeRemoval test that posts a message to remove
63   // the iframe "frame1" from the document.
64   void ScheduleRemoveIframe() {
65     base::MessageLoop::current()->PostTask(
66         FROM_HERE,
67         base::Bind(&PhishingDOMFeatureExtractorTest::RemoveIframe,
68                    weak_factory_.GetWeakPtr()));
69   }
70
71  protected:
72   PhishingDOMFeatureExtractorTest() : weak_factory_(this) {}
73
74   ~PhishingDOMFeatureExtractorTest() override {}
75
76   void SetUpCommandLine(CommandLine* command_line) override {
77     command_line->AppendSwitch(switches::kSingleProcess);
78 #if defined(OS_WIN)
79     // Don't want to try to create a GPU process.
80     command_line->AppendSwitch(switches::kDisableGpu);
81 #endif
82   }
83
84   void SetUpOnMainThread() override {
85     extractor_.reset(new PhishingDOMFeatureExtractor(
86         content::RenderView::FromRoutingID(kRenderViewRoutingId), &clock_));
87
88     ASSERT_TRUE(StartTestServer());
89     host_resolver()->AddRule("*", "127.0.0.1");
90   }
91
92   // Runs the DOMFeatureExtractor on the RenderView, waiting for the
93   // completion callback.  Returns the success boolean from the callback.
94   bool ExtractFeatures(FeatureMap* features) {
95     success_ = false;
96     PostTaskToInProcessRendererAndWait(
97         base::Bind(&PhishingDOMFeatureExtractorTest::ExtractFeaturesInternal,
98         base::Unretained(this),
99         features));
100     return success_;
101   }
102
103   void ExtractFeaturesInternal(FeatureMap* features) {
104     scoped_refptr<content::MessageLoopRunner> message_loop =
105         new content::MessageLoopRunner;
106     extractor_->ExtractFeatures(
107         features,
108         base::Bind(&PhishingDOMFeatureExtractorTest::ExtractionDone,
109                    base::Unretained(this),
110                    message_loop->QuitClosure()));
111     message_loop->Run();
112   }
113
114   // Completion callback for feature extraction.
115   void ExtractionDone(const base::Closure& quit_closure,
116                       bool success) {
117     success_ = success;
118     quit_closure.Run();
119   }
120
121   // Does the actual work of removing the iframe "frame1" from the document.
122   void RemoveIframe() {
123     content::RenderView* render_view =
124         content::RenderView::FromRoutingID(kRenderViewRoutingId);
125     blink::WebFrame* main_frame = render_view->GetWebView()->mainFrame();
126     ASSERT_TRUE(main_frame);
127     main_frame->executeScript(
128         blink::WebString(
129             "document.body.removeChild(document.getElementById('frame1'));"));
130   }
131
132   bool StartTestServer() {
133     CHECK(!embedded_test_server_);
134     embedded_test_server_.reset(new net::test_server::EmbeddedTestServer());
135     embedded_test_server_->RegisterRequestHandler(
136         base::Bind(&PhishingDOMFeatureExtractorTest::HandleRequest,
137                    base::Unretained(this)));
138     return embedded_test_server_->InitializeAndWaitUntilReady();
139   }
140
141   scoped_ptr<net::test_server::HttpResponse> HandleRequest(
142       const net::test_server::HttpRequest& request) {
143     std::map<std::string, std::string>::const_iterator host_it =
144         request.headers.find("Host");
145     if (host_it == request.headers.end())
146       return scoped_ptr<net::test_server::HttpResponse>();
147
148     std::string url =
149         std::string("http://") + host_it->second + request.relative_url;
150     std::map<std::string, std::string>::const_iterator it =
151         responses_.find(url);
152     if (it == responses_.end())
153       return scoped_ptr<net::test_server::HttpResponse>();
154
155     scoped_ptr<net::test_server::BasicHttpResponse> http_response(
156         new net::test_server::BasicHttpResponse());
157     http_response->set_code(net::HTTP_OK);
158     http_response->set_content_type("text/html");
159     http_response->set_content(it->second);
160     return http_response.Pass();
161   }
162
163   GURL GetURL(const std::string& host, const std::string& path) {
164     GURL::Replacements replace;
165     replace.SetHostStr(host);
166     replace.SetPathStr(path);
167     return embedded_test_server_->base_url().ReplaceComponents(replace);
168   }
169
170   // Returns the URL that was loaded.
171   GURL LoadHtml(const std::string& host, const std::string& content) {
172     GURL url(GetURL(host, ""));
173     responses_[url.spec()] = content;
174     ui_test_utils::NavigateToURL(browser(), url);
175     return url;
176   }
177
178   // Map of url -> response body for network requests from the renderer.
179   // Any urls not in this map are served a 404 error.
180   std::map<std::string, std::string> responses_;
181
182   scoped_ptr<net::test_server::EmbeddedTestServer> embedded_test_server_;
183   MockFeatureExtractorClock clock_;
184   scoped_ptr<PhishingDOMFeatureExtractor> extractor_;
185   bool success_;  // holds the success value from ExtractFeatures
186   base::WeakPtrFactory<PhishingDOMFeatureExtractorTest> weak_factory_;
187 };
188
189 IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) {
190   // This test doesn't exercise the extraction timing.
191   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
192
193   FeatureMap expected_features;
194   expected_features.AddBooleanFeature(features::kPageHasForms);
195   expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.25);
196   expected_features.AddBooleanFeature(features::kPageHasTextInputs);
197   expected_features.AddBooleanFeature(features::kPageHasCheckInputs);
198
199   FeatureMap features;
200   LoadHtml(
201       "host.com",
202       "<html><head><body>"
203       "<form action=\"query\"><input type=text><input type=checkbox></form>"
204       "<form action=\"http://cgi.host.com/submit\"></form>"
205       "<form action=\"http://other.com/\"></form>"
206       "<form action=\"query\"></form>"
207       "<form></form></body></html>");
208   ASSERT_TRUE(ExtractFeatures(&features));
209   ExpectFeatureMapsAreEqual(features, expected_features);
210
211   expected_features.Clear();
212   expected_features.AddBooleanFeature(features::kPageHasRadioInputs);
213   expected_features.AddBooleanFeature(features::kPageHasPswdInputs);
214
215   features.Clear();
216   LoadHtml(
217       "host.com",
218       "<html><head><body>"
219       "<input type=\"radio\"><input type=password></body></html>");
220   ASSERT_TRUE(ExtractFeatures(&features));
221   ExpectFeatureMapsAreEqual(features, expected_features);
222
223   expected_features.Clear();
224   expected_features.AddBooleanFeature(features::kPageHasTextInputs);
225
226   features.Clear();
227   LoadHtml(
228       "host.com",
229       "<html><head><body><input></body></html>");
230   ASSERT_TRUE(ExtractFeatures(&features));
231   ExpectFeatureMapsAreEqual(features, expected_features);
232
233   expected_features.Clear();
234   expected_features.AddBooleanFeature(features::kPageHasTextInputs);
235
236   features.Clear();
237   LoadHtml(
238       "host.com",
239       "<html><head><body><input type=\"invalid\"></body></html>");
240   ASSERT_TRUE(ExtractFeatures(&features));
241   ExpectFeatureMapsAreEqual(features, expected_features);
242 }
243
244 IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, LinkFeatures) {
245   // This test doesn't exercise the extraction timing.
246   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
247
248   FeatureMap expected_features;
249   expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.5);
250   expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.0);
251   expected_features.AddBooleanFeature(features::kPageLinkDomain +
252                                       std::string("chromium.org"));
253
254   FeatureMap features;
255   LoadHtml(
256       "www.host.com",
257       "<html><head><body>"
258       "<a href=\"http://www2.host.com/abc\">link</a>"
259       "<a name=page_anchor></a>"
260       "<a href=\"http://www.chromium.org/\">chromium</a>"
261       "</body></html");
262   ASSERT_TRUE(ExtractFeatures(&features));
263   ExpectFeatureMapsAreEqual(features, expected_features);
264
265   expected_features.Clear();
266   expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25);
267   expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.5);
268   expected_features.AddBooleanFeature(features::kPageLinkDomain +
269                                       std::string("chromium.org"));
270
271   net::SpawnedTestServer https_server(
272       net::SpawnedTestServer::TYPE_HTTPS,
273       net::SpawnedTestServer::kLocalhost,
274       base::FilePath(FILE_PATH_LITERAL("chrome/test/data")));
275   ASSERT_TRUE(https_server.Start());
276
277   // The PhishingDOMFeatureExtractor depends on URLs being domains and not IPs,
278   // so use a domain.
279   std::string url_str = "https://host.com:";
280   url_str += base::IntToString(https_server.host_port_pair().port());
281   url_str += "/files/safe_browsing/secure_link_features.html";
282   ui_test_utils::NavigateToURL(browser(), GURL(url_str));
283
284   // Click through the certificate error interstitial.
285   content::InterstitialPage* interstitial_page =
286       GetWebContents()->GetInterstitialPage();
287   interstitial_page->Proceed();
288   content::WaitForLoadStop(GetWebContents());
289
290   features.Clear();
291   ASSERT_TRUE(ExtractFeatures(&features));
292   ExpectFeatureMapsAreEqual(features, expected_features);
293 }
294
295 IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest,
296                        ScriptAndImageFeatures) {
297   // This test doesn't exercise the extraction timing.
298   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
299
300   FeatureMap expected_features;
301   expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
302
303   FeatureMap features;
304   LoadHtml(
305       "host.com",
306       "<html><head><script></script><script></script></head></html>");
307   ASSERT_TRUE(ExtractFeatures(&features));
308   ExpectFeatureMapsAreEqual(features, expected_features);
309
310   expected_features.Clear();
311   expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
312   expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTSix);
313   expected_features.AddRealFeature(features::kPageImgOtherDomainFreq, 0.5);
314
315   features.Clear();
316   net::SpawnedTestServer https_server(
317       net::SpawnedTestServer::TYPE_HTTPS,
318       net::SpawnedTestServer::kLocalhost,
319       base::FilePath(FILE_PATH_LITERAL("chrome/test/data")));
320   ASSERT_TRUE(https_server.Start());
321
322   // The PhishingDOMFeatureExtractor depends on URLs being domains and not IPs,
323   // so use a domain.
324   std::string url_str = "https://host.com:";
325   url_str += base::IntToString(https_server.host_port_pair().port());
326   url_str += "/files/safe_browsing/secure_script_and_image.html";
327   ui_test_utils::NavigateToURL(browser(), GURL(url_str));
328
329   // Click through the certificate error interstitial.
330   content::InterstitialPage* interstitial_page =
331       GetWebContents()->GetInterstitialPage();
332   interstitial_page->Proceed();
333   content::WaitForLoadStop(GetWebContents());
334
335   ASSERT_TRUE(ExtractFeatures(&features));
336   ExpectFeatureMapsAreEqual(features, expected_features);
337 }
338
339 IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) {
340   // This test doesn't exercise the extraction timing.
341   EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
342
343   // Test that features are aggregated across all frames.
344
345   std::string port = base::IntToString(embedded_test_server_->port());
346   responses_[GetURL("host2.com", "").spec()] =
347       "<html><head><script></script><body>"
348       "<form action=\"http://host4.com/\"><input type=checkbox></form>"
349       "<form action=\"http://host2.com/submit\"></form>"
350       "<a href=\"http://www.host2.com/home\">link</a>"
351       "<iframe src=\"nested.html\"></iframe>"
352       "<body></html>";
353
354   responses_[GetURL("host2.com", "nested.html").spec()] =
355       "<html><body><input type=password>"
356       "<a href=\"https://host4.com/\">link</a>"
357       "<a href=\"relative\">another</a>"
358       "</body></html>";
359
360   responses_[GetURL("host3.com", "").spec()] =
361       "<html><head><script></script><body>"
362       "<img src=\"http://host.com/123.png\">"
363       "</body></html>";
364
365   FeatureMap expected_features;
366   expected_features.AddBooleanFeature(features::kPageHasForms);
367   // Form action domains are compared to the URL of the document they're in,
368   // not the URL of the toplevel page.  So http://host2.com/ has two form
369   // actions, one of which is external.
370   expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5);
371   expected_features.AddBooleanFeature(features::kPageHasTextInputs);
372   expected_features.AddBooleanFeature(features::kPageHasPswdInputs);
373   expected_features.AddBooleanFeature(features::kPageHasCheckInputs);
374   expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25);
375   expected_features.AddBooleanFeature(features::kPageLinkDomain +
376                                       std::string("host4.com"));
377   expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.25);
378   expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
379   expected_features.AddRealFeature(features::kPageImgOtherDomainFreq, 1.0);
380
381   FeatureMap features;
382   std::string html(
383       "<html><body><input type=text><a href=\"info.html\">link</a>"
384       "<iframe src=\"http://host2.com:");
385   html += port;
386   html += std::string(
387       "/\"></iframe>"
388       "<iframe src=\"http://host3.com:");
389   html += port;
390   html += std::string("/\"></iframe></body></html>");
391
392   LoadHtml("host.com", html);
393   ASSERT_TRUE(ExtractFeatures(&features));
394   ExpectFeatureMapsAreEqual(features, expected_features);
395 }
396
397 IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, Continuation) {
398   // For this test, we'll cause the feature extraction to run multiple
399   // iterations by incrementing the clock.
400
401   // This page has a total of 50 elements.  For the external forms feature to
402   // be computed correctly, the extractor has to examine the whole document.
403   // Note: the empty HEAD is important -- WebKit will synthesize a HEAD if
404   // there isn't one present, which can be confusing for the element counts.
405   std::string response = "<html><head></head><body>"
406       "<form action=\"ondomain\"></form>";
407   for (int i = 0; i < 45; ++i) {
408     response.append("<p>");
409   }
410   response.append("<form action=\"http://host2.com/\"></form></body></html>");
411
412   // Advance the clock 6 ms every 10 elements processed, 10 ms between chunks.
413   // Note that this assumes kClockCheckGranularity = 10 and
414   // kMaxTimePerChunkMs = 10.
415   base::TimeTicks now = base::TimeTicks::Now();
416   EXPECT_CALL(clock_, Now())
417       // Time check at the start of extraction.
418       .WillOnce(Return(now))
419       // Time check at the start of the first chunk of work.
420       .WillOnce(Return(now))
421       // Time check after the first 10 elements.
422       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
423       // Time check after the next 10 elements.  This is over the chunk
424       // time limit, so a continuation task will be posted.
425       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
426       // Time check at the start of the second chunk of work.
427       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
428       // Time check after resuming iteration for the second chunk.
429       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(24)))
430       // Time check after the next 10 elements.
431       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)))
432       // Time check after the next 10 elements.  This will trigger another
433       // continuation task.
434       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(36)))
435       // Time check at the start of the third chunk of work.
436       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(46)))
437       // Time check after resuming iteration for the third chunk.
438       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(48)))
439       // Time check after the last 10 elements.
440       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(54)))
441       // A final time check for the histograms.
442       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(56)));
443
444   FeatureMap expected_features;
445   expected_features.AddBooleanFeature(features::kPageHasForms);
446   expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5);
447
448   FeatureMap features;
449   LoadHtml("host.com", response);
450   ASSERT_TRUE(ExtractFeatures(&features));
451   ExpectFeatureMapsAreEqual(features, expected_features);
452   // Make sure none of the mock expectations carry over to the next test.
453   ::testing::Mock::VerifyAndClearExpectations(&clock_);
454
455   // Now repeat the test with the same page, but advance the clock faster so
456   // that the extraction time exceeds the maximum total time for the feature
457   // extractor.  Extraction should fail.  Note that this assumes
458   // kMaxTotalTimeMs = 500.
459   EXPECT_CALL(clock_, Now())
460       // Time check at the start of extraction.
461       .WillOnce(Return(now))
462       // Time check at the start of the first chunk of work.
463       .WillOnce(Return(now))
464       // Time check after the first 10 elements.
465       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
466       // Time check at the start of the second chunk of work.
467       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
468       // Time check after resuming iteration for the second chunk.
469       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(360)))
470       // Time check after the next 10 elements.  This is over the limit.
471       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
472       // A final time check for the histograms.
473       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
474
475   features.Clear();
476   EXPECT_FALSE(ExtractFeatures(&features));
477 }
478
479 IN_PROC_BROWSER_TEST_F(PhishingDOMFeatureExtractorTest, SubframeRemoval) {
480   // In this test, we'll advance the feature extractor so that it is positioned
481   // inside an iframe, and have it pause due to exceeding the chunk time limit.
482   // Then, prior to continuation, the iframe is removed from the document.
483   // As currently implemented, this should finish extraction from the removed
484   // iframe document.
485   responses_[GetURL("host.com", "frame.html").spec()] =
486       "<html><body><p><p><p><input type=password></body></html>";
487
488   base::TimeTicks now = base::TimeTicks::Now();
489   EXPECT_CALL(clock_, Now())
490       // Time check at the start of extraction.
491       .WillOnce(Return(now))
492       // Time check at the start of the first chunk of work.
493       .WillOnce(Return(now))
494       // Time check after the first 10 elements.  Enough time has passed
495       // to stop extraction.  Schedule the iframe removal to happen as soon as
496       // the feature extractor returns control to the message loop.
497       .WillOnce(DoAll(
498           Invoke(this, &PhishingDOMFeatureExtractorTest::ScheduleRemoveIframe),
499           Return(now + base::TimeDelta::FromMilliseconds(21))))
500       // Time check at the start of the second chunk of work.
501       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
502       // Time check after resuming iteration for the second chunk.
503       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(27)))
504       // A final time check for the histograms.
505       .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(33)));
506
507   FeatureMap expected_features;
508   expected_features.AddBooleanFeature(features::kPageHasForms);
509   expected_features.AddBooleanFeature(features::kPageHasPswdInputs);
510
511   FeatureMap features;
512   LoadHtml(
513       "host.com",
514       "<html><head></head><body>"
515       "<iframe src=\"frame.html\" id=\"frame1\"></iframe>"
516       "<form></form></body></html>");
517   ASSERT_TRUE(ExtractFeatures(&features));
518   ExpectFeatureMapsAreEqual(features, expected_features);
519 }
520
521 }  // namespace safe_browsing