Upstream version 7.36.149.0
[platform/framework/web/crosswalk.git] / src / components / dom_distiller / standalone / content_extractor.cc
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <sstream>
6
7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h"
9 #include "base/message_loop/message_loop.h"
10 #include "base/path_service.h"
11 #include "base/run_loop.h"
12 #include "components/dom_distiller/content/distiller_page_web_contents.h"
13 #include "components/dom_distiller/core/distiller.h"
14 #include "components/dom_distiller/core/dom_distiller_database.h"
15 #include "components/dom_distiller/core/dom_distiller_service.h"
16 #include "components/dom_distiller/core/dom_distiller_store.h"
17 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
18 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
19 #include "components/dom_distiller/core/task_tracker.h"
20 #include "content/public/browser/browser_context.h"
21 #include "content/public/browser/browser_thread.h"
22 #include "content/public/test/content_browser_test.h"
23 #include "content/shell/browser/shell.h"
24 #include "net/dns/mock_host_resolver.h"
25 #include "ui/base/resource/resource_bundle.h"
26
27 using content::ContentBrowserTest;
28
29 namespace dom_distiller {
30
31 namespace {
32
// Command-line switch whose value is the url to distill.
const char kUrlSwitch[] = "url";

// Command-line switch indicating that DNS resolution should be disabled for
// this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Command-line switch whose value is a file path. When present, the distilled
// output is written to that file instead of being logged.
const char kOutputFile[] = "output-file";

// Command-line switch indicating to output a serialized protocol buffer
// instead of human-readable output.
const char kShouldOutputBinary[] = "output-binary";
45
46 scoped_ptr<DomDistillerService> CreateDomDistillerService(
47     content::BrowserContext* context,
48     const base::FilePath& db_path) {
49   scoped_refptr<base::SequencedTaskRunner> background_task_runner =
50       content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
51           content::BrowserThread::GetBlockingPool()->GetSequenceToken());
52
53   // TODO(cjhopman): use an in-memory database instead of an on-disk one with
54   // temporary directory.
55   scoped_ptr<DomDistillerDatabase> db(
56       new DomDistillerDatabase(background_task_runner));
57   scoped_ptr<DomDistillerStore> dom_distiller_store(new DomDistillerStore(
58       db.PassAs<DomDistillerDatabaseInterface>(), db_path));
59
60   scoped_ptr<DistillerPageFactory> distiller_page_factory(
61       new DistillerPageWebContentsFactory(context));
62   scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
63       new DistillerURLFetcherFactory(context->GetRequestContext()));
64   scoped_ptr<DistillerFactory> distiller_factory(
65       new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass()));
66
67   return scoped_ptr<DomDistillerService>(new DomDistillerService(
68       dom_distiller_store.PassAs<DomDistillerStoreInterface>(),
69       distiller_factory.Pass(),
70       distiller_page_factory.Pass()));
71 }
72
73 void AddComponentsResources() {
74   base::FilePath pak_file;
75   base::FilePath pak_dir;
76   PathService::Get(base::DIR_MODULE, &pak_dir);
77   pak_file = pak_dir.Append(FILE_PATH_LITERAL("components_resources.pak"));
78   ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
79       pak_file, ui::SCALE_FACTOR_NONE);
80 }
81
82 void LogArticle(const DistilledArticleProto& article_proto) {
83   std::stringstream output;
84   if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
85     output << article_proto.SerializeAsString();
86   } else {
87     output << "Article Title: " << article_proto.title() << std::endl;
88     output << "# of pages: " << article_proto.pages_size() << std::endl;
89     for (int i = 0; i < article_proto.pages_size(); ++i) {
90       const DistilledPageProto& page = article_proto.pages(i);
91       output << "Page " << i << std::endl;
92       output << "URL: " << page.url() << std::endl;
93       output << "Content: " << page.html() << std::endl;
94     }
95   }
96
97   std::string data = output.str();
98   if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
99     base::FilePath filename =
100         CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
101     base::WriteFile(filename, data.c_str(), data.size());
102   } else {
103     VLOG(0) << data;
104   }
105 }
106
107 }  // namespace
108
109 class ContentExtractionRequest : public ViewRequestDelegate {
110  public:
111   void Start(DomDistillerService* service, base::Closure finished_callback) {
112     finished_callback_ = finished_callback;
113     viewer_handle_ =
114         service->ViewUrl(this, service->CreateDefaultDistillerPage(), url_);
115   }
116
117   DistilledArticleProto GetArticleCopy() {
118     return *article_proto_;
119   }
120
121   static scoped_ptr<ContentExtractionRequest> CreateForCommandLine(
122       const CommandLine& command_line) {
123     GURL url;
124     if (command_line.HasSwitch(kUrlSwitch)) {
125       std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
126       url = GURL(url_string);
127     }
128     if (!url.is_valid()) {
129       ADD_FAILURE() << "No valid url provided";
130       return scoped_ptr<ContentExtractionRequest>();
131     }
132     return scoped_ptr<ContentExtractionRequest>(
133         new ContentExtractionRequest(url));
134   }
135
136  private:
137   ContentExtractionRequest(const GURL& url) : url_(url) {}
138
139   virtual void OnArticleUpdated(ArticleDistillationUpdate article_update)
140       OVERRIDE {}
141
142   virtual void OnArticleReady(const DistilledArticleProto* article_proto)
143       OVERRIDE {
144     article_proto_ = article_proto;
145     base::MessageLoop::current()->PostTask(
146         FROM_HERE,
147         finished_callback_);
148   }
149
150   const DistilledArticleProto* article_proto_;
151   scoped_ptr<ViewerHandle> viewer_handle_;
152   GURL url_;
153   base::Closure finished_callback_;
154 };
155
156 class ContentExtractor : public ContentBrowserTest {
157   // Change behavior of the default host resolver to avoid DNS lookup errors, so
158   // we can make network calls.
159   virtual void SetUpOnMainThread() OVERRIDE {
160     if (!CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
161       EnableDNSLookupForThisTest();
162     }
163     CHECK(db_dir_.CreateUniqueTempDir());
164     AddComponentsResources();
165   }
166
167   virtual void TearDownOnMainThread() OVERRIDE {
168     DisableDNSLookupForThisTest();
169   }
170
171  protected:
172   // Creates the DomDistillerService and creates and starts the extraction
173   // request.
174   void Start() {
175     content::BrowserContext* context =
176         shell()->web_contents()->GetBrowserContext();
177     service_ = CreateDomDistillerService(context,
178                                          db_dir_.path());
179     const CommandLine& command_line = *CommandLine::ForCurrentProcess();
180     request_ = ContentExtractionRequest::CreateForCommandLine(command_line);
181     request_->Start(
182         service_.get(),
183         base::Bind(&ContentExtractor::Finish, base::Unretained(this)));
184   }
185
186  private:
187   // Change behavior of the default host resolver to allow DNS lookup
188   // to proceed instead of being blocked by the test infrastructure.
189   void EnableDNSLookupForThisTest() {
190     // mock_host_resolver_override_ takes ownership of the resolver.
191     scoped_refptr<net::RuleBasedHostResolverProc> resolver =
192         new net::RuleBasedHostResolverProc(host_resolver());
193     resolver->AllowDirectLookup("*");
194     mock_host_resolver_override_.reset(
195         new net::ScopedDefaultHostResolverProc(resolver.get()));
196   }
197
198   // We need to reset the DNS lookup when we finish, or the test will fail.
199   void DisableDNSLookupForThisTest() {
200     mock_host_resolver_override_.reset();
201   }
202
203   void Finish() {
204     LogArticle(request_->GetArticleCopy());
205     request_.reset();
206     service_.reset();
207     base::MessageLoop::current()->PostTask(
208         FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
209   }
210
211   base::ScopedTempDir db_dir_;
212   scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
213   scoped_ptr<DomDistillerService> service_;
214   scoped_ptr<ContentExtractionRequest> request_;
215 };
216
217 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
218   Start();
219   base::RunLoop().Run();
220 }
221
222 }  // namespace dom_distiller