89446e21ab013c52bc472c353f1a8d7d9165fe92
[platform/framework/web/crosswalk.git] / src / components / dom_distiller / standalone / content_extractor.cc
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <sstream>
6
7 #include "base/command_line.h"
8 #include "base/files/scoped_temp_dir.h"
9 #include "base/message_loop/message_loop.h"
10 #include "base/path_service.h"
11 #include "base/run_loop.h"
12 #include "base/strings/string_number_conversions.h"
13 #include "base/strings/string_split.h"
14 #include "components/dom_distiller/content/distiller_page_web_contents.h"
15 #include "components/dom_distiller/core/article_entry.h"
16 #include "components/dom_distiller/core/distilled_page_prefs.h"
17 #include "components/dom_distiller/core/distiller.h"
18 #include "components/dom_distiller/core/dom_distiller_service.h"
19 #include "components/dom_distiller/core/dom_distiller_store.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
22 #include "components/dom_distiller/core/task_tracker.h"
23 #include "components/leveldb_proto/proto_database.h"
24 #include "components/leveldb_proto/proto_database_impl.h"
25 #include "components/pref_registry/testing_pref_service_syncable.h"
26 #include "content/public/browser/browser_context.h"
27 #include "content/public/browser/browser_thread.h"
28 #include "content/public/test/content_browser_test.h"
29 #include "content/shell/browser/shell.h"
30 #include "google/protobuf/io/coded_stream.h"
31 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
32 #include "net/dns/mock_host_resolver.h"
33 #include "third_party/dom_distiller_js/dom_distiller.pb.h"
34 #include "ui/base/resource/resource_bundle.h"
35
36 using content::ContentBrowserTest;
37
38 namespace dom_distiller {
39
40 namespace {
41
// Command-line switch names understood by the extractor. They are declared as
// constant arrays (rather than mutable pointers to string literals) so that
// neither the text nor the binding can be changed at run time.

// The url to distill.
const char kUrlSwitch[] = "url";

// A space-separated list of urls to distill.
const char kUrlsSwitch[] = "urls";

// Indicates that DNS resolution should be disabled for this test.
const char kDisableDnsSwitch[] = "disable-dns";

// Will write the distilled output to the given file instead of to stdout.
const char kOutputFile[] = "output-file";

// Indicates to output a serialized protocol buffer instead of human-readable
// output.
const char kShouldOutputBinary[] = "output-binary";

// Indicates to output only the text of the article and not the enclosing html.
const char kExtractTextOnly[] = "extract-text-only";

// Sets the debug level passed to the distiller options; the value must parse
// as an integer.
const char kDebugLevel[] = "debug-level";

// Maximum number of concurrent started extractor requests.
const int kMaxExtractorTasks = 8;
66
67 scoped_ptr<DomDistillerService> CreateDomDistillerService(
68     content::BrowserContext* context,
69     const base::FilePath& db_path) {
70   scoped_refptr<base::SequencedTaskRunner> background_task_runner =
71       content::BrowserThread::GetBlockingPool()->GetSequencedTaskRunner(
72           content::BrowserThread::GetBlockingPool()->GetSequenceToken());
73
74   // TODO(cjhopman): use an in-memory database instead of an on-disk one with
75   // temporary directory.
76   scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
77       new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
78           background_task_runner));
79   scoped_ptr<DomDistillerStore> dom_distiller_store(new DomDistillerStore(
80       db.PassAs<leveldb_proto::ProtoDatabase<ArticleEntry> >(), db_path));
81
82   scoped_ptr<DistillerPageFactory> distiller_page_factory(
83       new DistillerPageWebContentsFactory(context));
84   scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
85       new DistillerURLFetcherFactory(context->GetRequestContext()));
86
87   dom_distiller::proto::DomDistillerOptions options;
88   if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) {
89     options.set_extract_text_only(true);
90   }
91   int debug_level = 0;
92   if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
93       base::StringToInt(
94           base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
95               kDebugLevel),
96           &debug_level)) {
97     options.set_debug_level(debug_level);
98   }
99   scoped_ptr<DistillerFactory> distiller_factory(
100       new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options));
101
102   // Setting up PrefService for DistilledPagePrefs.
103   user_prefs::TestingPrefServiceSyncable* pref_service =
104       new user_prefs::TestingPrefServiceSyncable();
105   DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
106
107   return scoped_ptr<DomDistillerService>(new DomDistillerService(
108       dom_distiller_store.PassAs<DomDistillerStoreInterface>(),
109       distiller_factory.Pass(),
110       distiller_page_factory.Pass(),
111       scoped_ptr<DistilledPagePrefs>(
112           new DistilledPagePrefs(pref_service))));
113 }
114
115 void AddComponentsResources() {
116   base::FilePath pak_file;
117   base::FilePath pak_dir;
118   PathService::Get(base::DIR_MODULE, &pak_dir);
119   pak_file = pak_dir.Append(FILE_PATH_LITERAL("components_resources.pak"));
120   ui::ResourceBundle::GetSharedInstance().AddDataPackFromPath(
121       pak_file, ui::SCALE_FACTOR_NONE);
122 }
123
124 bool WriteProtobufWithSize(
125     const google::protobuf::MessageLite& message,
126     google::protobuf::io::ZeroCopyOutputStream* output_stream) {
127   google::protobuf::io::CodedOutputStream coded_output(output_stream);
128
129   // Write the size.
130   const int size = message.ByteSize();
131   coded_output.WriteLittleEndian32(size);
132   message.SerializeWithCachedSizes(&coded_output);
133   return !coded_output.HadError();
134 }
135
136 std::string GetReadableArticleString(
137     const DistilledArticleProto& article_proto) {
138   std::stringstream output;
139   output << "Article Title: " << article_proto.title() << std::endl;
140   output << "# of pages: " << article_proto.pages_size() << std::endl;
141   for (int i = 0; i < article_proto.pages_size(); ++i) {
142     const DistilledPageProto& page = article_proto.pages(i);
143     output << "Page " << i << std::endl;
144     output << "URL: " << page.url() << std::endl;
145     output << "Content: " << page.html() << std::endl;
146   }
147   return output.str();
148 }
149
150 }  // namespace
151
152 class ContentExtractionRequest : public ViewRequestDelegate {
153  public:
154   void Start(DomDistillerService* service, const gfx::Size& render_view_size,
155              base::Closure finished_callback) {
156     finished_callback_ = finished_callback;
157     viewer_handle_ =
158         service->ViewUrl(this,
159                          service->CreateDefaultDistillerPage(render_view_size),
160                          url_);
161   }
162
163   DistilledArticleProto GetArticleCopy() {
164     return *article_proto_;
165   }
166
167   static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
168       const CommandLine& command_line) {
169     ScopedVector<ContentExtractionRequest> requests;
170     if (command_line.HasSwitch(kUrlSwitch)) {
171       GURL url;
172       std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
173       url = GURL(url_string);
174       if (url.is_valid()) {
175         requests.push_back(new ContentExtractionRequest(url));
176       }
177     } else if (command_line.HasSwitch(kUrlsSwitch)) {
178       std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
179       std::vector<std::string> urls;
180       base::SplitString(urls_string, ' ', &urls);
181       for (size_t i = 0; i < urls.size(); ++i) {
182         GURL url(urls[i]);
183         if (url.is_valid()) {
184           requests.push_back(new ContentExtractionRequest(url));
185         } else {
186           ADD_FAILURE() << "Bad url";
187         }
188       }
189     }
190     if (requests.empty()) {
191       ADD_FAILURE() << "No valid url provided";
192     }
193
194     return requests.Pass();
195   }
196
197  private:
198   ContentExtractionRequest(const GURL& url) : url_(url) {}
199
200   virtual void OnArticleUpdated(ArticleDistillationUpdate article_update)
201       OVERRIDE {}
202
203   virtual void OnArticleReady(const DistilledArticleProto* article_proto)
204       OVERRIDE {
205     article_proto_ = article_proto;
206     base::MessageLoop::current()->PostTask(
207         FROM_HERE,
208         finished_callback_);
209   }
210
211   const DistilledArticleProto* article_proto_;
212   scoped_ptr<ViewerHandle> viewer_handle_;
213   GURL url_;
214   base::Closure finished_callback_;
215 };
216
217 class ContentExtractor : public ContentBrowserTest {
218  public:
219   ContentExtractor()
220       : pending_tasks_(0),
221         max_tasks_(kMaxExtractorTasks),
222         next_request_(0),
223         output_data_(),
224         protobuf_output_stream_(
225             new google::protobuf::io::StringOutputStream(&output_data_)) {}
226
227   // Change behavior of the default host resolver to avoid DNS lookup errors, so
228   // we can make network calls.
229   virtual void SetUpOnMainThread() OVERRIDE {
230     if (!CommandLine::ForCurrentProcess()->HasSwitch(kDisableDnsSwitch)) {
231       EnableDNSLookupForThisTest();
232     }
233     CHECK(db_dir_.CreateUniqueTempDir());
234     AddComponentsResources();
235   }
236
237   virtual void TearDownOnMainThread() OVERRIDE {
238     DisableDNSLookupForThisTest();
239   }
240
241  protected:
242   // Creates the DomDistillerService and creates and starts the extraction
243   // request.
244   void Start() {
245     content::BrowserContext* context =
246         shell()->web_contents()->GetBrowserContext();
247     service_ = CreateDomDistillerService(context,
248                                          db_dir_.path());
249     const CommandLine& command_line = *CommandLine::ForCurrentProcess();
250     requests_ = ContentExtractionRequest::CreateForCommandLine(command_line);
251     PumpQueue();
252   }
253
254   void PumpQueue() {
255     while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
256       requests_[next_request_]->Start(
257           service_.get(),
258           shell()->web_contents()->GetContainerBounds().size(),
259           base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
260       ++next_request_;
261       ++pending_tasks_;
262     }
263   }
264
265  private:
266   // Change behavior of the default host resolver to allow DNS lookup
267   // to proceed instead of being blocked by the test infrastructure.
268   void EnableDNSLookupForThisTest() {
269     // mock_host_resolver_override_ takes ownership of the resolver.
270     scoped_refptr<net::RuleBasedHostResolverProc> resolver =
271         new net::RuleBasedHostResolverProc(host_resolver());
272     resolver->AllowDirectLookup("*");
273     mock_host_resolver_override_.reset(
274         new net::ScopedDefaultHostResolverProc(resolver.get()));
275   }
276
277   // We need to reset the DNS lookup when we finish, or the test will fail.
278   void DisableDNSLookupForThisTest() {
279     mock_host_resolver_override_.reset();
280   }
281
282   void FinishRequest() {
283     --pending_tasks_;
284     if (next_request_ == requests_.size() && pending_tasks_ == 0) {
285       Finish();
286     } else {
287       PumpQueue();
288     }
289   }
290
291   void DoArticleOutput() {
292     for (size_t i = 0; i < requests_.size(); ++i) {
293       const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
294       if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
295         WriteProtobufWithSize(article, protobuf_output_stream_.get());
296       } else {
297         output_data_ += GetReadableArticleString(article) + "\n";
298       }
299     }
300
301     if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
302       base::FilePath filename =
303           CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
304       ASSERT_EQ(
305           (int)output_data_.size(),
306           base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
307     } else {
308       VLOG(0) << output_data_;
309     }
310   }
311
312   void Finish() {
313     DoArticleOutput();
314     requests_.clear();
315     service_.reset();
316     base::MessageLoop::current()->PostTask(
317         FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
318   }
319
320   size_t pending_tasks_;
321   size_t max_tasks_;
322   size_t next_request_;
323
324   base::ScopedTempDir db_dir_;
325   scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
326   scoped_ptr<DomDistillerService> service_;
327   ScopedVector<ContentExtractionRequest> requests_;
328
329   std::string output_data_;
330   scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
331 };
332
333 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {
334   Start();
335   base::RunLoop().Run();
336 }
337
338 }  // namespace dom_distiller