Upstream version 10.39.225.0
[platform/framework/web/crosswalk.git] / src / components / dom_distiller / standalone / content_extractor.cc
index 760b165..8e83963 100644 (file)
@@ -9,19 +9,28 @@
 #include "base/message_loop/message_loop.h"
 #include "base/path_service.h"
 #include "base/run_loop.h"
+#include "base/strings/string_number_conversions.h"
+#include "base/strings/string_split.h"
 #include "components/dom_distiller/content/distiller_page_web_contents.h"
+#include "components/dom_distiller/core/article_entry.h"
+#include "components/dom_distiller/core/distilled_page_prefs.h"
 #include "components/dom_distiller/core/distiller.h"
-#include "components/dom_distiller/core/dom_distiller_database.h"
 #include "components/dom_distiller/core/dom_distiller_service.h"
 #include "components/dom_distiller/core/dom_distiller_store.h"
 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
 #include "components/dom_distiller/core/task_tracker.h"
+#include "components/leveldb_proto/proto_database.h"
+#include "components/leveldb_proto/proto_database_impl.h"
+#include "components/pref_registry/testing_pref_service_syncable.h"
 #include "content/public/browser/browser_context.h"
 #include "content/public/browser/browser_thread.h"
 #include "content/public/test/content_browser_test.h"
 #include "content/shell/browser/shell.h"
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
 #include "net/dns/mock_host_resolver.h"
+#include "third_party/dom_distiller_js/dom_distiller.pb.h"
 #include "ui/base/resource/resource_bundle.h"
 
 using content::ContentBrowserTest;
@@ -33,6 +42,9 @@ namespace {
 // The url to distill.
 const char* kUrlSwitch = "url";
 
+// A space-separated list of urls to distill.
+const char* kUrlsSwitch = "urls";
+
 // Indicates that DNS resolution should be disabled for this test.
 const char* kDisableDnsSwitch = "disable-dns";
 
@@ -43,6 +55,15 @@ const char* kOutputFile = "output-file";
 // output.
 const char* kShouldOutputBinary = "output-binary";
 
+// Indicates to output only the text of the article and not the enclosing html.
+const char* kExtractTextOnly = "extract-text-only";
+
+// Indicates to include debug output.
+const char* kDebugLevel = "debug-level";
+
+// Maximum number of concurrent started extractor requests.
+const int kMaxExtractorTasks = 8;
+
 scoped_ptr<DomDistillerService> CreateDomDistillerService(
     content::BrowserContext* context,
     const base::FilePath& db_path) {
@@ -52,22 +73,43 @@ scoped_ptr<DomDistillerService> CreateDomDistillerService(
 
   // TODO(cjhopman): use an in-memory database instead of an on-disk one with
   // temporary directory.
-  scoped_ptr<DomDistillerDatabase> db(
-      new DomDistillerDatabase(background_task_runner));
+  scoped_ptr<leveldb_proto::ProtoDatabaseImpl<ArticleEntry> > db(
+      new leveldb_proto::ProtoDatabaseImpl<ArticleEntry>(
+          background_task_runner));
   scoped_ptr<DomDistillerStore> dom_distiller_store(new DomDistillerStore(
-      db.PassAs<DomDistillerDatabaseInterface>(), db_path));
+      db.PassAs<leveldb_proto::ProtoDatabase<ArticleEntry> >(), db_path));
 
   scoped_ptr<DistillerPageFactory> distiller_page_factory(
       new DistillerPageWebContentsFactory(context));
   scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory(
       new DistillerURLFetcherFactory(context->GetRequestContext()));
+
+  dom_distiller::proto::DomDistillerOptions options;
+  if (base::CommandLine::ForCurrentProcess()->HasSwitch(kExtractTextOnly)) {
+    options.set_extract_text_only(true);
+  }
+  int debug_level = 0;
+  if (base::CommandLine::ForCurrentProcess()->HasSwitch(kDebugLevel) &&
+      base::StringToInt(
+          base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
+              kDebugLevel),
+          &debug_level)) {
+    options.set_debug_level(debug_level);
+  }
   scoped_ptr<DistillerFactory> distiller_factory(
-      new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass()));
+      new DistillerFactoryImpl(distiller_url_fetcher_factory.Pass(), options));
+
+  // Setting up PrefService for DistilledPagePrefs.
+  user_prefs::TestingPrefServiceSyncable* pref_service =
+      new user_prefs::TestingPrefServiceSyncable();
+  DistilledPagePrefs::RegisterProfilePrefs(pref_service->registry());
 
   return scoped_ptr<DomDistillerService>(new DomDistillerService(
       dom_distiller_store.PassAs<DomDistillerStoreInterface>(),
       distiller_factory.Pass(),
-      distiller_page_factory.Pass()));
+      distiller_page_factory.Pass(),
+      scoped_ptr<DistilledPagePrefs>(
+          new DistilledPagePrefs(pref_service))));
 }
 
 void AddComponentsResources() {
@@ -79,58 +121,79 @@ void AddComponentsResources() {
       pak_file, ui::SCALE_FACTOR_NONE);
 }
 
-void LogArticle(const DistilledArticleProto& article_proto) {
-  std::stringstream output;
-  if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
-    output << article_proto.SerializeAsString();
-  } else {
-    output << "Article Title: " << article_proto.title() << std::endl;
-    output << "# of pages: " << article_proto.pages_size() << std::endl;
-    for (int i = 0; i < article_proto.pages_size(); ++i) {
-      const DistilledPageProto& page = article_proto.pages(i);
-      output << "Page " << i << std::endl;
-      output << "URL: " << page.url() << std::endl;
-      output << "Content: " << page.html() << std::endl;
-    }
-  }
+bool WriteProtobufWithSize(
+    const google::protobuf::MessageLite& message,
+    google::protobuf::io::ZeroCopyOutputStream* output_stream) {
+  google::protobuf::io::CodedOutputStream coded_output(output_stream);
 
-  std::string data = output.str();
-  if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
-    base::FilePath filename =
-        CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
-    base::WriteFile(filename, data.c_str(), data.size());
-  } else {
-    VLOG(0) << data;
+  // Write the size.
+  const int size = message.ByteSize();
+  coded_output.WriteLittleEndian32(size);
+  message.SerializeWithCachedSizes(&coded_output);
+  return !coded_output.HadError();
+}
+
+std::string GetReadableArticleString(
+    const DistilledArticleProto& article_proto) {
+  std::stringstream output;
+  output << "Article Title: " << article_proto.title() << std::endl;
+  output << "# of pages: " << article_proto.pages_size() << std::endl;
+  for (int i = 0; i < article_proto.pages_size(); ++i) {
+    const DistilledPageProto& page = article_proto.pages(i);
+    output << "Page " << i << std::endl;
+    output << "URL: " << page.url() << std::endl;
+    output << "Content: " << page.html() << std::endl;
+    if (page.has_debug_info() && page.debug_info().has_log())
+      output << "Log: " << page.debug_info().log() << std::endl;
   }
+  return output.str();
 }
 
 }  // namespace
 
 class ContentExtractionRequest : public ViewRequestDelegate {
  public:
-  void Start(DomDistillerService* service, base::Closure finished_callback) {
+  void Start(DomDistillerService* service, const gfx::Size& render_view_size,
+             base::Closure finished_callback) {
     finished_callback_ = finished_callback;
     viewer_handle_ =
-        service->ViewUrl(this, service->CreateDefaultDistillerPage(), url_);
+        service->ViewUrl(this,
+                         service->CreateDefaultDistillerPage(render_view_size),
+                         url_);
   }
 
   DistilledArticleProto GetArticleCopy() {
     return *article_proto_;
   }
 
-  static scoped_ptr<ContentExtractionRequest> CreateForCommandLine(
+  static ScopedVector<ContentExtractionRequest> CreateForCommandLine(
       const CommandLine& command_line) {
-    GURL url;
+    ScopedVector<ContentExtractionRequest> requests;
     if (command_line.HasSwitch(kUrlSwitch)) {
+      GURL url;
       std::string url_string = command_line.GetSwitchValueASCII(kUrlSwitch);
       url = GURL(url_string);
+      if (url.is_valid()) {
+        requests.push_back(new ContentExtractionRequest(url));
+      }
+    } else if (command_line.HasSwitch(kUrlsSwitch)) {
+      std::string urls_string = command_line.GetSwitchValueASCII(kUrlsSwitch);
+      std::vector<std::string> urls;
+      base::SplitString(urls_string, ' ', &urls);
+      for (size_t i = 0; i < urls.size(); ++i) {
+        GURL url(urls[i]);
+        if (url.is_valid()) {
+          requests.push_back(new ContentExtractionRequest(url));
+        } else {
+          ADD_FAILURE() << "Bad url";
+        }
+      }
     }
-    if (!url.is_valid()) {
+    if (requests.empty()) {
       ADD_FAILURE() << "No valid url provided";
-      return scoped_ptr<ContentExtractionRequest>();
     }
-    return scoped_ptr<ContentExtractionRequest>(
-        new ContentExtractionRequest(url));
+
+    return requests.Pass();
   }
 
  private:
@@ -142,6 +205,7 @@ class ContentExtractionRequest : public ViewRequestDelegate {
   virtual void OnArticleReady(const DistilledArticleProto* article_proto)
       OVERRIDE {
     article_proto_ = article_proto;
+    CHECK(article_proto->pages_size()) << "Failed extracting " << url_;
     base::MessageLoop::current()->PostTask(
         FROM_HERE,
         finished_callback_);
@@ -154,6 +218,15 @@ class ContentExtractionRequest : public ViewRequestDelegate {
 };
 
 class ContentExtractor : public ContentBrowserTest {
+ public:
+  ContentExtractor()
+      : pending_tasks_(0),
+        max_tasks_(kMaxExtractorTasks),
+        next_request_(0),
+        output_data_(),
+        protobuf_output_stream_(
+            new google::protobuf::io::StringOutputStream(&output_data_)) {}
+
   // Change behavior of the default host resolver to avoid DNS lookup errors, so
   // we can make network calls.
   virtual void SetUpOnMainThread() OVERRIDE {
@@ -177,10 +250,19 @@ class ContentExtractor : public ContentBrowserTest {
     service_ = CreateDomDistillerService(context,
                                          db_dir_.path());
     const CommandLine& command_line = *CommandLine::ForCurrentProcess();
-    request_ = ContentExtractionRequest::CreateForCommandLine(command_line);
-    request_->Start(
-        service_.get(),
-        base::Bind(&ContentExtractor::Finish, base::Unretained(this)));
+    requests_ = ContentExtractionRequest::CreateForCommandLine(command_line);
+    PumpQueue();
+  }
+
+  void PumpQueue() {
+    while (pending_tasks_ < max_tasks_ && next_request_ < requests_.size()) {
+      requests_[next_request_]->Start(
+          service_.get(),
+          shell()->web_contents()->GetContainerBounds().size(),
+          base::Bind(&ContentExtractor::FinishRequest, base::Unretained(this)));
+      ++next_request_;
+      ++pending_tasks_;
+    }
   }
 
  private:
@@ -200,18 +282,55 @@ class ContentExtractor : public ContentBrowserTest {
     mock_host_resolver_override_.reset();
   }
 
+  void FinishRequest() {
+    --pending_tasks_;
+    if (next_request_ == requests_.size() && pending_tasks_ == 0) {
+      Finish();
+    } else {
+      PumpQueue();
+    }
+  }
+
+  void DoArticleOutput() {
+    for (size_t i = 0; i < requests_.size(); ++i) {
+      const DistilledArticleProto& article = requests_[i]->GetArticleCopy();
+      if (CommandLine::ForCurrentProcess()->HasSwitch(kShouldOutputBinary)) {
+        WriteProtobufWithSize(article, protobuf_output_stream_.get());
+      } else {
+        output_data_ += GetReadableArticleString(article) + "\n";
+      }
+    }
+
+    if (CommandLine::ForCurrentProcess()->HasSwitch(kOutputFile)) {
+      base::FilePath filename =
+          CommandLine::ForCurrentProcess()->GetSwitchValuePath(kOutputFile);
+      ASSERT_EQ(
+          (int)output_data_.size(),
+          base::WriteFile(filename, output_data_.c_str(), output_data_.size()));
+    } else {
+      VLOG(0) << output_data_;
+    }
+  }
+
   void Finish() {
-    LogArticle(request_->GetArticleCopy());
-    request_.reset();
+    DoArticleOutput();
+    requests_.clear();
     service_.reset();
     base::MessageLoop::current()->PostTask(
         FROM_HERE, base::MessageLoop::QuitWhenIdleClosure());
   }
 
+  size_t pending_tasks_;
+  size_t max_tasks_;
+  size_t next_request_;
+
   base::ScopedTempDir db_dir_;
   scoped_ptr<net::ScopedDefaultHostResolverProc> mock_host_resolver_override_;
   scoped_ptr<DomDistillerService> service_;
-  scoped_ptr<ContentExtractionRequest> request_;
+  ScopedVector<ContentExtractionRequest> requests_;
+
+  std::string output_data_;
+  scoped_ptr<google::protobuf::io::StringOutputStream> protobuf_output_stream_;
 };
 
 IN_PROC_BROWSER_TEST_F(ContentExtractor, MANUAL_ExtractUrl) {