Upstream version 10.39.225.0
[platform/framework/web/crosswalk.git] / src / components / dom_distiller / core / distiller.cc
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/dom_distiller/core/distiller.h"
6
7 #include <map>
8 #include <vector>
9
10 #include "base/auto_reset.h"
11 #include "base/bind.h"
12 #include "base/callback.h"
13 #include "base/location.h"
14 #include "base/message_loop/message_loop.h"
15 #include "base/strings/string_number_conversions.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/values.h"
18 #include "components/dom_distiller/core/distiller_page.h"
19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
22 #include "net/url_request/url_request_context_getter.h"
23
namespace {

// Default upper bound on how many pages a single article may contain. It
// seeds DistillerImpl::max_pages_in_article_.
const size_t kMaxPagesInArticle = 32;

}  // namespace
28
29 namespace dom_distiller {
30
31 DistillerFactoryImpl::DistillerFactoryImpl(
32     scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
33     const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
34     : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()),
35       dom_distiller_options_(dom_distiller_options) {
36 }
37
38 DistillerFactoryImpl::~DistillerFactoryImpl() {}
39
40 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
41   scoped_ptr<DistillerImpl> distiller(new DistillerImpl(
42       *distiller_url_fetcher_factory_, dom_distiller_options_));
43   return distiller.PassAs<Distiller>();
44 }
45
46 DistillerImpl::DistilledPageData::DistilledPageData() {}
47
48 DistillerImpl::DistilledPageData::~DistilledPageData() {}
49
50 DistillerImpl::DistillerImpl(
51     const DistillerURLFetcherFactory& distiller_url_fetcher_factory,
52     const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
53     : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
54       dom_distiller_options_(dom_distiller_options),
55       max_pages_in_article_(kMaxPagesInArticle),
56       destruction_allowed_(true),
57       weak_factory_(this) {
58 }
59
60 DistillerImpl::~DistillerImpl() {
61   DCHECK(destruction_allowed_);
62 }
63
64 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
65   max_pages_in_article_ = max_num_pages;
66 }
67
68 bool DistillerImpl::AreAllPagesFinished() const {
69   return started_pages_index_.empty() && waiting_pages_.empty();
70 }
71
72 size_t DistillerImpl::TotalPageCount() const {
73   return waiting_pages_.size() + started_pages_index_.size() +
74          finished_pages_index_.size();
75 }
76
77 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
78   if (!IsPageNumberInUse(page_num) && url.is_valid() &&
79       TotalPageCount() < max_pages_in_article_ &&
80       seen_urls_.find(url.spec()) == seen_urls_.end()) {
81     waiting_pages_[page_num] = url;
82   }
83 }
84
85 bool DistillerImpl::IsPageNumberInUse(int page_num) const {
86   return waiting_pages_.find(page_num) != waiting_pages_.end() ||
87          started_pages_index_.find(page_num) != started_pages_index_.end() ||
88          finished_pages_index_.find(page_num) != finished_pages_index_.end();
89 }
90
91 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index)
92     const {
93   DCHECK_LT(index, pages_.size());
94   DistilledPageData* page_data = pages_[index];
95   DCHECK(page_data);
96   return page_data;
97 }
98
99 void DistillerImpl::DistillPage(const GURL& url,
100                                 scoped_ptr<DistillerPage> distiller_page,
101                                 const DistillationFinishedCallback& finished_cb,
102                                 const DistillationUpdateCallback& update_cb) {
103   DCHECK(AreAllPagesFinished());
104   distiller_page_ = distiller_page.Pass();
105   finished_cb_ = finished_cb;
106   update_cb_ = update_cb;
107
108   AddToDistillationQueue(0, url);
109   DistillNextPage();
110 }
111
112 void DistillerImpl::DistillNextPage() {
113   if (!waiting_pages_.empty()) {
114     std::map<int, GURL>::iterator front = waiting_pages_.begin();
115     int page_num = front->first;
116     const GURL url = front->second;
117
118     waiting_pages_.erase(front);
119     DCHECK(url.is_valid());
120     DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
121     DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
122     seen_urls_.insert(url.spec());
123     pages_.push_back(new DistilledPageData());
124     started_pages_index_[page_num] = pages_.size() - 1;
125     distiller_page_->DistillPage(
126         url,
127         dom_distiller_options_,
128         base::Bind(&DistillerImpl::OnPageDistillationFinished,
129                    weak_factory_.GetWeakPtr(),
130                    page_num,
131                    url));
132   }
133 }
134
135 void DistillerImpl::OnPageDistillationFinished(
136     int page_num,
137     const GURL& page_url,
138     scoped_ptr<proto::DomDistillerResult> distiller_result,
139     bool distillation_successful) {
140   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
141   if (distillation_successful) {
142     DCHECK(distiller_result.get());
143     DistilledPageData* page_data =
144         GetPageAtIndex(started_pages_index_[page_num]);
145     page_data->distilled_page_proto =
146         new base::RefCountedData<DistilledPageProto>();
147     page_data->page_num = page_num;
148     if (distiller_result->has_title()) {
149       page_data->distilled_page_proto->data.set_title(
150           distiller_result->title());
151     }
152     page_data->distilled_page_proto->data.set_url(page_url.spec());
153     if (distiller_result->has_distilled_content() &&
154         distiller_result->distilled_content().has_html()) {
155       page_data->distilled_page_proto->data.set_html(
156           distiller_result->distilled_content().html());
157     }
158     if (distiller_result->has_debug_info() &&
159         distiller_result->debug_info().has_log()) {
160       page_data->distilled_page_proto->data.mutable_debug_info()->set_log(
161           distiller_result->debug_info().log());
162     }
163
164     if (distiller_result->has_pagination_info()) {
165       proto::PaginationInfo pagination_info =
166           distiller_result->pagination_info();
167       if (pagination_info.has_next_page()) {
168         GURL next_page_url(pagination_info.next_page());
169         if (next_page_url.is_valid()) {
170           // The pages should be in same origin.
171           DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
172           AddToDistillationQueue(page_num + 1, next_page_url);
173         }
174       }
175
176       if (pagination_info.has_prev_page()) {
177         GURL prev_page_url(pagination_info.prev_page());
178         if (prev_page_url.is_valid()) {
179           DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
180           AddToDistillationQueue(page_num - 1, prev_page_url);
181         }
182       }
183     }
184
185     for (int img_num = 0; img_num < distiller_result->image_urls_size();
186          ++img_num) {
187       std::string image_id =
188           base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
189       FetchImage(page_num, image_id, distiller_result->image_urls(img_num));
190     }
191
192     AddPageIfDone(page_num);
193     DistillNextPage();
194   } else {
195     started_pages_index_.erase(page_num);
196     RunDistillerCallbackIfDone();
197   }
198 }
199
200 void DistillerImpl::FetchImage(int page_num,
201                                const std::string& image_id,
202                                const std::string& item) {
203   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
204   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
205   DistillerURLFetcher* fetcher =
206       distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
207   page_data->image_fetchers_.push_back(fetcher);
208
209   fetcher->FetchURL(item,
210                     base::Bind(&DistillerImpl::OnFetchImageDone,
211                                weak_factory_.GetWeakPtr(),
212                                page_num,
213                                base::Unretained(fetcher),
214                                image_id));
215 }
216
217 void DistillerImpl::OnFetchImageDone(int page_num,
218                                      DistillerURLFetcher* url_fetcher,
219                                      const std::string& id,
220                                      const std::string& response) {
221   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
222   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
223   DCHECK(page_data->distilled_page_proto.get());
224   DCHECK(url_fetcher);
225   ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
226       std::find(page_data->image_fetchers_.begin(),
227                 page_data->image_fetchers_.end(),
228                 url_fetcher);
229
230   DCHECK(fetcher_it != page_data->image_fetchers_.end());
231   // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
232   // callback is invoked by the |url_fetcher|.
233   page_data->image_fetchers_.weak_erase(fetcher_it);
234   base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
235
236   DistilledPageProto_Image* image =
237       page_data->distilled_page_proto->data.add_image();
238   image->set_name(id);
239   image->set_data(response);
240
241   AddPageIfDone(page_num);
242 }
243
244 void DistillerImpl::AddPageIfDone(int page_num) {
245   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
246   DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
247   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
248   if (page_data->image_fetchers_.empty()) {
249     finished_pages_index_[page_num] = started_pages_index_[page_num];
250     started_pages_index_.erase(page_num);
251     const ArticleDistillationUpdate& article_update =
252         CreateDistillationUpdate();
253     DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size());
254     update_cb_.Run(article_update);
255     RunDistillerCallbackIfDone();
256   }
257 }
258
259 const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate()
260     const {
261   bool has_prev_page = false;
262   bool has_next_page = false;
263   if (!finished_pages_index_.empty()) {
264     int prev_page_num = finished_pages_index_.begin()->first - 1;
265     int next_page_num = finished_pages_index_.rbegin()->first + 1;
266     has_prev_page = IsPageNumberInUse(prev_page_num);
267     has_next_page = IsPageNumberInUse(next_page_num);
268   }
269
270   std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto> >
271       update_pages;
272   for (std::map<int, size_t>::const_iterator it = finished_pages_index_.begin();
273        it != finished_pages_index_.end();
274        ++it) {
275     update_pages.push_back(pages_[it->second]->distilled_page_proto);
276   }
277   return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page);
278 }
279
280 void DistillerImpl::RunDistillerCallbackIfDone() {
281   DCHECK(!finished_cb_.is_null());
282   if (AreAllPagesFinished()) {
283     bool first_page = true;
284     scoped_ptr<DistilledArticleProto> article_proto(
285         new DistilledArticleProto());
286     // Stitch the pages back into the article.
287     for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
288          it != finished_pages_index_.end();) {
289       DistilledPageData* page_data = GetPageAtIndex(it->second);
290       *(article_proto->add_pages()) = page_data->distilled_page_proto->data;
291
292       if (first_page) {
293         article_proto->set_title(page_data->distilled_page_proto->data.title());
294         first_page = false;
295       }
296
297       finished_pages_index_.erase(it++);
298     }
299
300     pages_.clear();
301     DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
302               max_pages_in_article_);
303
304     DCHECK(pages_.empty());
305     DCHECK(finished_pages_index_.empty());
306
307     base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
308                                                        false);
309     finished_cb_.Run(article_proto.Pass());
310     finished_cb_.Reset();
311   }
312 }
313
314 }  // namespace dom_distiller