Upstream version 7.36.149.0
[platform/framework/web/crosswalk.git] / src / components / dom_distiller / core / distiller.cc
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/dom_distiller/core/distiller.h"
6
7 #include <map>
8 #include <vector>
9
10 #include "base/auto_reset.h"
11 #include "base/bind.h"
12 #include "base/callback.h"
13 #include "base/location.h"
14 #include "base/message_loop/message_loop.h"
15 #include "base/strings/string_number_conversions.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/values.h"
18 #include "components/dom_distiller/core/distiller_page.h"
19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
22 #include "net/url_request/url_request_context_getter.h"
23
namespace {

// Default upper bound on how many distilled pages a single article may hold.
// DistillerImpl copies this into |max_pages_in_article_| at construction.
const size_t kMaxPagesInArticle = 32;

}  // namespace
28
29 namespace dom_distiller {
30
31 DistillerFactoryImpl::DistillerFactoryImpl(
32     scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory)
33     : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {
34 }
35
36 DistillerFactoryImpl::~DistillerFactoryImpl() {}
37
38 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
39   scoped_ptr<DistillerImpl> distiller(
40       new DistillerImpl(*distiller_url_fetcher_factory_));
41   return distiller.PassAs<Distiller>();
42 }
43
44 DistillerImpl::DistilledPageData::DistilledPageData() {}
45
46 DistillerImpl::DistilledPageData::~DistilledPageData() {}
47
48 DistillerImpl::DistillerImpl(
49     const DistillerURLFetcherFactory& distiller_url_fetcher_factory)
50     : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
51       max_pages_in_article_(kMaxPagesInArticle),
52       destruction_allowed_(true),
53       weak_factory_(this) {
54 }
55
56 DistillerImpl::~DistillerImpl() {
57   DCHECK(destruction_allowed_);
58 }
59
60 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
61   max_pages_in_article_ = max_num_pages;
62 }
63
64 bool DistillerImpl::AreAllPagesFinished() const {
65   return started_pages_index_.empty() && waiting_pages_.empty();
66 }
67
68 size_t DistillerImpl::TotalPageCount() const {
69   return waiting_pages_.size() + started_pages_index_.size() +
70          finished_pages_index_.size();
71 }
72
73 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
74   if (!IsPageNumberInUse(page_num) && url.is_valid() &&
75       TotalPageCount() < max_pages_in_article_ &&
76       seen_urls_.find(url.spec()) == seen_urls_.end()) {
77     waiting_pages_[page_num] = url;
78   }
79 }
80
81 bool DistillerImpl::IsPageNumberInUse(int page_num) const {
82   return waiting_pages_.find(page_num) != waiting_pages_.end() ||
83          started_pages_index_.find(page_num) != started_pages_index_.end() ||
84          finished_pages_index_.find(page_num) != finished_pages_index_.end();
85 }
86
87 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index)
88     const {
89   DCHECK_LT(index, pages_.size());
90   DistilledPageData* page_data = pages_[index];
91   DCHECK(page_data);
92   return page_data;
93 }
94
95 void DistillerImpl::DistillPage(const GURL& url,
96                                 scoped_ptr<DistillerPage> distiller_page,
97                                 const DistillationFinishedCallback& finished_cb,
98                                 const DistillationUpdateCallback& update_cb) {
99   DCHECK(AreAllPagesFinished());
100   distiller_page_ = distiller_page.Pass();
101   finished_cb_ = finished_cb;
102   update_cb_ = update_cb;
103
104   AddToDistillationQueue(0, url);
105   DistillNextPage();
106 }
107
108 void DistillerImpl::DistillNextPage() {
109   if (!waiting_pages_.empty()) {
110     std::map<int, GURL>::iterator front = waiting_pages_.begin();
111     int page_num = front->first;
112     const GURL url = front->second;
113
114     waiting_pages_.erase(front);
115     DCHECK(url.is_valid());
116     DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
117     DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
118     seen_urls_.insert(url.spec());
119     pages_.push_back(new DistilledPageData());
120     started_pages_index_[page_num] = pages_.size() - 1;
121     distiller_page_->DistillPage(
122         url,
123         base::Bind(&DistillerImpl::OnPageDistillationFinished,
124                    weak_factory_.GetWeakPtr(),
125                    page_num,
126                    url));
127   }
128 }
129
130 void DistillerImpl::OnPageDistillationFinished(
131     int page_num,
132     const GURL& page_url,
133     scoped_ptr<DistilledPageInfo> distilled_page,
134     bool distillation_successful) {
135   DCHECK(distilled_page.get());
136   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
137   if (distillation_successful) {
138     DistilledPageData* page_data =
139         GetPageAtIndex(started_pages_index_[page_num]);
140     page_data->distilled_page_proto =
141         new base::RefCountedData<DistilledPageProto>();
142     page_data->page_num = page_num;
143     page_data->title = distilled_page->title;
144
145     page_data->distilled_page_proto->data.set_url(page_url.spec());
146     page_data->distilled_page_proto->data.set_html(distilled_page->html);
147
148     GURL next_page_url(distilled_page->next_page_url);
149     if (next_page_url.is_valid()) {
150       // The pages should be in same origin.
151       DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
152       AddToDistillationQueue(page_num + 1, next_page_url);
153     }
154
155     GURL prev_page_url(distilled_page->prev_page_url);
156     if (prev_page_url.is_valid()) {
157       DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
158       AddToDistillationQueue(page_num - 1, prev_page_url);
159     }
160
161     for (size_t img_num = 0; img_num < distilled_page->image_urls.size();
162          ++img_num) {
163       std::string image_id =
164           base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
165       FetchImage(page_num, image_id, distilled_page->image_urls[img_num]);
166     }
167
168     AddPageIfDone(page_num);
169     DistillNextPage();
170   } else {
171     started_pages_index_.erase(page_num);
172     RunDistillerCallbackIfDone();
173   }
174 }
175
176 void DistillerImpl::FetchImage(int page_num,
177                                const std::string& image_id,
178                                const std::string& item) {
179   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
180   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
181   DistillerURLFetcher* fetcher =
182       distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
183   page_data->image_fetchers_.push_back(fetcher);
184
185   fetcher->FetchURL(item,
186                     base::Bind(&DistillerImpl::OnFetchImageDone,
187                                weak_factory_.GetWeakPtr(),
188                                page_num,
189                                base::Unretained(fetcher),
190                                image_id));
191 }
192
193 void DistillerImpl::OnFetchImageDone(int page_num,
194                                      DistillerURLFetcher* url_fetcher,
195                                      const std::string& id,
196                                      const std::string& response) {
197   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
198   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
199   DCHECK(page_data->distilled_page_proto);
200   DCHECK(url_fetcher);
201   ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
202       std::find(page_data->image_fetchers_.begin(),
203                 page_data->image_fetchers_.end(),
204                 url_fetcher);
205
206   DCHECK(fetcher_it != page_data->image_fetchers_.end());
207   // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
208   // callback is invoked by the |url_fetcher|.
209   page_data->image_fetchers_.weak_erase(fetcher_it);
210   base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
211
212   DistilledPageProto_Image* image =
213       page_data->distilled_page_proto->data.add_image();
214   image->set_name(id);
215   image->set_data(response);
216
217   AddPageIfDone(page_num);
218 }
219
220 void DistillerImpl::AddPageIfDone(int page_num) {
221   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
222   DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
223   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
224   if (page_data->image_fetchers_.empty()) {
225     finished_pages_index_[page_num] = started_pages_index_[page_num];
226     started_pages_index_.erase(page_num);
227     const ArticleDistillationUpdate& article_update =
228         CreateDistillationUpdate();
229     DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size());
230     update_cb_.Run(article_update);
231     RunDistillerCallbackIfDone();
232   }
233 }
234
235 const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate()
236     const {
237   bool has_prev_page = false;
238   bool has_next_page = false;
239   if (!finished_pages_index_.empty()) {
240     int prev_page_num = finished_pages_index_.begin()->first - 1;
241     int next_page_num = finished_pages_index_.rbegin()->first + 1;
242     has_prev_page = IsPageNumberInUse(prev_page_num);
243     has_next_page = IsPageNumberInUse(next_page_num);
244   }
245
246   std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto> >
247       update_pages;
248   for (std::map<int, size_t>::const_iterator it = finished_pages_index_.begin();
249        it != finished_pages_index_.end();
250        ++it) {
251     update_pages.push_back(pages_[it->second]->distilled_page_proto);
252   }
253   return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page);
254 }
255
256 void DistillerImpl::RunDistillerCallbackIfDone() {
257   DCHECK(!finished_cb_.is_null());
258   if (AreAllPagesFinished()) {
259     bool first_page = true;
260     scoped_ptr<DistilledArticleProto> article_proto(
261         new DistilledArticleProto());
262     // Stitch the pages back into the article.
263     for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
264          it != finished_pages_index_.end();) {
265       DistilledPageData* page_data = GetPageAtIndex(it->second);
266       *(article_proto->add_pages()) = page_data->distilled_page_proto->data;
267
268       if (first_page) {
269         article_proto->set_title(page_data->title);
270         first_page = false;
271       }
272
273       finished_pages_index_.erase(it++);
274     }
275
276     pages_.clear();
277     DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
278               max_pages_in_article_);
279
280     DCHECK(pages_.empty());
281     DCHECK(finished_pages_index_.empty());
282
283     base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
284                                                        false);
285     finished_cb_.Run(article_proto.Pass());
286     finished_cb_.Reset();
287   }
288 }
289
290 }  // namespace dom_distiller