1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/dom_distiller/core/distiller.h"
10 #include "base/auto_reset.h"
11 #include "base/bind.h"
12 #include "base/callback.h"
13 #include "base/location.h"
14 #include "base/message_loop/message_loop.h"
15 #include "base/strings/string_number_conversions.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/values.h"
18 #include "components/dom_distiller/core/distiller_page.h"
19 #include "components/dom_distiller/core/distiller_url_fetcher.h"
20 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
21 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
22 #include "net/url_request/url_request_context_getter.h"
// Default cap on how many pages one article may contain. DistillerImpl's
// constructor copies this into max_pages_in_article_, which
// AddToDistillationQueue() compares against TotalPageCount() before queuing.
25 // Maximum number of distilled pages in an article.
26 const size_t kMaxPagesInArticle = 32;
29 namespace dom_distiller {
// Builds the factory and takes over |distiller_url_fetcher_factory|: the
// scoped_ptr argument is moved into the member with Pass().
31 DistillerFactoryImpl::DistillerFactoryImpl(
32 scoped_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory)
33 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory.Pass()) {
// Destructor with an empty body; no explicit teardown is performed here.
36 DistillerFactoryImpl::~DistillerFactoryImpl() {}
// Allocates a new DistillerImpl, passing it the URL fetcher factory held by
// this object (dereferenced, i.e. by reference), and hands the result to the
// caller as a scoped_ptr<Distiller>.
38 scoped_ptr<Distiller> DistillerFactoryImpl::CreateDistiller() {
39 scoped_ptr<DistillerImpl> distiller(
40 new DistillerImpl(*distiller_url_fetcher_factory_));
41 return distiller.PassAs<Distiller>();
// Constructor with an empty body; nothing is initialized explicitly here.
44 DistillerImpl::DistilledPageData::DistilledPageData() {}
// Destructor with an empty body; no explicit teardown is performed here.
46 DistillerImpl::DistilledPageData::~DistilledPageData() {}
// Initializes a distiller that creates its image fetchers through
// |distiller_url_fetcher_factory|. The page limit starts at
// kMaxPagesInArticle and destruction is initially allowed.
// NOTE(review): whether the factory is stored as a reference or a copy depends
// on the member's declared type, which is not visible here - confirm in the
// header before relying on lifetime assumptions.
// NOTE(review): the initializer list ends on a trailing comma in this listing
// (embedded numbering jumps from 52 to 56), so the remaining initializers and
// the constructor body are not visible here.
48 DistillerImpl::DistillerImpl(
49 const DistillerURLFetcherFactory& distiller_url_fetcher_factory)
50 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
51 max_pages_in_article_(kMaxPagesInArticle),
52 destruction_allowed_(true),
// Asserts (debug builds) that destruction is currently permitted.
// RunDistillerCallbackIfDone() wraps destruction_allowed_ in a base::AutoReset
// while the finished callback runs - presumably setting it to false so that
// deleting the distiller from inside that callback trips this DCHECK.
56 DistillerImpl::~DistillerImpl() {
57 DCHECK(destruction_allowed_);
// Replaces the page limit (initially kMaxPagesInArticle) that
// AddToDistillationQueue() enforces against TotalPageCount().
60 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
61 max_pages_in_article_ = max_num_pages;
// Returns true when no page is in the started set and none is still waiting
// in the queue. Finished pages are not consulted.
64 bool DistillerImpl::AreAllPagesFinished() const {
65 return started_pages_index_.empty() && waiting_pages_.empty();
// Returns the number of pages known to the distiller in any state:
// waiting + started + finished.
68 size_t DistillerImpl::TotalPageCount() const {
69 return waiting_pages_.size() + started_pages_index_.size() +
70 finished_pages_index_.size();
// Queues |url| for distillation under |page_num|, but only when all of the
// following hold:
//   - |page_num| is not already waiting, started or finished,
//   - |url| is valid,
//   - the total page count is still below max_pages_in_article_,
//   - |url|'s spec has not been recorded in seen_urls_.
// When any check fails the visible code does nothing.
// NOTE(review): seen_urls_ is filled in DistillNextPage(), i.e. when a page
// starts, not when it is queued. A URL that is still waiting therefore does
// not block the same URL from being queued again under another page number -
// confirm whether that is intended.
73 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
74 if (!IsPageNumberInUse(page_num) && url.is_valid() &&
75 TotalPageCount() < max_pages_in_article_ &&
76 seen_urls_.find(url.spec()) == seen_urls_.end()) {
77 waiting_pages_[page_num] = url;
// Returns true if |page_num| is a key in any of the three tracking maps
// (waiting, started or finished).
81 bool DistillerImpl::IsPageNumberInUse(int page_num) const {
82 return waiting_pages_.find(page_num) != waiting_pages_.end() ||
83 started_pages_index_.find(page_num) != started_pages_index_.end() ||
84 finished_pages_index_.find(page_num) != finished_pages_index_.end();
// Looks up the page data stored at position |index| of pages_. |index| is a
// position in pages_ (the value stored in the *_pages_index_ maps), not a
// page number. Out-of-range indices are only caught by the DCHECK.
// NOTE(review): the remainder of the signature and the return statement are
// not visible in this listing (embedded numbering skips 88 and 91-94);
// presumably |page_data| is returned - confirm.
87 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(size_t index)
89 DCHECK_LT(index, pages_.size());
90 DistilledPageData* page_data = pages_[index];
// Entry point for distilling an article that starts at |url|.
//   url:             first page to distill; queued as page number 0.
//   distiller_page:  ownership is taken (moved into distiller_page_).
//   finished_cb:     stored; run by RunDistillerCallbackIfDone().
//   update_cb:       stored; run by AddPageIfDone() as pages complete.
// Must not be called while pages are still waiting or started (DCHECK).
// NOTE(review): the end of this function is not visible in this listing
// (embedded numbering skips 105-107); presumably it kicks off
// DistillNextPage() - confirm.
95 void DistillerImpl::DistillPage(const GURL& url,
96 scoped_ptr<DistillerPage> distiller_page,
97 const DistillationFinishedCallback& finished_cb,
98 const DistillationUpdateCallback& update_cb) {
99 DCHECK(AreAllPagesFinished());
100 distiller_page_ = distiller_page.Pass();
101 finished_cb_ = finished_cb;
102 update_cb_ = update_cb;
104 AddToDistillationQueue(0, url);
// Starts distillation of the next queued page, if any. Does nothing when the
// waiting queue is empty.
108 void DistillerImpl::DistillNextPage() {
109 if (!waiting_pages_.empty()) {
// waiting_pages_ is a std::map keyed by page number, so begin() is the
// waiting page with the smallest page number.
110 std::map<int, GURL>::iterator front = waiting_pages_.begin();
111 int page_num = front->first;
// Copy the URL out before the erase below invalidates |front|.
112 const GURL url = front->second;
114 waiting_pages_.erase(front);
115 DCHECK(url.is_valid());
116 DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
117 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
// From here on AddToDistillationQueue() will refuse this URL.
118 seen_urls_.insert(url.spec());
// Reserve a slot for this page's data and remember its position in pages_.
119 pages_.push_back(new DistilledPageData());
120 started_pages_index_[page_num] = pages_.size() - 1;
// The completion callback is bound to a weak pointer obtained from
// weak_factory_ rather than to |this| directly.
// NOTE(review): some argument lines of this call are not visible in this
// listing (embedded numbering skips 122 and 125-126).
121 distiller_page_->DistillPage(
123 base::Bind(&DistillerImpl::OnPageDistillationFinished,
124 weak_factory_.GetWeakPtr(),
// Callback bound in DistillNextPage(); receives the result of distilling one
// page.
//   page_num:  page number the result belongs to; must currently be in the
//              started set (DCHECK). NOTE(review): its parameter line is not
//              visible in this listing (embedded numbering skips 131).
//   page_url:  URL that was distilled.
//   distilled_page:  extraction result; must be non-null (DCHECK).
//   distillation_successful:  whether extraction succeeded.
130 void DistillerImpl::OnPageDistillationFinished(
132 const GURL& page_url,
133 scoped_ptr<DistilledPageInfo> distilled_page,
134 bool distillation_successful) {
135 DCHECK(distilled_page.get());
136 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
137 if (distillation_successful) {
// Fill in the slot that DistillNextPage() reserved for this page.
138 DistilledPageData* page_data =
139 GetPageAtIndex(started_pages_index_[page_num]);
140 page_data->distilled_page_proto =
141 new base::RefCountedData<DistilledPageProto>();
142 page_data->page_num = page_num;
143 page_data->title = distilled_page->title;
145 page_data->distilled_page_proto->data.set_url(page_url.spec());
146 page_data->distilled_page_proto->data.set_html(distilled_page->html);
// Follow pagination links: the next page is queued as page_num + 1 and the
// previous page as page_num - 1. AddToDistillationQueue() filters out
// page numbers already in use, already-seen URLs and over-limit requests.
148 GURL next_page_url(distilled_page->next_page_url);
149 if (next_page_url.is_valid()) {
150 // The pages should be in same origin.
151 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
152 AddToDistillationQueue(page_num + 1, next_page_url);
155 GURL prev_page_url(distilled_page->prev_page_url);
156 if (prev_page_url.is_valid()) {
157 DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
158 AddToDistillationQueue(page_num - 1, prev_page_url);
// Start one fetch per image. The image id is "<page_num + 1>_<image index>".
161 for (size_t img_num = 0; img_num < distilled_page->image_urls.size();
163 std::string image_id =
164 base::IntToString(page_num + 1) + "_" + base::IntToString(img_num);
165 FetchImage(page_num, image_id, distilled_page->image_urls[img_num]);
// Finishes the page right away when no image fetches are outstanding.
168 AddPageIfDone(page_num);
// NOTE(review): embedded numbering skips 169-170 here, so the branch structure
// is not visible. The two statements below presumably form the failure branch
// (drop the page from the started set, then check for overall completion) -
// confirm against the full source.
171 started_pages_index_.erase(page_num);
172 RunDistillerCallbackIfDone();
// Starts an asynchronous fetch of one image belonging to a started page.
//   page_num:  page the image belongs to; must be in the started set (DCHECK).
//   image_id:  identifier for the image (built by the caller).
//   item:      URL string to fetch.
// The new fetcher is recorded in the page's image_fetchers_; AddPageIfDone()
// treats the page as complete only once that list is empty.
176 void DistillerImpl::FetchImage(int page_num,
177 const std::string& image_id,
178 const std::string& item) {
179 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
180 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
181 DistillerURLFetcher* fetcher =
182 distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
183 page_data->image_fetchers_.push_back(fetcher);
// The callback is bound to a weak pointer from weak_factory_; the fetcher
// itself is passed back as a raw (Unretained) pointer so OnFetchImageDone()
// can locate and release it.
// NOTE(review): some bind arguments are not visible in this listing (embedded
// numbering skips 188 and 190-191).
185 fetcher->FetchURL(item,
186 base::Bind(&DistillerImpl::OnFetchImageDone,
187 weak_factory_.GetWeakPtr(),
189 base::Unretained(fetcher),
// Callback bound in FetchImage(); invoked when one image fetch completes.
//   page_num:     page the image belongs to; must still be started (DCHECK).
//   url_fetcher:  the fetcher that completed; released here.
//   id:           image identifier supplied to FetchImage().
//   response:     fetched payload, stored as the image data.
193 void DistillerImpl::OnFetchImageDone(int page_num,
194 DistillerURLFetcher* url_fetcher,
195 const std::string& id,
196 const std::string& response) {
197 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
198 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
199 DCHECK(page_data->distilled_page_proto);
// Locate this fetcher among the page's outstanding image fetchers.
// NOTE(review): the value searched for is not visible in this listing
// (embedded numbering skips 204-205); presumably |url_fetcher|.
201 ScopedVector<DistillerURLFetcher>::iterator fetcher_it =
202 std::find(page_data->image_fetchers_.begin(),
203 page_data->image_fetchers_.end(),
206 DCHECK(fetcher_it != page_data->image_fetchers_.end());
207 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
208 // callback is invoked by the |url_fetcher|.
// weak_erase() removes the entry from the list (presumably without deleting
// the object); the actual deletion is deferred to the message loop.
209 page_data->image_fetchers_.weak_erase(fetcher_it);
210 base::MessageLoop::current()->DeleteSoon(FROM_HERE, url_fetcher);
// Append the fetched image to the page's proto.
212 DistilledPageProto_Image* image =
213 page_data->distilled_page_proto->data.add_image();
215 image->set_data(response);
// This may have been the last outstanding fetch for the page.
217 AddPageIfDone(page_num);
// Promotes |page_num| from started to finished once it has no outstanding
// image fetchers, then notifies the update callback and checks whether the
// whole article is complete. |page_num| must be started and not yet finished
// (DCHECKs). Does nothing while image fetches are still pending.
220 void DistillerImpl::AddPageIfDone(int page_num) {
221 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
222 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
223 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
224 if (page_data->image_fetchers_.empty()) {
// Carry the page's pages_ index over to the finished map before dropping it
// from the started map.
225 finished_pages_index_[page_num] = started_pages_index_[page_num];
226 started_pages_index_.erase(page_num);
227 const ArticleDistillationUpdate& article_update =
228 CreateDistillationUpdate();
229 DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size());
230 update_cb_.Run(article_update);
231 RunDistillerCallbackIfDone();
// Builds a snapshot of the pages finished so far.
// The snapshot lists the protos of all finished pages in ascending page-number
// order (finished_pages_index_ is iterated as a std::map keyed by page
// number), plus two flags: whether the page number just after the highest
// finished page, and the one just before the lowest finished page, is known
// to the distiller in any state.
235 const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate()
237 bool has_prev_page = false;
238 bool has_next_page = false;
239 if (!finished_pages_index_.empty()) {
// begin() is the lowest finished page number, rbegin() the highest.
240 int prev_page_num = finished_pages_index_.begin()->first - 1;
241 int next_page_num = finished_pages_index_.rbegin()->first + 1;
242 has_prev_page = IsPageNumberInUse(prev_page_num);
243 has_next_page = IsPageNumberInUse(next_page_num);
// NOTE(review): the name of the vector declared below is on a line not visible
// in this listing (embedded numbering skips 247); it is used as |update_pages|.
246 std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto> >
248 for (std::map<int, size_t>::const_iterator it = finished_pages_index_.begin();
249 it != finished_pages_index_.end();
251 update_pages.push_back(pages_[it->second]->distilled_page_proto);
// Note the argument order: next-page flag first, then previous-page flag.
253 return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page);
// If no page is waiting or started, assembles the finished pages into a
// DistilledArticleProto and delivers it through finished_cb_. Otherwise does
// nothing. finished_cb_ must be set when this is called (DCHECK).
256 void DistillerImpl::RunDistillerCallbackIfDone() {
257 DCHECK(!finished_cb_.is_null());
258 if (AreAllPagesFinished()) {
259 bool first_page = true;
260 scoped_ptr<DistilledArticleProto> article_proto(
261 new DistilledArticleProto());
262 // Stitch the pages back into the article.
// Pages are appended in ascending page-number order (std::map iteration).
263 for (std::map<int, size_t>::iterator it = finished_pages_index_.begin();
264 it != finished_pages_index_.end();) {
265 DistilledPageData* page_data = GetPageAtIndex(it->second);
266 *(article_proto->add_pages()) = page_data->distilled_page_proto->data;
// NOTE(review): the condition guarding the title assignment is not visible in
// this listing (embedded numbering skips 267-268 and 270-272); presumably only
// the first page's title is used, via |first_page| - confirm.
269 article_proto->set_title(page_data->title);
// Post-increment advances |it| before the erase invalidates the old position.
273 finished_pages_index_.erase(it++);
277 DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
278 max_pages_in_article_);
// NOTE(review): the statement that empties pages_ is not visible in this
// listing; the DCHECK below expects it to be empty at this point.
280 DCHECK(pages_.empty());
281 DCHECK(finished_pages_index_.empty());
// destruction_allowed_ is overridden for the duration of the callback and
// restored when the AutoReset goes out of scope; ~DistillerImpl() DCHECKs the
// flag. NOTE(review): the override value is on a line not visible here
// (embedded numbering skips 284); presumably false.
283 base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
// Ownership of the assembled article passes to the callback, which is then
// cleared.
285 finished_cb_.Run(article_proto.Pass());
286 finished_cb_.Reset();
290 } // namespace dom_distiller