1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "content/renderer/savable_resources.h"
9 #include "base/compiler_specific.h"
10 #include "base/logging.h"
11 #include "base/strings/string_util.h"
12 #include "third_party/WebKit/public/platform/WebString.h"
13 #include "third_party/WebKit/public/platform/WebVector.h"
14 #include "third_party/WebKit/public/web/WebDocument.h"
15 #include "third_party/WebKit/public/web/WebElement.h"
16 #include "third_party/WebKit/public/web/WebElementCollection.h"
17 #include "third_party/WebKit/public/web/WebFrame.h"
18 #include "third_party/WebKit/public/web/WebInputElement.h"
19 #include "third_party/WebKit/public/web/WebNode.h"
20 #include "third_party/WebKit/public/web/WebNodeList.h"
21 #include "third_party/WebKit/public/web/WebView.h"
23 using blink::WebDocument;
24 using blink::WebElement;
25 using blink::WebElementCollection;
26 using blink::WebFrame;
27 using blink::WebInputElement;
29 using blink::WebNodeList;
30 using blink::WebString;
31 using blink::WebVector;
37 // Structure for storage the unique set of all savable resource links for
38 // making sure that no duplicated resource link in final result. The consumer
39 // of the SavableResourcesUniqueCheck is responsible for keeping these pointers
40 // valid for the lifetime of the SavableResourcesUniqueCheck instance.
41 struct SavableResourcesUniqueCheck {
42 // Unique set of all sub resource links.
43 std::set<GURL>* resources_set;
44 // Unique set of all frame links.
45 std::set<GURL>* frames_set;
46 // Collection of all frames we go through when getting all savable resource
48 std::vector<WebFrame*>* frames;
50 SavableResourcesUniqueCheck()
51 : resources_set(NULL),
55 SavableResourcesUniqueCheck(std::set<GURL>* resources_set,
56 std::set<GURL>* frames_set, std::vector<WebFrame*>* frames)
57 : resources_set(resources_set),
58 frames_set(frames_set),
62 // Get all savable resource links from current element. One element might
63 // have more than one resource link. It is possible to have some links
64 // in one CSS stylesheet.
65 void GetSavableResourceLinkForElement(
66 const WebElement& element,
67 const WebDocument& current_doc,
68 SavableResourcesUniqueCheck* unique_check,
69 SavableResourcesResult* result) {
71 // Handle frame and iframe tag.
72 if (element.hasTagName("iframe") ||
73 element.hasTagName("frame")) {
74 WebFrame* sub_frame = WebFrame::fromFrameOwnerElement(element);
76 unique_check->frames->push_back(sub_frame);
80 // Check whether the node has sub resource URL or not.
81 WebString value = GetSubResourceLinkFromElement(element);
85 GURL u = current_doc.completeURL(value);
89 // Ignore those URLs which are not standard protocols. Because FTP
90 // protocol does no have cache mechanism, we will skip all
91 // sub-resources if they use FTP protocol.
92 if (!u.SchemeIsHTTPOrHTTPS() && !u.SchemeIs("file"))
94 // Ignore duplicated resource link.
95 if (!unique_check->resources_set->insert(u).second)
97 result->resources_list->push_back(u);
98 // Insert referrer for above new resource link.
99 result->referrer_urls_list->push_back(GURL());
100 result->referrer_policies_list->push_back(blink::WebReferrerPolicyDefault);
103 // Get all savable resource links from current WebFrameImpl object pointer.
104 void GetAllSavableResourceLinksForFrame(WebFrame* current_frame,
105 SavableResourcesUniqueCheck* unique_check,
106 SavableResourcesResult* result,
107 const char** savable_schemes) {
108 // Get current frame's URL.
109 GURL current_frame_url = current_frame->document().url();
111 // If url of current frame is invalid, ignore it.
112 if (!current_frame_url.is_valid())
115 // If url of current frame is not a savable protocol, ignore it.
116 bool is_valid_protocol = false;
117 for (int i = 0; savable_schemes[i] != NULL; ++i) {
118 if (current_frame_url.SchemeIs(savable_schemes[i])) {
119 is_valid_protocol = true;
123 if (!is_valid_protocol)
126 // If find same frame we have recorded, ignore it.
127 if (!unique_check->frames_set->insert(current_frame_url).second)
130 // Get current using document.
131 WebDocument current_doc = current_frame->document();
132 // Go through all descent nodes.
133 WebElementCollection all = current_doc.all();
134 // Go through all elements in this frame.
135 for (WebElement element = all.firstItem(); !element.isNull();
136 element = all.nextItem()) {
137 GetSavableResourceLinkForElement(element,
146 WebString GetSubResourceLinkFromElement(const WebElement& element) {
147 const char* attribute_name = NULL;
148 if (element.hasHTMLTagName("img") ||
149 element.hasHTMLTagName("script")) {
150 attribute_name = "src";
151 } else if (element.hasHTMLTagName("input")) {
152 const WebInputElement input = element.toConst<WebInputElement>();
153 if (input.isImageButton()) {
154 attribute_name = "src";
156 } else if (element.hasHTMLTagName("body") ||
157 element.hasHTMLTagName("table") ||
158 element.hasHTMLTagName("tr") ||
159 element.hasHTMLTagName("td")) {
160 attribute_name = "background";
161 } else if (element.hasHTMLTagName("blockquote") ||
162 element.hasHTMLTagName("q") ||
163 element.hasHTMLTagName("del") ||
164 element.hasHTMLTagName("ins")) {
165 attribute_name = "cite";
166 } else if (element.hasHTMLTagName("link")) {
167 // If the link element is not linked to css, ignore it.
168 if (LowerCaseEqualsASCII(element.getAttribute("type"), "text/css")) {
169 // TODO(jnd): Add support for extracting links of sub-resources which
170 // are inside style-sheet such as @import, url(), etc.
171 // See bug: http://b/issue?id=1111667.
172 attribute_name = "href";
177 WebString value = element.getAttribute(WebString::fromUTF8(attribute_name));
178 // If value has content and not start with "javascript:" then return it,
179 // otherwise return NULL.
180 if (!value.isNull() && !value.isEmpty() &&
181 !StartsWithASCII(value.utf8(), "javascript:", false))
187 // Get all savable resource links from current webview, include main
188 // frame and sub-frame
189 bool GetAllSavableResourceLinksForCurrentPage(WebView* view,
190 const GURL& page_url, SavableResourcesResult* result,
191 const char** savable_schemes) {
192 WebFrame* main_frame = view->mainFrame();
196 std::set<GURL> resources_set;
197 std::set<GURL> frames_set;
198 std::vector<WebFrame*> frames;
199 SavableResourcesUniqueCheck unique_check(&resources_set,
203 GURL main_page_gurl(main_frame->document().url());
205 // Make sure we are saving same page between embedder and webkit.
206 // If page has being navigated, embedder will get three empty vector,
207 // which will make the saving page job ended.
208 if (page_url != main_page_gurl)
211 // First, process main frame.
212 frames.push_back(main_frame);
214 // Check all resource in this page, include sub-frame.
215 for (int i = 0; i < static_cast<int>(frames.size()); ++i) {
216 // Get current frame's all savable resource links.
217 GetAllSavableResourceLinksForFrame(frames[i], &unique_check, result,
221 // Since frame's src can also point to sub-resources link, so it is possible
222 // that some URLs in frames_list are also in resources_list. For those
223 // URLs, we will remove it from frame_list, only keep them in resources_list.
224 for (std::set<GURL>::iterator it = frames_set.begin();
225 it != frames_set.end(); ++it) {
226 // Append unique frame source to savable frame list.
227 if (resources_set.find(*it) == resources_set.end())
228 result->frames_list->push_back(*it);
234 } // namespace content