/*
 * Copyright (C) 2011 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
32 #include "PageSerializer.h"
34 #include "CSSImageValue.h"
35 #include "CSSImportRule.h"
36 #include "CSSStyleRule.h"
37 #include "CachedImage.h"
41 #include "HTMLFrameOwnerElement.h"
42 #include "HTMLHeadElement.h"
43 #include "HTMLImageElement.h"
44 #include "HTMLLinkElement.h"
45 #include "HTMLMetaCharsetParser.h"
46 #include "HTMLNames.h"
47 #include "HTMLStyleElement.h"
48 #include "HTTPParsers.h"
50 #include "MIMETypeRegistry.h"
51 #include "MarkupAccumulator.h"
53 #include "StyleCachedImage.h"
54 #include "StyleImage.h"
55 #include "StyleRule.h"
56 #include "StyleSheetContents.h"
58 #include "TextEncoding.h"
59 #include <wtf/text/CString.h>
60 #include <wtf/text/StringBuilder.h>
61 #include <wtf/text/WTFString.h>
65 static bool isCharsetSpecifyingNode(Node* node)
67 if (!node->isHTMLElement())
70 HTMLElement* element = toHTMLElement(node);
71 if (!element->hasTagName(HTMLNames::metaTag))
73 HTMLMetaCharsetParser::AttributeList attributes;
74 if (element->hasAttributes()) {
75 for (unsigned i = 0; i < element->attributeCount(); ++i) {
76 const Attribute* attribute = element->attributeItem(i);
77 // FIXME: We should deal appropriately with the attribute if they have a namespace.
78 attributes.append(std::make_pair(attribute->name().toString(), attribute->value().string()));
81 TextEncoding textEncoding = HTMLMetaCharsetParser::encodingFromMetaAttributes(attributes);
82 return textEncoding.isValid();
85 static bool shouldIgnoreElement(Element* element)
87 return element->hasTagName(HTMLNames::scriptTag) || element->hasTagName(HTMLNames::noscriptTag) || isCharsetSpecifyingNode(element);
90 static const QualifiedName& frameOwnerURLAttributeName(const HTMLFrameOwnerElement& frameOwner)
92 // FIXME: We should support all frame owners including applets.
93 return frameOwner.hasTagName(HTMLNames::objectTag) ? HTMLNames::dataAttr : HTMLNames::srcAttr;
96 class SerializerMarkupAccumulator : public WebCore::MarkupAccumulator {
98 SerializerMarkupAccumulator(PageSerializer*, Document*, Vector<Node*>*);
99 virtual ~SerializerMarkupAccumulator();
102 virtual void appendText(StringBuilder& out, Text*);
103 virtual void appendElement(StringBuilder& out, Element*, Namespaces*);
104 virtual void appendCustomAttributes(StringBuilder& out, Element*, Namespaces*);
105 virtual void appendEndTag(Node*);
108 PageSerializer* m_serializer;
109 Document* m_document;
112 SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer* serializer, Document* document, Vector<Node*>* nodes)
113 : MarkupAccumulator(nodes, ResolveAllURLs)
114 , m_serializer(serializer)
115 , m_document(document)
117 // MarkupAccumulator does not serialize the <?xml ... line, so we add it explicitely to ensure the right encoding is specified.
118 if (m_document->isXHTMLDocument() || m_document->xmlStandalone() || m_document->isSVGDocument())
119 appendString("<?xml version=\"" + m_document->xmlVersion() + "\" encoding=\"" + m_document->charset() + "\"?>");
122 SerializerMarkupAccumulator::~SerializerMarkupAccumulator()
126 void SerializerMarkupAccumulator::appendText(StringBuilder& out, Text* text)
128 Element* parent = text->parentElement();
129 if (parent && !shouldIgnoreElement(parent))
130 MarkupAccumulator::appendText(out, text);
133 void SerializerMarkupAccumulator::appendElement(StringBuilder& out, Element* element, Namespaces* namespaces)
135 if (!shouldIgnoreElement(element))
136 MarkupAccumulator::appendElement(out, element, namespaces);
138 if (element->hasTagName(HTMLNames::headTag)) {
139 out.append("<meta charset=\"");
140 out.append(m_document->charset());
144 // FIXME: For object (plugins) tags and video tag we could replace them by an image of their current contents.
147 void SerializerMarkupAccumulator::appendCustomAttributes(StringBuilder& out, Element* element, Namespaces* namespaces)
149 if (!element->isFrameOwnerElement())
152 HTMLFrameOwnerElement* frameOwner = static_cast<HTMLFrameOwnerElement*>(element);
153 Frame* frame = frameOwner->contentFrame();
157 KURL url = frame->document()->url();
158 if (url.isValid() && !url.isBlankURL())
161 // We need to give a fake location to blank frames so they can be referenced by the serialized frame.
162 url = m_serializer->urlForBlankFrame(frame);
163 appendAttribute(out, element, Attribute(frameOwnerURLAttributeName(*frameOwner), url.string()), namespaces);
166 void SerializerMarkupAccumulator::appendEndTag(Node* node)
168 if (node->isElementNode() && !shouldIgnoreElement(toElement(node)))
169 MarkupAccumulator::appendEndTag(node);
172 PageSerializer::Resource::Resource()
176 PageSerializer::Resource::Resource(const KURL& url, const String& mimeType, PassRefPtr<SharedBuffer> data)
183 PageSerializer::PageSerializer(Vector<PageSerializer::Resource>* resources)
184 : m_resources(resources)
185 , m_blankFrameCounter(0)
189 void PageSerializer::serialize(Page* page)
191 serializeFrame(page->mainFrame());
194 void PageSerializer::serializeFrame(Frame* frame)
196 Document* document = frame->document();
197 KURL url = document->url();
198 if (!url.isValid() || url.isBlankURL()) {
199 // For blank frames we generate a fake URL so they can be referenced by their containing frame.
200 url = urlForBlankFrame(frame);
203 if (m_resourceURLs.contains(url)) {
204 // FIXME: We could have 2 frame with the same URL but which were dynamically changed and have now
205 // different content. So we should serialize both and somehow rename the frame src in the containing
211 SerializerMarkupAccumulator accumulator(this, document, &nodes);
212 TextEncoding textEncoding(document->charset());
214 if (!textEncoding.isValid()) {
215 // FIXME: iframes used as images trigger this. We should deal with them correctly.
218 String text = accumulator.serializeNodes(document->documentElement(), 0, IncludeNode);
219 CString frameHTML = textEncoding.encode(text.characters(), text.length(), EntitiesForUnencodables);
220 m_resources->append(Resource(url, document->suggestedMIMEType(), SharedBuffer::create(frameHTML.data(), frameHTML.length())));
221 m_resourceURLs.add(url);
223 for (Vector<Node*>::iterator iter = nodes.begin(); iter != nodes.end(); ++iter) {
225 if (!node->isElementNode())
228 Element* element = toElement(node);
229 // We have to process in-line style as it might contain some resources (typically background images).
230 if (element->isStyledElement())
231 retrieveResourcesForProperties(static_cast<StyledElement*>(element)->inlineStyle(), document);
233 if (element->hasTagName(HTMLNames::imgTag)) {
234 HTMLImageElement* imageElement = static_cast<HTMLImageElement*>(element);
235 KURL url = document->completeURL(imageElement->getAttribute(HTMLNames::srcAttr));
236 CachedImage* cachedImage = imageElement->cachedImage();
237 addImageToResources(cachedImage, imageElement->renderer(), url);
238 } else if (element->hasTagName(HTMLNames::linkTag)) {
239 HTMLLinkElement* linkElement = static_cast<HTMLLinkElement*>(element);
240 if (CSSStyleSheet* sheet = linkElement->sheet()) {
241 KURL url = document->completeURL(linkElement->getAttribute(HTMLNames::hrefAttr));
242 serializeCSSStyleSheet(sheet, url);
243 ASSERT(m_resourceURLs.contains(url));
245 } else if (element->hasTagName(HTMLNames::styleTag)) {
246 HTMLStyleElement* styleElement = static_cast<HTMLStyleElement*>(element);
247 if (CSSStyleSheet* sheet = styleElement->sheet())
248 serializeCSSStyleSheet(sheet, KURL());
252 for (Frame* childFrame = frame->tree()->firstChild(); childFrame; childFrame = childFrame->tree()->nextSibling())
253 serializeFrame(childFrame);
256 void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet* styleSheet, const KURL& url)
258 StringBuilder cssText;
259 for (unsigned i = 0; i < styleSheet->length(); ++i) {
260 CSSRule* rule = styleSheet->item(i);
261 String itemText = rule->cssText();
262 if (!itemText.isEmpty()) {
263 cssText.append(itemText);
264 if (i < styleSheet->length() - 1)
265 cssText.append("\n\n");
267 Document* document = styleSheet->ownerDocument();
268 // Some rules have resources associated with them that we need to retrieve.
269 if (rule->isImportRule()) {
270 CSSImportRule* importRule = static_cast<CSSImportRule*>(rule);
271 KURL importURL = document->completeURL(importRule->href());
272 if (m_resourceURLs.contains(importURL))
274 serializeCSSStyleSheet(importRule->styleSheet(), importURL);
275 } else if (rule->isFontFaceRule()) {
276 // FIXME: Add support for font face rule. It is not clear to me at this point if the actual otf/eot file can
277 // be retrieved from the CSSFontFaceRule object.
278 } else if (rule->isStyleRule())
279 retrieveResourcesForRule(static_cast<CSSStyleRule*>(rule)->styleRule(), document);
282 if (url.isValid() && !m_resourceURLs.contains(url)) {
283 // FIXME: We should check whether a charset has been specified and if none was found add one.
284 TextEncoding textEncoding(styleSheet->contents()->charset());
285 ASSERT(textEncoding.isValid());
286 String textString = cssText.toString();
287 CString text = textEncoding.encode(textString.characters(), textString.length(), EntitiesForUnencodables);
288 m_resources->append(Resource(url, String("text/css"), SharedBuffer::create(text.data(), text.length())));
289 m_resourceURLs.add(url);
293 void PageSerializer::addImageToResources(CachedImage* image, RenderObject* imageRenderer, const KURL& url)
295 if (!url.isValid() || m_resourceURLs.contains(url))
298 if (!image || image->image() == Image::nullImage())
301 RefPtr<SharedBuffer> data = imageRenderer ? image->imageForRenderer(imageRenderer)->data() : image->image()->data();
303 // SVG images don't return data at this point. Bug 99102.
304 LOG_ERROR("No data for image %s", url.string().utf8().data());
307 String mimeType = image->response().mimeType();
308 m_resources->append(Resource(url, mimeType, data));
309 m_resourceURLs.add(url);
312 void PageSerializer::retrieveResourcesForRule(StyleRule* rule, Document* document)
314 retrieveResourcesForProperties(rule->properties(), document);
317 void PageSerializer::retrieveResourcesForProperties(const StylePropertySet* styleDeclaration, Document* document)
319 if (!styleDeclaration)
322 // The background-image and list-style-image (for ul or ol) are the CSS properties
323 // that make use of images. We iterate to make sure we include any other
324 // image properties there might be.
325 unsigned propertyCount = styleDeclaration->propertyCount();
326 for (unsigned i = 0; i < propertyCount; ++i) {
327 RefPtr<CSSValue> cssValue = styleDeclaration->propertyAt(i).value();
328 if (!cssValue->isImageValue())
331 CSSImageValue* imageValue = static_cast<CSSImageValue*>(cssValue.get());
332 StyleImage* styleImage = imageValue->cachedOrPendingImage();
333 // Non cached-images are just place-holders and do not contain data.
334 if (!styleImage || !styleImage->isCachedImage())
337 CachedImage* image = static_cast<StyleCachedImage*>(styleImage)->cachedImage();
339 KURL url = document->completeURL(image->url());
340 addImageToResources(image, 0, url);
344 KURL PageSerializer::urlForBlankFrame(Frame* frame)
346 HashMap<Frame*, KURL>::iterator iter = m_blankFrameURLs.find(frame);
347 if (iter != m_blankFrameURLs.end())
349 String url = "wyciwyg://frame/" + String::number(m_blankFrameCounter++);
350 KURL fakeURL(ParsedURLString, url);
351 m_blankFrameURLs.add(frame, fakeURL);