src/third_party/WebKit/Source/web/WebPageSerializerImpl.cpp

   1 /*
   2  * Copyright (C) 2009 Google Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions are
   6  * met:
   7  *
   8  *     * Redistributions of source code must retain the above copyright
   9  * notice, this list of conditions and the following disclaimer.
  10  *     * Redistributions in binary form must reproduce the above
  11  * copyright notice, this list of conditions and the following disclaimer
  12  * in the documentation and/or other materials provided with the
  13  * distribution.
  14  *     * Neither the name of Google Inc. nor the names of its
  15  * contributors may be used to endorse or promote products derived from
  16  * this software without specific prior written permission.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29  */
  30
// How to handle the base tag better.
// Current status:
// Currently, the normal way we handle the base tag is:
// a) Links that have corresponding locally saved files, such as savable CSS
// and JavaScript files, are rewritten as relative URLs that point to the
// locally saved file. These links are not resolved to absolute file URLs
// because, if they were, moving the saved files from one directory to
// another would turn them into dead links.
// b) Links that have no corresponding locally saved files, such as links in
// A and AREA tags, are resolved to absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
// Firefox also handles the base tag this way.
  44 //
// Problem:
// This approach cannot handle the following situation:
// the base tag is written by JavaScript.
// For example, the page "www.yahoo.com" uses
// "document.write('<base href="http://www.yahoo.com/"...');" to set up the base
// URL of the page while it loads. Suppose we save "www.yahoo.com" as
// completed HTML to "c:\yahoo.htm". When we later load the saved page, the
// JavaScript inserts a base tag <base href="http://www.yahoo.com/"...> into
// the DOM, so all URLs that point to locally saved resource files are resolved
// as "http://www.yahoo.com/yahoo_files/...", and none of the saved resource
// files can be loaded correctly. The page also renders badly because none of
// the saved sub-resource files (such as CSS and JavaScript files) or sub-frame
// files can be fetched.
// Firefox, IE and WebKit-based browsers currently all have this problem.
  60 //
// Solution:
// My solution is to comment out the old base tag and write a new base tag,
// <base href="." ...>, after the commented-out one. WebKit always uses the
// latest "href" attribute of a base tag to set the document's base URL. Based
// on this behavior, when we encounter a base tag, we comment it out and write
// a new base tag <base href="."> right after it. The newly added base tag
// helps the engine locate the correct base URL so that locally saved resource
// files load correctly. I also think we need to inherit the base target value
// from the document object when appending the new base tag.
// If there are multiple base tags in the original document, we comment out
// all of the old base tags and append a new base tag after each of them,
// because we do not know whether those old base tags are original content or
// were added by JavaScript. If they were added by JavaScript, the script(s)
// will insert the base tag(s) into the DOM again when the saved page is
// loaded, so the newly added base tag(s) can override the incorrect base URL
// and make sure we always load the correct locally saved resource files.
  77
  78 #include "config.h"
  79 #include "web/WebPageSerializerImpl.h"
  80
  81 #include "core/HTMLNames.h"
  82 #include "core/dom/Document.h"
  83 #include "core/dom/DocumentType.h"
  84 #include "core/dom/Element.h"
  85 #include "core/editing/markup.h"
  86 #include "core/html/HTMLAllCollection.h"
  87 #include "core/html/HTMLElement.h"
  88 #include "core/html/HTMLFormElement.h"
  89 #include "core/html/HTMLHtmlElement.h"
  90 #include "core/html/HTMLMetaElement.h"
  91 #include "core/loader/DocumentLoader.h"
  92 #include "core/loader/FrameLoader.h"
  93 #include "public/platform/WebVector.h"
  94 #include "web/WebLocalFrameImpl.h"
  95 #include "wtf/text/TextEncoding.h"
  96
  97 namespace blink {
  98
// Maximum length of the data buffer that temporarily holds generated HTML
// content before it is encoded and handed to the client. This is a soft limit:
// it may be exceeded when a very large contiguous string is found in the page,
// because the buffer is only checked after a whole string has been appended.
static const unsigned dataBufferCapacity = 65536;
 103
 104 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,
 105                                                             const WTF::TextEncoding& textEncoding,
 106                                                             Document* document,
 107                                                             const String& directoryName)
 108     : url(url)
 109     , textEncoding(textEncoding)
 110     , document(document)
 111     , directoryName(directoryName)
 112     , isHTMLDocument(document->isHTMLDocument())
 113     , haveSeenDocType(false)
 114     , haveAddedCharsetDeclaration(false)
 115     , skipMetaElement(0)
 116     , isInScriptOrStyleTag(false)
 117     , haveAddedXMLProcessingDirective(false)
 118     , haveAddedContentsBeforeEnd(false)
 119 {
 120 }
 121
 122 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(
 123     const Element* element, SerializeDomParam* param, bool* needSkip)
 124 {
 125     StringBuilder result;
 126
 127     *needSkip = false;
 128     if (param->isHTMLDocument) {
 129         // Skip the open tag of original META tag which declare charset since we
 130         // have overrided the META which have correct charset declaration after
 131         // serializing open tag of HEAD element.
 132         ASSERT(element);
 133         if (isHTMLMetaElement(*element)) {
 134             const HTMLMetaElement& meta = toHTMLMetaElement(*element);
 135             // Check whether the META tag has declared charset or not.
 136             String equiv = meta.httpEquiv();
 137             if (equalIgnoringCase(equiv, "content-type")) {
 138                 String content = meta.content();
 139                 if (content.length() && content.contains("charset", false)) {
 140                     // Find META tag declared charset, we need to skip it when
 141                     // serializing DOM.
 142                     param->skipMetaElement = element;
 143                     *needSkip = true;
 144                 }
 145             }
 146         } else if (isHTMLHtmlElement(*element)) {
 147             // Check something before processing the open tag of HEAD element.
 148             // First we add doc type declaration if original document has it.
 149             if (!param->haveSeenDocType) {
 150                 param->haveSeenDocType = true;
 151                 result.append(createMarkup(param->document->doctype()));
 152             }
 153
 154             // Add MOTW declaration before html tag.
 155             // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
 156             result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url));
 157         } else if (isHTMLBaseElement(*element)) {
 158             // Comment the BASE tag when serializing dom.
 159             result.appendLiteral("<!--");
 160         }
 161     } else {
 162         // Write XML declaration.
 163         if (!param->haveAddedXMLProcessingDirective) {
 164             param->haveAddedXMLProcessingDirective = true;
 165             // Get encoding info.
 166             String xmlEncoding = param->document->xmlEncoding();
 167             if (xmlEncoding.isEmpty())
 168                 xmlEncoding = param->document->encodingName();
 169             if (xmlEncoding.isEmpty())
 170                 xmlEncoding = UTF8Encoding().name();
 171             result.appendLiteral("<?xml version=\"");
 172             result.append(param->document->xmlVersion());
 173             result.appendLiteral("\" encoding=\"");
 174             result.append(xmlEncoding);
 175             if (param->document->xmlStandalone())
 176                 result.appendLiteral("\" standalone=\"yes");
 177             result.appendLiteral("\"?>\n");
 178         }
 179         // Add doc type declaration if original document has it.
 180         if (!param->haveSeenDocType) {
 181             param->haveSeenDocType = true;
 182             result.append(createMarkup(param->document->doctype()));
 183         }
 184     }
 185     return result.toString();
 186 }
 187
 188 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(
 189     const Element* element, SerializeDomParam* param)
 190 {
 191     StringBuilder result;
 192
 193     param->haveAddedContentsBeforeEnd = false;
 194     if (!param->isHTMLDocument)
 195         return result.toString();
 196     // Check after processing the open tag of HEAD element
 197     if (!param->haveAddedCharsetDeclaration
 198         && isHTMLHeadElement(*element)) {
 199         param->haveAddedCharsetDeclaration = true;
 200         // Check meta element. WebKit only pre-parse the first 512 bytes
 201         // of the document. If the whole <HEAD> is larger and meta is the
 202         // end of head part, then this kind of pages aren't decoded correctly
 203         // because of this issue. So when we serialize the DOM, we need to
 204         // make sure the meta will in first child of head tag.
 205         // See http://bugs.webkit.org/show_bug.cgi?id=16621.
 206         // First we generate new content for writing correct META element.
 207         result.append(WebPageSerializer::generateMetaCharsetDeclaration(
 208             String(param->textEncoding.name())));
 209
 210         param->haveAddedContentsBeforeEnd = true;
 211         // Will search each META which has charset declaration, and skip them all
 212         // in PreActionBeforeSerializeOpenTag.
 213     } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
 214         param->isInScriptOrStyleTag = true;
 215     }
 216
 217     return result.toString();
 218 }
 219
 220 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(
 221     const Element* element, SerializeDomParam* param, bool* needSkip)
 222 {
 223     String result;
 224
 225     *needSkip = false;
 226     if (!param->isHTMLDocument)
 227         return result;
 228     // Skip the end tag of original META tag which declare charset.
 229     // Need not to check whether it's META tag since we guarantee
 230     // skipMetaElement is definitely META tag if it's not 0.
 231     if (param->skipMetaElement == element) {
 232         *needSkip = true;
 233     } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) {
 234         ASSERT(param->isInScriptOrStyleTag);
 235         param->isInScriptOrStyleTag = false;
 236     }
 237
 238     return result;
 239 }
 240
 241 // After we finish serializing end tag of a element, we give the target
 242 // element a chance to do some post work to add some additional data.
 243 String WebPageSerializerImpl::postActionAfterSerializeEndTag(
 244     const Element* element, SerializeDomParam* param)
 245 {
 246     StringBuilder result;
 247
 248     if (!param->isHTMLDocument)
 249         return result.toString();
 250     // Comment the BASE tag when serializing DOM.
 251     if (isHTMLBaseElement(*element)) {
 252         result.appendLiteral("-->");
 253         // Append a new base tag declaration.
 254         result.append(WebPageSerializer::generateBaseTagDeclaration(
 255             param->document->baseTarget()));
 256     }
 257
 258     return result.toString();
 259 }
 260
 261 void WebPageSerializerImpl::saveHTMLContentToBuffer(
 262     const String& result, SerializeDomParam* param)
 263 {
 264     m_dataBuffer.append(result);
 265     encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,
 266                          param,
 267                          DoNotForceFlush);
 268 }
 269
 270 void WebPageSerializerImpl::encodeAndFlushBuffer(
 271     WebPageSerializerClient::PageSerializationStatus status,
 272     SerializeDomParam* param,
 273     FlushOption flushOption)
 274 {
 275     // Data buffer is not full nor do we want to force flush.
 276     if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity)
 277         return;
 278
 279     String content = m_dataBuffer.toString();
 280     m_dataBuffer.clear();
 281
 282     CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF::EntitiesForUnencodables);
 283
 284     // Send result to the client.
 285     m_client->didSerializeDataForFrame(param->url,
 286                                        WebCString(encodedContent.data(), encodedContent.length()),
 287                                        status);
 288 }
 289
 290 void WebPageSerializerImpl::openTagToString(Element* element,
 291                                             SerializeDomParam* param)
 292 {
 293     bool needSkip;
 294     StringBuilder result;
 295     // Do pre action for open tag.
 296     result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));
 297     if (needSkip)
 298         return;
 299     // Add open tag
 300     result.append('<');
 301     result.append(element->nodeName().lower());
 302     // Go through all attributes and serialize them.
 303     AttributeCollection attributes = element->attributes();
 304     AttributeCollection::iterator end = attributes.end();
 305     for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it) {
 306         result.append(' ');
 307         // Add attribute pair
 308         result.append(it->name().toString());
 309         result.appendLiteral("=\"");
 310         if (!it->value().isEmpty()) {
 311             const String& attrValue = it->value();
 312
 313             // Check whether we need to replace some resource links
 314             // with local resource paths.
 315             const QualifiedName& attrName = it->name();
 316             if (element->hasLegalLinkAttribute(attrName)) {
 317                 // For links start with "javascript:", we do not change it.
 318                 if (attrValue.startsWith("javascript:", false)) {
 319                     result.append(attrValue);
 320                 } else {
 321                     // Get the absolute link
 322                     WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOwnerElement(element);
 323                     String completeURL = subFrame ? subFrame->frame()->document()->url() :
 324                                                     param->document->completeURL(attrValue);
 325                     // Check whether we have local files for those link.
 326                     if (m_localLinks.contains(completeURL)) {
 327                         if (!param->directoryName.isEmpty()) {
 328                             result.appendLiteral("./");
 329                             result.append(param->directoryName);
 330                             result.append('/');
 331                         }
 332                         result.append(m_localLinks.get(completeURL));
 333                     } else {
 334                         result.append(completeURL);
 335                     }
 336                 }
 337             } else {
 338                 if (param->isHTMLDocument)
 339                     result.append(m_htmlEntities.convertEntitiesInString(attrValue));
 340                 else
 341                     result.append(m_xmlEntities.convertEntitiesInString(attrValue));
 342             }
 343         }
 344         result.append('\"');
 345     }
 346
 347     // Do post action for open tag.
 348     String addedContents = postActionAfterSerializeOpenTag(element, param);
 349     // Complete the open tag for element when it has child/children.
 350     if (element->hasChildren() || param->haveAddedContentsBeforeEnd)
 351         result.append('>');
 352     // Append the added contents generate in  post action of open tag.
 353     result.append(addedContents);
 354     // Save the result to data buffer.
 355     saveHTMLContentToBuffer(result.toString(), param);
 356 }
 357
 358 // Serialize end tag of an specified element.
 359 void WebPageSerializerImpl::endTagToString(Element* element,
 360                                            SerializeDomParam* param)
 361 {
 362     bool needSkip;
 363     StringBuilder result;
 364     // Do pre action for end tag.
 365     result.append(preActionBeforeSerializeEndTag(element, param, &needSkip));
 366     if (needSkip)
 367         return;
 368     // Write end tag when element has child/children.
 369     if (element->hasChildren() || param->haveAddedContentsBeforeEnd) {
 370         result.appendLiteral("</");
 371         result.append(element->nodeName().lower());
 372         result.append('>');
 373     } else {
 374         // Check whether we have to write end tag for empty element.
 375         if (param->isHTMLDocument) {
 376             result.append('>');
 377             // FIXME: This code is horribly wrong.  WebPageSerializerImpl must die.
 378             if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML()) {
 379                 // We need to write end tag when it is required.
 380                 result.appendLiteral("</");
 381                 result.append(element->nodeName().lower());
 382                 result.append('>');
 383             }
 384         } else {
 385             // For xml base document.
 386             result.appendLiteral(" />");
 387         }
 388     }
 389     // Do post action for end tag.
 390     result.append(postActionAfterSerializeEndTag(element, param));
 391     // Save the result to data buffer.
 392     saveHTMLContentToBuffer(result.toString(), param);
 393 }
 394
 395 void WebPageSerializerImpl::buildContentForNode(Node* node,
 396                                                 SerializeDomParam* param)
 397 {
 398     switch (node->nodeType()) {
 399     case Node::ELEMENT_NODE:
 400         // Process open tag of element.
 401         openTagToString(toElement(node), param);
 402         // Walk through the children nodes and process it.
 403         for (Node *child = node->firstChild(); child; child = child->nextSibling())
 404             buildContentForNode(child, param);
 405         // Process end tag of element.
 406         endTagToString(toElement(node), param);
 407         break;
 408     case Node::TEXT_NODE:
 409         saveHTMLContentToBuffer(createMarkup(node), param);
 410         break;
 411     case Node::ATTRIBUTE_NODE:
 412     case Node::DOCUMENT_NODE:
 413     case Node::DOCUMENT_FRAGMENT_NODE:
 414         // Should not exist.
 415         ASSERT_NOT_REACHED();
 416         break;
 417     // Document type node can be in DOM?
 418     case Node::DOCUMENT_TYPE_NODE:
 419         param->haveSeenDocType = true;
 420     default:
 421         // For other type node, call default action.
 422         saveHTMLContentToBuffer(createMarkup(node), param);
 423         break;
 424     }
 425 }
 426
 427 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,
 428                                              bool recursiveSerialization,
 429                                              WebPageSerializerClient* client,
 430                                              const WebVector<WebURL>& links,
 431                                              const WebVector<WebString>& localPaths,
 432                                              const WebString& localDirectoryName)
 433     : m_client(client)
 434     , m_recursiveSerialization(recursiveSerialization)
 435     , m_framesCollected(false)
 436     , m_localDirectoryName(localDirectoryName)
 437     , m_htmlEntities(false)
 438     , m_xmlEntities(true)
 439 {
 440     // Must specify available webframe.
 441     ASSERT(frame);
 442     m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame);
 443     // Make sure we have non 0 client.
 444     ASSERT(client);
 445     // Build local resources map.
 446     ASSERT(links.size() == localPaths.size());
 447     for (size_t i = 0; i < links.size(); i++) {
 448         KURL url = links[i];
 449         ASSERT(!m_localLinks.contains(url.string()));
 450         m_localLinks.set(url.string(), localPaths[i]);
 451     }
 452
 453     ASSERT(m_dataBuffer.isEmpty());
 454 }
 455
 456 void WebPageSerializerImpl::collectTargetFrames()
 457 {
 458     ASSERT(!m_framesCollected);
 459     m_framesCollected = true;
 460
 461     // First, process main frame.
 462     m_frames.append(m_specifiedWebLocalFrameImpl);
 463     // Return now if user only needs to serialize specified frame, not including
 464     // all sub-frames.
 465     if (!m_recursiveSerialization)
 466         return;
 467     // Collect all frames inside the specified frame.
 468     for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {
 469         WebLocalFrameImpl* currentFrame = m_frames[i];
 470         // Get current using document.
 471         Document* currentDoc = currentFrame->frame()->document();
 472         // Go through sub-frames.
 473         RefPtrWillBeRawPtr<HTMLAllCollection> all = currentDoc->all();
 474
 475         for (unsigned i = 0; Element* element = all->item(i); ++i) {
 476             if (!element->isHTMLElement())
 477                 continue;
 478             WebLocalFrameImpl* webFrame =
 479                 WebLocalFrameImpl::fromFrameOwnerElement(element);
 480             if (webFrame)
 481                 m_frames.append(webFrame);
 482         }
 483     }
 484 }
 485
 486 bool WebPageSerializerImpl::serialize()
 487 {
 488     if (!m_framesCollected)
 489         collectTargetFrames();
 490
 491     bool didSerialization = false;
 492     KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url();
 493
 494     for (unsigned i = 0; i < m_frames.size(); ++i) {
 495         WebLocalFrameImpl* webFrame = m_frames[i];
 496         Document* document = webFrame->frame()->document();
 497         const KURL& url = document->url();
 498
 499         if (!url.isValid() || !m_localLinks.contains(url.string()))
 500             continue;
 501
 502         didSerialization = true;
 503
 504         const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding();
 505         String directoryName = url == mainURL ? m_localDirectoryName : "";
 506
 507         SerializeDomParam param(url, textEncoding, document, directoryName);
 508
 509         Element* documentElement = document->documentElement();
 510         if (documentElement)
 511             buildContentForNode(documentElement, &param);
 512
 513         encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush);
 514     }
 515
 516     ASSERT(m_dataBuffer.isEmpty());
 517     m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
 518     return didSerialization;
 519 }
 520
 521 }  // namespace blink