src/third_party/WebKit/Source/core/html/parser/BackgroundHTMLParser.cpp

   1 /*
   2  * Copyright (C) 2013 Google, Inc. All Rights Reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GOOGLE INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #include "config.h"
  27 #include "core/html/parser/BackgroundHTMLParser.h"
  28
  29 #include "core/html/parser/HTMLDocumentParser.h"
  30 #include "core/html/parser/TextResourceDecoder.h"
  31 #include "core/html/parser/XSSAuditor.h"
  32 #include "wtf/MainThread.h"
  33 #include "wtf/text/TextPosition.h"
  34
  35 namespace blink {
  36
  37 // On a network with high latency and high bandwidth, using a device
  38 // with a fast CPU, we could end up speculatively tokenizing
  39 // the whole document, well ahead of when the main-thread actually needs it.
  40 // This is a waste of memory (and potentially time if the speculation fails).
  41 // So we limit our outstanding tokens arbitrarily to 10,000.
  42 // Our maximal memory spent speculating will be approximately:
  43 // (outstandingTokenLimit + pendingTokenLimit) * sizeof(CompactToken)
  44 // We use a separate low and high water mark to avoid constantly topping
  45 // off the main thread's token buffer.
  46 // At time of writing, this is (10000 + 1000) * 28 bytes = ~308kb of memory.
  47 // These numbers have not been tuned.
  48 static const size_t outstandingTokenLimit = 10000;
  49
  50 // We limit our chucks to 1000 tokens, to make sure the main
  51 // thread is never waiting on the parser thread for tokens.
  52 // This was tuned in https://bugs.webkit.org/show_bug.cgi?id=110408.
  53 static const size_t pendingTokenLimit = 1000;
  54
  55 using namespace HTMLNames;
  56
  57 #if ENABLE(ASSERT)
  58
  59 static void checkThatTokensAreSafeToSendToAnotherThread(const CompactHTMLTokenStream* tokens)
  60 {
  61     for (size_t i = 0; i < tokens->size(); ++i)
  62         ASSERT(tokens->at(i).isSafeToSendToAnotherThread());
  63 }
  64
  65 static void checkThatPreloadsAreSafeToSendToAnotherThread(const PreloadRequestStream& preloads)
  66 {
  67     for (size_t i = 0; i < preloads.size(); ++i)
  68         ASSERT(preloads[i]->isSafeToSendToAnotherThread());
  69 }
  70
  71 static void checkThatXSSInfosAreSafeToSendToAnotherThread(const XSSInfoStream& infos)
  72 {
  73     for (size_t i = 0; i < infos.size(); ++i)
  74         ASSERT(infos[i]->isSafeToSendToAnotherThread());
  75 }
  76
  77 #endif
  78
  79 void BackgroundHTMLParser::start(PassRefPtr<WeakReference<BackgroundHTMLParser> > reference, PassOwnPtr<Configuration> config)
  80 {
  81     new BackgroundHTMLParser(reference, config);
  82     // Caller must free by calling stop().
  83 }
  84
  85 BackgroundHTMLParser::BackgroundHTMLParser(PassRefPtr<WeakReference<BackgroundHTMLParser> > reference, PassOwnPtr<Configuration> config)
  86     : m_weakFactory(reference, this)
  87     , m_token(adoptPtr(new HTMLToken))
  88     , m_tokenizer(HTMLTokenizer::create(config->options))
  89     , m_treeBuilderSimulator(config->options)
  90     , m_options(config->options)
  91     , m_parser(config->parser)
  92     , m_pendingTokens(adoptPtr(new CompactHTMLTokenStream))
  93     , m_xssAuditor(config->xssAuditor.release())
  94     , m_preloadScanner(config->preloadScanner.release())
  95     , m_decoder(config->decoder.release())
  96 {
  97 }
  98
  99 BackgroundHTMLParser::~BackgroundHTMLParser()
 100 {
 101 }
 102
 103 void BackgroundHTMLParser::appendRawBytesFromParserThread(const char* data, int dataLength)
 104 {
 105     ASSERT(m_decoder);
 106     updateDocument(m_decoder->decode(data, dataLength));
 107 }
 108
 109 void BackgroundHTMLParser::appendRawBytesFromMainThread(PassOwnPtr<Vector<char> > buffer)
 110 {
 111     ASSERT(m_decoder);
 112     updateDocument(m_decoder->decode(buffer->data(), buffer->size()));
 113 }
 114
 115 void BackgroundHTMLParser::appendDecodedBytes(const String& input)
 116 {
 117     ASSERT(!m_input.current().isClosed());
 118     m_input.append(input);
 119     pumpTokenizer();
 120 }
 121
 122 void BackgroundHTMLParser::setDecoder(PassOwnPtr<TextResourceDecoder> decoder)
 123 {
 124     ASSERT(decoder);
 125     m_decoder = decoder;
 126 }
 127
 128 void BackgroundHTMLParser::flush()
 129 {
 130     ASSERT(m_decoder);
 131     updateDocument(m_decoder->flush());
 132 }
 133
 134 void BackgroundHTMLParser::updateDocument(const String& decodedData)
 135 {
 136     DocumentEncodingData encodingData(*m_decoder.get());
 137
 138     if (encodingData != m_lastSeenEncodingData) {
 139         m_lastSeenEncodingData = encodingData;
 140
 141         m_xssAuditor->setEncoding(encodingData.encoding());
 142         callOnMainThread(bind(&HTMLDocumentParser::didReceiveEncodingDataFromBackgroundParser, m_parser, encodingData));
 143     }
 144
 145     if (decodedData.isEmpty())
 146         return;
 147
 148     appendDecodedBytes(decodedData);
 149 }
 150
 151 void BackgroundHTMLParser::resumeFrom(PassOwnPtr<Checkpoint> checkpoint)
 152 {
 153     m_parser = checkpoint->parser;
 154     m_token = checkpoint->token.release();
 155     m_tokenizer = checkpoint->tokenizer.release();
 156     m_treeBuilderSimulator.setState(checkpoint->treeBuilderState);
 157     m_input.rewindTo(checkpoint->inputCheckpoint, checkpoint->unparsedInput);
 158     m_preloadScanner->rewindTo(checkpoint->preloadScannerCheckpoint);
 159     pumpTokenizer();
 160 }
 161
 162 void BackgroundHTMLParser::startedChunkWithCheckpoint(HTMLInputCheckpoint inputCheckpoint)
 163 {
 164     // Note, we should not have to worry about the index being invalid
 165     // as messages from the main thread will be processed in FIFO order.
 166     m_input.invalidateCheckpointsBefore(inputCheckpoint);
 167     pumpTokenizer();
 168 }
 169
 170 void BackgroundHTMLParser::finish()
 171 {
 172     markEndOfFile();
 173     pumpTokenizer();
 174 }
 175
 176 void BackgroundHTMLParser::stop()
 177 {
 178     delete this;
 179 }
 180
 181 void BackgroundHTMLParser::forcePlaintextForTextDocument()
 182 {
 183     // This is only used by the TextDocumentParser (a subclass of HTMLDocumentParser)
 184     // to force us into the PLAINTEXT state w/o using a <plaintext> tag.
 185     // The TextDocumentParser uses a <pre> tag for historical/compatibility reasons.
 186     m_tokenizer->setState(HTMLTokenizer::PLAINTEXTState);
 187 }
 188
 189 void BackgroundHTMLParser::markEndOfFile()
 190 {
 191     ASSERT(!m_input.current().isClosed());
 192     m_input.append(String(&kEndOfFileMarker, 1));
 193     m_input.close();
 194 }
 195
 196 void BackgroundHTMLParser::pumpTokenizer()
 197 {
 198     // No need to start speculating until the main thread has almost caught up.
 199     if (m_input.totalCheckpointTokenCount() > outstandingTokenLimit)
 200         return;
 201
 202     while (true) {
 203         m_sourceTracker.start(m_input.current(), m_tokenizer.get(), *m_token);
 204         if (!m_tokenizer->nextToken(m_input.current(), *m_token)) {
 205             // We've reached the end of our current input.
 206             sendTokensToMainThread();
 207             break;
 208         }
 209         m_sourceTracker.end(m_input.current(), m_tokenizer.get(), *m_token);
 210
 211         {
 212             TextPosition position = TextPosition(m_input.current().currentLine(), m_input.current().currentColumn());
 213
 214             if (OwnPtr<XSSInfo> xssInfo = m_xssAuditor->filterToken(FilterTokenRequest(*m_token, m_sourceTracker, m_tokenizer->shouldAllowCDATA()))) {
 215                 xssInfo->m_textPosition = position;
 216                 m_pendingXSSInfos.append(xssInfo.release());
 217             }
 218
 219             CompactHTMLToken token(m_token.get(), TextPosition(m_input.current().currentLine(), m_input.current().currentColumn()));
 220
 221             m_preloadScanner->scan(token, m_input.current(), m_pendingPreloads);
 222
 223             m_pendingTokens->append(token);
 224         }
 225
 226         m_token->clear();
 227
 228         if (!m_treeBuilderSimulator.simulate(m_pendingTokens->last(), m_tokenizer.get()) || m_pendingTokens->size() >= pendingTokenLimit) {
 229             sendTokensToMainThread();
 230             // If we're far ahead of the main thread, yield for a bit to avoid consuming too much memory.
 231             if (m_input.totalCheckpointTokenCount() > outstandingTokenLimit)
 232                 break;
 233         }
 234     }
 235 }
 236
 237 void BackgroundHTMLParser::sendTokensToMainThread()
 238 {
 239     if (m_pendingTokens->isEmpty())
 240         return;
 241
 242 #if ENABLE(ASSERT)
 243     checkThatTokensAreSafeToSendToAnotherThread(m_pendingTokens.get());
 244     checkThatPreloadsAreSafeToSendToAnotherThread(m_pendingPreloads);
 245     checkThatXSSInfosAreSafeToSendToAnotherThread(m_pendingXSSInfos);
 246 #endif
 247
 248     OwnPtr<HTMLDocumentParser::ParsedChunk> chunk = adoptPtr(new HTMLDocumentParser::ParsedChunk);
 249     chunk->preloads.swap(m_pendingPreloads);
 250     chunk->xssInfos.swap(m_pendingXSSInfos);
 251     chunk->tokenizerState = m_tokenizer->state();
 252     chunk->treeBuilderState = m_treeBuilderSimulator.state();
 253     chunk->inputCheckpoint = m_input.createCheckpoint(m_pendingTokens->size());
 254     chunk->preloadScannerCheckpoint = m_preloadScanner->createCheckpoint();
 255     chunk->tokens = m_pendingTokens.release();
 256     callOnMainThread(bind(&HTMLDocumentParser::didReceiveParsedChunkFromBackgroundParser, m_parser, chunk.release()));
 257
 258     m_pendingTokens = adoptPtr(new CompactHTMLTokenStream);
 259 }
 260
 261 }