src/third_party/WebKit/Source/core/html/parser/HTMLMetaCharsetParser.cpp

   1 /*
   2  * Copyright (C) 2010 Google Inc. All Rights Reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #include "config.h"
  27 #include "core/html/parser/HTMLMetaCharsetParser.h"
  28
  29 #include "core/HTMLNames.h"
  30 #include "core/html/parser/HTMLParserIdioms.h"
  31 #include "core/html/parser/HTMLParserOptions.h"
  32 #include "core/html/parser/HTMLTokenizer.h"
  33 #include "wtf/text/TextEncodingRegistry.h"
  34 #include "wtf/text/WTFString.h"
  35
  36 using namespace WTF;
  37
  38 namespace blink {
  39
  40 using namespace HTMLNames;
  41
  42 HTMLMetaCharsetParser::HTMLMetaCharsetParser()
  43     : m_tokenizer(HTMLTokenizer::create(HTMLParserOptions(0)))
  44     , m_assumedCodec(newTextCodec(Latin1Encoding()))
  45     , m_inHeadSection(true)
  46     , m_doneChecking(false)
  47 {
  48 }
  49
  50 HTMLMetaCharsetParser::~HTMLMetaCharsetParser()
  51 {
  52 }
  53
  54 bool HTMLMetaCharsetParser::processMeta()
  55 {
  56     const HTMLToken::AttributeList& tokenAttributes = m_token.attributes();
  57     HTMLAttributeList attributes;
  58     for (const HTMLToken::Attribute& tokenAttribute : tokenAttributes) {
  59         String attributeName = attemptStaticStringCreation(tokenAttribute.name, Likely8Bit);
  60         String attributeValue = StringImpl::create8BitIfPossible(tokenAttribute.value);
  61         attributes.append(std::make_pair(attributeName, attributeValue));
  62     }
  63
  64     m_encoding = encodingFromMetaAttributes(attributes);
  65     return m_encoding.isValid();
  66 }
  67
  68 static const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over.
  69
  70 bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length)
  71 {
  72     if (m_doneChecking)
  73         return true;
  74
  75     ASSERT(!m_encoding.isValid());
  76
  77     // We still don't have an encoding, and are in the head.
  78     // The following tags are allowed in <head>:
  79     // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
  80
  81     // We stop scanning when a tag that is not permitted in <head>
  82     // is seen, rather when </head> is seen, because that more closely
  83     // matches behavior in other browsers; more details in
  84     // <http://bugs.webkit.org/show_bug.cgi?id=3590>.
  85
  86     // Additionally, we ignore things that looks like tags in <title>, <script>
  87     // and <noscript>; see <http://bugs.webkit.org/show_bug.cgi?id=4560>,
  88     // <http://bugs.webkit.org/show_bug.cgi?id=12165> and
  89     // <http://bugs.webkit.org/show_bug.cgi?id=12389>.
  90
  91     // Since many sites have charset declarations after <body> or other tags
  92     // that are disallowed in <head>, we don't bail out until we've checked at
  93     // least bytesToCheckUnconditionally bytes of input.
  94
  95     m_input.append(SegmentedString(m_assumedCodec->decode(data, length)));
  96
  97     while (m_tokenizer->nextToken(m_input, m_token)) {
  98         bool end = m_token.type() == HTMLToken::EndTag;
  99         if (end || m_token.type() == HTMLToken::StartTag) {
 100             String tagName = attemptStaticStringCreation(m_token.name(), Likely8Bit);
 101             if (!end) {
 102                 m_tokenizer->updateStateFor(tagName);
 103                 if (threadSafeMatch(tagName, metaTag) && processMeta()) {
 104                     m_doneChecking = true;
 105                     return true;
 106                 }
 107             }
 108
 109             if (!threadSafeMatch(tagName, scriptTag) && !threadSafeMatch(tagName, noscriptTag)
 110                 && !threadSafeMatch(tagName, styleTag) && !threadSafeMatch(tagName, linkTag)
 111                 && !threadSafeMatch(tagName, metaTag) && !threadSafeMatch(tagName, objectTag)
 112                 && !threadSafeMatch(tagName, titleTag) && !threadSafeMatch(tagName, baseTag)
 113                 && (end || !threadSafeMatch(tagName, htmlTag)) && (end || !threadSafeMatch(tagName, headTag))) {
 114                 m_inHeadSection = false;
 115             }
 116         }
 117
 118         if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToCheckUnconditionally) {
 119             m_doneChecking = true;
 120             return true;
 121         }
 122
 123         m_token.clear();
 124     }
 125
 126     return false;
 127 }
 128
 129 }