src/third_party/WebKit/Source/core/html/parser/HTMLToken.h

   1 /*
   2  * Copyright (C) 2013 Google, Inc. All Rights Reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #ifndef HTMLToken_h
  27 #define HTMLToken_h
  28
  29 #include "core/dom/Attribute.h"
  30 #include "wtf/PassOwnPtr.h"
  31 #include "wtf/RefCounted.h"
  32 #include "wtf/RefPtr.h"
  33
  34 namespace blink {
  35
  36 class DoctypeData {
  37     WTF_MAKE_NONCOPYABLE(DoctypeData);
  38 public:
  39     DoctypeData()
  40         : m_hasPublicIdentifier(false)
  41         , m_hasSystemIdentifier(false)
  42         , m_forceQuirks(false)
  43     {
  44     }
  45
  46     bool m_hasPublicIdentifier;
  47     bool m_hasSystemIdentifier;
  48     WTF::Vector<UChar> m_publicIdentifier;
  49     WTF::Vector<UChar> m_systemIdentifier;
  50     bool m_forceQuirks;
  51 };
  52
  53 static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name)
  54 {
  55     for (unsigned i = 0; i < attributes.size(); ++i) {
  56         if (attributes.at(i).name().matches(name))
  57             return &attributes.at(i);
  58     }
  59     return 0;
  60 }
  61
  62 class HTMLToken {
  63     WTF_MAKE_NONCOPYABLE(HTMLToken);
  64     WTF_MAKE_FAST_ALLOCATED;
  65 public:
  66     enum Type {
  67         Uninitialized,
  68         DOCTYPE,
  69         StartTag,
  70         EndTag,
  71         Comment,
  72         Character,
  73         EndOfFile,
  74     };
  75
  76     class Attribute {
  77     public:
  78         class Range {
  79         public:
  80             int start;
  81             int end;
  82         };
  83
  84         Range nameRange;
  85         Range valueRange;
  86         Vector<UChar, 32> name;
  87         Vector<UChar, 32> value;
  88     };
  89
  90     typedef Vector<Attribute, 10> AttributeList;
  91
  92     // By using an inline capacity of 256, we avoid spilling over into an malloced buffer
  93     // approximately 99% of the time based on a non-scientific browse around a number of
  94     // popular web sites on 23 May 2013.
  95     typedef Vector<UChar, 256> DataVector;
  96
  97     HTMLToken() { clear(); }
  98
  99     void clear()
 100     {
 101         m_type = Uninitialized;
 102         m_range.start = 0;
 103         m_range.end = 0;
 104         m_baseOffset = 0;
 105         // Don't call Vector::clear() as that would destroy the
 106         // alloced VectorBuffer. If the innerHTML'd content has
 107         // two 257 character text nodes in a row, we'll needlessly
 108         // thrash malloc. When we finally finish the parse the
 109         // HTMLToken will be destroyed and the VectorBuffer released.
 110         m_data.shrink(0);
 111         m_orAllData = 0;
 112     }
 113
 114     bool isUninitialized() { return m_type == Uninitialized; }
 115     Type type() const { return m_type; }
 116
 117     void makeEndOfFile()
 118     {
 119         ASSERT(m_type == Uninitialized);
 120         m_type = EndOfFile;
 121     }
 122
 123     /* Range and offset methods exposed for HTMLSourceTracker and HTMLViewSourceParser */
 124     int startIndex() const { return m_range.start; }
 125     int endIndex() const { return m_range.end; }
 126
 127     void setBaseOffset(int offset)
 128     {
 129         m_baseOffset = offset;
 130     }
 131
 132     void end(int endOffset)
 133     {
 134         m_range.end = endOffset - m_baseOffset;
 135     }
 136
 137     const DataVector& data() const
 138     {
 139         ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
 140         return m_data;
 141     }
 142
 143     bool isAll8BitData() const
 144     {
 145         return (m_orAllData <= 0xff);
 146     }
 147
 148     const DataVector& name() const
 149     {
 150         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
 151         return m_data;
 152     }
 153
 154     void appendToName(UChar character)
 155     {
 156         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
 157         ASSERT(character);
 158         m_data.append(character);
 159         m_orAllData |= character;
 160     }
 161
 162     /* DOCTYPE Tokens */
 163
 164     bool forceQuirks() const
 165     {
 166         ASSERT(m_type == DOCTYPE);
 167         return m_doctypeData->m_forceQuirks;
 168     }
 169
 170     void setForceQuirks()
 171     {
 172         ASSERT(m_type == DOCTYPE);
 173         m_doctypeData->m_forceQuirks = true;
 174     }
 175
 176     void beginDOCTYPE()
 177     {
 178         ASSERT(m_type == Uninitialized);
 179         m_type = DOCTYPE;
 180         m_doctypeData = adoptPtr(new DoctypeData);
 181     }
 182
 183     void beginDOCTYPE(UChar character)
 184     {
 185         ASSERT(character);
 186         beginDOCTYPE();
 187         m_data.append(character);
 188         m_orAllData |= character;
 189     }
 190
 191     // FIXME: Distinguish between a missing public identifer and an empty one.
 192     const WTF::Vector<UChar>& publicIdentifier() const
 193     {
 194         ASSERT(m_type == DOCTYPE);
 195         return m_doctypeData->m_publicIdentifier;
 196     }
 197
 198     // FIXME: Distinguish between a missing system identifer and an empty one.
 199     const WTF::Vector<UChar>& systemIdentifier() const
 200     {
 201         ASSERT(m_type == DOCTYPE);
 202         return m_doctypeData->m_systemIdentifier;
 203     }
 204
 205     void setPublicIdentifierToEmptyString()
 206     {
 207         ASSERT(m_type == DOCTYPE);
 208         m_doctypeData->m_hasPublicIdentifier = true;
 209         m_doctypeData->m_publicIdentifier.clear();
 210     }
 211
 212     void setSystemIdentifierToEmptyString()
 213     {
 214         ASSERT(m_type == DOCTYPE);
 215         m_doctypeData->m_hasSystemIdentifier = true;
 216         m_doctypeData->m_systemIdentifier.clear();
 217     }
 218
 219     void appendToPublicIdentifier(UChar character)
 220     {
 221         ASSERT(character);
 222         ASSERT(m_type == DOCTYPE);
 223         ASSERT(m_doctypeData->m_hasPublicIdentifier);
 224         m_doctypeData->m_publicIdentifier.append(character);
 225     }
 226
 227     void appendToSystemIdentifier(UChar character)
 228     {
 229         ASSERT(character);
 230         ASSERT(m_type == DOCTYPE);
 231         ASSERT(m_doctypeData->m_hasSystemIdentifier);
 232         m_doctypeData->m_systemIdentifier.append(character);
 233     }
 234
 235     PassOwnPtr<DoctypeData> releaseDoctypeData()
 236     {
 237         return m_doctypeData.release();
 238     }
 239
 240     /* Start/End Tag Tokens */
 241
 242     bool selfClosing() const
 243     {
 244         ASSERT(m_type == StartTag || m_type == EndTag);
 245         return m_selfClosing;
 246     }
 247
 248     void setSelfClosing()
 249     {
 250         ASSERT(m_type == StartTag || m_type == EndTag);
 251         m_selfClosing = true;
 252     }
 253
 254     void beginStartTag(UChar character)
 255     {
 256         ASSERT(character);
 257         ASSERT(m_type == Uninitialized);
 258         m_type = StartTag;
 259         m_selfClosing = false;
 260         m_currentAttribute = 0;
 261         m_attributes.clear();
 262
 263         m_data.append(character);
 264         m_orAllData |= character;
 265     }
 266
 267     void beginEndTag(LChar character)
 268     {
 269         ASSERT(m_type == Uninitialized);
 270         m_type = EndTag;
 271         m_selfClosing = false;
 272         m_currentAttribute = 0;
 273         m_attributes.clear();
 274
 275         m_data.append(character);
 276     }
 277
 278     void beginEndTag(const Vector<LChar, 32>& characters)
 279     {
 280         ASSERT(m_type == Uninitialized);
 281         m_type = EndTag;
 282         m_selfClosing = false;
 283         m_currentAttribute = 0;
 284         m_attributes.clear();
 285
 286         m_data.appendVector(characters);
 287     }
 288
 289     void addNewAttribute()
 290     {
 291         ASSERT(m_type == StartTag || m_type == EndTag);
 292         m_attributes.grow(m_attributes.size() + 1);
 293         m_currentAttribute = &m_attributes.last();
 294 #if ENABLE(ASSERT)
 295         m_currentAttribute->nameRange.start = 0;
 296         m_currentAttribute->nameRange.end = 0;
 297         m_currentAttribute->valueRange.start = 0;
 298         m_currentAttribute->valueRange.end = 0;
 299 #endif
 300     }
 301
 302     void beginAttributeName(int offset)
 303     {
 304         m_currentAttribute->nameRange.start = offset - m_baseOffset;
 305     }
 306
 307     void endAttributeName(int offset)
 308     {
 309         int index = offset - m_baseOffset;
 310         m_currentAttribute->nameRange.end = index;
 311         m_currentAttribute->valueRange.start = index;
 312         m_currentAttribute->valueRange.end = index;
 313     }
 314
 315     void beginAttributeValue(int offset)
 316     {
 317         m_currentAttribute->valueRange.start = offset - m_baseOffset;
 318 #if ENABLE(ASSERT)
 319         m_currentAttribute->valueRange.end = 0;
 320 #endif
 321     }
 322
 323     void endAttributeValue(int offset)
 324     {
 325         m_currentAttribute->valueRange.end = offset - m_baseOffset;
 326     }
 327
 328     void appendToAttributeName(UChar character)
 329     {
 330         ASSERT(character);
 331         ASSERT(m_type == StartTag || m_type == EndTag);
 332         ASSERT(m_currentAttribute->nameRange.start);
 333         m_currentAttribute->name.append(character);
 334     }
 335
 336     void appendToAttributeValue(UChar character)
 337     {
 338         ASSERT(character);
 339         ASSERT(m_type == StartTag || m_type == EndTag);
 340         ASSERT(m_currentAttribute->valueRange.start);
 341         m_currentAttribute->value.append(character);
 342     }
 343
 344     void appendToAttributeValue(size_t i, const String& value)
 345     {
 346         ASSERT(!value.isEmpty());
 347         ASSERT(m_type == StartTag || m_type == EndTag);
 348         append(m_attributes[i].value, value);
 349     }
 350
 351     const AttributeList& attributes() const
 352     {
 353         ASSERT(m_type == StartTag || m_type == EndTag);
 354         return m_attributes;
 355     }
 356
 357     const Attribute* getAttributeItem(const QualifiedName& name) const
 358     {
 359         for (unsigned i = 0; i < m_attributes.size(); ++i) {
 360             if (AtomicString(m_attributes.at(i).name) == name.localName())
 361                 return &m_attributes.at(i);
 362         }
 363         return 0;
 364     }
 365
 366     // Used by the XSSAuditor to nuke XSS-laden attributes.
 367     void eraseValueOfAttribute(size_t i)
 368     {
 369         ASSERT(m_type == StartTag || m_type == EndTag);
 370         m_attributes[i].value.clear();
 371     }
 372
 373     /* Character Tokens */
 374
 375     // Starting a character token works slightly differently than starting
 376     // other types of tokens because we want to save a per-character branch.
 377     void ensureIsCharacterToken()
 378     {
 379         ASSERT(m_type == Uninitialized || m_type == Character);
 380         m_type = Character;
 381     }
 382
 383     const DataVector& characters() const
 384     {
 385         ASSERT(m_type == Character);
 386         return m_data;
 387     }
 388
 389     void appendToCharacter(char character)
 390     {
 391         ASSERT(m_type == Character);
 392         m_data.append(character);
 393     }
 394
 395     void appendToCharacter(UChar character)
 396     {
 397         ASSERT(m_type == Character);
 398         m_data.append(character);
 399         m_orAllData |= character;
 400     }
 401
 402     void appendToCharacter(const Vector<LChar, 32>& characters)
 403     {
 404         ASSERT(m_type == Character);
 405         m_data.appendVector(characters);
 406     }
 407
 408     /* Comment Tokens */
 409
 410     const DataVector& comment() const
 411     {
 412         ASSERT(m_type == Comment);
 413         return m_data;
 414     }
 415
 416     void beginComment()
 417     {
 418         ASSERT(m_type == Uninitialized);
 419         m_type = Comment;
 420     }
 421
 422     void appendToComment(UChar character)
 423     {
 424         ASSERT(character);
 425         ASSERT(m_type == Comment);
 426         m_data.append(character);
 427         m_orAllData |= character;
 428     }
 429
 430     // Only for XSSAuditor
 431     void eraseCharacters()
 432     {
 433         ASSERT(m_type == Character);
 434         m_data.clear();
 435         m_orAllData = 0;
 436     }
 437
 438 private:
 439     Type m_type;
 440     Attribute::Range m_range; // Always starts at zero.
 441     int m_baseOffset;
 442     DataVector m_data;
 443     UChar m_orAllData;
 444
 445     // For StartTag and EndTag
 446     bool m_selfClosing;
 447     AttributeList m_attributes;
 448
 449     // A pointer into m_attributes used during lexing.
 450     Attribute* m_currentAttribute;
 451
 452     // For DOCTYPE
 453     OwnPtr<DoctypeData> m_doctypeData;
 454 };
 455
 456 }
 457
 458 #endif