src/third_party/WebKit/Source/core/html/parser/HTMLEntityParser.cpp

   1 /*
   2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
   3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
   4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26  */
  27
  28 #include "config.h"
  29 #include "core/html/parser/HTMLEntityParser.h"
  30
  31 #include "core/html/parser/HTMLEntitySearch.h"
  32 #include "core/html/parser/HTMLEntityTable.h"
  33 #include "wtf/text/StringBuilder.h"
  34
  35 using namespace WTF;
  36
  37 namespace blink {
  38
   // Replacement code points for numeric character references whose value lies
   // in 0x80-0x9F. adjustEntity() indexes this table with (value - 0x80); the
   // trailing comment on each row gives the input range that row covers.
   39 static const UChar windowsLatin1ExtensionArray[32] = {
   40     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
   41     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
   42     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
   43     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
   44 };
  45
  46 static bool isAlphaNumeric(UChar cc)
  47 {
  48     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
  49 }
  50
  51 static UChar adjustEntity(UChar32 value)
  52 {
  53     if ((value & ~0x1F) != 0x0080)
  54         return value;
  55     return windowsLatin1ExtensionArray[value - 0x80];
  56 }
  57
  58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
  59 {
  60     // FIXME: A number of specific entity values generate parse errors.
  61     if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
  62         decodedEntity.append(0xFFFD);
  63         return;
  64     }
  65     if (U_IS_BMP(c)) {
  66         decodedEntity.append(adjustEntity(c));
  67         return;
  68     }
  69     decodedEntity.append(c);
  70 }
  71
   // Sentinel assigned to the accumulated numeric value in consumeHTMLEntity()
   // once it exceeds UCHAR_MAX_VALUE. Being negative, it is turned into U+FFFD
   // by appendLegalEntityFor().
   72 static const UChar32 kInvalidUnicode = -1;
  73
  74 static bool isHexDigit(UChar cc)
  75 {
  76     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
  77 }
  78
  79 static UChar asHexDigit(UChar cc)
  80 {
  81     if (cc >= '0' && cc <= '9')
  82       return cc - '0';
  83     if (cc >= 'a' && cc <= 'z')
  84       return 10 + cc - 'a';
  85     if (cc >= 'A' && cc <= 'Z')
  86       return 10 + cc - 'A';
  87     ASSERT_NOT_REACHED();
  88     return 0;
  89 }
  90
   // Holds the characters read from the source while a character reference is
   // being parsed, so that unconsumeCharacters() can hand them back.
   // NOTE(review): 64 is Vector's second template argument, presumably its
   // inline capacity - confirm against wtf/Vector.h.
   91 typedef Vector<UChar, 64> ConsumedCharacterBuffer;
  92
  93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)
  94 {
  95     if (consumedCharacters.size() == 1)
  96         source.push(consumedCharacters[0]);
  97     else if (consumedCharacters.size() == 2) {
  98         source.push(consumedCharacters[0]);
  99         source.push(consumedCharacters[1]);
 100     } else
 101         source.prepend(SegmentedString(String(consumedCharacters)));
 102 }
 103
  // Tries to consume a named character reference from source.
  //
  // On success, appends the matched entity's firstValue (and secondValue when it
  // is non-zero) to decodedEntity and returns true. On failure, hands the
  // consumed characters back with unconsumeCharacters() and returns false.
  //
  // notEnoughCharacters is set to whether source became empty while matching; in
  // that case no decision is made and false is returned.
  // additionalAllowedCharacter, when non-zero, causes a match that does not end
  // in ';' to be rejected if the character following it is alphanumeric or '='.
  // cc is overwritten with each character read from source.
  104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
  105 {
  106     ConsumedCharacterBuffer consumedCharacters;
  107     HTMLEntitySearch entitySearch;
          // Feed characters to the search until the text read so far is no longer
          // a prefix of any entity. The character that ends the prefix is left in
          // source (it is neither recorded nor advanced past).
  108     while (!source.isEmpty()) {
  109         cc = source.currentChar();
  110         entitySearch.advance(cc);
  111         if (!entitySearch.isEntityPrefix())
  112             break;
  113         consumedCharacters.append(cc);
  114         source.advanceAndASSERT(cc);
  115     }
  116     notEnoughCharacters = source.isEmpty();
  117     if (notEnoughCharacters) {
  118         // We can't decide on an entity because there might be a longer entity
  119         // that we could match if we had more data.
  120         unconsumeCharacters(source, consumedCharacters);
  121         return false;
  122     }
          // No entity matched at any point along the prefix.
  123     if (!entitySearch.mostRecentMatch()) {
  124         unconsumeCharacters(source, consumedCharacters);
  125         return false;
  126     }
  127     if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
  128         // We've consumed too many characters. We need to walk the
  129         // source back to the point at which we had consumed an
  130         // actual entity.
  131         unconsumeCharacters(source, consumedCharacters);
  132         consumedCharacters.clear();
  133         const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch();
  134         const int length = mostRecent->length;
  135         const LChar* reference = HTMLEntityTable::entityString(*mostRecent);
              // Re-consume exactly the matched entity's characters, checking (in
              // assert-enabled builds) that they agree with the table's string.
  136         for (int i = 0; i < length; ++i) {
  137             cc = source.currentChar();
  138             ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++));
  139             consumedCharacters.append(cc);
  140             source.advanceAndASSERT(cc);
  141             ASSERT(!source.isEmpty());
  142         }
              // cc now holds the character immediately after the matched entity.
  143         cc = source.currentChar();
  144     }
          // Accept the match when it ends in ';', when there is no additional
          // allowed character, or when the next character is neither alphanumeric
          // nor '='.
  145     if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
  146         || !additionalAllowedCharacter
  147         || !(isAlphaNumeric(cc) || cc == '=')) {
  148         decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
  149         if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
  150             decodedEntity.append(second);
  151         return true;
  152     }
  153     unconsumeCharacters(source, consumedCharacters);
  154     return false;
  155 }
 156
  // Attempts to decode one character reference from source.
  // NOTE(review): presumably source is positioned just after the '&' - confirm
  // against callers.
  //
  // Preconditions (asserted): additionalAllowedCharacter is 0, '"', '\'' or '>';
  // notEnoughCharacters is false; decodedEntity is empty.
  //
  // Returns true when a reference was decoded into decodedEntity. Returns false
  // otherwise. When source is exhausted before a decision can be made in this
  // function's own loop, notEnoughCharacters is set to true and the characters
  // consumed so far are handed back to source. Named references are delegated to
  // consumeNamedEntity(), which sets notEnoughCharacters itself.
  157 bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
  158 {
  159     ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
  160     ASSERT(!notEnoughCharacters);
  161     ASSERT(decodedEntity.isEmpty());
  162
  163     enum EntityState {
  164         Initial,
  165         Number,
  166         MaybeHexLowerCaseX,
  167         MaybeHexUpperCaseX,
  168         Hex,
  169         Decimal,
  170         Named
  171     };
  172     EntityState entityState = Initial;
          // Accumulated value of a numeric reference; becomes kInvalidUnicode once
          // it exceeds UCHAR_MAX_VALUE.
  173     UChar32 result = 0;
  174     ConsumedCharacterBuffer consumedCharacters;
  175
          // Within the switch, `break` falls to the bottom of the loop, which
          // records cc and advances source; `continue` re-enters the loop on the
          // same cc in the newly selected state without consuming it.
  176     while (!source.isEmpty()) {
  177         UChar cc = source.currentChar();
  178         switch (entityState) {
  179         case Initial: {
                  // Nothing has been consumed yet, so these early returns leave
                  // source untouched.
  180             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
  181                 return false;
  182             if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
  183                 return false;
  184             if (cc == '#') {
  185                 entityState = Number;
  186                 break;
  187             }
  188             if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
  189                 entityState = Named;
  190                 continue;
  191             }
  192             return false;
  193         }
  194         case Number: {
  195             if (cc == 'x') {
  196                 entityState = MaybeHexLowerCaseX;
  197                 break;
  198             }
  199             if (cc == 'X') {
  200                 entityState = MaybeHexUpperCaseX;
  201                 break;
  202             }
  203             if (cc >= '0' && cc <= '9') {
  204                 entityState = Decimal;
  205                 continue;
  206             }
                  // Not a numeric reference: give back the '#' consumed earlier.
  207             source.push('#');
  208             return false;
  209         }
  210         case MaybeHexLowerCaseX: {
  211             if (isHexDigit(cc)) {
  212                 entityState = Hex;
  213                 continue;
  214             }
                  // No hex digit after "#x": give back both consumed characters.
  215             source.push('#');
  216             source.push('x');
  217             return false;
  218         }
  219         case MaybeHexUpperCaseX: {
  220             if (isHexDigit(cc)) {
  221                 entityState = Hex;
  222                 continue;
  223             }
                  // No hex digit after "#X": give back both consumed characters.
  224             source.push('#');
  225             source.push('X');
  226             return false;
  227         }
  228         case Hex: {
  229             if (isHexDigit(cc)) {
                      // Stop accumulating once the value has been marked invalid.
  230                 if (result != kInvalidUnicode)
  231                     result = result * 16 + asHexDigit(cc);
  232             } else if (cc == ';') {
  233                 source.advanceAndASSERT(cc);
  234                 appendLegalEntityFor(result, decodedEntity);
  235                 return true;
  236             } else {
                      // Reference ended without ';'; the terminating character is
                      // left in source.
  237                 appendLegalEntityFor(result, decodedEntity);
  238                 return true;
  239             }
  240             break;
  241         }
  242         case Decimal: {
  243             if (cc >= '0' && cc <= '9') {
                      // Stop accumulating once the value has been marked invalid.
  244                 if (result != kInvalidUnicode)
  245                     result = result * 10 + cc - '0';
  246             } else if (cc == ';') {
  247                 source.advanceAndASSERT(cc);
  248                 appendLegalEntityFor(result, decodedEntity);
  249                 return true;
  250             } else {
                      // Reference ended without ';'; the terminating character is
                      // left in source.
  251                 appendLegalEntityFor(result, decodedEntity);
  252                 return true;
  253             }
  254             break;
  255         }
  256         case Named: {
  257             return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
  258         }
  259         }
  260
              // Clamp after every accumulated digit so result cannot keep growing.
  261         if (result > UCHAR_MAX_VALUE)
  262             result = kInvalidUnicode;
  263
  264         consumedCharacters.append(cc);
  265         source.advanceAndASSERT(cc);
  266     }
          // Source ran out mid-reference: report it and restore what was consumed.
  267     ASSERT(source.isEmpty());
  268     notEnoughCharacters = true;
  269     unconsumeCharacters(source, consumedCharacters);
  270     return false;
  271 }
 272
 273 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
 274 {
 275     if (U_IS_BMP(value)) {
 276         UChar character = static_cast<UChar>(value);
 277         ASSERT(character == value);
 278         result[0] = character;
 279         return 1;
 280     }
 281
 282     result[0] = U16_LEAD(value);
 283     result[1] = U16_TRAIL(value);
 284     return 2;
 285 }
 286
 287 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
 288 {
 289     HTMLEntitySearch search;
 290     while (*name) {
 291         search.advance(*name++);
 292         if (!search.isEntityPrefix())
 293             return 0;
 294     }
 295     search.advance(';');
 296     if (!search.isEntityPrefix())
 297         return 0;
 298
 299     size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
 300     if (!search.mostRecentMatch()->secondValue)
 301         return numberOfCodePoints;
 302     return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
 303 }
 304
 305 } // namespace blink