src/third_party/WebKit/Source/core/html/parser/HTMLEntityParser.cpp

   1 /*
   2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
   3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
   4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26  */
  27
  28 #include "config.h"
  29 #include "core/html/parser/HTMLEntityParser.h"
  30
  31 #include "core/html/parser/HTMLEntitySearch.h"
  32 #include "core/html/parser/HTMLEntityTable.h"
  33 #include "wtf/text/StringBuilder.h"
  34
  35 using namespace WTF;
  36
  37 namespace WebCore {
  38
  39 static const UChar windowsLatin1ExtensionArray[32] = {
  40     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
  41     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
  42     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
  43     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
  44 };
  45
  46 static bool isAlphaNumeric(UChar cc)
  47 {
  48     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
  49 }
  50
  51 static UChar adjustEntity(UChar32 value)
  52 {
  53     if ((value & ~0x1F) != 0x0080)
  54         return value;
  55     return windowsLatin1ExtensionArray[value - 0x80];
  56 }
  57
  58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
  59 {
  60     // FIXME: A number of specific entity values generate parse errors.
  61     if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
  62         decodedEntity.append(0xFFFD);
  63         return;
  64     }
  65     if (U_IS_BMP(c)) {
  66         decodedEntity.append(adjustEntity(c));
  67         return;
  68     }
  69     decodedEntity.append(c);
  70 }
  71
  // Sentinel stored in the numeric accumulator of consumeHTMLEntity() once the
  // value has grown past UCHAR_MAX_VALUE. Being negative, it is turned into
  // U+FFFD by the `c <= 0` check in appendLegalEntityFor().
  72 static const UChar32 kInvalidUnicode = -1;
  73
  74 static bool isHexDigit(UChar cc)
  75 {
  76     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
  77 }
  78
  79 static UChar asHexDigit(UChar cc)
  80 {
  81     if (cc >= '0' && cc <= '9')
  82       return cc - '0';
  83     if (cc >= 'a' && cc <= 'z')
  84       return 10 + cc - 'a';
  85     if (cc >= 'A' && cc <= 'Z')
  86       return 10 + cc - 'A';
  87     ASSERT_NOT_REACHED();
  88     return 0;
  89 }
  90
  // Holds the characters taken from the input while an entity is being parsed
  // so that they can be handed back with unconsumeCharacters() on failure.
  // NOTE(review): the 64 is presumably the Vector's inline capacity - confirm
  // against wtf/Vector.h.
  91 typedef Vector<UChar, 64> ConsumedCharacterBuffer;
  92
  93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)
  94 {
  95     if (consumedCharacters.size() == 1)
  96         source.push(consumedCharacters[0]);
  97     else if (consumedCharacters.size() == 2) {
  98         source.push(consumedCharacters[0]);
  99         source.push(consumedCharacters[1]);
 100     } else
 101         source.prepend(SegmentedString(String(consumedCharacters)));
 102 }
 103
 // Tries to consume a named character reference from |source|.
 //
 // source: input positioned at the first character of the candidate name.
 // decodedEntity: on success receives the matched entry's firstValue and, when
 //     it is non-zero, its secondValue.
 // notEnoughCharacters: set to true when |source| is empty after the scan, i.e.
 //     more data could still extend the match; set to false otherwise.
 // additionalAllowedCharacter: when non-zero, a match that does not end in ';'
 //     and is followed by an alphanumeric character or '=' is rejected.
 // cc: updated to the character most recently read from |source|.
 //
 // Returns true if an entity was decoded. On every false return the characters
 // consumed here are handed back to |source| via unconsumeCharacters().
 104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
 105 {
 106     ConsumedCharacterBuffer consumedCharacters;
 107     HTMLEntitySearch entitySearch;
         // Consume characters for as long as what has been read is still a
         // prefix of at least one entity name. The character that breaks the
         // prefix is inspected but not consumed.
 108     while (!source.isEmpty()) {
 109         cc = source.currentChar();
 110         entitySearch.advance(cc);
 111         if (!entitySearch.isEntityPrefix())
 112             break;
 113         consumedCharacters.append(cc);
 114         source.advanceAndASSERT(cc);
 115     }
 116     notEnoughCharacters = source.isEmpty();
 117     if (notEnoughCharacters) {
 118         // We can't decide on an entity because there might be a longer entity
 119         // that we could match if we had more data.
 120         unconsumeCharacters(source, consumedCharacters);
 121         return false;
 122     }
         // Nothing that was read forms a complete entity name.
 123     if (!entitySearch.mostRecentMatch()) {
 124         unconsumeCharacters(source, consumedCharacters);
 125         return false;
 126     }
 127     if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
 128         // We've consumed too many characters. We need to walk the
 129         // source back to the point at which we had consumed an
 130         // actual entity.
 131         unconsumeCharacters(source, consumedCharacters);
 132         consumedCharacters.clear();
 133         const int length = entitySearch.mostRecentMatch()->length;
 134         const UChar* reference = entitySearch.mostRecentMatch()->entity;
             // Re-consume exactly the characters of the matched entity; in
             // debug builds each one is checked against the table entry.
 135         for (int i = 0; i < length; ++i) {
 136             cc = source.currentChar();
 137             ASSERT_UNUSED(reference, cc == *reference++);
 138             consumedCharacters.append(cc);
 139             source.advanceAndASSERT(cc);
 140             ASSERT(!source.isEmpty());
 141         }
             // cc now refers to the first character after the matched entity.
 142         cc = source.currentChar();
 143     }
         // Accept the match when it ends in ';', when no additional allowed
         // character was supplied, or when the following character is neither
         // alphanumeric nor '='.
 144     if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
 145         || !additionalAllowedCharacter
 146         || !(isAlphaNumeric(cc) || cc == '=')) {
 147         decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
 148         if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
 149             decodedEntity.append(second);
 150         return true;
 151     }
 152     unconsumeCharacters(source, consumedCharacters);
 153     return false;
 154 }
 155
 // Attempts to decode one character reference from |source|, handling
 // decimal ("#123"), hexadecimal ("#x1F" / "#X1F") and named forms.
 //
 // source: input to read from; characters are consumed only as described below.
 // decodedEntity: must be empty on entry; receives the decoded value(s).
 // notEnoughCharacters: must be false on entry; set to true when the input ran
 //     out before a decision could be made (all consumed characters are then
 //     handed back to |source|).
 // additionalAllowedCharacter: 0, '"', '\'' or '>'. When non-zero, an entity
 //     is not attempted if the first character equals it; it is also forwarded
 //     to consumeNamedEntity().
 //
 // Returns true if an entity was decoded into |decodedEntity|, false otherwise.
 156 bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
 157 {
 158     ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
 159     ASSERT(!notEnoughCharacters);
 160     ASSERT(decodedEntity.isEmpty());
 161
 162     enum EntityState {
 163         Initial,
 164         Number,
 165         MaybeHexLowerCaseX,
 166         MaybeHexUpperCaseX,
 167         Hex,
 168         Decimal,
 169         Named
 170     };
 171     EntityState entityState = Initial;
         // Accumulated numeric value; becomes kInvalidUnicode once it overflows.
 172     UChar32 result = 0;
 173     ConsumedCharacterBuffer consumedCharacters;
 174
         // Each iteration looks at the current character without consuming it.
         // Inside the switch, `break` falls through to the bottom of the loop,
         // which records and consumes the character; `continue` re-dispatches
         // the same character in the newly selected state.
 175     while (!source.isEmpty()) {
 176         UChar cc = source.currentChar();
 177         switch (entityState) {
 178         case Initial: {
                 // Tab, LF, FF, space, '<' and '&' cannot start an entity.
 179             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
 180                 return false;
 181             if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
 182                 return false;
 183             if (cc == '#') {
 184                 entityState = Number;
 185                 break;
 186             }
 187             if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
 188                 entityState = Named;
 189                 continue;
 190             }
 191             return false;
 192         }
 193         case Number: {
 194             if (cc == 'x') {
 195                 entityState = MaybeHexLowerCaseX;
 196                 break;
 197             }
 198             if (cc == 'X') {
 199                 entityState = MaybeHexUpperCaseX;
 200                 break;
 201             }
 202             if (cc >= '0' && cc <= '9') {
 203                 entityState = Decimal;
 204                 continue;
 205             }
                 // Not a numeric reference: hand back the '#' consumed earlier.
 206             source.push('#');
 207             return false;
 208         }
 209         case MaybeHexLowerCaseX: {
 210             if (isHexDigit(cc)) {
 211                 entityState = Hex;
 212                 continue;
 213             }
                 // No hex digit after "#x": hand back both consumed characters.
 214             source.push('#');
 215             source.push('x');
 216             return false;
 217         }
 218         case MaybeHexUpperCaseX: {
 219             if (isHexDigit(cc)) {
 220                 entityState = Hex;
 221                 continue;
 222             }
                 // No hex digit after "#X": hand back both consumed characters.
 223             source.push('#');
 224             source.push('X');
 225             return false;
 226         }
 227         case Hex: {
 228             if (isHexDigit(cc)) {
                     // Stop accumulating once the value has been marked invalid.
 229                 if (result != kInvalidUnicode)
 230                     result = result * 16 + asHexDigit(cc);
 231             } else if (cc == ';') {
                     // The terminating ';' is consumed as part of the entity.
 232                 source.advanceAndASSERT(cc);
 233                 appendLegalEntityFor(result, decodedEntity);
 234                 return true;
 235             } else {
                     // Any other character ends the number and is left in |source|.
 236                 appendLegalEntityFor(result, decodedEntity);
 237                 return true;
 238             }
 239             break;
 240         }
 241         case Decimal: {
 242             if (cc >= '0' && cc <= '9') {
                     // Stop accumulating once the value has been marked invalid.
 243                 if (result != kInvalidUnicode)
 244                     result = result * 10 + cc - '0';
 245             } else if (cc == ';') {
                     // The terminating ';' is consumed as part of the entity.
 246                 source.advanceAndASSERT(cc);
 247                 appendLegalEntityFor(result, decodedEntity);
 248                 return true;
 249             } else {
                     // Any other character ends the number and is left in |source|.
 250                 appendLegalEntityFor(result, decodedEntity);
 251                 return true;
 252             }
 253             break;
 254         }
 255         case Named: {
                 // Named references are handled entirely by the helper.
 256             return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
 257         }
 258         }
 259
             // Replace an out-of-range value with the sentinel so that further
             // digits are ignored by the guards above.
 260         if (result > UCHAR_MAX_VALUE)
 261             result = kInvalidUnicode;
 262
             // Reached via `break` above: record the character, then consume it.
 263         consumedCharacters.append(cc);
 264         source.advanceAndASSERT(cc);
 265     }
         // The input ran out before the entity was finished: hand back what
         // was consumed and tell the caller that more data is needed.
 266     ASSERT(source.isEmpty());
 267     notEnoughCharacters = true;
 268     unconsumeCharacters(source, consumedCharacters);
 269     return false;
 270 }
 271
 272 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
 273 {
 274     if (U_IS_BMP(value)) {
 275         UChar character = static_cast<UChar>(value);
 276         ASSERT(character == value);
 277         result[0] = character;
 278         return 1;
 279     }
 280
 281     result[0] = U16_LEAD(value);
 282     result[1] = U16_TRAIL(value);
 283     return 2;
 284 }
 285
 286 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
 287 {
 288     HTMLEntitySearch search;
 289     while (*name) {
 290         search.advance(*name++);
 291         if (!search.isEntityPrefix())
 292             return 0;
 293     }
 294     search.advance(';');
 295     if (!search.isEntityPrefix())
 296         return 0;
 297
 298     size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
 299     if (!search.mostRecentMatch()->secondValue)
 300         return numberOfCodePoints;
 301     return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
 302 }
 303
 304 } // namespace WebCore