2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include "core/html/parser/HTMLEntityParser.h"
31 #include "core/html/parser/HTMLEntitySearch.h"
32 #include "core/html/parser/HTMLEntityTable.h"
33 #include "wtf/text/StringBuilder.h"
// Replacement code points for the 32 input values 0x80-0x9F; adjustEntity()
// indexes this table with (value - 0x80). The trailing comment on each row
// gives the input range that row covers. Some entries (0x0081, 0x008D,
// 0x008F, 0x0090, 0x009D) map to themselves.
39 static const UChar windowsLatin1ExtensionArray[32] = {
40 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
41 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
42 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
43 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
46 static bool isAlphaNumeric(UChar cc)
48 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
// Maps a code point in the range 0x80-0x9F to its entry in
// windowsLatin1ExtensionArray.
// NOTE(review): the statement controlled by the range check below is not
// visible in this excerpt; presumably values outside 0x80-0x9F are returned
// unchanged - confirm against the full file.
51 static UChar adjustEntity(UChar32 value)
// Clearing the low five bits leaves exactly 0x80 only for 0x80-0x9F, so this
// condition is true for every value outside that range.
53 if ((value & ~0x1F) != 0x0080)
// For values in 0x80-0x9F the index is 0..31, matching the table's 32 entries.
55 return windowsLatin1ExtensionArray[value - 0x80];
// Appends the code point for a numeric character reference to |decodedEntity|.
// Values that are not usable code points (zero or negative, above 0x10FFFF, or
// in the surrogate range 0xD800-0xDFFF) are replaced with U+FFFD.
// NOTE(review): the lines separating the three append() calls are not visible
// in this excerpt. Since adjustEntity() returns a single UChar, the middle
// append presumably applies only to BMP values, with the last one handling
// the remaining values - confirm against the full file.
58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
60 // FIXME: A number of specific entity values generate parse errors.
61 if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
62 decodedEntity.append(0xFFFD);
// Value passed through the 0x80-0x9F remapping table first.
66 decodedEntity.append(adjustEntity(c));
// Value appended as-is.
69 decodedEntity.append(c);
// Sentinel stored in the numeric-reference accumulator once its value exceeds
// UCHAR_MAX_VALUE. Because it is negative, appendLegalEntityFor() treats it as
// an unusable code point and emits U+FFFD.
72 static const UChar32 kInvalidUnicode = -1;
74 static bool isHexDigit(UChar cc)
76 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
// Converts a hexadecimal digit character to a value. The only visible caller
// invokes it after isHexDigit(cc) has succeeded.
// NOTE(review): the return statement under each range check is not visible in
// this excerpt; presumably cc - '0', 10 + cc - 'a' and 10 + cc - 'A'
// respectively - confirm against the full file.
// NOTE(review): the letter ranges tested here (a-z / A-Z) are wider than the
// a-f / A-F accepted by isHexDigit(); this is harmless only as long as callers
// pre-validate with isHexDigit().
79 static UChar asHexDigit(UChar cc)
// Decimal digit.
81 if (cc >= '0' && cc <= '9')
// Lower-case letter.
83 if (cc >= 'a' && cc <= 'z')
// Upper-case letter.
85 if (cc >= 'A' && cc <= 'Z')
// Holds the characters consumed from the input so far so that
// unconsumeCharacters() can put them back.
// NOTE(review): the second template argument (64) is presumably the Vector's
// inline capacity - confirm against the Vector declaration.
91 typedef Vector<UChar, 64> ConsumedCharacterBuffer;
93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)
95 if (consumedCharacters.size() == 1)
96 source.push(consumedCharacters[0]);
97 else if (consumedCharacters.size() == 2) {
98 source.push(consumedCharacters[0]);
99 source.push(consumedCharacters[1]);
101 source.prepend(SegmentedString(String(consumedCharacters)));
// Tries to consume a named character reference from |source|, feeding
// characters to an HTMLEntitySearch one at a time.
//   source                     - input; characters that do not end up forming
//                                an accepted entity are handed back through
//                                unconsumeCharacters().
//   decodedEntity              - receives the match's firstValue and, when it
//                                is non-zero, its secondValue.
//   notEnoughCharacters        - set to source.isEmpty() after the scan, i.e.
//                                true when the input ran out while scanning.
//   additionalAllowedCharacter - when non-zero, a match that does not end in
//                                ';' and is followed by an alphanumeric
//                                character or '=' is not decoded.
//   cc                         - in/out; updated to the most recently examined
//                                character.
// NOTE(review): the return statements (and the statement that leaves the scan
// loop) are not visible in this excerpt; presumably the function returns true
// only on the path that appends to decodedEntity - confirm against the full
// file.
104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
106 ConsumedCharacterBuffer consumedCharacters;
107 HTMLEntitySearch entitySearch;
// Scan forward while the text seen so far is still a prefix of some entity.
108 while (!source.isEmpty()) {
109 cc = source.currentChar();
110 entitySearch.advance(cc);
// The current character is only recorded and consumed below when the search
// still reports an entity prefix.
111 if (!entitySearch.isEntityPrefix())
113 consumedCharacters.append(cc);
114 source.advanceAndASSERT(cc);
116 notEnoughCharacters = source.isEmpty();
117 if (notEnoughCharacters) {
118 // We can't decide on an entity because there might be a longer entity
119 // that we could match if we had more data.
120 unconsumeCharacters(source, consumedCharacters);
// No entity matched at any point during the scan: put everything back.
123 if (!entitySearch.mostRecentMatch()) {
124 unconsumeCharacters(source, consumedCharacters);
127 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
128 // We've consumed too many characters. We need to walk the
129 // source back to the point at which we had consumed an actual entity.
131 unconsumeCharacters(source, consumedCharacters);
132 consumedCharacters.clear();
133 const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch();
134 const int length = mostRecent->length;
135 const LChar* reference = HTMLEntityTable::entityString(*mostRecent);
// Re-consume exactly the characters of the matched entity; the assertion
// checks each one against the entity's stored spelling.
136 for (int i = 0; i < length; ++i) {
137 cc = source.currentChar();
138 ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++));
139 consumedCharacters.append(cc);
140 source.advanceAndASSERT(cc);
// More input was consumed than the match is long, so characters must remain.
141 ASSERT(!source.isEmpty());
// cc is now the character immediately following the matched entity.
143 cc = source.currentChar();
// Accept the match when it ends in ';', when no additional allowed character
// was supplied, or when the following character is neither alphanumeric
// nor '='.
145 if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
146 || !additionalAllowedCharacter
147 || !(isAlphaNumeric(cc) || cc == '=')) {
148 decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
149 if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
150 decodedEntity.append(second);
// Match rejected: hand the consumed characters back to the source.
153 unconsumeCharacters(source, consumedCharacters);
// Attempts to consume one HTML character reference from |source|. The visible
// states handle a numeric prefix, an optional 'x'/'X' hex marker, hexadecimal
// and decimal digit accumulation, and a hand-off to consumeNamedEntity().
//   source                     - input, read with currentChar() and
//                                advanceAndASSERT().
//   decodedEntity              - must be empty on entry (asserted); receives
//                                the decoded value(s).
//   notEnoughCharacters        - must be false on entry (asserted); set to
//                                true when the input is exhausted before a
//                                decision is reached.
//   additionalAllowedCharacter - must be 0, '"', '\'' or '>' (asserted).
// NOTE(review): several short lines of this function (the state enum, case
// labels, some state assignments and the return statements) are not visible
// in this excerpt, so the return value on each path should be confirmed
// against the full file.
157 bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
159 ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
160 ASSERT(!notEnoughCharacters);
161 ASSERT(decodedEntity.isEmpty());
172 EntityState entityState = Initial;
// Characters consumed so far, kept so they can be restored if input runs out.
174 ConsumedCharacterBuffer consumedCharacters;
176 while (!source.isEmpty()) {
177 UChar cc = source.currentChar();
178 switch (entityState) {
// Tab, LF, FF, space, '<' and '&' are tested first.
180 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
// So is the caller-supplied additional allowed character, when one was given.
182 if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
185 entityState = Number;
// An ASCII letter.
188 if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
196 entityState = MaybeHexLowerCaseX;
200 entityState = MaybeHexUpperCaseX;
// A decimal digit switches to decimal accumulation.
203 if (cc >= '0' && cc <= '9') {
204 entityState = Decimal;
210 case MaybeHexLowerCaseX: {
211 if (isHexDigit(cc)) {
219 case MaybeHexUpperCaseX: {
220 if (isHexDigit(cc)) {
// Hexadecimal accumulation: digits are folded in base 16 unless the value has
// already been marked invalid.
229 if (isHexDigit(cc)) {
230 if (result != kInvalidUnicode)
231 result = result * 16 + asHexDigit(cc);
232 } else if (cc == ';') {
// The terminating ';' is consumed before the value is emitted.
233 source.advanceAndASSERT(cc);
234 appendLegalEntityFor(result, decodedEntity);
// Any other character: emit the value accumulated so far; this path does not
// advance the source past the character.
237 appendLegalEntityFor(result, decodedEntity);
// Decimal accumulation, mirroring the hexadecimal handling above in base 10.
243 if (cc >= '0' && cc <= '9') {
244 if (result != kInvalidUnicode)
245 result = result * 10 + cc - '0';
246 } else if (cc == ';') {
247 source.advanceAndASSERT(cc);
248 appendLegalEntityFor(result, decodedEntity);
251 appendLegalEntityFor(result, decodedEntity);
// Named references are handled entirely by consumeNamedEntity().
257 return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
// Once the value exceeds UCHAR_MAX_VALUE, mark it invalid so that later digits
// are no longer accumulated.
261 if (result > UCHAR_MAX_VALUE)
262 result = kInvalidUnicode;
// Record the character so it can be restored later, then move past it.
264 consumedCharacters.append(cc);
265 source.advanceAndASSERT(cc);
// The loop ended because the input ran out: report that more data is needed
// and hand back everything consumed so far.
267 ASSERT(source.isEmpty());
268 notEnoughCharacters = true;
269 unconsumeCharacters(source, consumedCharacters);
// Writes |value| into |result| as UTF-16: a single UChar when U_IS_BMP(value)
// holds, otherwise a U16_LEAD / U16_TRAIL pair in result[0] and result[1].
// NOTE(review): the return statements are not visible in this excerpt. The
// caller adds the returned value to a UChar pointer, so it is presumably the
// number of UChars written (1 or 2) - confirm against the full file.
273 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
275 if (U_IS_BMP(value)) {
276 UChar character = static_cast<UChar>(value);
// The narrowing cast must not have changed the value.
277 ASSERT(character == value);
278 result[0] = character;
// Written as a lead/trail pair.
282 result[0] = U16_LEAD(value);
283 result[1] = U16_TRAIL(value);
// Looks up the entity called |name| with HTMLEntitySearch and writes the
// matched entry's value(s) into |result| as UTF-16.
//   name   - entity name, fed to the search one character at a time
//            (presumably NUL-terminated; the loop header is not visible here).
//   result - output buffer of four UChars: room for firstValue and
//            secondValue, each taking one or two UChars.
// Returns the sum of the values returned by appendUChar32ToUCharArray() for
// firstValue and, when non-zero, secondValue.
// NOTE(review): the loop header and the statements under the two
// isEntityPrefix() checks are not visible in this excerpt; presumably they
// return 0 for an unknown name - confirm against the full file.
287 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
289 HTMLEntitySearch search;
291 search.advance(*name++);
292 if (!search.isEntityPrefix())
296 if (!search.isEntityPrefix())
// Despite its name, this value is used below as an offset into |result|.
299 size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
300 if (!search.mostRecentMatch()->secondValue)
301 return numberOfCodePoints;
// The second value is written directly after the first.
302 return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);