2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include "core/html/parser/HTMLEntityParser.h"
31 #include "core/html/parser/HTMLEntitySearch.h"
32 #include "core/html/parser/HTMLEntityTable.h"
33 #include "wtf/text/StringBuilder.h"
39 static const UChar windowsLatin1ExtensionArray[32] = {
40 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
41 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
42 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
43 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
46 static bool isAlphaNumeric(UChar cc)
48 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
51 static UChar adjustEntity(UChar32 value)
53 if ((value & ~0x1F) != 0x0080)
55 return windowsLatin1ExtensionArray[value - 0x80];
// Appends the code point for a numeric character reference to decodedEntity.
//   c             - the accumulated numeric value.
//   decodedEntity - output buffer that receives the resulting code point.
// Values that are zero or negative, greater than 0x10FFFF, or inside the
// surrogate range 0xD800-0xDFFF are replaced by U+FFFD.
58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
60 // FIXME: A number of specific entity values generate parse errors.
61 if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
62 decodedEntity.append(0xFFFD);
// adjustEntity() remaps values in 0x80-0x9F and passes others through.
66 decodedEntity.append(adjustEntity(c));
69 decodedEntity.append(c);
// Sentinel stored in the accumulator of a numeric character reference once
// its value has exceeded UCHAR_MAX_VALUE. While the accumulator holds this
// value consumeHTMLEntity() stops adding digits, and because it is negative
// appendLegalEntityFor() emits U+FFFD for it.
72 static const UChar32 kInvalidUnicode = -1;
74 static bool isHexDigit(UChar cc)
76 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
79 static UChar asHexDigit(UChar cc)
81 if (cc >= '0' && cc <= '9')
83 if (cc >= 'a' && cc <= 'z')
85 if (cc >= 'A' && cc <= 'Z')
// Buffer of characters that have been taken from the input while an entity is
// being matched, kept so that unconsumeCharacters() can hand them back.
// NOTE(review): the second template argument (64) is presumably the Vector's
// inline capacity - confirm against the Vector declaration.
91 typedef Vector<UChar, 64> ConsumedCharacterBuffer;
93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)
95 if (consumedCharacters.size() == 1)
96 source.push(consumedCharacters[0]);
97 else if (consumedCharacters.size() == 2) {
98 source.push(consumedCharacters[0]);
99 source.push(consumedCharacters[1]);
101 source.prepend(SegmentedString(String(consumedCharacters)));
// Tries to match a named character reference at the current position of
// source using HTMLEntitySearch.
//   source                     - input stream; characters are advanced past
//                                while matching and handed back through
//                                unconsumeCharacters() when needed.
//   decodedEntity              - receives firstValue of the matched entity
//                                and, when it is non-zero, secondValue.
//   notEnoughCharacters        - set to source.isEmpty() once the matching
//                                loop has finished.
//   additionalAllowedCharacter - when non-zero, a match that does not end in
//                                ';' is only accepted if the character after
//                                it is neither alphanumeric nor '='.
//   cc                         - in/out; updated with each character examined.
104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
106 ConsumedCharacterBuffer consumedCharacters;
107 HTMLEntitySearch entitySearch;
// Feed characters to the search one at a time, remembering each one that is
// consumed so it can be restored later.
108 while (!source.isEmpty()) {
109 cc = source.currentChar();
110 entitySearch.advance(cc);
111 if (!entitySearch.isEntityPrefix())
113 consumedCharacters.append(cc);
114 source.advanceAndASSERT(cc);
116 notEnoughCharacters = source.isEmpty();
117 if (notEnoughCharacters) {
118 // We can't decide on an entity because there might be a longer entity
119 // that we could match if we had more data.
120 unconsumeCharacters(source, consumedCharacters);
// No entity was matched at any point: restore the input.
123 if (!entitySearch.mostRecentMatch()) {
124 unconsumeCharacters(source, consumedCharacters);
127 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
128 // We've consumed too many characters. We need to walk the
129 // source back to the point at which we had consumed an actual entity.
131 unconsumeCharacters(source, consumedCharacters);
132 consumedCharacters.clear();
133 const int length = entitySearch.mostRecentMatch()->length;
134 const UChar* reference = entitySearch.mostRecentMatch()->entity;
// Re-consume exactly the characters of the most recent match; in debug
// builds each one is checked against the matched entity's own text.
135 for (int i = 0; i < length; ++i) {
136 cc = source.currentChar();
137 ASSERT_UNUSED(reference, cc == *reference++);
138 consumedCharacters.append(cc);
139 source.advanceAndASSERT(cc);
140 ASSERT(!source.isEmpty());
// cc now holds the first character after the matched entity.
142 cc = source.currentChar();
// Accept the match when it ends in ';', when no additional allowed character
// is in effect, or when the following character is neither alphanumeric
// nor '='.
144 if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
145 || !additionalAllowedCharacter
146 || !(isAlphaNumeric(cc) || cc == '=')) {
147 decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
148 if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
149 decodedEntity.append(second);
// The match was not accepted: hand the consumed characters back.
152 unconsumeCharacters(source, consumedCharacters);
// Attempts to consume a character reference (decimal, hexadecimal or named)
// from source and appends the decoded code point(s) to decodedEntity.
//   source                     - input stream positioned at the character to
//                                examine first.
//   decodedEntity              - output buffer; asserted to be empty on entry.
//   notEnoughCharacters        - asserted to be false on entry; set to true
//                                when the input runs out before the reference
//                                is complete, in which case the characters
//                                consumed so far are handed back to source.
//   additionalAllowedCharacter - asserted to be 0, '"', '\'' or '>'.
// Named references are delegated to consumeNamedEntity(), whose result is
// returned directly.
156 bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
158 ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
159 ASSERT(!notEnoughCharacters);
160 ASSERT(decodedEntity.isEmpty());
171 EntityState entityState = Initial;
// Characters consumed by this function, kept so they can be restored.
173 ConsumedCharacterBuffer consumedCharacters;
175 while (!source.isEmpty()) {
176 UChar cc = source.currentChar();
177 switch (entityState) {
// The first character is tested against tab, LF, FF, space, '<' and '&',
// and then against the additional allowed character.
179 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
181 if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
184 entityState = Number;
187 if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
195 entityState = MaybeHexLowerCaseX;
199 entityState = MaybeHexUpperCaseX;
202 if (cc >= '0' && cc <= '9') {
203 entityState = Decimal;
209 case MaybeHexLowerCaseX: {
210 if (isHexDigit(cc)) {
218 case MaybeHexUpperCaseX: {
219 if (isHexDigit(cc)) {
// Hexadecimal digits: accumulate base 16 unless the value has already been
// marked invalid.
228 if (isHexDigit(cc)) {
229 if (result != kInvalidUnicode)
230 result = result * 16 + asHexDigit(cc);
231 } else if (cc == ';') {
// A terminating ';' is consumed as part of the reference.
232 source.advanceAndASSERT(cc);
233 appendLegalEntityFor(result, decodedEntity);
// Any other character also ends the reference but is not consumed here.
236 appendLegalEntityFor(result, decodedEntity);
// Decimal digits: same scheme as above, base 10.
242 if (cc >= '0' && cc <= '9') {
243 if (result != kInvalidUnicode)
244 result = result * 10 + cc - '0';
245 } else if (cc == ';') {
246 source.advanceAndASSERT(cc);
247 appendLegalEntityFor(result, decodedEntity);
250 appendLegalEntityFor(result, decodedEntity);
256 return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
// Once the accumulated value exceeds UCHAR_MAX_VALUE, replace it with the
// sentinel so that later digits are ignored and the value stays bounded.
260 if (result > UCHAR_MAX_VALUE)
261 result = kInvalidUnicode;
// Record the character before advancing so it can be restored if needed.
263 consumedCharacters.append(cc);
264 source.advanceAndASSERT(cc);
// The loop ended because the input ran out: restore what was consumed and
// report that more characters are needed.
266 ASSERT(source.isEmpty());
267 notEnoughCharacters = true;
268 unconsumeCharacters(source, consumedCharacters);
272 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
274 if (U_IS_BMP(value)) {
275 UChar character = static_cast<UChar>(value);
276 ASSERT(character == value);
277 result[0] = character;
281 result[0] = U16_LEAD(value);
282 result[1] = U16_TRAIL(value);
// Looks up the entity called name with HTMLEntitySearch and writes its value
// to result as UTF-16.
//   name   - NUL-terminated entity name, fed to the search one char at a time.
//   result - destination, declared with room for four UChars (firstValue and
//            secondValue, each written by appendUChar32ToUCharArray()).
// On a successful match, returns the count reported by
// appendUChar32ToUCharArray() for firstValue plus, when secondValue is
// non-zero, the count for secondValue.
286 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
288 HTMLEntitySearch search;
290 search.advance(*name++);
291 if (!search.isEntityPrefix())
295 if (!search.isEntityPrefix())
// firstValue is always written; secondValue follows directly after it.
298 size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
299 if (!search.mostRecentMatch()->secondValue)
300 return numberOfCodePoints;
301 return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
304 } // namespace WebCore