2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org)
3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com)
4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.
5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com>
6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org>
7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.
9 * Copyright (C) 2012 Intel Corporation. All rights reserved.
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Library General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Library General Public License for more details.
21 * You should have received a copy of the GNU Library General Public License
22 * along with this library; see the file COPYING.LIB. If not, write to
23 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
24 * Boston, MA 02110-1301, USA.
28 #include "core/css/parser/CSSTokenizer.h"
30 #include "core/css/CSSKeyframeRule.h"
31 #include "core/css/MediaQuery.h"
32 #include "core/css/StyleRule.h"
33 #include "core/css/parser/BisonCSSParser.h"
34 #include "core/css/parser/CSSParserValues.h"
35 #include "core/html/parser/HTMLParserIdioms.h"
36 #include "core/svg/SVGParserUtilities.h"
40 #include "core/CSSGrammar.h"
43 // Types for the main switch.
45 // The first 4 types must be grouped together, as they
46 // represent the allowed chars in an identifier.
48 CharacterIdentifierStart,
55 CharacterEndMediaQueryOrSupports,
58 CharacterExclamationMark,
// Lookup table mapping each 7-bit ASCII code (index 0..127) to the
// CharacterType class used by the tokenizer. realLex() switches on the entry
// for m_token when m_token <= 127, and isCSSLetter() compares entries against
// CharacterDash. Note that 'U'/'u' have their own class (CharacterCaselessU)
// and that ')', ';' and '{' carry dedicated classes rather than
// CharacterOther.
74 static const CharacterType typesOfASCIICharacters[128] = {
75 /* 0 - Null */ CharacterNull,
76 /* 1 - Start of Heading */ CharacterOther,
77 /* 2 - Start of Text */ CharacterOther,
78 /* 3 - End of Text */ CharacterOther,
79 /* 4 - End of Transm. */ CharacterOther,
80 /* 5 - Enquiry */ CharacterOther,
81 /* 6 - Acknowledgment */ CharacterOther,
82 /* 7 - Bell */ CharacterOther,
83 /* 8 - Back Space */ CharacterOther,
84 /* 9 - Horizontal Tab */ CharacterWhiteSpace,
85 /* 10 - Line Feed */ CharacterWhiteSpace,
86 /* 11 - Vertical Tab */ CharacterOther,
87 /* 12 - Form Feed */ CharacterWhiteSpace,
88 /* 13 - Carriage Return */ CharacterWhiteSpace,
89 /* 14 - Shift Out */ CharacterOther,
90 /* 15 - Shift In */ CharacterOther,
91 /* 16 - Data Line Escape */ CharacterOther,
92 /* 17 - Device Control 1 */ CharacterOther,
93 /* 18 - Device Control 2 */ CharacterOther,
94 /* 19 - Device Control 3 */ CharacterOther,
95 /* 20 - Device Control 4 */ CharacterOther,
96 /* 21 - Negative Ack. */ CharacterOther,
97 /* 22 - Synchronous Idle */ CharacterOther,
98 /* 23 - End of Transmit */ CharacterOther,
99 /* 24 - Cancel */ CharacterOther,
100 /* 25 - End of Medium */ CharacterOther,
101 /* 26 - Substitute */ CharacterOther,
102 /* 27 - Escape */ CharacterOther,
103 /* 28 - File Separator */ CharacterOther,
104 /* 29 - Group Separator */ CharacterOther,
105 /* 30 - Record Separator */ CharacterOther,
106 /* 31 - Unit Separator */ CharacterOther,
107 /* 32 - Space */ CharacterWhiteSpace,
108 /* 33 - ! */ CharacterExclamationMark,
109 /* 34 - " */ CharacterQuote,
110 /* 35 - # */ CharacterHashmark,
111 /* 36 - $ */ CharacterDollar,
112 /* 37 - % */ CharacterOther,
113 /* 38 - & */ CharacterOther,
114 /* 39 - ' */ CharacterQuote,
115 /* 40 - ( */ CharacterOther,
116 /* 41 - ) */ CharacterEndNthChild,
117 /* 42 - * */ CharacterAsterisk,
118 /* 43 - + */ CharacterPlus,
119 /* 44 - , */ CharacterOther,
120 /* 45 - - */ CharacterDash,
121 /* 46 - . */ CharacterDot,
122 /* 47 - / */ CharacterSlash,
123 /* 48 - 0 */ CharacterNumber,
124 /* 49 - 1 */ CharacterNumber,
125 /* 50 - 2 */ CharacterNumber,
126 /* 51 - 3 */ CharacterNumber,
127 /* 52 - 4 */ CharacterNumber,
128 /* 53 - 5 */ CharacterNumber,
129 /* 54 - 6 */ CharacterNumber,
130 /* 55 - 7 */ CharacterNumber,
131 /* 56 - 8 */ CharacterNumber,
132 /* 57 - 9 */ CharacterNumber,
133 /* 58 - : */ CharacterOther,
134 /* 59 - ; */ CharacterEndMediaQueryOrSupports,
135 /* 60 - < */ CharacterLess,
136 /* 61 - = */ CharacterOther,
137 /* 62 - > */ CharacterOther,
138 /* 63 - ? */ CharacterOther,
139 /* 64 - @ */ CharacterAt,
140 /* 65 - A */ CharacterIdentifierStart,
141 /* 66 - B */ CharacterIdentifierStart,
142 /* 67 - C */ CharacterIdentifierStart,
143 /* 68 - D */ CharacterIdentifierStart,
144 /* 69 - E */ CharacterIdentifierStart,
145 /* 70 - F */ CharacterIdentifierStart,
146 /* 71 - G */ CharacterIdentifierStart,
147 /* 72 - H */ CharacterIdentifierStart,
148 /* 73 - I */ CharacterIdentifierStart,
149 /* 74 - J */ CharacterIdentifierStart,
150 /* 75 - K */ CharacterIdentifierStart,
151 /* 76 - L */ CharacterIdentifierStart,
152 /* 77 - M */ CharacterIdentifierStart,
153 /* 78 - N */ CharacterIdentifierStart,
154 /* 79 - O */ CharacterIdentifierStart,
155 /* 80 - P */ CharacterIdentifierStart,
156 /* 81 - Q */ CharacterIdentifierStart,
157 /* 82 - R */ CharacterIdentifierStart,
158 /* 83 - S */ CharacterIdentifierStart,
159 /* 84 - T */ CharacterIdentifierStart,
160 /* 85 - U */ CharacterCaselessU,
161 /* 86 - V */ CharacterIdentifierStart,
162 /* 87 - W */ CharacterIdentifierStart,
163 /* 88 - X */ CharacterIdentifierStart,
164 /* 89 - Y */ CharacterIdentifierStart,
165 /* 90 - Z */ CharacterIdentifierStart,
166 /* 91 - [ */ CharacterOther,
167 /* 92 - \ */ CharacterBackSlash,
168 /* 93 - ] */ CharacterOther,
169 /* 94 - ^ */ CharacterXor,
170 /* 95 - _ */ CharacterIdentifierStart,
171 /* 96 - ` */ CharacterOther,
172 /* 97 - a */ CharacterIdentifierStart,
173 /* 98 - b */ CharacterIdentifierStart,
174 /* 99 - c */ CharacterIdentifierStart,
175 /* 100 - d */ CharacterIdentifierStart,
176 /* 101 - e */ CharacterIdentifierStart,
177 /* 102 - f */ CharacterIdentifierStart,
178 /* 103 - g */ CharacterIdentifierStart,
179 /* 104 - h */ CharacterIdentifierStart,
180 /* 105 - i */ CharacterIdentifierStart,
181 /* 106 - j */ CharacterIdentifierStart,
182 /* 107 - k */ CharacterIdentifierStart,
183 /* 108 - l */ CharacterIdentifierStart,
184 /* 109 - m */ CharacterIdentifierStart,
185 /* 110 - n */ CharacterIdentifierStart,
186 /* 111 - o */ CharacterIdentifierStart,
187 /* 112 - p */ CharacterIdentifierStart,
188 /* 113 - q */ CharacterIdentifierStart,
189 /* 114 - r */ CharacterIdentifierStart,
190 /* 115 - s */ CharacterIdentifierStart,
191 /* 116 - t */ CharacterIdentifierStart,
192 /* 117 - u */ CharacterCaselessU,
193 /* 118 - v */ CharacterIdentifierStart,
194 /* 119 - w */ CharacterIdentifierStart,
195 /* 120 - x */ CharacterIdentifierStart,
196 /* 121 - y */ CharacterIdentifierStart,
197 /* 122 - z */ CharacterIdentifierStart,
198 /* 123 - { */ CharacterEndMediaQueryOrSupports,
199 /* 124 - | */ CharacterVerticalBar,
200 /* 125 - } */ CharacterOther,
201 /* 126 - ~ */ CharacterTilde,
202 /* 127 - Delete */ CharacterOther,
205 // Utility functions for the CSS tokenizer.
// Returns true if |character| can be part of an identifier: any non-ASCII
// code unit (>= 128), or an ASCII character whose typesOfASCIICharacters
// class compares <= CharacterDash (the identifier classes are grouped at the
// front of the CharacterType enum so that this single comparison works).
207 template <typename CharacterType>
208 static inline bool isCSSLetter(CharacterType character)
210 return character >= 128 || typesOfASCIICharacters[character] <= CharacterDash;
// Returns true if |character| may follow a backslash to form an escape:
// any value at or above ' ' (0x20) except 127 (DEL). Control characters,
// including the terminating NUL and newlines, are therefore rejected.
213 template <typename CharacterType>
214 static inline bool isCSSEscape(CharacterType character)
216 return character >= ' ' && character != 127;
// Returns true for characters the URI scanners in this file accept outside
// of quotes: '!', '#'..'&', and everything from '*' upwards except 127 (DEL).
// This rejects control characters, space, '"', '\'', '(' and ')'. A backslash
// is accepted here; callers handle the escape that follows it.
219 template <typename CharacterType>
220 static inline bool isURILetter(CharacterType character)
222 return (character >= '*' && character != 127) || (character >= '#' && character <= '&') || character == '!';
// Returns true if the text at |currentCharacter| can begin an identifier,
// assuming any leading '-' has already been stepped over: an ASCII letter,
// '_', a non-ASCII code unit (>= 128), or a backslash followed by a character
// accepted by isCSSEscape(). Reads currentCharacter[1] only after a backslash.
225 template <typename CharacterType>
226 static inline bool isIdentifierStartAfterDash(CharacterType* currentCharacter)
228 return isASCIIAlpha(currentCharacter[0]) || currentCharacter[0] == '_' || currentCharacter[0] >= 128
229 || (currentCharacter[0] == '\\' && isCSSEscape(currentCharacter[1]));
// Case-insensitively compares the characters at |cssString| with the
// zero-terminated |constantString|. Each input character is lowercased with
// toASCIILowerUnchecked() before comparison, so |constantString| must consist
// only of 'a'..'z' and '-' (asserted). The loop runs until the end of
// |constantString|; |cssString| itself is not checked for termination.
232 template <typename CharacterType>
233 static inline bool isEqualToCSSIdentifier(CharacterType* cssString, const char* constantString)
235 // Compare character data in memory with a zero-terminated string.
237 // The input must be part of an identifier if constantString
238 // contains '-'. Otherwise toASCIILowerUnchecked('\r') would be equal to '-'.
239 ASSERT((*constantString >= 'a' && *constantString <= 'z') || *constantString == '-');
240 ASSERT(*constantString != '-' || isCSSLetter(*cssString));
241 if (toASCIILowerUnchecked(*cssString++) != (*constantString++))
243 } while (*constantString);
// Compares the characters at |string| with the zero-terminated, non-empty
// (asserted) |constantString| exactly, without any case folding. The loop
// runs until the end of |constantString|.
247 template <typename CharacterType>
248 static inline bool isEqualToCSSCaseSensitiveIdentifier(CharacterType* string, const char* constantString)
250 ASSERT(*constantString);
253 if (*string++ != *constantString++)
255 } while (*constantString);
// Validates the escape sequence that starts at the backslash
// |currentCharacter| and steps over it without decoding. Two shapes are
// handled: a run of hex digits (bounded by a length counter) optionally
// followed by one whitespace character, or a single escaped character.
259 template <typename CharacterType>
260 static CharacterType* checkAndSkipEscape(CharacterType* currentCharacter)
262 // Returns 0 if the escape check fails. Otherwise it returns
263 // a pointer to the character following the escape sequence.
264 ASSERT(*currentCharacter == '\\');
267 if (!isCSSEscape(*currentCharacter))
270 if (isASCIIHexDigit(*currentCharacter)) {
275 } while (isASCIIHexDigit(*currentCharacter) && --length);
277 // Optional space after the escape sequence.
278 if (isHTMLSpace<CharacterType>(*currentCharacter))
280 return currentCharacter;
// Non-hex escape: exactly one character is escaped.
282 return currentCharacter + 1;
// Steps over consecutive characters accepted by isHTMLSpace() and returns a
// pointer to the first character that is not whitespace. The pointer is
// passed by value, so the caller's cursor is left untouched.
285 template <typename CharacterType>
286 static inline CharacterType* skipWhiteSpace(CharacterType* currentCharacter)
288 while (isHTMLSpace<CharacterType>(*currentCharacter))
290 return currentCharacter;
293 // Main CSS tokenizer functions.
// LChar specialization: returns a reference to the 8-bit read cursor
// (m_currentCharacter8) so callers can both read and advance it in place.
296 inline LChar*& CSSTokenizer::currentCharacter<LChar>()
298 return m_currentCharacter8;
// UChar specialization: returns a reference to the 16-bit read cursor
// (m_currentCharacter16) so callers can both read and advance it in place.
302 inline UChar*& CSSTokenizer::currentCharacter<UChar>()
304 return m_currentCharacter16;
// |len| is the number of UChar elements to allocate (not a byte count).
307 UChar* CSSTokenizer::allocateStringBuffer16(size_t len)
309 // Allocates and returns a CSSTokenizer owned buffer for storing
310 // UTF-16 data. Used to get a suitable life span for UTF-16
311 // strings, identifiers and URIs created by the tokenizer.
312 OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);
// Grab the raw pointer before ownership is handed over below; |buffer| is
// empty after release().
314 UChar* bufferPtr = buffer.get();
// m_cssStrings16 now owns the allocation.
316 m_cssStrings16.append(buffer.release());
// LChar specialization: returns the raw pointer held by m_dataStart8, i.e.
// the start of the 8-bit input data.
321 inline LChar* CSSTokenizer::dataStart<LChar>()
323 return m_dataStart8.get();
// UChar specialization: returns the raw pointer held by m_dataStart16, i.e.
// the start of the 16-bit input data.
327 inline UChar* CSSTokenizer::dataStart<UChar>()
329 return m_dataStart16.get();
// Builds a CSSParserLocation describing the current token:
//  - token:      the text from tokenStart() up to (not including) the
//                current character,
//  - lineNumber: the line on which the token started,
//  - offset:     the distance of the token start from the start of the
//                input data, counted in CharacterType units.
332 template <typename CharacterType>
333 inline CSSParserLocation CSSTokenizer::tokenLocation()
335 CSSParserLocation location;
336 location.token.init(tokenStart<CharacterType>(), currentCharacter<CharacterType>() - tokenStart<CharacterType>());
337 location.lineNumber = m_tokenStartLineNumber;
338 location.offset = tokenStart<CharacterType>() - dataStart<CharacterType>();
// Returns the location of the current token by dispatching to either the
// LChar or the UChar instantiation of tokenLocation().
342 CSSParserLocation CSSTokenizer::currentLocation()
345 return tokenLocation<LChar>();
346 return tokenLocation<UChar>();
// Returns true if an identifier starts at the current character. A single
// leading '-' is allowed: when present, the check is applied to the
// character after it. The cursor itself is not moved.
349 template <typename CharacterType>
350 inline bool CSSTokenizer::isIdentifierStart()
352 // Check whether an identifier is started.
353 return isIdentifierStartAfterDash((*currentCharacter<CharacterType>() != '-') ? currentCharacter<CharacterType>() : currentCharacter<CharacterType>() + 1);
// Selects how checkAndSkipString() reacts to malformed string content:
// with AbortIfInvalid it gives up and returns 0, with SkipInvalid it keeps
// scanning past the offending input.
356 enum CheckStringValidationMode {
// Scans a quoted string without decoding it. |currentCharacter| points at
// the first character after the opening quote and |quote| is the quote
// character that terminates the string. The end of input (a NUL character)
// also terminates the string successfully.
361 template <typename CharacterType>
362 static inline CharacterType* checkAndSkipString(CharacterType* currentCharacter, int quote, CheckStringValidationMode mode)
364 // If mode is AbortIfInvalid and the string check fails, it returns
365 // 0. Otherwise it returns a pointer to the first
366 // character after the string.
368 if (UNLIKELY(*currentCharacter == quote)) {
369 // String parsing is successful.
370 return currentCharacter + 1;
372 if (UNLIKELY(!*currentCharacter)) {
373 // String parsing is successful up to end of input.
374 return currentCharacter;
// '\f' is 0x0c and '\r' is 0x0d, so OR-ing in bit 0 maps both onto '\r';
// the "<= '\r'" pre-check keeps the common case to a single comparison.
376 if (mode == AbortIfInvalid && UNLIKELY(*currentCharacter <= '\r' && (*currentCharacter == '\n' || (*currentCharacter | 0x1) == '\r'))) {
377 // String parsing fails on an unescaped '\n', '\f' or '\r'.
381 if (LIKELY(currentCharacter[0] != '\\')) {
// A backslash directly followed by a newline ('\n', '\f', '\r' or "\r\n")
// is skipped as a whole.
383 } else if (currentCharacter[1] == '\n' || currentCharacter[1] == '\f') {
384 currentCharacter += 2;
385 } else if (currentCharacter[1] == '\r') {
386 currentCharacter += currentCharacter[2] == '\n' ? 3 : 2;
// Any other backslash must start a regular escape sequence.
388 CharacterType* next = checkAndSkipEscape(currentCharacter);
// NOTE(review): in SkipInvalid mode a bad escape appears to be stepped over
// one character at a time instead of failing - confirm against the full body.
390 if (mode == AbortIfInvalid)
392 next = currentCharacter + 1;
394 currentCharacter = next;
// Decodes the escape sequence at |src| and returns its value. |src| must
// point at a backslash followed by a character accepted by isCSSEscape()
// (asserted); it is passed by reference and advanced as the escape is
// consumed. Hex escapes are accumulated four bits per digit, bounded by a
// length counter, and may be followed by one optional whitespace character.
399 template <typename CharacterType>
400 unsigned CSSTokenizer::parseEscape(CharacterType*& src)
402 ASSERT(*src == '\\' && isCSSEscape(src[1]));
404 unsigned unicode = 0;
407 if (isASCIIHexDigit(*src)) {
412 unicode = (unicode << 4) + toASCIIHexValue(*src++);
413 } while (--length && isASCIIHexDigit(*src));
415 // Characters above 0x10ffff are not handled.
416 if (unicode > 0x10ffff)
419 // Optional space after the escape sequence.
420 if (isHTMLSpace<CharacterType>(*src))
// 8-bit destination: the caller must guarantee that |unicode| fits into a
// single LChar (asserted); larger values need the UChar overload.
430 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode)
432 ASSERT(unicode <= 0xff);
// 16-bit destination: writes |unicode| at |result| as UTF-16, using two code
// units (lead + trail surrogate) when U16_LENGTH() reports that it needs them.
439 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode)
441 // Replace unicode with a surrogate pair when it is bigger than 0xffff
442 if (U16_LENGTH(unicode) == 2) {
443 *result++ = U16_LEAD(unicode);
444 *result = U16_TRAIL(unicode);
// Returns an upper bound for the decoded length of the identifier starting
// at |src|, measured on the raw input. |src| is passed by value, so the
// caller's cursor is not moved; this is a look-ahead only. Scanning continues
// while the next character is an identifier character or starts a valid
// escape.
452 template <typename SrcCharacterType>
453 size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)
455 // The decoded form of an identifier (after resolving escape
456 // sequences) will not contain more characters (ASCII or UTF-16
457 // codepoints) than the input. This code can therefore ignore
458 // escape sequences completely.
459 SrcCharacterType* start = src;
461 if (LIKELY(*src != '\\'))
// Called only to advance |src| past the escape; the decoded value is unused.
464 parseEscape<SrcCharacterType>(src);
465 } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));
// Decodes identifier characters from |src| into |result|, advancing both,
// until the next character neither is an identifier character nor starts a
// valid escape. When an escape decodes to a value above 0xff and the
// destination is 8-bit, |src| is rewound to the start of that escape so the
// caller can continue with a 16-bit destination (parseIdentifier() treats a
// false return as exactly this case).
// NOTE(review): |hasEscape| is presumably set when an escape is consumed -
// confirm against the full body.
470 template <typename SrcCharacterType, typename DestCharacterType>
471 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)
475 if (LIKELY(*src != '\\')) {
479 SrcCharacterType* savedEscapeStart = src;
480 unsigned unicode = parseEscape<SrcCharacterType>(src);
// sizeof(DestCharacterType) == 1 identifies the 8-bit destination.
481 if (unicode > 0xff && sizeof(DestCharacterType) == 1) {
482 src = savedEscapeStart;
485 UnicodeToChars(result, unicode);
487 } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));
// Parses the identifier at the current character, writing the decoded text
// through |result| and describing it in |resultString|. The fast path decodes
// into the same character width as the source. If an escape does not fit
// into 8 bits, the part decoded so far is widened into a tokenizer-owned
// 16-bit buffer and decoding resumes there; |resultString| then refers to
// that buffer instead.
492 template <typename CharacterType>
493 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserString& resultString, bool& hasEscape)
495 // If a valid identifier start is found, we can safely
496 // parse the identifier until the next invalid character.
497 ASSERT(isIdentifierStart<CharacterType>());
499 CharacterType* start = currentCharacter<CharacterType>();
500 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) {
501 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
502 ASSERT(is8BitSource());
// Size = characters decoded so far + upper bound for the remaining input.
503 UChar* result16 = allocateStringBuffer16((result - start) + peekMaxIdentifierLen(currentCharacter<CharacterType>()));
504 UChar* start16 = result16;
// Widen the already decoded prefix into the 16-bit buffer.
506 for (; i < result - start; i++)
507 result16[i] = start[i];
511 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, hasEscape);
513 resultString.init(start16, result16 - start16);
518 resultString.init(start, result - start);
// Returns an upper bound for the decoded length of the string starting at
// |src| and terminated by |quote|. Look-ahead only: |src| is passed by value.
// The scan uses SkipInvalid mode, so malformed content does not abort it.
521 template <typename SrcCharacterType>
522 size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)
524 // The decoded form of a CSS string (after resolving escape
525 // sequences) will not contain more characters (ASCII or UTF-16
526 // codepoints) than the input. This code can therefore ignore
527 // escape sequences completely and just return the length of the
528 // input string (possibly including terminating quote if any).
529 SrcCharacterType* end = checkAndSkipString(src, quote, SkipInvalid);
530 return end ? end - src : 0;
// Decodes the body of a quoted string from |src| into |result| until the
// closing |quote| or the end of input is reached. A backslash followed by a
// newline is dropped, other escapes are decoded with parseEscape(). When an
// escape decodes to a value above 0xff and the destination is 8-bit, |src| is
// rewound to the start of that escape so the caller can continue with a
// 16-bit destination (parseString() treats a false return as this case).
533 template <typename SrcCharacterType, typename DestCharacterType>
534 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)
537 if (UNLIKELY(*src == quote)) {
538 // String parsing is done.
542 if (UNLIKELY(!*src)) {
543 // String parsing is done, but don't advance pointer if at the end of input.
546 if (LIKELY(src[0] != '\\')) {
// Backslash + newline ('\n', '\f', '\r' or "\r\n") contributes nothing to
// the decoded string.
548 } else if (src[1] == '\n' || src[1] == '\f') {
550 } else if (src[1] == '\r') {
551 src += src[2] == '\n' ? 3 : 2;
553 SrcCharacterType* savedEscapeStart = src;
554 unsigned unicode = parseEscape<SrcCharacterType>(src);
// sizeof(DestCharacterType) == 1 identifies the 8-bit destination.
555 if (unicode > 0xff && sizeof(DestCharacterType) == 1) {
556 src = savedEscapeStart;
559 UnicodeToChars(result, unicode);
// Parses a quoted string starting at the current character, writing the
// decoded text through |result| and describing it in |resultString|. As in
// parseIdentifier(), an escape that does not fit into 8 bits switches the
// output to a tokenizer-owned 16-bit buffer: the prefix decoded so far is
// widened into it and decoding resumes there.
566 template <typename CharacterType>
567 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& resultString, UChar quote)
569 CharacterType* start = currentCharacter<CharacterType>();
571 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {
572 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
573 ASSERT(is8BitSource());
// Size = characters decoded so far + upper bound for the remaining input.
574 UChar* result16 = allocateStringBuffer16((result - start) + peekMaxStringLen(currentCharacter<CharacterType>(), quote));
575 UChar* start16 = result16;
// Widen the already decoded prefix into the 16-bit buffer.
577 for (; i < result - start; i++)
578 result16[i] = start[i];
582 parseStringInternal(currentCharacter<CharacterType>(), result16, quote);
584 resultString.init(start16, result16 - start16);
588 resultString.init(start, result - start);
// Locates the extent of a URI starting at the current character without
// decoding it and without moving the cursor. Outputs: |start| (first
// character after leading whitespace), |end| and |quote|. A URI that opens
// with '"' or '\'' is validated with checkAndSkipString() in AbortIfInvalid
// mode; otherwise characters accepted by isURILetter() are consumed, with
// escapes validated by checkAndSkipEscape(). Trailing whitespace is skipped.
// parseURI() treats a false return as "not a URI".
591 template <typename CharacterType>
592 inline bool CSSTokenizer::findURI(CharacterType*& start, CharacterType*& end, UChar& quote)
594 start = skipWhiteSpace(currentCharacter<CharacterType>());
596 if (*start == '"' || *start == '\'') {
598 end = checkAndSkipString(start, quote, AbortIfInvalid);
604 while (isURILetter(*end)) {
605 if (LIKELY(*end != '\\')) {
608 end = checkAndSkipEscape(end);
615 end = skipWhiteSpace(end);
// Returns an upper bound for the decoded length of the URI starting at |src|.
// Look-ahead only: |src| is passed by value. Quoted URIs are measured with
// peekMaxStringLen(); unquoted ones by scanning characters accepted by
// isURILetter().
622 template <typename SrcCharacterType>
623 inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)
625 // The decoded form of a URI (after resolving escape sequences)
626 // will not contain more characters (ASCII or UTF-16 codepoints)
627 // than the input. This code can therefore ignore escape sequences
629 SrcCharacterType* start = src;
631 ASSERT(quote == '"' || quote == '\'');
632 return peekMaxStringLen(src, quote);
635 while (isURILetter(*src)) {
636 if (LIKELY(*src != '\\'))
// Called only to advance |src| past the escape; the decoded value is unused.
639 parseEscape<SrcCharacterType>(src);
// Decodes a URI from |src| into |dest|, advancing both. Quoted URIs are
// delegated to parseStringInternal(); unquoted ones are decoded while
// characters are accepted by isURILetter(), resolving escapes with
// parseEscape(). An escape above 0xff with an 8-bit destination cannot be
// stored; parseURI() reacts to a false return by re-parsing with a 16-bit
// destination.
645 template <typename SrcCharacterType, typename DestCharacterType>
646 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)
649 ASSERT(quote == '"' || quote == '\'');
650 return parseStringInternal(src, dest, quote);
653 while (isURILetter(*src)) {
654 if (LIKELY(*src != '\\')) {
657 unsigned unicode = parseEscape<SrcCharacterType>(src);
// sizeof(DestCharacterType) == 1 identifies the 8-bit destination.
658 if (unicode > 0xff && sizeof(DestCharacterType) == 1)
660 UnicodeToChars(dest, unicode);
// Parses a URI at the current character and stores the decoded text in
// |string|. The URI is first located with findURI(); if that fails nothing
// further is attempted here. Decoding normally happens in place: the
// destination starts at |uriStart| inside the input buffer itself. If an
// escape cannot be stored in 8 bits, the URI is re-parsed from |uriStart|
// into a tokenizer-owned 16-bit buffer. Finally the cursor is moved to the
// character after |uriEnd|.
667 template <typename CharacterType>
668 inline void CSSTokenizer::parseURI(CSSParserString& string)
670 CharacterType* uriStart;
671 CharacterType* uriEnd;
673 if (!findURI(uriStart, uriEnd, quote))
// Both the read cursor and the write pointer start at |uriStart|.
676 CharacterType* dest = currentCharacter<CharacterType>() = uriStart;
677 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote))) {
678 string.init(uriStart, dest - uriStart);
680 // An escape sequence was encountered that can't be stored in 8 bits.
681 // Reset the current character to the start of the URI and re-parse with
682 // a 16-bit destination.
683 ASSERT(is8BitSource());
684 currentCharacter<CharacterType>() = uriStart;
685 UChar* result16 = allocateStringBuffer16(peekMaxURILen(currentCharacter<CharacterType>(), quote));
686 UChar* uriStart16 = result16;
687 bool result = parseURIInternal(currentCharacter<CharacterType>(), result16, quote);
688 ASSERT_UNUSED(result, result);
689 string.init(uriStart16, result16 - uriStart16);
692 currentCharacter<CharacterType>() = uriEnd + 1;
// Tries to consume the remainder of a unicode-range token. On entry the
// current character must be the '+' (asserted). Two shapes are recognised:
// hex digits padded with '?' wildcards, or hex digits optionally followed by
// '-' and further hex digits; every run is bounded by a length counter.
// Scanning uses a local pointer, and the cursor is only updated on the two
// accepting paths.
696 template <typename CharacterType>
697 inline bool CSSTokenizer::parseUnicodeRange()
699 CharacterType* character = currentCharacter<CharacterType>() + 1;
701 ASSERT(*currentCharacter<CharacterType>() == '+');
703 while (isASCIIHexDigit(*character) && length) {
708 if (length && *character == '?') {
709 // At most 5 hex digits followed by a question mark.
713 } while (*character == '?' && length);
714 currentCharacter<CharacterType>() = character;
719 // At least one hex digit.
720 if (character[0] == '-' && isASCIIHexDigit(character[1])) {
721 // Followed by a dash and a hex digit.
726 } while (--length && isASCIIHexDigit(*character));
728 currentCharacter<CharacterType>() = character;
// Looks for "<digits>n" at the current character: ASCII digits are skipped
// and, if the next character is 'n' or 'N', the cursor is moved just past
// it. The cursor is left untouched otherwise. realLex() uses the boolean
// result to decide whether an NTH token was found.
734 template <typename CharacterType>
735 bool CSSTokenizer::parseNthChild()
737 CharacterType* character = currentCharacter<CharacterType>();
739 while (isASCIIDigit(*character))
741 if (isASCIIAlphaCaselessEqual(*character, 'n')) {
742 currentCharacter<CharacterType>() = character + 1;
// Tries to consume the signed offset that may follow the 'n' of an nth-*
// argument: optional whitespace, '+' or '-', optional whitespace, then one or
// more ASCII digits. Scanning uses a local pointer; the cursor is only moved
// once the whole sequence has matched.
748 template <typename CharacterType>
749 bool CSSTokenizer::parseNthChildExtra()
751 CharacterType* character = skipWhiteSpace(currentCharacter<CharacterType>());
752 if (*character != '+' && *character != '-')
755 character = skipWhiteSpace(character + 1);
756 if (!isASCIIDigit(*character))
761 } while (isASCIIDigit(*character));
763 currentCharacter<CharacterType>() = character;
// Classifies a function name: the |length| characters starting at
// tokenStart() are matched against the known function names and m_token is
// set accordingly. The nth-* pseudo-class functions additionally switch
// m_parsingMode to NthChildMode so that their argument is tokenized with the
// nth-child rules.
767 template <typename CharacterType>
768 inline bool CSSTokenizer::detectFunctionTypeToken(int length)
771 CharacterType* name = tokenStart<CharacterType>();
772 SWITCH(name, length) {
774 m_token = NOTFUNCTION;
782 m_token = CUEFUNCTION;
786 m_token = CALCFUNCTION;
790 m_token = HOSTFUNCTION;
793 CASE("host-context") {
794 m_token = HOSTCONTEXTFUNCTION;
798 m_parsingMode = NthChildMode;
801 CASE("nth-of-type") {
802 m_parsingMode = NthChildMode;
805 CASE("nth-last-child") {
806 m_parsingMode = NthChildMode;
809 CASE("nth-last-of-type") {
810 m_parsingMode = NthChildMode;
// Maps media query keywords to their dedicated tokens by matching the
// |length| characters starting at tokenStart(). Must only be called while
// in MediaQueryMode (asserted).
817 template <typename CharacterType>
818 inline void CSSTokenizer::detectMediaQueryToken(int length)
820 ASSERT(m_parsingMode == MediaQueryMode);
821 CharacterType* name = tokenStart<CharacterType>();
823 SWITCH(name, length) {
831 m_token = MEDIA_ONLY;
// Classifies the unit that follows a number. |type| points at the first
// character of the unit and |length| is the number of characters in it.
// realLex() inspects m_token after this call (for example to detect DIMEN).
839 template <typename CharacterType>
840 inline void CSSTokenizer::detectNumberToken(CharacterType* type, int length)
844 SWITCH(type, length) {
855 // There is a discussion about the name of this unit on www-style.
856 // Keep this compile time guard in place until that is resolved.
857 // http://lists.w3.org/Archives/Public/www-style/2012May/0915.html
// Classifies a function name that starts with '-' (vendor-prefixed
// functions). The leading dash is ignored, so the CASE strings below are
// written without it, e.g. "webkit-calc" matches "-webkit-calc".
932 template <typename CharacterType>
933 inline void CSSTokenizer::detectDashToken(int length)
935 CharacterType* name = tokenStart<CharacterType>();
937 // Ignore leading dash.
941 SWITCH(name, length) {
943 m_token = ANYFUNCTION;
945 CASE("webkit-calc") {
946 m_token = CALCFUNCTION;
// Classifies an at-keyword. The token starts at '@' and is at least two
// characters long (asserted); |hasEscape| tells whether the keyword was
// written using escape sequences.
//  - Page margin box keywords (top-left, bottom-center, left-middle, ...)
//    are only recognised when the keyword contained no escapes.
//  - The -internal-* keywords guarded below additionally require m_internal.
//  - The charset token is only produced when |name - 1| is the very start of
//    the input data.
//  - The import and media tokens switch the tokenizer to MediaQueryMode, the
//    supports tokens switch it to SupportsMode.
//  - KEYFRAMES_SYM is gated on the cssAnimationUnprefixed runtime flag,
//    whereas -webkit-keyframes is always recognised.
951 template <typename CharacterType>
952 inline void CSSTokenizer::detectAtToken(int length, bool hasEscape)
954 CharacterType* name = tokenStart<CharacterType>();
955 ASSERT(name[0] == '@' && length >= 2);
961 // charset, font-face, import, media, namespace, page, supports,
962 // -webkit-keyframes, keyframes, and -webkit-mediaquery are not affected by hasEscape.
963 SWITCH(name, length) {
964 CASE("bottom-left") {
965 if (LIKELY(!hasEscape))
966 m_token = BOTTOMLEFT_SYM;
968 CASE("bottom-right") {
969 if (LIKELY(!hasEscape))
970 m_token = BOTTOMRIGHT_SYM;
972 CASE("bottom-center") {
973 if (LIKELY(!hasEscape))
974 m_token = BOTTOMCENTER_SYM;
976 CASE("bottom-left-corner") {
977 if (LIKELY(!hasEscape))
978 m_token = BOTTOMLEFTCORNER_SYM;
980 CASE("bottom-right-corner") {
981 if (LIKELY(!hasEscape))
982 m_token = BOTTOMRIGHTCORNER_SYM;
985 if (name - 1 == dataStart<CharacterType>())
986 m_token = CHARSET_SYM;
989 m_token = FONT_FACE_SYM;
992 m_parsingMode = MediaQueryMode;
993 m_token = IMPORT_SYM;
996 if (RuntimeEnabledFeatures::cssAnimationUnprefixedEnabled())
997 m_token = KEYFRAMES_SYM;
1000 if (LIKELY(!hasEscape))
1001 m_token = LEFTTOP_SYM;
1003 CASE("left-middle") {
1004 if (LIKELY(!hasEscape))
1005 m_token = LEFTMIDDLE_SYM;
1007 CASE("left-bottom") {
1008 if (LIKELY(!hasEscape))
1009 m_token = LEFTBOTTOM_SYM;
1012 m_parsingMode = MediaQueryMode;
1013 m_token = MEDIA_SYM;
1016 m_token = NAMESPACE_SYM;
1022 if (LIKELY(!hasEscape))
1023 m_token = RIGHTTOP_SYM;
1025 CASE("right-middle") {
1026 if (LIKELY(!hasEscape))
1027 m_token = RIGHTMIDDLE_SYM;
1029 CASE("right-bottom") {
1030 if (LIKELY(!hasEscape))
1031 m_token = RIGHTBOTTOM_SYM;
1034 m_parsingMode = SupportsMode;
1035 m_token = SUPPORTS_SYM;
1038 if (LIKELY(!hasEscape))
1039 m_token = TOPLEFT_SYM;
1042 if (LIKELY(!hasEscape))
1043 m_token = TOPRIGHT_SYM;
1045 CASE("top-center") {
1046 if (LIKELY(!hasEscape))
1047 m_token = TOPCENTER_SYM;
1049 CASE("top-left-corner") {
1050 if (LIKELY(!hasEscape))
1051 m_token = TOPLEFTCORNER_SYM;
1053 CASE("top-right-corner") {
1054 if (LIKELY(!hasEscape))
1055 m_token = TOPRIGHTCORNER_SYM;
1058 m_token = VIEWPORT_RULE_SYM;
1060 CASE("-internal-rule") {
1061 if (LIKELY(!hasEscape && m_internal))
1062 m_token = INTERNAL_RULE_SYM;
1064 CASE("-internal-decls") {
1065 if (LIKELY(!hasEscape && m_internal))
1066 m_token = INTERNAL_DECLS_SYM;
1068 CASE("-internal-value") {
1069 if (LIKELY(!hasEscape && m_internal))
1070 m_token = INTERNAL_VALUE_SYM;
1072 CASE("-webkit-keyframes") {
1073 m_token = WEBKIT_KEYFRAMES_SYM;
1075 CASE("-internal-selector") {
1076 if (LIKELY(!hasEscape && m_internal))
1077 m_token = INTERNAL_SELECTOR_SYM;
1079 CASE("-internal-keyframe-rule") {
1080 if (LIKELY(!hasEscape && m_internal))
1081 m_token = INTERNAL_KEYFRAME_RULE_SYM;
1083 CASE("-internal-keyframe-key-list") {
1086 m_token = INTERNAL_KEYFRAME_KEY_LIST_SYM;
1088 CASE("-internal-supports-condition") {
1091 m_parsingMode = SupportsMode;
1092 m_token = INTERNAL_SUPPORTS_CONDITION_SYM;
// Maps the @supports condition keywords to SUPPORTS_OR, SUPPORTS_AND and
// SUPPORTS_NOT by matching the |length| characters starting at tokenStart().
// Must only be called while in SupportsMode (asserted).
1097 template <typename CharacterType>
1098 inline void CSSTokenizer::detectSupportsToken(int length)
1100 ASSERT(m_parsingMode == SupportsMode);
1101 CharacterType* name = tokenStart<CharacterType>();
1103 SWITCH(name, length) {
1105 m_token = SUPPORTS_OR;
1108 m_token = SUPPORTS_AND;
1111 m_token = SUPPORTS_NOT;
1116 template <typename SrcCharacterType>
1117 int CSSTokenizer::realLex(void* yylvalWithoutType)
1119 YYSTYPE* yylval = static_cast<YYSTYPE*>(yylvalWithoutType);
1120 // Write pointer for the next character.
1121 SrcCharacterType* result;
1122 CSSParserString resultString;
1125 // The input buffer is terminated by a \0 character, so
1126 // it is safe to read one character ahead of a known non-null.
1128 // In debug we check with an ASSERT that the length is > 0 for string types.
1129 yylval->string.clear();
1132 restartAfterComment:
1133 result = currentCharacter<SrcCharacterType>();
1134 setTokenStart(result);
1135 m_tokenStartLineNumber = m_lineNumber;
1136 m_token = *currentCharacter<SrcCharacterType>();
1137 ++currentCharacter<SrcCharacterType>();
1139 switch ((m_token <= 127) ? typesOfASCIICharacters[m_token] : CharacterIdentifierStart) {
1140 case CharacterCaselessU:
1141 if (UNLIKELY(*currentCharacter<SrcCharacterType>() == '+')) {
1142 if (parseUnicodeRange<SrcCharacterType>()) {
1143 m_token = UNICODERANGE;
1144 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1148 // Fall through to CharacterIdentifierStart.
1150 case CharacterIdentifierStart:
1151 --currentCharacter<SrcCharacterType>();
1152 parseIdentifier(result, yylval->string, hasEscape);
1155 if (UNLIKELY(*currentCharacter<SrcCharacterType>() == '(')) {
1156 if (m_parsingMode == SupportsMode && !hasEscape) {
1157 detectSupportsToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
1158 if (m_token != IDENT)
1164 detectFunctionTypeToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
1167 ++currentCharacter<SrcCharacterType>();
1170 if (m_token == URI) {
1172 // Check whether it is really an URI.
1173 if (yylval->string.is8Bit())
1174 parseURI<LChar>(yylval->string);
1176 parseURI<UChar>(yylval->string);
1178 } else if (UNLIKELY(m_parsingMode != NormalMode) && !hasEscape) {
1179 if (m_parsingMode == MediaQueryMode) {
1180 detectMediaQueryToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
1181 } else if (m_parsingMode == SupportsMode) {
1182 detectSupportsToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
1183 } else if (m_parsingMode == NthChildMode && isASCIIAlphaCaselessEqual(tokenStart<SrcCharacterType>()[0], 'n')) {
1184 if (result - tokenStart<SrcCharacterType>() == 1) {
1185 // String "n" is IDENT but "n+1" is NTH.
1186 if (parseNthChildExtra<SrcCharacterType>()) {
1188 yylval->string.m_length = currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>();
1190 } else if (result - tokenStart<SrcCharacterType>() >= 2 && tokenStart<SrcCharacterType>()[1] == '-') {
1191 // String "n-" is IDENT but "n-1" is NTH.
1192 // Set currentCharacter to '-' to continue parsing.
1193 SrcCharacterType* nextCharacter = result;
1194 currentCharacter<SrcCharacterType>() = tokenStart<SrcCharacterType>() + 1;
1195 if (parseNthChildExtra<SrcCharacterType>()) {
1197 yylval->string.setLength(currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1199 // Revert the change to currentCharacter if unsuccessful.
1200 currentCharacter<SrcCharacterType>() = nextCharacter;
1208 if (!isASCIIDigit(currentCharacter<SrcCharacterType>()[0]))
1210 // Fall through to CharacterNumber.
1212 case CharacterNumber: {
1213 bool dotSeen = (m_token == '.');
1216 if (!isASCIIDigit(currentCharacter<SrcCharacterType>()[0])) {
1217 // Only one dot is allowed for a number,
1218 // and it must be followed by a digit.
1219 if (currentCharacter<SrcCharacterType>()[0] != '.' || dotSeen || !isASCIIDigit(currentCharacter<SrcCharacterType>()[1]))
1223 ++currentCharacter<SrcCharacterType>();
1226 if (UNLIKELY(m_parsingMode == NthChildMode) && !dotSeen && isASCIIAlphaCaselessEqual(*currentCharacter<SrcCharacterType>(), 'n')) {
1227 // "[0-9]+n" is always an NthChild.
1228 ++currentCharacter<SrcCharacterType>();
1229 parseNthChildExtra<SrcCharacterType>();
1231 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1235 // We need to take care of units like 'em' or 'ex'.
1236 SrcCharacterType* character = currentCharacter<SrcCharacterType>();
1237 if (isASCIIAlphaCaselessEqual(*character, 'e')) {
1238 ASSERT(character - tokenStart<SrcCharacterType>() > 0);
1240 if (*character == '-' || *character == '+' || isASCIIDigit(*character)) {
1242 while (isASCIIDigit(*character))
1244 // Use FLOATTOKEN if the string contains exponents.
1246 currentCharacter<SrcCharacterType>() = character;
1250 yylval->number = charactersToDouble(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1252 // Type of the function.
1253 if (isIdentifierStart<SrcCharacterType>()) {
1254 SrcCharacterType* type = currentCharacter<SrcCharacterType>();
1255 result = currentCharacter<SrcCharacterType>();
1257 parseIdentifier(result, resultString, hasEscape);
1261 detectNumberToken(type, currentCharacter<SrcCharacterType>() - type);
1263 if (m_token == DIMEN) {
1264 // The decoded number is overwritten, but this is intentional.
1265 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1267 } else if (*currentCharacter<SrcCharacterType>() == '%') {
1268 // Although the CSS grammar says {num}% we follow
1269 // webkit at the moment which uses {num}%+.
1271 ++currentCharacter<SrcCharacterType>();
1272 } while (*currentCharacter<SrcCharacterType>() == '%');
1273 m_token = PERCENTAGE;
1275 m_token = dotSeen ? FLOATTOKEN : INTEGER;
1281 if (isIdentifierStartAfterDash(currentCharacter<SrcCharacterType>())) {
1282 --currentCharacter<SrcCharacterType>();
1283 parseIdentifier(result, resultString, hasEscape);
1286 if (*currentCharacter<SrcCharacterType>() == '(') {
1289 detectDashToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
1290 ++currentCharacter<SrcCharacterType>();
1292 } else if (UNLIKELY(m_parsingMode == NthChildMode) && !hasEscape && isASCIIAlphaCaselessEqual(tokenStart<SrcCharacterType>()[1], 'n')) {
1293 if (result - tokenStart<SrcCharacterType>() == 2) {
1294 // String "-n" is IDENT but "-n+1" is NTH.
1295 if (parseNthChildExtra<SrcCharacterType>()) {
1297 result = currentCharacter<SrcCharacterType>();
1299 } else if (result - tokenStart<SrcCharacterType>() >= 3 && tokenStart<SrcCharacterType>()[2] == '-') {
1300 // String "-n-" is IDENT but "-n-1" is NTH.
1301 // Set currentCharacter to second '-' of '-n-' to continue parsing.
1302 SrcCharacterType* nextCharacter = result;
1303 currentCharacter<SrcCharacterType>() = tokenStart<SrcCharacterType>() + 2;
1304 if (parseNthChildExtra<SrcCharacterType>()) {
1306 result = currentCharacter<SrcCharacterType>();
1308 // Revert the change to currentCharacter if unsuccessful.
1309 currentCharacter<SrcCharacterType>() = nextCharacter;
1312 resultString.setLength(result - tokenStart<SrcCharacterType>());
1314 yylval->string = resultString;
1315 } else if (currentCharacter<SrcCharacterType>()[0] == '-' && currentCharacter<SrcCharacterType>()[1] == '>') {
1316 currentCharacter<SrcCharacterType>() += 2;
1318 } else if (UNLIKELY(m_parsingMode == NthChildMode)) {
1319 // "-[0-9]+n" is always an NthChild.
1320 if (parseNthChild<SrcCharacterType>()) {
1321 parseNthChildExtra<SrcCharacterType>();
1323 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1328 case CharacterOther:
1329 // m_token is simply the current character.
1333 // Do not advance pointer at the end of input.
1334 --currentCharacter<SrcCharacterType>();
1337 case CharacterWhiteSpace:
1338 m_token = WHITESPACE;
1339 // Might start with a '\n'.
1340 --currentCharacter<SrcCharacterType>();
1342 if (*currentCharacter<SrcCharacterType>() == '\n')
1344 ++currentCharacter<SrcCharacterType>();
1345 } while (*currentCharacter<SrcCharacterType>() <= ' ' && (typesOfASCIICharacters[*currentCharacter<SrcCharacterType>()] == CharacterWhiteSpace));
1348 case CharacterEndMediaQueryOrSupports:
1349 if (m_parsingMode == MediaQueryMode || m_parsingMode == SupportsMode)
1350 m_parsingMode = NormalMode;
1353 case CharacterEndNthChild:
1354 if (m_parsingMode == NthChildMode)
1355 m_parsingMode = NormalMode;
1358 case CharacterQuote:
1359 if (checkAndSkipString(currentCharacter<SrcCharacterType>(), m_token, AbortIfInvalid)) {
1361 parseString<SrcCharacterType>(result, yylval->string, m_token);
1366 case CharacterExclamationMark: {
1367 SrcCharacterType* start = skipWhiteSpace(currentCharacter<SrcCharacterType>());
1368 if (isEqualToCSSIdentifier(start, "important")) {
1369 m_token = IMPORTANT_SYM;
1370 currentCharacter<SrcCharacterType>() = start + 9;
1375 case CharacterHashmark: {
1376 SrcCharacterType* start = currentCharacter<SrcCharacterType>();
1377 result = currentCharacter<SrcCharacterType>();
1379 if (isASCIIDigit(*currentCharacter<SrcCharacterType>())) {
1380 // This must be a valid hex number token.
1382 ++currentCharacter<SrcCharacterType>();
1383 } while (isASCIIHexDigit(*currentCharacter<SrcCharacterType>()));
1385 yylval->string.init(start, currentCharacter<SrcCharacterType>() - start);
1386 } else if (isIdentifierStart<SrcCharacterType>()) {
1388 parseIdentifier(result, yylval->string, hasEscape);
1390 // Check whether the identifier is also a valid hex number.
1391 SrcCharacterType* current = start;
1394 if (!isASCIIHexDigit(*current)) {
1399 } while (current < result);
1405 case CharacterSlash:
1406 // Ignore comments. They are not even considered as white spaces.
1407 if (*currentCharacter<SrcCharacterType>() == '*') {
1408 const CSSParserLocation startLocation = currentLocation();
1409 if (m_parser.m_observer) {
1410 unsigned startOffset = currentCharacter<SrcCharacterType>() - dataStart<SrcCharacterType>() - 1; // Start with a slash.
1411 m_parser.m_observer->startComment(startOffset - m_parsedTextPrefixLength);
1413 ++currentCharacter<SrcCharacterType>();
1414 while (currentCharacter<SrcCharacterType>()[0] != '*' || currentCharacter<SrcCharacterType>()[1] != '/') {
1415 if (*currentCharacter<SrcCharacterType>() == '\n')
1417 if (*currentCharacter<SrcCharacterType>() == '\0') {
1418 // Unterminated comments are simply ignored.
1419 currentCharacter<SrcCharacterType>() -= 2;
1420 m_parser.reportError(startLocation, UnterminatedCommentCSSError);
1423 ++currentCharacter<SrcCharacterType>();
1425 currentCharacter<SrcCharacterType>() += 2;
1426 if (m_parser.m_observer) {
1427 unsigned endOffset = currentCharacter<SrcCharacterType>() - dataStart<SrcCharacterType>();
1428 unsigned userTextEndOffset = static_cast<unsigned>(m_length - 1 - m_parsedTextSuffixLength);
1429 m_parser.m_observer->endComment(std::min(endOffset, userTextEndOffset) - m_parsedTextPrefixLength);
1431 goto restartAfterComment;
1435 case CharacterDollar:
1436 if (*currentCharacter<SrcCharacterType>() == '=') {
1437 ++currentCharacter<SrcCharacterType>();
1442 case CharacterAsterisk:
1443 if (*currentCharacter<SrcCharacterType>() == '=') {
1444 ++currentCharacter<SrcCharacterType>();
1450 if (UNLIKELY(m_parsingMode == NthChildMode)) {
1451 // Simplest case. "+[0-9]*n" is always NthChild.
1452 if (parseNthChild<SrcCharacterType>()) {
1453 parseNthChildExtra<SrcCharacterType>();
1455 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1461 if (currentCharacter<SrcCharacterType>()[0] == '!' && currentCharacter<SrcCharacterType>()[1] == '-' && currentCharacter<SrcCharacterType>()[2] == '-') {
1462 currentCharacter<SrcCharacterType>() += 3;
1468 if (isIdentifierStart<SrcCharacterType>()) {
1469 m_token = ATKEYWORD;
1471 parseIdentifier(result, resultString, hasEscape);
1472 // The standard enables unicode escapes in at-rules. In this case only the resultString will contain the
1473 // correct identifier, hence we have to use it to determine its length instead of the usual pointer arithmetic.
1474 detectAtToken<SrcCharacterType>(resultString.length() + 1, hasEscape);
1478 case CharacterBackSlash:
1479 if (isCSSEscape(*currentCharacter<SrcCharacterType>())) {
1480 --currentCharacter<SrcCharacterType>();
1481 parseIdentifier(result, yylval->string, hasEscape);
1487 if (*currentCharacter<SrcCharacterType>() == '=') {
1488 ++currentCharacter<SrcCharacterType>();
1489 m_token = BEGINSWITH;
1493 case CharacterVerticalBar:
1494 if (*currentCharacter<SrcCharacterType>() == '=') {
1495 ++currentCharacter<SrcCharacterType>();
1496 m_token = DASHMATCH;
1500 case CharacterTilde:
1501 if (*currentCharacter<SrcCharacterType>() == '=') {
1502 ++currentCharacter<SrcCharacterType>();
1508 ASSERT_NOT_REACHED();
1516 inline void CSSTokenizer::setTokenStart<LChar>(LChar* tokenStart)
1518 m_tokenStart.ptr8 = tokenStart;
1522 inline void CSSTokenizer::setTokenStart<UChar>(UChar* tokenStart)
1524 m_tokenStart.ptr16 = tokenStart;
1527 void CSSTokenizer::setupTokenizer(const char* prefix, unsigned prefixLength, const String& string, const char* suffix, unsigned suffixLength)
1529 m_parsedTextPrefixLength = prefixLength;
1530 m_parsedTextSuffixLength = suffixLength;
1531 unsigned stringLength = string.length();
1532 unsigned length = stringLength + m_parsedTextPrefixLength + m_parsedTextSuffixLength + 1;
1535 if (!stringLength || string.is8Bit()) {
1536 m_dataStart8 = adoptArrayPtr(new LChar[length]);
1537 for (unsigned i = 0; i < m_parsedTextPrefixLength; i++)
1538 m_dataStart8[i] = prefix[i];
1541 memcpy(m_dataStart8.get() + m_parsedTextPrefixLength, string.characters8(), stringLength * sizeof(LChar));
1543 unsigned start = m_parsedTextPrefixLength + stringLength;
1544 unsigned end = start + suffixLength;
1545 for (unsigned i = start; i < end; i++)
1546 m_dataStart8[i] = suffix[i - start];
1548 m_dataStart8[length - 1] = 0;
1550 m_is8BitSource = true;
1551 m_currentCharacter8 = m_dataStart8.get();
1552 m_currentCharacter16 = 0;
1553 setTokenStart<LChar>(m_currentCharacter8);
1554 m_lexFunc = &CSSTokenizer::realLex<LChar>;
1558 m_dataStart16 = adoptArrayPtr(new UChar[length]);
1559 for (unsigned i = 0; i < m_parsedTextPrefixLength; i++)
1560 m_dataStart16[i] = prefix[i];
1562 ASSERT(stringLength);
1563 memcpy(m_dataStart16.get() + m_parsedTextPrefixLength, string.characters16(), stringLength * sizeof(UChar));
1565 unsigned start = m_parsedTextPrefixLength + stringLength;
1566 unsigned end = start + suffixLength;
1567 for (unsigned i = start; i < end; i++)
1568 m_dataStart16[i] = suffix[i - start];
1570 m_dataStart16[length - 1] = 0;
1572 m_is8BitSource = false;
1573 m_currentCharacter8 = 0;
1574 m_currentCharacter16 = m_dataStart16.get();
1575 setTokenStart<UChar>(m_currentCharacter16);
1576 m_lexFunc = &CSSTokenizer::realLex<UChar>;
1579 } // namespace blink