Upstream version 10.39.225.0
[platform/framework/web/crosswalk.git] / src / third_party / WebKit / Source / core / css / parser / CSSTokenizer-in.cpp
1 /*
2  * Copyright (C) 2003 Lars Knoll (knoll@kde.org)
3  * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com)
4  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.
5  * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com>
6  * Copyright (C) 2008 Eric Seidel <eric@webkit.org>
7  * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
8  * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.
9  * Copyright (C) 2012 Intel Corporation. All rights reserved.
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Library General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Library General Public License for more details.
20  *
21  * You should have received a copy of the GNU Library General Public License
22  * along with this library; see the file COPYING.LIB.  If not, write to
23  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
24  * Boston, MA 02110-1301, USA.
25  */
26
27 #include "config.h"
28 #include "core/css/parser/CSSTokenizer.h"
29
30 #include "core/css/CSSKeyframeRule.h"
31 #include "core/css/MediaQuery.h"
32 #include "core/css/StyleRule.h"
33 #include "core/css/parser/BisonCSSParser.h"
34 #include "core/css/parser/CSSParserValues.h"
35 #include "core/html/parser/HTMLParserIdioms.h"
36 #include "core/svg/SVGParserUtilities.h"
37
38 namespace blink {
39
40 #include "core/CSSGrammar.h"
41
// Classification of a single character, as used by the tokenizer's main
// switch and by the ASCII lookup table.
enum CharacterType {
    // Identifier characters. These four enumerators have to stay first and
    // contiguous: the identifier test is "type <= CharacterDash".
    CharacterCaselessU = 0,
    CharacterIdentifierStart,
    CharacterNumber,
    CharacterDash,

    // Every other classification.
    CharacterOther,
    CharacterNull,
    CharacterWhiteSpace,
    CharacterEndMediaQueryOrSupports,
    CharacterEndNthChild,
    CharacterQuote,
    CharacterExclamationMark,
    CharacterHashmark,
    CharacterDollar,
    CharacterAsterisk,
    CharacterPlus,
    CharacterDot,
    CharacterSlash,
    CharacterLess,
    CharacterAt,
    CharacterBackSlash,
    CharacterXor,
    CharacterVerticalBar,
    CharacterTilde,
};
72
73 // 128 ASCII codes
74 static const CharacterType typesOfASCIICharacters[128] = {
75 /*   0 - Null               */ CharacterNull,
76 /*   1 - Start of Heading   */ CharacterOther,
77 /*   2 - Start of Text      */ CharacterOther,
78 /*   3 - End of Text        */ CharacterOther,
79 /*   4 - End of Transm.     */ CharacterOther,
80 /*   5 - Enquiry            */ CharacterOther,
81 /*   6 - Acknowledgment     */ CharacterOther,
82 /*   7 - Bell               */ CharacterOther,
83 /*   8 - Back Space         */ CharacterOther,
84 /*   9 - Horizontal Tab     */ CharacterWhiteSpace,
85 /*  10 - Line Feed          */ CharacterWhiteSpace,
86 /*  11 - Vertical Tab       */ CharacterOther,
87 /*  12 - Form Feed          */ CharacterWhiteSpace,
88 /*  13 - Carriage Return    */ CharacterWhiteSpace,
89 /*  14 - Shift Out          */ CharacterOther,
90 /*  15 - Shift In           */ CharacterOther,
91 /*  16 - Data Line Escape   */ CharacterOther,
92 /*  17 - Device Control 1   */ CharacterOther,
93 /*  18 - Device Control 2   */ CharacterOther,
94 /*  19 - Device Control 3   */ CharacterOther,
95 /*  20 - Device Control 4   */ CharacterOther,
96 /*  21 - Negative Ack.      */ CharacterOther,
97 /*  22 - Synchronous Idle   */ CharacterOther,
98 /*  23 - End of Transmit    */ CharacterOther,
99 /*  24 - Cancel             */ CharacterOther,
100 /*  25 - End of Medium      */ CharacterOther,
101 /*  26 - Substitute         */ CharacterOther,
102 /*  27 - Escape             */ CharacterOther,
103 /*  28 - File Separator     */ CharacterOther,
104 /*  29 - Group Separator    */ CharacterOther,
105 /*  30 - Record Separator   */ CharacterOther,
106 /*  31 - Unit Separator     */ CharacterOther,
107 /*  32 - Space              */ CharacterWhiteSpace,
108 /*  33 - !                  */ CharacterExclamationMark,
109 /*  34 - "                  */ CharacterQuote,
110 /*  35 - #                  */ CharacterHashmark,
111 /*  36 - $                  */ CharacterDollar,
112 /*  37 - %                  */ CharacterOther,
113 /*  38 - &                  */ CharacterOther,
114 /*  39 - '                  */ CharacterQuote,
115 /*  40 - (                  */ CharacterOther,
116 /*  41 - )                  */ CharacterEndNthChild,
117 /*  42 - *                  */ CharacterAsterisk,
118 /*  43 - +                  */ CharacterPlus,
119 /*  44 - ,                  */ CharacterOther,
120 /*  45 - -                  */ CharacterDash,
121 /*  46 - .                  */ CharacterDot,
122 /*  47 - /                  */ CharacterSlash,
123 /*  48 - 0                  */ CharacterNumber,
124 /*  49 - 1                  */ CharacterNumber,
125 /*  50 - 2                  */ CharacterNumber,
126 /*  51 - 3                  */ CharacterNumber,
127 /*  52 - 4                  */ CharacterNumber,
128 /*  53 - 5                  */ CharacterNumber,
129 /*  54 - 6                  */ CharacterNumber,
130 /*  55 - 7                  */ CharacterNumber,
131 /*  56 - 8                  */ CharacterNumber,
132 /*  57 - 9                  */ CharacterNumber,
133 /*  58 - :                  */ CharacterOther,
134 /*  59 - ;                  */ CharacterEndMediaQueryOrSupports,
135 /*  60 - <                  */ CharacterLess,
136 /*  61 - =                  */ CharacterOther,
137 /*  62 - >                  */ CharacterOther,
138 /*  63 - ?                  */ CharacterOther,
139 /*  64 - @                  */ CharacterAt,
140 /*  65 - A                  */ CharacterIdentifierStart,
141 /*  66 - B                  */ CharacterIdentifierStart,
142 /*  67 - C                  */ CharacterIdentifierStart,
143 /*  68 - D                  */ CharacterIdentifierStart,
144 /*  69 - E                  */ CharacterIdentifierStart,
145 /*  70 - F                  */ CharacterIdentifierStart,
146 /*  71 - G                  */ CharacterIdentifierStart,
147 /*  72 - H                  */ CharacterIdentifierStart,
148 /*  73 - I                  */ CharacterIdentifierStart,
149 /*  74 - J                  */ CharacterIdentifierStart,
150 /*  75 - K                  */ CharacterIdentifierStart,
151 /*  76 - L                  */ CharacterIdentifierStart,
152 /*  77 - M                  */ CharacterIdentifierStart,
153 /*  78 - N                  */ CharacterIdentifierStart,
154 /*  79 - O                  */ CharacterIdentifierStart,
155 /*  80 - P                  */ CharacterIdentifierStart,
156 /*  81 - Q                  */ CharacterIdentifierStart,
157 /*  82 - R                  */ CharacterIdentifierStart,
158 /*  83 - S                  */ CharacterIdentifierStart,
159 /*  84 - T                  */ CharacterIdentifierStart,
160 /*  85 - U                  */ CharacterCaselessU,
161 /*  86 - V                  */ CharacterIdentifierStart,
162 /*  87 - W                  */ CharacterIdentifierStart,
163 /*  88 - X                  */ CharacterIdentifierStart,
164 /*  89 - Y                  */ CharacterIdentifierStart,
165 /*  90 - Z                  */ CharacterIdentifierStart,
166 /*  91 - [                  */ CharacterOther,
167 /*  92 - \                  */ CharacterBackSlash,
168 /*  93 - ]                  */ CharacterOther,
169 /*  94 - ^                  */ CharacterXor,
170 /*  95 - _                  */ CharacterIdentifierStart,
171 /*  96 - `                  */ CharacterOther,
172 /*  97 - a                  */ CharacterIdentifierStart,
173 /*  98 - b                  */ CharacterIdentifierStart,
174 /*  99 - c                  */ CharacterIdentifierStart,
175 /* 100 - d                  */ CharacterIdentifierStart,
176 /* 101 - e                  */ CharacterIdentifierStart,
177 /* 102 - f                  */ CharacterIdentifierStart,
178 /* 103 - g                  */ CharacterIdentifierStart,
179 /* 104 - h                  */ CharacterIdentifierStart,
180 /* 105 - i                  */ CharacterIdentifierStart,
181 /* 106 - j                  */ CharacterIdentifierStart,
182 /* 107 - k                  */ CharacterIdentifierStart,
183 /* 108 - l                  */ CharacterIdentifierStart,
184 /* 109 - m                  */ CharacterIdentifierStart,
185 /* 110 - n                  */ CharacterIdentifierStart,
186 /* 111 - o                  */ CharacterIdentifierStart,
187 /* 112 - p                  */ CharacterIdentifierStart,
188 /* 113 - q                  */ CharacterIdentifierStart,
189 /* 114 - r                  */ CharacterIdentifierStart,
190 /* 115 - s                  */ CharacterIdentifierStart,
191 /* 116 - t                  */ CharacterIdentifierStart,
192 /* 117 - u                  */ CharacterCaselessU,
193 /* 118 - v                  */ CharacterIdentifierStart,
194 /* 119 - w                  */ CharacterIdentifierStart,
195 /* 120 - x                  */ CharacterIdentifierStart,
196 /* 121 - y                  */ CharacterIdentifierStart,
197 /* 122 - z                  */ CharacterIdentifierStart,
198 /* 123 - {                  */ CharacterEndMediaQueryOrSupports,
199 /* 124 - |                  */ CharacterVerticalBar,
200 /* 125 - }                  */ CharacterOther,
201 /* 126 - ~                  */ CharacterTilde,
202 /* 127 - Delete             */ CharacterOther,
203 };
204
205 // Utility functions for the CSS tokenizer.
206
207 template <typename CharacterType>
208 static inline bool isCSSLetter(CharacterType character)
209 {
210     return character >= 128 || typesOfASCIICharacters[character] <= CharacterDash;
211 }
212
// Returns true when |character| is accepted right after a backslash: anything
// from space upwards except DEL (127).
template <typename CharacterType>
static inline bool isCSSEscape(CharacterType character)
{
    if (character < ' ')
        return false;
    return character != 127;
}
218
// Returns true when |character| may appear in an unquoted url(...) body:
// '!', the range '#'..'&', or anything from '*' upwards except DEL (127).
template <typename CharacterType>
static inline bool isURILetter(CharacterType character)
{
    if (character == '!')
        return true;
    if (character >= '#' && character <= '&')
        return true;
    return character >= '*' && character != 127;
}
224
// Returns true when the text at |currentCharacter| can begin an identifier,
// any leading dash having been skipped by the caller: an ASCII letter, an
// underscore, a non-ASCII character, or a backslash followed by a character
// that is valid in an escape.
template <typename CharacterType>
static inline bool isIdentifierStartAfterDash(CharacterType* currentCharacter)
{
    const CharacterType first = currentCharacter[0];
    if (isASCIIAlpha(first) || first == '_' || first >= 128)
        return true;
    // The second character is only inspected when the first is a backslash.
    return first == '\\' && isCSSEscape(currentCharacter[1]);
}
231
232 template <typename CharacterType>
233 static inline bool isEqualToCSSIdentifier(CharacterType* cssString, const char* constantString)
234 {
235     // Compare an character memory data with a zero terminated string.
236     do {
237         // The input must be part of an identifier if constantChar or constString
238         // contains '-'. Otherwise toASCIILowerUnchecked('\r') would be equal to '-'.
239         ASSERT((*constantString >= 'a' && *constantString <= 'z') || *constantString == '-');
240         ASSERT(*constantString != '-' || isCSSLetter(*cssString));
241         if (toASCIILowerUnchecked(*cssString++) != (*constantString++))
242             return false;
243     } while (*constantString);
244     return true;
245 }
246
247 template <typename CharacterType>
248 static inline bool isEqualToCSSCaseSensitiveIdentifier(CharacterType* string, const char* constantString)
249 {
250     ASSERT(*constantString);
251
252     do {
253         if (*string++ != *constantString++)
254             return false;
255     } while (*constantString);
256     return true;
257 }
258
259 template <typename CharacterType>
260 static CharacterType* checkAndSkipEscape(CharacterType* currentCharacter)
261 {
262     // Returns with 0, if escape check is failed. Otherwise
263     // it returns with the following character.
264     ASSERT(*currentCharacter == '\\');
265
266     ++currentCharacter;
267     if (!isCSSEscape(*currentCharacter))
268         return 0;
269
270     if (isASCIIHexDigit(*currentCharacter)) {
271         int length = 6;
272
273         do {
274             ++currentCharacter;
275         } while (isASCIIHexDigit(*currentCharacter) && --length);
276
277         // Optional space after the escape sequence.
278         if (isHTMLSpace<CharacterType>(*currentCharacter))
279             ++currentCharacter;
280         return currentCharacter;
281     }
282     return currentCharacter + 1;
283 }
284
285 template <typename CharacterType>
286 static inline CharacterType* skipWhiteSpace(CharacterType* currentCharacter)
287 {
288     while (isHTMLSpace<CharacterType>(*currentCharacter))
289         ++currentCharacter;
290     return currentCharacter;
291 }
292
293 // Main CSS tokenizer functions.
294
295 template <>
296 inline LChar*& CSSTokenizer::currentCharacter<LChar>()
297 {
298     return m_currentCharacter8;
299 }
300
301 template <>
302 inline UChar*& CSSTokenizer::currentCharacter<UChar>()
303 {
304     return m_currentCharacter16;
305 }
306
307 UChar* CSSTokenizer::allocateStringBuffer16(size_t len)
308 {
309     // Allocates and returns a CSSTokenizer owned buffer for storing
310     // UTF-16 data. Used to get a suitable life span for UTF-16
311     // strings, identifiers and URIs created by the tokenizer.
312     OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);
313
314     UChar* bufferPtr = buffer.get();
315
316     m_cssStrings16.append(buffer.release());
317     return bufferPtr;
318 }
319
320 template <>
321 inline LChar* CSSTokenizer::dataStart<LChar>()
322 {
323     return m_dataStart8.get();
324 }
325
326 template <>
327 inline UChar* CSSTokenizer::dataStart<UChar>()
328 {
329     return m_dataStart16.get();
330 }
331
332 template <typename CharacterType>
333 inline CSSParserLocation CSSTokenizer::tokenLocation()
334 {
335     CSSParserLocation location;
336     location.token.init(tokenStart<CharacterType>(), currentCharacter<CharacterType>() - tokenStart<CharacterType>());
337     location.lineNumber = m_tokenStartLineNumber;
338     location.offset = tokenStart<CharacterType>() - dataStart<CharacterType>();
339     return location;
340 }
341
342 CSSParserLocation CSSTokenizer::currentLocation()
343 {
344     if (is8BitSource())
345         return tokenLocation<LChar>();
346     return tokenLocation<UChar>();
347 }
348
349 template <typename CharacterType>
350 inline bool CSSTokenizer::isIdentifierStart()
351 {
352     // Check whether an identifier is started.
353     return isIdentifierStartAfterDash((*currentCharacter<CharacterType>() != '-') ? currentCharacter<CharacterType>() : currentCharacter<CharacterType>() + 1);
354 }
355
// How checkAndSkipString() reacts to invalid content inside a string.
enum CheckStringValidationMode {
    // Stop and report failure.
    AbortIfInvalid,
    // Step over the offending character and keep scanning.
    SkipInvalid
};
360
361 template <typename CharacterType>
362 static inline CharacterType* checkAndSkipString(CharacterType* currentCharacter, int quote, CheckStringValidationMode mode)
363 {
364     // If mode is AbortIfInvalid and the string check fails it returns
365     // with 0. Otherwise it returns with a pointer to the first
366     // character after the string.
367     while (true) {
368         if (UNLIKELY(*currentCharacter == quote)) {
369             // String parsing is successful.
370             return currentCharacter + 1;
371         }
372         if (UNLIKELY(!*currentCharacter)) {
373             // String parsing is successful up to end of input.
374             return currentCharacter;
375         }
376         if (mode == AbortIfInvalid && UNLIKELY(*currentCharacter <= '\r' && (*currentCharacter == '\n' || (*currentCharacter | 0x1) == '\r'))) {
377             // String parsing is failed for character '\n', '\f' or '\r'.
378             return 0;
379         }
380
381         if (LIKELY(currentCharacter[0] != '\\')) {
382             ++currentCharacter;
383         } else if (currentCharacter[1] == '\n' || currentCharacter[1] == '\f') {
384             currentCharacter += 2;
385         } else if (currentCharacter[1] == '\r') {
386             currentCharacter += currentCharacter[2] == '\n' ? 3 : 2;
387         } else {
388             CharacterType* next = checkAndSkipEscape(currentCharacter);
389             if (!next) {
390                 if (mode == AbortIfInvalid)
391                     return 0;
392                 next = currentCharacter + 1;
393             }
394             currentCharacter = next;
395         }
396     }
397 }
398
399 template <typename CharacterType>
400 unsigned CSSTokenizer::parseEscape(CharacterType*& src)
401 {
402     ASSERT(*src == '\\' && isCSSEscape(src[1]));
403
404     unsigned unicode = 0;
405
406     ++src;
407     if (isASCIIHexDigit(*src)) {
408
409         int length = 6;
410
411         do {
412             unicode = (unicode << 4) + toASCIIHexValue(*src++);
413         } while (--length && isASCIIHexDigit(*src));
414
415         // Characters above 0x10ffff are not handled.
416         if (unicode > 0x10ffff)
417             unicode = 0xfffd;
418
419         // Optional space after the escape sequence.
420         if (isHTMLSpace<CharacterType>(*src))
421             ++src;
422
423         return unicode;
424     }
425
426     return *src++;
427 }
428
429 template <>
430 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode)
431 {
432     ASSERT(unicode <= 0xff);
433     *result = unicode;
434
435     ++result;
436 }
437
438 template <>
439 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode)
440 {
441     // Replace unicode with a surrogate pairs when it is bigger than 0xffff
442     if (U16_LENGTH(unicode) == 2) {
443         *result++ = U16_LEAD(unicode);
444         *result = U16_TRAIL(unicode);
445     } else {
446         *result = unicode;
447     }
448
449     ++result;
450 }
451
452 template <typename SrcCharacterType>
453 size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)
454 {
455     // The decoded form of an identifier (after resolving escape
456     // sequences) will not contain more characters (ASCII or UTF-16
457     // codepoints) than the input. This code can therefore ignore
458     // escape sequences completely.
459     SrcCharacterType* start = src;
460     do {
461         if (LIKELY(*src != '\\'))
462             src++;
463         else
464             parseEscape<SrcCharacterType>(src);
465     } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));
466
467     return src - start;
468 }
469
470 template <typename SrcCharacterType, typename DestCharacterType>
471 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)
472 {
473     hasEscape = false;
474     do {
475         if (LIKELY(*src != '\\')) {
476             *result++ = *src++;
477         } else {
478             hasEscape = true;
479             SrcCharacterType* savedEscapeStart = src;
480             unsigned unicode = parseEscape<SrcCharacterType>(src);
481             if (unicode > 0xff && sizeof(DestCharacterType) == 1) {
482                 src = savedEscapeStart;
483                 return false;
484             }
485             UnicodeToChars(result, unicode);
486         }
487     } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));
488
489     return true;
490 }
491
492 template <typename CharacterType>
493 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserString& resultString, bool& hasEscape)
494 {
495     // If a valid identifier start is found, we can safely
496     // parse the identifier until the next invalid character.
497     ASSERT(isIdentifierStart<CharacterType>());
498
499     CharacterType* start = currentCharacter<CharacterType>();
500     if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) {
501         // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
502         ASSERT(is8BitSource());
503         UChar* result16 = allocateStringBuffer16((result - start) + peekMaxIdentifierLen(currentCharacter<CharacterType>()));
504         UChar* start16 = result16;
505         int i = 0;
506         for (; i < result - start; i++)
507             result16[i] = start[i];
508
509         result16 += i;
510
511         parseIdentifierInternal(currentCharacter<CharacterType>(), result16, hasEscape);
512
513         resultString.init(start16, result16 - start16);
514
515         return;
516     }
517
518     resultString.init(start, result - start);
519 }
520
521 template <typename SrcCharacterType>
522 size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)
523 {
524     // The decoded form of a CSS string (after resolving escape
525     // sequences) will not contain more characters (ASCII or UTF-16
526     // codepoints) than the input. This code can therefore ignore
527     // escape sequences completely and just return the length of the
528     // input string (possibly including terminating quote if any).
529     SrcCharacterType* end = checkAndSkipString(src, quote, SkipInvalid);
530     return end ? end - src : 0;
531 }
532
533 template <typename SrcCharacterType, typename DestCharacterType>
534 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)
535 {
536     while (true) {
537         if (UNLIKELY(*src == quote)) {
538             // String parsing is done.
539             ++src;
540             return true;
541         }
542         if (UNLIKELY(!*src)) {
543             // String parsing is done, but don't advance pointer if at the end of input.
544             return true;
545         }
546         if (LIKELY(src[0] != '\\')) {
547             *result++ = *src++;
548         } else if (src[1] == '\n' || src[1] == '\f') {
549             src += 2;
550         } else if (src[1] == '\r') {
551             src += src[2] == '\n' ? 3 : 2;
552         } else {
553             SrcCharacterType* savedEscapeStart = src;
554             unsigned unicode = parseEscape<SrcCharacterType>(src);
555             if (unicode > 0xff && sizeof(DestCharacterType) == 1) {
556                 src = savedEscapeStart;
557                 return false;
558             }
559             UnicodeToChars(result, unicode);
560         }
561     }
562
563     return true;
564 }
565
566 template <typename CharacterType>
567 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& resultString, UChar quote)
568 {
569     CharacterType* start = currentCharacter<CharacterType>();
570
571     if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {
572         // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
573         ASSERT(is8BitSource());
574         UChar* result16 = allocateStringBuffer16((result - start) + peekMaxStringLen(currentCharacter<CharacterType>(), quote));
575         UChar* start16 = result16;
576         int i = 0;
577         for (; i < result - start; i++)
578             result16[i] = start[i];
579
580         result16 += i;
581
582         parseStringInternal(currentCharacter<CharacterType>(), result16, quote);
583
584         resultString.init(start16, result16 - start16);
585         return;
586     }
587
588     resultString.init(start, result - start);
589 }
590
591 template <typename CharacterType>
592 inline bool CSSTokenizer::findURI(CharacterType*& start, CharacterType*& end, UChar& quote)
593 {
594     start = skipWhiteSpace(currentCharacter<CharacterType>());
595
596     if (*start == '"' || *start == '\'') {
597         quote = *start++;
598         end = checkAndSkipString(start, quote, AbortIfInvalid);
599         if (!end)
600             return false;
601     } else {
602         quote = 0;
603         end = start;
604         while (isURILetter(*end)) {
605             if (LIKELY(*end != '\\')) {
606                 ++end;
607             } else {
608                 end = checkAndSkipEscape(end);
609                 if (!end)
610                     return false;
611             }
612         }
613     }
614
615     end = skipWhiteSpace(end);
616     if (*end != ')')
617         return false;
618
619     return true;
620 }
621
622 template <typename SrcCharacterType>
623 inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)
624 {
625     // The decoded form of a URI (after resolving escape sequences)
626     // will not contain more characters (ASCII or UTF-16 codepoints)
627     // than the input. This code can therefore ignore escape sequences
628     // completely.
629     SrcCharacterType* start = src;
630     if (quote) {
631         ASSERT(quote == '"' || quote == '\'');
632         return peekMaxStringLen(src, quote);
633     }
634
635     while (isURILetter(*src)) {
636         if (LIKELY(*src != '\\'))
637             src++;
638         else
639             parseEscape<SrcCharacterType>(src);
640     }
641
642     return src - start;
643 }
644
645 template <typename SrcCharacterType, typename DestCharacterType>
646 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)
647 {
648     if (quote) {
649         ASSERT(quote == '"' || quote == '\'');
650         return parseStringInternal(src, dest, quote);
651     }
652
653     while (isURILetter(*src)) {
654         if (LIKELY(*src != '\\')) {
655             *dest++ = *src++;
656         } else {
657             unsigned unicode = parseEscape<SrcCharacterType>(src);
658             if (unicode > 0xff && sizeof(DestCharacterType) == 1)
659                 return false;
660             UnicodeToChars(dest, unicode);
661         }
662     }
663
664     return true;
665 }
666
667 template <typename CharacterType>
668 inline void CSSTokenizer::parseURI(CSSParserString& string)
669 {
670     CharacterType* uriStart;
671     CharacterType* uriEnd;
672     UChar quote;
673     if (!findURI(uriStart, uriEnd, quote))
674         return;
675
676     CharacterType* dest = currentCharacter<CharacterType>() = uriStart;
677     if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote))) {
678         string.init(uriStart, dest - uriStart);
679     } else {
680         // An escape sequence was encountered that can't be stored in 8 bits.
681         // Reset the current character to the start of the URI and re-parse with
682         // a 16-bit destination.
683         ASSERT(is8BitSource());
684         currentCharacter<CharacterType>() = uriStart;
685         UChar* result16 = allocateStringBuffer16(peekMaxURILen(currentCharacter<CharacterType>(), quote));
686         UChar* uriStart16 = result16;
687         bool result = parseURIInternal(currentCharacter<CharacterType>(), result16, quote);
688         ASSERT_UNUSED(result, result);
689         string.init(uriStart16, result16 - uriStart16);
690     }
691
692     currentCharacter<CharacterType>() = uriEnd + 1;
693     m_token = URI;
694 }
695
696 template <typename CharacterType>
697 inline bool CSSTokenizer::parseUnicodeRange()
698 {
699     CharacterType* character = currentCharacter<CharacterType>() + 1;
700     int length = 6;
701     ASSERT(*currentCharacter<CharacterType>() == '+');
702
703     while (isASCIIHexDigit(*character) && length) {
704         ++character;
705         --length;
706     }
707
708     if (length && *character == '?') {
709         // At most 5 hex digit followed by a question mark.
710         do {
711             ++character;
712             --length;
713         } while (*character == '?' && length);
714         currentCharacter<CharacterType>() = character;
715         return true;
716     }
717
718     if (length < 6) {
719         // At least one hex digit.
720         if (character[0] == '-' && isASCIIHexDigit(character[1])) {
721             // Followed by a dash and a hex digit.
722             ++character;
723             length = 6;
724             do {
725                 ++character;
726             } while (--length && isASCIIHexDigit(*character));
727         }
728         currentCharacter<CharacterType>() = character;
729         return true;
730     }
731     return false;
732 }
733
734 template <typename CharacterType>
735 bool CSSTokenizer::parseNthChild()
736 {
737     CharacterType* character = currentCharacter<CharacterType>();
738
739     while (isASCIIDigit(*character))
740         ++character;
741     if (isASCIIAlphaCaselessEqual(*character, 'n')) {
742         currentCharacter<CharacterType>() = character + 1;
743         return true;
744     }
745     return false;
746 }
747
748 template <typename CharacterType>
749 bool CSSTokenizer::parseNthChildExtra()
750 {
751     CharacterType* character = skipWhiteSpace(currentCharacter<CharacterType>());
752     if (*character != '+' && *character != '-')
753         return false;
754
755     character = skipWhiteSpace(character + 1);
756     if (!isASCIIDigit(*character))
757         return false;
758
759     do {
760         ++character;
761     } while (isASCIIDigit(*character));
762
763     currentCharacter<CharacterType>() = character;
764     return true;
765 }
766
// Checks whether the |length| characters starting at the token start name one
// of the functions the tokenizer treats specially.
//  - "not", "url", "cue", "calc", "host", "host-context": m_token is set to
//    the matching token value and true is returned.
//  - "nth-child", "nth-of-type", "nth-last-child", "nth-last-of-type":
//    m_parsingMode is switched to NthChildMode (m_token is not assigned here)
//    and true is returned.
//  - anything else: returns false.
// |length| must be positive.
// NOTE(review): SWITCH/CASE are not defined in the visible part of this file;
// presumably a build step expands them into a comparison of |name| against
// each literal — confirm before editing the CASE list or its layout.
767 template <typename CharacterType>
768 inline bool CSSTokenizer::detectFunctionTypeToken(int length)
769 {
770     ASSERT(length > 0);
771     CharacterType* name = tokenStart<CharacterType>();
772     SWITCH(name, length) {
773         CASE("not") {
774             m_token = NOTFUNCTION;
775             return true;
776         }
777         CASE("url") {
778             m_token = URI;
779             return true;
780         }
781         CASE("cue") {
782             m_token = CUEFUNCTION;
783             return true;
784         }
785         CASE("calc") {
786             m_token = CALCFUNCTION;
787             return true;
788         }
789         CASE("host") {
790             m_token = HOSTFUNCTION;
791             return true;
792         }
793         CASE("host-context") {
794             m_token = HOSTCONTEXTFUNCTION;
795             return true;
796         }
797         CASE("nth-child") {
798             m_parsingMode = NthChildMode;
799             return true;
800         }
801         CASE("nth-of-type") {
802             m_parsingMode = NthChildMode;
803             return true;
804         }
805         CASE("nth-last-child") {
806             m_parsingMode = NthChildMode;
807             return true;
808         }
809         CASE("nth-last-of-type") {
810             m_parsingMode = NthChildMode;
811             return true;
812         }
813     }
814     return false;
815 }
816
// Maps the |length| characters starting at the token start to a media query
// keyword token: "and" -> MEDIA_AND, "not" -> MEDIA_NOT, "only" -> MEDIA_ONLY,
// "or" -> MEDIA_OR. Must only be called while m_parsingMode is MediaQueryMode.
// No assignment to m_token is made outside the four CASE bodies.
// NOTE(review): SWITCH/CASE are not defined in the visible part of this file;
// presumably a build step expands them into a comparison of |name| against
// each literal — confirm before editing the CASE list or its layout.
817 template <typename CharacterType>
818 inline void CSSTokenizer::detectMediaQueryToken(int length)
819 {
820     ASSERT(m_parsingMode == MediaQueryMode);
821     CharacterType* name = tokenStart<CharacterType>();
822
823     SWITCH(name, length) {
824         CASE("and") {
825             m_token = MEDIA_AND;
826         }
827         CASE("not") {
828             m_token = MEDIA_NOT;
829         }
830         CASE("only") {
831             m_token = MEDIA_ONLY;
832         }
833         CASE("or") {
834             m_token = MEDIA_OR;
835         }
836     }
837 }
838
// Maps the unit suffix made of the |length| characters at |type| to the
// matching dimension token and stores it in m_token (for example "px" -> PXS,
// "deg" -> DEGS, "s" -> SECS). No assignment to m_token is made outside the
// CASE bodies. |length| must be positive.
// NOTE(review): SWITCH/CASE are not defined in the visible part of this file;
// presumably a build step expands them into a comparison of |type| against
// each literal — confirm before editing the CASE list or its layout.
839 template <typename CharacterType>
840 inline void CSSTokenizer::detectNumberToken(CharacterType* type, int length)
841 {
842     ASSERT(length > 0);
843
844     SWITCH(type, length) {
845         CASE("cm") {
846             m_token = CMS;
847         }
848         CASE("ch") {
849             m_token = CHS;
850         }
851         CASE("deg") {
852             m_token = DEGS;
853         }
854         CASE("dppx") {
855             // There is a discussion about the name of this unit on www-style:
856             // NOTE(review): this comment used to ask for a compile time guard to be kept here, but no guard surrounds this case in the code as written.
857             // http://lists.w3.org/Archives/Public/www-style/2012May/0915.html
858             m_token = DPPX;
859         }
860         CASE("dpcm") {
861             m_token = DPCM;
862         }
863         CASE("dpi") {
864             m_token = DPI;
865         }
866         CASE("em") {
867             m_token = EMS;
868         }
869         CASE("ex") {
870             m_token = EXS;
871         }
872         CASE("fr") {
873             m_token = FR;
874         }
875         CASE("grad") {
876             m_token = GRADS;
877         }
878         CASE("hz") {
879             m_token = HERTZ;
880         }
881         CASE("in") {
882             m_token = INS;
883         }
884         CASE("khz") {
885             m_token = KHERTZ;
886         }
887         CASE("mm") {
888             m_token = MMS;
889         }
890         CASE("ms") {
891             m_token = MSECS;
892         }
893         CASE("px") {
894             m_token = PXS;
895         }
896         CASE("pt") {
897             m_token = PTS;
898         }
899         CASE("pc") {
900             m_token = PCS;
901         }
902         CASE("rad") {
903             m_token = RADS;
904         }
905         CASE("rem") {
906             m_token = REMS;
907         }
908         CASE("s") {
909             m_token = SECS;
910         }
911         CASE("turn") {
912             m_token = TURNS;
913         }
914         CASE("vw") {
915             m_token = VW;
916         }
917         CASE("vh") {
918             m_token = VH;
919         }
920         CASE("vmin") {
921             m_token = VMIN;
922         }
923         CASE("vmax") {
924             m_token = VMAX;
925         }
926         CASE("__qem") {
927             m_token = QEMS;
928         }
929     }
930 }
931
// Refines the token of a function whose name starts with a dash.
//
// `length` is the length of the name measured from the current token start,
// including the leading '-'. The dash is skipped and the remainder is
// matched against the vendor-prefixed function names below; on a match
// m_token is set to the dedicated function token. Otherwise m_token is not
// touched (realLex sets it to FUNCTION before calling this function).
932 template <typename CharacterType>
933 inline void CSSTokenizer::detectDashToken(int length)
934 {
935     CharacterType* name = tokenStart<CharacterType>();
936
937     // Ignore leading dash.
938     ++name;
939     --length;
940
941     SWITCH(name, length) {
942         CASE("webkit-any") {
943             m_token = ANYFUNCTION;
944         }
945         CASE("webkit-calc") {
946             m_token = CALCFUNCTION;
947         }
948     }
949 }
950
// Refines an at-keyword into the token of a known at-rule.
//
// `length` is the length of the keyword including the leading '@' (asserted
// to be at least 2, with '@' as the first character at the token start).
// `hasEscape` tells whether the identifier was written with escapes.
//
// On a match m_token is set to the rule-specific *_SYM token, subject to the
// per-case conditions below:
//  - the page margin-box rules (top-left, bottom-center, ...) and most
//    "-internal-" rules are only recognized when !hasEscape;
//  - all "-internal-" rules additionally require m_internal;
//  - "charset" is only recognized when the '@' is the very first character
//    of the buffer (name - 1 == dataStart);
//  - "keyframes" requires RuntimeEnabledFeatures::cssAnimationUnprefixedEnabled().
// "import" and "media" also switch m_parsingMode to MediaQueryMode, while
// "supports" and "-internal-supports-condition" switch it to SupportsMode.
// When no case applies m_token is left as set by the caller (realLex stores
// ATKEYWORD before calling this function).
951 template <typename CharacterType>
952 inline void CSSTokenizer::detectAtToken(int length, bool hasEscape)
953 {
954     CharacterType* name = tokenStart<CharacterType>();
955     ASSERT(name[0] == '@' && length >= 2);
956
957     // Ignore leading @.
958     ++name;
959     --length;
960
961     // charset, font-face, import, media, namespace, page, supports,
962     // -webkit-keyframes, keyframes, and -webkit-mediaquery are not affected by hasEscape.
     // NOTE(review): this list does not match the cases below exactly. There
     // is no "-webkit-mediaquery" case, and "viewport",
     // "-internal-keyframe-key-list" and "-internal-supports-condition" also
     // ignore hasEscape. Confirm whether the last two should test !hasEscape
     // like the other -internal- rules.
963     SWITCH(name, length) {
964         CASE("bottom-left") {
965             if (LIKELY(!hasEscape))
966                 m_token = BOTTOMLEFT_SYM;
967         }
968         CASE("bottom-right") {
969             if (LIKELY(!hasEscape))
970                 m_token = BOTTOMRIGHT_SYM;
971         }
972         CASE("bottom-center") {
973             if (LIKELY(!hasEscape))
974                 m_token = BOTTOMCENTER_SYM;
975         }
976         CASE("bottom-left-corner") {
977             if (LIKELY(!hasEscape))
978                 m_token = BOTTOMLEFTCORNER_SYM;
979         }
980         CASE("bottom-right-corner") {
981             if (LIKELY(!hasEscape))
982                 m_token = BOTTOMRIGHTCORNER_SYM;
983         }
984         CASE("charset") {
985             if (name - 1 == dataStart<CharacterType>())
986                 m_token = CHARSET_SYM;
987         }
988         CASE("font-face") {
989             m_token = FONT_FACE_SYM;
990         }
991         CASE("import") {
992             m_parsingMode = MediaQueryMode;
993             m_token = IMPORT_SYM;
994         }
995         CASE("keyframes") {
996             if (RuntimeEnabledFeatures::cssAnimationUnprefixedEnabled())
997                 m_token = KEYFRAMES_SYM;
998         }
999         CASE("left-top") {
1000             if (LIKELY(!hasEscape))
1001                 m_token = LEFTTOP_SYM;
1002         }
1003         CASE("left-middle") {
1004             if (LIKELY(!hasEscape))
1005                 m_token = LEFTMIDDLE_SYM;
1006         }
1007         CASE("left-bottom") {
1008             if (LIKELY(!hasEscape))
1009                 m_token = LEFTBOTTOM_SYM;
1010         }
1011         CASE("media") {
1012             m_parsingMode = MediaQueryMode;
1013             m_token = MEDIA_SYM;
1014         }
1015         CASE("namespace") {
1016             m_token = NAMESPACE_SYM;
1017         }
1018         CASE("page") {
1019             m_token = PAGE_SYM;
1020         }
1021         CASE("right-top") {
1022             if (LIKELY(!hasEscape))
1023                 m_token = RIGHTTOP_SYM;
1024         }
1025         CASE("right-middle") {
1026             if (LIKELY(!hasEscape))
1027                 m_token = RIGHTMIDDLE_SYM;
1028         }
1029         CASE("right-bottom") {
1030             if (LIKELY(!hasEscape))
1031                 m_token = RIGHTBOTTOM_SYM;
1032         }
1033         CASE("supports") {
1034             m_parsingMode = SupportsMode;
1035             m_token = SUPPORTS_SYM;
1036         }
1037         CASE("top-left") {
1038             if (LIKELY(!hasEscape))
1039                 m_token = TOPLEFT_SYM;
1040         }
1041         CASE("top-right") {
1042             if (LIKELY(!hasEscape))
1043                 m_token = TOPRIGHT_SYM;
1044         }
1045         CASE("top-center") {
1046             if (LIKELY(!hasEscape))
1047                 m_token = TOPCENTER_SYM;
1048         }
1049         CASE("top-left-corner") {
1050             if (LIKELY(!hasEscape))
1051                 m_token = TOPLEFTCORNER_SYM;
1052         }
1053         CASE("top-right-corner") {
1054             if (LIKELY(!hasEscape))
1055                 m_token = TOPRIGHTCORNER_SYM;
1056         }
1057         CASE("viewport") {
1058             m_token = VIEWPORT_RULE_SYM;
1059         }
1060         CASE("-internal-rule") {
1061             if (LIKELY(!hasEscape && m_internal))
1062                 m_token = INTERNAL_RULE_SYM;
1063         }
1064         CASE("-internal-decls") {
1065             if (LIKELY(!hasEscape && m_internal))
1066                 m_token = INTERNAL_DECLS_SYM;
1067         }
1068         CASE("-internal-value") {
1069             if (LIKELY(!hasEscape && m_internal))
1070                 m_token = INTERNAL_VALUE_SYM;
1071         }
1072         CASE("-webkit-keyframes") {
1073             m_token = WEBKIT_KEYFRAMES_SYM;
1074         }
1075         CASE("-internal-selector") {
1076             if (LIKELY(!hasEscape && m_internal))
1077                 m_token = INTERNAL_SELECTOR_SYM;
1078         }
1079         CASE("-internal-keyframe-rule") {
1080             if (LIKELY(!hasEscape && m_internal))
1081                 m_token = INTERNAL_KEYFRAME_RULE_SYM;
1082         }
1083         CASE("-internal-keyframe-key-list") {
1084             if (!m_internal)
1085                 return;
1086             m_token = INTERNAL_KEYFRAME_KEY_LIST_SYM;
1087         }
1088         CASE("-internal-supports-condition") {
1089             if (!m_internal)
1090                 return;
1091             m_parsingMode = SupportsMode;
1092             m_token = INTERNAL_SUPPORTS_CONDITION_SYM;
1093         }
1094     }
1095 }
1096
// Refines an identifier token into an @supports condition keyword token.
//
// Matches the `length` characters at the current token start against "or",
// "and" and "not" and stores the corresponding SUPPORTS_* value in m_token.
// m_token is not modified when nothing matches; realLex relies on that by
// checking whether m_token is still IDENT after the call.
//
// Must only be called while m_parsingMode == SupportsMode (asserted).
1097 template <typename CharacterType>
1098 inline void CSSTokenizer::detectSupportsToken(int length)
1099 {
1100     ASSERT(m_parsingMode == SupportsMode);
1101     CharacterType* name = tokenStart<CharacterType>();
1102
1103     SWITCH(name, length) {
1104         CASE("or") {
1105             m_token = SUPPORTS_OR;
1106         }
1107         CASE("and") {
1108             m_token = SUPPORTS_AND;
1109         }
1110         CASE("not") {
1111             m_token = SUPPORTS_NOT;
1112         }
1113     }
1114 }
1115
// Scans the next token from the input buffer.
//
// SrcCharacterType is the character type of the buffer being lexed;
// setupTokenizer installs the LChar or the UChar instantiation as m_lexFunc.
//
// yylvalWithoutType is cast to YYSTYPE*. Depending on the token, its
// `string` or `number` member is filled in with the token's payload.
//
// Returns the token id, which is also left in m_token. The token initially
// is the first character itself; the switch below dispatches on the class of
// that character (typesOfASCIICharacters for code points <= 127, everything
// else is handled as an identifier start) and replaces m_token with a
// grammar token where one is recognized. Comments produce no token: after
// skipping one, scanning restarts at restartAfterComment.
//
// Side effects visible here: advances the current-character pointer, updates
// the token start, m_tokenStartLineNumber and m_lineNumber, may change
// m_parsingMode, and reports comment ranges / unterminated comments through
// m_parser.
1116 template <typename SrcCharacterType>
1117 int CSSTokenizer::realLex(void* yylvalWithoutType)
1118 {
1119     YYSTYPE* yylval = static_cast<YYSTYPE*>(yylvalWithoutType);
1120     // Write pointer for the next character.
1121     SrcCharacterType* result;
1122     CSSParserString resultString;
1123     bool hasEscape;
1124
1125     // The input buffer is terminated by a \0 character, so
1126     // it is safe to read one character ahead of a known non-null.
1127 #if ENABLE(ASSERT)
1128     // In debug we check with an ASSERT that the length is > 0 for string types.
1129     yylval->string.clear();
1130 #endif
1131
1132 restartAfterComment:
1133     result = currentCharacter<SrcCharacterType>();
1134     setTokenStart(result);
1135     m_tokenStartLineNumber = m_lineNumber;
         // Tentatively use the first character as the token and step past it;
         // the cases below refine m_token and the current position.
1136     m_token = *currentCharacter<SrcCharacterType>();
1137     ++currentCharacter<SrcCharacterType>();
1138
1139     switch ((m_token <= 127) ? typesOfASCIICharacters[m_token] : CharacterIdentifierStart) {
1140     case CharacterCaselessU:
             // 'u' or 'U' followed by '+' may start a unicode-range; otherwise
             // it is an ordinary identifier.
1141         if (UNLIKELY(*currentCharacter<SrcCharacterType>() == '+')) {
1142             if (parseUnicodeRange<SrcCharacterType>()) {
1143                 m_token = UNICODERANGE;
1144                 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1145                 break;
1146             }
1147         }
1148         // Fall through to CharacterIdentifierStart.
1149
1150     case CharacterIdentifierStart:
             // Step back so that parseIdentifier sees the first character too.
1151         --currentCharacter<SrcCharacterType>();
1152         parseIdentifier(result, yylval->string, hasEscape);
1153         m_token = IDENT;
1154
1155         if (UNLIKELY(*currentCharacter<SrcCharacterType>() == '(')) {
                 // In @supports conditions "and(", "or(" and "not(" are keywords,
                 // not functions; the '(' is then left for the next token.
1156             if (m_parsingMode == SupportsMode && !hasEscape) {
1157                 detectSupportsToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
1158                 if (m_token != IDENT)
1159                     break;
1160             }
1161
1162             m_token = FUNCTION;
1163             if (!hasEscape)
1164                 detectFunctionTypeToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
1165
1166             // Skip parenthesis
1167             ++currentCharacter<SrcCharacterType>();
1168             ++result;
1169
1170             if (m_token == URI) {
                     // Default to a plain FUNCTION before calling parseURI.
1171                 m_token = FUNCTION;
1172                 // Check whether it is really an URI.
1173                 if (yylval->string.is8Bit())
1174                     parseURI<LChar>(yylval->string);
1175                 else
1176                     parseURI<UChar>(yylval->string);
1177             }
1178         } else if (UNLIKELY(m_parsingMode != NormalMode) && !hasEscape) {
1179             if (m_parsingMode == MediaQueryMode) {
1180                 detectMediaQueryToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
1181             } else if (m_parsingMode == SupportsMode) {
1182                 detectSupportsToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
1183             } else if (m_parsingMode == NthChildMode && isASCIIAlphaCaselessEqual(tokenStart<SrcCharacterType>()[0], 'n')) {
1184                 if (result - tokenStart<SrcCharacterType>() == 1) {
1185                     // String "n" is IDENT but "n+1" is NTH.
1186                     if (parseNthChildExtra<SrcCharacterType>()) {
1187                         m_token = NTH;
                             // NOTE(review): m_length is written directly here while
                             // the sibling branch below calls setLength();
                             // presumably equivalent - confirm.
1188                         yylval->string.m_length = currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>();
1189                     }
1190                 } else if (result - tokenStart<SrcCharacterType>() >= 2 && tokenStart<SrcCharacterType>()[1] == '-') {
1191                     // String "n-" is IDENT but "n-1" is NTH.
1192                     // Set currentCharacter to '-' to continue parsing.
1193                     SrcCharacterType* nextCharacter = result;
1194                     currentCharacter<SrcCharacterType>() = tokenStart<SrcCharacterType>() + 1;
1195                     if (parseNthChildExtra<SrcCharacterType>()) {
1196                         m_token = NTH;
1197                         yylval->string.setLength(currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1198                     } else {
1199                         // Revert the change to currentCharacter if unsuccessful.
1200                         currentCharacter<SrcCharacterType>() = nextCharacter;
1201                     }
1202                 }
1203             }
1204         }
1205         break;
1206
1207     case CharacterDot:
             // A lone '.' stays a single-character token; ".5" is a number.
1208         if (!isASCIIDigit(currentCharacter<SrcCharacterType>()[0]))
1209             break;
1210         // Fall through to CharacterNumber.
1211
1212     case CharacterNumber: {
1213         bool dotSeen = (m_token == '.');
1214
             // Consume the digits, with at most one '.' that is followed by a digit.
1215         while (true) {
1216             if (!isASCIIDigit(currentCharacter<SrcCharacterType>()[0])) {
1217                 // Only one dot is allowed for a number,
1218                 // and it must be followed by a digit.
1219                 if (currentCharacter<SrcCharacterType>()[0] != '.' || dotSeen || !isASCIIDigit(currentCharacter<SrcCharacterType>()[1]))
1220                     break;
1221                 dotSeen = true;
1222             }
1223             ++currentCharacter<SrcCharacterType>();
1224         }
1225
1226         if (UNLIKELY(m_parsingMode == NthChildMode) && !dotSeen && isASCIIAlphaCaselessEqual(*currentCharacter<SrcCharacterType>(), 'n')) {
1227             // "[0-9]+n" is always an NthChild.
1228             ++currentCharacter<SrcCharacterType>();
                 // The return value is ignored: the token is NTH either way.
1229             parseNthChildExtra<SrcCharacterType>();
1230             m_token = NTH;
1231             yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1232             break;
1233         }
1234
1235         // We need to take care of units like 'em' or 'ex'.
             // An 'e' only starts an exponent when followed by a sign or a digit;
             // otherwise the position is left untouched and the 'e' is treated as
             // the beginning of a unit below.
1236         SrcCharacterType* character = currentCharacter<SrcCharacterType>();
1237         if (isASCIIAlphaCaselessEqual(*character, 'e')) {
1238             ASSERT(character - tokenStart<SrcCharacterType>() > 0);
1239             ++character;
                 // NOTE(review): a sign is consumed here even if no digit follows
                 // it (the digit loop may then match nothing, e.g. "1e-").
                 // Confirm this is intended.
1240             if (*character == '-' || *character == '+' || isASCIIDigit(*character)) {
1241                 ++character;
1242                 while (isASCIIDigit(*character))
1243                     ++character;
1244                 // Use FLOATTOKEN if the string contains exponents.
1245                 dotSeen = true;
1246                 currentCharacter<SrcCharacterType>() = character;
1247             }
1248         }
1249
1250         yylval->number = charactersToDouble(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1251
1252         // Type of the unit: an identifier directly following the number.
1253         if (isIdentifierStart<SrcCharacterType>()) {
1254             SrcCharacterType* type = currentCharacter<SrcCharacterType>();
1255             result = currentCharacter<SrcCharacterType>();
1256
1257             parseIdentifier(result, resultString, hasEscape);
1258
                 // DIMEN unless detectNumberToken recognizes a known unit.
1259             m_token = DIMEN;
1260             if (!hasEscape)
1261                 detectNumberToken(type, currentCharacter<SrcCharacterType>() - type);
1262
1263             if (m_token == DIMEN) {
1264                 // The decoded number is overwritten, but this is intentional.
1265                 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1266             }
1267         } else if (*currentCharacter<SrcCharacterType>() == '%') {
1268             // Although the CSS grammar says {num}% we follow
1269             // webkit at the moment which uses {num}%+.
1270             do {
1271                 ++currentCharacter<SrcCharacterType>();
1272             } while (*currentCharacter<SrcCharacterType>() == '%');
1273             m_token = PERCENTAGE;
1274         } else {
1275             m_token = dotSeen ? FLOATTOKEN : INTEGER;
1276         }
1277         break;
1278     }
1279
1280     case CharacterDash:
             // '-' can start an identifier or function ("-webkit-..."), the "-->"
             // token, or (in NthChildMode) a negative an+b expression. Otherwise
             // it stays a single-character token.
1281         if (isIdentifierStartAfterDash(currentCharacter<SrcCharacterType>())) {
1282             --currentCharacter<SrcCharacterType>();
1283             parseIdentifier(result, resultString, hasEscape);
1284             m_token = IDENT;
1285
1286             if (*currentCharacter<SrcCharacterType>() == '(') {
1287                 m_token = FUNCTION;
1288                 if (!hasEscape)
1289                     detectDashToken<SrcCharacterType>(result - tokenStart<SrcCharacterType>());
                     // Skip the parenthesis.
1290                 ++currentCharacter<SrcCharacterType>();
1291                 ++result;
1292             } else if (UNLIKELY(m_parsingMode == NthChildMode) && !hasEscape && isASCIIAlphaCaselessEqual(tokenStart<SrcCharacterType>()[1], 'n')) {
1293                 if (result - tokenStart<SrcCharacterType>() == 2) {
1294                     // String "-n" is IDENT but "-n+1" is NTH.
1295                     if (parseNthChildExtra<SrcCharacterType>()) {
1296                         m_token = NTH;
1297                         result = currentCharacter<SrcCharacterType>();
1298                     }
1299                 } else if (result - tokenStart<SrcCharacterType>() >= 3 && tokenStart<SrcCharacterType>()[2] == '-') {
1300                     // String "-n-" is IDENT but "-n-1" is NTH.
1301                     // Set currentCharacter to second '-' of '-n-' to continue parsing.
1302                     SrcCharacterType* nextCharacter = result;
1303                     currentCharacter<SrcCharacterType>() = tokenStart<SrcCharacterType>() + 2;
1304                     if (parseNthChildExtra<SrcCharacterType>()) {
1305                         m_token = NTH;
1306                         result = currentCharacter<SrcCharacterType>();
1307                     } else {
1308                         // Revert the change to currentCharacter if unsuccessful.
1309                         currentCharacter<SrcCharacterType>() = nextCharacter;
1310                     }
1311                 }
                     // result now marks the end of the token (extended for NTH).
1312                 resultString.setLength(result - tokenStart<SrcCharacterType>());
1313             }
1314             yylval->string = resultString;
1315         } else if (currentCharacter<SrcCharacterType>()[0] == '-' && currentCharacter<SrcCharacterType>()[1] == '>') {
                 // "-->"
1316             currentCharacter<SrcCharacterType>() += 2;
1317             m_token = SGML_CD;
1318         } else if (UNLIKELY(m_parsingMode == NthChildMode)) {
1319             // "-[0-9]+n" is always an NthChild.
1320             if (parseNthChild<SrcCharacterType>()) {
1321                 parseNthChildExtra<SrcCharacterType>();
1322                 m_token = NTH;
1323                 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1324             }
1325         }
1326         break;
1327
1328     case CharacterOther:
1329         // m_token is simply the current character.
1330         break;
1331
1332     case CharacterNull:
1333         // Do not advance pointer at the end of input.
1334         --currentCharacter<SrcCharacterType>();
1335         break;
1336
1337     case CharacterWhiteSpace:
1338         m_token = WHITESPACE;
1339         // Might start with a '\n'.
1340         --currentCharacter<SrcCharacterType>();
             // Collapse the whole run of white space into one token, counting
             // line feeds on the way.
1341         do {
1342             if (*currentCharacter<SrcCharacterType>() == '\n')
1343                 ++m_lineNumber;
1344             ++currentCharacter<SrcCharacterType>();
1345         } while (*currentCharacter<SrcCharacterType>() <= ' ' && (typesOfASCIICharacters[*currentCharacter<SrcCharacterType>()] == CharacterWhiteSpace));
1346         break;
1347
1348     case CharacterEndMediaQueryOrSupports:
             // Leaves media-query / supports mode; m_token stays the character.
1349         if (m_parsingMode == MediaQueryMode || m_parsingMode == SupportsMode)
1350             m_parsingMode = NormalMode;
1351         break;
1352
1353     case CharacterEndNthChild:
             // Leaves nth-child mode; m_token stays the character.
1354         if (m_parsingMode == NthChildMode)
1355             m_parsingMode = NormalMode;
1356         break;
1357
1358     case CharacterQuote:
             // m_token still holds the opening quote character at this point and
             // is passed as the quote for both helper calls. If the check fails,
             // the quote is returned as a single-character token.
1359         if (checkAndSkipString(currentCharacter<SrcCharacterType>(), m_token, AbortIfInvalid)) {
1360             ++result;
1361             parseString<SrcCharacterType>(result, yylval->string, m_token);
1362             m_token = STRING;
1363         }
1364         break;
1365
1366     case CharacterExclamationMark: {
             // '!' followed by optional white space and "important".
             // NOTE(review): white space skipped here does not pass through the
             // m_lineNumber accounting of the CharacterWhiteSpace case; whether
             // skipWhiteSpace counts newlines is not visible here - confirm.
1367         SrcCharacterType* start = skipWhiteSpace(currentCharacter<SrcCharacterType>());
1368         if (isEqualToCSSIdentifier(start, "important")) {
1369             m_token = IMPORTANT_SYM;
                 // 9 is the length of "important".
1370             currentCharacter<SrcCharacterType>() = start + 9;
1371         }
1372         break;
1373     }
1374
1375     case CharacterHashmark: {
             // '#' followed by a digit is always HEX; followed by an identifier
             // it is IDSEL, or HEX if every character is a hex digit and no
             // escape was used. Otherwise '#' stays a single-character token.
1376         SrcCharacterType* start = currentCharacter<SrcCharacterType>();
1377         result = currentCharacter<SrcCharacterType>();
1378
1379         if (isASCIIDigit(*currentCharacter<SrcCharacterType>())) {
1380             // This must be a valid hex number token.
1381             do {
1382                 ++currentCharacter<SrcCharacterType>();
1383             } while (isASCIIHexDigit(*currentCharacter<SrcCharacterType>()));
1384             m_token = HEX;
1385             yylval->string.init(start, currentCharacter<SrcCharacterType>() - start);
1386         } else if (isIdentifierStart<SrcCharacterType>()) {
1387             m_token = IDSEL;
1388             parseIdentifier(result, yylval->string, hasEscape);
1389             if (!hasEscape) {
1390                 // Check whether the identifier is also a valid hex number.
1391                 SrcCharacterType* current = start;
1392                 m_token = HEX;
1393                 do {
1394                     if (!isASCIIHexDigit(*current)) {
1395                         m_token = IDSEL;
1396                         break;
1397                     }
1398                     ++current;
1399                 } while (current < result);
1400             }
1401         }
1402         break;
1403     }
1404
1405     case CharacterSlash:
1406         // Ignore comments. They are not even considered as white spaces.
1407         if (*currentCharacter<SrcCharacterType>() == '*') {
1408             const CSSParserLocation startLocation = currentLocation();
1409             if (m_parser.m_observer) {
1410                 unsigned startOffset = currentCharacter<SrcCharacterType>() - dataStart<SrcCharacterType>() - 1; // Start with a slash.
1411                 m_parser.m_observer->startComment(startOffset - m_parsedTextPrefixLength);
1412             }
                 // Skip the '*' of the opening "/*".
1413             ++currentCharacter<SrcCharacterType>();
1414             while (currentCharacter<SrcCharacterType>()[0] != '*' || currentCharacter<SrcCharacterType>()[1] != '/') {
1415                 if (*currentCharacter<SrcCharacterType>() == '\n')
1416                     ++m_lineNumber;
1417                 if (*currentCharacter<SrcCharacterType>() == '\0') {
1418                     // Unterminated comments are simply ignored.
                         // The -= 2 cancels the += 2 after the loop, so the
                         // pointer ends up on the terminating null.
1419                     currentCharacter<SrcCharacterType>() -= 2;
1420                     m_parser.reportError(startLocation, UnterminatedCommentCSSError);
1421                     break;
1422                 }
1423                 ++currentCharacter<SrcCharacterType>();
1424             }
                 // Step over the closing "*/".
1425             currentCharacter<SrcCharacterType>() += 2;
1426             if (m_parser.m_observer) {
1427                 unsigned endOffset = currentCharacter<SrcCharacterType>() - dataStart<SrcCharacterType>();
                     // Clamp the reported end so that it does not extend into the
                     // suffix or the terminating null.
1428                 unsigned userTextEndOffset = static_cast<unsigned>(m_length - 1 - m_parsedTextSuffixLength);
1429                 m_parser.m_observer->endComment(std::min(endOffset, userTextEndOffset) - m_parsedTextPrefixLength);
1430             }
1431             goto restartAfterComment;
1432         }
1433         break;
1434
1435     case CharacterDollar:
             // "$="
1436         if (*currentCharacter<SrcCharacterType>() == '=') {
1437             ++currentCharacter<SrcCharacterType>();
1438             m_token = ENDSWITH;
1439         }
1440         break;
1441
1442     case CharacterAsterisk:
             // "*="
1443         if (*currentCharacter<SrcCharacterType>() == '=') {
1444             ++currentCharacter<SrcCharacterType>();
1445             m_token = CONTAINS;
1446         }
1447         break;
1448
1449     case CharacterPlus:
1450         if (UNLIKELY(m_parsingMode == NthChildMode)) {
1451             // Simplest case. "+[0-9]*n" is always NthChild.
1452             if (parseNthChild<SrcCharacterType>()) {
1453                 parseNthChildExtra<SrcCharacterType>();
1454                 m_token = NTH;
1455                 yylval->string.init(tokenStart<SrcCharacterType>(), currentCharacter<SrcCharacterType>() - tokenStart<SrcCharacterType>());
1456             }
1457         }
1458         break;
1459
1460     case CharacterLess:
             // "<!--" is reported with the same SGML_CD token that the
             // CharacterDash case uses for "-->".
1461         if (currentCharacter<SrcCharacterType>()[0] == '!' && currentCharacter<SrcCharacterType>()[1] == '-' && currentCharacter<SrcCharacterType>()[2] == '-') {
1462             currentCharacter<SrcCharacterType>() += 3;
1463             m_token = SGML_CD;
1464         }
1465         break;
1466
1467     case CharacterAt:
1468         if (isIdentifierStart<SrcCharacterType>()) {
1469             m_token = ATKEYWORD;
                 // Skip the '@' in the write position.
1470             ++result;
1471             parseIdentifier(result, resultString, hasEscape);
1472             // The standard enables unicode escapes in at-rules. In this case only the resultString will contain the
1473             // correct identifier, hence we have to use it to determine its length instead of the usual pointer arithmetic.
                 // + 1 accounts for the leading '@', which detectAtToken expects
                 // to be included in the length.
1474             detectAtToken<SrcCharacterType>(resultString.length() + 1, hasEscape);
1475         }
1476         break;
1477
1478     case CharacterBackSlash:
             // A valid escape starts an identifier; otherwise '\' is returned as
             // a single-character token.
1479         if (isCSSEscape(*currentCharacter<SrcCharacterType>())) {
1480             --currentCharacter<SrcCharacterType>();
1481             parseIdentifier(result, yylval->string, hasEscape);
1482             m_token = IDENT;
1483         }
1484         break;
1485
1486     case CharacterXor:
             // "^="
1487         if (*currentCharacter<SrcCharacterType>() == '=') {
1488             ++currentCharacter<SrcCharacterType>();
1489             m_token = BEGINSWITH;
1490         }
1491         break;
1492
1493     case CharacterVerticalBar:
             // "|="
1494         if (*currentCharacter<SrcCharacterType>() == '=') {
1495             ++currentCharacter<SrcCharacterType>();
1496             m_token = DASHMATCH;
1497         }
1498         break;
1499
1500     case CharacterTilde:
             // "~="
1501         if (*currentCharacter<SrcCharacterType>() == '=') {
1502             ++currentCharacter<SrcCharacterType>();
1503             m_token = INCLUDES;
1504         }
1505         break;
1506
1507     default:
1508         ASSERT_NOT_REACHED();
1509         break;
1510     }
1511
1512     return m_token;
1513 }
1514
// LChar specialization: records the start of the current token in the
// 8-bit member (ptr8) of m_tokenStart.
1515 template <>
1516 inline void CSSTokenizer::setTokenStart<LChar>(LChar* tokenStart)
1517 {
1518     m_tokenStart.ptr8 = tokenStart;
1519 }
1520
// UChar specialization: records the start of the current token in the
// 16-bit member (ptr16) of m_tokenStart.
1521 template <>
1522 inline void CSSTokenizer::setTokenStart<UChar>(UChar* tokenStart)
1523 {
1524     m_tokenStart.ptr16 = tokenStart;
1525 }
1526
// Prepares the tokenizer to lex the text `prefix + string + suffix`.
//
// prefix / prefixLength and suffix / suffixLength describe character ranges
// that are placed before and after `string`; the first prefixLength
// characters of prefix and the first suffixLength characters of suffix are
// read. Both lengths are remembered in m_parsedTextPrefixLength and
// m_parsedTextSuffixLength.
//
// A new buffer of prefixLength + string.length() + suffixLength + 1
// characters is allocated, filled, and terminated with 0. An LChar buffer is
// used when `string` is empty or 8-bit, a UChar buffer otherwise. The
// current-character pointer of the chosen width and the token start are
// reset to the beginning of the buffer, the pointer of the other width is
// cleared, and m_lexFunc is set to the matching realLex instantiation.
1527 void CSSTokenizer::setupTokenizer(const char* prefix, unsigned prefixLength, const String& string, const char* suffix, unsigned suffixLength)
1528 {
1529     m_parsedTextPrefixLength = prefixLength;
1530     m_parsedTextSuffixLength = suffixLength;
1531     unsigned stringLength = string.length();
         // + 1 for the terminating 0; m_length therefore includes the terminator.
1532     unsigned length = stringLength + m_parsedTextPrefixLength + m_parsedTextSuffixLength + 1;
1533     m_length = length;
1534
         // 8-bit path; also taken for an empty string.
1535     if (!stringLength || string.is8Bit()) {
1536         m_dataStart8 = adoptArrayPtr(new LChar[length]);
1537         for (unsigned i = 0; i < m_parsedTextPrefixLength; i++)
1538             m_dataStart8[i] = prefix[i];
1539
             // characters8() is only touched when there is something to copy.
1540         if (stringLength)
1541             memcpy(m_dataStart8.get() + m_parsedTextPrefixLength, string.characters8(), stringLength * sizeof(LChar));
1542
1543         unsigned start = m_parsedTextPrefixLength + stringLength;
1544         unsigned end = start + suffixLength;
1545         for (unsigned i = start; i < end; i++)
1546             m_dataStart8[i] = suffix[i - start];
1547
1548         m_dataStart8[length - 1] = 0;
1549
1550         m_is8BitSource = true;
1551         m_currentCharacter8 = m_dataStart8.get();
1552         m_currentCharacter16 = 0;
1553         setTokenStart<LChar>(m_currentCharacter8);
1554         m_lexFunc = &CSSTokenizer::realLex<LChar>;
1555         return;
1556     }
1557
         // 16-bit path: same layout, with the char prefix and suffix widened
         // to UChar by element-wise assignment.
1558     m_dataStart16 = adoptArrayPtr(new UChar[length]);
1559     for (unsigned i = 0; i < m_parsedTextPrefixLength; i++)
1560         m_dataStart16[i] = prefix[i];
1561
         // An empty string always takes the 8-bit path above.
1562     ASSERT(stringLength);
1563     memcpy(m_dataStart16.get() + m_parsedTextPrefixLength, string.characters16(), stringLength * sizeof(UChar));
1564
1565     unsigned start = m_parsedTextPrefixLength + stringLength;
1566     unsigned end = start + suffixLength;
1567     for (unsigned i = start; i < end; i++)
1568         m_dataStart16[i] = suffix[i - start];
1569
1570     m_dataStart16[length - 1] = 0;
1571
1572     m_is8BitSource = false;
1573     m_currentCharacter8 = 0;
1574     m_currentCharacter16 = m_dataStart16.get();
1575     setTokenStart<UChar>(m_currentCharacter16);
1576     m_lexFunc = &CSSTokenizer::realLex<UChar>;
1577 }
1578
1579 } // namespace blink