1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
6 #include "core/css/parser/MediaQueryTokenizer.h"
9 #include "MediaQueryTokenizerCodepoints.cpp"
12 #include "core/css/parser/MediaQueryInputStream.h"
13 #include "core/html/parser/HTMLParserIdioms.h"
14 #include "wtf/unicode/CharacterNames.h"
18 // http://dev.w3.org/csswg/css-syntax/#name-start-code-point
// Predicate over a single UChar, used by isNameChar() and
// nextCharsAreIdentifier() to recognise the first code point of a name.
// NOTE(review): the body is not visible in this view; presumably it follows the
// spec section linked above - confirm.
19 static bool isNameStart(UChar c)
28 // http://dev.w3.org/csswg/css-syntax/#name-code-point
29 static bool isNameChar(UChar c)
31 return isNameStart(c) || isASCIIDigit(c) || c == '-';
34 // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape
35 static bool twoCharsAreValidEscape(UChar first, UChar second)
37 return ((first == '\\') && (second != '\n') && (second != kEndOfFileMarker));
40 MediaQueryTokenizer::MediaQueryTokenizer(MediaQueryInputStream& inputStream)
41 : m_input(inputStream)
// Counterpart of consume() for a single code unit.
// NOTE(review): the body is not visible in this view; presumably it hands |c|
// back to m_input so the next consume() yields it again - confirm.
45 void MediaQueryTokenizer::reconsume(UChar c)
// Reads the stream's current input character into a local.
// NOTE(review): the rest of the body is not visible in this view; presumably
// the stream is advanced by one and the saved character returned - confirm.
50 UChar MediaQueryTokenizer::consume()
52 UChar current = m_input.currentInputChar();
57 void MediaQueryTokenizer::consume(unsigned offset)
59 m_input.advance(offset);
62 MediaQueryToken MediaQueryTokenizer::whiteSpace(UChar cc)
64 // CSS Tokenization is currently lossy, but we could record
65 // the exact whitespace instead of discarding it here.
66 consumeUntilNonWhitespace();
67 return MediaQueryToken(WhitespaceToken);
// Removes the last entry of |blockStack| when the stack is non-empty and that
// entry equals |type|.
// NOTE(review): the return statements are not visible in this view; blockEnd()
// uses the result to decide whether to tag a token as BlockEnd, so presumably
// true means "an entry was popped" - confirm.
70 static bool popIfBlockMatches(Vector<MediaQueryTokenType>& blockStack, MediaQueryTokenType type)
72 if (!blockStack.isEmpty() && blockStack.last() == type) {
73 blockStack.removeLast();
79 MediaQueryToken MediaQueryTokenizer::blockStart(MediaQueryTokenType type)
81 m_blockStack.append(type);
82 return MediaQueryToken(type, MediaQueryToken::BlockStart);
85 MediaQueryToken MediaQueryTokenizer::blockStart(MediaQueryTokenType blockType, MediaQueryTokenType type, String name)
87 m_blockStack.append(blockType);
88 return MediaQueryToken(type, name, MediaQueryToken::BlockStart);
91 MediaQueryToken MediaQueryTokenizer::blockEnd(MediaQueryTokenType type, MediaQueryTokenType startType)
93 if (popIfBlockMatches(m_blockStack, startType))
94 return MediaQueryToken(type, MediaQueryToken::BlockEnd);
95 return MediaQueryToken(type);
98 MediaQueryToken MediaQueryTokenizer::leftParenthesis(UChar cc)
100 return blockStart(LeftParenthesisToken);
103 MediaQueryToken MediaQueryTokenizer::rightParenthesis(UChar cc)
105 return blockEnd(RightParenthesisToken, LeftParenthesisToken);
108 MediaQueryToken MediaQueryTokenizer::leftBracket(UChar cc)
110 return blockStart(LeftBracketToken);
113 MediaQueryToken MediaQueryTokenizer::rightBracket(UChar cc)
115 return blockEnd(RightBracketToken, LeftBracketToken);
118 MediaQueryToken MediaQueryTokenizer::leftBrace(UChar cc)
120 return blockStart(LeftBraceToken);
123 MediaQueryToken MediaQueryTokenizer::rightBrace(UChar cc)
125 return blockEnd(RightBraceToken, LeftBraceToken);
// Handler for '+' and '.'. When nextCharsAreNumber() holds, the result comes
// from consumeNumericToken(); otherwise |cc| is returned as a DelimiterToken.
// NOTE(review): a line between the `if` and the numeric return is not visible
// in this view - presumably |cc| is pushed back first; confirm.
128 MediaQueryToken MediaQueryTokenizer::plusOrFullStop(UChar cc)
130 if (nextCharsAreNumber()) {
132 return consumeNumericToken();
134 return MediaQueryToken(DelimiterToken, cc);
137 MediaQueryToken MediaQueryTokenizer::asterisk(UChar cc)
139 return MediaQueryToken(DelimiterToken, cc);
142 MediaQueryToken MediaQueryTokenizer::comma(UChar cc)
144 return MediaQueryToken(CommaToken);
// Handler for '-'. Tries, in order: a number (nextCharsAreNumber() ->
// consumeNumericToken()), an identifier (nextCharsAreIdentifier() ->
// consumeIdentLikeToken()), and finally falls back to a DelimiterToken
// carrying |cc|.
// NOTE(review): one line inside each `if` body is not visible in this view -
// presumably |cc| is pushed back before the consume* call; confirm.
147 MediaQueryToken MediaQueryTokenizer::hyphenMinus(UChar cc)
149 if (nextCharsAreNumber()) {
151 return consumeNumericToken();
153 if (nextCharsAreIdentifier()) {
155 return consumeIdentLikeToken();
157 return MediaQueryToken(DelimiterToken, cc);
160 MediaQueryToken MediaQueryTokenizer::solidus(UChar cc)
162 if (consumeIfNext('*')) {
163 // We're intentionally deviating from the spec here, by creating tokens for CSS comments.
164 return consumeUntilCommentEndFound()? MediaQueryToken(CommentToken): MediaQueryToken(EOFToken);
167 return MediaQueryToken(DelimiterToken, cc);
170 MediaQueryToken MediaQueryTokenizer::colon(UChar cc)
172 return MediaQueryToken(ColonToken);
175 MediaQueryToken MediaQueryTokenizer::semiColon(UChar cc)
177 return MediaQueryToken(SemicolonToken);
// Handler for '\'. If |cc| together with the stream's current character forms
// a valid escape, the result comes from consumeIdentLikeToken(); otherwise
// |cc| is returned as a DelimiterToken.
// NOTE(review): a line inside the `if` body is not visible in this view -
// presumably |cc| is pushed back before consumeIdentLikeToken(); confirm.
180 MediaQueryToken MediaQueryTokenizer::reverseSolidus(UChar cc)
182 if (twoCharsAreValidEscape(cc, m_input.currentInputChar())) {
184 return consumeIdentLikeToken();
186 return MediaQueryToken(DelimiterToken, cc);
// Handler for an ASCII digit: the token is produced by consumeNumericToken().
// NOTE(review): a line before the return is not visible in this view -
// presumably |cc| is pushed back first; confirm.
189 MediaQueryToken MediaQueryTokenizer::asciiDigit(UChar cc)
192 return consumeNumericToken();
// Handler for a name-start code point: the token is produced by
// consumeIdentLikeToken(). nextToken() also selects this handler directly on
// one of its branches.
// NOTE(review): a line before the return is not visible in this view -
// presumably |cc| is pushed back first; confirm.
195 MediaQueryToken MediaQueryTokenizer::nameStart(UChar cc)
198 return consumeIdentLikeToken();
201 MediaQueryToken MediaQueryTokenizer::stringStart(UChar cc)
203 return consumeStringTokenUntil(cc);
206 MediaQueryToken MediaQueryTokenizer::endOfFile(UChar cc)
208 return MediaQueryToken(EOFToken);
211 void MediaQueryTokenizer::tokenize(String string, Vector<MediaQueryToken>& outTokens)
213 // According to the spec, we should perform preprocessing here.
214 // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing
216 // However, we can skip this step since:
217 // * We're using HTML spaces (which accept \r and \f as a valid white space)
218 // * Do not count white spaces
219 // * consumeEscape replaces NULLs for replacement characters
221 if (string.isEmpty())
224 MediaQueryInputStream input(string);
225 MediaQueryTokenizer tokenizer(input);
227 MediaQueryToken token = tokenizer.nextToken();
228 outTokens.append(token);
229 if (token.type() == EOFToken)
// Produces the next token: consumes one code unit, selects a handler of type
// CodePoint (a pointer to a MediaQueryTokenizer member) and invokes it with the
// consumed unit. One branch indexes the codePoints table with |cc|, guarded by
// a security assert that cc < codePointsNumber; another branch uses
// &MediaQueryTokenizer::nameStart. A DelimiterToken carrying |cc| is the final
// fallback.
// NOTE(review): the conditions choosing between these branches are not visible
// in this view - presumably an ASCII check for the table lookup and a null
// check on the handler before the call; confirm.
234 MediaQueryToken MediaQueryTokenizer::nextToken()
236 // Unlike the HTMLTokenizer, the CSS Syntax spec is written
237 // as a stateless, (fixed-size) look-ahead tokenizer.
238 // We could move to the stateful model and instead create
239 // states for all the "next 3 codepoints are X" cases.
240 // State-machine tokenizers are easier to write to handle
241 // incremental tokenization of partial sources.
242 // However, for now we follow the spec exactly.
243 UChar cc = consume();
244 CodePoint codePointFunc = 0;
247 ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber);
248 codePointFunc = codePoints[cc];
250 codePointFunc = &MediaQueryTokenizer::nameStart;
254 return ((this)->*(codePointFunc))(cc);
255 return MediaQueryToken(DelimiterToken, cc);
// Inspects |input| for a leading '+' or '-' and returns an int sign.
// NOTE(review): the statements inside the branches and the return are not
// visible in this view - presumably |offset| is bumped past the sign and -1 is
// returned for '-'; confirm.
// NOTE(review): '+' is tested with currentInputChar() while '-' is tested with
// peek(offset). The two agree only if |offset| is 0 on entry - confirm that
// every caller guarantees this, or use peek(offset) for both.
258 static int getSign(MediaQueryInputStream& input, unsigned& offset)
261 if (input.currentInputChar() == '+') {
263 } else if (input.peek(offset) == '-') {
270 static unsigned long long getInteger(MediaQueryInputStream& input, unsigned& offset)
272 unsigned intStartPos = offset;
273 offset = input.skipWhilePredicate<isASCIIDigit>(offset);
274 unsigned intEndPos = offset;
275 return input.getUInt(intStartPos, intEndPos);
278 static double getFraction(MediaQueryInputStream& input, unsigned& offset, unsigned& digitsNumber)
280 unsigned fractionStartPos = 0;
281 unsigned fractionEndPos = 0;
282 if (input.peek(offset) == '.' && isASCIIDigit(input.peek(++offset))) {
283 fractionStartPos = offset - 1;
284 offset = input.skipWhilePredicate<isASCIIDigit>(offset);
285 fractionEndPos = offset;
287 digitsNumber = fractionEndPos- fractionStartPos;
288 return input.getDouble(fractionStartPos, fractionEndPos);
// Parses an optional exponent part ('E' or 'e', an optional '+' or '-', then
// ASCII digits) starting at |offset| in |input|, and returns input.getUInt()
// over the digit span (the span is [0, 0) when no 'E'/'e' is seen).
// If no digit follows the marker, |offset| is restored to its value from
// before the marker, so nothing is consumed for the exponent.
// NOTE(review): the statements that step over the marker and the sign, and the
// one that writes |sign|, are not visible in this view - presumably ++offset
// and sign = -1 for '-'; confirm.
// NOTE(review): offsetBeforeExponent is declared int while |offset| is
// unsigned; the round trip is harmless for small offsets but the types differ.
291 static unsigned long long getExponent(MediaQueryInputStream& input, unsigned& offset, int& sign)
293 unsigned exponentStartPos = 0;
294 unsigned exponentEndPos = 0;
295 if ((input.peek(offset) == 'E' || input.peek(offset) == 'e')) {
296 int offsetBeforeExponent = offset;
298 if (input.peek(offset) == '+') {
300 } else if (input.peek(offset) =='-') {
304 exponentStartPos = offset;
305 offset = input.skipWhilePredicate<isASCIIDigit>(offset);
306 exponentEndPos = offset;
307 if (exponentEndPos == exponentStartPos)
308 offset = offsetBeforeExponent;
310 return input.getUInt(exponentStartPos, exponentEndPos);
313 // This method merges the following spec sections for efficiency
314 // http://www.w3.org/TR/css3-syntax/#consume-a-number
315 // http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number
// Requires nextCharsAreNumber() (asserted). Reads sign, integer part, fraction
// and exponent through the get* helpers, which only peek and move a shared
// |offset|; the stream itself is advanced once, by the final offset, after the
// value has been computed as sign * (integer + fraction) * 10^(expSign * exp).
// The token is a NumberToken; its value type stays IntegerValueType unless
// getFraction() reported a non-zero digit count, in which case it becomes
// NumberValueType.
// NOTE(review): the declarations of |offset| and |value| are not visible in
// this view - presumably `unsigned offset = 0;` and `double value;`; confirm.
// NOTE(review): exponentSign is cast to float while exponentPart is cast to
// double in the pow() argument; the mixed casts look accidental.
316 MediaQueryToken MediaQueryTokenizer::consumeNumber()
318 ASSERT(nextCharsAreNumber());
319 NumericValueType type = IntegerValueType;
322 int exponentSign = 1;
323 unsigned fractionDigits;
324 int sign = getSign(m_input, offset);
325 unsigned long long integerPart = getInteger(m_input, offset);
326 double fractionPart = getFraction(m_input, offset, fractionDigits);
327 unsigned long long exponentPart = getExponent(m_input, offset, exponentSign);
328 double exponent = pow(10, (float)exponentSign * (double)exponentPart);
329 value = (double)sign * ((double)integerPart + fractionPart) * exponent;
331 m_input.advance(offset);
332 if (fractionDigits > 0)
333 type = NumberValueType;
335 return MediaQueryToken(NumberToken, value, type);
338 // http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token
// Consumes a number and then looks at what follows it: if an identifier
// starts there, the token is converted to a dimension whose unit is the
// consumed name; otherwise, if a '%' follows (and is consumed), the token is
// converted to a percentage.
// NOTE(review): the final return is not visible in this view - presumably
// `return token;`; confirm.
339 MediaQueryToken MediaQueryTokenizer::consumeNumericToken()
341 MediaQueryToken token = consumeNumber();
342 if (nextCharsAreIdentifier())
343 token.convertToDimensionWithUnit(consumeName());
344 else if (consumeIfNext('%'))
345 token.convertToPercentage();
349 // http://www.w3.org/TR/css3-syntax/#consume-an-ident-like-token
350 MediaQueryToken MediaQueryTokenizer::consumeIdentLikeToken()
352 String name = consumeName();
353 if (consumeIfNext('(')) {
354 return blockStart(LeftParenthesisToken, FunctionToken, name);
356 return MediaQueryToken(IdentToken, name);
359 static bool isNewLine(UChar cc)
361 // We check \r and \f here, since we have no preprocessing stage
362 return (cc == '\r' || cc == '\n' || cc == '\f');
365 // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
// Consumes a string terminated by |endingCodePoint|, accumulating its text in
// a StringBuilder. Reaching the ending code point or the end-of-file marker
// yields a StringToken with the accumulated text; a BadStringToken return and
// escape handling via consumeEscape() are also present.
// NOTE(review): several lines are not visible in this view - the loop
// construct, the action taken on the end-of-file marker (the comment mentions
// a reconsume), the condition guarding the BadStringToken return (presumably a
// newline, per the spec) and the actions after the two currentInputChar()
// checks; confirm against the full file.
366 MediaQueryToken MediaQueryTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
368 StringBuilder output;
370 UChar cc = consume();
371 if (cc == endingCodePoint || cc == kEndOfFileMarker) {
372 // The "reconsume" here deviates from the spec, but is required to avoid consuming past the EOF
373 if (cc == kEndOfFileMarker)
375 return MediaQueryToken(StringToken, output.toString());
379 return MediaQueryToken(BadStringToken);
382 if (m_input.currentInputChar() == kEndOfFileMarker)
384 if (isNewLine(m_input.currentInputChar()))
387 output.append(consumeEscape());
// Loops while the stream's current character is an HTML space.
// NOTE(review): the loop body is not visible in this view - presumably a
// single consume() per iteration; confirm.
394 void MediaQueryTokenizer::consumeUntilNonWhitespace()
396 // Using HTML space here rather than CSS space since we don't do preprocessing
397 while (isHTMLSpace<UChar>(m_input.currentInputChar()))
// Used by solidus() after "/*" has been consumed; solidus() emits a
// CommentToken when this returns true and an EOFToken when it returns false.
// The visible part checks a character |c| against the end-of-file marker.
// NOTE(review): most of the body is not visible in this view - presumably it
// consumes until the closing comment delimiter is seen and returns false on
// end of input; confirm.
401 bool MediaQueryTokenizer::consumeUntilCommentEndFound()
405 if (c == kEndOfFileMarker)
// Tests whether the stream's current character equals |character|.
// NOTE(review): the branch body and the returns are not visible in this view;
// callers rely on a true result meaning the character was consumed -
// presumably consume() + return true, else return false; confirm.
418 bool MediaQueryTokenizer::consumeIfNext(UChar character)
420 if (m_input.currentInputChar() == character) {
427 // http://www.w3.org/TR/css3-syntax/#consume-a-name
// Builds a name in a StringBuilder and returns it as a String. Each step
// consumes one code unit: a name code point is handled in the first branch,
// and a valid escape (the unit plus the stream's current character) appends
// the result of consumeEscape().
// NOTE(review): the loop construct, what is appended for a name code point and
// the handling of a unit that ends the name are not visible in this view -
// presumably append(cc)/continue and reconsume(cc) before the return; confirm.
428 String MediaQueryTokenizer::consumeName()
430 // FIXME: Is this as efficient as it can be?
431 // The possibility of escape chars mandates a copy AFAICT.
432 StringBuilder result;
434 UChar cc = consume();
435 if (isNameChar(cc)) {
439 if (twoCharsAreValidEscape(cc, m_input.currentInputChar())) {
440 result.append(consumeEscape());
444 return result.toString();
448 // http://dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point
// Consumes the code unit after a backslash and returns the escaped character.
// For a hex digit, further hex digits are gathered while fewer than 6 have
// been taken in total, the text is parsed strictly as base 16, and the
// replacement character is returned on one of the following branches.
// An end-of-file marker is also mapped to the replacement character.
// NOTE(review): the loop body, the condition guarding the first replacement
// return and the final return are not visible in this view; confirm.
// NOTE(review): the parsed value is stored in a UChar. If UChar is a 16-bit
// type (TODO confirm), escapes above U+FFFF are truncated by this assignment.
449 UChar MediaQueryTokenizer::consumeEscape()
451 UChar cc = consume();
453 if (isASCIIHexDigit(cc)) {
454 unsigned consumedHexDigits = 1;
455 StringBuilder hexChars;
457 while (consumedHexDigits < 6 && isASCIIHexDigit(m_input.currentInputChar())) {
463 UChar codePoint = hexChars.toString().toUIntStrict(&ok, 16);
465 return WTF::Unicode::replacementCharacter;
469 // Replaces NULLs with replacement characters, since we do not perform preprocessing
470 if (cc == kEndOfFileMarker)
471 return WTF::Unicode::replacementCharacter;
// Applies twoCharsAreValidEscape() to the characters at |offset| and
// |offset| + 1 in the stream, after a guard on m_input.leftChars().
// NOTE(review): the statement under the guard is not visible in this view -
// presumably `return false;`; confirm.
// NOTE(review): the guard compares leftChars() with offset + 1 while the code
// peeks at offset + 1, i.e. needs offset + 2 characters if leftChars() counts
// remaining characters - confirm that peek() is safe past the end.
475 bool MediaQueryTokenizer::nextTwoCharsAreValidEscape(unsigned offset)
477 if (m_input.leftChars() < offset + 1)
479 return twoCharsAreValidEscape(m_input.peek(offset), m_input.peek(offset + 1));
482 // http://www.w3.org/TR/css3-syntax/#starts-with-a-number
// Looks ahead, without consuming, at up to three characters: the current one,
// peek(1) and peek(2). A leading '+' or '-' starts a number when it is followed
// by a digit, or by '.' and then a digit. Two further cases test whether the
// first character is a digit and whether the second one is.
// NOTE(review): the return under the first-digit check, the condition guarding
// the last visible return (presumably first == '.') and the final fallback are
// not visible in this view; confirm.
483 bool MediaQueryTokenizer::nextCharsAreNumber()
485 UChar first = m_input.currentInputChar();
486 UChar second = m_input.peek(1);
487 if (isASCIIDigit(first))
489 if (first == '+' || first == '-')
490 return ((isASCIIDigit(second)) || (second == '.' && isASCIIDigit(m_input.peek(2))));
492 return (isASCIIDigit(second));
496 // http://www.w3.org/TR/css3-syntax/#would-start-an-identifier
// Looks ahead, without consuming, to decide whether an identifier starts at
// the current position. Checked cases: a name-start code point or a valid
// escape at offset 0; or a '-' followed by either a name-start code point or a
// valid escape at offset 1.
// NOTE(review): the return statements under the first and third `if`, and the
// final fallback, are not visible in this view - presumably true, true and
// false; confirm.
497 bool MediaQueryTokenizer::nextCharsAreIdentifier()
499 UChar firstChar = m_input.currentInputChar();
500 if (isNameStart(firstChar) || nextTwoCharsAreValidEscape(0))
503 if (firstChar == '-') {
504 if (isNameStart(m_input.peek(1)))
506 return nextTwoCharsAreValidEscape(1);
512 } // namespace WebCore