src/third_party/WebKit/Source/core/css/parser/MediaQueryTokenizer.cpp

   1 // Copyright 2014 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "config.h"
   6 #include "core/css/parser/MediaQueryTokenizer.h"
   7
   8 namespace WebCore {
   9 #include "MediaQueryTokenizerCodepoints.cpp"
  10 }
  11
  12 #include "core/css/parser/MediaQueryInputStream.h"
  13 #include "core/html/parser/HTMLParserIdioms.h"
  14 #include "wtf/unicode/CharacterNames.h"
  15
  16 namespace WebCore {
  17
  18 // http://dev.w3.org/csswg/css-syntax/#name-start-code-point
  19 static bool isNameStart(UChar c)
  20 {
  21     if (isASCIIAlpha(c))
  22         return true;
  23     if (c == '_')
  24         return true;
  25     return !isASCII(c);
  26 }
  27
  28 // http://dev.w3.org/csswg/css-syntax/#name-code-point
  29 static bool isNameChar(UChar c)
  30 {
  31     return isNameStart(c) || isASCIIDigit(c) || c == '-';
  32 }
  33
  34 // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape
  35 static bool twoCharsAreValidEscape(UChar first, UChar second)
  36 {
  37     return ((first == '\\') && (second != '\n') && (second != kEndOfFileMarker));
  38 }
  39
  40 MediaQueryTokenizer::MediaQueryTokenizer(MediaQueryInputStream& inputStream)
  41     : m_input(inputStream)
  42 {
  43 }
  44
  45 void MediaQueryTokenizer::reconsume(UChar c)
  46 {
  47     m_input.pushBack(c);
  48 }
  49
  50 UChar MediaQueryTokenizer::consume()
  51 {
  52     UChar current = m_input.currentInputChar();
  53     m_input.advance();
  54     return current;
  55 }
  56
  57 void MediaQueryTokenizer::consume(unsigned offset)
  58 {
  59     m_input.advance(offset);
  60 }
  61
  62 MediaQueryToken MediaQueryTokenizer::whiteSpace(UChar cc)
  63 {
  64     // CSS Tokenization is currently lossy, but we could record
  65     // the exact whitespace instead of discarding it here.
  66     consumeUntilNonWhitespace();
  67     return MediaQueryToken(WhitespaceToken);
  68 }
  69
  70 static bool popIfBlockMatches(Vector<MediaQueryTokenType>& blockStack, MediaQueryTokenType type)
  71 {
  72     if (!blockStack.isEmpty() && blockStack.last() == type) {
  73         blockStack.removeLast();
  74         return true;
  75     }
  76     return false;
  77 }
  78
  79 MediaQueryToken MediaQueryTokenizer::blockStart(MediaQueryTokenType type)
  80 {
  81     m_blockStack.append(type);
  82     return MediaQueryToken(type, MediaQueryToken::BlockStart);
  83 }
  84
  85 MediaQueryToken MediaQueryTokenizer::blockStart(MediaQueryTokenType blockType, MediaQueryTokenType type, String name)
  86 {
  87     m_blockStack.append(blockType);
  88     return MediaQueryToken(type, name, MediaQueryToken::BlockStart);
  89 }
  90
  91 MediaQueryToken MediaQueryTokenizer::blockEnd(MediaQueryTokenType type, MediaQueryTokenType startType)
  92 {
  93     if (popIfBlockMatches(m_blockStack, startType))
  94         return MediaQueryToken(type, MediaQueryToken::BlockEnd);
  95     return MediaQueryToken(type);
  96 }
  97
  98 MediaQueryToken MediaQueryTokenizer::leftParenthesis(UChar cc)
  99 {
 100     return blockStart(LeftParenthesisToken);
 101 }
 102
 103 MediaQueryToken MediaQueryTokenizer::rightParenthesis(UChar cc)
 104 {
 105     return blockEnd(RightParenthesisToken, LeftParenthesisToken);
 106 }
 107
 108 MediaQueryToken MediaQueryTokenizer::leftBracket(UChar cc)
 109 {
 110     return blockStart(LeftBracketToken);
 111 }
 112
 113 MediaQueryToken MediaQueryTokenizer::rightBracket(UChar cc)
 114 {
 115     return blockEnd(RightBracketToken, LeftBracketToken);
 116 }
 117
 118 MediaQueryToken MediaQueryTokenizer::leftBrace(UChar cc)
 119 {
 120     return blockStart(LeftBraceToken);
 121 }
 122
 123 MediaQueryToken MediaQueryTokenizer::rightBrace(UChar cc)
 124 {
 125     return blockEnd(RightBraceToken, LeftBraceToken);
 126 }
 127
 128 MediaQueryToken MediaQueryTokenizer::plusOrFullStop(UChar cc)
 129 {
 130     if (nextCharsAreNumber()) {
 131         reconsume(cc);
 132         return consumeNumericToken();
 133     }
 134     return MediaQueryToken(DelimiterToken, cc);
 135 }
 136
 137 MediaQueryToken MediaQueryTokenizer::asterisk(UChar cc)
 138 {
 139     return MediaQueryToken(DelimiterToken, cc);
 140 }
 141
 142 MediaQueryToken MediaQueryTokenizer::comma(UChar cc)
 143 {
 144     return MediaQueryToken(CommaToken);
 145 }
 146
 147 MediaQueryToken MediaQueryTokenizer::hyphenMinus(UChar cc)
 148 {
 149     if (nextCharsAreNumber()) {
 150         reconsume(cc);
 151         return consumeNumericToken();
 152     }
 153     if (nextCharsAreIdentifier()) {
 154         reconsume(cc);
 155         return consumeIdentLikeToken();
 156     }
 157     return MediaQueryToken(DelimiterToken, cc);
 158 }
 159
 160 MediaQueryToken MediaQueryTokenizer::solidus(UChar cc)
 161 {
 162     if (consumeIfNext('*')) {
 163         // We're intentionally deviating from the spec here, by creating tokens for CSS comments.
 164         return consumeUntilCommentEndFound()? MediaQueryToken(CommentToken): MediaQueryToken(EOFToken);
 165     }
 166
 167     return MediaQueryToken(DelimiterToken, cc);
 168 }
 169
 170 MediaQueryToken MediaQueryTokenizer::colon(UChar cc)
 171 {
 172     return MediaQueryToken(ColonToken);
 173 }
 174
 175 MediaQueryToken MediaQueryTokenizer::semiColon(UChar cc)
 176 {
 177     return MediaQueryToken(SemicolonToken);
 178 }
 179
 180 MediaQueryToken MediaQueryTokenizer::reverseSolidus(UChar cc)
 181 {
 182     if (twoCharsAreValidEscape(cc, m_input.currentInputChar())) {
 183         reconsume(cc);
 184         return consumeIdentLikeToken();
 185     }
 186     return MediaQueryToken(DelimiterToken, cc);
 187 }
 188
 189 MediaQueryToken MediaQueryTokenizer::asciiDigit(UChar cc)
 190 {
 191     reconsume(cc);
 192     return consumeNumericToken();
 193 }
 194
 195 MediaQueryToken MediaQueryTokenizer::nameStart(UChar cc)
 196 {
 197     reconsume(cc);
 198     return consumeIdentLikeToken();
 199 }
 200
 201 MediaQueryToken MediaQueryTokenizer::stringStart(UChar cc)
 202 {
 203     return consumeStringTokenUntil(cc);
 204 }
 205
 206 MediaQueryToken MediaQueryTokenizer::endOfFile(UChar cc)
 207 {
 208     return MediaQueryToken(EOFToken);
 209 }
 210
 211 void MediaQueryTokenizer::tokenize(String string, Vector<MediaQueryToken>& outTokens)
 212 {
 213     // According to the spec, we should perform preprocessing here.
 214     // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing
 215     //
 216     // However, we can skip this step since:
 217     // * We're using HTML spaces (which accept \r and \f as a valid white space)
 218     // * Do not count white spaces
 219     // * consumeEscape replaces NULLs for replacement characters
 220
 221     if (string.isEmpty())
 222         return;
 223
 224     MediaQueryInputStream input(string);
 225     MediaQueryTokenizer tokenizer(input);
 226     while (true) {
 227         MediaQueryToken token = tokenizer.nextToken();
 228         outTokens.append(token);
 229         if (token.type() == EOFToken)
 230             return;
 231     }
 232 }
 233
 234 MediaQueryToken MediaQueryTokenizer::nextToken()
 235 {
 236     // Unlike the HTMLTokenizer, the CSS Syntax spec is written
 237     // as a stateless, (fixed-size) look-ahead tokenizer.
 238     // We could move to the stateful model and instead create
 239     // states for all the "next 3 codepoints are X" cases.
 240     // State-machine tokenizers are easier to write to handle
 241     // incremental tokenization of partial sources.
 242     // However, for now we follow the spec exactly.
 243     UChar cc = consume();
 244     CodePoint codePointFunc = 0;
 245
 246     if (isASCII(cc)) {
 247         ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber);
 248         codePointFunc = codePoints[cc];
 249     } else {
 250         codePointFunc = &MediaQueryTokenizer::nameStart;
 251     }
 252
 253     if (codePointFunc)
 254         return ((this)->*(codePointFunc))(cc);
 255     return MediaQueryToken(DelimiterToken, cc);
 256 }
 257
 258 static int getSign(MediaQueryInputStream& input, unsigned& offset)
 259 {
 260     int sign = 1;
 261     if (input.currentInputChar() == '+') {
 262         ++offset;
 263     } else if (input.peek(offset) == '-') {
 264         sign = -1;
 265         ++offset;
 266     }
 267     return sign;
 268 }
 269
 270 static unsigned long long getInteger(MediaQueryInputStream& input, unsigned& offset)
 271 {
 272     unsigned intStartPos = offset;
 273     offset = input.skipWhilePredicate<isASCIIDigit>(offset);
 274     unsigned intEndPos = offset;
 275     return input.getUInt(intStartPos, intEndPos);
 276 }
 277
 278 static double getFraction(MediaQueryInputStream& input, unsigned& offset, unsigned& digitsNumber)
 279 {
 280     unsigned fractionStartPos = 0;
 281     unsigned fractionEndPos = 0;
 282     if (input.peek(offset) == '.' && isASCIIDigit(input.peek(++offset))) {
 283         fractionStartPos = offset - 1;
 284         offset = input.skipWhilePredicate<isASCIIDigit>(offset);
 285         fractionEndPos = offset;
 286     }
 287     digitsNumber = fractionEndPos- fractionStartPos;
 288     return input.getDouble(fractionStartPos, fractionEndPos);
 289 }
 290
 291 static unsigned long long getExponent(MediaQueryInputStream& input, unsigned& offset, int& sign)
 292 {
 293     unsigned exponentStartPos = 0;
 294     unsigned exponentEndPos = 0;
 295     if ((input.peek(offset) == 'E' || input.peek(offset) == 'e')) {
 296         int offsetBeforeExponent = offset;
 297         ++offset;
 298         if (input.peek(offset) == '+') {
 299             ++offset;
 300         } else if (input.peek(offset) =='-') {
 301             sign = -1;
 302             ++offset;
 303         }
 304         exponentStartPos = offset;
 305         offset = input.skipWhilePredicate<isASCIIDigit>(offset);
 306         exponentEndPos = offset;
 307         if (exponentEndPos == exponentStartPos)
 308             offset = offsetBeforeExponent;
 309     }
 310     return input.getUInt(exponentStartPos, exponentEndPos);
 311 }
 312
 313 // This method merges the following spec sections for efficiency
 314 // http://www.w3.org/TR/css3-syntax/#consume-a-number
 315 // http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number
 316 MediaQueryToken MediaQueryTokenizer::consumeNumber()
 317 {
 318     ASSERT(nextCharsAreNumber());
 319     NumericValueType type = IntegerValueType;
 320     double value = 0;
 321     unsigned offset = 0;
 322     int exponentSign = 1;
 323     unsigned fractionDigits;
 324     int sign = getSign(m_input, offset);
 325     unsigned long long integerPart = getInteger(m_input, offset);
 326     double fractionPart = getFraction(m_input, offset, fractionDigits);
 327     unsigned long long exponentPart = getExponent(m_input, offset, exponentSign);
 328     double exponent = pow(10, (float)exponentSign * (double)exponentPart);
 329     value = (double)sign * ((double)integerPart + fractionPart) * exponent;
 330
 331     m_input.advance(offset);
 332     if (fractionDigits > 0)
 333         type = NumberValueType;
 334
 335     return MediaQueryToken(NumberToken, value, type);
 336 }
 337
 338 // http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token
 339 MediaQueryToken MediaQueryTokenizer::consumeNumericToken()
 340 {
 341     MediaQueryToken token = consumeNumber();
 342     if (nextCharsAreIdentifier())
 343         token.convertToDimensionWithUnit(consumeName());
 344     else if (consumeIfNext('%'))
 345         token.convertToPercentage();
 346     return token;
 347 }
 348
 349 // http://www.w3.org/TR/css3-syntax/#consume-an-ident-like-token
 350 MediaQueryToken MediaQueryTokenizer::consumeIdentLikeToken()
 351 {
 352     String name = consumeName();
 353     if (consumeIfNext('(')) {
 354         return blockStart(LeftParenthesisToken, FunctionToken, name);
 355     }
 356     return MediaQueryToken(IdentToken, name);
 357 }
 358
 359 static bool isNewLine(UChar cc)
 360 {
 361     // We check \r and \f here, since we have no preprocessing stage
 362     return (cc == '\r' || cc == '\n' || cc == '\f');
 363 }
 364
 365 // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
 366 MediaQueryToken MediaQueryTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
 367 {
 368     StringBuilder output;
 369     while (true) {
 370         UChar cc = consume();
 371         if (cc == endingCodePoint || cc == kEndOfFileMarker) {
 372             // The "reconsume" here deviates from the spec, but is required to avoid consuming past the EOF
 373             if (cc == kEndOfFileMarker)
 374                 reconsume(cc);
 375             return MediaQueryToken(StringToken, output.toString());
 376         }
 377         if (isNewLine(cc)) {
 378             reconsume(cc);
 379             return MediaQueryToken(BadStringToken);
 380         }
 381         if (cc == '\\') {
 382             if (m_input.currentInputChar() == kEndOfFileMarker)
 383                 continue;
 384             if (isNewLine(m_input.currentInputChar()))
 385                 consume();
 386             else
 387                 output.append(consumeEscape());
 388         } else {
 389             output.append(cc);
 390         }
 391     }
 392 }
 393
 394 void MediaQueryTokenizer::consumeUntilNonWhitespace()
 395 {
 396     // Using HTML space here rather than CSS space since we don't do preprocessing
 397     while (isHTMLSpace<UChar>(m_input.currentInputChar()))
 398         consume();
 399 }
 400
 401 bool MediaQueryTokenizer::consumeUntilCommentEndFound()
 402 {
 403     UChar c = consume();
 404     while (true) {
 405         if (c == kEndOfFileMarker)
 406             return false;
 407         if (c != '*') {
 408             c = consume();
 409             continue;
 410         }
 411         c = consume();
 412         if (c == '/')
 413             break;
 414     }
 415     return true;
 416 }
 417
 418 bool MediaQueryTokenizer::consumeIfNext(UChar character)
 419 {
 420     if (m_input.currentInputChar() == character) {
 421         consume();
 422         return true;
 423     }
 424     return false;
 425 }
 426
 427 // http://www.w3.org/TR/css3-syntax/#consume-a-name
 428 String MediaQueryTokenizer::consumeName()
 429 {
 430     // FIXME: Is this as efficient as it can be?
 431     // The possibility of escape chars mandates a copy AFAICT.
 432     StringBuilder result;
 433     while (true) {
 434         UChar cc = consume();
 435         if (isNameChar(cc)) {
 436             result.append(cc);
 437             continue;
 438         }
 439         if (twoCharsAreValidEscape(cc, m_input.currentInputChar())) {
 440             result.append(consumeEscape());
 441             continue;
 442         }
 443         reconsume(cc);
 444         return result.toString();
 445     }
 446 }
 447
 448 // http://dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point
 449 UChar MediaQueryTokenizer::consumeEscape()
 450 {
 451     UChar cc = consume();
 452     ASSERT(cc != '\n');
 453     if (isASCIIHexDigit(cc)) {
 454         unsigned consumedHexDigits = 1;
 455         StringBuilder hexChars;
 456         hexChars.append(cc);
 457         while (consumedHexDigits < 6 && isASCIIHexDigit(m_input.currentInputChar())) {
 458             cc = consume();
 459             hexChars.append(cc);
 460             consumedHexDigits++;
 461         };
 462         bool ok = false;
 463         UChar codePoint = hexChars.toString().toUIntStrict(&ok, 16);
 464         if (!ok)
 465             return WTF::Unicode::replacementCharacter;
 466         return codePoint;
 467     }
 468
 469     // Replaces NULLs with replacement characters, since we do not perform preprocessing
 470     if (cc == kEndOfFileMarker)
 471         return WTF::Unicode::replacementCharacter;
 472     return cc;
 473 }
 474
 475 bool MediaQueryTokenizer::nextTwoCharsAreValidEscape(unsigned offset)
 476 {
 477     if (m_input.leftChars() < offset + 1)
 478         return false;
 479     return twoCharsAreValidEscape(m_input.peek(offset), m_input.peek(offset + 1));
 480 }
 481
 482 // http://www.w3.org/TR/css3-syntax/#starts-with-a-number
 483 bool MediaQueryTokenizer::nextCharsAreNumber()
 484 {
 485     UChar first = m_input.currentInputChar();
 486     UChar second = m_input.peek(1);
 487     if (isASCIIDigit(first))
 488         return true;
 489     if (first == '+' || first == '-')
 490         return ((isASCIIDigit(second)) || (second == '.' && isASCIIDigit(m_input.peek(2))));
 491     if (first =='.')
 492         return (isASCIIDigit(second));
 493     return false;
 494 }
 495
 496 // http://www.w3.org/TR/css3-syntax/#would-start-an-identifier
 497 bool MediaQueryTokenizer::nextCharsAreIdentifier()
 498 {
 499     UChar firstChar = m_input.currentInputChar();
 500     if (isNameStart(firstChar) || nextTwoCharsAreValidEscape(0))
 501         return true;
 502
 503     if (firstChar == '-') {
 504         if (isNameStart(m_input.peek(1)))
 505             return true;
 506         return nextTwoCharsAreValidEscape(1);
 507     }
 508
 509     return false;
 510 }
 511
 512 } // namespace WebCore