2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include "core/html/parser/HTMLTokenizer.h"
31 #include "core/HTMLNames.h"
32 #include "core/HTMLTokenizerNames.h"
33 #include "core/html/parser/HTMLEntityParser.h"
34 #include "core/html/parser/HTMLParserIdioms.h"
35 #include "core/html/parser/HTMLTreeBuilder.h"
36 #include "platform/NotImplemented.h"
37 #include "core/xml/parser/MarkupTokenizerInlines.h"
38 #include "wtf/ASCIICType.h"
39 #include "wtf/text/AtomicString.h"
40 #include "wtf/unicode/Unicode.h"
42 // Please don't use DEFINE_STATIC_LOCAL in this file. The HTMLTokenizer is used
43 // from multiple threads and DEFINE_STATIC_LOCAL isn't threadsafe.
44 #undef DEFINE_STATIC_LOCAL
48 using namespace HTMLNames;
50 // This has to go in a .cpp file, as the linker doesn't like it being included more than once.
51 // We don't have an HTMLToken.cpp though, so this is the next best place.
// Builds the QualifiedName used for a token attribute. The attribute's name
// is converted to an AtomicString and passed as the second constructor
// argument; nullAtom is passed as the first and third arguments.
// NOTE(review): presumably the argument order is (prefix, localName,
// namespaceURI), i.e. the attribute gets no prefix and no namespace - confirm
// against the QualifiedName declaration.
52 QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const
54 return QualifiedName(nullAtom, AtomicString(attribute.name), nullAtom);
// Returns true when this token's type is StartTag, EndTag or DOCTYPE, and
// false for every other token type.
57 bool AtomicHTMLToken::usesName() const
59 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
// Returns true when this token's type is StartTag or EndTag, and false for
// every other token type.
62 bool AtomicHTMLToken::usesAttributes() const
64 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
// Converts an ASCII uppercase code unit to its lowercase counterpart by
// adding 0x20.
// Precondition (checked by the ASSERT): isASCIIUpper(cc) holds. Callers in
// this file test isASCIIUpper(cc) before calling.
67 static inline UChar toLowerCase(UChar cc)
69 ASSERT(isASCIIUpper(cc));
// 0x20 is the distance between 'A' and 'a' in ASCII.
70 const int lowerCaseOffset = 0x20;
71 return cc + lowerCaseOffset;
// Compares the contents of an LChar vector against a String.
// The visible code first compares vector.size() with string.length() and
// finally delegates to equal() on the string's impl, the vector's data and
// the vector's size.
// NOTE(review): embedded lines 77-81 are not present in this copy, so the
// statement guarded by the size check (presumably an early `return false`)
// and anything between it and the final return cannot be seen here - confirm
// against the full file.
74 static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string)
76 if (vector.size() != string.length())
82 return equal(string.impl(), vector.data(), vector.size());
// Classifies a tokenizer state. The case labels below cover the
// "end tag open" and "end tag name" states of RCDATA, RAWTEXT, ScriptData
// and ScriptDataEscaped.
// NOTE(review): the switch header and the return statements are not present
// in this copy; presumably the listed states yield true and all others
// false - confirm against the full file.
85 static inline bool isEndTagBufferingState(HTMLTokenizer::State state)
88 case HTMLTokenizer::RCDATAEndTagOpenState:
89 case HTMLTokenizer::RCDATAEndTagNameState:
90 case HTMLTokenizer::RAWTEXTEndTagOpenState:
91 case HTMLTokenizer::RAWTEXTEndTagNameState:
92 case HTMLTokenizer::ScriptDataEndTagOpenState:
93 case HTMLTokenizer::ScriptDataEndTagNameState:
94 case HTMLTokenizer::ScriptDataEscapedEndTagOpenState:
95 case HTMLTokenizer::ScriptDataEscapedEndTagNameState:
// Shorthand wrappers that bind the generic BEGIN_STATE / RECONSUME_IN /
// ADVANCE_TO / SWITCH_TO macros to HTMLTokenizer, so the state bodies below
// only have to name the state.
// NOTE(review): the generic macros are not defined in this file; presumably
// they come from MarkupTokenizerInlines.h - confirm.
102 #define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName)
103 #define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName)
104 #define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizer, stateName)
105 #define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizer, stateName)
// Constructs a tokenizer from the given parser options. The visible
// initializer hands `this` to the input stream preprocessor.
// NOTE(review): the remaining initializers and the constructor body
// (embedded lines 109-112) are not present in this copy.
107 HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options)
108 : m_inputStreamPreprocessor(this)
// Destructor.
// NOTE(review): the body is not present in this copy.
114 HTMLTokenizer::~HTMLTokenizer()
// Returns the tokenizer to its initial configuration: the state machine goes
// back to DataState and the null-character-replacement flag, the CDATA flag
// and the additional allowed character are cleared.
// NOTE(review): embedded line 121 is not present in this copy, so one more
// statement may be executed here - confirm against the full file.
118 void HTMLTokenizer::reset()
120 m_state = HTMLTokenizer::DataState;
122 m_forceNullCharacterReplacement = false;
123 m_shouldAllowCDATA = false;
// '\0' means "no additional allowed character"; the attribute value states
// set this to '"', '\'' or '>' before consuming a character reference.
124 m_additionalAllowedCharacter = '\0';
// Reports whether the tokenizer is in a condition that a Checkpoint can
// capture. It inspects the three internal buffers (appropriate end tag name,
// temporary buffer, buffered end tag name); none of them is stored by
// createCheckpoint().
// NOTE(review): the return statements are not present in this copy;
// presumably any non-empty buffer yields false and otherwise true - confirm
// against the full file.
127 bool HTMLTokenizer::canCreateCheckpoint() const
129 if (!m_appropriateEndTagName.isEmpty())
131 if (!m_temporaryBuffer.isEmpty())
133 if (!m_bufferedEndTagName.isEmpty())
// Writes a snapshot of the tokenizer into `result`: the parser options, the
// current state, the additional allowed character, the preprocessor's
// skip-next-newline flag and the CDATA flag.
// Precondition (asserted): canCreateCheckpoint() is true.
138 void HTMLTokenizer::createCheckpoint(Checkpoint& result) const
140 ASSERT(canCreateCheckpoint());
141 result.options = m_options;
142 result.state = m_state;
143 result.additionalAllowedCharacter = m_additionalAllowedCharacter;
144 result.skipNextNewLine = m_inputStreamPreprocessor.skipNextNewLine();
145 result.shouldAllowCDATA = m_shouldAllowCDATA;
// Restores the tokenizer from a Checkpoint: the same five fields that
// createCheckpoint() stores are copied back.
// NOTE(review): embedded line 150 is not present in this copy, so one more
// statement may run before the assignments below - confirm against the full
// file.
148 void HTMLTokenizer::restoreFromCheckpoint(const Checkpoint& checkpoint)
151 m_options = checkpoint.options;
152 m_state = checkpoint.state;
153 m_additionalAllowedCharacter = checkpoint.additionalAllowedCharacter;
// The preprocessor is reset with the saved skip-next-newline flag rather
// than having the flag assigned directly.
154 m_inputStreamPreprocessor.reset(checkpoint.skipNextNewLine);
155 m_shouldAllowCDATA = checkpoint.shouldAllowCDATA;
// Consumes a character reference from `source` via consumeHTMLEntity() and
// buffers the result as character data: either a literal '&' (on the path
// where decodedEntity is asserted to be empty) or every code unit of the
// decoded entity.
// Callers in nextToken() return haveBufferedCharacterToken() when this
// function returns false.
// NOTE(review): the branch headers and return statements (embedded lines
// 164-165, 168, 171-173) are not present in this copy; presumably the
// function returns false when notEnoughCharacters is set, and the '&' path
// is taken when `success` is false - confirm against the full file.
158 inline bool HTMLTokenizer::processEntity(SegmentedString& source)
160 bool notEnoughCharacters = false;
161 DecodedHTMLEntity decodedEntity;
162 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
163 if (notEnoughCharacters)
166 ASSERT(decodedEntity.isEmpty());
167 bufferCharacter('&');
169 for (unsigned i = 0; i < decodedEntity.length; ++i)
170 bufferCharacter(decodedEntity.data[i]);
// Turns the buffered end tag name into a real end tag token. The current
// input character is consumed, the token is started with
// m_bufferedEndTagName, and the three helper buffers are cleared.
// Precondition (asserted): the current token is either a Character token or
// still Uninitialized.
// NOTE(review): the statement guarded by the Character check and the final
// return are not present in this copy; presumably a pending Character token
// makes the function return true early, so that it is emitted first, and
// false is returned otherwise - confirm against the full file.
175 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
177 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized);
178 source.advanceAndUpdateLineNumber();
179 if (m_token->type() == HTMLToken::Character)
181 m_token->beginEndTag(m_bufferedEndTagName);
182 m_bufferedEndTagName.clear();
183 m_appropriateEndTagName.clear();
184 m_temporaryBuffer.clear();
// Used by the "end tag name" states once an appropriate end tag has been
// recognized. It records the new state, calls flushBufferedEndTag(source),
// and then either returns haveBufferedCharacterToken() when no further input
// can be peeked, or loads the next input character into `cc`.
// Expects `source` and `cc` to be in scope at the expansion site.
// NOTE(review): embedded lines 189, 192 and 197-198 are not present in this
// copy (presumably the enclosing do/while, the statement executed when
// flushBufferedEndTag() returns true, and the jump to the new state) -
// confirm against the full file.
188 #define FLUSH_AND_ADVANCE_TO(stateName) \
190 m_state = HTMLTokenizer::stateName; \
191 if (flushBufferedEndTag(source)) \
193 if (source.isEmpty() \
194 || !m_inputStreamPreprocessor.peek(source)) \
195 return haveBufferedCharacterToken(); \
196 cc = m_inputStreamPreprocessor.nextInputCharacter(); \
// Called from the "end tag name" states when '>' completes an appropriate
// end tag; the call sites pass HTMLTokenizer::DataState as `state`. The
// visible statement flushes the buffered end tag.
// NOTE(review): embedded lines 201-202 and 204-205 are not present in this
// copy, so how `state` is applied and what is returned cannot be seen here -
// confirm against the full file.
200 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state)
203 flushBufferedEndTag(source);
207 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
209 // If we have a token in progress, then we're supposed to be called back
210 // with the same token so we can finish it.
211 ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
214 if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
215 // FIXME: This should call flushBufferedEndTag().
216 // We started an end tag during our last iteration.
217 m_token->beginEndTag(m_bufferedEndTagName);
218 m_bufferedEndTagName.clear();
219 m_appropriateEndTagName.clear();
220 m_temporaryBuffer.clear();
221 if (m_state == HTMLTokenizer::DataState) {
222 // We're back in the data state, so we must be done with the tag.
227 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source))
228 return haveBufferedCharacterToken();
229 UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
231 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
233 HTML_BEGIN_STATE(DataState) {
235 HTML_ADVANCE_TO(CharacterReferenceInDataState);
236 else if (cc == '<') {
237 if (m_token->type() == HTMLToken::Character) {
238 // We have a bunch of character tokens queued up that we
239 // are emitting lazily here.
242 HTML_ADVANCE_TO(TagOpenState);
243 } else if (cc == kEndOfFileMarker)
244 return emitEndOfFile(source);
247 HTML_ADVANCE_TO(DataState);
252 HTML_BEGIN_STATE(CharacterReferenceInDataState) {
253 if (!processEntity(source))
254 return haveBufferedCharacterToken();
255 HTML_SWITCH_TO(DataState);
259 HTML_BEGIN_STATE(RCDATAState) {
261 HTML_ADVANCE_TO(CharacterReferenceInRCDATAState);
263 HTML_ADVANCE_TO(RCDATALessThanSignState);
264 else if (cc == kEndOfFileMarker)
265 return emitEndOfFile(source);
268 HTML_ADVANCE_TO(RCDATAState);
273 HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) {
274 if (!processEntity(source))
275 return haveBufferedCharacterToken();
276 HTML_SWITCH_TO(RCDATAState);
280 HTML_BEGIN_STATE(RAWTEXTState) {
282 HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
283 else if (cc == kEndOfFileMarker)
284 return emitEndOfFile(source);
287 HTML_ADVANCE_TO(RAWTEXTState);
292 HTML_BEGIN_STATE(ScriptDataState) {
294 HTML_ADVANCE_TO(ScriptDataLessThanSignState);
295 else if (cc == kEndOfFileMarker)
296 return emitEndOfFile(source);
299 HTML_ADVANCE_TO(ScriptDataState);
304 HTML_BEGIN_STATE(PLAINTEXTState) {
305 if (cc == kEndOfFileMarker)
306 return emitEndOfFile(source);
308 HTML_ADVANCE_TO(PLAINTEXTState);
312 HTML_BEGIN_STATE(TagOpenState) {
314 HTML_ADVANCE_TO(MarkupDeclarationOpenState);
316 HTML_ADVANCE_TO(EndTagOpenState);
317 else if (isASCIIUpper(cc)) {
318 m_token->beginStartTag(toLowerCase(cc));
319 HTML_ADVANCE_TO(TagNameState);
320 } else if (isASCIILower(cc)) {
321 m_token->beginStartTag(cc);
322 HTML_ADVANCE_TO(TagNameState);
323 } else if (cc == '?') {
325 // The spec consumes the current character before switching
326 // to the bogus comment state, but it's easier to implement
327 // if we reconsume the current character.
328 HTML_RECONSUME_IN(BogusCommentState);
331 bufferCharacter('<');
332 HTML_RECONSUME_IN(DataState);
337 HTML_BEGIN_STATE(EndTagOpenState) {
338 if (isASCIIUpper(cc)) {
339 m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc)));
340 m_appropriateEndTagName.clear();
341 HTML_ADVANCE_TO(TagNameState);
342 } else if (isASCIILower(cc)) {
343 m_token->beginEndTag(static_cast<LChar>(cc));
344 m_appropriateEndTagName.clear();
345 HTML_ADVANCE_TO(TagNameState);
346 } else if (cc == '>') {
348 HTML_ADVANCE_TO(DataState);
349 } else if (cc == kEndOfFileMarker) {
351 bufferCharacter('<');
352 bufferCharacter('/');
353 HTML_RECONSUME_IN(DataState);
356 HTML_RECONSUME_IN(BogusCommentState);
361 HTML_BEGIN_STATE(TagNameState) {
362 if (isTokenizerWhitespace(cc))
363 HTML_ADVANCE_TO(BeforeAttributeNameState);
365 HTML_ADVANCE_TO(SelfClosingStartTagState);
367 return emitAndResumeIn(source, HTMLTokenizer::DataState);
368 else if (isASCIIUpper(cc)) {
369 m_token->appendToName(toLowerCase(cc));
370 HTML_ADVANCE_TO(TagNameState);
371 } else if (cc == kEndOfFileMarker) {
373 HTML_RECONSUME_IN(DataState);
375 m_token->appendToName(cc);
376 HTML_ADVANCE_TO(TagNameState);
381 HTML_BEGIN_STATE(RCDATALessThanSignState) {
383 m_temporaryBuffer.clear();
384 ASSERT(m_bufferedEndTagName.isEmpty());
385 HTML_ADVANCE_TO(RCDATAEndTagOpenState);
387 bufferCharacter('<');
388 HTML_RECONSUME_IN(RCDATAState);
393 HTML_BEGIN_STATE(RCDATAEndTagOpenState) {
394 if (isASCIIUpper(cc)) {
395 m_temporaryBuffer.append(static_cast<LChar>(cc));
396 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
397 HTML_ADVANCE_TO(RCDATAEndTagNameState);
398 } else if (isASCIILower(cc)) {
399 m_temporaryBuffer.append(static_cast<LChar>(cc));
400 addToPossibleEndTag(static_cast<LChar>(cc));
401 HTML_ADVANCE_TO(RCDATAEndTagNameState);
403 bufferCharacter('<');
404 bufferCharacter('/');
405 HTML_RECONSUME_IN(RCDATAState);
410 HTML_BEGIN_STATE(RCDATAEndTagNameState) {
411 if (isASCIIUpper(cc)) {
412 m_temporaryBuffer.append(static_cast<LChar>(cc));
413 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
414 HTML_ADVANCE_TO(RCDATAEndTagNameState);
415 } else if (isASCIILower(cc)) {
416 m_temporaryBuffer.append(static_cast<LChar>(cc));
417 addToPossibleEndTag(static_cast<LChar>(cc));
418 HTML_ADVANCE_TO(RCDATAEndTagNameState);
420 if (isTokenizerWhitespace(cc)) {
421 if (isAppropriateEndTag()) {
422 m_temporaryBuffer.append(static_cast<LChar>(cc));
423 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
425 } else if (cc == '/') {
426 if (isAppropriateEndTag()) {
427 m_temporaryBuffer.append(static_cast<LChar>(cc));
428 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
430 } else if (cc == '>') {
431 if (isAppropriateEndTag()) {
432 m_temporaryBuffer.append(static_cast<LChar>(cc));
433 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
436 bufferCharacter('<');
437 bufferCharacter('/');
438 m_token->appendToCharacter(m_temporaryBuffer);
439 m_bufferedEndTagName.clear();
440 m_temporaryBuffer.clear();
441 HTML_RECONSUME_IN(RCDATAState);
446 HTML_BEGIN_STATE(RAWTEXTLessThanSignState) {
448 m_temporaryBuffer.clear();
449 ASSERT(m_bufferedEndTagName.isEmpty());
450 HTML_ADVANCE_TO(RAWTEXTEndTagOpenState);
452 bufferCharacter('<');
453 HTML_RECONSUME_IN(RAWTEXTState);
458 HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) {
459 if (isASCIIUpper(cc)) {
460 m_temporaryBuffer.append(static_cast<LChar>(cc));
461 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
462 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
463 } else if (isASCIILower(cc)) {
464 m_temporaryBuffer.append(static_cast<LChar>(cc));
465 addToPossibleEndTag(static_cast<LChar>(cc));
466 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
468 bufferCharacter('<');
469 bufferCharacter('/');
470 HTML_RECONSUME_IN(RAWTEXTState);
475 HTML_BEGIN_STATE(RAWTEXTEndTagNameState) {
476 if (isASCIIUpper(cc)) {
477 m_temporaryBuffer.append(static_cast<LChar>(cc));
478 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
479 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
480 } else if (isASCIILower(cc)) {
481 m_temporaryBuffer.append(static_cast<LChar>(cc));
482 addToPossibleEndTag(static_cast<LChar>(cc));
483 HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
485 if (isTokenizerWhitespace(cc)) {
486 if (isAppropriateEndTag()) {
487 m_temporaryBuffer.append(static_cast<LChar>(cc));
488 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
490 } else if (cc == '/') {
491 if (isAppropriateEndTag()) {
492 m_temporaryBuffer.append(static_cast<LChar>(cc));
493 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
495 } else if (cc == '>') {
496 if (isAppropriateEndTag()) {
497 m_temporaryBuffer.append(static_cast<LChar>(cc));
498 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
501 bufferCharacter('<');
502 bufferCharacter('/');
503 m_token->appendToCharacter(m_temporaryBuffer);
504 m_bufferedEndTagName.clear();
505 m_temporaryBuffer.clear();
506 HTML_RECONSUME_IN(RAWTEXTState);
511 HTML_BEGIN_STATE(ScriptDataLessThanSignState) {
513 m_temporaryBuffer.clear();
514 ASSERT(m_bufferedEndTagName.isEmpty());
515 HTML_ADVANCE_TO(ScriptDataEndTagOpenState);
516 } else if (cc == '!') {
517 bufferCharacter('<');
518 bufferCharacter('!');
519 HTML_ADVANCE_TO(ScriptDataEscapeStartState);
521 bufferCharacter('<');
522 HTML_RECONSUME_IN(ScriptDataState);
527 HTML_BEGIN_STATE(ScriptDataEndTagOpenState) {
528 if (isASCIIUpper(cc)) {
529 m_temporaryBuffer.append(static_cast<LChar>(cc));
530 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
531 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
532 } else if (isASCIILower(cc)) {
533 m_temporaryBuffer.append(static_cast<LChar>(cc));
534 addToPossibleEndTag(static_cast<LChar>(cc));
535 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
537 bufferCharacter('<');
538 bufferCharacter('/');
539 HTML_RECONSUME_IN(ScriptDataState);
544 HTML_BEGIN_STATE(ScriptDataEndTagNameState) {
545 if (isASCIIUpper(cc)) {
546 m_temporaryBuffer.append(static_cast<LChar>(cc));
547 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
548 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
549 } else if (isASCIILower(cc)) {
550 m_temporaryBuffer.append(static_cast<LChar>(cc));
551 addToPossibleEndTag(static_cast<LChar>(cc));
552 HTML_ADVANCE_TO(ScriptDataEndTagNameState);
554 if (isTokenizerWhitespace(cc)) {
555 if (isAppropriateEndTag()) {
556 m_temporaryBuffer.append(static_cast<LChar>(cc));
557 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
559 } else if (cc == '/') {
560 if (isAppropriateEndTag()) {
561 m_temporaryBuffer.append(static_cast<LChar>(cc));
562 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
564 } else if (cc == '>') {
565 if (isAppropriateEndTag()) {
566 m_temporaryBuffer.append(static_cast<LChar>(cc));
567 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
570 bufferCharacter('<');
571 bufferCharacter('/');
572 m_token->appendToCharacter(m_temporaryBuffer);
573 m_bufferedEndTagName.clear();
574 m_temporaryBuffer.clear();
575 HTML_RECONSUME_IN(ScriptDataState);
580 HTML_BEGIN_STATE(ScriptDataEscapeStartState) {
583 HTML_ADVANCE_TO(ScriptDataEscapeStartDashState);
585 HTML_RECONSUME_IN(ScriptDataState);
589 HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) {
592 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
594 HTML_RECONSUME_IN(ScriptDataState);
598 HTML_BEGIN_STATE(ScriptDataEscapedState) {
601 HTML_ADVANCE_TO(ScriptDataEscapedDashState);
602 } else if (cc == '<')
603 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
604 else if (cc == kEndOfFileMarker) {
606 HTML_RECONSUME_IN(DataState);
609 HTML_ADVANCE_TO(ScriptDataEscapedState);
614 HTML_BEGIN_STATE(ScriptDataEscapedDashState) {
617 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
618 } else if (cc == '<')
619 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
620 else if (cc == kEndOfFileMarker) {
622 HTML_RECONSUME_IN(DataState);
625 HTML_ADVANCE_TO(ScriptDataEscapedState);
630 HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) {
633 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
634 } else if (cc == '<')
635 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
636 else if (cc == '>') {
638 HTML_ADVANCE_TO(ScriptDataState);
639 } else if (cc == kEndOfFileMarker) {
641 HTML_RECONSUME_IN(DataState);
644 HTML_ADVANCE_TO(ScriptDataEscapedState);
649 HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
651 m_temporaryBuffer.clear();
652 ASSERT(m_bufferedEndTagName.isEmpty());
653 HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
654 } else if (isASCIIUpper(cc)) {
655 bufferCharacter('<');
657 m_temporaryBuffer.clear();
658 m_temporaryBuffer.append(toLowerCase(cc));
659 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
660 } else if (isASCIILower(cc)) {
661 bufferCharacter('<');
663 m_temporaryBuffer.clear();
664 m_temporaryBuffer.append(static_cast<LChar>(cc));
665 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
667 bufferCharacter('<');
668 HTML_RECONSUME_IN(ScriptDataEscapedState);
673 HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
674 if (isASCIIUpper(cc)) {
675 m_temporaryBuffer.append(static_cast<LChar>(cc));
676 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
677 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
678 } else if (isASCIILower(cc)) {
679 m_temporaryBuffer.append(static_cast<LChar>(cc));
680 addToPossibleEndTag(static_cast<LChar>(cc));
681 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
683 bufferCharacter('<');
684 bufferCharacter('/');
685 HTML_RECONSUME_IN(ScriptDataEscapedState);
690 HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
691 if (isASCIIUpper(cc)) {
692 m_temporaryBuffer.append(static_cast<LChar>(cc));
693 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
694 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
695 } else if (isASCIILower(cc)) {
696 m_temporaryBuffer.append(static_cast<LChar>(cc));
697 addToPossibleEndTag(static_cast<LChar>(cc));
698 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
700 if (isTokenizerWhitespace(cc)) {
701 if (isAppropriateEndTag()) {
702 m_temporaryBuffer.append(static_cast<LChar>(cc));
703 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
705 } else if (cc == '/') {
706 if (isAppropriateEndTag()) {
707 m_temporaryBuffer.append(static_cast<LChar>(cc));
708 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
710 } else if (cc == '>') {
711 if (isAppropriateEndTag()) {
712 m_temporaryBuffer.append(static_cast<LChar>(cc));
713 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
716 bufferCharacter('<');
717 bufferCharacter('/');
718 m_token->appendToCharacter(m_temporaryBuffer);
719 m_bufferedEndTagName.clear();
720 m_temporaryBuffer.clear();
721 HTML_RECONSUME_IN(ScriptDataEscapedState);
726 HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
727 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
729 if (temporaryBufferIs(scriptTag.localName()))
730 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
732 HTML_ADVANCE_TO(ScriptDataEscapedState);
733 } else if (isASCIIUpper(cc)) {
735 m_temporaryBuffer.append(toLowerCase(cc));
736 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
737 } else if (isASCIILower(cc)) {
739 m_temporaryBuffer.append(static_cast<LChar>(cc));
740 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
742 HTML_RECONSUME_IN(ScriptDataEscapedState);
746 HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) {
749 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState);
750 } else if (cc == '<') {
752 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
753 } else if (cc == kEndOfFileMarker) {
755 HTML_RECONSUME_IN(DataState);
758 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
763 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
766 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
767 } else if (cc == '<') {
769 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
770 } else if (cc == kEndOfFileMarker) {
772 HTML_RECONSUME_IN(DataState);
775 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
780 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
783 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
784 } else if (cc == '<') {
786 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
787 } else if (cc == '>') {
789 HTML_ADVANCE_TO(ScriptDataState);
790 } else if (cc == kEndOfFileMarker) {
792 HTML_RECONSUME_IN(DataState);
795 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
800 HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
803 m_temporaryBuffer.clear();
804 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
806 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
810 HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
811 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
813 if (temporaryBufferIs(scriptTag.localName()))
814 HTML_ADVANCE_TO(ScriptDataEscapedState);
816 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
817 } else if (isASCIIUpper(cc)) {
819 m_temporaryBuffer.append(toLowerCase(cc));
820 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
821 } else if (isASCIILower(cc)) {
823 m_temporaryBuffer.append(static_cast<LChar>(cc));
824 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
826 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
830 HTML_BEGIN_STATE(BeforeAttributeNameState) {
831 if (isTokenizerWhitespace(cc))
832 HTML_ADVANCE_TO(BeforeAttributeNameState);
834 HTML_ADVANCE_TO(SelfClosingStartTagState);
836 return emitAndResumeIn(source, HTMLTokenizer::DataState);
837 else if (isASCIIUpper(cc)) {
838 m_token->addNewAttribute();
839 m_token->beginAttributeName(source.numberOfCharactersConsumed());
840 m_token->appendToAttributeName(toLowerCase(cc));
841 HTML_ADVANCE_TO(AttributeNameState);
842 } else if (cc == kEndOfFileMarker) {
844 HTML_RECONSUME_IN(DataState);
846 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
848 m_token->addNewAttribute();
849 m_token->beginAttributeName(source.numberOfCharactersConsumed());
850 m_token->appendToAttributeName(cc);
851 HTML_ADVANCE_TO(AttributeNameState);
856 HTML_BEGIN_STATE(AttributeNameState) {
857 if (isTokenizerWhitespace(cc)) {
858 m_token->endAttributeName(source.numberOfCharactersConsumed());
859 HTML_ADVANCE_TO(AfterAttributeNameState);
860 } else if (cc == '/') {
861 m_token->endAttributeName(source.numberOfCharactersConsumed());
862 HTML_ADVANCE_TO(SelfClosingStartTagState);
863 } else if (cc == '=') {
864 m_token->endAttributeName(source.numberOfCharactersConsumed());
865 HTML_ADVANCE_TO(BeforeAttributeValueState);
866 } else if (cc == '>') {
867 m_token->endAttributeName(source.numberOfCharactersConsumed());
868 return emitAndResumeIn(source, HTMLTokenizer::DataState);
869 } else if (isASCIIUpper(cc)) {
870 m_token->appendToAttributeName(toLowerCase(cc));
871 HTML_ADVANCE_TO(AttributeNameState);
872 } else if (cc == kEndOfFileMarker) {
874 m_token->endAttributeName(source.numberOfCharactersConsumed());
875 HTML_RECONSUME_IN(DataState);
877 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
879 m_token->appendToAttributeName(cc);
880 HTML_ADVANCE_TO(AttributeNameState);
885 HTML_BEGIN_STATE(AfterAttributeNameState) {
886 if (isTokenizerWhitespace(cc))
887 HTML_ADVANCE_TO(AfterAttributeNameState);
889 HTML_ADVANCE_TO(SelfClosingStartTagState);
891 HTML_ADVANCE_TO(BeforeAttributeValueState);
893 return emitAndResumeIn(source, HTMLTokenizer::DataState);
894 else if (isASCIIUpper(cc)) {
895 m_token->addNewAttribute();
896 m_token->beginAttributeName(source.numberOfCharactersConsumed());
897 m_token->appendToAttributeName(toLowerCase(cc));
898 HTML_ADVANCE_TO(AttributeNameState);
899 } else if (cc == kEndOfFileMarker) {
901 HTML_RECONSUME_IN(DataState);
903 if (cc == '"' || cc == '\'' || cc == '<')
905 m_token->addNewAttribute();
906 m_token->beginAttributeName(source.numberOfCharactersConsumed());
907 m_token->appendToAttributeName(cc);
908 HTML_ADVANCE_TO(AttributeNameState);
913 HTML_BEGIN_STATE(BeforeAttributeValueState) {
914 if (isTokenizerWhitespace(cc))
915 HTML_ADVANCE_TO(BeforeAttributeValueState);
916 else if (cc == '"') {
917 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
918 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
919 } else if (cc == '&') {
920 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
921 HTML_RECONSUME_IN(AttributeValueUnquotedState);
922 } else if (cc == '\'') {
923 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
924 HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
925 } else if (cc == '>') {
927 return emitAndResumeIn(source, HTMLTokenizer::DataState);
928 } else if (cc == kEndOfFileMarker) {
930 HTML_RECONSUME_IN(DataState);
932 if (cc == '<' || cc == '=' || cc == '`')
934 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
935 m_token->appendToAttributeValue(cc);
936 HTML_ADVANCE_TO(AttributeValueUnquotedState);
941 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
943 m_token->endAttributeValue(source.numberOfCharactersConsumed());
944 HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
945 } else if (cc == '&') {
946 m_additionalAllowedCharacter = '"';
947 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
948 } else if (cc == kEndOfFileMarker) {
950 m_token->endAttributeValue(source.numberOfCharactersConsumed());
951 HTML_RECONSUME_IN(DataState);
953 m_token->appendToAttributeValue(cc);
954 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
959 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
961 m_token->endAttributeValue(source.numberOfCharactersConsumed());
962 HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
963 } else if (cc == '&') {
964 m_additionalAllowedCharacter = '\'';
965 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
966 } else if (cc == kEndOfFileMarker) {
968 m_token->endAttributeValue(source.numberOfCharactersConsumed());
969 HTML_RECONSUME_IN(DataState);
971 m_token->appendToAttributeValue(cc);
972 HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
977 HTML_BEGIN_STATE(AttributeValueUnquotedState) {
978 if (isTokenizerWhitespace(cc)) {
979 m_token->endAttributeValue(source.numberOfCharactersConsumed());
980 HTML_ADVANCE_TO(BeforeAttributeNameState);
981 } else if (cc == '&') {
982 m_additionalAllowedCharacter = '>';
983 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
984 } else if (cc == '>') {
985 m_token->endAttributeValue(source.numberOfCharactersConsumed());
986 return emitAndResumeIn(source, HTMLTokenizer::DataState);
987 } else if (cc == kEndOfFileMarker) {
989 m_token->endAttributeValue(source.numberOfCharactersConsumed());
990 HTML_RECONSUME_IN(DataState);
992 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
994 m_token->appendToAttributeValue(cc);
995 HTML_ADVANCE_TO(AttributeValueUnquotedState);
1000 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
1001 bool notEnoughCharacters = false;
1002 DecodedHTMLEntity decodedEntity;
1003 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
1004 if (notEnoughCharacters)
1005 return haveBufferedCharacterToken();
1007 ASSERT(decodedEntity.isEmpty());
1008 m_token->appendToAttributeValue('&');
1010 for (unsigned i = 0; i < decodedEntity.length; ++i)
1011 m_token->appendToAttributeValue(decodedEntity.data[i]);
1013 // We're supposed to switch back to the attribute value state that
1014 // we were in when we were switched into this state. Rather than
1015 // keeping track of this explicitly, we observe that the previous
1016 // state can be determined by m_additionalAllowedCharacter.
1017 if (m_additionalAllowedCharacter == '"')
1018 HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
1019 else if (m_additionalAllowedCharacter == '\'')
1020 HTML_SWITCH_TO(AttributeValueSingleQuotedState);
1021 else if (m_additionalAllowedCharacter == '>')
1022 HTML_SWITCH_TO(AttributeValueUnquotedState);
1024 ASSERT_NOT_REACHED();
1028 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
1029 if (isTokenizerWhitespace(cc))
1030 HTML_ADVANCE_TO(BeforeAttributeNameState);
1032 HTML_ADVANCE_TO(SelfClosingStartTagState);
1034 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1035 else if (cc == kEndOfFileMarker) {
1037 HTML_RECONSUME_IN(DataState);
1040 HTML_RECONSUME_IN(BeforeAttributeNameState);
1045 HTML_BEGIN_STATE(SelfClosingStartTagState) {
1047 m_token->setSelfClosing();
1048 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1049 } else if (cc == kEndOfFileMarker) {
1051 HTML_RECONSUME_IN(DataState);
1054 HTML_RECONSUME_IN(BeforeAttributeNameState);
1059 HTML_BEGIN_STATE(BogusCommentState) {
1060 m_token->beginComment();
1061 HTML_RECONSUME_IN(ContinueBogusCommentState);
1065 HTML_BEGIN_STATE(ContinueBogusCommentState) {
1067 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1068 else if (cc == kEndOfFileMarker)
1069 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1071 m_token->appendToComment(cc);
1072 HTML_ADVANCE_TO(ContinueBogusCommentState);
1077 HTML_BEGIN_STATE(MarkupDeclarationOpenState) {
1079 SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::dashDash);
1080 if (result == SegmentedString::DidMatch) {
1081 source.advanceAndASSERT('-');
1082 source.advanceAndASSERT('-');
1083 m_token->beginComment();
1084 HTML_SWITCH_TO(CommentStartState);
1085 } else if (result == SegmentedString::NotEnoughCharacters)
1086 return haveBufferedCharacterToken();
1087 } else if (cc == 'D' || cc == 'd') {
1088 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::doctype);
1089 if (result == SegmentedString::DidMatch) {
1090 advanceStringAndASSERTIgnoringCase(source, "doctype");
1091 HTML_SWITCH_TO(DOCTYPEState);
1092 } else if (result == SegmentedString::NotEnoughCharacters)
1093 return haveBufferedCharacterToken();
1094 } else if (cc == '[' && shouldAllowCDATA()) {
1095 SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::cdata);
1096 if (result == SegmentedString::DidMatch) {
1097 advanceStringAndASSERT(source, "[CDATA[");
1098 HTML_SWITCH_TO(CDATASectionState);
1099 } else if (result == SegmentedString::NotEnoughCharacters)
1100 return haveBufferedCharacterToken();
1103 HTML_RECONSUME_IN(BogusCommentState);
1107 HTML_BEGIN_STATE(CommentStartState) {
1109 HTML_ADVANCE_TO(CommentStartDashState);
1110 else if (cc == '>') {
1112 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1113 } else if (cc == kEndOfFileMarker) {
1115 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1117 m_token->appendToComment(cc);
1118 HTML_ADVANCE_TO(CommentState);
1123 HTML_BEGIN_STATE(CommentStartDashState) {
1125 HTML_ADVANCE_TO(CommentEndState);
1126 else if (cc == '>') {
1128 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1129 } else if (cc == kEndOfFileMarker) {
1131 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1133 m_token->appendToComment('-');
1134 m_token->appendToComment(cc);
1135 HTML_ADVANCE_TO(CommentState);
1140 HTML_BEGIN_STATE(CommentState) {
1142 HTML_ADVANCE_TO(CommentEndDashState);
1143 else if (cc == kEndOfFileMarker) {
1145 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1147 m_token->appendToComment(cc);
1148 HTML_ADVANCE_TO(CommentState);
1153 HTML_BEGIN_STATE(CommentEndDashState) {
1155 HTML_ADVANCE_TO(CommentEndState);
1156 else if (cc == kEndOfFileMarker) {
1158 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1160 m_token->appendToComment('-');
1161 m_token->appendToComment(cc);
1162 HTML_ADVANCE_TO(CommentState);
1167 HTML_BEGIN_STATE(CommentEndState) {
1169 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1170 else if (cc == '!') {
1172 HTML_ADVANCE_TO(CommentEndBangState);
1173 } else if (cc == '-') {
1175 m_token->appendToComment('-');
1176 HTML_ADVANCE_TO(CommentEndState);
1177 } else if (cc == kEndOfFileMarker) {
1179 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1182 m_token->appendToComment('-');
1183 m_token->appendToComment('-');
1184 m_token->appendToComment(cc);
1185 HTML_ADVANCE_TO(CommentState);
1190 HTML_BEGIN_STATE(CommentEndBangState) {
1192 m_token->appendToComment('-');
1193 m_token->appendToComment('-');
1194 m_token->appendToComment('!');
1195 HTML_ADVANCE_TO(CommentEndDashState);
1196 } else if (cc == '>')
1197 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1198 else if (cc == kEndOfFileMarker) {
1200 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1202 m_token->appendToComment('-');
1203 m_token->appendToComment('-');
1204 m_token->appendToComment('!');
1205 m_token->appendToComment(cc);
1206 HTML_ADVANCE_TO(CommentState);
1211 HTML_BEGIN_STATE(DOCTYPEState) {
1212 if (isTokenizerWhitespace(cc))
1213 HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1214 else if (cc == kEndOfFileMarker) {
1216 m_token->beginDOCTYPE();
1217 m_token->setForceQuirks();
1218 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1221 HTML_RECONSUME_IN(BeforeDOCTYPENameState);
1226 HTML_BEGIN_STATE(BeforeDOCTYPENameState) {
1227 if (isTokenizerWhitespace(cc))
1228 HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1229 else if (isASCIIUpper(cc)) {
1230 m_token->beginDOCTYPE(toLowerCase(cc));
1231 HTML_ADVANCE_TO(DOCTYPENameState);
1232 } else if (cc == '>') {
1234 m_token->beginDOCTYPE();
1235 m_token->setForceQuirks();
1236 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1237 } else if (cc == kEndOfFileMarker) {
1239 m_token->beginDOCTYPE();
1240 m_token->setForceQuirks();
1241 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1243 m_token->beginDOCTYPE(cc);
1244 HTML_ADVANCE_TO(DOCTYPENameState);
1249 HTML_BEGIN_STATE(DOCTYPENameState) {
1250 if (isTokenizerWhitespace(cc))
1251 HTML_ADVANCE_TO(AfterDOCTYPENameState);
1253 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1254 else if (isASCIIUpper(cc)) {
1255 m_token->appendToName(toLowerCase(cc));
1256 HTML_ADVANCE_TO(DOCTYPENameState);
1257 } else if (cc == kEndOfFileMarker) {
1259 m_token->setForceQuirks();
1260 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1262 m_token->appendToName(cc);
1263 HTML_ADVANCE_TO(DOCTYPENameState);
1268 HTML_BEGIN_STATE(AfterDOCTYPENameState) {
1269 if (isTokenizerWhitespace(cc))
1270 HTML_ADVANCE_TO(AfterDOCTYPENameState);
1272 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1273 else if (cc == kEndOfFileMarker) {
1275 m_token->setForceQuirks();
1276 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1278 if (cc == 'P' || cc == 'p') {
1279 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::publicString);
1280 if (result == SegmentedString::DidMatch) {
1281 advanceStringAndASSERTIgnoringCase(source, "public");
1282 HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1283 } else if (result == SegmentedString::NotEnoughCharacters)
1284 return haveBufferedCharacterToken();
1285 } else if (cc == 'S' || cc == 's') {
1286 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::system);
1287 if (result == SegmentedString::DidMatch) {
1288 advanceStringAndASSERTIgnoringCase(source, "system");
1289 HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState);
1290 } else if (result == SegmentedString::NotEnoughCharacters)
1291 return haveBufferedCharacterToken();
1294 m_token->setForceQuirks();
1295 HTML_ADVANCE_TO(BogusDOCTYPEState);
1300 HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
1301 if (isTokenizerWhitespace(cc))
1302 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1303 else if (cc == '"') {
1305 m_token->setPublicIdentifierToEmptyString();
1306 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1307 } else if (cc == '\'') {
1309 m_token->setPublicIdentifierToEmptyString();
1310 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1311 } else if (cc == '>') {
1313 m_token->setForceQuirks();
1314 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1315 } else if (cc == kEndOfFileMarker) {
1317 m_token->setForceQuirks();
1318 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1321 m_token->setForceQuirks();
1322 HTML_ADVANCE_TO(BogusDOCTYPEState);
1327 HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
1328 if (isTokenizerWhitespace(cc))
1329 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1330 else if (cc == '"') {
1331 m_token->setPublicIdentifierToEmptyString();
1332 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1333 } else if (cc == '\'') {
1334 m_token->setPublicIdentifierToEmptyString();
1335 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1336 } else if (cc == '>') {
1338 m_token->setForceQuirks();
1339 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1340 } else if (cc == kEndOfFileMarker) {
1342 m_token->setForceQuirks();
1343 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1346 m_token->setForceQuirks();
1347 HTML_ADVANCE_TO(BogusDOCTYPEState);
1352 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) {
1354 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1355 else if (cc == '>') {
1357 m_token->setForceQuirks();
1358 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1359 } else if (cc == kEndOfFileMarker) {
1361 m_token->setForceQuirks();
1362 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1364 m_token->appendToPublicIdentifier(cc);
1365 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1370 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) {
1372 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1373 else if (cc == '>') {
1375 m_token->setForceQuirks();
1376 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1377 } else if (cc == kEndOfFileMarker) {
1379 m_token->setForceQuirks();
1380 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1382 m_token->appendToPublicIdentifier(cc);
1383 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1388 HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
1389 if (isTokenizerWhitespace(cc))
1390 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1392 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1393 else if (cc == '"') {
1395 m_token->setSystemIdentifierToEmptyString();
1396 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1397 } else if (cc == '\'') {
1399 m_token->setSystemIdentifierToEmptyString();
1400 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1401 } else if (cc == kEndOfFileMarker) {
1403 m_token->setForceQuirks();
1404 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1407 m_token->setForceQuirks();
1408 HTML_ADVANCE_TO(BogusDOCTYPEState);
1413 HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) {
1414 if (isTokenizerWhitespace(cc))
1415 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1417 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1418 else if (cc == '"') {
1419 m_token->setSystemIdentifierToEmptyString();
1420 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1421 } else if (cc == '\'') {
1422 m_token->setSystemIdentifierToEmptyString();
1423 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1424 } else if (cc == kEndOfFileMarker) {
1426 m_token->setForceQuirks();
1427 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1430 m_token->setForceQuirks();
1431 HTML_ADVANCE_TO(BogusDOCTYPEState);
1436 HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
1437 if (isTokenizerWhitespace(cc))
1438 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1439 else if (cc == '"') {
1441 m_token->setSystemIdentifierToEmptyString();
1442 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1443 } else if (cc == '\'') {
1445 m_token->setSystemIdentifierToEmptyString();
1446 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1447 } else if (cc == '>') {
1449 m_token->setForceQuirks();
1450 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1451 } else if (cc == kEndOfFileMarker) {
1453 m_token->setForceQuirks();
1454 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1457 m_token->setForceQuirks();
1458 HTML_ADVANCE_TO(BogusDOCTYPEState);
1463 HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
1464 if (isTokenizerWhitespace(cc))
1465 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1467 m_token->setSystemIdentifierToEmptyString();
1468 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1469 } else if (cc == '\'') {
1470 m_token->setSystemIdentifierToEmptyString();
1471 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1472 } else if (cc == '>') {
1474 m_token->setForceQuirks();
1475 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1476 } else if (cc == kEndOfFileMarker) {
1478 m_token->setForceQuirks();
1479 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1482 m_token->setForceQuirks();
1483 HTML_ADVANCE_TO(BogusDOCTYPEState);
1488 HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
1490 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1491 else if (cc == '>') {
1493 m_token->setForceQuirks();
1494 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1495 } else if (cc == kEndOfFileMarker) {
1497 m_token->setForceQuirks();
1498 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1500 m_token->appendToSystemIdentifier(cc);
1501 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1506 HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
1508 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1509 else if (cc == '>') {
1511 m_token->setForceQuirks();
1512 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1513 } else if (cc == kEndOfFileMarker) {
1515 m_token->setForceQuirks();
1516 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1518 m_token->appendToSystemIdentifier(cc);
1519 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1524 HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
1525 if (isTokenizerWhitespace(cc))
1526 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1528 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1529 else if (cc == kEndOfFileMarker) {
1531 m_token->setForceQuirks();
1532 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1535 HTML_ADVANCE_TO(BogusDOCTYPEState);
1540 HTML_BEGIN_STATE(BogusDOCTYPEState) {
1542 return emitAndResumeIn(source, HTMLTokenizer::DataState);
1543 else if (cc == kEndOfFileMarker)
1544 return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1545 HTML_ADVANCE_TO(BogusDOCTYPEState);
1549 HTML_BEGIN_STATE(CDATASectionState) {
1551 HTML_ADVANCE_TO(CDATASectionRightSquareBracketState);
1552 else if (cc == kEndOfFileMarker)
1553 HTML_RECONSUME_IN(DataState);
1555 bufferCharacter(cc);
1556 HTML_ADVANCE_TO(CDATASectionState);
1561 HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) {
1563 HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
1565 bufferCharacter(']');
1566 HTML_RECONSUME_IN(CDATASectionState);
1570 HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
1572 HTML_ADVANCE_TO(DataState);
1574 bufferCharacter(']');
1575 bufferCharacter(']');
1576 HTML_RECONSUME_IN(CDATASectionState);
1583 ASSERT_NOT_REACHED();
1587 String HTMLTokenizer::bufferedCharacters() const
1589 // FIXME: Add an assert about m_state.
1590 StringBuilder characters;
1591 characters.reserveCapacity(numberOfBufferedCharacters());
1592 characters.append('<');
1593 characters.append('/');
1594 characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size());
1595 return characters.toString();
1598 void HTMLTokenizer::updateStateFor(const String& tagName)
1600 if (threadSafeMatch(tagName, textareaTag) || threadSafeMatch(tagName, titleTag))
1601 setState(HTMLTokenizer::RCDATAState);
1602 else if (threadSafeMatch(tagName, plaintextTag))
1603 setState(HTMLTokenizer::PLAINTEXTState);
1604 else if (threadSafeMatch(tagName, scriptTag))
1605 setState(HTMLTokenizer::ScriptDataState);
1606 else if (threadSafeMatch(tagName, styleTag)
1607 || threadSafeMatch(tagName, iframeTag)
1608 || threadSafeMatch(tagName, xmpTag)
1609 || (threadSafeMatch(tagName, noembedTag) && m_options.pluginsEnabled)
1610 || threadSafeMatch(tagName, noframesTag)
1611 || (threadSafeMatch(tagName, noscriptTag) && m_options.scriptEnabled))
1612 setState(HTMLTokenizer::RAWTEXTState);
1615 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
1617 return vectorEqualsString(m_temporaryBuffer, expectedString);
1620 inline void HTMLTokenizer::addToPossibleEndTag(LChar cc)
1622 ASSERT(isEndTagBufferingState(m_state));
1623 m_bufferedEndTagName.append(cc);
1626 inline bool HTMLTokenizer::isAppropriateEndTag()
1628 if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size())
1631 size_t numCharacters = m_bufferedEndTagName.size();
1633 for (size_t i = 0; i < numCharacters; i++) {
1634 if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i])
1641 inline void HTMLTokenizer::parseError()