f8b2f6259297c5f9bd1d775162503885723405fe
[platform/framework/web/crosswalk.git] / src / third_party / WebKit / Source / core / html / parser / HTMLTokenizer.cpp
1 /*
2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 #include "config.h"
29 #include "core/html/parser/HTMLTokenizer.h"
30
31 #include "core/HTMLNames.h"
32 #include "core/HTMLTokenizerNames.h"
33 #include "core/html/parser/HTMLEntityParser.h"
34 #include "core/html/parser/HTMLParserIdioms.h"
35 #include "core/html/parser/HTMLTreeBuilder.h"
36 #include "platform/NotImplemented.h"
37 #include "core/xml/parser/MarkupTokenizerInlines.h"
38 #include "wtf/ASCIICType.h"
39 #include "wtf/text/AtomicString.h"
40 #include "wtf/unicode/Unicode.h"
41
42 // Please don't use DEFINE_STATIC_LOCAL in this file. The HTMLTokenizer is used
43 // from multiple threads and DEFINE_STATIC_LOCAL isn't threadsafe.
44 #undef DEFINE_STATIC_LOCAL
45
46 namespace blink {
47
48 using namespace HTMLNames;
49
50 // This has to go in a .cpp file, as the linker doesn't like it being included more than once.
51 // We don't have an HTMLToken.cpp though, so this is the next best place.
52 QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const
53 {
54     return QualifiedName(nullAtom, AtomicString(attribute.name), nullAtom);
55 }
56
57 bool AtomicHTMLToken::usesName() const
58 {
59     return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
60 }
61
62 bool AtomicHTMLToken::usesAttributes() const
63 {
64     return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
65 }
66
67 static inline UChar toLowerCase(UChar cc)
68 {
69     ASSERT(isASCIIUpper(cc));
70     const int lowerCaseOffset = 0x20;
71     return cc + lowerCaseOffset;
72 }
73
74 static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string)
75 {
76     if (vector.size() != string.length())
77         return false;
78
79     if (!string.length())
80         return true;
81
82     return equal(string.impl(), vector.data(), vector.size());
83 }
84
85 static inline bool isEndTagBufferingState(HTMLTokenizer::State state)
86 {
87     switch (state) {
88     case HTMLTokenizer::RCDATAEndTagOpenState:
89     case HTMLTokenizer::RCDATAEndTagNameState:
90     case HTMLTokenizer::RAWTEXTEndTagOpenState:
91     case HTMLTokenizer::RAWTEXTEndTagNameState:
92     case HTMLTokenizer::ScriptDataEndTagOpenState:
93     case HTMLTokenizer::ScriptDataEndTagNameState:
94     case HTMLTokenizer::ScriptDataEscapedEndTagOpenState:
95     case HTMLTokenizer::ScriptDataEscapedEndTagNameState:
96         return true;
97     default:
98         return false;
99     }
100 }
101
// Shorthands that forward to the generic state-machine macros with
// HTMLTokenizer as the first argument. BEGIN_STATE, RECONSUME_IN, ADVANCE_TO
// and SWITCH_TO are defined outside this file (presumably in
// MarkupTokenizerInlines.h -- confirm).
#define HTML_BEGIN_STATE(stateName)  BEGIN_STATE(HTMLTokenizer, stateName)
#define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName)
#define HTML_ADVANCE_TO(stateName)   ADVANCE_TO(HTMLTokenizer, stateName)
#define HTML_SWITCH_TO(stateName)    SWITCH_TO(HTMLTokenizer, stateName)
106
107 HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options)
108     : m_inputStreamPreprocessor(this)
109     , m_options(options)
110 {
111     reset();
112 }
113
114 HTMLTokenizer::~HTMLTokenizer()
115 {
116 }
117
118 void HTMLTokenizer::reset()
119 {
120     m_state = HTMLTokenizer::DataState;
121     m_token = 0;
122     m_forceNullCharacterReplacement = false;
123     m_shouldAllowCDATA = false;
124     m_additionalAllowedCharacter = '\0';
125 }
126
127 bool HTMLTokenizer::canCreateCheckpoint() const
128 {
129     if (!m_appropriateEndTagName.isEmpty())
130         return false;
131     if (!m_temporaryBuffer.isEmpty())
132         return false;
133     if (!m_bufferedEndTagName.isEmpty())
134         return false;
135     return true;
136 }
137
138 void HTMLTokenizer::createCheckpoint(Checkpoint& result) const
139 {
140     ASSERT(canCreateCheckpoint());
141     result.options = m_options;
142     result.state = m_state;
143     result.additionalAllowedCharacter = m_additionalAllowedCharacter;
144     result.skipNextNewLine = m_inputStreamPreprocessor.skipNextNewLine();
145     result.shouldAllowCDATA = m_shouldAllowCDATA;
146 }
147
148 void HTMLTokenizer::restoreFromCheckpoint(const Checkpoint& checkpoint)
149 {
150     m_token = 0;
151     m_options = checkpoint.options;
152     m_state = checkpoint.state;
153     m_additionalAllowedCharacter = checkpoint.additionalAllowedCharacter;
154     m_inputStreamPreprocessor.reset(checkpoint.skipNextNewLine);
155     m_shouldAllowCDATA = checkpoint.shouldAllowCDATA;
156 }
157
158 inline bool HTMLTokenizer::processEntity(SegmentedString& source)
159 {
160     bool notEnoughCharacters = false;
161     DecodedHTMLEntity decodedEntity;
162     bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
163     if (notEnoughCharacters)
164         return false;
165     if (!success) {
166         ASSERT(decodedEntity.isEmpty());
167         bufferCharacter('&');
168     } else {
169         for (unsigned i = 0; i < decodedEntity.length; ++i)
170             bufferCharacter(decodedEntity.data[i]);
171     }
172     return true;
173 }
174
175 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
176 {
177     ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized);
178     source.advanceAndUpdateLineNumber();
179     if (m_token->type() == HTMLToken::Character)
180         return true;
181     m_token->beginEndTag(m_bufferedEndTagName);
182     m_bufferedEndTagName.clear();
183     m_appropriateEndTagName.clear();
184     m_temporaryBuffer.clear();
185     return false;
186 }
187
// Switches to |stateName| after flushing the buffered end tag. Expands to
// code that expects |source| and |cc| to be in scope and a label named
// |stateName| to exist in the enclosing function:
//   1. records the new state in m_state;
//   2. calls flushBufferedEndTag(source) and returns true from the enclosing
//      function if that call returns true;
//   3. returns haveBufferedCharacterToken() if the source is empty or the
//      preprocessor's peek() fails;
//   4. otherwise loads the next input character into |cc| and jumps to the
//      |stateName| label.
#define FLUSH_AND_ADVANCE_TO(stateName) \
    do { \
        m_state = HTMLTokenizer::stateName; \
        if (flushBufferedEndTag(source)) \
            return true; \
        if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) \
            return haveBufferedCharacterToken(); \
        cc = m_inputStreamPreprocessor.nextInputCharacter(); \
        goto stateName; \
    } while (false)
199
200 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state)
201 {
202     m_state = state;
203     flushBufferedEndTag(source);
204     return true;
205 }
206
207 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
208 {
209     // If we have a token in progress, then we're supposed to be called back
210     // with the same token so we can finish it.
211     ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
212     m_token = &token;
213
214     if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
215         // FIXME: This should call flushBufferedEndTag().
216         // We started an end tag during our last iteration.
217         m_token->beginEndTag(m_bufferedEndTagName);
218         m_bufferedEndTagName.clear();
219         m_appropriateEndTagName.clear();
220         m_temporaryBuffer.clear();
221         if (m_state == HTMLTokenizer::DataState) {
222             // We're back in the data state, so we must be done with the tag.
223             return true;
224         }
225     }
226
227     if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source))
228         return haveBufferedCharacterToken();
229     UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
230
231     // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
232     switch (m_state) {
233     HTML_BEGIN_STATE(DataState) {
234         if (cc == '&')
235             HTML_ADVANCE_TO(CharacterReferenceInDataState);
236         else if (cc == '<') {
237             if (m_token->type() == HTMLToken::Character) {
238                 // We have a bunch of character tokens queued up that we
239                 // are emitting lazily here.
240                 return true;
241             }
242             HTML_ADVANCE_TO(TagOpenState);
243         } else if (cc == kEndOfFileMarker)
244             return emitEndOfFile(source);
245         else {
246             bufferCharacter(cc);
247             HTML_ADVANCE_TO(DataState);
248         }
249     }
250     END_STATE()
251
252     HTML_BEGIN_STATE(CharacterReferenceInDataState) {
253         if (!processEntity(source))
254             return haveBufferedCharacterToken();
255         HTML_SWITCH_TO(DataState);
256     }
257     END_STATE()
258
259     HTML_BEGIN_STATE(RCDATAState) {
260         if (cc == '&')
261             HTML_ADVANCE_TO(CharacterReferenceInRCDATAState);
262         else if (cc == '<')
263             HTML_ADVANCE_TO(RCDATALessThanSignState);
264         else if (cc == kEndOfFileMarker)
265             return emitEndOfFile(source);
266         else {
267             bufferCharacter(cc);
268             HTML_ADVANCE_TO(RCDATAState);
269         }
270     }
271     END_STATE()
272
273     HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) {
274         if (!processEntity(source))
275             return haveBufferedCharacterToken();
276         HTML_SWITCH_TO(RCDATAState);
277     }
278     END_STATE()
279
280     HTML_BEGIN_STATE(RAWTEXTState) {
281         if (cc == '<')
282             HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
283         else if (cc == kEndOfFileMarker)
284             return emitEndOfFile(source);
285         else {
286             bufferCharacter(cc);
287             HTML_ADVANCE_TO(RAWTEXTState);
288         }
289     }
290     END_STATE()
291
292     HTML_BEGIN_STATE(ScriptDataState) {
293         if (cc == '<')
294             HTML_ADVANCE_TO(ScriptDataLessThanSignState);
295         else if (cc == kEndOfFileMarker)
296             return emitEndOfFile(source);
297         else {
298             bufferCharacter(cc);
299             HTML_ADVANCE_TO(ScriptDataState);
300         }
301     }
302     END_STATE()
303
304     HTML_BEGIN_STATE(PLAINTEXTState) {
305         if (cc == kEndOfFileMarker)
306             return emitEndOfFile(source);
307         bufferCharacter(cc);
308         HTML_ADVANCE_TO(PLAINTEXTState);
309     }
310     END_STATE()
311
312     HTML_BEGIN_STATE(TagOpenState) {
313         if (cc == '!')
314             HTML_ADVANCE_TO(MarkupDeclarationOpenState);
315         else if (cc == '/')
316             HTML_ADVANCE_TO(EndTagOpenState);
317         else if (isASCIIUpper(cc)) {
318             m_token->beginStartTag(toLowerCase(cc));
319             HTML_ADVANCE_TO(TagNameState);
320         } else if (isASCIILower(cc)) {
321             m_token->beginStartTag(cc);
322             HTML_ADVANCE_TO(TagNameState);
323         } else if (cc == '?') {
324             parseError();
325             // The spec consumes the current character before switching
326             // to the bogus comment state, but it's easier to implement
327             // if we reconsume the current character.
328             HTML_RECONSUME_IN(BogusCommentState);
329         } else {
330             parseError();
331             bufferCharacter('<');
332             HTML_RECONSUME_IN(DataState);
333         }
334     }
335     END_STATE()
336
337     HTML_BEGIN_STATE(EndTagOpenState) {
338         if (isASCIIUpper(cc)) {
339             m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc)));
340             m_appropriateEndTagName.clear();
341             HTML_ADVANCE_TO(TagNameState);
342         } else if (isASCIILower(cc)) {
343             m_token->beginEndTag(static_cast<LChar>(cc));
344             m_appropriateEndTagName.clear();
345             HTML_ADVANCE_TO(TagNameState);
346         } else if (cc == '>') {
347             parseError();
348             HTML_ADVANCE_TO(DataState);
349         } else if (cc == kEndOfFileMarker) {
350             parseError();
351             bufferCharacter('<');
352             bufferCharacter('/');
353             HTML_RECONSUME_IN(DataState);
354         } else {
355             parseError();
356             HTML_RECONSUME_IN(BogusCommentState);
357         }
358     }
359     END_STATE()
360
361     HTML_BEGIN_STATE(TagNameState) {
362         if (isTokenizerWhitespace(cc))
363             HTML_ADVANCE_TO(BeforeAttributeNameState);
364         else if (cc == '/')
365             HTML_ADVANCE_TO(SelfClosingStartTagState);
366         else if (cc == '>')
367             return emitAndResumeIn(source, HTMLTokenizer::DataState);
368         else if (isASCIIUpper(cc)) {
369             m_token->appendToName(toLowerCase(cc));
370             HTML_ADVANCE_TO(TagNameState);
371         } else if (cc == kEndOfFileMarker) {
372             parseError();
373             HTML_RECONSUME_IN(DataState);
374         } else {
375             m_token->appendToName(cc);
376             HTML_ADVANCE_TO(TagNameState);
377         }
378     }
379     END_STATE()
380
381     HTML_BEGIN_STATE(RCDATALessThanSignState) {
382         if (cc == '/') {
383             m_temporaryBuffer.clear();
384             ASSERT(m_bufferedEndTagName.isEmpty());
385             HTML_ADVANCE_TO(RCDATAEndTagOpenState);
386         } else {
387             bufferCharacter('<');
388             HTML_RECONSUME_IN(RCDATAState);
389         }
390     }
391     END_STATE()
392
393     HTML_BEGIN_STATE(RCDATAEndTagOpenState) {
394         if (isASCIIUpper(cc)) {
395             m_temporaryBuffer.append(static_cast<LChar>(cc));
396             addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
397             HTML_ADVANCE_TO(RCDATAEndTagNameState);
398         } else if (isASCIILower(cc)) {
399             m_temporaryBuffer.append(static_cast<LChar>(cc));
400             addToPossibleEndTag(static_cast<LChar>(cc));
401             HTML_ADVANCE_TO(RCDATAEndTagNameState);
402         } else {
403             bufferCharacter('<');
404             bufferCharacter('/');
405             HTML_RECONSUME_IN(RCDATAState);
406         }
407     }
408     END_STATE()
409
410     HTML_BEGIN_STATE(RCDATAEndTagNameState) {
411         if (isASCIIUpper(cc)) {
412             m_temporaryBuffer.append(static_cast<LChar>(cc));
413             addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
414             HTML_ADVANCE_TO(RCDATAEndTagNameState);
415         } else if (isASCIILower(cc)) {
416             m_temporaryBuffer.append(static_cast<LChar>(cc));
417             addToPossibleEndTag(static_cast<LChar>(cc));
418             HTML_ADVANCE_TO(RCDATAEndTagNameState);
419         } else {
420             if (isTokenizerWhitespace(cc)) {
421                 if (isAppropriateEndTag()) {
422                     m_temporaryBuffer.append(static_cast<LChar>(cc));
423                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
424                 }
425             } else if (cc == '/') {
426                 if (isAppropriateEndTag()) {
427                     m_temporaryBuffer.append(static_cast<LChar>(cc));
428                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
429                 }
430             } else if (cc == '>') {
431                 if (isAppropriateEndTag()) {
432                     m_temporaryBuffer.append(static_cast<LChar>(cc));
433                     return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
434                 }
435             }
436             bufferCharacter('<');
437             bufferCharacter('/');
438             m_token->appendToCharacter(m_temporaryBuffer);
439             m_bufferedEndTagName.clear();
440             m_temporaryBuffer.clear();
441             HTML_RECONSUME_IN(RCDATAState);
442         }
443     }
444     END_STATE()
445
446     HTML_BEGIN_STATE(RAWTEXTLessThanSignState) {
447         if (cc == '/') {
448             m_temporaryBuffer.clear();
449             ASSERT(m_bufferedEndTagName.isEmpty());
450             HTML_ADVANCE_TO(RAWTEXTEndTagOpenState);
451         } else {
452             bufferCharacter('<');
453             HTML_RECONSUME_IN(RAWTEXTState);
454         }
455     }
456     END_STATE()
457
458     HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) {
459         if (isASCIIUpper(cc)) {
460             m_temporaryBuffer.append(static_cast<LChar>(cc));
461             addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
462             HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
463         } else if (isASCIILower(cc)) {
464             m_temporaryBuffer.append(static_cast<LChar>(cc));
465             addToPossibleEndTag(static_cast<LChar>(cc));
466             HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
467         } else {
468             bufferCharacter('<');
469             bufferCharacter('/');
470             HTML_RECONSUME_IN(RAWTEXTState);
471         }
472     }
473     END_STATE()
474
475     HTML_BEGIN_STATE(RAWTEXTEndTagNameState) {
476         if (isASCIIUpper(cc)) {
477             m_temporaryBuffer.append(static_cast<LChar>(cc));
478             addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
479             HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
480         } else if (isASCIILower(cc)) {
481             m_temporaryBuffer.append(static_cast<LChar>(cc));
482             addToPossibleEndTag(static_cast<LChar>(cc));
483             HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
484         } else {
485             if (isTokenizerWhitespace(cc)) {
486                 if (isAppropriateEndTag()) {
487                     m_temporaryBuffer.append(static_cast<LChar>(cc));
488                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
489                 }
490             } else if (cc == '/') {
491                 if (isAppropriateEndTag()) {
492                     m_temporaryBuffer.append(static_cast<LChar>(cc));
493                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
494                 }
495             } else if (cc == '>') {
496                 if (isAppropriateEndTag()) {
497                     m_temporaryBuffer.append(static_cast<LChar>(cc));
498                     return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
499                 }
500             }
501             bufferCharacter('<');
502             bufferCharacter('/');
503             m_token->appendToCharacter(m_temporaryBuffer);
504             m_bufferedEndTagName.clear();
505             m_temporaryBuffer.clear();
506             HTML_RECONSUME_IN(RAWTEXTState);
507         }
508     }
509     END_STATE()
510
511     HTML_BEGIN_STATE(ScriptDataLessThanSignState) {
512         if (cc == '/') {
513             m_temporaryBuffer.clear();
514             ASSERT(m_bufferedEndTagName.isEmpty());
515             HTML_ADVANCE_TO(ScriptDataEndTagOpenState);
516         } else if (cc == '!') {
517             bufferCharacter('<');
518             bufferCharacter('!');
519             HTML_ADVANCE_TO(ScriptDataEscapeStartState);
520         } else {
521             bufferCharacter('<');
522             HTML_RECONSUME_IN(ScriptDataState);
523         }
524     }
525     END_STATE()
526
527     HTML_BEGIN_STATE(ScriptDataEndTagOpenState) {
528         if (isASCIIUpper(cc)) {
529             m_temporaryBuffer.append(static_cast<LChar>(cc));
530             addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
531             HTML_ADVANCE_TO(ScriptDataEndTagNameState);
532         } else if (isASCIILower(cc)) {
533             m_temporaryBuffer.append(static_cast<LChar>(cc));
534             addToPossibleEndTag(static_cast<LChar>(cc));
535             HTML_ADVANCE_TO(ScriptDataEndTagNameState);
536         } else {
537             bufferCharacter('<');
538             bufferCharacter('/');
539             HTML_RECONSUME_IN(ScriptDataState);
540         }
541     }
542     END_STATE()
543
544     HTML_BEGIN_STATE(ScriptDataEndTagNameState) {
545         if (isASCIIUpper(cc)) {
546             m_temporaryBuffer.append(static_cast<LChar>(cc));
547             addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
548             HTML_ADVANCE_TO(ScriptDataEndTagNameState);
549         } else if (isASCIILower(cc)) {
550             m_temporaryBuffer.append(static_cast<LChar>(cc));
551             addToPossibleEndTag(static_cast<LChar>(cc));
552             HTML_ADVANCE_TO(ScriptDataEndTagNameState);
553         } else {
554             if (isTokenizerWhitespace(cc)) {
555                 if (isAppropriateEndTag()) {
556                     m_temporaryBuffer.append(static_cast<LChar>(cc));
557                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
558                 }
559             } else if (cc == '/') {
560                 if (isAppropriateEndTag()) {
561                     m_temporaryBuffer.append(static_cast<LChar>(cc));
562                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
563                 }
564             } else if (cc == '>') {
565                 if (isAppropriateEndTag()) {
566                     m_temporaryBuffer.append(static_cast<LChar>(cc));
567                     return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
568                 }
569             }
570             bufferCharacter('<');
571             bufferCharacter('/');
572             m_token->appendToCharacter(m_temporaryBuffer);
573             m_bufferedEndTagName.clear();
574             m_temporaryBuffer.clear();
575             HTML_RECONSUME_IN(ScriptDataState);
576         }
577     }
578     END_STATE()
579
580     HTML_BEGIN_STATE(ScriptDataEscapeStartState) {
581         if (cc == '-') {
582             bufferCharacter(cc);
583             HTML_ADVANCE_TO(ScriptDataEscapeStartDashState);
584         } else
585             HTML_RECONSUME_IN(ScriptDataState);
586     }
587     END_STATE()
588
589     HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) {
590         if (cc == '-') {
591             bufferCharacter(cc);
592             HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
593         } else
594             HTML_RECONSUME_IN(ScriptDataState);
595     }
596     END_STATE()
597
598     HTML_BEGIN_STATE(ScriptDataEscapedState) {
599         if (cc == '-') {
600             bufferCharacter(cc);
601             HTML_ADVANCE_TO(ScriptDataEscapedDashState);
602         } else if (cc == '<')
603             HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
604         else if (cc == kEndOfFileMarker) {
605             parseError();
606             HTML_RECONSUME_IN(DataState);
607         } else {
608             bufferCharacter(cc);
609             HTML_ADVANCE_TO(ScriptDataEscapedState);
610         }
611     }
612     END_STATE()
613
614     HTML_BEGIN_STATE(ScriptDataEscapedDashState) {
615         if (cc == '-') {
616             bufferCharacter(cc);
617             HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
618         } else if (cc == '<')
619             HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
620         else if (cc == kEndOfFileMarker) {
621             parseError();
622             HTML_RECONSUME_IN(DataState);
623         } else {
624             bufferCharacter(cc);
625             HTML_ADVANCE_TO(ScriptDataEscapedState);
626         }
627     }
628     END_STATE()
629
630     HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) {
631         if (cc == '-') {
632             bufferCharacter(cc);
633             HTML_ADVANCE_TO(ScriptDataEscapedDashDashState);
634         } else if (cc == '<')
635             HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState);
636         else if (cc == '>') {
637             bufferCharacter(cc);
638             HTML_ADVANCE_TO(ScriptDataState);
639         } else if (cc == kEndOfFileMarker) {
640             parseError();
641             HTML_RECONSUME_IN(DataState);
642         } else {
643             bufferCharacter(cc);
644             HTML_ADVANCE_TO(ScriptDataEscapedState);
645         }
646     }
647     END_STATE()
648
649     HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
650         if (cc == '/') {
651             m_temporaryBuffer.clear();
652             ASSERT(m_bufferedEndTagName.isEmpty());
653             HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
654         } else if (isASCIIUpper(cc)) {
655             bufferCharacter('<');
656             bufferCharacter(cc);
657             m_temporaryBuffer.clear();
658             m_temporaryBuffer.append(toLowerCase(cc));
659             HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
660         } else if (isASCIILower(cc)) {
661             bufferCharacter('<');
662             bufferCharacter(cc);
663             m_temporaryBuffer.clear();
664             m_temporaryBuffer.append(static_cast<LChar>(cc));
665             HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
666         } else {
667             bufferCharacter('<');
668             HTML_RECONSUME_IN(ScriptDataEscapedState);
669         }
670     }
671     END_STATE()
672
673     HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
674         if (isASCIIUpper(cc)) {
675             m_temporaryBuffer.append(static_cast<LChar>(cc));
676             addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
677             HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
678         } else if (isASCIILower(cc)) {
679             m_temporaryBuffer.append(static_cast<LChar>(cc));
680             addToPossibleEndTag(static_cast<LChar>(cc));
681             HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
682         } else {
683             bufferCharacter('<');
684             bufferCharacter('/');
685             HTML_RECONSUME_IN(ScriptDataEscapedState);
686         }
687     }
688     END_STATE()
689
690     HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
691         if (isASCIIUpper(cc)) {
692             m_temporaryBuffer.append(static_cast<LChar>(cc));
693             addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
694             HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
695         } else if (isASCIILower(cc)) {
696             m_temporaryBuffer.append(static_cast<LChar>(cc));
697             addToPossibleEndTag(static_cast<LChar>(cc));
698             HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState);
699         } else {
700             if (isTokenizerWhitespace(cc)) {
701                 if (isAppropriateEndTag()) {
702                     m_temporaryBuffer.append(static_cast<LChar>(cc));
703                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
704                 }
705             } else if (cc == '/') {
706                 if (isAppropriateEndTag()) {
707                     m_temporaryBuffer.append(static_cast<LChar>(cc));
708                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
709                 }
710             } else if (cc == '>') {
711                 if (isAppropriateEndTag()) {
712                     m_temporaryBuffer.append(static_cast<LChar>(cc));
713                     return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
714                 }
715             }
716             bufferCharacter('<');
717             bufferCharacter('/');
718             m_token->appendToCharacter(m_temporaryBuffer);
719             m_bufferedEndTagName.clear();
720             m_temporaryBuffer.clear();
721             HTML_RECONSUME_IN(ScriptDataEscapedState);
722         }
723     }
724     END_STATE()
725
726     HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
727         if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
728             bufferCharacter(cc);
729             if (temporaryBufferIs(scriptTag.localName()))
730                 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
731             else
732                 HTML_ADVANCE_TO(ScriptDataEscapedState);
733         } else if (isASCIIUpper(cc)) {
734             bufferCharacter(cc);
735             m_temporaryBuffer.append(toLowerCase(cc));
736             HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
737         } else if (isASCIILower(cc)) {
738             bufferCharacter(cc);
739             m_temporaryBuffer.append(static_cast<LChar>(cc));
740             HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState);
741         } else
742             HTML_RECONSUME_IN(ScriptDataEscapedState);
743     }
744     END_STATE()
745
746     HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) {
747         if (cc == '-') {
748             bufferCharacter(cc);
749             HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState);
750         } else if (cc == '<') {
751             bufferCharacter(cc);
752             HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
753         } else if (cc == kEndOfFileMarker) {
754             parseError();
755             HTML_RECONSUME_IN(DataState);
756         } else {
757             bufferCharacter(cc);
758             HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
759         }
760     }
761     END_STATE()
762
763     HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
764         if (cc == '-') {
765             bufferCharacter(cc);
766             HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
767         } else if (cc == '<') {
768             bufferCharacter(cc);
769             HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
770         } else if (cc == kEndOfFileMarker) {
771             parseError();
772             HTML_RECONSUME_IN(DataState);
773         } else {
774             bufferCharacter(cc);
775             HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
776         }
777     }
778     END_STATE()
779
780     HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
781         if (cc == '-') {
782             bufferCharacter(cc);
783             HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
784         } else if (cc == '<') {
785             bufferCharacter(cc);
786             HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
787         } else if (cc == '>') {
788             bufferCharacter(cc);
789             HTML_ADVANCE_TO(ScriptDataState);
790         } else if (cc == kEndOfFileMarker) {
791             parseError();
792             HTML_RECONSUME_IN(DataState);
793         } else {
794             bufferCharacter(cc);
795             HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
796         }
797     }
798     END_STATE()
799
800     HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
801         if (cc == '/') {
802             bufferCharacter(cc);
803             m_temporaryBuffer.clear();
804             HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
805         } else
806             HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
807     }
808     END_STATE()
809
810     HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
811         if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
812             bufferCharacter(cc);
813             if (temporaryBufferIs(scriptTag.localName()))
814                 HTML_ADVANCE_TO(ScriptDataEscapedState);
815             else
816                 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState);
817         } else if (isASCIIUpper(cc)) {
818             bufferCharacter(cc);
819             m_temporaryBuffer.append(toLowerCase(cc));
820             HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
821         } else if (isASCIILower(cc)) {
822             bufferCharacter(cc);
823             m_temporaryBuffer.append(static_cast<LChar>(cc));
824             HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState);
825         } else
826             HTML_RECONSUME_IN(ScriptDataDoubleEscapedState);
827     }
828     END_STATE()
829
830     HTML_BEGIN_STATE(BeforeAttributeNameState) {
831         if (isTokenizerWhitespace(cc))
832             HTML_ADVANCE_TO(BeforeAttributeNameState);
833         else if (cc == '/')
834             HTML_ADVANCE_TO(SelfClosingStartTagState);
835         else if (cc == '>')
836             return emitAndResumeIn(source, HTMLTokenizer::DataState);
837         else if (isASCIIUpper(cc)) {
838             m_token->addNewAttribute();
839             m_token->beginAttributeName(source.numberOfCharactersConsumed());
840             m_token->appendToAttributeName(toLowerCase(cc));
841             HTML_ADVANCE_TO(AttributeNameState);
842         } else if (cc == kEndOfFileMarker) {
843             parseError();
844             HTML_RECONSUME_IN(DataState);
845         } else {
846             if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
847                 parseError();
848             m_token->addNewAttribute();
849             m_token->beginAttributeName(source.numberOfCharactersConsumed());
850             m_token->appendToAttributeName(cc);
851             HTML_ADVANCE_TO(AttributeNameState);
852         }
853     }
854     END_STATE()
855
856     HTML_BEGIN_STATE(AttributeNameState) {
857         if (isTokenizerWhitespace(cc)) {
858             m_token->endAttributeName(source.numberOfCharactersConsumed());
859             HTML_ADVANCE_TO(AfterAttributeNameState);
860         } else if (cc == '/') {
861             m_token->endAttributeName(source.numberOfCharactersConsumed());
862             HTML_ADVANCE_TO(SelfClosingStartTagState);
863         } else if (cc == '=') {
864             m_token->endAttributeName(source.numberOfCharactersConsumed());
865             HTML_ADVANCE_TO(BeforeAttributeValueState);
866         } else if (cc == '>') {
867             m_token->endAttributeName(source.numberOfCharactersConsumed());
868             return emitAndResumeIn(source, HTMLTokenizer::DataState);
869         } else if (isASCIIUpper(cc)) {
870             m_token->appendToAttributeName(toLowerCase(cc));
871             HTML_ADVANCE_TO(AttributeNameState);
872         } else if (cc == kEndOfFileMarker) {
873             parseError();
874             m_token->endAttributeName(source.numberOfCharactersConsumed());
875             HTML_RECONSUME_IN(DataState);
876         } else {
877             if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
878                 parseError();
879             m_token->appendToAttributeName(cc);
880             HTML_ADVANCE_TO(AttributeNameState);
881         }
882     }
883     END_STATE()
884
885     HTML_BEGIN_STATE(AfterAttributeNameState) {
886         if (isTokenizerWhitespace(cc))
887             HTML_ADVANCE_TO(AfterAttributeNameState);
888         else if (cc == '/')
889             HTML_ADVANCE_TO(SelfClosingStartTagState);
890         else if (cc == '=')
891             HTML_ADVANCE_TO(BeforeAttributeValueState);
892         else if (cc == '>')
893             return emitAndResumeIn(source, HTMLTokenizer::DataState);
894         else if (isASCIIUpper(cc)) {
895             m_token->addNewAttribute();
896             m_token->beginAttributeName(source.numberOfCharactersConsumed());
897             m_token->appendToAttributeName(toLowerCase(cc));
898             HTML_ADVANCE_TO(AttributeNameState);
899         } else if (cc == kEndOfFileMarker) {
900             parseError();
901             HTML_RECONSUME_IN(DataState);
902         } else {
903             if (cc == '"' || cc == '\'' || cc == '<')
904                 parseError();
905             m_token->addNewAttribute();
906             m_token->beginAttributeName(source.numberOfCharactersConsumed());
907             m_token->appendToAttributeName(cc);
908             HTML_ADVANCE_TO(AttributeNameState);
909         }
910     }
911     END_STATE()
912
913     HTML_BEGIN_STATE(BeforeAttributeValueState) {
914         if (isTokenizerWhitespace(cc))
915             HTML_ADVANCE_TO(BeforeAttributeValueState);
916         else if (cc == '"') {
917             m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
918             HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
919         } else if (cc == '&') {
920             m_token->beginAttributeValue(source.numberOfCharactersConsumed());
921             HTML_RECONSUME_IN(AttributeValueUnquotedState);
922         } else if (cc == '\'') {
923             m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
924             HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
925         } else if (cc == '>') {
926             parseError();
927             return emitAndResumeIn(source, HTMLTokenizer::DataState);
928         } else if (cc == kEndOfFileMarker) {
929             parseError();
930             HTML_RECONSUME_IN(DataState);
931         } else {
932             if (cc == '<' || cc == '=' || cc == '`')
933                 parseError();
934             m_token->beginAttributeValue(source.numberOfCharactersConsumed());
935             m_token->appendToAttributeValue(cc);
936             HTML_ADVANCE_TO(AttributeValueUnquotedState);
937         }
938     }
939     END_STATE()
940
941     HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
942         if (cc == '"') {
943             m_token->endAttributeValue(source.numberOfCharactersConsumed());
944             HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
945         } else if (cc == '&') {
946             m_additionalAllowedCharacter = '"';
947             HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
948         } else if (cc == kEndOfFileMarker) {
949             parseError();
950             m_token->endAttributeValue(source.numberOfCharactersConsumed());
951             HTML_RECONSUME_IN(DataState);
952         } else {
953             m_token->appendToAttributeValue(cc);
954             HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
955         }
956     }
957     END_STATE()
958
959     HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
960         if (cc == '\'') {
961             m_token->endAttributeValue(source.numberOfCharactersConsumed());
962             HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
963         } else if (cc == '&') {
964             m_additionalAllowedCharacter = '\'';
965             HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
966         } else if (cc == kEndOfFileMarker) {
967             parseError();
968             m_token->endAttributeValue(source.numberOfCharactersConsumed());
969             HTML_RECONSUME_IN(DataState);
970         } else {
971             m_token->appendToAttributeValue(cc);
972             HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
973         }
974     }
975     END_STATE()
976
977     HTML_BEGIN_STATE(AttributeValueUnquotedState) {
978         if (isTokenizerWhitespace(cc)) {
979             m_token->endAttributeValue(source.numberOfCharactersConsumed());
980             HTML_ADVANCE_TO(BeforeAttributeNameState);
981         } else if (cc == '&') {
982             m_additionalAllowedCharacter = '>';
983             HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
984         } else if (cc == '>') {
985             m_token->endAttributeValue(source.numberOfCharactersConsumed());
986             return emitAndResumeIn(source, HTMLTokenizer::DataState);
987         } else if (cc == kEndOfFileMarker) {
988             parseError();
989             m_token->endAttributeValue(source.numberOfCharactersConsumed());
990             HTML_RECONSUME_IN(DataState);
991         } else {
992             if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
993                 parseError();
994             m_token->appendToAttributeValue(cc);
995             HTML_ADVANCE_TO(AttributeValueUnquotedState);
996         }
997     }
998     END_STATE()
999
1000     HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
1001         bool notEnoughCharacters = false;
1002         DecodedHTMLEntity decodedEntity;
1003         bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
1004         if (notEnoughCharacters)
1005             return haveBufferedCharacterToken();
1006         if (!success) {
1007             ASSERT(decodedEntity.isEmpty());
1008             m_token->appendToAttributeValue('&');
1009         } else {
1010             for (unsigned i = 0; i < decodedEntity.length; ++i)
1011                 m_token->appendToAttributeValue(decodedEntity.data[i]);
1012         }
1013         // We're supposed to switch back to the attribute value state that
1014         // we were in when we were switched into this state. Rather than
1015         // keeping track of this explictly, we observe that the previous
1016         // state can be determined by m_additionalAllowedCharacter.
1017         if (m_additionalAllowedCharacter == '"')
1018             HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
1019         else if (m_additionalAllowedCharacter == '\'')
1020             HTML_SWITCH_TO(AttributeValueSingleQuotedState);
1021         else if (m_additionalAllowedCharacter == '>')
1022             HTML_SWITCH_TO(AttributeValueUnquotedState);
1023         else
1024             ASSERT_NOT_REACHED();
1025     }
1026     END_STATE()
1027
1028     HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
1029         if (isTokenizerWhitespace(cc))
1030             HTML_ADVANCE_TO(BeforeAttributeNameState);
1031         else if (cc == '/')
1032             HTML_ADVANCE_TO(SelfClosingStartTagState);
1033         else if (cc == '>')
1034             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1035         else if (cc == kEndOfFileMarker) {
1036             parseError();
1037             HTML_RECONSUME_IN(DataState);
1038         } else {
1039             parseError();
1040             HTML_RECONSUME_IN(BeforeAttributeNameState);
1041         }
1042     }
1043     END_STATE()
1044
1045     HTML_BEGIN_STATE(SelfClosingStartTagState) {
1046         if (cc == '>') {
1047             m_token->setSelfClosing();
1048             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1049         } else if (cc == kEndOfFileMarker) {
1050             parseError();
1051             HTML_RECONSUME_IN(DataState);
1052         } else {
1053             parseError();
1054             HTML_RECONSUME_IN(BeforeAttributeNameState);
1055         }
1056     }
1057     END_STATE()
1058
1059     HTML_BEGIN_STATE(BogusCommentState) {
1060         m_token->beginComment();
1061         HTML_RECONSUME_IN(ContinueBogusCommentState);
1062     }
1063     END_STATE()
1064
1065     HTML_BEGIN_STATE(ContinueBogusCommentState) {
1066         if (cc == '>')
1067             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1068         else if (cc == kEndOfFileMarker)
1069             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1070         else {
1071             m_token->appendToComment(cc);
1072             HTML_ADVANCE_TO(ContinueBogusCommentState);
1073         }
1074     }
1075     END_STATE()
1076
1077     HTML_BEGIN_STATE(MarkupDeclarationOpenState) {
1078         if (cc == '-') {
1079             SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::dashDash);
1080             if (result == SegmentedString::DidMatch) {
1081                 source.advanceAndASSERT('-');
1082                 source.advanceAndASSERT('-');
1083                 m_token->beginComment();
1084                 HTML_SWITCH_TO(CommentStartState);
1085             } else if (result == SegmentedString::NotEnoughCharacters)
1086                 return haveBufferedCharacterToken();
1087         } else if (cc == 'D' || cc == 'd') {
1088             SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::doctype);
1089             if (result == SegmentedString::DidMatch) {
1090                 advanceStringAndASSERTIgnoringCase(source, "doctype");
1091                 HTML_SWITCH_TO(DOCTYPEState);
1092             } else if (result == SegmentedString::NotEnoughCharacters)
1093                 return haveBufferedCharacterToken();
1094         } else if (cc == '[' && shouldAllowCDATA()) {
1095             SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::cdata);
1096             if (result == SegmentedString::DidMatch) {
1097                 advanceStringAndASSERT(source, "[CDATA[");
1098                 HTML_SWITCH_TO(CDATASectionState);
1099             } else if (result == SegmentedString::NotEnoughCharacters)
1100                 return haveBufferedCharacterToken();
1101         }
1102         parseError();
1103         HTML_RECONSUME_IN(BogusCommentState);
1104     }
1105     END_STATE()
1106
1107     HTML_BEGIN_STATE(CommentStartState) {
1108         if (cc == '-')
1109             HTML_ADVANCE_TO(CommentStartDashState);
1110         else if (cc == '>') {
1111             parseError();
1112             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1113         } else if (cc == kEndOfFileMarker) {
1114             parseError();
1115             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1116         } else {
1117             m_token->appendToComment(cc);
1118             HTML_ADVANCE_TO(CommentState);
1119         }
1120     }
1121     END_STATE()
1122
1123     HTML_BEGIN_STATE(CommentStartDashState) {
1124         if (cc == '-')
1125             HTML_ADVANCE_TO(CommentEndState);
1126         else if (cc == '>') {
1127             parseError();
1128             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1129         } else if (cc == kEndOfFileMarker) {
1130             parseError();
1131             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1132         } else {
1133             m_token->appendToComment('-');
1134             m_token->appendToComment(cc);
1135             HTML_ADVANCE_TO(CommentState);
1136         }
1137     }
1138     END_STATE()
1139
1140     HTML_BEGIN_STATE(CommentState) {
1141         if (cc == '-')
1142             HTML_ADVANCE_TO(CommentEndDashState);
1143         else if (cc == kEndOfFileMarker) {
1144             parseError();
1145             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1146         } else {
1147             m_token->appendToComment(cc);
1148             HTML_ADVANCE_TO(CommentState);
1149         }
1150     }
1151     END_STATE()
1152
1153     HTML_BEGIN_STATE(CommentEndDashState) {
1154         if (cc == '-')
1155             HTML_ADVANCE_TO(CommentEndState);
1156         else if (cc == kEndOfFileMarker) {
1157             parseError();
1158             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1159         } else {
1160             m_token->appendToComment('-');
1161             m_token->appendToComment(cc);
1162             HTML_ADVANCE_TO(CommentState);
1163         }
1164     }
1165     END_STATE()
1166
1167     HTML_BEGIN_STATE(CommentEndState) {
1168         if (cc == '>')
1169             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1170         else if (cc == '!') {
1171             parseError();
1172             HTML_ADVANCE_TO(CommentEndBangState);
1173         } else if (cc == '-') {
1174             parseError();
1175             m_token->appendToComment('-');
1176             HTML_ADVANCE_TO(CommentEndState);
1177         } else if (cc == kEndOfFileMarker) {
1178             parseError();
1179             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1180         } else {
1181             parseError();
1182             m_token->appendToComment('-');
1183             m_token->appendToComment('-');
1184             m_token->appendToComment(cc);
1185             HTML_ADVANCE_TO(CommentState);
1186         }
1187     }
1188     END_STATE()
1189
1190     HTML_BEGIN_STATE(CommentEndBangState) {
1191         if (cc == '-') {
1192             m_token->appendToComment('-');
1193             m_token->appendToComment('-');
1194             m_token->appendToComment('!');
1195             HTML_ADVANCE_TO(CommentEndDashState);
1196         } else if (cc == '>')
1197             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1198         else if (cc == kEndOfFileMarker) {
1199             parseError();
1200             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1201         } else {
1202             m_token->appendToComment('-');
1203             m_token->appendToComment('-');
1204             m_token->appendToComment('!');
1205             m_token->appendToComment(cc);
1206             HTML_ADVANCE_TO(CommentState);
1207         }
1208     }
1209     END_STATE()
1210
1211     HTML_BEGIN_STATE(DOCTYPEState) {
1212         if (isTokenizerWhitespace(cc))
1213             HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1214         else if (cc == kEndOfFileMarker) {
1215             parseError();
1216             m_token->beginDOCTYPE();
1217             m_token->setForceQuirks();
1218             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1219         } else {
1220             parseError();
1221             HTML_RECONSUME_IN(BeforeDOCTYPENameState);
1222         }
1223     }
1224     END_STATE()
1225
1226     HTML_BEGIN_STATE(BeforeDOCTYPENameState) {
1227         if (isTokenizerWhitespace(cc))
1228             HTML_ADVANCE_TO(BeforeDOCTYPENameState);
1229         else if (isASCIIUpper(cc)) {
1230             m_token->beginDOCTYPE(toLowerCase(cc));
1231             HTML_ADVANCE_TO(DOCTYPENameState);
1232         } else if (cc == '>') {
1233             parseError();
1234             m_token->beginDOCTYPE();
1235             m_token->setForceQuirks();
1236             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1237         } else if (cc == kEndOfFileMarker) {
1238             parseError();
1239             m_token->beginDOCTYPE();
1240             m_token->setForceQuirks();
1241             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1242         } else {
1243             m_token->beginDOCTYPE(cc);
1244             HTML_ADVANCE_TO(DOCTYPENameState);
1245         }
1246     }
1247     END_STATE()
1248
1249     HTML_BEGIN_STATE(DOCTYPENameState) {
1250         if (isTokenizerWhitespace(cc))
1251             HTML_ADVANCE_TO(AfterDOCTYPENameState);
1252         else if (cc == '>')
1253             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1254         else if (isASCIIUpper(cc)) {
1255             m_token->appendToName(toLowerCase(cc));
1256             HTML_ADVANCE_TO(DOCTYPENameState);
1257         } else if (cc == kEndOfFileMarker) {
1258             parseError();
1259             m_token->setForceQuirks();
1260             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1261         } else {
1262             m_token->appendToName(cc);
1263             HTML_ADVANCE_TO(DOCTYPENameState);
1264         }
1265     }
1266     END_STATE()
1267
1268     HTML_BEGIN_STATE(AfterDOCTYPENameState) {
1269         if (isTokenizerWhitespace(cc))
1270             HTML_ADVANCE_TO(AfterDOCTYPENameState);
1271         if (cc == '>')
1272             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1273         else if (cc == kEndOfFileMarker) {
1274             parseError();
1275             m_token->setForceQuirks();
1276             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1277         } else {
1278             if (cc == 'P' || cc == 'p') {
1279                 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::publicString);
1280                 if (result == SegmentedString::DidMatch) {
1281                     advanceStringAndASSERTIgnoringCase(source, "public");
1282                     HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState);
1283                 } else if (result == SegmentedString::NotEnoughCharacters)
1284                     return haveBufferedCharacterToken();
1285             } else if (cc == 'S' || cc == 's') {
1286                 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::system);
1287                 if (result == SegmentedString::DidMatch) {
1288                     advanceStringAndASSERTIgnoringCase(source, "system");
1289                     HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState);
1290                 } else if (result == SegmentedString::NotEnoughCharacters)
1291                     return haveBufferedCharacterToken();
1292             }
1293             parseError();
1294             m_token->setForceQuirks();
1295             HTML_ADVANCE_TO(BogusDOCTYPEState);
1296         }
1297     }
1298     END_STATE()
1299
1300     HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
1301         if (isTokenizerWhitespace(cc))
1302             HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1303         else if (cc == '"') {
1304             parseError();
1305             m_token->setPublicIdentifierToEmptyString();
1306             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1307         } else if (cc == '\'') {
1308             parseError();
1309             m_token->setPublicIdentifierToEmptyString();
1310             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1311         } else if (cc == '>') {
1312             parseError();
1313             m_token->setForceQuirks();
1314             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1315         } else if (cc == kEndOfFileMarker) {
1316             parseError();
1317             m_token->setForceQuirks();
1318             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1319         } else {
1320             parseError();
1321             m_token->setForceQuirks();
1322             HTML_ADVANCE_TO(BogusDOCTYPEState);
1323         }
1324     }
1325     END_STATE()
1326
1327     HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
1328         if (isTokenizerWhitespace(cc))
1329             HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
1330         else if (cc == '"') {
1331             m_token->setPublicIdentifierToEmptyString();
1332             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1333         } else if (cc == '\'') {
1334             m_token->setPublicIdentifierToEmptyString();
1335             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1336         } else if (cc == '>') {
1337             parseError();
1338             m_token->setForceQuirks();
1339             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1340         } else if (cc == kEndOfFileMarker) {
1341             parseError();
1342             m_token->setForceQuirks();
1343             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1344         } else {
1345             parseError();
1346             m_token->setForceQuirks();
1347             HTML_ADVANCE_TO(BogusDOCTYPEState);
1348         }
1349     }
1350     END_STATE()
1351
1352     HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) {
1353         if (cc == '"')
1354             HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1355         else if (cc == '>') {
1356             parseError();
1357             m_token->setForceQuirks();
1358             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1359         } else if (cc == kEndOfFileMarker) {
1360             parseError();
1361             m_token->setForceQuirks();
1362             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1363         } else {
1364             m_token->appendToPublicIdentifier(cc);
1365             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
1366         }
1367     }
1368     END_STATE()
1369
1370     HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) {
1371         if (cc == '\'')
1372             HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
1373         else if (cc == '>') {
1374             parseError();
1375             m_token->setForceQuirks();
1376             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1377         } else if (cc == kEndOfFileMarker) {
1378             parseError();
1379             m_token->setForceQuirks();
1380             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1381         } else {
1382             m_token->appendToPublicIdentifier(cc);
1383             HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
1384         }
1385     }
1386     END_STATE()
1387
1388     HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
1389         if (isTokenizerWhitespace(cc))
1390             HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1391         else if (cc == '>')
1392             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1393         else if (cc == '"') {
1394             parseError();
1395             m_token->setSystemIdentifierToEmptyString();
1396             HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1397         } else if (cc == '\'') {
1398             parseError();
1399             m_token->setSystemIdentifierToEmptyString();
1400             HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1401         } else if (cc == kEndOfFileMarker) {
1402             parseError();
1403             m_token->setForceQuirks();
1404             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1405         } else {
1406             parseError();
1407             m_token->setForceQuirks();
1408             HTML_ADVANCE_TO(BogusDOCTYPEState);
1409         }
1410     }
1411     END_STATE()
1412
1413     HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) {
1414         if (isTokenizerWhitespace(cc))
1415             HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
1416         else if (cc == '>')
1417             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1418         else if (cc == '"') {
1419             m_token->setSystemIdentifierToEmptyString();
1420             HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1421         } else if (cc == '\'') {
1422             m_token->setSystemIdentifierToEmptyString();
1423             HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1424         } else if (cc == kEndOfFileMarker) {
1425             parseError();
1426             m_token->setForceQuirks();
1427             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1428         } else {
1429             parseError();
1430             m_token->setForceQuirks();
1431             HTML_ADVANCE_TO(BogusDOCTYPEState);
1432         }
1433     }
1434     END_STATE()
1435
1436     HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
1437         if (isTokenizerWhitespace(cc))
1438             HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1439         else if (cc == '"') {
1440             parseError();
1441             m_token->setSystemIdentifierToEmptyString();
1442             HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1443         } else if (cc == '\'') {
1444             parseError();
1445             m_token->setSystemIdentifierToEmptyString();
1446             HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1447         } else if (cc == '>') {
1448             parseError();
1449             m_token->setForceQuirks();
1450             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1451         } else if (cc == kEndOfFileMarker) {
1452             parseError();
1453             m_token->setForceQuirks();
1454             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1455         } else {
1456             parseError();
1457             m_token->setForceQuirks();
1458             HTML_ADVANCE_TO(BogusDOCTYPEState);
1459         }
1460     }
1461     END_STATE()
1462
1463     HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
1464         if (isTokenizerWhitespace(cc))
1465             HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
1466         if (cc == '"') {
1467             m_token->setSystemIdentifierToEmptyString();
1468             HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1469         } else if (cc == '\'') {
1470             m_token->setSystemIdentifierToEmptyString();
1471             HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1472         } else if (cc == '>') {
1473             parseError();
1474             m_token->setForceQuirks();
1475             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1476         } else if (cc == kEndOfFileMarker) {
1477             parseError();
1478             m_token->setForceQuirks();
1479             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1480         } else {
1481             parseError();
1482             m_token->setForceQuirks();
1483             HTML_ADVANCE_TO(BogusDOCTYPEState);
1484         }
1485     }
1486     END_STATE()
1487
1488     HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
1489         if (cc == '"')
1490             HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1491         else if (cc == '>') {
1492             parseError();
1493             m_token->setForceQuirks();
1494             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1495         } else if (cc == kEndOfFileMarker) {
1496             parseError();
1497             m_token->setForceQuirks();
1498             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1499         } else {
1500             m_token->appendToSystemIdentifier(cc);
1501             HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
1502         }
1503     }
1504     END_STATE()
1505
1506     HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
1507         if (cc == '\'')
1508             HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1509         else if (cc == '>') {
1510             parseError();
1511             m_token->setForceQuirks();
1512             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1513         } else if (cc == kEndOfFileMarker) {
1514             parseError();
1515             m_token->setForceQuirks();
1516             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1517         } else {
1518             m_token->appendToSystemIdentifier(cc);
1519             HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
1520         }
1521     }
1522     END_STATE()
1523
1524     HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
1525         if (isTokenizerWhitespace(cc))
1526             HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
1527         else if (cc == '>')
1528             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1529         else if (cc == kEndOfFileMarker) {
1530             parseError();
1531             m_token->setForceQuirks();
1532             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1533         } else {
1534             parseError();
1535             HTML_ADVANCE_TO(BogusDOCTYPEState);
1536         }
1537     }
1538     END_STATE()
1539
1540     HTML_BEGIN_STATE(BogusDOCTYPEState) {
1541         if (cc == '>')
1542             return emitAndResumeIn(source, HTMLTokenizer::DataState);
1543         else if (cc == kEndOfFileMarker)
1544             return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
1545         HTML_ADVANCE_TO(BogusDOCTYPEState);
1546     }
1547     END_STATE()
1548
1549     HTML_BEGIN_STATE(CDATASectionState) {
1550         if (cc == ']')
1551             HTML_ADVANCE_TO(CDATASectionRightSquareBracketState);
1552         else if (cc == kEndOfFileMarker)
1553             HTML_RECONSUME_IN(DataState);
1554         else {
1555             bufferCharacter(cc);
1556             HTML_ADVANCE_TO(CDATASectionState);
1557         }
1558     }
1559     END_STATE()
1560
1561     HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) {
1562         if (cc == ']')
1563             HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
1564         else {
1565             bufferCharacter(']');
1566             HTML_RECONSUME_IN(CDATASectionState);
1567         }
1568     }
1569
1570     HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
1571         if (cc == '>')
1572             HTML_ADVANCE_TO(DataState);
1573         else {
1574             bufferCharacter(']');
1575             bufferCharacter(']');
1576             HTML_RECONSUME_IN(CDATASectionState);
1577         }
1578     }
1579     END_STATE()
1580
1581     }
1582
1583     ASSERT_NOT_REACHED();
1584     return false;
1585 }
1586
1587 String HTMLTokenizer::bufferedCharacters() const
1588 {
1589     // FIXME: Add an assert about m_state.
1590     StringBuilder characters;
1591     characters.reserveCapacity(numberOfBufferedCharacters());
1592     characters.append('<');
1593     characters.append('/');
1594     characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size());
1595     return characters.toString();
1596 }
1597
1598 void HTMLTokenizer::updateStateFor(const String& tagName)
1599 {
1600     if (threadSafeMatch(tagName, textareaTag) || threadSafeMatch(tagName, titleTag))
1601         setState(HTMLTokenizer::RCDATAState);
1602     else if (threadSafeMatch(tagName, plaintextTag))
1603         setState(HTMLTokenizer::PLAINTEXTState);
1604     else if (threadSafeMatch(tagName, scriptTag))
1605         setState(HTMLTokenizer::ScriptDataState);
1606     else if (threadSafeMatch(tagName, styleTag)
1607         || threadSafeMatch(tagName, iframeTag)
1608         || threadSafeMatch(tagName, xmpTag)
1609         || (threadSafeMatch(tagName, noembedTag) && m_options.pluginsEnabled)
1610         || threadSafeMatch(tagName, noframesTag)
1611         || (threadSafeMatch(tagName, noscriptTag) && m_options.scriptEnabled))
1612         setState(HTMLTokenizer::RAWTEXTState);
1613 }
1614
1615 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
1616 {
1617     return vectorEqualsString(m_temporaryBuffer, expectedString);
1618 }
1619
// Appends |cc| to m_bufferedEndTagName, the buffer that isAppropriateEndTag()
// compares against m_appropriateEndTagName.
inline void HTMLTokenizer::addToPossibleEndTag(LChar cc)
{
    // Asserts that m_state satisfies isEndTagBufferingState() before buffering.
    ASSERT(isEndTagBufferingState(m_state));
    m_bufferedEndTagName.append(cc);
}
1625
1626 inline bool HTMLTokenizer::isAppropriateEndTag()
1627 {
1628     if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size())
1629         return false;
1630
1631     size_t numCharacters = m_bufferedEndTagName.size();
1632
1633     for (size_t i = 0; i < numCharacters; i++) {
1634         if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i])
1635             return false;
1636     }
1637
1638     return true;
1639 }
1640
// Called by the tokenizer states whenever they hit a parse error (for example
// the DOCTYPE states in nextToken()). It currently does nothing beyond
// invoking notImplemented().
inline void HTMLTokenizer::parseError()
{
    notImplemented();
}
1645
1646 }