Change remaining uses of {to,from}Ascii to {to,from}Latin1 [other]
[profile/ivi/qtxmlpatterns.git] / src / xmlpatterns / parser / qxquerytokenizer.cpp
1 /****************************************************************************
2 **
3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/
5 **
6 ** This file is part of the QtXmlPatterns module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** GNU Lesser General Public License Usage
10 ** This file may be used under the terms of the GNU Lesser General Public
11 ** License version 2.1 as published by the Free Software Foundation and
12 ** appearing in the file LICENSE.LGPL included in the packaging of this
13 ** file. Please review the following information to ensure the GNU Lesser
14 ** General Public License version 2.1 requirements will be met:
15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
16 **
17 ** In addition, as a special exception, Nokia gives you certain additional
18 ** rights. These rights are described in the Nokia Qt LGPL Exception
19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
20 **
21 ** GNU General Public License Usage
22 ** Alternatively, this file may be used under the terms of the GNU General
23 ** Public License version 3.0 as published by the Free Software Foundation
24 ** and appearing in the file LICENSE.GPL included in the packaging of this
25 ** file. Please review the following information to ensure the GNU General
26 ** Public License version 3.0 requirements will be met:
27 ** http://www.gnu.org/copyleft/gpl.html.
28 **
29 ** Other Usage
30 ** Alternatively, this file may be used in accordance with the terms and
31 ** conditions contained in a signed written agreement between you and Nokia.
32 **
33 **
34 **
35 **
36 **
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41
42 #include <QByteArray>
43
44 #include "qquerytransformparser_p.h"
45
46 #include "qxquerytokenizer_p.h"
47
48 #include "qtokenlookup.cpp"
49
50 QT_BEGIN_NAMESPACE
51
52 namespace QPatternist
53 {
54
/* Runs consumeWhitespace() and, when it reports anything other than SUCCESS,
 * makes the enclosing function return a Token built from the reported type.
 * Only usable inside functions whose return type can be built from Token. */
#define handleWhitespace()                                      \
{                                                               \
    const TokenType whitespaceOutcome = consumeWhitespace();    \
    if(whitespaceOutcome != SUCCESS)                            \
        return Token(whitespaceOutcome);                        \
}
61
62 XQueryTokenizer::XQueryTokenizer(const QString &query,
63                                  const QUrl &location,
64                                  const State startingState) : Tokenizer(location)
65                                                             , m_data(query)
66                                                             , m_length(query.length())
67                                                             , m_state(startingState)
68                                                             , m_pos(0)
69                                                             , m_line(1)
70                                                             , m_columnOffset(0)
71                                                             , m_scanOnly(false)
72 {
73     Q_ASSERT(location.isValid() || location.isEmpty());
74 }
75
76 const QChar XQueryTokenizer::current() const
77 {
78     if(m_pos < m_length)
79         return m_data.at(m_pos);
80     else
81         return QChar();
82 }
83
84 char XQueryTokenizer::peekCurrent() const
85 {
86     return current().toLatin1();
87 }
88
89 int XQueryTokenizer::peekForColonColon() const
90 {
91     /* Note, we don't modify m_pos in this function, so we need to do offset
92      * calculations. */
93     int pos = m_pos;
94
95     while(pos < m_length)
96     {
97         switch(m_data.at(pos).toLatin1())
98         {
99             /* Fallthrough these four. */
100             case ' ':
101             case '\t':
102             case '\n':
103             case '\r':
104                 break;
105             case ':':
106             {
107                 if(peekAhead((pos - m_pos) + 1) == ':')
108                     return pos - m_pos;
109                 /* Fallthrough. */
110             }
111             default:
112                 return -1;
113         }
114         ++pos;
115     }
116
117     return -1;
118 }
119
120 Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
121                                                       const State s,
122                                                       const int advance)
123 {
124     Q_ASSERT(advance >= 0);
125     m_pos += advance;
126     setState(s);
127     return Token(code);
128 }
129
130 Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
131                                                       const QString &value,
132                                                       const State s)
133 {
134     setState(s);
135     return Token(code, value);
136 }
137
138 Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code,
139                                                   const int advance)
140 {
141     Q_ASSERT(advance >= 0);
142     m_pos += advance;
143     return Token(code);
144 }
145
146 QString XQueryTokenizer::normalizeEOL(const QString &input,
147                                       const CharacterSkips &characterSkips)
148 {
149     const int len = input.count();
150     QString result;
151
152     /* The likely hood is rather high it'll be the same content. */
153     result.reserve(len);
154
155     for(int i = 0; i < len; ++i)
156     {
157         const QChar &at = input.at(i);
158
159         if(characterSkips.contains(i))
160         {
161             result.append(at);
162             continue;
163         }
164         switch(input.at(i).unicode())
165         {
166             case '\r':
167             {
168                 if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
169                     ++i;
170
171                 /* Else, fallthrough. */
172             }
173             case '\n':
174             {
175                 result.append(QLatin1Char('\n'));
176                 continue;
177             }
178             default:
179             {
180                 result.append(at);
181             }
182         }
183     }
184
185     return result;
186 }
187
188 Tokenizer::TokenType XQueryTokenizer::consumeComment()
189 {
190     /* Below, we return ERROR instead of END_OF_FILE such that the parser
191      * sees an invalid comment. */
192     while(m_pos < m_length)
193     {
194         switch(peekCurrent())
195         {
196             case ':':
197             {
198                 ++m_pos; /* Consume ':' */
199                 if(atEnd())
200                     return ERROR;
201
202                 if(peekCurrent() == ')')
203                 {
204                     ++m_pos; /* Consume ')' */
205                     return SUCCESS; /* The comment closed nicely. */
206                 }
207                 continue; /* We don't want to increment m_pos twice. */
208             }
209             case '(':
210             { /* It looks like the start of a comment. */
211                 ++m_pos;
212
213                 if(atEnd())
214                     return END_OF_FILE;
215                 else if(peekCurrent() == ':')
216                 {
217                     /* And it is a nested comment -- parse it. */
218                     const TokenType retval = consumeComment();
219                     if(retval == SUCCESS)
220                         continue; /* Continue with our "own" comment. */
221                     else
222                         return retval; /* Return the error in the nested comment. */
223                 }
224                 break;
225             }
226             case '\n':
227             /* Fallthrough. */
228             case '\r':
229             {
230                 /* We want to count \r\n as a single line break. */
231                 if(peekAhead() == '\n')
232                     ++m_pos;
233
234                 m_columnOffset = m_pos;
235                 ++m_line;
236
237                 break;
238             }
239         }
240         ++m_pos;
241     }
242
243     return ERROR; /* Error: we reached the end while inside a comment. */
244 }
245
246 bool XQueryTokenizer::consumeRawWhitespace()
247 {
248     while(m_pos < m_length)
249     {
250         switch(peekCurrent())
251         {
252             case ' ':
253             case '\t':
254                 break;
255             case '\n':
256             case '\r':
257             {
258                 if(peekAhead() == '\n')
259                     ++m_pos;
260
261                 m_columnOffset = m_pos;
262                 ++m_line;
263
264                 break;
265             }
266             default:
267                 return false;
268         }
269         ++m_pos;
270     }
271     return true;
272 }
273
274 Tokenizer::TokenType XQueryTokenizer::consumeWhitespace()
275 {
276     while(m_pos < m_length)
277     {
278         switch(peekCurrent())
279         {
280             case ' ':
281             case '\t':
282                 break;
283             case '\n':
284             case '\r':
285             {
286                 /* We want to count \r\n as a single line break. */
287                 if(peekAhead() == '\n')
288                     ++m_pos;
289
290                 m_columnOffset = m_pos;
291                 ++m_line;
292
293                 break;
294             }
295             case '(':
296             {
297                 if(peekAhead() == ':')
298                 {
299                     m_pos += 2; /* Consume "(:" */
300
301                     const TokenType comment = consumeComment();
302                     if(comment == SUCCESS)
303                         continue;
304                     else
305                         return comment;
306                 }
307             }
308             default:
309                 return SUCCESS;
310         }
311         ++m_pos;
312     }
313
314     return END_OF_FILE;
315 }
316
317 char XQueryTokenizer::peekAhead(const int length) const
318 {
319     if(m_pos + length < m_length)
320         return m_data.at(m_pos + length).toLatin1();
321     else
322         return 0;
323 }
324
325 Tokenizer::Token XQueryTokenizer::error()
326 {
327     return Token(ERROR);
328 }
329
330 bool XQueryTokenizer::isDigit(const char ch)
331 {
332     return ch >= '0' && ch <= '9';
333 }
334
335 /* Replace with function in QXmlUtils. Write test cases for this. */
336 bool XQueryTokenizer::isNCNameStart(const QChar ch)
337 {
338     if(ch == QLatin1Char('_'))
339         return true;
340
341     switch(ch.category())
342     {
343         case QChar::Letter_Lowercase:
344         case QChar::Letter_Uppercase:
345         case QChar::Letter_Other:
346         case QChar::Letter_Titlecase:
347         case QChar::Number_Letter:
348             return true;
349         default:
350             return false;
351     }
352 }
353
354 bool XQueryTokenizer::isNCNameBody(const QChar ch)
355 {
356     switch(ch.unicode())
357     {
358         case '.':
359         case '_':
360         case '-':
361             return true;
362     }
363
364     switch(ch.category())
365     {
366         case QChar::Letter_Lowercase:
367         case QChar::Letter_Uppercase:
368         case QChar::Letter_Other:
369         case QChar::Letter_Titlecase:
370         case QChar::Number_Letter:
371         case QChar::Mark_SpacingCombining:
372         case QChar::Mark_Enclosing:
373         case QChar::Mark_NonSpacing:
374         case QChar::Letter_Modifier:
375         case QChar::Number_DecimalDigit:
376             return true;
377         default:
378             return false;
379     }
380 }
381
382 bool XQueryTokenizer::isPhraseKeyword(const TokenType code)
383 {
384     switch(code)
385     {
386         /* Fallthrough all these. */
387         case CASTABLE:
388         case CAST:
389         case COPY_NAMESPACES:
390         case DECLARE:
391         case EMPTY:
392         case MODULE:
393         case IMPORT:
394         case INSTANCE:
395         case ORDER:
396         case ORDERING:
397         case XQUERY:
398         case STABLE:
399         case TREAT:
400             return true;
401         default:
402             return false;
403     }
404 }
405
406 bool XQueryTokenizer::isOperatorKeyword(const TokenType code)
407 {
408     switch(code)
409     {
410         /* Fallthrough all these. */
411         case AS:
412         case ASCENDING:
413         case AT:
414         case CASE:
415         case CAST:
416         case CASTABLE:
417         case EQ:
418         case EXTERNAL:
419         case GE:
420         case G_EQ:
421         case G_GT:
422         case G_LT:
423         case G_NE:
424         case GT:
425         case IN:
426         case INHERIT:
427         case INSTANCE:
428         case IS:
429         case ITEM:
430         case LE:
431         case LT:
432         case NE:
433         case NO_INHERIT:
434         case NO_PRESERVE:
435         case OF:
436         case PRESERVE:
437         case RETURN:
438         case STABLE:
439         case TO:
440         case TREAT:
441             return true;
442         default:
443             return false;
444     };
445 }
446
447 bool XQueryTokenizer::isTypeToken(const TokenType t)
448 {
449     switch(t)
450     {
451         /* Fallthrough all these. */
452         case ATTRIBUTE:
453         case COMMENT:
454         case DOCUMENT:
455         case DOCUMENT_NODE:
456         case ELEMENT:
457         case ITEM:
458         case NODE:
459         case PROCESSING_INSTRUCTION:
460         case SCHEMA_ATTRIBUTE:
461         case SCHEMA_ELEMENT:
462         case TEXT:
463             return true;
464         default:
465             return false;
466     }
467 }
468
469 Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName()
470 {
471     const int start = m_pos;
472
473     const Token t1 = tokenizeNCName();
474     if(t1.hasError())
475         return t1;
476
477     if(peekCurrent() != ':' || peekAhead() == '=')
478         return t1;
479
480     ++m_pos;
481
482     const Token t2 = tokenizeNCName();
483     if(t2.hasError())
484         return t2;
485     else
486         return Token(QNAME, m_data.mid(start, m_pos - start));
487 }
488
489 Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral()
490 {
491     setState(Operator);
492     const int startPos = m_pos;
493     bool hasDot = false;
494     bool isXPath20 = false;
495
496     for(; m_pos < m_length; ++m_pos)
497     {
498         QChar ch(current());
499
500         char cell = ch.cell();
501
502         if(cell == 'e' || cell == 'E')
503         {
504             isXPath20 = true;
505             ++m_pos;
506             ch = current();
507
508             if(ch.row() != 0)
509                 break;
510
511             cell = ch.cell();
512
513             if(cell == '+' || cell == '-')
514                 continue;
515         }
516
517         if(isNCNameStart(ch))
518             return error();
519
520         if(cell < '0' || cell > '9')
521         {
522             if(cell == '.' && !hasDot)
523                 hasDot = true;
524             else
525                 break;
526         }
527     }
528
529     return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
530 }
531
532 QString XQueryTokenizer::tokenizeCharacterReference()
533 {
534     Q_ASSERT(peekCurrent() == '&');
535
536     const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
537
538     if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
539         return QString();
540
541     QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
542     m_pos = theEnd;
543
544     const QChar charRef(charForReference(content));
545
546     if(!charRef.isNull())
547         return charRef;
548     else if(content.startsWith(QLatin1Char('#')))
549     {
550         int base;
551
552         /* It is only '#' or '#x'. */
553         if(content.length() < 2)
554             return QString();
555
556         /* We got a hex number if it starts with 'x', otherwise it's a decimal. */
557         if(content.at(1) == QLatin1Char('x'))
558         {
559             base = 16;
560             content = content.mid(2); /* Remove "#x". */
561         }
562         else
563         {
564             base = 10;
565             content = content.mid(1); /* Remove "#". */
566         }
567
568         bool conversionOK = false;
569         const int codepoint = content.toInt(&conversionOK, base);
570
571         if(conversionOK)
572         {
573             const QChar ch(codepoint);
574
575             if(ch.isNull())
576             {
577                 /* We likely have something which require surrogate pairs. */
578                 QString result;
579                 result += QChar(QChar::highSurrogate(codepoint));
580                 result += QChar(QChar::lowSurrogate(codepoint));
581                 return result;
582             }
583             else
584                 return ch;
585         }
586         else
587             return QString();
588     }
589     else
590         return QString();
591 }
592
593 int XQueryTokenizer::scanUntil(const char *const content)
594 {
595     const int end = m_data.indexOf(QString::fromLatin1(content), m_pos);
596
597     if(end == -1)
598         return -1;
599     else
600     {
601         const int len = end - m_pos;
602         m_pos += len;
603         return len;
604     }
605 }
606
607 QChar XQueryTokenizer::charForReference(const QString &reference)
608 {
609     if(m_charRefs.isEmpty())
610     {
611         /* Initialize. */
612         m_charRefs.reserve(5);
613         m_charRefs.insert(QLatin1String("lt"),     QLatin1Char('<'));
614         m_charRefs.insert(QLatin1String("gt"),     QLatin1Char('>'));
615         m_charRefs.insert(QLatin1String("amp"),    QLatin1Char('&'));
616         m_charRefs.insert(QLatin1String("quot"),   QLatin1Char('"'));
617         m_charRefs.insert(QLatin1String("apos"),   QLatin1Char('\''));
618     }
619
620     return m_charRefs.value(reference);
621 }
622
623 Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral()
624 {
625     const QChar delimiter(current());
626     /* We cannot unfortunately just scan and then do mid(),
627      * since we can encounter character references. */
628     QString result;
629
630     /* This is more likely than QString's default allocation. */
631     result.reserve(8);
632
633     CharacterSkips skipEOLNormalization;
634
635     /* Advance over the initial quote character. */
636     ++m_pos;
637
638     for(; m_pos < m_length; ++m_pos)
639     {
640         const QChar c(current());
641
642         if(c == QLatin1Char('&'))
643         {
644             const QString charRef(tokenizeCharacterReference());
645
646             if(charRef.isNull())
647                 return error();
648             else
649             {
650                 skipEOLNormalization.insert(result.count());
651                 result.append(charRef);
652             }
653
654         }
655         else if(c == delimiter)
656         {
657             /* Maybe the escaping mechanism is used. For instance, "s""s"
658              * has the value `s"s'. */
659             ++m_pos;
660
661             if(current() == delimiter) /* Double quote. */
662                 result += delimiter;
663             else
664                 return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
665         }
666         else
667             result += c;
668     }
669
670     return error();
671 }
672
673 Tokenizer::Token XQueryTokenizer::tokenizeNCName()
674 {
675     const int startPos = m_pos;
676
677     if(m_pos < m_length && isNCNameStart(current()))
678     {
679         ++m_pos;
680
681         for(; m_pos < m_length; ++m_pos)
682         {
683             if(!isNCNameBody(current()))
684                 break;
685         }
686
687         return Token(NCNAME, m_data.mid(startPos, m_pos - startPos));
688     }
689     else
690         return error();
691 }
692
693 bool XQueryTokenizer::aheadEquals(const char *const chs,
694                                   const int len,
695                                   const int offset) const
696 {
697     Q_ASSERT(len > 0);
698     Q_ASSERT(qstrlen(chs) == uint(len));
699
700     if(m_pos + len >= m_length)
701         return false;
702
703     for(int i = offset; i < (len + offset); ++i)
704     {
705         if(m_data.at(m_pos + i).toLatin1() != chs[i - offset])
706             return false;
707     }
708
709     return true;
710 }
711
712 const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword)
713 {
714     return TokenLookup::value(keyword.toLatin1().constData(), keyword.length());
715 }
716
717 XQueryTokenizer::State XQueryTokenizer::state() const
718 {
719     return m_state;
720 }
721
722 void XQueryTokenizer::setState(const State s)
723 {
724     m_state = s;
725 }
726
727 void XQueryTokenizer::pushState(const State s)
728 {
729     m_stateStack.push(s);
730 }
731
732 void XQueryTokenizer::pushState()
733 {
734     m_stateStack.push(m_state);
735 }
736
737 void XQueryTokenizer::popState()
738 {
739     /* QStack::pop() asserts if it's empty, so we need to check
740      * it, since we might receive unbalanced curlies. */
741     if(!m_stateStack.isEmpty())
742         m_state = m_stateStack.pop();
743 }
744
745 Tokenizer::Token XQueryTokenizer::nextToken()
746 {
747     switch(state())
748     {
749         /* We want to skip or do special whitespace handling for these
750          * states. So fallthrough all of the following. */
751         case AposAttributeContent:
752         case Axis:
753         case ElementContent:
754         case EndTag:
755         case Pragma:
756         case PragmaContent:
757         case ProcessingInstructionName:
758         case QuotAttributeContent:
759         case StartTag:
760         case XMLComment:
761             break;
762         default:
763             handleWhitespace();
764     }
765
766     switch(state())
767     {
768         case XMLSpaceDecl:
769         /* Fallthrough. */
770         case NamespaceKeyword:
771         {
772             switch(peekCurrent())
773             {
774                 case ',':
775                     return tokenAndAdvance(COMMA);
776                 case '"':
777                 /* Fallthrough. */
778                 case '\'':
779                 {
780                     setState(NamespaceDecl);
781                     return tokenizeStringLiteral();
782                 }
783             }
784
785             const Token id(tokenizeNCName());
786
787             if(id.type != NCNAME)
788                 return id;
789
790             const TokenMap *const keyword = lookupKeyword(id.value);
791             if(keyword)
792             {
793                 switch(keyword->token)
794                 {
795                     case INHERIT:
796                     /* Fallthrough. */
797                     case NO_INHERIT:
798                     {
799                         setState(Default);
800                         break;
801                     }
802                     case NAMESPACE:
803                     {
804                         setState(NamespaceDecl);
805                         break;
806                     }
807                     case ORDERED:
808                     /* Fallthrough. */
809                     case UNORDERED:
810                     /* Fallthrough. */
811                     case STRIP:
812                     {
813                         setState(Default);
814                         break;
815                     }
816                     case PRESERVE:
817                     {
818                         if(state() != NamespaceKeyword)
819                             setState(Default);
820                     }
821                     default:
822                         break;
823                 }
824
825                 return Token(keyword->token);
826             }
827             else
828                 return id;
829
830             Q_ASSERT(false);
831         }
832         case NamespaceDecl:
833         {
834             switch(peekCurrent())
835             {
836                 case '=':
837                     return tokenAndAdvance(G_EQ);
838                 case ';':
839                     return tokenAndChangeState(SEMI_COLON, Default);
840                 case '\'':
841                 /* Fallthrough. */
842                 case '\"':
843                     return tokenizeStringLiteral();
844             }
845
846             const Token nc(tokenizeNCName());
847
848             handleWhitespace();
849
850             const char pc = peekCurrent();
851             const TokenMap* const t = lookupKeyword(nc.value);
852
853             if(pc == '\'' || (pc == '"' && t))
854                 return tokenAndChangeState(t->token, Default, 0);
855             else
856                 return nc;
857
858             Q_ASSERT(false);
859         }
860         case Axis:
861         {
862             if(peekCurrent() == ':')
863             {
864                 Q_ASSERT(peekAhead() == ':');
865                 m_pos += 2;
866                 setState(AfterAxisSeparator);
867                 return Token(COLONCOLON);
868             }
869             /* Fallthrough. */
870         }
871         case AfterAxisSeparator:
872         /* Fallthrough. */
873         case Default:
874            /* State Operator and state Default have a lot of tokens in common except
875             * for minor differences. So we treat them the same way, and sprinkles logic
876             * here and there to handle the small differences. */
877         /* Fallthrough. */
878         case Operator:
879         {
880             switch(peekCurrent())
881             {
882                 case '=':
883                     return tokenAndChangeState(G_EQ, Default);
884                 case '-':
885                     return tokenAndChangeState(MINUS, Default);
886                 case '+':
887                     return tokenAndChangeState(PLUS, Default);
888                 case '[':
889                     return tokenAndChangeState(LBRACKET, Default);
890                 case ']':
891                     return tokenAndChangeState(RBRACKET, Operator);
892                 case ',':
893                     return tokenAndChangeState(COMMA, Default);
894                 case ';':
895                     return tokenAndChangeState(SEMI_COLON, Default);
896                 case '$':
897                     return tokenAndChangeState(DOLLAR, VarName);
898                 case '|':
899                     return tokenAndChangeState(BAR, Default);
900                 case '?':
901                     return tokenAndChangeState(QUESTION, Operator);
902                 case ')':
903                     return tokenAndChangeState(RPAREN, Operator);
904                 case '@':
905                     return tokenAndChangeState(AT_SIGN, Default);
906                 /* Fallthrough all these. */
907                 case '1':
908                 case '2':
909                 case '3':
910                 case '4':
911                 case '5':
912                 case '6':
913                 case '7':
914                 case '8':
915                 case '9':
916                 case '0':
917                     return tokenizeNumberLiteral();
918                 case '.':
919                 {
920                     const char next = peekAhead();
921                     if(next == '.')
922                         return tokenAndChangeState(DOTDOT, Operator, 2);
923                     /* .5 is allowed, as short form for 0.5:
924                      * <tt>[142]     DecimalLiteral     ::=     ("." Digits) | (Digits "." [0-9]*)</tt>
925                      */
926                     else if(isDigit(next))
927                         return tokenizeNumberLiteral();
928                     else
929                         return tokenAndChangeState(DOT, Operator);
930                 }
931                 case '\'':
932                 /* Fallthrough. */
933                 case '"':
934                 {
935                     setState(Operator);
936                     return tokenizeStringLiteral();
937
938                 }
939                 case '(':
940                 {
941                     if(peekAhead() == '#')
942                         return tokenAndChangeState(PRAGMA_START, Pragma, 2);
943                     else
944                         return tokenAndChangeState(LPAREN, Default);
945                 }
946                 case '*':
947                 {
948                     if(peekAhead() == ':')
949                     {
950                         m_pos += 2; /* Consume *:. */
951                         const Token nc = tokenizeNCName();
952
953                         if(nc.hasError())
954                             return error();
955                         else
956                             return tokenAndChangeState(ANY_PREFIX, nc.value, Operator);
957                     }
958                     else
959                         return tokenAndChangeState(STAR, state() == Default ? Operator : Default);
960                 }
961                 case ':':
962                 {
963                     switch(peekAhead())
964                     {
965                         case '=':
966                             return tokenAndChangeState(ASSIGN, Default, 2);
967                         case ':':
968                             return tokenAndChangeState(COLONCOLON, Default, 2);
969                         default:
970                             return error();
971                     }
972                 }
973                 case '!':
974                 {
975                     if(peekAhead() == '=')
976                         return tokenAndChangeState(G_NE, Default, 2);
977                     else
978                         return error();
979                 }
980                 case '<':
981                 {
982                     switch(peekAhead())
983                     {
984                         case '=':
985                             return tokenAndChangeState(G_LE, Default, 2);
986                         case '<':
987                             return tokenAndChangeState(PRECEDES, Default, 2);
988                         case '?':
989                         {
990                             pushState(Operator);
991                             return tokenAndChangeState(PI_START, ProcessingInstructionName, 2);
992                         }
993                         case '!':
994                         {
995                             if(aheadEquals("!--", 3))
996                             {
997                                 m_pos += 3; /* Consume "!--". */
998                                 pushState(Operator);
999                                 return tokenAndChangeState(COMMENT_START, XMLComment);
1000                             }
1001                             /* Fallthrough. It's a syntax error, and this is a good way to report it. */
1002                         }
1003                         default:
1004                         {
1005                             if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1)))
1006                             {
1007                                 /* We assume it's an element constructor. */
1008                                 pushState(Operator);
1009                             }
1010
1011                             return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag);
1012                         }
1013                     }
1014                 }
1015                 case '>':
1016                 {
1017                     switch(peekAhead())
1018                     {
1019                         case '=':
1020                             return tokenAndChangeState(G_GE, Default, 2);
1021                         case '>':
1022                             return tokenAndChangeState(FOLLOWS, Default, 2);
1023                         default:
1024                             return tokenAndChangeState(G_GT, Default);
1025                     }
1026                 }
1027                 case '/':
1028                 {
1029                     if(peekAhead() == '/')
1030                         return tokenAndChangeState(SLASHSLASH, Default, 2);
1031                     else
1032                         return tokenAndChangeState(SLASH, Default);
1033                 }
1034                 case '{':
1035                 {
1036                     pushState(Operator);
1037                     return tokenAndChangeState(CURLY_LBRACE, Default);
1038                 }
1039                 case '}':
1040                 {
1041                     popState();
1042
1043                     return tokenAndAdvance(CURLY_RBRACE);
1044                 }
1045             }
1046
1047             /* Ok. We're in state Default or Operator, and it wasn't a simple
1048              * character. */
1049
1050             const Token id(tokenizeNCName());
1051
1052             if(id.type != NCNAME)
1053                 return id;
1054
1055             const TokenMap *const keyword = lookupKeyword(id.value);
1056
1057             if(state() == Operator)
1058             {
1059                 if(keyword)
1060                 {
1061                     if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING)
1062                         setState(Operator);
1063                     else if(keyword->token == RETURN)
1064                         setState(Default);
1065                     else if(isPhraseKeyword(keyword->token))
1066                     {
1067                         const TokenType ws = consumeWhitespace();
1068                         if(ws == ERROR)
1069                             return error();
1070
1071                         const Token id2(tokenizeNCName());
1072                         const TokenMap *const keyword2 = lookupKeyword(id2.value);
1073
1074                         if(keyword2)
1075                         {
1076                             if(keyword->token == TREAT && keyword2->token == AS)
1077                                 setState(ItemType);
1078                             else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY)
1079                                 setState(Default);
1080
1081                             m_tokenStack.push(Token(keyword2->token));
1082                         }
1083                         else
1084                             m_tokenStack.push(id2);
1085
1086                         return Token(keyword->token);
1087                     }
1088                     else
1089                     {
1090                         /* Such that we tokenize the second token in "empty greatest". */
1091                         if(keyword->token != EMPTY)
1092                             setState(Default);
1093                     }
1094
1095                     if(keyword->token == AS || keyword->token == CASE)
1096                         setState(ItemType);
1097
1098                     return Token(keyword->token);
1099                 }
1100                 else
1101                     return id;
1102             }
1103
1104             Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator);
1105
1106             /*
1107              * This is hard. Consider this:
1108              *
1109              * Valid:           child       ::nameTest
1110              * Valid:           child::     nameTest
1111              * Syntax Error:    child       :localName
1112              * Syntax Error:    child:      localName
1113              *
1114              * Consider "child ::name". Right now, we're here:
1115              *                ^
1116              * We don't know whether "child" is a prefix and hence the whitespace is invalid,
1117              * or whether it's an axis and hence skippable. */
1118             {
1119                 const int wsLength = peekForColonColon();
1120                 /* We cannot call handleWhitespace() because it returns on
1121                  * END_OF_FILE, and we have parsed up keyword, and we need to
1122                  * deal with that.
1123                  *
1124                  * If we have a colon colon, which means the whitespace is
1125                  * allowed, we skip it. */
1126                 if(wsLength != -1)
1127                     m_pos += wsLength;
1128             }
1129
1130             /* Handle name tests. */
1131             if(peekCurrent() == ':')
1132             {
1133                 switch(peekAhead())
1134                 {
1135                     case '=':
1136                         return id;
1137                     case '*':
1138                     {
1139                         m_pos += 2;
1140                         return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator);
1141                     }
1142                     case ':':
1143                     {
1144                         /* We have an axis. */
1145                         setState(Axis);
1146                         return keyword ? Token(keyword->token) : id;
1147                     }
1148                     default:
1149                     {
1150                         /* It's a QName. */
1151                         ++m_pos; /* Consume the colon. */
1152
1153                         const Token id2(tokenizeNCName());
1154
1155                         if(id2.type != NCNAME)
1156                         {
1157                             --m_pos;
1158                             return id;
1159                         }
1160
1161                         setState(Operator);
1162                         const int qNameLen = id.value.length() + id2.value.length() + 1;
1163                         return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen));
1164                     }
1165                 }
1166             }
1167
1168             if(!keyword || isOperatorKeyword(keyword->token))
1169             {
1170                 setState(Operator);
1171                 return id;
1172             }
1173
1174             const TokenType ws = consumeWhitespace();
1175             if(ws == ERROR) // TODO this should test for success. Write test.
1176                 return Token(ERROR);
1177
1178             if(atEnd())
1179             {
1180                 setState(Operator);
1181                 return id;
1182             }
1183
1184             /* Let the if-body apply for constructors, and node type tests. */
1185             if(isTypeToken(keyword->token) ||
1186                keyword->token == TYPESWITCH ||
1187                keyword->token == ORDERED ||
1188                keyword->token == UNORDERED ||
1189                keyword->token == IF)
1190             {
1191                 switch(peekCurrent())
1192                 {
1193                     case '(':
1194                     {
1195                         // TODO See if we can remove DOCUMENT from isTypeToken.
1196                         if(isTypeToken(keyword->token) && keyword->token != DOCUMENT)
1197                         {
1198                             m_tokenStack.push(Token(LPAREN));
1199                             ++m_pos; /* Consume '('. */
1200                             pushState(Operator);
1201
1202                             if(keyword->token == PROCESSING_INSTRUCTION)
1203                                 setState(KindTestForPI);
1204                             else
1205                                 setState(KindTest);
1206
1207                             return Token(keyword->token);
1208                         }
1209                         else if(keyword->token == TYPESWITCH || keyword->token == IF)
1210                             return Token(keyword->token);
1211                         else /* It's a function call. */
1212                             return id;
1213                     }
1214                     case '{':
1215                     {
1216                         m_tokenStack.push(Token(CURLY_LBRACE));
1217                         ++m_pos; /* Consume '{'. */
1218                         pushState(Operator);
1219                         /* Stay in state Default. */
1220                         return Token(keyword->token);
1221                     }
1222                     default:
1223                     {
1224                         /* We have read in a token which is for instance
1225                          * "return", and now it can be an element
1226                          * test("element") a node kind test("element()"), or a
1227                          * computed element constructor("element name {...").
1228                          * We need to do a two-token lookahead here, because
1229                          * "element return" can be an element test followed by
1230                          * the return keyword, but it can also be an element
1231                          * constructor("element return {"). */
1232                         if(isNCNameStart(current()))
1233                         {
1234                             const int currentPos = m_pos;
1235                             const Token token2 = tokenizeNCNameOrQName();
1236
1237                             if(token2.hasError())
1238                                 return token2;
1239
1240                             handleWhitespace();
1241
1242                             if(peekCurrent() == '{')
1243                             {
1244                                 /* An element constructor. */
1245                                 m_tokenStack.push(token2);
1246                                 return Token(keyword->token);
1247                             }
1248
1249                             /* We jump back in the stream, we need to tokenize token2 according
1250                              * to the state. */
1251                             m_pos = currentPos;
1252                             setState(Operator);
1253                             return Token(NCNAME, QLatin1String(keyword->name));
1254                         }
1255                     }
1256                 }
1257             }
1258
1259             if(peekCurrent() == '$')
1260             {
1261                 setState(VarName);
1262                 return Token(keyword->token);
1263             }
1264
1265             /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
1266             if(peekCurrent() == '(')
1267                 return id;
1268             else if(peekCurrent() == '{' && keyword->token == VALIDATE)
1269                 return Token(keyword->token);
1270
1271             if(!isNCNameStart(current()))
1272             {
1273                 setState(Operator);
1274                 return id;
1275             }
1276
1277             const Token id2(tokenizeNCName());
1278             const TokenMap *const keyword2 = lookupKeyword(id2.value);
1279
1280             if(!keyword2)
1281             {
1282                 /* It's a syntax error. All cases of two subsequent ncnames are keywords (e.g., declarations). */
1283                 setState(Operator);
1284                 return id;
1285             }
1286
1287             switch(keyword->token)
1288             {
1289                 case DECLARE:
1290                 {
1291                     switch(keyword2->token)
1292                     {
1293                         case VARIABLE:
1294                         /* Fallthrough. */
1295                         case FUNCTION:
1296                         {
1297                             m_tokenStack.push(Token(keyword2->token));
1298                             setState(Default);
1299                             return Token(keyword->token);
1300                         }
1301                         case OPTION:
1302                         {
1303                             m_tokenStack.push(Token(keyword2->token));
1304                             setState(Default);
1305                             return Token(keyword->token);
1306                         }
1307                         case COPY_NAMESPACES:
1308                         /* Fallthrough. */
1309                         case ORDERING:
1310                         {
1311                             m_tokenStack.push(Token(keyword2->token));
1312                             setState(NamespaceKeyword);
1313                             return Token(keyword->token);
1314                         }
1315                         case CONSTRUCTION:
1316                         {
1317                             // TODO identical to CONSTRUCTION?
1318                             m_tokenStack.push(Token(keyword2->token));
1319                             setState(Operator);
1320                             return Token(keyword->token);
1321                         }
1322                         case NAMESPACE:
1323                         /* Fallthrough. */
1324                         case BASEURI:
1325                         {
1326                             m_tokenStack.push(Token(keyword2->token));
1327                             setState(NamespaceDecl);
1328                             return Token(keyword->token);
1329                         }
1330                         case BOUNDARY_SPACE:
1331                         {
1332                             m_tokenStack.push(Token(keyword2->token));
1333                             setState(XMLSpaceDecl);
1334                             return Token(keyword->token);
1335                         }
1336                         case DEFAULT:
1337                         {
1338                             m_tokenStack.push(Token(keyword2->token));
1339
1340                             const TokenType ws2 = consumeWhitespace();
1341                             if(ws2 != SUCCESS)
1342                             {
1343                                 m_tokenStack.prepend(Token(ws2));
1344                                 return Token(keyword->token);
1345                             }
1346
1347                             const Token id3(tokenizeNCName());
1348
1349                             if(id3.type != NCNAME)
1350                             {
1351                                 m_tokenStack.prepend(id3);
1352                                 return Token(keyword->token);
1353                             }
1354
1355                             const TokenMap *const keyword3 = lookupKeyword(id3.value);
1356                             if(!keyword3)
1357                             {
1358                                 m_tokenStack.prepend(id3);
1359                                 return Token(keyword->token);
1360                             }
1361                             else
1362                             {
1363                                 m_tokenStack.prepend(Token(keyword3->token));
1364
1365                                 if(keyword3->token == ORDER)
1366                                     setState(Operator);
1367                                 else
1368                                     setState(NamespaceDecl);
1369                             }
1370
1371                             return Token(keyword->token);
1372                         }
1373                         default:
1374                         {
1375                             m_tokenStack.push(Token(keyword2->token));
1376                             setState(Default);
1377                             return id;
1378                         }
1379                     }
1380                 }
1381                 case XQUERY:
1382                 {
1383                     m_tokenStack.push(Token(keyword2->token));
1384
1385                     if(keyword2->token == VERSION)
1386                     {
1387                         setState(NamespaceDecl);
1388                         return Token(keyword->token);
1389                     }
1390                     else
1391                     {
1392                         setState(Operator);
1393                         return id;
1394                     }
1395                 }
1396                 case IMPORT:
1397                 {
1398                     m_tokenStack.push(Token(keyword2->token));
1399
1400                     switch(keyword2->token)
1401                     {
1402                         case SCHEMA:
1403                         /* Fallthrough. */
1404                         case MODULE:
1405                         {
1406                             setState(NamespaceKeyword);
1407                             return Token(keyword->token);
1408                         }
1409                         default:
1410                         {
1411                             setState(Operator);
1412                             return id;
1413                         }
1414                     }
1415                 }
1416                 case VALIDATE:
1417                 {
1418                     m_tokenStack.push(Token(keyword2->token));
1419
1420                     switch(keyword2->token)
1421                     {
1422                         case LAX:
1423                         case STRICT:
1424                         {
1425                             pushState(Operator);
1426                             return Token(keyword->token);
1427                         }
1428                         default:
1429                         {
1430                             setState(Operator);
1431                             return id;
1432                         }
1433                     }
1434                 }
1435                 default:
1436                 {
1437                     m_tokenStack.push(Token(keyword2->token));
1438                     setState(Operator);
1439                     return id;
1440                 }
1441             }
1442
1443             Q_ASSERT(false);
1444
1445         }
1446         case VarName:
1447         {
1448             if(peekCurrent() == '$')
1449                 return tokenAndAdvance(DOLLAR);
1450
1451             setState(Operator);
1452             return tokenizeNCNameOrQName();
1453             Q_ASSERT(false);
1454         }
1455         case ItemType:
1456         {
1457             switch(peekCurrent())
1458             {
1459                 case '(':
1460                     return tokenAndChangeState(LPAREN, KindTest);
1461                 case '$':
1462                     return tokenAndChangeState(DOLLAR, VarName);
1463             }
1464
1465             const Token name(tokenizeNCNameOrQName());
1466
1467             if(name.hasError())
1468                 return error();
1469
1470             else if(name.type == QNAME)
1471             {
1472                 setState(OccurrenceIndicator);
1473                 return name;
1474             }
1475             else
1476             {
1477                 const TokenMap *const keyword = lookupKeyword(name.value);
1478
1479                 if(keyword)
1480                 {
1481                     pushState(OccurrenceIndicator);
1482                     return Token(keyword->token);
1483                 }
1484                 else
1485                 {
1486                     setState(Default);
1487                     return name;
1488                 }
1489             }
1490             Q_ASSERT(false);
1491         }
1492         case KindTest:
1493         {
1494             switch(peekCurrent())
1495             {
1496                 case ')':
1497                 {
1498                     popState();
1499                     return tokenAndAdvance(RPAREN);
1500                 }
1501                 case '(':
1502                     return tokenAndAdvance(LPAREN);
1503                 case ',':
1504                     return tokenAndAdvance(COMMA);
1505                 case '*':
1506                     return tokenAndAdvance(STAR);
1507                 case '?':
1508                     return tokenAndAdvance(QUESTION);
1509                 case '\'':
1510                 /* Fallthrough. */
1511                 case '"':
1512                     return tokenizeStringLiteral();
1513             }
1514
1515             const Token nc(tokenizeNCNameOrQName());
1516             if(nc.hasError())
1517                 return nc;
1518
1519             const TokenType ws = consumeWhitespace();
1520             if(ws == ERROR)
1521                 return error();
1522
1523             if(peekCurrent() == '(')
1524             {
1525                 const TokenMap *const keyword = lookupKeyword(nc.value);
1526                 if(keyword)
1527                 {
1528                     pushState(KindTest);
1529                     return Token(keyword->token);
1530                 }
1531                 else
1532                     return nc;
1533             }
1534             else
1535                 return nc;
1536             Q_ASSERT(false);
1537         }
1538         case KindTestForPI:
1539         {
1540             switch(peekCurrent())
1541             {
1542                 case ')':
1543                 {
1544                     popState();
1545                     return tokenAndAdvance(RPAREN);
1546                 }
1547                 case '\'':
1548                 /* Fallthrough. */
1549                 case '"':
1550                     return tokenizeStringLiteral();
1551                 default:
1552                     return tokenizeNCName();
1553             }
1554             Q_ASSERT(false);
1555         }
1556         case OccurrenceIndicator:
1557         {
1558             switch(peekCurrent())
1559             {
1560                 case '?':
1561                     return tokenAndChangeState(QUESTION, Operator);
1562                 case '*':
1563                     return tokenAndChangeState(STAR, Operator);
1564                 case '+':
1565                     return tokenAndChangeState(PLUS, Operator);
1566                 default:
1567                 {
1568                     setState(Operator);
1569                     return nextToken();
1570                 }
1571             }
1572             Q_ASSERT(false);
1573         }
1574         case XQueryVersion:
1575         {
1576             switch(peekCurrent())
1577             {
1578                 case '\'':
1579                 /* Fallthrough. */
1580                 case '"':
1581                     return tokenizeStringLiteral();
1582                 case ';':
1583                     return tokenAndChangeState(SEMI_COLON, Default);
1584             }
1585
1586             const Token id(tokenizeNCName());
1587
1588             if(id.type != NCNAME)
1589                 return id;
1590
1591             const TokenMap *const keyword = lookupKeyword(id.value);
1592             if(keyword)
1593                 return tokenAndChangeState(keyword->token, Default);
1594             else
1595                 return id;
1596             Q_ASSERT(false);
1597         }
1598         case StartTag:
1599         {
1600             if(peekAhead(-1) == '<')
1601             {
1602                 if(current().isSpace())
1603                     return Token(ERROR);
1604             }
1605             else
1606             {
1607                 if(consumeRawWhitespace())
1608                     return Token(END_OF_FILE);
1609             }
1610
1611             switch(peekCurrent())
1612             {
1613                 case '/':
1614                 {
1615                     if(peekAhead() == '>')
1616                     {
1617                         m_pos += 2;
1618
1619                         if(m_scanOnly)
1620                             return Token(POSITION_SET);
1621                         else
1622                         {
1623                             popState();
1624                             return Token(QUICK_TAG_END);
1625                         }
1626                     }
1627                     else
1628                         return error();
1629                 }
1630                 case '>':
1631                 {
1632                     if(m_scanOnly)
1633                         return tokenAndChangeState(POSITION_SET, StartTag);
1634                     else
1635                         return tokenAndChangeState(G_GT, ElementContent);
1636                 }
1637                 case '=':
1638                     return tokenAndAdvance(G_EQ);
1639                 case '\'':
1640                     return tokenAndChangeState(APOS, AposAttributeContent);
1641                 case '"':
1642                     return tokenAndChangeState(QUOTE, QuotAttributeContent);
1643                 default:
1644                     return tokenizeNCNameOrQName();
1645             }
1646             Q_ASSERT(false);
1647         }
1648         case AposAttributeContent:
1649         /* Fallthrough. */
1650         case QuotAttributeContent:
1651         {
1652             const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
1653             QString result;
1654             result.reserve(20);
1655
1656             if(m_scanOnly)
1657             {
1658                 int stack = 0;
1659                 return attributeAsRaw(sep, stack, m_pos, true, result);
1660             }
1661
1662             Q_ASSERT(!m_scanOnly);
1663             while(true)
1664             {
1665                 if(atEnd())
1666                 {
1667                     /* In the case that the XSL-T tokenizer invokes us with
1668                      * default state QuotAttributeContent, we need to be able
1669                      * to return a single string, in case that is all we have
1670                      * accumulated. */
1671                     if(result.isEmpty())
1672                         return Token(END_OF_FILE);
1673                     else
1674                         return Token(STRING_LITERAL, result);
1675                 }
1676
1677                 const QChar curr(current());
1678
1679                 if(curr == sep)
1680                 {
1681                     if(m_pos + 1 == m_length)
1682                         return Token(END_OF_FILE);
1683
1684                     if(m_data.at(m_pos + 1) == sep)
1685                     {
1686                         /* The quoting mechanism was used. */
1687                         m_pos += 2;
1688                         result.append(sep);
1689                         continue;
1690                     }
1691
1692                     const QChar next(m_data.at(m_pos + 1));
1693                     if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
1694                         return Token(ERROR); // i18n Space must separate attributes
1695                     else if(result.isEmpty())
1696                     {
1697                         return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE,
1698                                                    StartTag, 1);
1699                     }
1700                     else
1701                     {
1702                         /* Don't consume the sep, but leave it so we next time return a token for it. */
1703                         return Token(STRING_LITERAL, result);
1704                     }
1705
1706                     ++m_pos;
1707                     continue;
1708                 }
1709                 else if(curr == QLatin1Char('{'))
1710                 {
1711                     if(m_pos + 1 == m_length)
1712                         return Token(END_OF_FILE);
1713                     else if(peekAhead() == '{')
1714                     {
1715                         ++m_pos;
1716                         result.append(QLatin1Char('{'));
1717                     }
1718                     else
1719                     {
1720                         if(result.isEmpty())
1721                         {
1722                             /* The Attribute Value Template appeared directly in the attribute. */
1723                             pushState();
1724                             return tokenAndChangeState(CURLY_LBRACE, Default);
1725                         }
1726                         else
1727                         {
1728                             /* We don't advance, keep '{' as next token. */
1729                             return Token(STRING_LITERAL, result);
1730                         }
1731                     }
1732                 }
1733                 else if(curr == QLatin1Char('}'))
1734                 {
1735                     if(m_pos + 1 == m_length)
1736                         return Token(END_OF_FILE);
1737                     else if(peekAhead() == '}')
1738                     {
1739                         ++m_pos;
1740                         result.append(QLatin1Char('}'));
1741                     }
1742                     else
1743                         return Token(ERROR);
1744                 }
1745                 else if(curr == QLatin1Char('&'))
1746                 {
1747                     const QString ret(tokenizeCharacterReference());
1748                     if(ret.isNull())
1749                         return Token(ERROR);
1750                     else
1751                         result.append(ret);
1752                 }
1753                 else if(curr == QLatin1Char('<'))
1754                     return Token(STRING_LITERAL, result);
1755                 else
1756                 {
1757                     /* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
1758                      * 3.3.3 Attribute-Value Normalization.
1759                      *
1760                      * However, it is complicated a bit by the fact that AVN is defined on top of
1761                      * EOL normalization, and we do both in one go here. */
1762                     switch(curr.unicode())
1763                     {
1764                         case 0xD:
1765                         {
1766                             if(peekAhead() == '\n')
1767                             {
1768                                 result.append(QLatin1Char(' '));
1769                                 ++m_pos;
1770                                 break;
1771                             }
1772                         }
1773                         case 0xA:
1774                         /* Fallthrough. */
1775                         case 0x9:
1776                         {
1777                             result.append(QLatin1Char(' '));
1778                             break;
1779                         }
1780                         default:
1781                             result.append(curr);
1782                     }
1783                 }
1784
1785                 ++m_pos;
1786             }
1787             Q_ASSERT(false);
1788         }
1789         case ElementContent:
1790         {
1791             QString result;
1792             result.reserve(20);
1793
1794             /* Whether the text node, result, may be whitespace only. Character references
1795              * and CDATA sections disable that. */
1796             bool mayBeWS = true;
1797
1798             CharacterSkips skipEOLNormalization;
1799
1800             while(true)
1801             {
1802                 if(atEnd())
1803                     return Token(END_OF_FILE);
1804
1805                 switch(peekCurrent())
1806                 {
1807                     case '<':
1808                     {
1809                         if(!result.isEmpty() && peekAhead(2) != '[')
1810                         {
1811                             /* We encountered the end, and it was not a CDATA section. */
1812                             /* We don't advance. Next time we'll handle the <... stuff. */
1813                             return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1814                         }
1815
1816                         ++m_pos;
1817                         if(atEnd())
1818                             return Token(END_OF_FILE);
1819
1820                         const QChar ahead(current());
1821                         if(ahead.isSpace())
1822                             return error();
1823                         else if(ahead == QLatin1Char('/'))
1824                         {
1825                             if(m_pos + 1 == m_length)
1826                                 return Token(END_OF_FILE);
1827                             else if(m_data.at(m_pos + 1).isSpace())
1828                                 return error();
1829                             else
1830                                 return tokenAndChangeState(BEGIN_END_TAG, EndTag);
1831                         }
1832                         else if(isNCNameStart(ahead))
1833                         {
1834                             pushState();
1835                             return tokenAndChangeState(G_LT, StartTag, 0);
1836                         }
1837                         else if(aheadEquals("!--", 3, 0))
1838                         {
1839                             pushState();
1840                             m_pos += 3;
1841                             return tokenAndChangeState(COMMENT_START, XMLComment, 0);
1842                         }
1843                         else if(aheadEquals("![CDATA[", 8, 0))
1844                         {
1845                             mayBeWS = false;
1846                             m_pos += 8;
1847                             const int start = m_pos;
1848                             const int len = scanUntil("]]>");
1849
1850                             if(len == -1)
1851                                 return Token(END_OF_FILE);
1852
1853                             m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
1854                             result.append(m_data.mid(start, len));
1855                             break;
1856                         }
1857                         else if(ahead == QLatin1Char('?'))
1858                         {
1859                             pushState();
1860                             return tokenAndChangeState(PI_START, ProcessingInstructionName);
1861                         }
1862                         else
1863                             return Token(G_LT);
1864                     }
1865                     case '&':
1866                     {
1867                         const QString ret(tokenizeCharacterReference());
1868                         if(ret.isNull())
1869                             return Token(ERROR);
1870                         else
1871                         {
1872                             skipEOLNormalization.insert(result.count());
1873                             result.append(ret);
1874                             mayBeWS = false;
1875                             break;
1876                         }
1877                     }
1878                     case '{':
1879                     {
1880                         // TODO remove this check, also below.
1881                         if(m_pos + 1 == m_length)
1882                             return Token(END_OF_FILE);
1883                         else if(peekAhead() == '{')
1884                         {
1885                             ++m_pos;
1886                             result.append(QLatin1Char('{'));
1887                         }
1888                         else
1889                         {
1890                             if(result.isEmpty())
1891                             {
1892                                 pushState();
1893                                 return tokenAndChangeState(CURLY_LBRACE, Default);
1894                             }
1895                             else
1896                             {
1897                                 /* We don't advance here. */
1898                                 return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1899                             }
1900                         }
1901                         break;
1902                     }
1903                     case '}':
1904                     {
1905                         if(m_pos + 1 == m_length)
1906                             return Token(END_OF_FILE);
1907                         else if(peekAhead() == '}')
1908                         {
1909                             ++m_pos;
1910                             result.append(QLatin1Char('}'));
1911                         }
1912                         else
1913                         {
1914                             /* This is a parse error, and the grammar won't be able
1915                              * to reduce this CURLY_RBRACE. */
1916                             return tokenAndChangeState(CURLY_RBRACE, Default);
1917                         }
1918                         break;
1919                     }
1920                     case '\n':
1921                     {
1922                         /* We want to translate \r\n into \n. */
1923                         if(peekAhead(-1) == '\r')
1924                             break;
1925                         /* else, fallthrough. */
1926                     }
1927                     case '\r':
1928                     {
1929                         result.append(QLatin1Char('\n'));
1930                         break;
1931                     }
1932                     default:
1933                     {
1934                         result.append(current());
1935                         break;
1936                     }
1937                 }
1938                 ++m_pos;
1939             }
1940             Q_ASSERT(false);
1941         }
1942         case ProcessingInstructionName:
1943         {
1944             const int start = m_pos;
1945
1946             while(true)
1947             {
1948                 ++m_pos;
1949                 if(m_pos >= m_length)
1950                     return Token(END_OF_FILE);
1951
1952                 const QChar next(current());
1953                 if(next.isSpace() || next == QLatin1Char('?'))
1954                 {
1955                     return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start),
1956                                                ProcessingInstructionContent);
1957                 }
1958             }
1959             Q_ASSERT(false);
1960         }
1961         case ProcessingInstructionContent:
1962         {
1963             /* Consume whitespace between the name and the content. */
1964             if(consumeRawWhitespace())
1965                 return Token(END_OF_FILE);
1966
1967             const int start = m_pos;
1968             const int len = scanUntil("?>");
1969
1970             if(len == -1)
1971                 return Token(END_OF_FILE);
1972             else
1973             {
1974                 m_pos += 2; /* Consume "?>" */
1975                 popState();
1976                 return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
1977             }
1978             Q_ASSERT(false);
1979         }
1980         case EndTag:
1981         {
1982             if(consumeRawWhitespace())
1983                 return END_OF_FILE;
1984
1985             if(peekCurrent() == '>')
1986             {
1987                 popState();
1988                 return tokenAndAdvance(G_GT);
1989             }
1990             else
1991                 return tokenizeNCNameOrQName();
1992             Q_ASSERT(false);
1993         }
1994         case XMLComment:
1995         {
1996             const int start = m_pos;
1997             const int len = scanUntil("--");
1998
1999             if(len == -1)
2000                 return END_OF_FILE;
2001             else
2002             {
2003                 m_pos += 2; /* Consume "--". */
2004                 popState();
2005
2006                 if(peekCurrent() == '>')
2007                 {
2008                     ++m_pos;
2009                     return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
2010                 }
2011                 else
2012                     return error();
2013             }
2014             Q_ASSERT(false);
2015         }
2016         case Pragma:
2017         {
2018             /* Consume whitespace. */
2019             if(consumeRawWhitespace())
2020                 return Token(END_OF_FILE);
2021
2022             setState(PragmaContent);
2023             return tokenizeNCNameOrQName();
2024         }
2025         case PragmaContent:
2026         {
2027             QString result;
2028             result.reserve(20);
2029
2030             const bool hasWS = m_pos < m_length && current().isSpace();
2031
2032             /* Consume all whitespace up to the pragma content(if any). */
2033             if(consumeRawWhitespace())
2034                 return Token(END_OF_FILE);
2035
2036             if(peekCurrent() == '#' && peekAhead() == ')')
2037             {
2038                 /* We reached the end, and there's no pragma content. */
2039                 return tokenAndChangeState(PRAGMA_END, Default, 2);
2040             }
2041             else if(!hasWS)
2042             {
2043                 /* A separating space is required if there's pragma content. */
2044                 return error(); /* i18n */
2045             }
2046
2047             const int start = m_pos;
2048             const int len = scanUntil("#)");
2049             if(len == -1)
2050                 return Token(END_OF_FILE);
2051
2052             return Token(STRING_LITERAL, m_data.mid(start, len));
2053             Q_ASSERT(false);
2054         }
2055     }
2056
2057     Q_ASSERT(false);
2058     return error();
2059 }
2060
2061 Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep,
2062                                                  int &sepStack,
2063                                                  const int startPos,
2064                                                  const bool aInLiteral,
2065                                                  QString &result)
2066 {
2067     bool inLiteral = aInLiteral;
2068     const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');
2069
2070     while(true)
2071     {
2072         if(atEnd())
2073             return END_OF_FILE;
2074
2075         if(peekCurrent() == sep.unicode())
2076         {
2077             if(inLiteral)
2078                 inLiteral = false;
2079             else
2080                 inLiteral = true;
2081
2082             if(peekAhead() == sep.unicode())
2083             {
2084                 /* The quoting mechanism was used. */
2085                 result.append(current());
2086                 m_pos += 2;
2087                 continue;
2088             }
2089             else
2090             {
2091                 /* Don't consume the separator, such that we
2092                  * return a token for it next time. */
2093                 if(m_pos == startPos)
2094                 {
2095                     ++m_pos;
2096                     setState(StartTag);
2097                     return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
2098                 }
2099
2100
2101                 if(sepStack == 0)
2102                 {
2103                     return Token(STRING_LITERAL, result);
2104                 }
2105                 else
2106                 {
2107                     result.append(current());
2108                     ++m_pos;
2109                     continue;
2110                 }
2111             }
2112         }
2113         else if(peekCurrent() == '&')
2114         {
2115             const QString ret(tokenizeCharacterReference());
2116             if(ret.isNull())
2117                 return Token(ERROR);
2118             else
2119             {
2120                 result.append(ret);
2121                 ++m_pos;
2122                 continue;
2123             }
2124         }
2125         else if(peekCurrent() == otherSep)
2126         {
2127             result.append(current());
2128             ++m_pos;
2129
2130             if(peekCurrent() == otherSep)
2131                 ++m_pos;
2132
2133             if(inLiteral)
2134                 inLiteral = false;
2135             else
2136                 inLiteral = true;
2137
2138             continue;
2139         }
2140         else if(peekCurrent() == '{')
2141         {
2142             result.append(current());
2143
2144             if(peekAhead() == '{')
2145             {
2146                 m_pos += 2;
2147                 continue;
2148             }
2149             else
2150             {
2151                 ++m_pos;
2152                 ++sepStack;
2153                 const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
2154                 if(t.type != SUCCESS)
2155                     return t;
2156             }
2157
2158         }
2159         else if(peekCurrent() == '}')
2160         {
2161             if(inLiteral && peekAhead() == '}')
2162             {
2163                 result.append(current());
2164                 m_pos += 2;
2165                 continue;
2166             }
2167             else
2168             {
2169                 ++m_pos;
2170                 --sepStack;
2171                 return Token(SUCCESS); /* The return value is arbitrary. */
2172             }
2173         }
2174         else
2175         {
2176             result.append(current());
2177             ++m_pos;
2178         }
2179     }
2180 }
2181
2182 Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator)
2183 {
2184     sourceLocator->first_line = m_line;
2185     sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */
2186
2187     if(m_tokenStack.isEmpty())
2188         return nextToken();
2189     else
2190     {
2191         const Token retval(m_tokenStack.pop());
2192
2193         switch(retval.type)
2194         {
2195             case MODULE:
2196             /* Fallthrough.*/
2197             case SCHEMA:
2198             /* Fallthrough.*/
2199             case COPY_NAMESPACES:
2200             {
2201                 setState(NamespaceKeyword);
2202                 break;
2203             }
2204             case VERSION:
2205             {
2206                 setState(XQueryVersion);
2207                 break;
2208             }
2209             case AS:
2210             /* Fallthrough. */
2211             case OF:
2212             {
2213                 setState(ItemType);
2214                 break;
2215             }
2216             default:
2217             {
2218                 if(isOperatorKeyword(retval.type))
2219                     setState(Default);
2220
2221                 break;
2222             }
2223         };
2224
2225         return retval;
2226     }
2227 }
2228
2229 int XQueryTokenizer::commenceScanOnly()
2230 {
2231     m_scanOnly = true;
2232     return m_pos;
2233 }
2234
2235 void XQueryTokenizer::resumeTokenizationFrom(const int pos)
2236 {
2237     m_scanOnly = false;
2238     m_pos = pos;
2239 }
2240
2241 void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
2242 {
2243 }
2244
2245 #undef handleWhitespace
2246
2247 } // namespace QPatternist
2248
2249 QT_END_NAMESPACE