src/xmlpatterns/parser/qxquerytokenizer.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
   4 ** Contact: http://www.qt-project.org/
   5 **
   6 ** This file is part of the QtXmlPatterns module of the Qt Toolkit.
   7 **
   8 ** $QT_BEGIN_LICENSE:LGPL$
   9 ** GNU Lesser General Public License Usage
  10 ** This file may be used under the terms of the GNU Lesser General Public
  11 ** License version 2.1 as published by the Free Software Foundation and
  12 ** appearing in the file LICENSE.LGPL included in the packaging of this
  13 ** file. Please review the following information to ensure the GNU Lesser
  14 ** General Public License version 2.1 requirements will be met:
  15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  16 **
  17 ** In addition, as a special exception, Nokia gives you certain additional
  18 ** rights. These rights are described in the Nokia Qt LGPL Exception
  19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  20 **
  21 ** GNU General Public License Usage
  22 ** Alternatively, this file may be used under the terms of the GNU General
  23 ** Public License version 3.0 as published by the Free Software Foundation
  24 ** and appearing in the file LICENSE.GPL included in the packaging of this
  25 ** file. Please review the following information to ensure the GNU General
  26 ** Public License version 3.0 requirements will be met:
  27 ** http://www.gnu.org/copyleft/gpl.html.
  28 **
  29 ** Other Usage
  30 ** Alternatively, this file may be used in accordance with the terms and
  31 ** conditions contained in a signed written agreement between you and Nokia.
  32 **
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  42 #include <QByteArray>
  43
  44 #include "qquerytransformparser_p.h"
  45
  46 #include "qxquerytokenizer_p.h"
  47
  48 #include "qtokenlookup.cpp"
  49
  50 QT_BEGIN_NAMESPACE
  51
  52 namespace QPatternist
  53 {
  54
  /*
   * Skips whitespace and comments by calling consumeWhitespace(). If that
   * call reports anything other than SUCCESS, the enclosing function returns
   * a Token carrying that type.
   *
   * The body is wrapped in do/while(0) so that the invocation together with
   * its trailing semicolon forms exactly one statement; a bare brace block
   * followed by ';' would detach a following 'else' in an unbraced if/else.
   * The local has a long name to make clashes with caller variables unlikely.
   */
  #define handleWhitespace()                                          \
      do                                                              \
      {                                                               \
          const TokenType whitespaceResult = consumeWhitespace();     \
          if(whitespaceResult != SUCCESS)                             \
              return Token(whitespaceResult);                         \
      } while(0)
  61
  62 XQueryTokenizer::XQueryTokenizer(const QString &query,
  63                                  const QUrl &location,
  64                                  const State startingState) : Tokenizer(location)
  65                                                             , m_data(query)
  66                                                             , m_length(query.length())
  67                                                             , m_state(startingState)
  68                                                             , m_pos(0)
  69                                                             , m_line(1)
  70                                                             , m_columnOffset(0)
  71                                                             , m_scanOnly(false)
  72 {
  73     Q_ASSERT(location.isValid() || location.isEmpty());
  74 }
  75
  76 const QChar XQueryTokenizer::current() const
  77 {
  78     if(m_pos < m_length)
  79         return m_data.at(m_pos);
  80     else
  81         return QChar();
  82 }
  83
  84 char XQueryTokenizer::peekCurrent() const
  85 {
  86     return current().toLatin1();
  87 }
  88
  89 int XQueryTokenizer::peekForColonColon() const
  90 {
  91     /* Note, we don't modify m_pos in this function, so we need to do offset
  92      * calculations. */
  93     int pos = m_pos;
  94
  95     while(pos < m_length)
  96     {
  97         switch(m_data.at(pos).toLatin1())
  98         {
  99             /* Fallthrough these four. */
 100             case ' ':
 101             case '\t':
 102             case '\n':
 103             case '\r':
 104                 break;
 105             case ':':
 106             {
 107                 if(peekAhead((pos - m_pos) + 1) == ':')
 108                     return pos - m_pos;
 109                 /* Fallthrough. */
 110             }
 111             default:
 112                 return -1;
 113         }
 114         ++pos;
 115     }
 116
 117     return -1;
 118 }
 119
 120 Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
 121                                                       const State s,
 122                                                       const int advance)
 123 {
 124     Q_ASSERT(advance >= 0);
 125     m_pos += advance;
 126     setState(s);
 127     return Token(code);
 128 }
 129
 130 Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
 131                                                       const QString &value,
 132                                                       const State s)
 133 {
 134     setState(s);
 135     return Token(code, value);
 136 }
 137
 138 Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code,
 139                                                   const int advance)
 140 {
 141     Q_ASSERT(advance >= 0);
 142     m_pos += advance;
 143     return Token(code);
 144 }
 145
 146 QString XQueryTokenizer::normalizeEOL(const QString &input,
 147                                       const CharacterSkips &characterSkips)
 148 {
 149     const int len = input.count();
 150     QString result;
 151
 152     /* The likely hood is rather high it'll be the same content. */
 153     result.reserve(len);
 154
 155     for(int i = 0; i < len; ++i)
 156     {
 157         const QChar &at = input.at(i);
 158
 159         if(characterSkips.contains(i))
 160         {
 161             result.append(at);
 162             continue;
 163         }
 164         switch(input.at(i).unicode())
 165         {
 166             case '\r':
 167             {
 168                 if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
 169                     ++i;
 170
 171                 /* Else, fallthrough. */
 172             }
 173             case '\n':
 174             {
 175                 result.append(QLatin1Char('\n'));
 176                 continue;
 177             }
 178             default:
 179             {
 180                 result.append(at);
 181             }
 182         }
 183     }
 184
 185     return result;
 186 }
 187
 188 Tokenizer::TokenType XQueryTokenizer::consumeComment()
 189 {
 190     /* Below, we return ERROR instead of END_OF_FILE such that the parser
 191      * sees an invalid comment. */
 192     while(m_pos < m_length)
 193     {
 194         switch(peekCurrent())
 195         {
 196             case ':':
 197             {
 198                 ++m_pos; /* Consume ':' */
 199                 if(atEnd())
 200                     return ERROR;
 201
 202                 if(peekCurrent() == ')')
 203                 {
 204                     ++m_pos; /* Consume ')' */
 205                     return SUCCESS; /* The comment closed nicely. */
 206                 }
 207                 continue; /* We don't want to increment m_pos twice. */
 208             }
 209             case '(':
 210             { /* It looks like the start of a comment. */
 211                 ++m_pos;
 212
 213                 if(atEnd())
 214                     return END_OF_FILE;
 215                 else if(peekCurrent() == ':')
 216                 {
 217                     /* And it is a nested comment -- parse it. */
 218                     const TokenType retval = consumeComment();
 219                     if(retval == SUCCESS)
 220                         continue; /* Continue with our "own" comment. */
 221                     else
 222                         return retval; /* Return the error in the nested comment. */
 223                 }
 224                 break;
 225             }
 226             case '\n':
 227             /* Fallthrough. */
 228             case '\r':
 229             {
 230                 /* We want to count \r\n as a single line break. */
 231                 if(peekAhead() == '\n')
 232                     ++m_pos;
 233
 234                 m_columnOffset = m_pos;
 235                 ++m_line;
 236
 237                 break;
 238             }
 239         }
 240         ++m_pos;
 241     }
 242
 243     return ERROR; /* Error: we reached the end while inside a comment. */
 244 }
 245
 246 bool XQueryTokenizer::consumeRawWhitespace()
 247 {
 248     while(m_pos < m_length)
 249     {
 250         switch(peekCurrent())
 251         {
 252             case ' ':
 253             case '\t':
 254                 break;
 255             case '\n':
 256             case '\r':
 257             {
 258                 if(peekAhead() == '\n')
 259                     ++m_pos;
 260
 261                 m_columnOffset = m_pos;
 262                 ++m_line;
 263
 264                 break;
 265             }
 266             default:
 267                 return false;
 268         }
 269         ++m_pos;
 270     }
 271     return true;
 272 }
 273
 274 Tokenizer::TokenType XQueryTokenizer::consumeWhitespace()
 275 {
 276     while(m_pos < m_length)
 277     {
 278         switch(peekCurrent())
 279         {
 280             case ' ':
 281             case '\t':
 282                 break;
 283             case '\n':
 284             case '\r':
 285             {
 286                 /* We want to count \r\n as a single line break. */
 287                 if(peekAhead() == '\n')
 288                     ++m_pos;
 289
 290                 m_columnOffset = m_pos;
 291                 ++m_line;
 292
 293                 break;
 294             }
 295             case '(':
 296             {
 297                 if(peekAhead() == ':')
 298                 {
 299                     m_pos += 2; /* Consume "(:" */
 300
 301                     const TokenType comment = consumeComment();
 302                     if(comment == SUCCESS)
 303                         continue;
 304                     else
 305                         return comment;
 306                 }
 307             }
 308             default:
 309                 return SUCCESS;
 310         }
 311         ++m_pos;
 312     }
 313
 314     return END_OF_FILE;
 315 }
 316
 317 char XQueryTokenizer::peekAhead(const int length) const
 318 {
 319     if(m_pos + length < m_length)
 320         return m_data.at(m_pos + length).toLatin1();
 321     else
 322         return 0;
 323 }
 324
 325 Tokenizer::Token XQueryTokenizer::error()
 326 {
 327     return Token(ERROR);
 328 }
 329
 330 bool XQueryTokenizer::isDigit(const char ch)
 331 {
 332     return ch >= '0' && ch <= '9';
 333 }
 334
 335 /* Replace with function in QXmlUtils. Write test cases for this. */
 336 bool XQueryTokenizer::isNCNameStart(const QChar ch)
 337 {
 338     if(ch == QLatin1Char('_'))
 339         return true;
 340
 341     switch(ch.category())
 342     {
 343         case QChar::Letter_Lowercase:
 344         case QChar::Letter_Uppercase:
 345         case QChar::Letter_Other:
 346         case QChar::Letter_Titlecase:
 347         case QChar::Number_Letter:
 348             return true;
 349         default:
 350             return false;
 351     }
 352 }
 353
 354 bool XQueryTokenizer::isNCNameBody(const QChar ch)
 355 {
 356     switch(ch.unicode())
 357     {
 358         case '.':
 359         case '_':
 360         case '-':
 361             return true;
 362     }
 363
 364     switch(ch.category())
 365     {
 366         case QChar::Letter_Lowercase:
 367         case QChar::Letter_Uppercase:
 368         case QChar::Letter_Other:
 369         case QChar::Letter_Titlecase:
 370         case QChar::Number_Letter:
 371         case QChar::Mark_SpacingCombining:
 372         case QChar::Mark_Enclosing:
 373         case QChar::Mark_NonSpacing:
 374         case QChar::Letter_Modifier:
 375         case QChar::Number_DecimalDigit:
 376             return true;
 377         default:
 378             return false;
 379     }
 380 }
 381
 382 bool XQueryTokenizer::isPhraseKeyword(const TokenType code)
 383 {
 384     switch(code)
 385     {
 386         /* Fallthrough all these. */
 387         case CASTABLE:
 388         case CAST:
 389         case COPY_NAMESPACES:
 390         case DECLARE:
 391         case EMPTY:
 392         case MODULE:
 393         case IMPORT:
 394         case INSTANCE:
 395         case ORDER:
 396         case ORDERING:
 397         case XQUERY:
 398         case STABLE:
 399         case TREAT:
 400             return true;
 401         default:
 402             return false;
 403     }
 404 }
 405
 406 bool XQueryTokenizer::isOperatorKeyword(const TokenType code)
 407 {
 408     switch(code)
 409     {
 410         /* Fallthrough all these. */
 411         case AS:
 412         case ASCENDING:
 413         case AT:
 414         case CASE:
 415         case CAST:
 416         case CASTABLE:
 417         case EQ:
 418         case EXTERNAL:
 419         case GE:
 420         case G_EQ:
 421         case G_GT:
 422         case G_LT:
 423         case G_NE:
 424         case GT:
 425         case IN:
 426         case INHERIT:
 427         case INSTANCE:
 428         case IS:
 429         case ITEM:
 430         case LE:
 431         case LT:
 432         case NE:
 433         case NO_INHERIT:
 434         case NO_PRESERVE:
 435         case OF:
 436         case PRESERVE:
 437         case RETURN:
 438         case STABLE:
 439         case TO:
 440         case TREAT:
 441             return true;
 442         default:
 443             return false;
 444     };
 445 }
 446
 447 bool XQueryTokenizer::isTypeToken(const TokenType t)
 448 {
 449     switch(t)
 450     {
 451         /* Fallthrough all these. */
 452         case ATTRIBUTE:
 453         case COMMENT:
 454         case DOCUMENT:
 455         case DOCUMENT_NODE:
 456         case ELEMENT:
 457         case ITEM:
 458         case NODE:
 459         case PROCESSING_INSTRUCTION:
 460         case SCHEMA_ATTRIBUTE:
 461         case SCHEMA_ELEMENT:
 462         case TEXT:
 463             return true;
 464         default:
 465             return false;
 466     }
 467 }
 468
 469 Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName()
 470 {
 471     const int start = m_pos;
 472
 473     const Token t1 = tokenizeNCName();
 474     if(t1.hasError())
 475         return t1;
 476
 477     if(peekCurrent() != ':' || peekAhead() == '=')
 478         return t1;
 479
 480     ++m_pos;
 481
 482     const Token t2 = tokenizeNCName();
 483     if(t2.hasError())
 484         return t2;
 485     else
 486         return Token(QNAME, m_data.mid(start, m_pos - start));
 487 }
 488
 489 Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral()
 490 {
 491     setState(Operator);
 492     const int startPos = m_pos;
 493     bool hasDot = false;
 494     bool isXPath20 = false;
 495
 496     for(; m_pos < m_length; ++m_pos)
 497     {
 498         QChar ch(current());
 499
 500         char cell = ch.cell();
 501
 502         if(cell == 'e' || cell == 'E')
 503         {
 504             isXPath20 = true;
 505             ++m_pos;
 506             ch = current();
 507
 508             if(ch.row() != 0)
 509                 break;
 510
 511             cell = ch.cell();
 512
 513             if(cell == '+' || cell == '-')
 514                 continue;
 515         }
 516
 517         if(isNCNameStart(ch))
 518             return error();
 519
 520         if(cell < '0' || cell > '9')
 521         {
 522             if(cell == '.' && !hasDot)
 523                 hasDot = true;
 524             else
 525                 break;
 526         }
 527     }
 528
 529     return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
 530 }
 531
 532 QString XQueryTokenizer::tokenizeCharacterReference()
 533 {
 534     Q_ASSERT(peekCurrent() == '&');
 535
 536     const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
 537
 538     if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
 539         return QString();
 540
 541     QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
 542     m_pos = theEnd;
 543
 544     const QChar charRef(charForReference(content));
 545
 546     if(!charRef.isNull())
 547         return charRef;
 548     else if(content.startsWith(QLatin1Char('#')))
 549     {
 550         int base;
 551
 552         /* It is only '#' or '#x'. */
 553         if(content.length() < 2)
 554             return QString();
 555
 556         /* We got a hex number if it starts with 'x', otherwise it's a decimal. */
 557         if(content.at(1) == QLatin1Char('x'))
 558         {
 559             base = 16;
 560             content = content.mid(2); /* Remove "#x". */
 561         }
 562         else
 563         {
 564             base = 10;
 565             content = content.mid(1); /* Remove "#". */
 566         }
 567
 568         bool conversionOK = false;
 569         const int codepoint = content.toInt(&conversionOK, base);
 570
 571         if(conversionOK)
 572         {
 573             const QChar ch(codepoint);
 574
 575             if(ch.isNull())
 576             {
 577                 /* We likely have something which require surrogate pairs. */
 578                 QString result;
 579                 result += QChar(QChar::highSurrogate(codepoint));
 580                 result += QChar(QChar::lowSurrogate(codepoint));
 581                 return result;
 582             }
 583             else
 584                 return ch;
 585         }
 586         else
 587             return QString();
 588     }
 589     else
 590         return QString();
 591 }
 592
 593 int XQueryTokenizer::scanUntil(const char *const content)
 594 {
 595     const int end = m_data.indexOf(QString::fromLatin1(content), m_pos);
 596
 597     if(end == -1)
 598         return -1;
 599     else
 600     {
 601         const int len = end - m_pos;
 602         m_pos += len;
 603         return len;
 604     }
 605 }
 606
 607 QChar XQueryTokenizer::charForReference(const QString &reference)
 608 {
 609     if(m_charRefs.isEmpty())
 610     {
 611         /* Initialize. */
 612         m_charRefs.reserve(5);
 613         m_charRefs.insert(QLatin1String("lt"),     QLatin1Char('<'));
 614         m_charRefs.insert(QLatin1String("gt"),     QLatin1Char('>'));
 615         m_charRefs.insert(QLatin1String("amp"),    QLatin1Char('&'));
 616         m_charRefs.insert(QLatin1String("quot"),   QLatin1Char('"'));
 617         m_charRefs.insert(QLatin1String("apos"),   QLatin1Char('\''));
 618     }
 619
 620     return m_charRefs.value(reference);
 621 }
 622
 623 Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral()
 624 {
 625     const QChar delimiter(current());
 626     /* We cannot unfortunately just scan and then do mid(),
 627      * since we can encounter character references. */
 628     QString result;
 629
 630     /* This is more likely than QString's default allocation. */
 631     result.reserve(8);
 632
 633     CharacterSkips skipEOLNormalization;
 634
 635     /* Advance over the initial quote character. */
 636     ++m_pos;
 637
 638     for(; m_pos < m_length; ++m_pos)
 639     {
 640         const QChar c(current());
 641
 642         if(c == QLatin1Char('&'))
 643         {
 644             const QString charRef(tokenizeCharacterReference());
 645
 646             if(charRef.isNull())
 647                 return error();
 648             else
 649             {
 650                 skipEOLNormalization.insert(result.count());
 651                 result.append(charRef);
 652             }
 653
 654         }
 655         else if(c == delimiter)
 656         {
 657             /* Maybe the escaping mechanism is used. For instance, "s""s"
 658              * has the value `s"s'. */
 659             ++m_pos;
 660
 661             if(current() == delimiter) /* Double quote. */
 662                 result += delimiter;
 663             else
 664                 return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
 665         }
 666         else
 667             result += c;
 668     }
 669
 670     return error();
 671 }
 672
 673 Tokenizer::Token XQueryTokenizer::tokenizeNCName()
 674 {
 675     const int startPos = m_pos;
 676
 677     if(m_pos < m_length && isNCNameStart(current()))
 678     {
 679         ++m_pos;
 680
 681         for(; m_pos < m_length; ++m_pos)
 682         {
 683             if(!isNCNameBody(current()))
 684                 break;
 685         }
 686
 687         return Token(NCNAME, m_data.mid(startPos, m_pos - startPos));
 688     }
 689     else
 690         return error();
 691 }
 692
 693 bool XQueryTokenizer::aheadEquals(const char *const chs,
 694                                   const int len,
 695                                   const int offset) const
 696 {
 697     Q_ASSERT(len > 0);
 698     Q_ASSERT(qstrlen(chs) == uint(len));
 699
 700     if(m_pos + len >= m_length)
 701         return false;
 702
 703     for(int i = offset; i < (len + offset); ++i)
 704     {
 705         if(m_data.at(m_pos + i).toLatin1() != chs[i - offset])
 706             return false;
 707     }
 708
 709     return true;
 710 }
 711
 712 const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword)
 713 {
 714     return TokenLookup::value(keyword.toLatin1().constData(), keyword.length());
 715 }
 716
 717 XQueryTokenizer::State XQueryTokenizer::state() const
 718 {
 719     return m_state;
 720 }
 721
 722 void XQueryTokenizer::setState(const State s)
 723 {
 724     m_state = s;
 725 }
 726
 727 void XQueryTokenizer::pushState(const State s)
 728 {
 729     m_stateStack.push(s);
 730 }
 731
 732 void XQueryTokenizer::pushState()
 733 {
 734     m_stateStack.push(m_state);
 735 }
 736
 737 void XQueryTokenizer::popState()
 738 {
 739     /* QStack::pop() asserts if it's empty, so we need to check
 740      * it, since we might receive unbalanced curlies. */
 741     if(!m_stateStack.isEmpty())
 742         m_state = m_stateStack.pop();
 743 }
 744
 745 Tokenizer::Token XQueryTokenizer::nextToken()
 746 {
 747     switch(state())
 748     {
 749         /* We want to skip or do special whitespace handling for these
 750          * states. So fallthrough all of the following. */
 751         case AposAttributeContent:
 752         case Axis:
 753         case ElementContent:
 754         case EndTag:
 755         case Pragma:
 756         case PragmaContent:
 757         case ProcessingInstructionName:
 758         case QuotAttributeContent:
 759         case StartTag:
 760         case XMLComment:
 761             break;
 762         default:
 763             handleWhitespace();
 764     }
 765
 766     switch(state())
 767     {
 768         case XMLSpaceDecl:
 769         /* Fallthrough. */
 770         case NamespaceKeyword:
 771         {
 772             switch(peekCurrent())
 773             {
 774                 case ',':
 775                     return tokenAndAdvance(COMMA);
 776                 case '"':
 777                 /* Fallthrough. */
 778                 case '\'':
 779                 {
 780                     setState(NamespaceDecl);
 781                     return tokenizeStringLiteral();
 782                 }
 783             }
 784
 785             const Token id(tokenizeNCName());
 786
 787             if(id.type != NCNAME)
 788                 return id;
 789
 790             const TokenMap *const keyword = lookupKeyword(id.value);
 791             if(keyword)
 792             {
 793                 switch(keyword->token)
 794                 {
 795                     case INHERIT:
 796                     /* Fallthrough. */
 797                     case NO_INHERIT:
 798                     {
 799                         setState(Default);
 800                         break;
 801                     }
 802                     case NAMESPACE:
 803                     {
 804                         setState(NamespaceDecl);
 805                         break;
 806                     }
 807                     case ORDERED:
 808                     /* Fallthrough. */
 809                     case UNORDERED:
 810                     /* Fallthrough. */
 811                     case STRIP:
 812                     {
 813                         setState(Default);
 814                         break;
 815                     }
 816                     case PRESERVE:
 817                     {
 818                         if(state() != NamespaceKeyword)
 819                             setState(Default);
 820                     }
 821                     default:
 822                         break;
 823                 }
 824
 825                 return Token(keyword->token);
 826             }
 827             else
 828                 return id;
 829
 830             Q_ASSERT(false);
 831         }
 832         case NamespaceDecl:
 833         {
 834             switch(peekCurrent())
 835             {
 836                 case '=':
 837                     return tokenAndAdvance(G_EQ);
 838                 case ';':
 839                     return tokenAndChangeState(SEMI_COLON, Default);
 840                 case '\'':
 841                 /* Fallthrough. */
 842                 case '\"':
 843                     return tokenizeStringLiteral();
 844             }
 845
 846             const Token nc(tokenizeNCName());
 847
 848             handleWhitespace();
 849
 850             const char pc = peekCurrent();
 851             const TokenMap* const t = lookupKeyword(nc.value);
 852
 853             if(pc == '\'' || (pc == '"' && t))
 854                 return tokenAndChangeState(t->token, Default, 0);
 855             else
 856                 return nc;
 857
 858             Q_ASSERT(false);
 859         }
 860         case Axis:
 861         {
 862             if(peekCurrent() == ':')
 863             {
 864                 Q_ASSERT(peekAhead() == ':');
 865                 m_pos += 2;
 866                 setState(AfterAxisSeparator);
 867                 return Token(COLONCOLON);
 868             }
 869             /* Fallthrough. */
 870         }
 871         case AfterAxisSeparator:
 872         /* Fallthrough. */
 873         case Default:
 874            /* State Operator and state Default have a lot of tokens in common except
 875             * for minor differences. So we treat them the same way, and sprinkles logic
 876             * here and there to handle the small differences. */
 877         /* Fallthrough. */
 878         case Operator:
 879         {
 880             switch(peekCurrent())
 881             {
 882                 case '=':
 883                     return tokenAndChangeState(G_EQ, Default);
 884                 case '-':
 885                     return tokenAndChangeState(MINUS, Default);
 886                 case '+':
 887                     return tokenAndChangeState(PLUS, Default);
 888                 case '[':
 889                     return tokenAndChangeState(LBRACKET, Default);
 890                 case ']':
 891                     return tokenAndChangeState(RBRACKET, Operator);
 892                 case ',':
 893                     return tokenAndChangeState(COMMA, Default);
 894                 case ';':
 895                     return tokenAndChangeState(SEMI_COLON, Default);
 896                 case '$':
 897                     return tokenAndChangeState(DOLLAR, VarName);
 898                 case '|':
 899                     return tokenAndChangeState(BAR, Default);
 900                 case '?':
 901                     return tokenAndChangeState(QUESTION, Operator);
 902                 case ')':
 903                     return tokenAndChangeState(RPAREN, Operator);
 904                 case '@':
 905                     return tokenAndChangeState(AT_SIGN, Default);
 906                 /* Fallthrough all these. */
 907                 case '1':
 908                 case '2':
 909                 case '3':
 910                 case '4':
 911                 case '5':
 912                 case '6':
 913                 case '7':
 914                 case '8':
 915                 case '9':
 916                 case '0':
 917                     return tokenizeNumberLiteral();
 918                 case '.':
 919                 {
 920                     const char next = peekAhead();
 921                     if(next == '.')
 922                         return tokenAndChangeState(DOTDOT, Operator, 2);
 923                     /* .5 is allowed, as short form for 0.5:
 924                      * <tt>[142]     DecimalLiteral     ::=     ("." Digits) | (Digits "." [0-9]*)</tt>
 925                      */
 926                     else if(isDigit(next))
 927                         return tokenizeNumberLiteral();
 928                     else
 929                         return tokenAndChangeState(DOT, Operator);
 930                 }
 931                 case '\'':
 932                 /* Fallthrough. */
 933                 case '"':
 934                 {
 935                     setState(Operator);
 936                     return tokenizeStringLiteral();
 937
 938                 }
 939                 case '(':
 940                 {
 941                     if(peekAhead() == '#')
 942                         return tokenAndChangeState(PRAGMA_START, Pragma, 2);
 943                     else
 944                         return tokenAndChangeState(LPAREN, Default);
 945                 }
 946                 case '*':
 947                 {
 948                     if(peekAhead() == ':')
 949                     {
 950                         m_pos += 2; /* Consume *:. */
 951                         const Token nc = tokenizeNCName();
 952
 953                         if(nc.hasError())
 954                             return error();
 955                         else
 956                             return tokenAndChangeState(ANY_PREFIX, nc.value, Operator);
 957                     }
 958                     else
 959                         return tokenAndChangeState(STAR, state() == Default ? Operator : Default);
 960                 }
 961                 case ':':
 962                 {
 963                     switch(peekAhead())
 964                     {
 965                         case '=':
 966                             return tokenAndChangeState(ASSIGN, Default, 2);
 967                         case ':':
 968                             return tokenAndChangeState(COLONCOLON, Default, 2);
 969                         default:
 970                             return error();
 971                     }
 972                 }
 973                 case '!':
 974                 {
 975                     if(peekAhead() == '=')
 976                         return tokenAndChangeState(G_NE, Default, 2);
 977                     else
 978                         return error();
 979                 }
 980                 case '<':
 981                 {
 982                     switch(peekAhead())
 983                     {
 984                         case '=':
 985                             return tokenAndChangeState(G_LE, Default, 2);
 986                         case '<':
 987                             return tokenAndChangeState(PRECEDES, Default, 2);
 988                         case '?':
 989                         {
 990                             pushState(Operator);
 991                             return tokenAndChangeState(PI_START, ProcessingInstructionName, 2);
 992                         }
 993                         case '!':
 994                         {
 995                             if(aheadEquals("!--", 3))
 996                             {
 997                                 m_pos += 3; /* Consume "!--". */
 998                                 pushState(Operator);
 999                                 return tokenAndChangeState(COMMENT_START, XMLComment);
1000                             }
1001                             /* Fallthrough. It's a syntax error, and this is a good way to report it. */
1002                         }
1003                         default:
1004                         {
1005                             if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1)))
1006                             {
1007                                 /* We assume it's an element constructor. */
1008                                 pushState(Operator);
1009                             }
1010
1011                             return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag);
1012                         }
1013                     }
1014                 }
1015                 case '>':
1016                 {
1017                     switch(peekAhead())
1018                     {
1019                         case '=':
1020                             return tokenAndChangeState(G_GE, Default, 2);
1021                         case '>':
1022                             return tokenAndChangeState(FOLLOWS, Default, 2);
1023                         default:
1024                             return tokenAndChangeState(G_GT, Default);
1025                     }
1026                 }
1027                 case '/':
1028                 {
1029                     if(peekAhead() == '/')
1030                         return tokenAndChangeState(SLASHSLASH, Default, 2);
1031                     else
1032                         return tokenAndChangeState(SLASH, Default);
1033                 }
1034                 case '{':
1035                 {
1036                     pushState(Operator);
1037                     return tokenAndChangeState(CURLY_LBRACE, Default);
1038                 }
1039                 case '}':
1040                 {
1041                     popState();
1042
1043                     return tokenAndAdvance(CURLY_RBRACE);
1044                 }
1045             }
1046
1047             /* Ok. We're in state Default or Operator, and it wasn't a simple
1048              * character. */
1049
1050             const Token id(tokenizeNCName());
1051
1052             if(id.type != NCNAME)
1053                 return id;
1054
1055             const TokenMap *const keyword = lookupKeyword(id.value);
1056
1057             if(state() == Operator)
1058             {
1059                 if(keyword)
1060                 {
1061                     if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING)
1062                         setState(Operator);
1063                     else if(keyword->token == RETURN)
1064                         setState(Default);
1065                     else if(isPhraseKeyword(keyword->token))
1066                     {
1067                         const TokenType ws = consumeWhitespace();
1068                         if(ws == ERROR)
1069                             return error();
1070
1071                         const Token id2(tokenizeNCName());
1072                         const TokenMap *const keyword2 = lookupKeyword(id2.value);
1073
1074                         if(keyword2)
1075                         {
1076                             if(keyword->token == TREAT && keyword2->token == AS)
1077                                 setState(ItemType);
1078                             else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY)
1079                                 setState(Default);
1080
1081                             m_tokenStack.push(Token(keyword2->token));
1082                         }
1083                         else
1084                             m_tokenStack.push(id2);
1085
1086                         return Token(keyword->token);
1087                     }
1088                     else
1089                     {
1090                         /* Such that we tokenize the second token in "empty greatest". */
1091                         if(keyword->token != EMPTY)
1092                             setState(Default);
1093                     }
1094
1095                     if(keyword->token == AS || keyword->token == CASE)
1096                         setState(ItemType);
1097
1098                     return Token(keyword->token);
1099                 }
1100                 else
1101                     return id;
1102             }
1103
1104             Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator);
1105
1106             /*
1107              * This is hard. Consider this:
1108              *
1109              * Valid:           child       ::nameTest
1110              * Valid:           child::     nameTest
1111              * Syntax Error:    child       :localName
1112              * Syntax Error:    child:      localName
1113              *
1114              * Consider "child ::name". Right now, we're here:
1115              *                ^
1116              * We don't know whether "child" is a prefix and hence the whitespace is invalid,
1117              * or whether it's an axis and hence skippable. */
1118             {
1119                 const int wsLength = peekForColonColon();
1120                 /* We cannot call handleWhitespace() because it returns on
1121                  * END_OF_FILE, and we have parsed up keyword, and we need to
1122                  * deal with that.
1123                  *
1124                  * If we have a colon colon, which means the whitespace is
1125                  * allowed, we skip it. */
1126                 if(wsLength != -1)
1127                     m_pos += wsLength;
1128             }
1129
1130             /* Handle name tests. */
1131             if(peekCurrent() == ':')
1132             {
1133                 switch(peekAhead())
1134                 {
1135                     case '=':
1136                         return id;
1137                     case '*':
1138                     {
1139                         m_pos += 2;
1140                         return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator);
1141                     }
1142                     case ':':
1143                     {
1144                         /* We have an axis. */
1145                         setState(Axis);
1146                         return keyword ? Token(keyword->token) : id;
1147                     }
1148                     default:
1149                     {
1150                         /* It's a QName. */
1151                         ++m_pos; /* Consume the colon. */
1152
1153                         const Token id2(tokenizeNCName());
1154
1155                         if(id2.type != NCNAME)
1156                         {
1157                             --m_pos;
1158                             return id;
1159                         }
1160
1161                         setState(Operator);
1162                         const int qNameLen = id.value.length() + id2.value.length() + 1;
1163                         return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen));
1164                     }
1165                 }
1166             }
1167
1168             if(!keyword || isOperatorKeyword(keyword->token))
1169             {
1170                 setState(Operator);
1171                 return id;
1172             }
1173
1174             const TokenType ws = consumeWhitespace();
1175             if(ws == ERROR) // TODO this should test for success. Write test.
1176                 return Token(ERROR);
1177
1178             if(atEnd())
1179             {
1180                 setState(Operator);
1181                 return id;
1182             }
1183
1184             /* Let the if-body apply for constructors, and node type tests. */
1185             if(isTypeToken(keyword->token) ||
1186                keyword->token == TYPESWITCH ||
1187                keyword->token == ORDERED ||
1188                keyword->token == UNORDERED ||
1189                keyword->token == IF)
1190             {
1191                 switch(peekCurrent())
1192                 {
1193                     case '(':
1194                     {
1195                         // TODO See if we can remove DOCUMENT from isTypeToken.
1196                         if(isTypeToken(keyword->token) && keyword->token != DOCUMENT)
1197                         {
1198                             m_tokenStack.push(Token(LPAREN));
1199                             ++m_pos; /* Consume '('. */
1200                             pushState(Operator);
1201
1202                             if(keyword->token == PROCESSING_INSTRUCTION)
1203                                 setState(KindTestForPI);
1204                             else
1205                                 setState(KindTest);
1206
1207                             return Token(keyword->token);
1208                         }
1209                         else if(keyword->token == TYPESWITCH || keyword->token == IF)
1210                             return Token(keyword->token);
1211                         else /* It's a function call. */
1212                             return id;
1213                     }
1214                     case '{':
1215                     {
1216                         m_tokenStack.push(Token(CURLY_LBRACE));
1217                         ++m_pos; /* Consume '{'. */
1218                         pushState(Operator);
1219                         /* Stay in state Default. */
1220                         return Token(keyword->token);
1221                     }
1222                     default:
1223                     {
1224                         /* We have read in a token which is for instance
1225                          * "return", and now it can be an element
1226                          * test("element") a node kind test("element()"), or a
1227                          * computed element constructor("element name {...").
1228                          * We need to do a two-token lookahead here, because
1229                          * "element return" can be an element test followed by
1230                          * the return keyword, but it can also be an element
1231                          * constructor("element return {"). */
1232                         if(isNCNameStart(current()))
1233                         {
1234                             const int currentPos = m_pos;
1235                             const Token token2 = tokenizeNCNameOrQName();
1236
1237                             if(token2.hasError())
1238                                 return token2;
1239
1240                             handleWhitespace();
1241
1242                             if(peekCurrent() == '{')
1243                             {
1244                                 /* An element constructor. */
1245                                 m_tokenStack.push(token2);
1246                                 return Token(keyword->token);
1247                             }
1248
1249                             /* We jump back in the stream, we need to tokenize token2 according
1250                              * to the state. */
1251                             m_pos = currentPos;
1252                             setState(Operator);
1253                             return Token(NCNAME, QLatin1String(keyword->name));
1254                         }
1255                     }
1256                 }
1257             }
1258
1259             if(peekCurrent() == '$')
1260             {
1261                 setState(VarName);
1262                 return Token(keyword->token);
1263             }
1264
1265             /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
1266             if(peekCurrent() == '(')
1267                 return id;
1268             else if(peekCurrent() == '{' && keyword->token == VALIDATE)
1269                 return Token(keyword->token);
1270
1271             if(!isNCNameStart(current()))
1272             {
1273                 setState(Operator);
1274                 return id;
1275             }
1276
1277             const Token id2(tokenizeNCName());
1278             const TokenMap *const keyword2 = lookupKeyword(id2.value);
1279
1280             if(!keyword2)
1281             {
1282                 /* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */
1283                 setState(Operator);
1284                 return id;
1285             }
1286
1287             switch(keyword->token)
1288             {
1289                 case DECLARE:
1290                 {
1291                     switch(keyword2->token)
1292                     {
1293                         case VARIABLE:
1294                         /* Fallthrough. */
1295                         case FUNCTION:
1296                         {
1297                             m_tokenStack.push(Token(keyword2->token));
1298                             setState(Default);
1299                             return Token(keyword->token);
1300                         }
1301                         case OPTION:
1302                         {
1303                             m_tokenStack.push(Token(keyword2->token));
1304                             setState(Default);
1305                             return Token(keyword->token);
1306                         }
1307                         case COPY_NAMESPACES:
1308                         /* Fallthrough. */
1309                         case ORDERING:
1310                         {
1311                             m_tokenStack.push(Token(keyword2->token));
1312                             setState(NamespaceKeyword);
1313                             return Token(keyword->token);
1314                         }
1315                         case CONSTRUCTION:
1316                         {
1317                             // TODO identical to CONSTRUCTION?
1318                             m_tokenStack.push(Token(keyword2->token));
1319                             setState(Operator);
1320                             return Token(keyword->token);
1321                         }
1322                         case NAMESPACE:
1323                         /* Fallthrough. */
1324                         case BASEURI:
1325                         {
1326                             m_tokenStack.push(Token(keyword2->token));
1327                             setState(NamespaceDecl);
1328                             return Token(keyword->token);
1329                         }
1330                         case BOUNDARY_SPACE:
1331                         {
1332                             m_tokenStack.push(Token(keyword2->token));
1333                             setState(XMLSpaceDecl);
1334                             return Token(keyword->token);
1335                         }
1336                         case DEFAULT:
1337                         {
1338                             m_tokenStack.push(Token(keyword2->token));
1339
1340                             const TokenType ws2 = consumeWhitespace();
1341                             if(ws2 != SUCCESS)
1342                             {
1343                                 m_tokenStack.prepend(Token(ws2));
1344                                 return Token(keyword->token);
1345                             }
1346
1347                             const Token id3(tokenizeNCName());
1348
1349                             if(id3.type != NCNAME)
1350                             {
1351                                 m_tokenStack.prepend(id3);
1352                                 return Token(keyword->token);
1353                             }
1354
1355                             const TokenMap *const keyword3 = lookupKeyword(id3.value);
1356                             if(!keyword3)
1357                             {
1358                                 m_tokenStack.prepend(id3);
1359                                 return Token(keyword->token);
1360                             }
1361                             else
1362                             {
1363                                 m_tokenStack.prepend(Token(keyword3->token));
1364
1365                                 if(keyword3->token == ORDER)
1366                                     setState(Operator);
1367                                 else
1368                                     setState(NamespaceDecl);
1369                             }
1370
1371                             return Token(keyword->token);
1372                         }
1373                         default:
1374                         {
1375                             m_tokenStack.push(Token(keyword2->token));
1376                             setState(Default);
1377                             return id;
1378                         }
1379                     }
1380                 }
1381                 case XQUERY:
1382                 {
1383                     m_tokenStack.push(Token(keyword2->token));
1384
1385                     if(keyword2->token == VERSION)
1386                     {
1387                         setState(NamespaceDecl);
1388                         return Token(keyword->token);
1389                     }
1390                     else
1391                     {
1392                         setState(Operator);
1393                         return id;
1394                     }
1395                 }
1396                 case IMPORT:
1397                 {
1398                     m_tokenStack.push(Token(keyword2->token));
1399
1400                     switch(keyword2->token)
1401                     {
1402                         case SCHEMA:
1403                         /* Fallthrough. */
1404                         case MODULE:
1405                         {
1406                             setState(NamespaceKeyword);
1407                             return Token(keyword->token);
1408                         }
1409                         default:
1410                         {
1411                             setState(Operator);
1412                             return id;
1413                         }
1414                     }
1415                 }
1416                 case VALIDATE:
1417                 {
1418                     m_tokenStack.push(Token(keyword2->token));
1419
1420                     switch(keyword2->token)
1421                     {
1422                         case LAX:
1423                         case STRICT:
1424                         {
1425                             pushState(Operator);
1426                             return Token(keyword->token);
1427                         }
1428                         default:
1429                         {
1430                             setState(Operator);
1431                             return id;
1432                         }
1433                     }
1434                 }
1435                 default:
1436                 {
1437                     m_tokenStack.push(Token(keyword2->token));
1438                     setState(Operator);
1439                     return id;
1440                 }
1441             }
1442
1443             Q_ASSERT(false);
1444
1445         }
1446         case VarName:
1447         {
1448             if(peekCurrent() == '$')
1449                 return tokenAndAdvance(DOLLAR);
1450
1451             setState(Operator);
1452             return tokenizeNCNameOrQName();
1453             Q_ASSERT(false);
1454         }
1455         case ItemType:
1456         {
1457             switch(peekCurrent())
1458             {
1459                 case '(':
1460                     return tokenAndChangeState(LPAREN, KindTest);
1461                 case '$':
1462                     return tokenAndChangeState(DOLLAR, VarName);
1463             }
1464
1465             const Token name(tokenizeNCNameOrQName());
1466
1467             if(name.hasError())
1468                 return error();
1469
1470             else if(name.type == QNAME)
1471             {
1472                 setState(OccurrenceIndicator);
1473                 return name;
1474             }
1475             else
1476             {
1477                 const TokenMap *const keyword = lookupKeyword(name.value);
1478
1479                 if(keyword)
1480                 {
1481                     pushState(OccurrenceIndicator);
1482                     return Token(keyword->token);
1483                 }
1484                 else
1485                 {
1486                     setState(Default);
1487                     return name;
1488                 }
1489             }
1490             Q_ASSERT(false);
1491         }
1492         case KindTest:
1493         {
1494             switch(peekCurrent())
1495             {
1496                 case ')':
1497                 {
1498                     popState();
1499                     return tokenAndAdvance(RPAREN);
1500                 }
1501                 case '(':
1502                     return tokenAndAdvance(LPAREN);
1503                 case ',':
1504                     return tokenAndAdvance(COMMA);
1505                 case '*':
1506                     return tokenAndAdvance(STAR);
1507                 case '?':
1508                     return tokenAndAdvance(QUESTION);
1509                 case '\'':
1510                 /* Fallthrough. */
1511                 case '"':
1512                     return tokenizeStringLiteral();
1513             }
1514
1515             const Token nc(tokenizeNCNameOrQName());
1516             if(nc.hasError())
1517                 return nc;
1518
1519             const TokenType ws = consumeWhitespace();
1520             if(ws == ERROR)
1521                 return error();
1522
1523             if(peekCurrent() == '(')
1524             {
1525                 const TokenMap *const keyword = lookupKeyword(nc.value);
1526                 if(keyword)
1527                 {
1528                     pushState(KindTest);
1529                     return Token(keyword->token);
1530                 }
1531                 else
1532                     return nc;
1533             }
1534             else
1535                 return nc;
1536             Q_ASSERT(false);
1537         }
1538         case KindTestForPI:
1539         {
1540             switch(peekCurrent())
1541             {
1542                 case ')':
1543                 {
1544                     popState();
1545                     return tokenAndAdvance(RPAREN);
1546                 }
1547                 case '\'':
1548                 /* Fallthrough. */
1549                 case '"':
1550                     return tokenizeStringLiteral();
1551                 default:
1552                     return tokenizeNCName();
1553             }
1554             Q_ASSERT(false);
1555         }
1556         case OccurrenceIndicator:
1557         {
1558             switch(peekCurrent())
1559             {
1560                 case '?':
1561                     return tokenAndChangeState(QUESTION, Operator);
1562                 case '*':
1563                     return tokenAndChangeState(STAR, Operator);
1564                 case '+':
1565                     return tokenAndChangeState(PLUS, Operator);
1566                 default:
1567                 {
1568                     setState(Operator);
1569                     return nextToken();
1570                 }
1571             }
1572             Q_ASSERT(false);
1573         }
1574         case XQueryVersion:
1575         {
1576             switch(peekCurrent())
1577             {
1578                 case '\'':
1579                 /* Fallthrough. */
1580                 case '"':
1581                     return tokenizeStringLiteral();
1582                 case ';':
1583                     return tokenAndChangeState(SEMI_COLON, Default);
1584             }
1585
1586             const Token id(tokenizeNCName());
1587
1588             if(id.type != NCNAME)
1589                 return id;
1590
1591             const TokenMap *const keyword = lookupKeyword(id.value);
1592             if(keyword)
1593                 return tokenAndChangeState(keyword->token, Default);
1594             else
1595                 return id;
1596             Q_ASSERT(false);
1597         }
1598         case StartTag:
1599         {
1600             if(peekAhead(-1) == '<')
1601             {
1602                 if(current().isSpace())
1603                     return Token(ERROR);
1604             }
1605             else
1606             {
1607                 if(consumeRawWhitespace())
1608                     return Token(END_OF_FILE);
1609             }
1610
1611             switch(peekCurrent())
1612             {
1613                 case '/':
1614                 {
1615                     if(peekAhead() == '>')
1616                     {
1617                         m_pos += 2;
1618
1619                         if(m_scanOnly)
1620                             return Token(POSITION_SET);
1621                         else
1622                         {
1623                             popState();
1624                             return Token(QUICK_TAG_END);
1625                         }
1626                     }
1627                     else
1628                         return error();
1629                 }
1630                 case '>':
1631                 {
1632                     if(m_scanOnly)
1633                         return tokenAndChangeState(POSITION_SET, StartTag);
1634                     else
1635                         return tokenAndChangeState(G_GT, ElementContent);
1636                 }
1637                 case '=':
1638                     return tokenAndAdvance(G_EQ);
1639                 case '\'':
1640                     return tokenAndChangeState(APOS, AposAttributeContent);
1641                 case '"':
1642                     return tokenAndChangeState(QUOTE, QuotAttributeContent);
1643                 default:
1644                     return tokenizeNCNameOrQName();
1645             }
1646             Q_ASSERT(false);
1647         }
1648         case AposAttributeContent:
1649         /* Fallthrough. */
1650         case QuotAttributeContent:
1651         {
1652             const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
1653             QString result;
1654             result.reserve(20);
1655
1656             if(m_scanOnly)
1657             {
1658                 int stack = 0;
1659                 return attributeAsRaw(sep, stack, m_pos, true, result);
1660             }
1661
1662             Q_ASSERT(!m_scanOnly);
1663             while(true)
1664             {
1665                 if(atEnd())
1666                 {
1667                     /* In the case that the XSL-T tokenizer invokes us with
1668                      * default state QuotAttributeContent, we need to be able
1669                      * to return a single string, in case that is all we have
1670                      * accumulated. */
1671                     if(result.isEmpty())
1672                         return Token(END_OF_FILE);
1673                     else
1674                         return Token(STRING_LITERAL, result);
1675                 }
1676
1677                 const QChar curr(current());
1678
1679                 if(curr == sep)
1680                 {
1681                     if(m_pos + 1 == m_length)
1682                         return Token(END_OF_FILE);
1683
1684                     if(m_data.at(m_pos + 1) == sep)
1685                     {
1686                         /* The quoting mechanism was used. */
1687                         m_pos += 2;
1688                         result.append(sep);
1689                         continue;
1690                     }
1691
1692                     const QChar next(m_data.at(m_pos + 1));
1693                     if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
1694                         return Token(ERROR); // i18n Space must separate attributes
1695                     else if(result.isEmpty())
1696                     {
1697                         return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE,
1698                                                    StartTag, 1);
1699                     }
1700                     else
1701                     {
1702                         /* Don't consume the sep, but leave it so we next time return a token for it. */
1703                         return Token(STRING_LITERAL, result);
1704                     }
1705
1706                     ++m_pos;
1707                     continue;
1708                 }
1709                 else if(curr == QLatin1Char('{'))
1710                 {
1711                     if(m_pos + 1 == m_length)
1712                         return Token(END_OF_FILE);
1713                     else if(peekAhead() == '{')
1714                     {
1715                         ++m_pos;
1716                         result.append(QLatin1Char('{'));
1717                     }
1718                     else
1719                     {
1720                         if(result.isEmpty())
1721                         {
1722                             /* The Attribute Value Template appeared directly in the attribute. */
1723                             pushState();
1724                             return tokenAndChangeState(CURLY_LBRACE, Default);
1725                         }
1726                         else
1727                         {
1728                             /* We don't advance, keep '{' as next token. */
1729                             return Token(STRING_LITERAL, result);
1730                         }
1731                     }
1732                 }
1733                 else if(curr == QLatin1Char('}'))
1734                 {
1735                     if(m_pos + 1 == m_length)
1736                         return Token(END_OF_FILE);
1737                     else if(peekAhead() == '}')
1738                     {
1739                         ++m_pos;
1740                         result.append(QLatin1Char('}'));
1741                     }
1742                     else
1743                         return Token(ERROR);
1744                 }
1745                 else if(curr == QLatin1Char('&'))
1746                 {
1747                     const QString ret(tokenizeCharacterReference());
1748                     if(ret.isNull())
1749                         return Token(ERROR);
1750                     else
1751                         result.append(ret);
1752                 }
1753                 else if(curr == QLatin1Char('<'))
1754                     return Token(STRING_LITERAL, result);
1755                 else
1756                 {
1757                     /* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
1758                      * 3.3.3 Attribute-Value Normalization.
1759                      *
1760                      * However, it is complicated a bit by that AVN is defined on top of
1761                      * EOL normalization and we do those two in one go here. */
1762                     switch(curr.unicode())
1763                     {
1764                         case 0xD:
1765                         {
1766                             if(peekAhead() == '\n')
1767                             {
1768                                 result.append(QLatin1Char(' '));
1769                                 ++m_pos;
1770                                 break;
1771                             }
1772                         }
1773                         case 0xA:
1774                         /* Fallthrough. */
1775                         case 0x9:
1776                         {
1777                             result.append(QLatin1Char(' '));
1778                             break;
1779                         }
1780                         default:
1781                             result.append(curr);
1782                     }
1783                 }
1784
1785                 ++m_pos;
1786             }
1787             Q_ASSERT(false);
1788         }
1789         case ElementContent:
1790         {
1791             QString result;
1792             result.reserve(20);
1793
1794             /* Whether the text node, result, may be whitespace only. Character references
1795              * and CDATA sections disable that. */
1796             bool mayBeWS = true;
1797
1798             CharacterSkips skipEOLNormalization;
1799
1800             while(true)
1801             {
1802                 if(atEnd())
1803                     return Token(END_OF_FILE);
1804
1805                 switch(peekCurrent())
1806                 {
1807                     case '<':
1808                     {
1809                         if(!result.isEmpty() && peekAhead(2) != '[')
1810                         {
1811                             /* We encountered the end, and it was not a CDATA section. */
1812                             /* We don't advance. Next time we'll handle the <... stuff. */
1813                             return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1814                         }
1815
1816                         ++m_pos;
1817                         if(atEnd())
1818                             return Token(END_OF_FILE);
1819
1820                         const QChar ahead(current());
1821                         if(ahead.isSpace())
1822                             return error();
1823                         else if(ahead == QLatin1Char('/'))
1824                         {
1825                             if(m_pos + 1 == m_length)
1826                                 return Token(END_OF_FILE);
1827                             else if(m_data.at(m_pos + 1).isSpace())
1828                                 return error();
1829                             else
1830                                 return tokenAndChangeState(BEGIN_END_TAG, EndTag);
1831                         }
1832                         else if(isNCNameStart(ahead))
1833                         {
1834                             pushState();
1835                             return tokenAndChangeState(G_LT, StartTag, 0);
1836                         }
1837                         else if(aheadEquals("!--", 3, 0))
1838                         {
1839                             pushState();
1840                             m_pos += 3;
1841                             return tokenAndChangeState(COMMENT_START, XMLComment, 0);
1842                         }
1843                         else if(aheadEquals("![CDATA[", 8, 0))
1844                         {
1845                             mayBeWS = false;
1846                             m_pos += 8;
1847                             const int start = m_pos;
1848                             const int len = scanUntil("]]>");
1849
1850                             if(len == -1)
1851                                 return Token(END_OF_FILE);
1852
1853                             m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
1854                             result.append(m_data.mid(start, len));
1855                             break;
1856                         }
1857                         else if(ahead == QLatin1Char('?'))
1858                         {
1859                             pushState();
1860                             return tokenAndChangeState(PI_START, ProcessingInstructionName);
1861                         }
1862                         else
1863                             return Token(G_LT);
1864                     }
1865                     case '&':
1866                     {
1867                         const QString ret(tokenizeCharacterReference());
1868                         if(ret.isNull())
1869                             return Token(ERROR);
1870                         else
1871                         {
1872                             skipEOLNormalization.insert(result.count());
1873                             result.append(ret);
1874                             mayBeWS = false;
1875                             break;
1876                         }
1877                     }
1878                     case '{':
1879                     {
1880                         // TODO remove this check, also below.
1881                         if(m_pos + 1 == m_length)
1882                             return Token(END_OF_FILE);
1883                         else if(peekAhead() == '{')
1884                         {
1885                             ++m_pos;
1886                             result.append(QLatin1Char('{'));
1887                         }
1888                         else
1889                         {
1890                             if(result.isEmpty())
1891                             {
1892                                 pushState();
1893                                 return tokenAndChangeState(CURLY_LBRACE, Default);
1894                             }
1895                             else
1896                             {
1897                                 /* We don't advance here. */
1898                                 return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1899                             }
1900                         }
1901                         break;
1902                     }
1903                     case '}':
1904                     {
1905                         if(m_pos + 1 == m_length)
1906                             return Token(END_OF_FILE);
1907                         else if(peekAhead() == '}')
1908                         {
1909                             ++m_pos;
1910                             result.append(QLatin1Char('}'));
1911                         }
1912                         else
1913                         {
1914                             /* This is a parse error, and the grammar won't be able
1915                              * to reduce this CURLY_RBRACE. */
1916                             return tokenAndChangeState(CURLY_RBRACE, Default);
1917                         }
1918                         break;
1919                     }
1920                     case '\n':
1921                     {
1922                         /* We want to translate \r\n into \n. */
1923                         if(peekAhead(-1) == '\r')
1924                             break;
1925                         /* else, fallthrough. */
1926                     }
1927                     case '\r':
1928                     {
1929                         result.append(QLatin1Char('\n'));
1930                         break;
1931                     }
1932                     default:
1933                     {
1934                         result.append(current());
1935                         break;
1936                     }
1937                 }
1938                 ++m_pos;
1939             }
1940             Q_ASSERT(false);
1941         }
1942         case ProcessingInstructionName:
1943         {
1944             const int start = m_pos;
1945
1946             while(true)
1947             {
1948                 ++m_pos;
1949                 if(m_pos >= m_length)
1950                     return Token(END_OF_FILE);
1951
1952                 const QChar next(current());
1953                 if(next.isSpace() || next == QLatin1Char('?'))
1954                 {
1955                     return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start),
1956                                                ProcessingInstructionContent);
1957                 }
1958             }
1959             Q_ASSERT(false);
1960         }
1961         case ProcessingInstructionContent:
1962         {
1963             /* Consume whitespace between the name and the content. */
1964             if(consumeRawWhitespace())
1965                 return Token(END_OF_FILE);
1966
1967             const int start = m_pos;
1968             const int len = scanUntil("?>");
1969
1970             if(len == -1)
1971                 return Token(END_OF_FILE);
1972             else
1973             {
1974                 m_pos += 2; /* Consume "?>" */
1975                 popState();
1976                 return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
1977             }
1978             Q_ASSERT(false);
1979         }
1980         case EndTag:
1981         {
1982             if(consumeRawWhitespace())
1983                 return END_OF_FILE;
1984
1985             if(peekCurrent() == '>')
1986             {
1987                 popState();
1988                 return tokenAndAdvance(G_GT);
1989             }
1990             else
1991                 return tokenizeNCNameOrQName();
1992             Q_ASSERT(false);
1993         }
1994         case XMLComment:
1995         {
1996             const int start = m_pos;
1997             const int len = scanUntil("--");
1998
1999             if(len == -1)
2000                 return END_OF_FILE;
2001             else
2002             {
2003                 m_pos += 2; /* Consume "--". */
2004                 popState();
2005
2006                 if(peekCurrent() == '>')
2007                 {
2008                     ++m_pos;
2009                     return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
2010                 }
2011                 else
2012                     return error();
2013             }
2014             Q_ASSERT(false);
2015         }
2016         case Pragma:
2017         {
2018             /* Consume whitespace. */
2019             if(consumeRawWhitespace())
2020                 return Token(END_OF_FILE);
2021
2022             setState(PragmaContent);
2023             return tokenizeNCNameOrQName();
2024         }
2025         case PragmaContent:
2026         {
2027             QString result;
2028             result.reserve(20);
2029
2030             const bool hasWS = m_pos < m_length && current().isSpace();
2031
2032             /* Consume all whitespace up to the pragma content(if any). */
2033             if(consumeRawWhitespace())
2034                 return Token(END_OF_FILE);
2035
2036             if(peekCurrent() == '#' && peekAhead() == ')')
2037             {
2038                 /* We reached the end, and there's no pragma content. */
2039                 return tokenAndChangeState(PRAGMA_END, Default, 2);
2040             }
2041             else if(!hasWS)
2042             {
2043                 /* A separating space is required if there's pragma content. */
2044                 return error(); /* i18n */
2045             }
2046
2047             const int start = m_pos;
2048             const int len = scanUntil("#)");
2049             if(len == -1)
2050                 return Token(END_OF_FILE);
2051
2052             return Token(STRING_LITERAL, m_data.mid(start, len));
2053             Q_ASSERT(false);
2054         }
2055     }
2056
2057     Q_ASSERT(false);
2058     return error();
2059 }
2060
2061 Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep,
2062                                                  int &sepStack,
2063                                                  const int startPos,
2064                                                  const bool aInLiteral,
2065                                                  QString &result)
2066 {
2067     bool inLiteral = aInLiteral;
2068     const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');
2069
2070     while(true)
2071     {
2072         if(atEnd())
2073             return END_OF_FILE;
2074
2075         if(peekCurrent() == sep.unicode())
2076         {
2077             if(inLiteral)
2078                 inLiteral = false;
2079             else
2080                 inLiteral = true;
2081
2082             if(peekAhead() == sep.unicode())
2083             {
2084                 /* The quoting mechanism was used. */
2085                 result.append(current());
2086                 m_pos += 2;
2087                 continue;
2088             }
2089             else
2090             {
2091                 /* Don't consume the separator, such that we
2092                  * return a token for it next time. */
2093                 if(m_pos == startPos)
2094                 {
2095                     ++m_pos;
2096                     setState(StartTag);
2097                     return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
2098                 }
2099
2100
2101                 if(sepStack == 0)
2102                 {
2103                     return Token(STRING_LITERAL, result);
2104                 }
2105                 else
2106                 {
2107                     result.append(current());
2108                     ++m_pos;
2109                     continue;
2110                 }
2111             }
2112         }
2113         else if(peekCurrent() == '&')
2114         {
2115             const QString ret(tokenizeCharacterReference());
2116             if(ret.isNull())
2117                 return Token(ERROR);
2118             else
2119             {
2120                 result.append(ret);
2121                 ++m_pos;
2122                 continue;
2123             }
2124         }
2125         else if(peekCurrent() == otherSep)
2126         {
2127             result.append(current());
2128             ++m_pos;
2129
2130             if(peekCurrent() == otherSep)
2131                 ++m_pos;
2132
2133             if(inLiteral)
2134                 inLiteral = false;
2135             else
2136                 inLiteral = true;
2137
2138             continue;
2139         }
2140         else if(peekCurrent() == '{')
2141         {
2142             result.append(current());
2143
2144             if(peekAhead() == '{')
2145             {
2146                 m_pos += 2;
2147                 continue;
2148             }
2149             else
2150             {
2151                 ++m_pos;
2152                 ++sepStack;
2153                 const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
2154                 if(t.type != SUCCESS)
2155                     return t;
2156             }
2157
2158         }
2159         else if(peekCurrent() == '}')
2160         {
2161             if(inLiteral && peekAhead() == '}')
2162             {
2163                 result.append(current());
2164                 m_pos += 2;
2165                 continue;
2166             }
2167             else
2168             {
2169                 ++m_pos;
2170                 --sepStack;
2171                 return Token(SUCCESS); /* The return value is arbitrary. */
2172             }
2173         }
2174         else
2175         {
2176             result.append(current());
2177             ++m_pos;
2178         }
2179     }
2180 }
2181
2182 Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator)
2183 {
2184     sourceLocator->first_line = m_line;
2185     sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */
2186
2187     if(m_tokenStack.isEmpty())
2188         return nextToken();
2189     else
2190     {
2191         const Token retval(m_tokenStack.pop());
2192
2193         switch(retval.type)
2194         {
2195             case MODULE:
2196             /* Fallthrough.*/
2197             case SCHEMA:
2198             /* Fallthrough.*/
2199             case COPY_NAMESPACES:
2200             {
2201                 setState(NamespaceKeyword);
2202                 break;
2203             }
2204             case VERSION:
2205             {
2206                 setState(XQueryVersion);
2207                 break;
2208             }
2209             case AS:
2210             /* Fallthrough. */
2211             case OF:
2212             {
2213                 setState(ItemType);
2214                 break;
2215             }
2216             default:
2217             {
2218                 if(isOperatorKeyword(retval.type))
2219                     setState(Default);
2220
2221                 break;
2222             }
2223         };
2224
2225         return retval;
2226     }
2227 }
2228
2229 int XQueryTokenizer::commenceScanOnly()
2230 {
2231     m_scanOnly = true;
2232     return m_pos;
2233 }
2234
2235 void XQueryTokenizer::resumeTokenizationFrom(const int pos)
2236 {
2237     m_scanOnly = false;
2238     m_pos = pos;
2239 }
2240
2241 void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
2242 {
2243 }
2244
2245 #undef handleWhitespace
2246
2247 } // namespace QPatternist
2248
2249 QT_END_NAMESPACE