1 /****************************************************************************
3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/
6 ** This file is part of the QtXmlPatterns module of the Qt Toolkit.
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** GNU Lesser General Public License Usage
10 ** This file may be used under the terms of the GNU Lesser General Public
11 ** License version 2.1 as published by the Free Software Foundation and
12 ** appearing in the file LICENSE.LGPL included in the packaging of this
13 ** file. Please review the following information to ensure the GNU Lesser
14 ** General Public License version 2.1 requirements will be met:
15 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
17 ** In addition, as a special exception, Nokia gives you certain additional
18 ** rights. These rights are described in the Nokia Qt LGPL Exception
19 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
21 ** GNU General Public License Usage
22 ** Alternatively, this file may be used under the terms of the GNU General
23 ** Public License version 3.0 as published by the Free Software Foundation
24 ** and appearing in the file LICENSE.GPL included in the packaging of this
25 ** file. Please review the following information to ensure the GNU General
26 ** Public License version 3.0 requirements will be met:
27 ** http://www.gnu.org/copyleft/gpl.html.
30 ** Alternatively, this file may be used in accordance with the terms and
31 ** conditions contained in a signed written agreement between you and Nokia.
40 ****************************************************************************/
44 #include "qquerytransformparser_p.h"
46 #include "qxquerytokenizer_p.h"
48 #include "qtokenlookup.cpp"
/* handleWhitespace(): helper macro that begins by calling consumeWhitespace()
 * and storing the outcome in a local named t. The rest of the macro body is
 * not visible in this excerpt; a comment inside nextToken() states that the
 * macro returns on END_OF_FILE. The macro is #undef'ed at the end of the file. */
55 #define handleWhitespace() \
57 const TokenType t = consumeWhitespace(); \
/* Constructs a tokenizer for the given query text.
 *
 * The initializers visible here pass location to the Tokenizer base class,
 * cache query.length() in m_length and start out in startingState. The
 * assertion documents that location must be either valid or empty. */
62 XQueryTokenizer::XQueryTokenizer(const QString &query,
64 const State startingState) : Tokenizer(location)
66 , m_length(query.length())
67 , m_state(startingState)
73 Q_ASSERT(location.isValid() || location.isEmpty());
/* Returns the character of m_data at the current position, m_pos. */
76 const QChar XQueryTokenizer::current() const
79 return m_data.at(m_pos);
/* Returns the current character converted to Latin-1 via QChar::toLatin1(),
 * which makes it usable directly in switch statements over char literals. */
84 char XQueryTokenizer::peekCurrent() const
86 return current().toLatin1();
/* Looks ahead in m_data for a "::" without modifying m_pos: it inspects the
 * character at a local position and uses peekAhead() to test whether a ':'
 * is followed by another ':'.
 *
 * NOTE(review): the return statements are not visible in this excerpt. The
 * caller in nextToken() stores the result in a variable named wsLength, so
 * the value is presumably the number of characters to skip to reach the
 * "::" -- confirm against the full source. */
89 int XQueryTokenizer::peekForColonColon() const
91 /* Note, we don't modify m_pos in this function, so we need to do offset
97 switch(m_data.at(pos).toLatin1())
99 /* Fallthrough these four. */
107 if(peekAhead((pos - m_pos) + 1) == ':')
/* Overload that takes an advance count, which must be non-negative.
 *
 * NOTE(review): only the assertion is visible here. Judging by call sites
 * such as tokenAndChangeState(DOTDOT, Operator, 2), this presumably switches
 * to the given state, moves m_pos forward by advance and returns a token of
 * type code -- confirm against the full source. */
120 Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
124 Q_ASSERT(advance >= 0);
/* Overload that carries a string value: returns Token(code, value).
 * The lines that change the state are not visible in this excerpt. */
130 Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
131 const QString &value,
135 return Token(code, value);
/* Returns a token of type code after advancing; advance must be non-negative.
 *
 * NOTE(review): only the assertion is visible here; the statement that moves
 * m_pos and the return are presumed from the function name -- confirm. */
138 Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code,
141 Q_ASSERT(advance >= 0);
/* Builds a copy of input in which line endings are normalized to '\n'.
 *
 * Each index of input is first checked against characterSkips; callers
 * record there the positions of text that came from character references so
 * that it can be treated separately. For the remaining characters a switch
 * on the code unit handles line breaks: a carriage return directly followed
 * by '\n' is detected with a bounds-checked look-ahead, and '\n' is appended
 * to the result.
 *
 * NOTE(review): the case labels and the branch taken for skipped indexes are
 * not visible in this excerpt. */
146 QString XQueryTokenizer::normalizeEOL(const QString &input,
147 const CharacterSkips &characterSkips)
149 const int len = input.count();
152 /* The likelihood is rather high that it'll be the same content. */
155 for(int i = 0; i < len; ++i)
157 const QChar &at = input.at(i);
159 if(characterSkips.contains(i))
164 switch(input.at(i).unicode())
168 if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
171 /* Else, fallthrough. */
175 result.append(QLatin1Char('\n'));
/* Consumes the body of an XQuery comment up to and including its closing
 * ":)". consumeWhitespace() skips the opening "(:" before calling this.
 *
 * Nested comments are handled by recursion: when the start of another
 * comment is seen, this function calls itself and, if that succeeds, carries
 * on with the enclosing comment; otherwise the nested error is propagated.
 * Line breaks inside the comment update m_columnOffset, with "\r\n" treated
 * as one break.
 *
 * Returns SUCCESS when the comment closed properly and ERROR when the input
 * ended while still inside the comment. */
188 Tokenizer::TokenType XQueryTokenizer::consumeComment()
190 /* Below, we return ERROR instead of END_OF_FILE such that the parser
191 * sees an invalid comment. */
192 while(m_pos < m_length)
194 switch(peekCurrent())
198 ++m_pos; /* Consume ':' */
202 if(peekCurrent() == ')')
204 ++m_pos; /* Consume ')' */
205 return SUCCESS; /* The comment closed nicely. */
207 continue; /* We don't want to increment m_pos twice. */
210 { /* It looks like the start of a comment. */
215 else if(peekCurrent() == ':')
217 /* And it is a nested comment -- parse it. */
218 const TokenType retval = consumeComment();
219 if(retval == SUCCESS)
220 continue; /* Continue with our "own" comment. */
222 return retval; /* Return the error in the nested comment. */
230 /* We want to count \r\n as a single line break. */
231 if(peekAhead() == '\n')
234 m_columnOffset = m_pos;
243 return ERROR; /* Error: we reached the end while inside a comment. */
/* Skips whitespace starting at m_pos, looping while m_pos < m_length.
 * When a line break is consumed m_columnOffset is updated, and a '\n'
 * following the current character is checked for with peekAhead().
 *
 * NOTE(review): the return statements are not visible in this excerpt.
 * Every caller in this file returns END_OF_FILE when the result is true, so
 * true presumably means the end of input was reached -- confirm. */
246 bool XQueryTokenizer::consumeRawWhitespace()
248 while(m_pos < m_length)
250 switch(peekCurrent())
258 if(peekAhead() == '\n')
261 m_columnOffset = m_pos;
/* Skips whitespace and XQuery comments starting at m_pos.
 *
 * Line breaks update m_columnOffset, with "\r\n" counted as a single break.
 * When "(:" is found, both characters are consumed and consumeComment()
 * handles the rest of the comment; its result is compared against SUCCESS.
 *
 * NOTE(review): the return statements are not visible in this excerpt.
 * Callers compare the result against ERROR. */
274 Tokenizer::TokenType XQueryTokenizer::consumeWhitespace()
276 while(m_pos < m_length)
278 switch(peekCurrent())
286 /* We want to count \r\n as a single line break. */
287 if(peekAhead() == '\n')
290 m_columnOffset = m_pos;
297 if(peekAhead() == ':')
299 m_pos += 2; /* Consume "(:" */
301 const TokenType comment = consumeComment();
302 if(comment == SUCCESS)
/* Returns the Latin-1 value of the character length positions away from
 * m_pos, provided m_pos + length is below m_length. The value returned when
 * that position is past the end is not visible in this excerpt.
 *
 * NOTE(review): this file also calls peekAhead(-1). Only the upper bound is
 * checked in the lines visible here, so a negative length with m_pos == 0
 * would index before the start of m_data -- confirm callers guarantee
 * m_pos > 0 in those cases. */
317 char XQueryTokenizer::peekAhead(const int length) const
319 if(m_pos + length < m_length)
320 return m_data.at(m_pos + length).toLatin1();
/* Returns the Token used to report a tokenization error. The body is not
 * visible in this excerpt; nextToken() uses it as "return error();", for
 * example when a pragma lacks the required separating space. */
325 Tokenizer::Token XQueryTokenizer::error()
/* Returns true if ch is one of the ASCII digits '0' through '9'. */
330 bool XQueryTokenizer::isDigit(const char ch)
332 return ch >= '0' && ch <= '9';
/* Tests whether ch can begin an NCName. An underscore is special-cased;
 * otherwise the decision is made on the Unicode category of ch, where the
 * letter categories and Number_Letter are grouped together.
 *
 * NOTE(review): the return statements are not visible in this excerpt; the
 * listed categories are presumably the accepted ones. */
335 /* Replace with function in QXmlUtils. Write test cases for this. */
336 bool XQueryTokenizer::isNCNameStart(const QChar ch)
338 if(ch == QLatin1Char('_'))
341 switch(ch.category())
343 case QChar::Letter_Lowercase:
344 case QChar::Letter_Uppercase:
345 case QChar::Letter_Other:
346 case QChar::Letter_Titlecase:
347 case QChar::Number_Letter:
/* Tests whether ch can appear after the first character of an NCName.
 * Compared with isNCNameStart(), the category switch additionally lists the
 * mark categories, Letter_Modifier and Number_DecimalDigit.
 *
 * NOTE(review): the checks preceding the switch and the return statements
 * are not visible in this excerpt. */
354 bool XQueryTokenizer::isNCNameBody(const QChar ch)
364 switch(ch.category())
366 case QChar::Letter_Lowercase:
367 case QChar::Letter_Uppercase:
368 case QChar::Letter_Other:
369 case QChar::Letter_Titlecase:
370 case QChar::Number_Letter:
371 case QChar::Mark_SpacingCombining:
372 case QChar::Mark_Enclosing:
373 case QChar::Mark_NonSpacing:
374 case QChar::Letter_Modifier:
375 case QChar::Number_DecimalDigit:
/* Classifies code as a "phrase keyword" by means of a fall-through list of
 * token types, of which only COPY_NAMESPACES is visible in this excerpt.
 * nextToken() uses the result in state Operator to decide whether a second
 * name should be tokenized together with the keyword. */
382 bool XQueryTokenizer::isPhraseKeyword(const TokenType code)
386 /* Fallthrough all these. */
389 case COPY_NAMESPACES:
/* Classifies code as an operator keyword by means of a fall-through list of
 * token types; the list itself is not visible in this excerpt. */
406 bool XQueryTokenizer::isOperatorKeyword(const TokenType code)
410 /* Fallthrough all these. */
/* Classifies t as a type token by means of a fall-through list of token
 * types; PROCESSING_INSTRUCTION and SCHEMA_ATTRIBUTE are the members visible
 * in this excerpt. nextToken() uses the result to recognise constructors and
 * node type tests. */
447 bool XQueryTokenizer::isTypeToken(const TokenType t)
451 /* Fallthrough all these. */
459 case PROCESSING_INSTRUCTION:
460 case SCHEMA_ATTRIBUTE:
/* Tokenizes either a plain NCName or a prefixed QName starting at m_pos.
 *
 * A first NCName is read. The name is treated as complete when the next
 * character is not ':' or when the ':' is followed by '=' (i.e. ":=").
 * Otherwise a second NCName is read and a QNAME token is returned whose
 * value is the text from the start position up to m_pos.
 *
 * NOTE(review): the error checks on t1/t2 and the return taken in the plain
 * NCName case are not visible in this excerpt. */
469 Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName()
471 const int start = m_pos;
473 const Token t1 = tokenizeNCName();
477 if(peekCurrent() != ':' || peekAhead() == '=')
482 const Token t2 = tokenizeNCName();
486 return Token(QNAME, m_data.mid(start, m_pos - start));
/* Tokenizes a numeric literal starting at m_pos.
 *
 * The loop advances m_pos over the literal while looking at exponent
 * markers ('e'/'E'), signs ('+'/'-'), a decimal point (hasDot tracks whether
 * one has been seen) and characters that could start an NCName. The token
 * value is the text from startPos to m_pos; its type is XPATH2_NUMBER when
 * isXPath20 was set during the scan and NUMBER otherwise.
 *
 * NOTE(review): classification uses ch.cell(), the low byte of the code
 * unit; whether the high byte is also checked is not visible in this
 * excerpt -- confirm non-Latin-1 characters cannot be mistaken for digits. */
489 Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral()
492 const int startPos = m_pos;
494 bool isXPath20 = false;
496 for(; m_pos < m_length; ++m_pos)
500 char cell = ch.cell();
502 if(cell == 'e' || cell == 'E')
513 if(cell == '+' || cell == '-')
517 if(isNCNameStart(ch))
520 if(cell < '0' || cell > '9')
522 if(cell == '.' && !hasDot)
529 return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
/* Resolves the character or entity reference starting at the '&' at m_pos
 * and returns its replacement text.
 *
 * The text between '&' and the next ';' is extracted. It is first looked up
 * as a predefined entity through charForReference(). If that yields nothing
 * and the text starts with '#', it is parsed as a numeric reference: "#x"
 * selects hexadecimal, a plain "#" selects decimal, and the number is
 * converted with QString::toInt(). Code points that need it are emitted as
 * a high/low surrogate pair.
 *
 * A missing ';' is a syntax error. Callers in this file return ERROR after
 * calling this function, so failure is presumably signalled through a null
 * or empty QString -- the failing returns are not visible in this excerpt. */
532 QString XQueryTokenizer::tokenizeCharacterReference()
534 Q_ASSERT(peekCurrent() == '&');
536 const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
538 if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
541 QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
544 const QChar charRef(charForReference(content));
546 if(!charRef.isNull())
548 else if(content.startsWith(QLatin1Char('#')))
552 /* It is only '#' or '#x'. */
553 if(content.length() < 2)
556 /* We got a hex number if it starts with 'x', otherwise it's a decimal. */
557 if(content.at(1) == QLatin1Char('x'))
560 content = content.mid(2); /* Remove "#x". */
565 content = content.mid(1); /* Remove "#". */
568 bool conversionOK = false;
569 const int codepoint = content.toInt(&conversionOK, base);
573 const QChar ch(codepoint);
577 /* We likely have something which requires surrogate pairs. */
579 result += QChar(QChar::highSurrogate(codepoint));
580 result += QChar(QChar::lowSurrogate(codepoint));
/* Searches m_data, starting at m_pos, for the Latin-1 string content and
 * computes the distance from m_pos to the match.
 *
 * NOTE(review): the handling of "not found" and the return statement are
 * not visible in this excerpt. Callers use the result as the length of the
 * text preceding the terminator and afterwards add only the terminator's
 * length to m_pos, so this function presumably also advances m_pos to the
 * match -- confirm. */
593 int XQueryTokenizer::scanUntil(const char *const content)
595 const int end = m_data.indexOf(QString::fromLatin1(content), m_pos);
601 const int len = end - m_pos;
/* Maps the name of a predefined entity to its character.
 *
 * m_charRefs is filled on first use with the five predefined entities:
 * lt, gt, amp, quot and apos. The lookup result for reference is returned;
 * the caller tests it with isNull() to detect an unknown name. */
607 QChar XQueryTokenizer::charForReference(const QString &reference)
609 if(m_charRefs.isEmpty())
612 m_charRefs.reserve(5);
613 m_charRefs.insert(QLatin1String("lt"), QLatin1Char('<'));
614 m_charRefs.insert(QLatin1String("gt"), QLatin1Char('>'));
615 m_charRefs.insert(QLatin1String("amp"), QLatin1Char('&'));
616 m_charRefs.insert(QLatin1String("quot"), QLatin1Char('"'));
617 m_charRefs.insert(QLatin1String("apos"), QLatin1Char('\''));
620 return m_charRefs.value(reference);
/* Tokenizes a string literal whose opening quote is the current character.
 *
 * The quote character found at m_pos is used as the delimiter. The content
 * is accumulated character by character rather than sliced out of m_data,
 * because character references have to be expanded. The position of each
 * expanded reference is recorded in skipEOLNormalization and later handed
 * to normalizeEOL(). A delimiter directly followed by another delimiter is
 * the escaping mechanism; otherwise the delimiter ends the literal and a
 * STRING_LITERAL token with the EOL-normalized content is returned.
 *
 * NOTE(review): the behaviour when the input ends before the closing
 * delimiter is not visible in this excerpt. */
623 Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral()
625 const QChar delimiter(current());
626 /* We cannot unfortunately just scan and then do mid(),
627 * since we can encounter character references. */
630 /* This is more likely than QString's default allocation. */
633 CharacterSkips skipEOLNormalization;
635 /* Advance over the initial quote character. */
638 for(; m_pos < m_length; ++m_pos)
640 const QChar c(current());
642 if(c == QLatin1Char('&'))
644 const QString charRef(tokenizeCharacterReference());
650 skipEOLNormalization.insert(result.count());
651 result.append(charRef);
655 else if(c == delimiter)
657 /* Maybe the escaping mechanism is used. For instance, "s""s"
658 * has the value `s"s'. */
661 if(current() == delimiter) /* Double quote. */
664 return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
/* Tokenizes an NCName starting at m_pos.
 *
 * The first character must satisfy isNCNameStart(); after that m_pos is
 * advanced for as long as isNCNameBody() accepts the current character. The
 * NCNAME token returned holds the text from startPos to m_pos.
 *
 * NOTE(review): what is returned when the first character is not a valid
 * name start is not visible in this excerpt; callers test the returned
 * token's type against NCNAME. */
673 Tokenizer::Token XQueryTokenizer::tokenizeNCName()
675 const int startPos = m_pos;
677 if(m_pos < m_length && isNCNameStart(current()))
681 for(; m_pos < m_length; ++m_pos)
683 if(!isNCNameBody(current()))
687 return Token(NCNAME, m_data.mid(startPos, m_pos - startPos));
/* Compares the len characters of m_data beginning at m_pos + offset with
 * the Latin-1 string chs, which must be exactly len characters long.
 *
 * NOTE(review): the bounds check tests m_pos + len and does not take offset
 * into account, while the loop reads indexes up to m_pos + offset + len - 1.
 * With offset 0 the check is stricter than needed (a match ending exactly at
 * the end of the input is rejected); with an offset above 1 it would be too
 * lenient. The return statements are not visible in this excerpt. */
693 bool XQueryTokenizer::aheadEquals(const char *const chs,
695 const int offset) const
698 Q_ASSERT(qstrlen(chs) == uint(len));
700 if(m_pos + len >= m_length)
703 for(int i = offset; i < (len + offset); ++i)
705 if(m_data.at(m_pos + i).toLatin1() != chs[i - offset])
/* Looks keyword up in the keyword table by passing its Latin-1 form and its
 * length to TokenLookup::value(). Callers in this file test the result
 * against null before treating the name as a keyword. */
712 const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword)
714 return TokenLookup::value(keyword.toLatin1().constData(), keyword.length());
/* Accessor for the tokenizer's current state. The body is not visible in
 * this excerpt; nextToken() compares the result against State values such
 * as Default and Operator. */
717 XQueryTokenizer::State XQueryTokenizer::state() const
/* Switches the tokenizer to state s. The body is not visible in this
 * excerpt; presumably it assigns m_state -- confirm. */
722 void XQueryTokenizer::setState(const State s)
/* Pushes s onto m_stateStack so that a later popState() switches to it. */
727 void XQueryTokenizer::pushState(const State s)
729 m_stateStack.push(s);
/* Pushes the current state, m_state, onto m_stateStack so that a later
 * popState() restores it. */
732 void XQueryTokenizer::pushState()
734 m_stateStack.push(m_state);
/* Makes the most recently pushed state current again. If m_stateStack is
 * empty the call leaves m_state untouched, which tolerates unbalanced
 * closing curlies in the input. */
737 void XQueryTokenizer::popState()
739 /* QStack::pop() asserts if it's empty, so we need to check
740 * it, since we might receive unbalanced curlies. */
741 if(!m_stateStack.isEmpty())
742 m_state = m_stateStack.pop();
/* Produces the next token of the query, driven by the current tokenizer
 * state.
 *
 * Each state section below inspects the character(s) at m_pos, returns a
 * Token and, where needed, switches to the state that must follow it
 * (setState(), pushState()/popState(), tokenAndChangeState()). Constructs
 * made up of several keywords are recognised with look-ahead; the tokens
 * that follow the first one are queued on m_tokenStack, from which
 * nextToken(YYLTYPE *) pops.
 *
 * NOTE(review): many case labels, braces and returns of this function are
 * not visible in this excerpt, so the comments added here only describe
 * what the visible lines establish. */
745 Tokenizer::Token XQueryTokenizer::nextToken()
749 /* We want to skip or do special whitespace handling for these
750 * states. So fallthrough all of the following. */
751 case AposAttributeContent:
757 case ProcessingInstructionName:
758 case QuotAttributeContent:
770 case NamespaceKeyword:
772 switch(peekCurrent())
775 return tokenAndAdvance(COMMA);
780 setState(NamespaceDecl);
781 return tokenizeStringLiteral();
785 const Token id(tokenizeNCName());
787 if(id.type != NCNAME)
790 const TokenMap *const keyword = lookupKeyword(id.value);
793 switch(keyword->token)
804 setState(NamespaceDecl);
818 if(state() != NamespaceKeyword)
825 return Token(keyword->token);
834 switch(peekCurrent())
837 return tokenAndAdvance(G_EQ);
839 return tokenAndChangeState(SEMI_COLON, Default);
843 return tokenizeStringLiteral();
846 const Token nc(tokenizeNCName());
850 const char pc = peekCurrent();
851 const TokenMap* const t = lookupKeyword(nc.value);
/* NOTE(review): when pc is '\'' the left operand is already true, so the
 * null check on t is skipped and t->token below is dereferenced even if
 * lookupKeyword() found nothing. Consider testing t first. */
853 if(pc == '\'' || (pc == '"' && t))
854 return tokenAndChangeState(t->token, Default, 0);
862 if(peekCurrent() == ':')
864 Q_ASSERT(peekAhead() == ':');
866 setState(AfterAxisSeparator);
867 return Token(COLONCOLON);
871 case AfterAxisSeparator:
874 /* State Operator and state Default have a lot of tokens in common except
875 * for minor differences. So we treat them the same way, and sprinkle logic
876 * here and there to handle the small differences. */
880 switch(peekCurrent())
883 return tokenAndChangeState(G_EQ, Default);
885 return tokenAndChangeState(MINUS, Default);
887 return tokenAndChangeState(PLUS, Default);
889 return tokenAndChangeState(LBRACKET, Default);
891 return tokenAndChangeState(RBRACKET, Operator);
893 return tokenAndChangeState(COMMA, Default);
895 return tokenAndChangeState(SEMI_COLON, Default);
897 return tokenAndChangeState(DOLLAR, VarName);
899 return tokenAndChangeState(BAR, Default);
901 return tokenAndChangeState(QUESTION, Operator);
903 return tokenAndChangeState(RPAREN, Operator);
905 return tokenAndChangeState(AT_SIGN, Default);
906 /* Fallthrough all these. */
917 return tokenizeNumberLiteral();
920 const char next = peekAhead();
922 return tokenAndChangeState(DOTDOT, Operator, 2);
923 /* .5 is allowed, as short form for 0.5:
924 * <tt>[142] DecimalLiteral ::= ("." Digits) | (Digits "." [0-9]*)</tt>
926 else if(isDigit(next))
927 return tokenizeNumberLiteral();
929 return tokenAndChangeState(DOT, Operator);
936 return tokenizeStringLiteral();
941 if(peekAhead() == '#')
942 return tokenAndChangeState(PRAGMA_START, Pragma, 2);
944 return tokenAndChangeState(LPAREN, Default);
948 if(peekAhead() == ':')
950 m_pos += 2; /* Consume *:. */
951 const Token nc = tokenizeNCName();
956 return tokenAndChangeState(ANY_PREFIX, nc.value, Operator);
959 return tokenAndChangeState(STAR, state() == Default ? Operator : Default);
966 return tokenAndChangeState(ASSIGN, Default, 2);
968 return tokenAndChangeState(COLONCOLON, Default, 2);
975 if(peekAhead() == '=')
976 return tokenAndChangeState(G_NE, Default, 2);
985 return tokenAndChangeState(G_LE, Default, 2);
987 return tokenAndChangeState(PRECEDES, Default, 2);
991 return tokenAndChangeState(PI_START, ProcessingInstructionName, 2);
995 if(aheadEquals("!--", 3))
997 m_pos += 3; /* Consume "!--". */
999 return tokenAndChangeState(COMMENT_START, XMLComment);
1001 /* Fallthrough. It's a syntax error, and this is a good way to report it. */
1005 if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1)))
1007 /* We assume it's an element constructor. */
1008 pushState(Operator);
1011 return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag);
1020 return tokenAndChangeState(G_GE, Default, 2);
1022 return tokenAndChangeState(FOLLOWS, Default, 2);
1024 return tokenAndChangeState(G_GT, Default);
1029 if(peekAhead() == '/')
1030 return tokenAndChangeState(SLASHSLASH, Default, 2);
1032 return tokenAndChangeState(SLASH, Default);
1036 pushState(Operator);
1037 return tokenAndChangeState(CURLY_LBRACE, Default);
1043 return tokenAndAdvance(CURLY_RBRACE);
1047 /* Ok. We're in state Default or Operator, and it wasn't a simple
1050 const Token id(tokenizeNCName());
1052 if(id.type != NCNAME)
1055 const TokenMap *const keyword = lookupKeyword(id.value);
1057 if(state() == Operator)
1061 if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING)
1063 else if(keyword->token == RETURN)
1065 else if(isPhraseKeyword(keyword->token))
1067 const TokenType ws = consumeWhitespace();
1071 const Token id2(tokenizeNCName());
1072 const TokenMap *const keyword2 = lookupKeyword(id2.value);
1076 if(keyword->token == TREAT && keyword2->token == AS)
1078 else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY)
1081 m_tokenStack.push(Token(keyword2->token));
1084 m_tokenStack.push(id2);
1086 return Token(keyword->token);
1090 /* Such that we tokenize the second token in "empty greatest". */
1091 if(keyword->token != EMPTY)
1095 if(keyword->token == AS || keyword->token == CASE)
1098 return Token(keyword->token);
1104 Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator);
1107 * This is hard. Consider this:
1109 * Valid: child ::nameTest
1110 * Valid: child:: nameTest
1111 * Syntax Error: child :localName
1112 * Syntax Error: child: localName
1114 * Consider "child ::name". Right now, we're here:
1116 * We don't know whether "child" is a prefix and hence the whitespace is invalid,
1117 * or whether it's an axis and hence skippable. */
1119 const int wsLength = peekForColonColon();
1120 /* We cannot call handleWhitespace() because it returns on
1121 * END_OF_FILE, and we have parsed up keyword, and we need to
1124 * If we have a colon colon, which means the whitespace is
1125 * allowed, we skip it. */
1130 /* Handle name tests. */
1131 if(peekCurrent() == ':')
1140 return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator);
1144 /* We have an axis. */
1146 return keyword ? Token(keyword->token) : id;
1151 ++m_pos; /* Consume the colon. */
1153 const Token id2(tokenizeNCName());
1155 if(id2.type != NCNAME)
1162 const int qNameLen = id.value.length() + id2.value.length() + 1;
1163 return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen));
1168 if(!keyword || isOperatorKeyword(keyword->token))
1174 const TokenType ws = consumeWhitespace();
1175 if(ws == ERROR) // TODO this should test for success. Write test.
1176 return Token(ERROR);
1184 /* Let the if-body apply for constructors, and node type tests. */
1185 if(isTypeToken(keyword->token) ||
1186 keyword->token == TYPESWITCH ||
1187 keyword->token == ORDERED ||
1188 keyword->token == UNORDERED ||
1189 keyword->token == IF)
1191 switch(peekCurrent())
1195 // TODO See if we can remove DOCUMENT from isTypeToken.
1196 if(isTypeToken(keyword->token) && keyword->token != DOCUMENT)
1198 m_tokenStack.push(Token(LPAREN));
1199 ++m_pos; /* Consume '('. */
1200 pushState(Operator);
1202 if(keyword->token == PROCESSING_INSTRUCTION)
1203 setState(KindTestForPI);
1207 return Token(keyword->token);
1209 else if(keyword->token == TYPESWITCH || keyword->token == IF)
1210 return Token(keyword->token);
1211 else /* It's a function call. */
1216 m_tokenStack.push(Token(CURLY_LBRACE));
1217 ++m_pos; /* Consume '{'. */
1218 pushState(Operator);
1219 /* Stay in state Default. */
1220 return Token(keyword->token);
1224 /* We have read in a token which is for instance
1225 * "return", and now it can be an element
1226 * test("element") a node kind test("element()"), or a
1227 * computed element constructor("element name {...").
1228 * We need to do a two-token lookahead here, because
1229 * "element return" can be an element test followed by
1230 * the return keyword, but it can also be an element
1231 * constructor("element return {"). */
1232 if(isNCNameStart(current()))
1234 const int currentPos = m_pos;
1235 const Token token2 = tokenizeNCNameOrQName();
1237 if(token2.hasError())
1242 if(peekCurrent() == '{')
1244 /* An element constructor. */
1245 m_tokenStack.push(token2);
1246 return Token(keyword->token);
1249 /* We jump back in the stream, we need to tokenize token2 according
1253 return Token(NCNAME, QLatin1String(keyword->name));
1259 if(peekCurrent() == '$')
1262 return Token(keyword->token);
1265 /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
1266 if(peekCurrent() == '(')
1268 else if(peekCurrent() == '{' && keyword->token == VALIDATE)
1269 return Token(keyword->token);
1271 if(!isNCNameStart(current()))
1277 const Token id2(tokenizeNCName());
1278 const TokenMap *const keyword2 = lookupKeyword(id2.value);
1282 /* It's a syntax error. All cases of two subsequent ncnames are keywords (e.g., declarations). */
1287 switch(keyword->token)
1291 switch(keyword2->token)
1297 m_tokenStack.push(Token(keyword2->token));
1299 return Token(keyword->token);
1303 m_tokenStack.push(Token(keyword2->token));
1305 return Token(keyword->token);
1307 case COPY_NAMESPACES:
1311 m_tokenStack.push(Token(keyword2->token));
1312 setState(NamespaceKeyword);
1313 return Token(keyword->token);
1317 // TODO identical to CONSTRUCTION?
1318 m_tokenStack.push(Token(keyword2->token));
1320 return Token(keyword->token);
1326 m_tokenStack.push(Token(keyword2->token));
1327 setState(NamespaceDecl);
1328 return Token(keyword->token);
1330 case BOUNDARY_SPACE:
1332 m_tokenStack.push(Token(keyword2->token));
1333 setState(XMLSpaceDecl);
1334 return Token(keyword->token);
1338 m_tokenStack.push(Token(keyword2->token));
1340 const TokenType ws2 = consumeWhitespace();
1343 m_tokenStack.prepend(Token(ws2));
1344 return Token(keyword->token);
1347 const Token id3(tokenizeNCName());
1349 if(id3.type != NCNAME)
1351 m_tokenStack.prepend(id3);
1352 return Token(keyword->token);
1355 const TokenMap *const keyword3 = lookupKeyword(id3.value);
1358 m_tokenStack.prepend(id3);
1359 return Token(keyword->token);
1363 m_tokenStack.prepend(Token(keyword3->token));
1365 if(keyword3->token == ORDER)
1368 setState(NamespaceDecl);
1371 return Token(keyword->token);
1375 m_tokenStack.push(Token(keyword2->token));
1383 m_tokenStack.push(Token(keyword2->token));
1385 if(keyword2->token == VERSION)
1387 setState(NamespaceDecl);
1388 return Token(keyword->token);
1398 m_tokenStack.push(Token(keyword2->token));
1400 switch(keyword2->token)
1406 setState(NamespaceKeyword);
1407 return Token(keyword->token);
1418 m_tokenStack.push(Token(keyword2->token));
1420 switch(keyword2->token)
1425 pushState(Operator);
1426 return Token(keyword->token);
1437 m_tokenStack.push(Token(keyword2->token));
1448 if(peekCurrent() == '$')
1449 return tokenAndAdvance(DOLLAR);
1452 return tokenizeNCNameOrQName();
1457 switch(peekCurrent())
1460 return tokenAndChangeState(LPAREN, KindTest);
1462 return tokenAndChangeState(DOLLAR, VarName);
1465 const Token name(tokenizeNCNameOrQName());
1470 else if(name.type == QNAME)
1472 setState(OccurrenceIndicator);
1477 const TokenMap *const keyword = lookupKeyword(name.value);
1481 pushState(OccurrenceIndicator);
1482 return Token(keyword->token);
1494 switch(peekCurrent())
1499 return tokenAndAdvance(RPAREN);
1502 return tokenAndAdvance(LPAREN);
1504 return tokenAndAdvance(COMMA);
1506 return tokenAndAdvance(STAR);
1508 return tokenAndAdvance(QUESTION);
1512 return tokenizeStringLiteral();
1515 const Token nc(tokenizeNCNameOrQName());
1519 const TokenType ws = consumeWhitespace();
1523 if(peekCurrent() == '(')
1525 const TokenMap *const keyword = lookupKeyword(nc.value);
1528 pushState(KindTest);
1529 return Token(keyword->token);
1540 switch(peekCurrent())
1545 return tokenAndAdvance(RPAREN);
1550 return tokenizeStringLiteral();
1552 return tokenizeNCName();
1556 case OccurrenceIndicator:
1558 switch(peekCurrent())
1561 return tokenAndChangeState(QUESTION, Operator);
1563 return tokenAndChangeState(STAR, Operator);
1565 return tokenAndChangeState(PLUS, Operator);
1576 switch(peekCurrent())
1581 return tokenizeStringLiteral();
1583 return tokenAndChangeState(SEMI_COLON, Default);
1586 const Token id(tokenizeNCName());
1588 if(id.type != NCNAME)
1591 const TokenMap *const keyword = lookupKeyword(id.value);
1593 return tokenAndChangeState(keyword->token, Default);
1600 if(peekAhead(-1) == '<')
1602 if(current().isSpace())
1603 return Token(ERROR);
1607 if(consumeRawWhitespace())
1608 return Token(END_OF_FILE);
1611 switch(peekCurrent())
1615 if(peekAhead() == '>')
1620 return Token(POSITION_SET);
1624 return Token(QUICK_TAG_END);
1633 return tokenAndChangeState(POSITION_SET, StartTag);
1635 return tokenAndChangeState(G_GT, ElementContent);
1638 return tokenAndAdvance(G_EQ);
1640 return tokenAndChangeState(APOS, AposAttributeContent);
1642 return tokenAndChangeState(QUOTE, QuotAttributeContent);
1644 return tokenizeNCNameOrQName();
1648 case AposAttributeContent:
1650 case QuotAttributeContent:
1652 const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
1659 return attributeAsRaw(sep, stack, m_pos, true, result);
1662 Q_ASSERT(!m_scanOnly);
1667 /* In the case that the XSL-T tokenizer invokes us with
1668 * default state QuotAttributeContent, we need to be able
1669 * to return a single string, in case that is all we have
1671 if(result.isEmpty())
1672 return Token(END_OF_FILE);
1674 return Token(STRING_LITERAL, result);
1677 const QChar curr(current());
1681 if(m_pos + 1 == m_length)
1682 return Token(END_OF_FILE);
1684 if(m_data.at(m_pos + 1) == sep)
1686 /* The quoting mechanism was used. */
1692 const QChar next(m_data.at(m_pos + 1));
1693 if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
1694 return Token(ERROR); // i18n Space must separate attributes
1695 else if(result.isEmpty())
1697 return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE,
1702 /* Don't consume the sep, but leave it so we next time return a token for it. */
1703 return Token(STRING_LITERAL, result);
1709 else if(curr == QLatin1Char('{'))
1711 if(m_pos + 1 == m_length)
1712 return Token(END_OF_FILE);
1713 else if(peekAhead() == '{')
1716 result.append(QLatin1Char('{'));
1720 if(result.isEmpty())
1722 /* The Attribute Value Template appeared directly in the attribute. */
1724 return tokenAndChangeState(CURLY_LBRACE, Default);
1728 /* We don't advance, keep '{' as next token. */
1729 return Token(STRING_LITERAL, result);
1733 else if(curr == QLatin1Char('}'))
1735 if(m_pos + 1 == m_length)
1736 return Token(END_OF_FILE);
1737 else if(peekAhead() == '}')
1740 result.append(QLatin1Char('}'));
1743 return Token(ERROR);
1745 else if(curr == QLatin1Char('&'))
1747 const QString ret(tokenizeCharacterReference());
1749 return Token(ERROR);
1753 else if(curr == QLatin1Char('<'))
1754 return Token(STRING_LITERAL, result);
1757 /* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
1758 * 3.3.3 Attribute-Value Normalization.
1760 * However, it is complicated a bit by that AVN is defined on top of
1761 * EOL normalization and we do those two in one go here. */
1762 switch(curr.unicode())
1766 if(peekAhead() == '\n')
1768 result.append(QLatin1Char(' '));
1777 result.append(QLatin1Char(' '));
1781 result.append(curr);
1789 case ElementContent:
1794 /* Whether the text node, result, may be whitespace only. Character references
1795 * and CDATA sections disable that. */
1796 bool mayBeWS = true;
1798 CharacterSkips skipEOLNormalization;
1803 return Token(END_OF_FILE);
1805 switch(peekCurrent())
1809 if(!result.isEmpty() && peekAhead(2) != '[')
1811 /* We encountered the end, and it was not a CDATA section. */
1812 /* We don't advance. Next time we'll handle the <... stuff. */
1813 return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1818 return Token(END_OF_FILE);
1820 const QChar ahead(current());
1823 else if(ahead == QLatin1Char('/'))
1825 if(m_pos + 1 == m_length)
1826 return Token(END_OF_FILE);
1827 else if(m_data.at(m_pos + 1).isSpace())
1830 return tokenAndChangeState(BEGIN_END_TAG, EndTag);
1832 else if(isNCNameStart(ahead))
1835 return tokenAndChangeState(G_LT, StartTag, 0);
1837 else if(aheadEquals("!--", 3, 0))
1841 return tokenAndChangeState(COMMENT_START, XMLComment, 0);
1843 else if(aheadEquals("![CDATA[", 8, 0))
1847 const int start = m_pos;
1848 const int len = scanUntil("]]>");
1851 return Token(END_OF_FILE);
1853 m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
1854 result.append(m_data.mid(start, len));
1857 else if(ahead == QLatin1Char('?'))
1860 return tokenAndChangeState(PI_START, ProcessingInstructionName);
1867 const QString ret(tokenizeCharacterReference());
1869 return Token(ERROR);
1872 skipEOLNormalization.insert(result.count());
1880 // TODO remove this check, also below.
1881 if(m_pos + 1 == m_length)
1882 return Token(END_OF_FILE);
1883 else if(peekAhead() == '{')
1886 result.append(QLatin1Char('{'));
1890 if(result.isEmpty())
1893 return tokenAndChangeState(CURLY_LBRACE, Default);
1897 /* We don't advance here. */
1898 return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1905 if(m_pos + 1 == m_length)
1906 return Token(END_OF_FILE);
1907 else if(peekAhead() == '}')
1910 result.append(QLatin1Char('}'));
1914 /* This is a parse error, and the grammar won't be able
1915 * to reduce this CURLY_RBRACE. */
1916 return tokenAndChangeState(CURLY_RBRACE, Default);
1922 /* We want to translate \r\n into \n. */
1923 if(peekAhead(-1) == '\r')
1925 /* else, fallthrough. */
1929 result.append(QLatin1Char('\n'));
1934 result.append(current());
1942 case ProcessingInstructionName:
1944 const int start = m_pos;
1949 if(m_pos >= m_length)
1950 return Token(END_OF_FILE);
1952 const QChar next(current());
1953 if(next.isSpace() || next == QLatin1Char('?'))
1955 return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start),
1956 ProcessingInstructionContent);
1961 case ProcessingInstructionContent:
1963 /* Consume whitespace between the name and the content. */
1964 if(consumeRawWhitespace())
1965 return Token(END_OF_FILE);
1967 const int start = m_pos;
1968 const int len = scanUntil("?>");
1971 return Token(END_OF_FILE);
1974 m_pos += 2; /* Consume "?>" */
1976 return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
1982 if(consumeRawWhitespace())
1985 if(peekCurrent() == '>')
1988 return tokenAndAdvance(G_GT);
1991 return tokenizeNCNameOrQName();
1996 const int start = m_pos;
1997 const int len = scanUntil("--");
2003 m_pos += 2; /* Consume "--". */
2006 if(peekCurrent() == '>')
2009 return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
2018 /* Consume whitespace. */
2019 if(consumeRawWhitespace())
2020 return Token(END_OF_FILE);
2022 setState(PragmaContent);
2023 return tokenizeNCNameOrQName();
2030 const bool hasWS = m_pos < m_length && current().isSpace();
2032 /* Consume all whitespace up to the pragma content(if any). */
2033 if(consumeRawWhitespace())
2034 return Token(END_OF_FILE);
2036 if(peekCurrent() == '#' && peekAhead() == ')')
2038 /* We reached the end, and there's no pragma content. */
2039 return tokenAndChangeState(PRAGMA_END, Default, 2);
2043 /* A separating space is required if there's pragma content. */
2044 return error(); /* i18n */
2047 const int start = m_pos;
2048 const int len = scanUntil("#)");
2050 return Token(END_OF_FILE);
2052 return Token(STRING_LITERAL, m_data.mid(start, len));
/* Collects the content of an attribute value into result as raw text; it
 * is called from the attribute content states of nextToken().
 *
 * sep is the quote character delimiting the attribute; the other quote
 * character is tracked as otherSep. inLiteral starts out as aInLiteral and
 * tells whether the scan is currently in literal text. The visible branches
 * handle: the separator (a doubled separator is the quoting mechanism; a
 * separator found at startPos yields a QUOTE or APOS token, otherwise the
 * text collected so far is returned as STRING_LITERAL), character
 * references (ERROR on failure), the other quote character, '{' (which
 * leads to a recursive call with aInLiteral set to false) and '}' (a
 * doubled '}' is only considered while inLiteral is true).
 *
 * The Token(SUCCESS) return is, per its own comment, an arbitrary value;
 * the recursive call site treats any other type as a result to pass on.
 *
 * NOTE(review): some parameters, the enclosing loop and several branches
 * are not visible in this excerpt. */
2061 Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep,
2064 const bool aInLiteral,
2067 bool inLiteral = aInLiteral;
2068 const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');
2075 if(peekCurrent() == sep.unicode())
2082 if(peekAhead() == sep.unicode())
2084 /* The quoting mechanism was used. */
2085 result.append(current());
2091 /* Don't consume the separator, such that we
2092 * return a token for it next time. */
2093 if(m_pos == startPos)
2097 return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
2103 return Token(STRING_LITERAL, result);
2107 result.append(current());
2113 else if(peekCurrent() == '&')
2115 const QString ret(tokenizeCharacterReference());
2117 return Token(ERROR);
2125 else if(peekCurrent() == otherSep)
2127 result.append(current());
2130 if(peekCurrent() == otherSep)
2140 else if(peekCurrent() == '{')
2142 result.append(current());
2144 if(peekAhead() == '{')
2153 const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
2154 if(t.type != SUCCESS)
2159 else if(peekCurrent() == '}')
2161 if(inLiteral && peekAhead() == '}')
2163 result.append(current());
2171 return Token(SUCCESS); /* The return value is arbitrary. */
2176 result.append(current());
/* Entry point that also reports the source location of the token.
 *
 * Before a token is produced, sourceLocator receives the current line
 * (m_line) and a 1-based column computed from m_pos and m_columnOffset.
 * If m_tokenStack holds queued tokens, the next one is popped from it and
 * the tokenizer state is adjusted for certain token types (COPY_NAMESPACES
 * is the one visible here; operator keywords are checked with
 * isOperatorKeyword()).
 *
 * NOTE(review): the branch taken when m_tokenStack is empty is not visible
 * in this excerpt; presumably it delegates to nextToken() -- confirm. */
2182 Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator)
2184 sourceLocator->first_line = m_line;
2185 sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */
2187 if(m_tokenStack.isEmpty())
2191 const Token retval(m_tokenStack.pop());
2199 case COPY_NAMESPACES:
2201 setState(NamespaceKeyword);
2206 setState(XQueryVersion);
2218 if(isOperatorKeyword(retval.type))
/* NOTE(review): the body is not visible in this excerpt. Going by the name
 * and the m_scanOnly flag asserted on in nextToken(), this presumably turns
 * on scan-only mode and returns a position that can later be passed to
 * resumeTokenizationFrom() -- confirm against the full source. */
2229 int XQueryTokenizer::commenceScanOnly()
/* NOTE(review): the body is not visible in this excerpt. Going by the name,
 * this presumably leaves scan-only mode and continues tokenizing at pos --
 * confirm against the full source. */
2235 void XQueryTokenizer::resumeTokenizationFrom(const int pos)
/* Accepts the parser context. The parameter is unnamed, so this
 * implementation cannot refer to the argument; the body is not visible in
 * this excerpt. */
2241 void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
2245 #undef handleWhitespace
2247 } // namespace QPatternist