Source/WebCore/xml/XPathParser.cpp

   1 /*
   2  * Copyright 2005 Maksim Orlovich <maksim@kde.org>
   3  * Copyright (C) 2006 Apple Computer, Inc.
   4  * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org>
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26  */
  27
  28 #include "config.h"
  29 #include "XPathParser.h"
  30
  31 #include "ExceptionCode.h"
  32 #include "XPathEvaluator.h"
  33 #include "XPathException.h"
  34 #include "XPathNSResolver.h"
  35 #include "XPathStep.h"
  36 #include <wtf/StdLibExtras.h>
  37 #include <wtf/text/StringHash.h>
  38
  39 int xpathyyparse(void*);
  40
  41 using namespace WTF;
  42 using namespace Unicode;
  43
  44 namespace WebCore {
  45 namespace XPath {
  46
  47 class LocationPath;
  48
  49 #include "XPathGrammar.h"
  50
  51 Parser* Parser::currentParser = 0;
  52
  53 enum XMLCat { NameStart, NameCont, NotPartOfName };
  54
  55 typedef HashMap<String, Step::Axis> AxisNamesMap;
  56
  57 static XMLCat charCat(UChar aChar)
  58 {
  59     //### might need to add some special cases from the XML spec.
  60
  61     if (aChar == '_')
  62         return NameStart;
  63
  64     if (aChar == '.' || aChar == '-')
  65         return NameCont;
  66     CharCategory category = Unicode::category(aChar);
  67     if (category & (Letter_Uppercase | Letter_Lowercase | Letter_Other | Letter_Titlecase | Number_Letter))
  68         return NameStart;
  69     if (category & (Mark_NonSpacing | Mark_SpacingCombining | Mark_Enclosing | Letter_Modifier | Number_DecimalDigit))
  70         return NameCont;
  71     return NotPartOfName;
  72 }
  73
  74 static void setUpAxisNamesMap(AxisNamesMap& axisNames)
  75 {
  76     struct AxisName {
  77         const char* name;
  78         Step::Axis axis;
  79     };
  80     const AxisName axisNameList[] = {
  81         { "ancestor", Step::AncestorAxis },
  82         { "ancestor-or-self", Step::AncestorOrSelfAxis },
  83         { "attribute", Step::AttributeAxis },
  84         { "child", Step::ChildAxis },
  85         { "descendant", Step::DescendantAxis },
  86         { "descendant-or-self", Step::DescendantOrSelfAxis },
  87         { "following", Step::FollowingAxis },
  88         { "following-sibling", Step::FollowingSiblingAxis },
  89         { "namespace", Step::NamespaceAxis },
  90         { "parent", Step::ParentAxis },
  91         { "preceding", Step::PrecedingAxis },
  92         { "preceding-sibling", Step::PrecedingSiblingAxis },
  93         { "self", Step::SelfAxis }
  94     };
  95     for (unsigned i = 0; i < sizeof(axisNameList) / sizeof(axisNameList[0]); ++i)
  96         axisNames.set(axisNameList[i].name, axisNameList[i].axis);
  97 }
  98
  99 static bool isAxisName(const String& name, Step::Axis& type)
 100 {
 101     DEFINE_STATIC_LOCAL(AxisNamesMap, axisNames, ());
 102
 103     if (axisNames.isEmpty())
 104         setUpAxisNamesMap(axisNames);
 105
 106     AxisNamesMap::iterator it = axisNames.find(name);
 107     if (it == axisNames.end())
 108         return false;
 109     type = it->second;
 110     return true;
 111 }
 112
 113 static bool isNodeTypeName(const String& name)
 114 {
 115     DEFINE_STATIC_LOCAL(HashSet<String>, nodeTypeNames, ());
 116     if (nodeTypeNames.isEmpty()) {
 117         nodeTypeNames.add("comment");
 118         nodeTypeNames.add("text");
 119         nodeTypeNames.add("processing-instruction");
 120         nodeTypeNames.add("node");
 121     }
 122     return nodeTypeNames.contains(name);
 123 }
 124
 125 // Returns whether the current token can possibly be a binary operator, given
 126 // the previous token. Necessary to disambiguate some of the operators
 127 // (* (multiply), div, and, or, mod) in the [32] Operator rule
 128 // (check http://www.w3.org/TR/xpath#exprlex).
 129 bool Parser::isBinaryOperatorContext() const
 130 {
 131     switch (m_lastTokenType) {
 132     case 0:
 133     case '@': case AXISNAME: case '(': case '[': case ',':
 134     case AND: case OR: case MULOP:
 135     case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
 136     case EQOP: case RELOP:
 137         return false;
 138     default:
 139         return true;
 140     }
 141 }
 142
 143 void Parser::skipWS()
 144 {
 145     while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos]))
 146         ++m_nextPos;
 147 }
 148
 149 Token Parser::makeTokenAndAdvance(int code, int advance)
 150 {
 151     m_nextPos += advance;
 152     return Token(code);
 153 }
 154
 155 Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance)
 156 {
 157     m_nextPos += advance;
 158     return Token(code, val);
 159 }
 160
 161 Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance)
 162 {
 163     m_nextPos += advance;
 164     return Token(code, val);
 165 }
 166
 167 // Returns next char if it's there and interesting, 0 otherwise
 168 char Parser::peekAheadHelper()
 169 {
 170     if (m_nextPos + 1 >= m_data.length())
 171         return 0;
 172     UChar next = m_data[m_nextPos + 1];
 173     if (next >= 0xff)
 174         return 0;
 175     return next;
 176 }
 177
 178 char Parser::peekCurHelper()
 179 {
 180     if (m_nextPos >= m_data.length())
 181         return 0;
 182     UChar next = m_data[m_nextPos];
 183     if (next >= 0xff)
 184         return 0;
 185     return next;
 186 }
 187
 188 Token Parser::lexString()
 189 {
 190     UChar delimiter = m_data[m_nextPos];
 191     int startPos = m_nextPos + 1;
 192
 193     for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
 194         if (m_data[m_nextPos] == delimiter) {
 195             String value = m_data.substring(startPos, m_nextPos - startPos);
 196             if (value.isNull())
 197                 value = "";
 198             ++m_nextPos; // Consume the char.
 199             return Token(LITERAL, value);
 200         }
 201     }
 202
 203     // Ouch, went off the end -- report error.
 204     return Token(XPATH_ERROR);
 205 }
 206
 207 Token Parser::lexNumber()
 208 {
 209     int startPos = m_nextPos;
 210     bool seenDot = false;
 211
 212     // Go until end or a non-digits character.
 213     for (; m_nextPos < m_data.length(); ++m_nextPos) {
 214         UChar aChar = m_data[m_nextPos];
 215         if (aChar >= 0xff) break;
 216
 217         if (aChar < '0' || aChar > '9') {
 218             if (aChar == '.' && !seenDot)
 219                 seenDot = true;
 220             else
 221                 break;
 222         }
 223     }
 224
 225     return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos));
 226 }
 227
 228 bool Parser::lexNCName(String& name)
 229 {
 230     int startPos = m_nextPos;
 231     if (m_nextPos >= m_data.length())
 232         return false;
 233
 234     if (charCat(m_data[m_nextPos]) != NameStart)
 235         return false;
 236
 237     // Keep going until we get a character that's not good for names.
 238     for (; m_nextPos < m_data.length(); ++m_nextPos)
 239         if (charCat(m_data[m_nextPos]) == NotPartOfName)
 240             break;
 241
 242     name = m_data.substring(startPos, m_nextPos - startPos);
 243     return true;
 244 }
 245
 246 bool Parser::lexQName(String& name)
 247 {
 248     String n1;
 249     if (!lexNCName(n1))
 250         return false;
 251
 252     skipWS();
 253
 254     // If the next character is :, what we just got it the prefix, if not,
 255     // it's the whole thing.
 256     if (peekAheadHelper() != ':') {
 257         name = n1;
 258         return true;
 259     }
 260
 261     String n2;
 262     if (!lexNCName(n2))
 263         return false;
 264
 265     name = n1 + ":" + n2;
 266     return true;
 267 }
 268
 269 Token Parser::nextTokenInternal()
 270 {
 271     skipWS();
 272
 273     if (m_nextPos >= m_data.length())
 274         return Token(0);
 275
 276     char code = peekCurHelper();
 277     switch (code) {
 278     case '(': case ')': case '[': case ']':
 279     case '@': case ',': case '|':
 280         return makeTokenAndAdvance(code);
 281     case '\'':
 282     case '\"':
 283         return lexString();
 284     case '0': case '1': case '2': case '3': case '4':
 285     case '5': case '6': case '7': case '8': case '9':
 286         return lexNumber();
 287     case '.': {
 288         char next = peekAheadHelper();
 289         if (next == '.')
 290             return makeTokenAndAdvance(DOTDOT, 2);
 291         if (next >= '0' && next <= '9')
 292             return lexNumber();
 293         return makeTokenAndAdvance('.');
 294     }
 295     case '/':
 296         if (peekAheadHelper() == '/')
 297             return makeTokenAndAdvance(SLASHSLASH, 2);
 298         return makeTokenAndAdvance('/');
 299     case '+':
 300         return makeTokenAndAdvance(PLUS);
 301     case '-':
 302         return makeTokenAndAdvance(MINUS);
 303     case '=':
 304         return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ);
 305     case '!':
 306         if (peekAheadHelper() == '=')
 307             return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2);
 308         return Token(XPATH_ERROR);
 309     case '<':
 310         if (peekAheadHelper() == '=')
 311             return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2);
 312         return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT);
 313     case '>':
 314         if (peekAheadHelper() == '=')
 315             return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2);
 316         return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT);
 317     case '*':
 318         if (isBinaryOperatorContext())
 319             return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul);
 320         ++m_nextPos;
 321         return Token(NAMETEST, "*");
 322     case '$': { // $ QName
 323         m_nextPos++;
 324         String name;
 325         if (!lexQName(name))
 326             return Token(XPATH_ERROR);
 327         return Token(VARIABLEREFERENCE, name);
 328     }
 329     }
 330
 331     String name;
 332     if (!lexNCName(name))
 333         return Token(XPATH_ERROR);
 334
 335     skipWS();
 336     // If we're in an operator context, check for any operator names
 337     if (isBinaryOperatorContext()) {
 338         if (name == "and") //### hash?
 339             return Token(AND);
 340         if (name == "or")
 341             return Token(OR);
 342         if (name == "mod")
 343             return Token(MULOP, NumericOp::OP_Mod);
 344         if (name == "div")
 345             return Token(MULOP, NumericOp::OP_Div);
 346     }
 347
 348     // See whether we are at a :
 349     if (peekCurHelper() == ':') {
 350         m_nextPos++;
 351         // Any chance it's an axis name?
 352         if (peekCurHelper() == ':') {
 353             m_nextPos++;
 354
 355             //It might be an axis name.
 356             Step::Axis axis;
 357             if (isAxisName(name, axis))
 358                 return Token(AXISNAME, axis);
 359             // Ugh, :: is only valid in axis names -> error
 360             return Token(XPATH_ERROR);
 361         }
 362
 363         // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
 364         skipWS();
 365         if (peekCurHelper() == '*') {
 366             m_nextPos++;
 367             return Token(NAMETEST, name + ":*");
 368         }
 369
 370         // Make a full qname.
 371         String n2;
 372         if (!lexNCName(n2))
 373             return Token(XPATH_ERROR);
 374
 375         name = name + ":" + n2;
 376     }
 377
 378     skipWS();
 379     if (peekCurHelper() == '(') {
 380         //note: we don't swallow the (here!
 381
 382         //either node type of function name
 383         if (isNodeTypeName(name)) {
 384             if (name == "processing-instruction")
 385                 return Token(PI, name);
 386
 387             return Token(NODETYPE, name);
 388         }
 389         //must be a function name.
 390         return Token(FUNCTIONNAME, name);
 391     }
 392
 393     // At this point, it must be NAMETEST.
 394     return Token(NAMETEST, name);
 395 }
 396
 397 Token Parser::nextToken()
 398 {
 399     Token toRet = nextTokenInternal();
 400     m_lastTokenType = toRet.type;
 401     return toRet;
 402 }
 403
 404 Parser::Parser()
 405 {
 406     reset(String());
 407 }
 408
 409 Parser::~Parser()
 410 {
 411 }
 412
 413 void Parser::reset(const String& data)
 414 {
 415     m_nextPos = 0;
 416     m_data = data;
 417     m_lastTokenType = 0;
 418
 419     m_topExpr = 0;
 420     m_gotNamespaceError = false;
 421 }
 422
 423 int Parser::lex(void* data)
 424 {
 425     YYSTYPE* yylval = static_cast<YYSTYPE*>(data);
 426     Token tok = nextToken();
 427
 428     switch (tok.type) {
 429     case AXISNAME:
 430         yylval->axis = tok.axis;
 431         break;
 432     case MULOP:
 433         yylval->numop = tok.numop;
 434         break;
 435     case RELOP:
 436     case EQOP:
 437         yylval->eqop = tok.eqop;
 438         break;
 439     case NODETYPE:
 440     case PI:
 441     case FUNCTIONNAME:
 442     case LITERAL:
 443     case VARIABLEREFERENCE:
 444     case NUMBER:
 445     case NAMETEST:
 446         yylval->str = new String(tok.str);
 447         registerString(yylval->str);
 448         break;
 449     }
 450
 451     return tok.type;
 452 }
 453
 454 bool Parser::expandQName(const String& qName, String& localName, String& namespaceURI)
 455 {
 456     size_t colon = qName.find(':');
 457     if (colon != notFound) {
 458         if (!m_resolver)
 459             return false;
 460         namespaceURI = m_resolver->lookupNamespaceURI(qName.left(colon));
 461         if (namespaceURI.isNull())
 462             return false;
 463         localName = qName.substring(colon + 1);
 464     } else
 465         localName = qName;
 466
 467     return true;
 468 }
 469
 470 Expression* Parser::parseStatement(const String& statement, PassRefPtr<XPathNSResolver> resolver, ExceptionCode& ec)
 471 {
 472     reset(statement);
 473
 474     m_resolver = resolver;
 475
 476     Parser* oldParser = currentParser;
 477     currentParser = this;
 478     int parseError = xpathyyparse(this);
 479     currentParser = oldParser;
 480
 481     if (parseError) {
 482         deleteAllValues(m_parseNodes);
 483         m_parseNodes.clear();
 484
 485         HashSet<Vector<Predicate*>*>::iterator pend = m_predicateVectors.end();
 486         for (HashSet<Vector<Predicate*>*>::iterator it = m_predicateVectors.begin(); it != pend; ++it) {
 487             deleteAllValues(**it);
 488             delete *it;
 489         }
 490         m_predicateVectors.clear();
 491
 492         HashSet<Vector<Expression*>*>::iterator eend = m_expressionVectors.end();
 493         for (HashSet<Vector<Expression*>*>::iterator it = m_expressionVectors.begin(); it != eend; ++it) {
 494             deleteAllValues(**it);
 495             delete *it;
 496         }
 497         m_expressionVectors.clear();
 498
 499         deleteAllValues(m_strings);
 500         m_strings.clear();
 501
 502         deleteAllValues(m_nodeTests);
 503         m_nodeTests.clear();
 504
 505         m_topExpr = 0;
 506
 507         if (m_gotNamespaceError)
 508             ec = NAMESPACE_ERR;
 509         else
 510             ec = XPathException::INVALID_EXPRESSION_ERR;
 511         return 0;
 512     }
 513
 514     ASSERT(m_parseNodes.size() == 1);
 515     ASSERT(*m_parseNodes.begin() == m_topExpr);
 516     ASSERT(m_expressionVectors.size() == 0);
 517     ASSERT(m_predicateVectors.size() == 0);
 518     ASSERT(m_strings.size() == 0);
 519     ASSERT(m_nodeTests.size() == 0);
 520
 521     m_parseNodes.clear();
 522     Expression* result = m_topExpr;
 523     m_topExpr = 0;
 524
 525     return result;
 526 }
 527
 528 void Parser::registerParseNode(ParseNode* node)
 529 {
 530     if (node == 0)
 531         return;
 532
 533     ASSERT(!m_parseNodes.contains(node));
 534
 535     m_parseNodes.add(node);
 536 }
 537
 538 void Parser::unregisterParseNode(ParseNode* node)
 539 {
 540     if (node == 0)
 541         return;
 542
 543     ASSERT(m_parseNodes.contains(node));
 544
 545     m_parseNodes.remove(node);
 546 }
 547
 548 void Parser::registerPredicateVector(Vector<Predicate*>* vector)
 549 {
 550     if (vector == 0)
 551         return;
 552
 553     ASSERT(!m_predicateVectors.contains(vector));
 554
 555     m_predicateVectors.add(vector);
 556 }
 557
 558 void Parser::deletePredicateVector(Vector<Predicate*>* vector)
 559 {
 560     if (vector == 0)
 561         return;
 562
 563     ASSERT(m_predicateVectors.contains(vector));
 564
 565     m_predicateVectors.remove(vector);
 566     delete vector;
 567 }
 568
 569
 570 void Parser::registerExpressionVector(Vector<Expression*>* vector)
 571 {
 572     if (vector == 0)
 573         return;
 574
 575     ASSERT(!m_expressionVectors.contains(vector));
 576
 577     m_expressionVectors.add(vector);
 578 }
 579
 580 void Parser::deleteExpressionVector(Vector<Expression*>* vector)
 581 {
 582     if (vector == 0)
 583         return;
 584
 585     ASSERT(m_expressionVectors.contains(vector));
 586
 587     m_expressionVectors.remove(vector);
 588     delete vector;
 589 }
 590
 591 void Parser::registerString(String* s)
 592 {
 593     if (s == 0)
 594         return;
 595
 596     ASSERT(!m_strings.contains(s));
 597
 598     m_strings.add(s);
 599 }
 600
 601 void Parser::deleteString(String* s)
 602 {
 603     if (s == 0)
 604         return;
 605
 606     ASSERT(m_strings.contains(s));
 607
 608     m_strings.remove(s);
 609     delete s;
 610 }
 611
 612 void Parser::registerNodeTest(Step::NodeTest* t)
 613 {
 614     if (t == 0)
 615         return;
 616
 617     ASSERT(!m_nodeTests.contains(t));
 618
 619     m_nodeTests.add(t);
 620 }
 621
 622 void Parser::deleteNodeTest(Step::NodeTest* t)
 623 {
 624     if (t == 0)
 625         return;
 626
 627     ASSERT(m_nodeTests.contains(t));
 628
 629     m_nodeTests.remove(t);
 630     delete t;
 631 }
 632
 633 }
 634 }