src/third_party/WebKit/Source/core/xml/XPathParser.cpp

   1 /*
   2  * Copyright 2005 Maksim Orlovich <maksim@kde.org>
   3  * Copyright (C) 2006 Apple Computer, Inc.
   4  * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org>
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26  */
  27
  28 #include "config.h"
  29 #include "core/xml/XPathParser.h"
  30
  31 #include "bindings/v8/ExceptionState.h"
  32 #include "core/dom/ExceptionCode.h"
  33 #include "core/xml/XPathEvaluator.h"
  34 #include "core/xml/XPathNSResolver.h"
  35 #include "core/xml/XPathPath.h"
  36 #include "wtf/StdLibExtras.h"
  37 #include "wtf/text/StringHash.h"
  38
  39 using namespace WebCore;
  40 using namespace WTF;
  41 using namespace Unicode;
  42 using namespace XPath;
  43
  44 #include "XPathGrammar.h"
  45
  46 Parser* Parser::currentParser = 0;
  47
  48 enum XMLCat { NameStart, NameCont, NotPartOfName };
  49
  50 typedef HashMap<String, Step::Axis> AxisNamesMap;
  51
  52 static XMLCat charCat(UChar aChar)
  53 {
  54     //### might need to add some special cases from the XML spec.
  55
  56     if (aChar == '_')
  57         return NameStart;
  58
  59     if (aChar == '.' || aChar == '-')
  60         return NameCont;
  61     CharCategory category = Unicode::category(aChar);
  62     if (category & (Letter_Uppercase | Letter_Lowercase | Letter_Other | Letter_Titlecase | Number_Letter))
  63         return NameStart;
  64     if (category & (Mark_NonSpacing | Mark_SpacingCombining | Mark_Enclosing | Letter_Modifier | Number_DecimalDigit))
  65         return NameCont;
  66     return NotPartOfName;
  67 }
  68
  69 static void setUpAxisNamesMap(AxisNamesMap& axisNames)
  70 {
  71     struct AxisName {
  72         const char* name;
  73         Step::Axis axis;
  74     };
  75     const AxisName axisNameList[] = {
  76         { "ancestor", Step::AncestorAxis },
  77         { "ancestor-or-self", Step::AncestorOrSelfAxis },
  78         { "attribute", Step::AttributeAxis },
  79         { "child", Step::ChildAxis },
  80         { "descendant", Step::DescendantAxis },
  81         { "descendant-or-self", Step::DescendantOrSelfAxis },
  82         { "following", Step::FollowingAxis },
  83         { "following-sibling", Step::FollowingSiblingAxis },
  84         { "namespace", Step::NamespaceAxis },
  85         { "parent", Step::ParentAxis },
  86         { "preceding", Step::PrecedingAxis },
  87         { "preceding-sibling", Step::PrecedingSiblingAxis },
  88         { "self", Step::SelfAxis }
  89     };
  90     for (unsigned i = 0; i < sizeof(axisNameList) / sizeof(axisNameList[0]); ++i)
  91         axisNames.set(axisNameList[i].name, axisNameList[i].axis);
  92 }
  93
  94 static bool isAxisName(const String& name, Step::Axis& type)
  95 {
  96     DEFINE_STATIC_LOCAL(AxisNamesMap, axisNames, ());
  97
  98     if (axisNames.isEmpty())
  99         setUpAxisNamesMap(axisNames);
 100
 101     AxisNamesMap::iterator it = axisNames.find(name);
 102     if (it == axisNames.end())
 103         return false;
 104     type = it->value;
 105     return true;
 106 }
 107
 108 static bool isNodeTypeName(const String& name)
 109 {
 110     DEFINE_STATIC_LOCAL(HashSet<String>, nodeTypeNames, ());
 111     if (nodeTypeNames.isEmpty()) {
 112         nodeTypeNames.add("comment");
 113         nodeTypeNames.add("text");
 114         nodeTypeNames.add("processing-instruction");
 115         nodeTypeNames.add("node");
 116     }
 117     return nodeTypeNames.contains(name);
 118 }
 119
 120 // Returns whether the current token can possibly be a binary operator, given
 121 // the previous token. Necessary to disambiguate some of the operators
 122 // (* (multiply), div, and, or, mod) in the [32] Operator rule
 123 // (check http://www.w3.org/TR/xpath#exprlex).
 124 bool Parser::isBinaryOperatorContext() const
 125 {
 126     switch (m_lastTokenType) {
 127     case 0:
 128     case '@': case AXISNAME: case '(': case '[': case ',':
 129     case AND: case OR: case MULOP:
 130     case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
 131     case EQOP: case RELOP:
 132         return false;
 133     default:
 134         return true;
 135     }
 136 }
 137
 138 void Parser::skipWS()
 139 {
 140     while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos]))
 141         ++m_nextPos;
 142 }
 143
 144 Token Parser::makeTokenAndAdvance(int code, int advance)
 145 {
 146     m_nextPos += advance;
 147     return Token(code);
 148 }
 149
 150 Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance)
 151 {
 152     m_nextPos += advance;
 153     return Token(code, val);
 154 }
 155
 156 Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance)
 157 {
 158     m_nextPos += advance;
 159     return Token(code, val);
 160 }
 161
 162 // Returns next char if it's there and interesting, 0 otherwise
 163 char Parser::peekAheadHelper()
 164 {
 165     if (m_nextPos + 1 >= m_data.length())
 166         return 0;
 167     UChar next = m_data[m_nextPos + 1];
 168     if (next >= 0xff)
 169         return 0;
 170     return next;
 171 }
 172
 173 char Parser::peekCurHelper()
 174 {
 175     if (m_nextPos >= m_data.length())
 176         return 0;
 177     UChar next = m_data[m_nextPos];
 178     if (next >= 0xff)
 179         return 0;
 180     return next;
 181 }
 182
 183 Token Parser::lexString()
 184 {
 185     UChar delimiter = m_data[m_nextPos];
 186     int startPos = m_nextPos + 1;
 187
 188     for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
 189         if (m_data[m_nextPos] == delimiter) {
 190             String value = m_data.substring(startPos, m_nextPos - startPos);
 191             if (value.isNull())
 192                 value = "";
 193             ++m_nextPos; // Consume the char.
 194             return Token(LITERAL, value);
 195         }
 196     }
 197
 198     // Ouch, went off the end -- report error.
 199     return Token(XPATH_ERROR);
 200 }
 201
 202 Token Parser::lexNumber()
 203 {
 204     int startPos = m_nextPos;
 205     bool seenDot = false;
 206
 207     // Go until end or a non-digits character.
 208     for (; m_nextPos < m_data.length(); ++m_nextPos) {
 209         UChar aChar = m_data[m_nextPos];
 210         if (aChar >= 0xff) break;
 211
 212         if (aChar < '0' || aChar > '9') {
 213             if (aChar == '.' && !seenDot)
 214                 seenDot = true;
 215             else
 216                 break;
 217         }
 218     }
 219
 220     return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos));
 221 }
 222
 223 bool Parser::lexNCName(String& name)
 224 {
 225     int startPos = m_nextPos;
 226     if (m_nextPos >= m_data.length())
 227         return false;
 228
 229     if (charCat(m_data[m_nextPos]) != NameStart)
 230         return false;
 231
 232     // Keep going until we get a character that's not good for names.
 233     for (; m_nextPos < m_data.length(); ++m_nextPos)
 234         if (charCat(m_data[m_nextPos]) == NotPartOfName)
 235             break;
 236
 237     name = m_data.substring(startPos, m_nextPos - startPos);
 238     return true;
 239 }
 240
 241 bool Parser::lexQName(String& name)
 242 {
 243     String n1;
 244     if (!lexNCName(n1))
 245         return false;
 246
 247     skipWS();
 248
 249     // If the next character is :, what we just got it the prefix, if not,
 250     // it's the whole thing.
 251     if (peekAheadHelper() != ':') {
 252         name = n1;
 253         return true;
 254     }
 255
 256     String n2;
 257     if (!lexNCName(n2))
 258         return false;
 259
 260     name = n1 + ":" + n2;
 261     return true;
 262 }
 263
 264 Token Parser::nextTokenInternal()
 265 {
 266     skipWS();
 267
 268     if (m_nextPos >= m_data.length())
 269         return Token(0);
 270
 271     char code = peekCurHelper();
 272     switch (code) {
 273     case '(': case ')': case '[': case ']':
 274     case '@': case ',': case '|':
 275         return makeTokenAndAdvance(code);
 276     case '\'':
 277     case '\"':
 278         return lexString();
 279     case '0': case '1': case '2': case '3': case '4':
 280     case '5': case '6': case '7': case '8': case '9':
 281         return lexNumber();
 282     case '.': {
 283         char next = peekAheadHelper();
 284         if (next == '.')
 285             return makeTokenAndAdvance(DOTDOT, 2);
 286         if (next >= '0' && next <= '9')
 287             return lexNumber();
 288         return makeTokenAndAdvance('.');
 289     }
 290     case '/':
 291         if (peekAheadHelper() == '/')
 292             return makeTokenAndAdvance(SLASHSLASH, 2);
 293         return makeTokenAndAdvance('/');
 294     case '+':
 295         return makeTokenAndAdvance(PLUS);
 296     case '-':
 297         return makeTokenAndAdvance(MINUS);
 298     case '=':
 299         return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ);
 300     case '!':
 301         if (peekAheadHelper() == '=')
 302             return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2);
 303         return Token(XPATH_ERROR);
 304     case '<':
 305         if (peekAheadHelper() == '=')
 306             return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2);
 307         return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT);
 308     case '>':
 309         if (peekAheadHelper() == '=')
 310             return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2);
 311         return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT);
 312     case '*':
 313         if (isBinaryOperatorContext())
 314             return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul);
 315         ++m_nextPos;
 316         return Token(NAMETEST, "*");
 317     case '$': { // $ QName
 318         m_nextPos++;
 319         String name;
 320         if (!lexQName(name))
 321             return Token(XPATH_ERROR);
 322         return Token(VARIABLEREFERENCE, name);
 323     }
 324     }
 325
 326     String name;
 327     if (!lexNCName(name))
 328         return Token(XPATH_ERROR);
 329
 330     skipWS();
 331     // If we're in an operator context, check for any operator names
 332     if (isBinaryOperatorContext()) {
 333         if (name == "and") //### hash?
 334             return Token(AND);
 335         if (name == "or")
 336             return Token(OR);
 337         if (name == "mod")
 338             return Token(MULOP, NumericOp::OP_Mod);
 339         if (name == "div")
 340             return Token(MULOP, NumericOp::OP_Div);
 341     }
 342
 343     // See whether we are at a :
 344     if (peekCurHelper() == ':') {
 345         m_nextPos++;
 346         // Any chance it's an axis name?
 347         if (peekCurHelper() == ':') {
 348             m_nextPos++;
 349
 350             //It might be an axis name.
 351             Step::Axis axis;
 352             if (isAxisName(name, axis))
 353                 return Token(AXISNAME, axis);
 354             // Ugh, :: is only valid in axis names -> error
 355             return Token(XPATH_ERROR);
 356         }
 357
 358         // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
 359         skipWS();
 360         if (peekCurHelper() == '*') {
 361             m_nextPos++;
 362             return Token(NAMETEST, name + ":*");
 363         }
 364
 365         // Make a full qname.
 366         String n2;
 367         if (!lexNCName(n2))
 368             return Token(XPATH_ERROR);
 369
 370         name = name + ":" + n2;
 371     }
 372
 373     skipWS();
 374     if (peekCurHelper() == '(') {
 375         //note: we don't swallow the (here!
 376
 377         //either node type of function name
 378         if (isNodeTypeName(name)) {
 379             if (name == "processing-instruction")
 380                 return Token(PI, name);
 381
 382             return Token(NODETYPE, name);
 383         }
 384         //must be a function name.
 385         return Token(FUNCTIONNAME, name);
 386     }
 387
 388     // At this point, it must be NAMETEST.
 389     return Token(NAMETEST, name);
 390 }
 391
 392 Token Parser::nextToken()
 393 {
 394     Token toRet = nextTokenInternal();
 395     m_lastTokenType = toRet.type;
 396     return toRet;
 397 }
 398
 399 Parser::Parser()
 400 {
 401     reset(String());
 402 }
 403
 404 Parser::~Parser()
 405 {
 406 }
 407
 408 void Parser::reset(const String& data)
 409 {
 410     m_nextPos = 0;
 411     m_data = data;
 412     m_lastTokenType = 0;
 413
 414     m_topExpr = 0;
 415     m_gotNamespaceError = false;
 416 }
 417
 418 int Parser::lex(void* data)
 419 {
 420     YYSTYPE* yylval = static_cast<YYSTYPE*>(data);
 421     Token tok = nextToken();
 422
 423     switch (tok.type) {
 424     case AXISNAME:
 425         yylval->axis = tok.axis;
 426         break;
 427     case MULOP:
 428         yylval->numop = tok.numop;
 429         break;
 430     case RELOP:
 431     case EQOP:
 432         yylval->eqop = tok.eqop;
 433         break;
 434     case NODETYPE:
 435     case PI:
 436     case FUNCTIONNAME:
 437     case LITERAL:
 438     case VARIABLEREFERENCE:
 439     case NUMBER:
 440     case NAMETEST:
 441         yylval->str = new String(tok.str);
 442         registerString(yylval->str);
 443         break;
 444     }
 445
 446     return tok.type;
 447 }
 448
 449 bool Parser::expandQName(const String& qName, AtomicString& localName, AtomicString& namespaceURI)
 450 {
 451     size_t colon = qName.find(':');
 452     if (colon != kNotFound) {
 453         if (!m_resolver)
 454             return false;
 455         namespaceURI = m_resolver->lookupNamespaceURI(qName.left(colon));
 456         if (namespaceURI.isNull())
 457             return false;
 458         localName = AtomicString(qName.substring(colon + 1));
 459     } else {
 460         localName = AtomicString(qName);
 461     }
 462
 463     return true;
 464 }
 465
 466 Expression* Parser::parseStatement(const String& statement, PassRefPtr<XPathNSResolver> resolver, ExceptionState& exceptionState)
 467 {
 468     reset(statement);
 469
 470     m_resolver = resolver;
 471
 472     Parser* oldParser = currentParser;
 473     currentParser = this;
 474     int parseError = xpathyyparse(this);
 475     currentParser = oldParser;
 476
 477     if (parseError) {
 478         deleteAllValues(m_parseNodes);
 479         m_parseNodes.clear();
 480
 481         HashSet<Vector<OwnPtr<Predicate> >*>::iterator pend = m_predicateVectors.end();
 482         for (HashSet<Vector<OwnPtr<Predicate> >*>::iterator it = m_predicateVectors.begin(); it != pend; ++it)
 483             delete *it;
 484         m_predicateVectors.clear();
 485
 486         HashSet<Vector<OwnPtr<Expression> >*>::iterator eend = m_expressionVectors.end();
 487         for (HashSet<Vector<OwnPtr<Expression> >*>::iterator it = m_expressionVectors.begin(); it != eend; ++it)
 488             delete *it;
 489         m_expressionVectors.clear();
 490
 491         deleteAllValues(m_strings);
 492         m_strings.clear();
 493
 494         deleteAllValues(m_nodeTests);
 495         m_nodeTests.clear();
 496
 497         m_topExpr = 0;
 498
 499         if (m_gotNamespaceError)
 500             exceptionState.throwDOMException(NamespaceError, "The string '" + statement + "' contains unresolvable namespaces.");
 501         else
 502             exceptionState.throwDOMException(SyntaxError, "The string '" + statement + "' is not a valid XPath expression.");
 503         return 0;
 504     }
 505
 506     ASSERT(m_parseNodes.size() == 1);
 507     ASSERT(*m_parseNodes.begin() == m_topExpr);
 508     ASSERT(m_expressionVectors.size() == 0);
 509     ASSERT(m_predicateVectors.size() == 0);
 510     ASSERT(m_strings.size() == 0);
 511     ASSERT(m_nodeTests.size() == 0);
 512
 513     m_parseNodes.clear();
 514     Expression* result = m_topExpr;
 515     m_topExpr = 0;
 516
 517     return result;
 518 }
 519
 520 void Parser::registerParseNode(ParseNode* node)
 521 {
 522     if (node == 0)
 523         return;
 524
 525     ASSERT(!m_parseNodes.contains(node));
 526
 527     m_parseNodes.add(node);
 528 }
 529
 530 void Parser::unregisterParseNode(ParseNode* node)
 531 {
 532     if (node == 0)
 533         return;
 534
 535     ASSERT(m_parseNodes.contains(node));
 536
 537     m_parseNodes.remove(node);
 538 }
 539
 540 void Parser::registerPredicateVector(Vector<OwnPtr<Predicate> >* vector)
 541 {
 542     if (vector == 0)
 543         return;
 544
 545     ASSERT(!m_predicateVectors.contains(vector));
 546
 547     m_predicateVectors.add(vector);
 548 }
 549
 550 void Parser::deletePredicateVector(Vector<OwnPtr<Predicate> >* vector)
 551 {
 552     if (vector == 0)
 553         return;
 554
 555     ASSERT(m_predicateVectors.contains(vector));
 556
 557     m_predicateVectors.remove(vector);
 558     delete vector;
 559 }
 560
 561
 562 void Parser::registerExpressionVector(Vector<OwnPtr<Expression> >* vector)
 563 {
 564     if (vector == 0)
 565         return;
 566
 567     ASSERT(!m_expressionVectors.contains(vector));
 568
 569     m_expressionVectors.add(vector);
 570 }
 571
 572 void Parser::deleteExpressionVector(Vector<OwnPtr<Expression> >* vector)
 573 {
 574     if (vector == 0)
 575         return;
 576
 577     ASSERT(m_expressionVectors.contains(vector));
 578
 579     m_expressionVectors.remove(vector);
 580     delete vector;
 581 }
 582
 583 void Parser::registerString(String* s)
 584 {
 585     if (s == 0)
 586         return;
 587
 588     ASSERT(!m_strings.contains(s));
 589
 590     m_strings.add(s);
 591 }
 592
 593 void Parser::deleteString(String* s)
 594 {
 595     if (s == 0)
 596         return;
 597
 598     ASSERT(m_strings.contains(s));
 599
 600     m_strings.remove(s);
 601     delete s;
 602 }
 603
 604 void Parser::registerNodeTest(Step::NodeTest* t)
 605 {
 606     if (t == 0)
 607         return;
 608
 609     ASSERT(!m_nodeTests.contains(t));
 610
 611     m_nodeTests.add(t);
 612 }
 613
 614 void Parser::deleteNodeTest(Step::NodeTest* t)
 615 {
 616     if (t == 0)
 617         return;
 618
 619     ASSERT(m_nodeTests.contains(t));
 620
 621     m_nodeTests.remove(t);
 622     delete t;
 623 }
 624