executor/xeXMLParser.cpp

   1 /*-------------------------------------------------------------------------
   2  * drawElements Quality Program Test Executor
   3  * ------------------------------------------
   4  *
   5  * Copyright 2014 The Android Open Source Project
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  * you may not use this file except in compliance with the License.
   9  * You may obtain a copy of the License at
  10  *
  11  *      http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  *
  19  *//*!
  20  * \file
  21  * \brief XML Parser.
  22  *//*--------------------------------------------------------------------*/
  23
  24 #include "xeXMLParser.hpp"
  25 #include "deInt32.h"
  26
  27 namespace xe
  28 {
  29 namespace xml
  30 {
  31
  32 enum
  33 {
  34         TOKENIZER_INITIAL_BUFFER_SIZE   = 1024
  35 };
  36
  37 static inline bool isIdentifierStartChar (int ch)
  38 {
  39         return de::inRange<int>(ch, 'a', 'z') || de::inRange<int>(ch, 'A', 'Z');
  40 }
  41
  42 static inline bool isIdentifierChar (int ch)
  43 {
  44         return isIdentifierStartChar(ch) || de::inRange<int>(ch, '0', '9') || (ch == '-') || (ch == '_');
  45 }
  46
  47 static inline bool isWhitespaceChar (int ch)
  48 {
  49         return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
  50 }
  51
  52 static int getNextBufferSize (int curSize, int minNewSize)
  53 {
  54         return de::max(curSize*2, 1<<deLog2Ceil32(minNewSize));
  55 }
  56
  57 Tokenizer::Tokenizer (void)
  58         : m_curToken    (TOKEN_INCOMPLETE)
  59         , m_curTokenLen (0)
  60         , m_state               (STATE_DATA)
  61         , m_buf                 (TOKENIZER_INITIAL_BUFFER_SIZE)
  62 {
  63 }
  64
  65 Tokenizer::~Tokenizer (void)
  66 {
  67 }
  68
  69 void Tokenizer::clear (void)
  70 {
  71         m_curToken              = TOKEN_INCOMPLETE;
  72         m_curTokenLen   = 0;
  73         m_state                 = STATE_DATA;
  74         m_buf.clear();
  75 }
  76
  77 void Tokenizer::error (const std::string& what)
  78 {
  79         throw ParseError(what);
  80 }
  81
  82 void Tokenizer::feed (const deUint8* bytes, int numBytes)
  83 {
  84         // Grow buffer if necessary.
  85         if (m_buf.getNumFree() < numBytes)
  86         {
  87                 m_buf.resize(getNextBufferSize(m_buf.getSize(), m_buf.getNumElements()+numBytes));
  88         }
  89
  90         // Append to front.
  91         m_buf.pushFront(bytes, numBytes);
  92
  93         // If we haven't parsed complete token, re-try after data feed.
  94         if (m_curToken == TOKEN_INCOMPLETE)
  95                 advance();
  96 }
  97
  98 int Tokenizer::getChar (int offset) const
  99 {
 100         DE_ASSERT(de::inRange(offset, 0, m_buf.getNumElements()));
 101
 102         if (offset < m_buf.getNumElements())
 103                 return m_buf.peekBack(offset);
 104         else
 105                 return END_OF_BUFFER;
 106 }
 107
 108 void Tokenizer::advance (void)
 109 {
 110         if (m_curToken != TOKEN_INCOMPLETE)
 111         {
 112                 // Parser should not try to advance beyond end of string.
 113                 DE_ASSERT(m_curToken != TOKEN_END_OF_STRING);
 114
 115                 // If current token is tag end, change state to data.
 116                 if (m_curToken == TOKEN_TAG_END                                         ||
 117                         m_curToken == TOKEN_EMPTY_ELEMENT_END                   ||
 118                         m_curToken == TOKEN_PROCESSING_INSTRUCTION_END  ||
 119                         m_curToken == TOKEN_COMMENT                                             ||
 120                         m_curToken == TOKEN_ENTITY)
 121                         m_state = STATE_DATA;
 122
 123                 // Advance buffer by length of last token.
 124                 m_buf.popBack(m_curTokenLen);
 125
 126                 // Reset state.
 127                 m_curToken              = TOKEN_INCOMPLETE;
 128                 m_curTokenLen   = 0;
 129
 130                 // If we hit end of string here, report it as end of string.
 131                 if (getChar(0) == END_OF_STRING)
 132                 {
 133                         m_curToken              = TOKEN_END_OF_STRING;
 134                         m_curTokenLen   = 1;
 135                         return;
 136                 }
 137         }
 138
 139         int curChar = getChar(m_curTokenLen);
 140
 141         for (;;)
 142         {
 143                 if (m_state == STATE_DATA)
 144                 {
 145                         // Advance until we hit end of buffer or tag start and treat that as data token.
 146                         if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER || curChar == '<' || curChar == '&')
 147                         {
 148                                 if (curChar == '<')
 149                                         m_state = STATE_TAG;
 150                                 else if (curChar == '&')
 151                                         m_state = STATE_ENTITY;
 152
 153                                 if (m_curTokenLen > 0)
 154                                 {
 155                                         // Report data token.
 156                                         m_curToken = TOKEN_DATA;
 157                                         return;
 158                                 }
 159                                 else if (curChar == END_OF_STRING || curChar == (int)END_OF_BUFFER)
 160                                 {
 161                                         // Just return incomplete token, no data parsed.
 162                                         return;
 163                                 }
 164                                 else
 165                                 {
 166                                         DE_ASSERT(m_state == STATE_TAG || m_state == STATE_ENTITY);
 167                                         continue;
 168                                 }
 169                         }
 170                 }
 171                 else
 172                 {
 173                         // Eat all whitespace if present.
 174                         if (m_curTokenLen == 0)
 175                         {
 176                                 while (isWhitespaceChar(curChar))
 177                                 {
 178                                         m_buf.popBack();
 179                                         curChar = getChar(0);
 180                                 }
 181                         }
 182
 183                         // Handle end of string / buffer.
 184                         if (curChar == END_OF_STRING)
 185                                 error("Unexpected end of string");
 186                         else if (curChar == (int)END_OF_BUFFER)
 187                         {
 188                                 DE_ASSERT(m_curToken == TOKEN_INCOMPLETE);
 189                                 return;
 190                         }
 191
 192                         if (m_curTokenLen == 0)
 193                         {
 194                                 // Expect start of identifier, value or special tag token.
 195                                 if (curChar == '\'' || curChar == '"')
 196                                         m_state = STATE_VALUE;
 197                                 else if (isIdentifierStartChar(curChar))
 198                                         m_state = STATE_IDENTIFIER;
 199                                 else if (curChar == '<' || curChar == '?' || curChar == '/')
 200                                         m_state = STATE_TAG;
 201                                 else if (curChar == '&')
 202                                         DE_ASSERT(m_state == STATE_ENTITY);
 203                                 else if (curChar == '=')
 204                                 {
 205                                         m_curToken              = TOKEN_EQUAL;
 206                                         m_curTokenLen   = 1;
 207                                         return;
 208                                 }
 209                                 else if (curChar == '>')
 210                                 {
 211                                         m_curToken              = TOKEN_TAG_END;
 212                                         m_curTokenLen   = 1;
 213                                         return;
 214                                 }
 215                                 else
 216                                         error("Unexpected character");
 217                         }
 218                         else if (m_state == STATE_IDENTIFIER)
 219                         {
 220                                 if (!isIdentifierChar(curChar))
 221                                 {
 222                                         m_curToken = TOKEN_IDENTIFIER;
 223                                         return;
 224                                 }
 225                         }
 226                         else if (m_state == STATE_VALUE)
 227                         {
 228                                 // \todo [2012-06-07 pyry] Escapes.
 229                                 if (curChar == '\'' || curChar == '"')
 230                                 {
 231                                         // \todo [2012-10-17 pyry] Should we actually do the check against getChar(0)?
 232                                         if (curChar != getChar(0))
 233                                                 error("Mismatched quote");
 234                                         m_curToken               = TOKEN_STRING;
 235                                         m_curTokenLen   += 1;
 236                                         return;
 237                                 }
 238                         }
 239                         else if (m_state == STATE_COMMENT)
 240                         {
 241                                 DE_ASSERT(m_curTokenLen >= 2); // 2 characters have been parsed if we are in comment state.
 242
 243                                 if (m_curTokenLen <= 3)
 244                                 {
 245                                         if (curChar != '-')
 246                                                 error("Invalid comment start");
 247                                 }
 248                                 else
 249                                 {
 250                                         int prev2 = m_curTokenLen > 5 ? getChar(m_curTokenLen-2) : 0;
 251                                         int prev1 = m_curTokenLen > 4 ? getChar(m_curTokenLen-1) : 0;
 252
 253                                         if (prev2 == '-' && prev1 == '-')
 254                                         {
 255                                                 if (curChar != '>')
 256                                                         error("Invalid comment end");
 257                                                 m_curToken               = TOKEN_COMMENT;
 258                                                 m_curTokenLen   += 1;
 259                                                 return;
 260                                         }
 261                                 }
 262                         }
 263                         else if (m_state == STATE_ENTITY)
 264                         {
 265                                 if (m_curTokenLen >= 1)
 266                                 {
 267                                         if (curChar == ';')
 268                                         {
 269                                                 m_curToken               = TOKEN_ENTITY;
 270                                                 m_curTokenLen   += 1;
 271                                                 return;
 272                                         }
 273                                         else if (!de::inRange<int>(curChar, '0', '9')   &&
 274                                                          !de::inRange<int>(curChar, 'a', 'z')   &&
 275                                                          !de::inRange<int>(curChar, 'A', 'Z'))
 276                                                 error("Invalid entity");
 277                                 }
 278                         }
 279                         else
 280                         {
 281                                 // Special tokens are at most 2 characters.
 282                                 DE_ASSERT(m_state == STATE_TAG && m_curTokenLen == 1);
 283
 284                                 int prevChar = getChar(m_curTokenLen-1);
 285
 286                                 if (prevChar == '<')
 287                                 {
 288                                         // Tag start.
 289                                         if (curChar == '/')
 290                                         {
 291                                                 m_curToken              = TOKEN_END_TAG_START;
 292                                                 m_curTokenLen   = 2;
 293                                                 return;
 294                                         }
 295                                         else if (curChar == '?')
 296                                         {
 297                                                 m_curToken              = TOKEN_PROCESSING_INSTRUCTION_START;
 298                                                 m_curTokenLen   = 2;
 299                                                 return;
 300                                         }
 301                                         else if (curChar == '!')
 302                                         {
 303                                                 m_state = STATE_COMMENT;
 304                                         }
 305                                         else
 306                                         {
 307                                                 m_curToken              = TOKEN_TAG_START;
 308                                                 m_curTokenLen   = 1;
 309                                                 return;
 310                                         }
 311                                 }
 312                                 else if (prevChar == '?')
 313                                 {
 314                                         if (curChar != '>')
 315                                                 error("Invalid processing instruction end");
 316                                         m_curToken              = TOKEN_PROCESSING_INSTRUCTION_END;
 317                                         m_curTokenLen   = 2;
 318                                         return;
 319                                 }
 320                                 else if (prevChar == '/')
 321                                 {
 322                                         if (curChar != '>')
 323                                                 error("Invalid empty element end");
 324                                         m_curToken              = TOKEN_EMPTY_ELEMENT_END;
 325                                         m_curTokenLen   = 2;
 326                                         return;
 327                                 }
 328                                 else
 329                                         error("Could not parse special token");
 330                         }
 331                 }
 332
 333                 m_curTokenLen   += 1;
 334                 curChar                  = getChar(m_curTokenLen);
 335         }
 336 }
 337
 338 void Tokenizer::getString (std::string& dst) const
 339 {
 340         DE_ASSERT(m_curToken == TOKEN_STRING);
 341         dst.resize(m_curTokenLen-2);
 342         for (int ndx = 0; ndx < m_curTokenLen-2; ndx++)
 343                 dst[ndx] = m_buf.peekBack(ndx+1);
 344 }
 345
 346 Parser::Parser (void)
 347         : m_element             (ELEMENT_INCOMPLETE)
 348         , m_state               (STATE_DATA)
 349 {
 350 }
 351
 352 Parser::~Parser (void)
 353 {
 354 }
 355
 356 void Parser::clear (void)
 357 {
 358         m_tokenizer.clear();
 359         m_elementName.clear();
 360         m_attributes.clear();
 361         m_attribName.clear();
 362         m_entityValue.clear();
 363
 364         m_element       = ELEMENT_INCOMPLETE;
 365         m_state         = STATE_DATA;
 366 }
 367
 368 void Parser::error (const std::string& what)
 369 {
 370         throw ParseError(what);
 371 }
 372
 373 void Parser::feed (const deUint8* bytes, int numBytes)
 374 {
 375         m_tokenizer.feed(bytes, numBytes);
 376
 377         if (m_element == ELEMENT_INCOMPLETE)
 378                 advance();
 379 }
 380
 381 void Parser::advance (void)
 382 {
 383         if (m_element == ELEMENT_START)
 384                 m_attributes.clear();
 385
 386         // \note No token is advanced when element end is reported.
 387         if (m_state == STATE_YIELD_EMPTY_ELEMENT_END)
 388         {
 389                 DE_ASSERT(m_element == ELEMENT_START);
 390                 m_element       = ELEMENT_END;
 391                 m_state         = STATE_DATA;
 392                 return;
 393         }
 394
 395         if (m_element != ELEMENT_INCOMPLETE)
 396         {
 397                 m_tokenizer.advance();
 398                 m_element = ELEMENT_INCOMPLETE;
 399         }
 400
 401         for (;;)
 402         {
 403                 Token curToken = m_tokenizer.getToken();
 404
 405                 // Skip comments.
 406                 while (curToken == TOKEN_COMMENT)
 407                 {
 408                         m_tokenizer.advance();
 409                         curToken = m_tokenizer.getToken();
 410                 }
 411
 412                 if (curToken == TOKEN_INCOMPLETE)
 413                 {
 414                         DE_ASSERT(m_element == ELEMENT_INCOMPLETE);
 415                         return;
 416                 }
 417
 418                 switch (m_state)
 419                 {
 420                         case STATE_ENTITY:
 421                                 m_state = STATE_DATA;
 422                                 // Fall-through to STATE_DATA processing.
 423
 424                         case STATE_DATA:
 425                                 switch (curToken)
 426                                 {
 427                                         case TOKEN_DATA:
 428                                                 m_element = ELEMENT_DATA;
 429                                                 return;
 430
 431                                         case TOKEN_END_OF_STRING:
 432                                                 m_element = ELEMENT_END_OF_STRING;
 433                                                 return;
 434
 435                                         case TOKEN_TAG_START:
 436                                                 m_state = STATE_START_TAG_OPEN;
 437                                                 break;
 438
 439                                         case TOKEN_END_TAG_START:
 440                                                 m_state = STATE_END_TAG_OPEN;
 441                                                 break;
 442
 443                                         case TOKEN_PROCESSING_INSTRUCTION_START:
 444                                                 m_state = STATE_IN_PROCESSING_INSTRUCTION;
 445                                                 break;
 446
 447                                         case TOKEN_ENTITY:
 448                                                 m_state         = STATE_ENTITY;
 449                                                 m_element       = ELEMENT_DATA;
 450                                                 parseEntityValue();
 451                                                 return;
 452
 453                                         default:
 454                                                 error("Unexpected token");
 455                                 }
 456                                 break;
 457
 458                         case STATE_IN_PROCESSING_INSTRUCTION:
 459                                 if (curToken == TOKEN_PROCESSING_INSTRUCTION_END)
 460                                         m_state = STATE_DATA;
 461                                 else
 462                                         if (curToken != TOKEN_IDENTIFIER && curToken != TOKEN_EQUAL && curToken != TOKEN_STRING)
 463                                                 error("Unexpected token in processing instruction");
 464                                 break;
 465
 466                         case STATE_START_TAG_OPEN:
 467                                 if (curToken != TOKEN_IDENTIFIER)
 468                                         error("Expected identifier");
 469                                 m_tokenizer.getTokenStr(m_elementName);
 470                                 m_state = STATE_ATTRIBUTE_LIST;
 471                                 break;
 472
 473                         case STATE_END_TAG_OPEN:
 474                                 if (curToken != TOKEN_IDENTIFIER)
 475                                         error("Expected identifier");
 476                                 m_tokenizer.getTokenStr(m_elementName);
 477                                 m_state = STATE_EXPECTING_END_TAG_CLOSE;
 478                                 break;
 479
 480                         case STATE_EXPECTING_END_TAG_CLOSE:
 481                                 if (curToken != TOKEN_TAG_END)
 482                                         error("Expected tag end");
 483                                 m_state         = STATE_DATA;
 484                                 m_element       = ELEMENT_END;
 485                                 return;
 486
 487                         case STATE_ATTRIBUTE_LIST:
 488                                 if (curToken == TOKEN_IDENTIFIER)
 489                                 {
 490                                         m_tokenizer.getTokenStr(m_attribName);
 491                                         m_state = STATE_EXPECTING_ATTRIBUTE_EQ;
 492                                 }
 493                                 else if (curToken == TOKEN_EMPTY_ELEMENT_END)
 494                                 {
 495                                         m_state         = STATE_YIELD_EMPTY_ELEMENT_END;
 496                                         m_element       = ELEMENT_START;
 497                                         return;
 498                                 }
 499                                 else if (curToken == TOKEN_TAG_END)
 500                                 {
 501                                         m_state         = STATE_DATA;
 502                                         m_element       = ELEMENT_START;
 503                                         return;
 504                                 }
 505                                 else
 506                                         error("Unexpected token");
 507                                 break;
 508
 509                         case STATE_EXPECTING_ATTRIBUTE_EQ:
 510                                 if (curToken != TOKEN_EQUAL)
 511                                         error("Expected '='");
 512                                 m_state = STATE_EXPECTING_ATTRIBUTE_VALUE;
 513                                 break;
 514
 515                         case STATE_EXPECTING_ATTRIBUTE_VALUE:
 516                                 if (curToken != TOKEN_STRING)
 517                                         error("Expected value");
 518                                 if (hasAttribute(m_attribName.c_str()))
 519                                         error("Duplicate attribute");
 520
 521                                 m_tokenizer.getString(m_attributes[m_attribName]);
 522                                 m_state = STATE_ATTRIBUTE_LIST;
 523                                 break;
 524
 525                         default:
 526                                 DE_ASSERT(false);
 527                 }
 528
 529                 m_tokenizer.advance();
 530         }
 531 }
 532
 533 static char getEntityValue (const std::string& entity)
 534 {
 535         static const struct
 536         {
 537                 const char*             name;
 538                 char                    value;
 539         } s_entities[] =
 540         {
 541                         { "&lt;",                       '<' },
 542                         { "&gt;",                       '>' },
 543                         { "&amp;",                      '&' },
 544                         { "&apos;",                     '\''},
 545                         { "&quot;",                     '"' },
 546         };
 547
 548         for (int ndx = 0; ndx < DE_LENGTH_OF_ARRAY(s_entities); ndx++)
 549         {
 550                 if (entity == s_entities[ndx].name)
 551                         return s_entities[ndx].value;
 552         }
 553
 554         return 0;
 555 }
 556
 557 void Parser::parseEntityValue (void)
 558 {
 559         DE_ASSERT(m_state == STATE_ENTITY && m_tokenizer.getToken() == TOKEN_ENTITY);
 560
 561         std::string entity;
 562         m_tokenizer.getTokenStr(entity);
 563
 564         const char value = getEntityValue(entity);
 565         if (value == 0)
 566                 error("Invalid entity '" + entity + "'");
 567
 568         m_entityValue.resize(1);
 569         m_entityValue[0] = value;
 570 }
 571
 572 } // xml
 573 } // xe