rlcodegen/xmlscan.rl

   1 /*
   2  *  Copyright 2001-2007 Adrian Thurston <thurston@cs.queensu.ca>
   3  */
   4
   5 /*  This file is part of Ragel.
   6  *
   7  *  Ragel is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  Ragel is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with Ragel; if not, write to the Free Software
  19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  */
  21 #include <iostream>
  22 #include <string.h>
  23 #include "vector.h"
  24 #include "xmlparse.h"
  25 #include "rlcodegen.h"
  26 #include "buffer.h"
  27
  28 using std::istream;
  29 using std::cout;
  30 using std::endl;
  31
  32 #define BUFSIZE 4096
  33
  34 %%{
  35         machine Scanner;
  36         write data;
  37 }%%
  38
  39 class Perfect_Hash
  40 {
  41 private:
  42         static inline unsigned int hash (const char *str, unsigned int len);
  43
  44 public:
  45         static struct XMLTagHashPair *in_word_set (const char *str, unsigned int len);
  46 };
  47
  48 struct Scanner
  49 {
  50         Scanner( istream &input ) :
  51                 input(input),
  52                 curline(1),
  53                 curcol(1),
  54                 p(0), pe(0),
  55                 done(false),
  56                 data(0), data_len(0),
  57                 value(0)
  58         {
  59                 %%{
  60                         machine Scanner;
  61                         write init;
  62                 }%%
  63         }
  64
  65         int scan();
  66         void adjustAttrPointers( int distance );
  67
  68         istream &input;
  69
  70         /* Scanner State. */
  71         int cs, act, have, curline, curcol;
  72         char *tokstart, *tokend;
  73         char *p, *pe;
  74         int done;
  75
  76         /* Token data */
  77         char *data;
  78         int data_len;
  79         int value;
  80         AttrMkList attrMkList;
  81         Buffer buffer;
  82         char *tag_id_start;
  83         int tag_id_len;
  84         int token_col, token_line;
  85
  86         char buf[BUFSIZE];
  87 };
  88
  89
  90 #define TK_NO_TOKEN (-1)
  91 #define TK_ERR 1
  92 #define TK_EOF 2
  93 #define TK_OpenTag 3
  94 #define TK_CloseTag 4
  95
  96 #define ret_tok( _tok ) token = (_tok); data = tokstart
  97
  98 void Scanner::adjustAttrPointers( int distance )
  99 {
 100         for ( AttrMkList::Iter attr = attrMkList; attr.lte(); attr++ ) {
 101                 attr->id -= distance;
 102                 attr->value -= distance;
 103         }
 104 }
 105
 106 int Scanner::scan( )
 107 {
 108         int token = TK_NO_TOKEN;
 109         int space, readlen;
 110         char *attr_id_start;
 111         char *attr_value_start;
 112         int attr_id_len;
 113         int attr_value_len;
 114
 115         attrMkList.empty();
 116         buffer.clear();
 117
 118         while ( 1 ) {
 119                 if ( p == pe ) {
 120                         //printf("scanner: need more data\n");
 121
 122                         if ( tokstart == 0 )
 123                                 have = 0;
 124                         else {
 125                                 /* There is data that needs to be shifted over. */
 126                                 //printf("scanner: buffer broken mid token\n");
 127                                 have = pe - tokstart;
 128                                 memmove( buf, tokstart, have );
 129
 130                                 int distance = tokstart - buf;
 131                                 tokend -= distance;
 132                                 tag_id_start -= distance;
 133                                 attr_id_start -= distance;
 134                                 attr_value_start -= distance;
 135                                 adjustAttrPointers( distance );
 136                                 tokstart = buf;
 137                         }
 138
 139                         p = buf + have;
 140                         space = BUFSIZE - have;
 141
 142                         if ( space == 0 ) {
 143                                 /* We filled up the buffer trying to scan a token. */
 144                                 //printf("scanner: out of buffer space, you have a really long tag\n");
 145                                 return TK_ERR;
 146                         }
 147
 148                         if ( done ) {
 149                                 //printf("scanner: end of file\n");
 150                                 p[0] = 0;
 151                                 readlen = 1;
 152                         }
 153                         else {
 154                                 input.read( p, space );
 155                                 readlen = input.gcount();
 156                                 if ( input.eof() ) {
 157                                         //printf("scanner: setting done flag\n");
 158                                         done = 1;
 159                                 }
 160                         }
 161
 162                         pe = p + readlen;
 163                 }
 164
 165                 /* There is no claim that this is a proper XML parser, but it is good
 166                  * enough for our purposes. */
 167                 %%{
 168                         machine Scanner;
 169
 170                         action colup { curcol++; }
 171                         action start_tok { token_col = curcol; token_line = curline; }
 172                         NL = '\n' @{ curcol = 0; curline++; };
 173
 174                         WS = [\r\t ] | NL;
 175                         id = [_a-zA-Z][_a-zA-Z0-9]*;
 176                         literal = '"' ( [^"] | NL )* '"';
 177
 178                         # Attribute identifiers.
 179                         action start_attr_id { attr_id_start = p; }
 180                         action leave_attr_id { attr_id_len = p - attr_id_start; }
 181
 182                         attr_id = id >start_attr_id %leave_attr_id;
 183
 184                         # Attribute values
 185                         action start_attr_value { attr_value_start = p; }
 186                         action leave_attr_value
 187                         {
 188                                 attr_value_len = p - attr_value_start;
 189
 190                                 AttrMarker newAttr;
 191                                 newAttr.id = attr_id_start;
 192                                 newAttr.idLen = attr_id_len;
 193                                 newAttr.value = attr_value_start;
 194                                 newAttr.valueLen = attr_value_len;
 195                                 attrMkList.append( newAttr );
 196                         }
 197
 198                         attr_value = literal >start_attr_value %leave_attr_value;
 199
 200                         # Attribute list.
 201                         attribute = attr_id WS* '=' WS* attr_value WS*;
 202
 203                         # Tag identifiers.
 204                         action tag_id_start { tag_id_start = p; }
 205                         action leave_tag_id { tag_id_len = p - tag_id_start; }
 206
 207                         tag_id = id >tag_id_start %leave_tag_id;
 208
 209                         main := |*
 210                                 # Tags
 211                                 ( '<' WS* tag_id ( WS+ attribute* )? '>' ) >start_tok $colup
 212                                         => { ret_tok( TK_OpenTag ); fbreak; };
 213
 214                                 ( '<' WS* '/' WS* tag_id WS* '>' ) >start_tok $colup
 215                                         => { ret_tok( TK_CloseTag ); fbreak; };
 216
 217                                 # Data in between tags.
 218                                 ( [^<&\0] | NL ) $colup
 219                                         => { buffer.append( *p ); };
 220
 221                                 # Specials.
 222                                 "&amp;" $colup
 223                                         => { buffer.append( '&' ); };
 224                                 "&lt;" $colup
 225                                         => { buffer.append( '<' ); };
 226                                 "&gt;" $colup
 227                                         => { buffer.append( '>' ); };
 228
 229                                 # EOF
 230                                 0 >start_tok => { ret_tok( TK_EOF ); fbreak; };
 231
 232                         *|;
 233
 234                         write exec;
 235                 }%%
 236
 237                 if ( cs == Scanner_error )
 238                         return TK_ERR;
 239
 240                 if ( token != TK_NO_TOKEN ) {
 241                         /* fbreak does not advance p, so we do it manually. */
 242                         p = p + 1;
 243                         data_len = p - data;
 244                         return token;
 245                 }
 246         }
 247 }
 248
 249
 250 int xml_parse( istream &input, char *fileName )
 251 {
 252         Scanner scanner( input );
 253         Parser parser( fileName );
 254
 255         parser.init();
 256
 257         while ( 1 ) {
 258                 int token = scanner.scan();
 259                 if ( token == TK_EOF ) {
 260                         //cout << "parser_driver: EOF" << endl;
 261                         parser.token( _eof );
 262                         break;
 263                 }
 264                 else if ( token == TK_ERR ) {
 265                         //cout << "parser_driver: ERR" << endl;
 266                         break;
 267                 }
 268                 else {
 269                         /* All other tokens are either open or close tags. */
 270                         XMLTagHashPair *tagId = Perfect_Hash::in_word_set(
 271                                         scanner.tag_id_start, scanner.tag_id_len );
 272
 273                         XMLTag *tag = new XMLTag( tagId, token == TK_OpenTag ?
 274                                         XMLTag::Open : XMLTag::Close );
 275
 276                         if ( tagId != 0 ) {
 277                                 /* Get attributes for open tags. */
 278                                 if ( token == TK_OpenTag && scanner.attrMkList.length() > 0 ) {
 279                                         tag->attrList = new AttrList;
 280                                         for ( AttrMkList::Iter attr = scanner.attrMkList;
 281                                                         attr.lte(); attr++ )
 282                                         {
 283                                                 Attribute newAttr;
 284                                                 newAttr.id = new char[attr->idLen+1];
 285                                                 memcpy( newAttr.id, attr->id, attr->idLen );
 286                                                 newAttr.id[attr->idLen] = 0;
 287
 288                                                 /* Exclude the surrounding quotes. */
 289                                                 newAttr.value = new char[attr->valueLen-1];
 290                                                 memcpy( newAttr.value, attr->value+1, attr->valueLen-2 );
 291                                                 newAttr.value[attr->valueLen-2] = 0;
 292
 293                                                 tag->attrList->append( newAttr );
 294                                         }
 295                                 }
 296
 297                                 /* Get content for closing tags. */
 298                                 if ( token == TK_CloseTag ) {
 299                                         switch ( tagId->id ) {
 300                                         case TAG_host: case TAG_option:
 301                                         case TAG_t: case TAG_alphtype:
 302                                         case TAG_text: case TAG_goto:
 303                                         case TAG_call: case TAG_next:
 304                                         case TAG_entry: case TAG_set_tokend:
 305                                         case TAG_set_act: case TAG_start_state:
 306                                         case TAG_state_actions: case TAG_action_table:
 307                                         case TAG_cond_space: case TAG_c:
 308                                                 tag->content = new char[scanner.buffer.length+1];
 309                                                 memcpy( tag->content, scanner.buffer.data,
 310                                                                 scanner.buffer.length );
 311                                                 tag->content[scanner.buffer.length] = 0;
 312                                                 break;
 313                                         }
 314                                 }
 315                         }
 316
 317                         #if 0
 318                         cout << "parser_driver: " << (tag->type == XMLTag::Open ? "open" : "close") <<
 319                                         ": " << tag->tagId->name << endl;
 320                         if ( tag->attrList != 0 ) {
 321                                 for ( AttrList::Iter attr = *tag->attrList; attr.lte(); attr++ )
 322                                         cout << "    " << attr->id << ": " << attr->value << endl;
 323                         }
 324                         if ( tag->content != 0 )
 325                                 cout << "    content: " << tag->content << endl;
 326                         #endif
 327
 328                         parser.token( tag, scanner.token_col, scanner.token_line );
 329                 }
 330         }
 331
 332         return 0;
 333 }