ragel/rlscan.rl

   1 /*
   2  *  Copyright 2006 Adrian Thurston <thurston@cs.queensu.ca>
   3  */
   4
   5 /*  This file is part of Ragel.
   6  *
   7  *  Ragel is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  Ragel is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with Ragel; if not, write to the Free Software
  19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  */
  21
  22 #include <iostream>
  23 #include <fstream>
  24 #include <string.h>
  25
  26 #include "ragel.h"
  27 #include "rlparse.h"
  28 #include "parsedata.h"
  29 #include "avltree.h"
  30 #include "vector.h"
  31
  32
  33 using std::ifstream;
  34 using std::istream;
  35 using std::ostream;
  36 using std::cout;
  37 using std::cerr;
  38 using std::endl;
  39
  40 /* This is used for tracking the current stack of include file/machine pairs. It is
  41  * is used to detect and recursive include structure. */
  42 struct IncludeStackItem
  43 {
  44         IncludeStackItem( char *fileName, char *sectionName )
  45                 : fileName(fileName), sectionName(sectionName) {}
  46
  47         char *fileName;
  48         char *sectionName;
  49 };
  50
/* The file/section pairs of the includes currently being processed. An entry
 * is appended before an included file is scanned and removed afterwards. */
typedef Vector<IncludeStackItem> IncludeStack;
IncludeStack includeStack;
  53
  54 enum InlineBlockType
  55 {
  56         CurlyDelimited,
  57         SemiTerminated
  58 };
  59
  60 struct Scanner
  61 {
  62         Scanner( char *fileName, istream &input,
  63                         Parser *inclToParser, char *inclSectionTarg,
  64                         int include_depth )
  65         :
  66                 fileName(fileName), input(input),
  67                 inclToParser(inclToParser),
  68                 inclSectionTarg(inclSectionTarg),
  69                 include_depth(include_depth),
  70                 line(1), column(1), lastnl(0),
  71                 parser(0), active(false),
  72                 parserExistsError(false), ragelDefOpen(false),
  73                 whitespaceOn(true)
  74                 {}
  75
  76         bool recursiveInclude( IncludeStack &includeStack,
  77                         char *inclFileName, char *inclSectionName );
  78
  79         char *prepareFileName( char *fileName, int len )
  80         {
  81                 bool caseInsensitive;
  82                 Token tokenFnStr, tokenRes;
  83                 tokenFnStr.data = fileName;
  84                 tokenFnStr.length = len;
  85                 tokenFnStr.prepareLitString( tokenRes, caseInsensitive );
  86                 return tokenRes.data;
  87         }
  88
  89         void init();
  90         void token( int type, char *start, char *end );
  91         void token( int type, char c );
  92         void token( int type );
  93         void updateCol();
  94         void startSection();
  95         void endSection();
  96         void openRagelDef();
  97         void do_scan();
  98         bool parserExists();
  99         ostream &error();
 100
 101         char *fileName;
 102         istream &input;
 103         Parser *inclToParser;
 104         char *inclSectionTarg;
 105         int include_depth;
 106
 107         int cs;
 108         int line;
 109         char *word, *lit;
 110         int word_len, lit_len;
 111         InputLoc sectionLoc;
 112         char *tokstart, *tokend;
 113         int column;
 114         char *lastnl;
 115
 116         /* Set by machine statements, these persist from section to section
 117          * allowing for unnamed sections. */
 118         Parser *parser;
 119         bool active;
 120
 121         /* This is set if ragel has already emitted an error stating that
 122          * no section name has been seen and thus no parser exists. */
 123         bool parserExistsError;
 124         bool ragelDefOpen;
 125
 126         /* This is for inline code. By default it is on. It goes off for
 127          * statements and values in inline blocks which are parsed. */
 128         bool whitespaceOn;
 129 };
 130
%%{
        # Declare the section_parse machine, whose alphabet type is int, and
        # write out its static data at this point in the file.
        machine section_parse;
        alphtype int;
        write data;
}%%
 136
/* Set up the section parser before any tokens are sent to it. The body is
 * the initialization code ragel writes for the machine named by the
 * preceding section (section_parse). */
void Scanner::init( )
{
        %% write init;
}
 141
 142 bool Scanner::parserExists()
 143 {
 144         if ( parser != 0 )
 145                 return true;
 146
 147         if ( ! parserExistsError ) {
 148                 error() << "include: there is no previous specification name" << endl;
 149                 parserExistsError = true;
 150         }
 151         return false;
 152 }
 153
 154 ostream &Scanner::error()
 155 {
 156         /* Maintain the error count. */
 157         gblErrorCount += 1;
 158
 159         cerr << fileName << ":" << line << ":" << column << ": ";
 160         return cerr;
 161 }
 162
 163 bool Scanner::recursiveInclude( IncludeStack &includeStack,
 164                         char *inclFileName, char *inclSectionName )
 165 {
 166         for ( IncludeStack::Iter si = includeStack; si.lte(); si++ ) {
 167                 if ( strcmp( si->fileName, inclFileName ) == 0 &&
 168                                 strcmp( si->sectionName, inclSectionName ) == 0 )
 169                 {
 170                         return true;
 171                 }
 172         }
 173         return false;
 174 }
 175
 176 void Scanner::updateCol()
 177 {
 178         char *from = lastnl;
 179         if ( from == 0 )
 180                 from = tokstart;
 181         //cerr << "adding " << tokend - from << " to column" << endl;
 182         column += tokend - from;
 183         lastnl = 0;
 184 }
 185
 186 void Scanner::token( int type, char c )
 187 {
 188         token( type, &c, &c + 1 );
 189 }
 190
 191 void Scanner::token( int type )
 192 {
 193         token( type, 0, 0 );
 194 }
 195
%%{
        machine section_parse;

        # The values of the tokens that the section parser acts on. This relies
        # on the kelbt implementation and the order that tokens are declared.
        KW_Machine = 128;
        KW_Include = 129;
        KW_Write = 130;
        TK_Word = 131;
        TK_Literal = 132;

        # Forget, then record, the word and the literal seen in a statement.
        action clear_words { word = lit = 0; word_len = lit_len = 0; }
        action store_word { word = tokdata; word_len = toklen; }
        action store_lit { lit = tokdata; lit_len = toklen; }

        # Error reports for statements that are not well formed.
        action mach_err { error() << "bad machine statement" << endl; }
        action incl_err { error() << "bad include statement" << endl; }
        action write_err { error() << "bad write statement" << endl; }
 214
 215         action handle_machine
 216         {
 217                 /* Assign a name to the machine. */
 218                 char *machine = word;
 219                 //cerr << "scanner: machine statement: " << machine << endl;
 220
 221                 if ( inclSectionTarg == 0 ) {
 222                         active = true;
 223
 224                         ParserDictEl *pdEl = parserDict.find( machine );
 225                         if ( pdEl != 0 ) {
 226                                 //cerr << "scanner: using existing parser" << endl;
 227                         }
 228                         else {
 229                                 //cerr << "scanner: creating a new parser" << endl;
 230                                 pdEl = new ParserDictEl( machine );
 231                                 pdEl->value = new Parser( fileName, machine, sectionLoc );
 232                                 pdEl->value->init();
 233                                 parserDict.insert( pdEl );
 234                         }
 235
 236                         parser = pdEl->value;
 237                 }
 238                 else if ( strcmp( inclSectionTarg, machine ) == 0 ) {
 239                         //cerr << "scanner: found include target" << endl;
 240                         active = true;
 241                         parser = inclToParser;
 242                 }
 243                 else {
 244                         //cerr << "scanner: ignoring section" << endl;
 245                         active = false;
 246                         parser = 0;
 247                 }
 248         }
 249
 250         machine_stmt =
 251                 ( KW_Machine TK_Word @store_word ';' ) @handle_machine
 252                 <>err mach_err <>eof mach_err;
 253
 254         action handle_include
 255         {
 256                 if ( active && parserExists() ) {
 257                         char *inclSectionName = word;
 258                         char *inclFileName = 0;
 259
 260                         /* Implement defaults for the input file and section name. */
 261                         if ( inclSectionName == 0 )
 262                                 inclSectionName = parser->sectionName;
 263
 264                         if ( lit != 0 )
 265                                 inclFileName = prepareFileName( lit, lit_len );
 266                         else
 267                                 inclFileName = fileName;
 268
 269                         /* Open the file and process it. */
 270                         //cerr << "scanner: include: " << inclSectionName << " " << inclFileName << endl;
 271
 272                         /* Check for a recursive include structure. Add the current file/section
 273                          * name then check if what we are including is already in the stack. */
 274                         includeStack.append( IncludeStackItem( fileName, parser->sectionName ) );
 275
 276                         if ( recursiveInclude( includeStack, inclFileName, inclSectionName ) )
 277                                 error() << "include: this is a recursive include operation" << endl;
 278                         else {
 279                                 /* Open the input file for reading. */
 280                                 ifstream *inFile = new ifstream( inclFileName );
 281                                 if ( ! inFile->is_open() ) {
 282                                         error() << "include: could not open " <<
 283                                                         inclFileName << " for reading" << endl;
 284                                 }
 285
 286                                 Scanner scanner( inclFileName, *inFile, parser,
 287                                                 inclSectionName, include_depth+1 );
 288                                 scanner.init();
 289                                 scanner.do_scan( );
 290                                 delete inFile;
 291                         }
 292
 293                         /* Remove the last element (len-1) */
 294                         includeStack.remove( -1 );
 295                 }
 296         }
 297
 298         include_names = (
 299                 TK_Word @store_word ( TK_Literal @store_lit )? |
 300                 TK_Literal @store_lit
 301         ) >clear_words;
 302
 303         include_stmt =
 304                 ( KW_Include include_names ';' ) @handle_include
 305                 <>err incl_err <>eof incl_err;
 306
 307         action write_command
 308         {
 309                 if ( active ) {
 310                         openRagelDef();
 311                         if ( strcmp( tokdata, "data" ) != 0 &&
 312                                         strcmp( tokdata, "init" ) != 0 &&
 313                                         strcmp( tokdata, "exec" ) != 0 &&
 314                                         strcmp( tokdata, "eof" ) != 0 )
 315                         {
 316                                 error() << "unknown write command" << endl;
 317                         }
 318                         *outStream << "  <write what=\"" << tokdata << "\">";
 319                 }
 320         }
 321
 322         action write_option
 323         {
 324                 if ( active )
 325                         *outStream << "<option>" << tokdata << "</option>";
 326         }
 327         action write_close
 328         {
 329                 if ( active )
 330                         *outStream << "</write>\n";
 331         }
 332
 333         write_stmt =
 334                 ( KW_Write TK_Word @write_command
 335                         ( TK_Word @write_option )* ';' @write_close )
 336                 <>err write_err <>eof write_err;
 337
 338         action handle_token
 339         {
 340                 /* Send the token off to the parser. */
 341                 if ( active && parserExists() ) {
 342                         InputLoc loc;
 343
 344                         #if 0
 345                         cerr << "scanner:" << line << ":" << column <<
 346                                         ": sending token to the parser " << lelNames[*p];
 347                         cerr << " " << toklen;
 348                         if ( tokdata != 0 )
 349                                 cerr << " " << tokdata;
 350                         cerr << endl;
 351                         #endif
 352
 353                         loc.fileName = fileName;
 354                         loc.line = line;
 355                         loc.col = column;
 356
 357                         parser->token( loc, type, tokdata, toklen );
 358                 }
 359         }
 360
 361         # Catch everything else.
 362         everything_else = ^( KW_Machine | KW_Include | KW_Write ) @handle_token;
 363
 364         main := (
 365                 machine_stmt |
 366                 include_stmt |
 367                 write_stmt |
 368                 everything_else
 369         )*;
 370 }%%
 371
/* Run a single token through the section_parse machine. The token text, if
 * there is any, lies between start and end. Passing 0 for start sends a token
 * without data. The column is advanced past the token afterwards. */
void Scanner::token( int type, char *start, char *end )
{
        /* The names tokdata, toklen, p and pe are used by the actions of the
         * section_parse machine and by the code ragel writes for it below. */
        char *tokdata = 0;
        int toklen = 0;
        /* The input to the machine is just the one token type. */
        int *p = &type;
        int *pe = &type + 1;

        if ( start != 0 ) {
                /* Make a null terminated copy of the token text. It is not freed
                 * here: the actions may keep the pointer (store_word, store_lit)
                 * or pass it on to the parser (handle_token). */
                toklen = end-start;
                tokdata = new char[toklen+1];
                memcpy( tokdata, start, toklen );
                tokdata[toklen] = 0;
        }

        %%{
                machine section_parse;
                write exec;
        }%%

        updateCol();
}
 393
 394 void Scanner::startSection( )
 395 {
 396         parserExistsError = false;
 397
 398         if ( include_depth == 0 ) {
 399                 if ( machineSpec == 0 && machineName == 0 )
 400                         *outStream << "</host>\n";
 401                 ragelDefOpen = false;
 402         }
 403
 404         sectionLoc.fileName = fileName;
 405         sectionLoc.line = line;
 406         sectionLoc.col = 0;
 407 }
 408
 409 void Scanner::openRagelDef()
 410 {
 411         if ( ! ragelDefOpen ) {
 412                 ragelDefOpen = true;
 413                 *outStream << "<ragel_def name=\"" << parser->sectionName << "\">\n";
 414         }
 415 }
 416
 417 void Scanner::endSection( )
 418 {
 419         /* Execute the eof actions for the section parser. */
 420         %%{
 421                 machine section_parse;
 422                 write eof;
 423         }%%
 424
 425         /* Close off the section with the parser. */
 426         if ( active && parserExists() ) {
 427                 InputLoc loc;
 428                 loc.fileName = fileName;
 429                 loc.line = line;
 430                 loc.col = 0;
 431
 432                 parser->token( loc, TK_EndSection, 0, 0 );
 433         }
 434
 435         if ( include_depth == 0 ) {
 436                 if ( ragelDefOpen ) {
 437                         *outStream << "</ragel_def>\n";
 438                         ragelDefOpen = false;
 439                 }
 440
 441                 if ( machineSpec == 0 && machineName == 0 ) {
 442                         /* The end section may include a newline on the end, so
 443                          * we use the last line, which will count the newline. */
 444                         *outStream << "<host line=\"" << line << "\">";
 445                 }
 446         }
 447 }
 448
%%{
        machine rlscan;

        # This is sent by the driver code.
        EOF = 0;

        # Runs on every newline matched through NL: remember where it was
        # seen, restart the column count and move to the next line.
        action inc_nl {
                lastnl = p;
                column = 0;
                line++;
        }
        NL = '\n' @inc_nl;

        # Identifiers, numbers, comments, and other common things.
        ident = ( alpha | '_' ) ( alpha |digit |'_' )*;
        number = digit+;
        hex_number = '0x' [0-9a-fA-F]+;

        c_comment =
                '/*' ( any | NL )* :>> '*/';

        cpp_comment =
                '//' [^\n]* NL;

        c_cpp_comment = c_comment | cpp_comment;

        # These literal forms are common to C-like host code and ragel.
        s_literal = "'" ([^'\\] | NL | '\\' (any | NL))* "'";
        d_literal = '"' ([^"\\] | NL | '\\' (any | NL))* '"';

        whitespace = [ \t] | NL;
        pound_comment = '#' [^\n]* NL;
 481
        # An inline block of code. The block is scanned here and sent to the
        # parser piece by piece as tokens.
        # NOTE(review): this comment used to say that an inline_block pointer is
        # used to preserve the data. No such pointer is used in this scanner,
        # confirm that the remark is out of date.
        inline_code := |*
                # Inline expression keywords.
                "fpc" => { token( KW_PChar ); };
                "fc" => { token( KW_Char ); };
                "fcurs" => { token( KW_CurState ); };
                "ftargs" => { token( KW_TargState ); };
                "fentry" => {
                        whitespaceOn = false;
                        token( KW_Entry );
                };

                # Inline statement keywords. Most of them turn whitespace tokens
                # off, the symbols further down turn them back on.
                "fhold" => {
                        whitespaceOn = false;
                        token( KW_Hold );
                };
                "fexec" => { token( KW_Exec, 0, 0 ); };
                "fgoto" => {
                        whitespaceOn = false;
                        token( KW_Goto );
                };
                "fnext" => {
                        whitespaceOn = false;
                        token( KW_Next );
                };
                "fcall" => {
                        whitespaceOn = false;
                        token( KW_Call );
                };
                "fret" => {
                        whitespaceOn = false;
                        token( KW_Ret );
                };
                "fbreak" => {
                        whitespaceOn = false;
                        token( KW_Break );
                };

                ident => { token( TK_Word, tokstart, tokend ); };

                number => { token( TK_UInt, tokstart, tokend ); };
                hex_number => { token( TK_Hex, tokstart, tokend ); };

                ( s_literal | d_literal )
                        => { token( IL_Literal, tokstart, tokend ); };

                # Whitespace is dropped while whitespaceOn is off.
                whitespace+ => {
                        if ( whitespaceOn )
                                token( IL_WhiteSpace, tokstart, tokend );
                };
                c_cpp_comment => { token( IL_Comment, tokstart, tokend ); };

                "::" => { token( TK_NameSep, tokstart, tokend ); };

                # Some symbols need to go to the parser as with their cardinal value as
                # the token type (as opposed to being sent as anonymous symbols)
                # because they are part of the sequences which we interpret. The * ) ;
                # symbols cause whitespace parsing to come back on. This gets turned
                # off by some keywords.

                # A semicolon also ends a semicolon terminated block.
                ";" => {
                        whitespaceOn = true;
                        token( *tokstart, tokstart, tokend );
                        if ( inlineBlockType == SemiTerminated )
                                fgoto parser_def;
                };

                [*)] => {
                        whitespaceOn = true;
                        token( *tokstart, tokstart, tokend );
                };

                [,(] => { token( *tokstart, tokstart, tokend ); };

                # Curly braces are counted to find the one that closes the block.
                '{' => {
                        token( IL_Symbol, tokstart, tokend );
                        curly_count += 1;
                };

                '}' => {
                        if ( --curly_count == 0 && inlineBlockType == CurlyDelimited ) {
                                /* Inline code block ends. */
                                token( '}' );
                                fgoto parser_def;
                        }
                        else {
                                /* Either a semi terminated inline block or only the closing
                                 * brace of some inner scope, not the block's closing brace. */
                                token( IL_Symbol, tokstart, tokend );
                        }
                };

                EOF => {
                        error() << "unterminated code block" << endl;
                };

                # Send every other character as a symbol.
                any => { token( IL_Symbol, tokstart, tokend ); };
        *|;
 584
        # The inside of an OR expression. It is entered with fcall when an
        # opening square bracket is scanned and left again with fret.
        or_literal := |*
                # Escape sequences in OR expressions.
                '\\0' => { token( RE_Char, '\0' ); };
                '\\a' => { token( RE_Char, '\a' ); };
                '\\b' => { token( RE_Char, '\b' ); };
                '\\t' => { token( RE_Char, '\t' ); };
                '\\n' => { token( RE_Char, '\n' ); };
                '\\v' => { token( RE_Char, '\v' ); };
                '\\f' => { token( RE_Char, '\f' ); };
                '\\r' => { token( RE_Char, '\r' ); };
                # A backslash followed by a newline sends no token.
                '\\\n' => { updateCol(); };
                # Any other escaped character is sent as itself, without the backslash.
                '\\' any => { token( RE_Char, tokstart+1, tokend ); };

                # Range dash in an OR expression.
                '-' => { token( RE_Dash, 0, 0 ); };

                # Terminate an OR expression.
                ']'     => { token( RE_SqClose ); fret; };

                EOF => {
                        error() << "unterminated OR literal" << endl;
                };

                # Characters in an OR expression.
                [^\]] => { token( RE_Char, tokstart, tokend ); };

        *|;
 612
        # The inside of a regular expression. It is entered when an opening
        # slash is scanned in parser_def.
        re_literal := |*
                # Escape sequences in regular expressions.
                '\\0' => { token( RE_Char, '\0' ); };
                '\\a' => { token( RE_Char, '\a' ); };
                '\\b' => { token( RE_Char, '\b' ); };
                '\\t' => { token( RE_Char, '\t' ); };
                '\\n' => { token( RE_Char, '\n' ); };
                '\\v' => { token( RE_Char, '\v' ); };
                '\\f' => { token( RE_Char, '\f' ); };
                '\\r' => { token( RE_Char, '\r' ); };
                # A backslash followed by a newline sends no token.
                '\\\n' => { updateCol(); };
                # Any other escaped character is sent as itself, without the backslash.
                '\\' any => { token( RE_Char, tokstart+1, tokend ); };

                # Terminate the regular expression. The closing slash may be
                # followed by an i, which is sent along as part of the token.
                '/' [i]? => {
                        token( RE_Slash, tokstart, tokend );
                        fgoto parser_def;
                };

                # Special characters.
                '.' => { token( RE_Dot ); };
                '*' => { token( RE_Star ); };

                # An OR expression inside the regular expression.
                '[' => { token( RE_SqOpen ); fcall or_literal; };
                '[^' => { token( RE_SqOpenNeg ); fcall or_literal; };

                EOF => {
                        error() << "unterminated regular expression" << endl;
                };

                # Any other character of the regular expression.
                [^\/] => { token( RE_Char, tokstart, tokend ); };
        *|;
 646
 647         write_statement := |*
 648                 ident => { token( TK_Word, tokstart, tokend ); } ;
 649                 [ \t\n]+ => { updateCol(); };
 650                 ';' => { token( ';' ); fgoto parser_def; };
 651
 652                 EOF => {
 653                         error() << "unterminated write statement" << endl;
 654                 };
 655         *|;
 656
        # Parser definitions. This scans the inside of a ragel section and hands
        # over to the other scanners for inline code, regular expressions, OR
        # expressions and write statements.
        parser_def := |*
                'machine' => { token( KW_Machine ); };
                'include' => { token( KW_Include ); };
                # The rest of a write statement is scanned by write_statement.
                'write' => {
                        token( KW_Write );
                        fgoto write_statement;
                };
                'action' => { token( KW_Action ); };
                'alphtype' => { token( KW_AlphType ); };

                # FIXME: Enable this post 5.17.
                # 'range' => { token( KW_Range ); };

                # These keywords are followed by inline code that runs up to the
                # next semicolon.
                'getkey' => {
                        token( KW_GetKey );
                        inlineBlockType = SemiTerminated;
                        fgoto inline_code;
                };
                'access' => {
                        token( KW_Access );
                        inlineBlockType = SemiTerminated;
                        fgoto inline_code;
                };
                'variable' => {
                        token( KW_Variable );
                        inlineBlockType = SemiTerminated;
                        fgoto inline_code;
                };
                'when' => { token( KW_When ); };
                'eof' => { token( KW_Eof ); };
                'err' => { token( KW_Err ); };
                'lerr' => { token( KW_Lerr ); };
                'to' => { token( KW_To ); };
                'from' => { token( KW_From ); };

                # Identifiers.
                ident => { token( TK_Word, tokstart, tokend ); } ;

                # Numbers
                number => { token( TK_UInt, tokstart, tokend ); };
                hex_number => { token( TK_Hex, tokstart, tokend ); };

                # Literals, with optionals.
                ( s_literal | d_literal ) [i]?
                        => { token( TK_Literal, tokstart, tokend ); };

                # An OR expression. The scanner returns here when it is closed.
                '[' => { token( RE_SqOpen ); fcall or_literal; };
                '[^' => { token( RE_SqOpenNeg ); fcall or_literal; };

                # A regular expression.
                '/' => { token( RE_Slash ); fgoto re_literal; };

                # Ignore.
                pound_comment => { updateCol(); };

                ':=' => { token( TK_ColonEquals ); };

                # To State Actions.
                ">~" => { token( TK_StartToState ); };
                "$~" => { token( TK_AllToState ); };
                "%~" => { token( TK_FinalToState ); };
                "<~" => { token( TK_NotStartToState ); };
                "@~" => { token( TK_NotFinalToState ); };
                "<>~" => { token( TK_MiddleToState ); };

                # From State actions
                ">*" => { token( TK_StartFromState ); };
                "$*" => { token( TK_AllFromState ); };
                "%*" => { token( TK_FinalFromState ); };
                "<*" => { token( TK_NotStartFromState ); };
                "@*" => { token( TK_NotFinalFromState ); };
                "<>*" => { token( TK_MiddleFromState ); };

                # EOF Actions.
                ">/" => { token( TK_StartEOF ); };
                "$/" => { token( TK_AllEOF ); };
                "%/" => { token( TK_FinalEOF ); };
                "</" => { token( TK_NotStartEOF ); };
                "@/" => { token( TK_NotFinalEOF ); };
                "<>/" => { token( TK_MiddleEOF ); };

                # Global Error actions.
                ">!" => { token( TK_StartGblError ); };
                "$!" => { token( TK_AllGblError ); };
                "%!" => { token( TK_FinalGblError ); };
                "<!" => { token( TK_NotStartGblError ); };
                "@!" => { token( TK_NotFinalGblError ); };
                "<>!" => { token( TK_MiddleGblError ); };

                # Local error actions.
                ">^" => { token( TK_StartLocalError ); };
                "$^" => { token( TK_AllLocalError ); };
                "%^" => { token( TK_FinalLocalError ); };
                "<^" => { token( TK_NotStartLocalError ); };
                "@^" => { token( TK_NotFinalLocalError ); };
                "<>^" => { token( TK_MiddleLocalError ); };

                # Middle.
                "<>" => { token( TK_Middle ); };

                # Conditions.
                '>?' => { token( TK_StartCond ); };
                '$?' => { token( TK_AllCond ); };
                '%?' => { token( TK_LeavingCond ); };

                '..' => { token( TK_DotDot ); };
                '**' => { token( TK_StarStar ); };
                '--' => { token( TK_DashDash ); };
                '->' => { token( TK_Arrow ); };
                '=>' => { token( TK_DoubleArrow ); };

                ":>"  => { token( TK_ColonGt ); };
                ":>>" => { token( TK_ColonGtGt ); };
                "<:"  => { token( TK_LtColon ); };

                # Opening of longest match.
                "|*" => { token( TK_BarStar ); };

                # The end of a multi-line section.
                '}%%' => {
                        /* In order to generate anything we must be in the top level file
                         * and the current spec must be active and there must not have been
                         * any parse errors. */
                        updateCol();
                        endSection();
                        fgoto main;
                };

                [ \t]+ => { updateCol(); };

                # If we are in a single line machine then newline may end the spec.
                NL => {
                        updateCol();
                        if ( singleLineSpec ) {
                                /* In order to generate anything we must be in the top level file
                                 * and the current spec must be active and there must not have been
                                 * any parse errors. */
                                endSection();
                                fgoto main;
                        }
                };

                # The start of a curly delimited block of inline code.
                '{' => {
                        token( '{' );
                        curly_count = 1;
                        inlineBlockType = CurlyDelimited;
                        fgoto inline_code;
                };

                EOF => {
                        error() << "unterminated ragel section" << endl;
                };

                # Any other character is sent with itself as the token type.
                any => { token( *tokstart ); } ;
        *|;
 811
 812         action pass {
 813                 /* Host text pass-through. After updateCol(), the token is handed to
 814                  * xmlEscapeHost() only at the bottom of the include stack (the file
 815                  * named on the command line) with machineSpec and machineName unset. */
 816                 updateCol();
 817                 if ( !( include_depth != 0 || machineSpec != 0 || machineName != 0 ) )
 818                         xmlEscapeHost( *outStream, tokstart, tokend - tokstart );
 819         }
 820
 821         # Outside code scanner. These tokens get passed through: everything here
             # is host-language text, handed to the pass action, except for the two
             # markers that open a ragel section. The order of the patterns is
             # significant wherever two of them can match the same text.
 822         main := |*
                     # Host tokens. The machines named here are defined elsewhere in
                     # this file. Presumably comments and literals are matched whole so
                     # that a section marker inside them is not acted upon -- confirm
                     # against their definitions.
 823                 ident => pass;
 824                 number => pass;
 825                 c_cpp_comment => pass;
 826                 s_literal | d_literal => pass;
                     # Opening of a multi-line section: clear singleLineSpec, call
                     # startSection() and continue in the parser_def scanner.
 827                 '%%{' => {
 828                         updateCol();
 829                         singleLineSpec = false;
 830                         startSection();
 831                         fgoto parser_def;
 832                 };
                     # Opening of a single-line section: identical to the case above
                     # except that singleLineSpec is set, which is what lets a newline
                     # end the section.
 833                 '%%' => {
 834                         updateCol();
 835                         singleLineSpec = true;
 836                         startSection();
 837                         fgoto parser_def;
 838                 };
 839                 whitespace+ => pass;
                     # No action is attached: the end-of-input marker is matched but
                     # not passed through. (Presumably EOF is the zero character that
                     # do_scan appends after the last read -- confirm at its definition.)
 840                 EOF;
                     # Fallback: any other single character is passed through.
 841                 any => pass;
 842         *|;
 843
 844 }%%
 845
 846 %% write data;
 847
 848 void Scanner::do_scan()
 849 {
             /* Feeds the whole of `input` through the ragel-generated scanner.
              *
              * Input is read in chunks into a heap buffer. Between chunks the
              * unfinished token, if there is one, is moved to the front of the
              * buffer; when a single token fills the buffer, the buffer is doubled.
              * Once a read returns nothing, one terminating character (last_char)
              * is scanned and the loop ends. The process exits with status 1 if the
              * machine is left in its error state.
              *
              * NOTE(review): cs, act, top, stack, p and pe are presumably consumed
              * by name by the code ragel substitutes for the two write directives
              * below, and curly_count, singleLineSpec and inlineBlockType are
              * assigned by the grammar's actions, so none of them should be
              * renamed. tokstart and tokend are not declared in this function. */
 850         int bufsize = 8;
 851         char *buf = new char[bufsize];
             /* Character scanned after the real input has run out; presumably this
              * is what the grammar's EOF pattern matches. */
 852         const char last_char = 0;
             /* have: number of bytes at the front of buf carried over from the
              * previous pass (the unfinished token). */
 853         int cs, act, have = 0;
 854         int top, stack[1];
 855         int curly_count = 0;
 856         bool execute = true;
 857         bool singleLineSpec = false;
             /* Not initialised here; the '{' action of the section scanner assigns
              * it before jumping to inline_code. */
 858         InlineBlockType inlineBlockType;
 859
 860         %% write init;
 861
 862         while ( execute ) {
                     /* New input goes in right after the carried-over bytes. */
 863                 char *p = buf + have;
 864                 int space = bufsize - have;
 865
 866                 if ( space == 0 ) {
 867                         /* We filled up the buffer trying to scan a token. Grow it. */
 868                         bufsize = bufsize * 2;
 869                         char *newbuf = new char[bufsize];
 870                         //cerr << "FULL BUFFER, NEW SIZE: " << bufsize << endl;
 871
 872                         /* Recompute p and space. */
 873                         p = newbuf + have;
 874                         space = bufsize - have;
 875
 876                         /* Patch up pointers possibly in use: both are moved to the
                              * same offset in the new buffer. tokstart is skipped when
                              * null; tokend is rebased unconditionally. */
 877                         if ( tokstart != 0 )
 878                                 tokstart = newbuf + ( tokstart - buf );
 879                         tokend = newbuf + ( tokend - buf );
 880
 881                         /* Copy the carried-over bytes into the new buffer and
                              * release the old one. */
 882                         memcpy( newbuf, buf, have );
 883                         delete[] buf;
 884                         buf = newbuf;
 885                 }
 886
                     /* space is at least 1 here, because a full buffer was grown above. */
 887                 input.read( p, space );
 888                 int len = input.gcount();
 889
 890                 /* If we see eof then append the EOF char. A read that delivers no
                      * characters at all is what counts as eof; this pass over the
                      * single appended character is then the last one. */
 891                 if ( len == 0 ) {
 892                         p[0] = last_char, len = 1;
 893                         execute = false;
 894                 }
 895
                     /* pe is one past the last character available to the machine. */
 896                 char *pe = p + len;
 897                 %% write exec;
 898
 899                 /* Check if we failed. */
 900                 if ( cs == rlscan_error ) {
 901                         /* Machine failed before finding a token.
                              * NOTE(review): the process exits without printing
                              * anything, since the diagnostic below is commented out. */
 902                         //cerr << "PARSE ERROR" << endl;
 903                         exit(1);
 904                 }
 905
 906                 /* Decide if we need to preserve anything. A non-null tokstart is
                      * taken to mean that a token is still in progress. */
 907                 char *preserve = tokstart;
 908
 909                 /* Now set up the prefix. */
 910                 if ( preserve == 0 )
 911                         have = 0;
 912                 else {
 913                         /* There is data that needs to be shifted over. The source
                              * and destination may overlap, hence memmove. */
 914                         have = pe - preserve;
 915                         memmove( buf, preserve, have );
                             /* Both token pointers move back by the same distance as
                              * the data did. */
 916                         unsigned int shiftback = preserve - buf;
                             /* Always true in this branch: preserve is a copy of
                              * tokstart and is non-null here. */
 917                         if ( tokstart != 0 )
 918                                 tokstart -= shiftback;
 919                         tokend -= shiftback;
 920
                             /* NOTE(review): preserve is not read again before it goes
                              * out of scope, so this assignment has no effect. */
 921                         preserve = buf;
 922                 }
 923         }
 924
 925         delete[] buf;
 926 }
 927
 928 void scan( char *fileName, istream &input )
 929 {
 930         /* Scans one top-level input. The three trailing constructor arguments
 931          * are zero; the first two of them are the include parser and the include
 932          * section target, so presumably no include is being serviced. init() is
 933          * called before do_scan(), which then consumes the whole of input. */
 934         Scanner scanner( fileName, input, 0, 0, 0 );
 935         scanner.init();
 936         scanner.do_scan();
 937 }