ragel/rlscan.rl

   1 /*
   2  *  Copyright 2006-2007 Adrian Thurston <thurston@cs.queensu.ca>
   3  */
   4
   5 /*  This file is part of Ragel.
   6  *
   7  *  Ragel is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  Ragel is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with Ragel; if not, write to the Free Software
  19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  */
  21
  22 #include <iostream>
  23 #include <fstream>
  24 #include <string.h>
  25
  26 #include "ragel.h"
  27 #include "rlscan.h"
  28
  29 //#define LOG_TOKENS
  30
  31 using std::ifstream;
  32 using std::istream;
  33 using std::ostream;
  34 using std::cout;
  35 using std::cerr;
  36 using std::endl;
  37
  38 enum InlineBlockType
  39 {
  40         CurlyDelimited,
  41         SemiTerminated
  42 };
  43
  44
  45 /*
  46  * The Scanner for Importing
  47  */
  48
  49 %%{
  50         machine inline_token_scan;
  51         alphtype int;
  52         access tok_;
  53
  54         # Import scanner tokens.
  55         import "rlparse.h";
  56
  57         main := |*
  58                 # Define of number.
  59                 IMP_Define IMP_Word IMP_UInt => {
  60                         int base = tok_ts - token_data;
  61                         int nameOff = 1;
  62                         int numOff = 2;
  63
  64                         directToParser( inclToParser, fileName, line, column, TK_Word,
  65                                         token_strings[base+nameOff], token_lens[base+nameOff] );
  66                         directToParser( inclToParser, fileName, line, column, '=', 0, 0 );
  67                         directToParser( inclToParser, fileName, line, column, TK_UInt,
  68                                         token_strings[base+numOff], token_lens[base+numOff] );
  69                         directToParser( inclToParser, fileName, line, column, ';', 0, 0 );
  70                 };
  71
  72                 # Assignment of number.
  73                 IMP_Word '=' IMP_UInt => {
  74                         int base = tok_ts - token_data;
  75                         int nameOff = 0;
  76                         int numOff = 2;
  77
  78                         directToParser( inclToParser, fileName, line, column, TK_Word,
  79                                         token_strings[base+nameOff], token_lens[base+nameOff] );
  80                         directToParser( inclToParser, fileName, line, column, '=', 0, 0 );
  81                         directToParser( inclToParser, fileName, line, column, TK_UInt,
  82                                         token_strings[base+numOff], token_lens[base+numOff] );
  83                         directToParser( inclToParser, fileName, line, column, ';', 0, 0 );
  84                 };
  85
  86                 # Define of literal.
  87                 IMP_Define IMP_Word IMP_Literal => {
  88                         int base = tok_ts - token_data;
  89                         int nameOff = 1;
  90                         int litOff = 2;
  91
  92                         directToParser( inclToParser, fileName, line, column, TK_Word,
  93                                         token_strings[base+nameOff], token_lens[base+nameOff] );
  94                         directToParser( inclToParser, fileName, line, column, '=', 0, 0 );
  95                         directToParser( inclToParser, fileName, line, column, TK_Literal,
  96                                         token_strings[base+litOff], token_lens[base+litOff] );
  97                         directToParser( inclToParser, fileName, line, column, ';', 0, 0 );
  98                 };
  99
 100                 # Assignment of literal.
 101                 IMP_Word '=' IMP_Literal => {
 102                         int base = tok_ts - token_data;
 103                         int nameOff = 0;
 104                         int litOff = 2;
 105
 106                         directToParser( inclToParser, fileName, line, column, TK_Word,
 107                                         token_strings[base+nameOff], token_lens[base+nameOff] );
 108                         directToParser( inclToParser, fileName, line, column, '=', 0, 0 );
 109                         directToParser( inclToParser, fileName, line, column, TK_Literal,
 110                                         token_strings[base+litOff], token_lens[base+litOff] );
 111                         directToParser( inclToParser, fileName, line, column, ';', 0, 0 );
 112                 };
 113
 114                 # Catch everything else.
 115                 any;
 116         *|;
 117 }%%
 118
 119 %% write data;
 120
 121 void Scanner::flushImport()
 122 {
 123         int *p = token_data;
 124         int *pe = token_data + cur_token;
 125         int *eof = 0;
 126
 127         %%{
 128                 machine inline_token_scan;
 129                 write init;
 130                 write exec;
 131         }%%
 132
 133         if ( tok_ts == 0 )
 134                 cur_token = 0;
 135         else {
 136                 cur_token = pe - tok_ts;
 137                 int ts_offset = tok_ts - token_data;
 138                 memmove( token_data, token_data+ts_offset, cur_token*sizeof(token_data[0]) );
 139                 memmove( token_strings, token_strings+ts_offset, cur_token*sizeof(token_strings[0]) );
 140                 memmove( token_lens, token_lens+ts_offset, cur_token*sizeof(token_lens[0]) );
 141         }
 142 }
 143
 144 void Scanner::directToParser( Parser *toParser, char *tokFileName, int tokLine,
 145                 int tokColumn, int type, char *tokdata, int toklen )
 146 {
 147         InputLoc loc;
 148
 149         #ifdef LOG_TOKENS
 150         cerr << "scanner:" << tokLine << ":" << tokColumn <<
 151                         ": sending token to the parser " << Parser_lelNames[type];
 152         cerr << " " << toklen;
 153         if ( tokdata != 0 )
 154                 cerr << " " << tokdata;
 155         cerr << endl;
 156         #endif
 157
 158         loc.fileName = tokFileName;
 159         loc.line = tokLine;
 160         loc.col = tokColumn;
 161
 162         toParser->token( loc, type, tokdata, toklen );
 163 }
 164
 165 void Scanner::importToken( int token, char *start, char *end )
 166 {
 167         if ( cur_token == max_tokens )
 168                 flushImport();
 169
 170         token_data[cur_token] = token;
 171         if ( start == 0 ) {
 172                 token_strings[cur_token] = 0;
 173                 token_lens[cur_token] = 0;
 174         }
 175         else {
 176                 int toklen = end-start;
 177                 token_lens[cur_token] = toklen;
 178                 token_strings[cur_token] = new char[toklen+1];
 179                 memcpy( token_strings[cur_token], start, toklen );
 180                 token_strings[cur_token][toklen] = 0;
 181         }
 182         cur_token++;
 183 }
 184
 185 void Scanner::pass( int token, char *start, char *end )
 186 {
 187         if ( importMachines )
 188                 importToken( token, start, end );
 189         pass();
 190 }
 191
 192 void Scanner::pass()
 193 {
 194         updateCol();
 195
 196         /* If no errors and we are at the bottom of the include stack (the
 197          * source file listed on the command line) then write out the data. */
 198         if ( includeDepth == 0 && machineSpec == 0 && machineName == 0 )
 199                 xmlEscapeHost( output, ts, te-ts );
 200 }
 201
 202 /*
 203  * The scanner for processing sections, includes, imports, etc.
 204  */
 205
 206 %%{
 207         machine section_parse;
 208         alphtype int;
 209         write data;
 210 }%%
 211
 212
 213 void Scanner::init( )
 214 {
 215         %% write init;
 216 }
 217
 218 bool Scanner::active()
 219 {
 220         if ( ignoreSection )
 221                 return false;
 222
 223         if ( parser == 0 && ! parserExistsError ) {
 224                 scan_error() << "this specification has no name, nor does any previous"
 225                         " specification" << endl;
 226                 parserExistsError = true;
 227         }
 228
 229         if ( parser == 0 )
 230                 return false;
 231
 232         return true;
 233 }
 234
 235 ostream &Scanner::scan_error()
 236 {
 237         /* Maintain the error count. */
 238         gblErrorCount += 1;
 239         cerr << fileName << ":" << line << ":" << column << ": ";
 240         return cerr;
 241 }
 242
 243 bool Scanner::recursiveInclude( char *inclFileName, char *inclSectionName )
 244 {
 245         for ( IncludeStack::Iter si = includeStack; si.lte(); si++ ) {
 246                 if ( strcmp( si->fileName, inclFileName ) == 0 &&
 247                                 strcmp( si->sectionName, inclSectionName ) == 0 )
 248                 {
 249                         return true;
 250                 }
 251         }
 252         return false;
 253 }
 254
 255 void Scanner::updateCol()
 256 {
 257         char *from = lastnl;
 258         if ( from == 0 )
 259                 from = ts;
 260         //cerr << "adding " << te - from << " to column" << endl;
 261         column += te - from;
 262         lastnl = 0;
 263 }
 264
 265 %%{
 266         machine section_parse;
 267
 268         # Need the defines representing tokens.
 269         import "rlparse.h";
 270
	# Bookkeeping for the most recent word and literal of a statement.
	action clear_words {
		word = lit = 0;
		word_len = lit_len = 0;
	}
	action store_word {
		word = tokdata;
		word_len = toklen;
	}
	action store_lit {
		lit = tokdata;
		lit_len = toklen;
	}

	# Error reports, one per statement kind.
	action mach_err {
		scan_error() << "bad machine statement" << endl;
	}
	action incl_err {
		scan_error() << "bad include statement" << endl;
	}
	action import_err {
		scan_error() << "bad import statement" << endl;
	}
	action write_err {
		scan_error() << "bad write statement" << endl;
	}
 279
	# Runs when a machine statement is complete. Selects the parser that
	# receives the rest of the section, or marks the section as ignored.
	action handle_machine
	{
		/* The machine name is the word stored by the statement. */
		char *machine = word;

		if ( importMachines ) {
			/* ignoring section */
			ignoreSection = true;
			parser = 0;
		}
		else if ( inclSectionTarg == 0 ) {
			/* Not looking for an include target: use the parser registered
			 * under this name, creating it on first use. */
			ignoreSection = false;

			ParserDictEl *entry = parserDict.find( machine );
			if ( entry == 0 ) {
				entry = new ParserDictEl( machine );
				entry->value = new Parser( fileName, machine, sectionLoc );
				entry->value->init();
				parserDict.insert( entry );
			}

			parser = entry->value;
		}
		else if ( strcmp( inclSectionTarg, machine ) == 0 ) {
			/* found include target */
			ignoreSection = false;
			parser = inclToParser;
		}
		else {
			/* ignoring section */
			ignoreSection = true;
			parser = 0;
		}
	}

	machine_stmt =
		( KW_Machine TK_Word @store_word ';' ) @handle_machine
		<>err mach_err <>eof mach_err;
 313
	# Runs when an include statement is complete. The named section of the
	# named file is scanned by a nested Scanner that feeds the current parser.
	# The section name defaults to the current parser's section and the file
	# name defaults to the current file.
	action handle_include
	{
		if ( active() ) {
			char *inclSectionName = word;
			char *inclFileName = 0;

			/* Implement defaults for the input file and section name. */
			if ( inclSectionName == 0 )
				inclSectionName = parser->sectionName;

			if ( lit != 0 )
				inclFileName = prepareFileName( lit, lit_len );
			else
				inclFileName = fileName;

			/* Check for a recursive include structure. Add the current file/section
			 * name then check if what we are including is already in the stack. */
			includeStack.append( IncludeStackItem( fileName, parser->sectionName ) );

			if ( recursiveInclude( inclFileName, inclSectionName ) )
				scan_error() << "include: this is a recursive include operation" << endl;
			else {
				/* Open the input file for reading. The stream is a local so
				 * it is released on every path and outlives the nested
				 * scanner declared after it. */
				ifstream inFile( inclFileName );
				if ( ! inFile.is_open() ) {
					scan_error() << "include: could not open " <<
							inclFileName << " for reading" << endl;
				}
				else {
					/* Only scan when there is actually something to read;
					 * a failed open has already been reported above. */
					Scanner scanner( inclFileName, inFile, output, parser,
							inclSectionName, includeDepth+1, false );
					scanner.do_scan( );
				}
			}

			/* Remove the last element (len-1) */
			includeStack.remove( -1 );
		}
	}

	include_names = (
		TK_Word @store_word ( TK_Literal @store_lit )? |
		TK_Literal @store_lit
	) >clear_words;

	include_stmt =
		( KW_Include include_names ';' ) @handle_include
		<>err incl_err <>eof incl_err;
 362
	# Runs when an import statement is complete. The named file is scanned by
	# a nested Scanner in import mode, feeding the current parser.
	action handle_import
	{
		if ( active() ) {
			char *importFileName = prepareFileName( lit, lit_len );

			/* Open the input file for reading. The stream is a local so it
			 * is released on every path and outlives the nested scanner
			 * declared after it. */
			ifstream inFile( importFileName );
			if ( ! inFile.is_open() ) {
				scan_error() << "import: could not open " <<
						importFileName << " for reading" << endl;
			}
			else {
				/* Only scan when the file could be opened; a failed open
				 * has already been reported above. */
				Scanner scanner( importFileName, inFile, output, parser,
						0, includeDepth+1, true );
				scanner.do_scan( );

				/* Append a final token of type 0 with no text, then run the
				 * import scanner over whatever is still buffered. */
				scanner.importToken( 0, 0, 0 );
				scanner.flushImport();
			}
		}
	}

	import_stmt =
		( KW_Import TK_Literal @store_lit ';' ) @handle_import
		<>err import_err <>eof import_err;
 387
	# The three write actions emit a <write> element to the output. Each one
	# does so only for an active section and only when no machine spec or
	# machine name has been given.

	# Opening tag, carrying the section name and the current position.
	action write_command
	{
		if ( active() && machineSpec == 0 && machineName == 0 ) {
			output << "<write" " def_name=\"" << parser->sectionName << "\"";
			output << " line=\"" << line << "\"";
			output << " col=\"" << column << "\"" ">";
		}
	}

	# One <arg> element per word following the write keyword.
	action write_arg
	{
		if ( active() && machineSpec == 0 && machineName == 0 ) {
			output << "<arg>" << tokdata << "</arg>";
		}
	}

	# Closing tag, emitted at the terminating semicolon.
	action write_close
	{
		if ( active() && machineSpec == 0 && machineName == 0 ) {
			output << "</write>\n";
		}
	}

	write_stmt =
		( KW_Write @write_command
		( TK_Word @write_arg )+ ';' @write_close )
		<>err write_err <>eof write_err;
 415
	# Forwards a token that is not part of a machine, include, import or
	# write statement to the current parser, provided the section is active.
	action handle_token
	{
		if ( active() ) {
			directToParser( parser, fileName, line, column, type, tokdata, toklen );
		}
	}

	# Catch everything else.
	everything_else =
		^( KW_Machine | KW_Include | KW_Import | KW_Write ) @handle_token;

	main := (
		machine_stmt |
		include_stmt |
		import_stmt |
		write_stmt |
		everything_else
	)*;
 434 }%%
 435
 436 void Scanner::token( int type, char c )
 437 {
 438         token( type, &c, &c + 1 );
 439 }
 440
 441 void Scanner::token( int type )
 442 {
 443         token( type, 0, 0 );
 444 }
 445
 446 void Scanner::token( int type, char *start, char *end )
 447 {
 448         char *tokdata = 0;
 449         int toklen = 0;
 450         if ( start != 0 ) {
 451                 toklen = end-start;
 452                 tokdata = new char[toklen+1];
 453                 memcpy( tokdata, start, toklen );
 454                 tokdata[toklen] = 0;
 455         }
 456
 457         processToken( type, tokdata, toklen );
 458 }
 459
 460 void Scanner::processToken( int type, char *tokdata, int toklen )
 461 {
 462         int *p, *pe, *eof;
 463
 464
 465         if ( type < 0 )
 466                 p = pe = eof = 0;
 467         else {
 468                 p = &type;
 469                 pe = &type + 1;
 470                 eof = 0;
 471         }
 472
 473         %%{
 474                 machine section_parse;
 475                 write exec;
 476         }%%
 477
 478         updateCol();
 479
 480         /* Record the last token for use in controlling the scan of subsequent
 481          * tokens. */
 482         lastToken = type;
 483 }
 484
 485 void Scanner::startSection( )
 486 {
 487         parserExistsError = false;
 488
 489         if ( includeDepth == 0 ) {
 490                 if ( machineSpec == 0 && machineName == 0 )
 491                         output << "</host>\n";
 492         }
 493
 494         sectionLoc.fileName = fileName;
 495         sectionLoc.line = line;
 496         sectionLoc.col = 0;
 497 }
 498
 499 void Scanner::endSection( )
 500 {
 501         /* Execute the eof actions for the section parser. */
 502         processToken( -1, 0, 0 );
 503
 504         /* Close off the section with the parser. */
 505         if ( active() ) {
 506                 InputLoc loc;
 507                 loc.fileName = fileName;
 508                 loc.line = line;
 509                 loc.col = 0;
 510
 511                 parser->token( loc, TK_EndSection, 0, 0 );
 512         }
 513
 514         if ( includeDepth == 0 ) {
 515                 if ( machineSpec == 0 && machineName == 0 ) {
 516                         /* The end section may include a newline on the end, so
 517                          * we use the last line, which will count the newline. */
 518                         output << "<host line=\"" << line << "\">";
 519                 }
 520         }
 521 }
 522
 523 %%{
 524         machine rlscan;
 525
	# This is sent by the driver code.
	EOF = 0;

	# On every newline: move to the next line, restart the column count and
	# remember where the newline was seen.
	action inc_nl {
		line++;
		column = 0;
		lastnl = p;
	}
	NL = '\n' @inc_nl;

	# Identifiers, numbers, comments, and other common things.
	ident = ( alpha | '_' ) ( alpha | digit | '_' )*;
	number = digit+;
	hex_number = '0x' [0-9a-fA-F]+;

	c_comment =
		'/*' ( any | NL )* :>> '*/';

	cpp_comment =
		'//' [^\n]* NL;

	c_cpp_comment = c_comment | cpp_comment;

	ruby_comment = '#' [^\n]* NL;

	# These literal forms are common to host code and ragel.
	s_literal = "'" ([^'\\] | NL | '\\' (any | NL))* "'";
	d_literal = '"' ([^"\\] | NL | '\\' (any | NL))* '"';
	host_re_literal = '/' ([^/\\] | NL | '\\' (any | NL))* '/';

	whitespace = [ \t] | NL;
	pound_comment = '#' [^\n]* NL;
 558
	# An inline block of code for Ruby.
	inline_code_ruby := |*
		# Inline expression keywords.
		"fpc" => { token( KW_PChar ); };
		"fc" => { token( KW_Char ); };
		"fcurs" => { token( KW_CurState ); };
		"ftargs" => { token( KW_TargState ); };
		"fentry" => { whitespaceOn = false; token( KW_Entry ); };

		# Inline statement keywords. All of them except fexec turn the
		# sending of whitespace tokens off.
		"fhold" => { whitespaceOn = false; token( KW_Hold ); };
		"fexec" => { token( KW_Exec, 0, 0 ); };
		"fgoto" => { whitespaceOn = false; token( KW_Goto ); };
		"fnext" => { whitespaceOn = false; token( KW_Next ); };
		"fcall" => { whitespaceOn = false; token( KW_Call ); };
		"fret" => { whitespaceOn = false; token( KW_Ret ); };
		"fbreak" => { whitespaceOn = false; token( KW_Break ); };

		# Words and numbers.
		ident => { token( TK_Word, ts, te ); };
		number => { token( TK_UInt, ts, te ); };
		hex_number => { token( TK_Hex, ts, te ); };

		# String literals and host regular expression literals.
		( s_literal | d_literal | host_re_literal ) => {
			token( IL_Literal, ts, te );
		};

		# Whitespace is only sent while whitespaceOn is set.
		whitespace+ => {
			if ( whitespaceOn ) {
				token( IL_WhiteSpace, ts, te );
			}
		};

		ruby_comment => { token( IL_Comment, ts, te ); };

		"::" => { token( TK_NameSep, ts, te ); };

		# Some symbols are sent to the parser with their character value as
		# the token type (as opposed to being sent as anonymous symbols)
		# because they are part of the sequences which we interpret. The * ) ;
		# symbols cause whitespace parsing to come back on. This gets turned
		# off by some keywords.

		# A semicolon also ends a semi terminated block.
		";" => {
			whitespaceOn = true;
			token( *ts, ts, te );
			if ( inlineBlockType == SemiTerminated ) {
				fret;
			}
		};

		[*)] => { whitespaceOn = true; token( *ts, ts, te ); };

		[,(] => { token( *ts, ts, te ); };

		# Opening brace: sent as a plain symbol, one level deeper.
		'{' => {
			token( IL_Symbol, ts, te );
			curly_count += 1;
		};

		# Closing brace: one level up.
		'}' => {
			curly_count -= 1;
			if ( curly_count == 0 && inlineBlockType == CurlyDelimited ) {
				/* Inline code block ends. */
				token( '}' );
				fret;
			}
			else {
				/* Either a semi terminated inline block or only the closing
				 * brace of some inner scope, not the block's closing brace. */
				token( IL_Symbol, ts, te );
			}
		};

		EOF => { scan_error() << "unterminated code block" << endl; };

		# Send every other character as a symbol.
		any => { token( IL_Symbol, ts, te ); };
	*|;
 660
 661
	# An inline block of code for languages other than Ruby.
	inline_code := |*
		# Inline expression keywords.
		"fpc" => { token( KW_PChar ); };
		"fc" => { token( KW_Char ); };
		"fcurs" => { token( KW_CurState ); };
		"ftargs" => { token( KW_TargState ); };
		"fentry" => { whitespaceOn = false; token( KW_Entry ); };

		# Inline statement keywords. All of them except fexec turn the
		# sending of whitespace tokens off.
		"fhold" => { whitespaceOn = false; token( KW_Hold ); };
		"fexec" => { token( KW_Exec, 0, 0 ); };
		"fgoto" => { whitespaceOn = false; token( KW_Goto ); };
		"fnext" => { whitespaceOn = false; token( KW_Next ); };
		"fcall" => { whitespaceOn = false; token( KW_Call ); };
		"fret" => { whitespaceOn = false; token( KW_Ret ); };
		"fbreak" => { whitespaceOn = false; token( KW_Break ); };

		# Words and numbers.
		ident => { token( TK_Word, ts, te ); };
		number => { token( TK_UInt, ts, te ); };
		hex_number => { token( TK_Hex, ts, te ); };

		# String literals.
		( s_literal | d_literal ) => {
			token( IL_Literal, ts, te );
		};

		# Whitespace is only sent while whitespaceOn is set.
		whitespace+ => {
			if ( whitespaceOn ) {
				token( IL_WhiteSpace, ts, te );
			}
		};

		c_cpp_comment => { token( IL_Comment, ts, te ); };

		"::" => { token( TK_NameSep, ts, te ); };

		# Some symbols are sent to the parser with their character value as
		# the token type (as opposed to being sent as anonymous symbols)
		# because they are part of the sequences which we interpret. The * ) ;
		# symbols cause whitespace parsing to come back on. This gets turned
		# off by some keywords.

		# A semicolon also ends a semi terminated block.
		";" => {
			whitespaceOn = true;
			token( *ts, ts, te );
			if ( inlineBlockType == SemiTerminated ) {
				fret;
			}
		};

		[*)] => { whitespaceOn = true; token( *ts, ts, te ); };

		[,(] => { token( *ts, ts, te ); };

		# Opening brace: sent as a plain symbol, one level deeper.
		'{' => {
			token( IL_Symbol, ts, te );
			curly_count += 1;
		};

		# Closing brace: one level up.
		'}' => {
			curly_count -= 1;
			if ( curly_count == 0 && inlineBlockType == CurlyDelimited ) {
				/* Inline code block ends. */
				token( '}' );
				fret;
			}
			else {
				/* Either a semi terminated inline block or only the closing
				 * brace of some inner scope, not the block's closing brace. */
				token( IL_Symbol, ts, te );
			}
		};

		EOF => { scan_error() << "unterminated code block" << endl; };

		# Send every other character as a symbol.
		any => { token( IL_Symbol, ts, te ); };
	*|;
 763
	# The inside of an OR expression, entered after an opening bracket.
	or_literal := |*
		# Escape sequences in OR expressions.
		'\\0' => { token( RE_Char, '\0' ); };
		'\\a' => { token( RE_Char, '\a' ); };
		'\\b' => { token( RE_Char, '\b' ); };
		'\\t' => { token( RE_Char, '\t' ); };
		'\\n' => { token( RE_Char, '\n' ); };
		'\\v' => { token( RE_Char, '\v' ); };
		'\\f' => { token( RE_Char, '\f' ); };
		'\\r' => { token( RE_Char, '\r' ); };

		# A backslash followed by a newline sends no token.
		'\\\n' => { updateCol(); };

		# Any other escaped character is sent without the backslash.
		'\\' any => { token( RE_Char, ts+1, te ); };

		# Range dash in an OR expression.
		'-' => { token( RE_Dash, 0, 0 ); };

		# Terminate an OR expression.
		']' => {
			token( RE_SqClose );
			fret;
		};

		EOF => { scan_error() << "unterminated OR literal" << endl; };

		# Characters in an OR expression.
		[^\]] => { token( RE_Char, ts, te ); };

	*|;
 791
	# The inside of a ragel regular expression literal.
	ragel_re_literal := |*
		# Escape sequences in regular expressions.
		'\\0' => { token( RE_Char, '\0' ); };
		'\\a' => { token( RE_Char, '\a' ); };
		'\\b' => { token( RE_Char, '\b' ); };
		'\\t' => { token( RE_Char, '\t' ); };
		'\\n' => { token( RE_Char, '\n' ); };
		'\\v' => { token( RE_Char, '\v' ); };
		'\\f' => { token( RE_Char, '\f' ); };
		'\\r' => { token( RE_Char, '\r' ); };

		# A backslash followed by a newline sends no token.
		'\\\n' => { updateCol(); };

		# Any other escaped character is sent without the backslash.
		'\\' any => { token( RE_Char, ts+1, te ); };

		# Terminate a regular expression, with an optional trailing i, and
		# go back to scanning parser definitions.
		'/' [i]? => {
			token( RE_Slash, ts, te );
			fgoto parser_def;
		};

		# Special characters.
		'.' => { token( RE_Dot ); };
		'*' => { token( RE_Star ); };

		# An opening bracket, plain or negated, starts an OR expression.
		'[' => {
			token( RE_SqOpen );
			fcall or_literal;
		};
		'[^' => {
			token( RE_SqOpenNeg );
			fcall or_literal;
		};

		EOF => { scan_error() << "unterminated regular expression" << endl; };

		# Characters in a regular expression.
		[^\/] => { token( RE_Char, ts, te ); };
	*|;
 825
 826         # We need a separate token space here to avoid the ragel keywords.
             # parser_def switches to this scanner with fgoto right after the 'write'
             # keyword. There are no keyword rules here, so every identifier is passed
             # to token() as TK_Word. The terminating ';' switches back to parser_def.
 827         write_statement := |*
 828                 ident => { token( TK_Word, ts, te ); } ;
                     # Whitespace produces no token; only updateCol() is called.
 829                 [ \t\n]+ => { updateCol(); };
 830                 ';' => { token( ';' ); fgoto parser_def; };
 831
                     # End of input before the terminating ';' is reported as an error.
 832                 EOF => {
 833                         scan_error() << "unterminated write statement" << endl;
 834                 };
 835         *|;
 836
 837         # Parser definitions.
             # Scanner for the inside of a ragel section. main and main_ruby enter it
             # with fcall when they see a section opener. It returns with fret at the
             # section-closing marker, or at a newline when singleLineSpec is set.
 838         parser_def := |*
                     # Keywords. They are listed ahead of the ident rule further down.
                     # NOTE(review): presumably that ordering is what lets a keyword win
                     # over ident when both match the same text -- confirm against the
                     # Ragel scanner tie-breaking rules.
 839                 'machine' => { token( KW_Machine ); };
 840                 'include' => { token( KW_Include ); };
 841                 'import' => { token( KW_Import ); };
                     # After 'write' scanning continues in write_statement, which has no
                     # keyword rules of its own.
 842                 'write' => {
 843                         token( KW_Write );
 844                         fgoto write_statement;
 845                 };
 846                 'action' => { token( KW_Action ); };
 847                 'alphtype' => { token( KW_AlphType ); };
 848                 'prepush' => { token( KW_PrePush ); };
 849                 'postpop' => { token( KW_PostPop ); };
 850
 851                 # FIXME: Enable this post 5.17.
 852                 # 'range' => { token( KW_Range ); };
 853
                     # getkey, access and variable each pass their keyword to token(),
                     # set inlineBlockType to SemiTerminated and then call the inline
                     # code scanner selected by hostLang->lang.
                     # NOTE(review): inline_code and inline_code_ruby are defined outside
                     # this part of the file; presumably they return at the terminating
                     # ';' for a SemiTerminated block -- confirm there.
 854                 'getkey' => {
 855                         token( KW_GetKey );
 856                         inlineBlockType = SemiTerminated;
 857                         if ( hostLang->lang == HostLang::Ruby )
 858                                 fcall inline_code_ruby;
 859                         else
 860                                 fcall inline_code;
 861                 };
 862                 'access' => {
 863                         token( KW_Access );
 864                         inlineBlockType = SemiTerminated;
 865                         if ( hostLang->lang == HostLang::Ruby )
 866                                 fcall inline_code_ruby;
 867                         else
 868                                 fcall inline_code;
 869                 };
 870                 'variable' => {
 871                         token( KW_Variable );
 872                         inlineBlockType = SemiTerminated;
 873                         if ( hostLang->lang == HostLang::Ruby )
 874                                 fcall inline_code_ruby;
 875                         else
 876                                 fcall inline_code;
 877                 };
 878                 'when' => { token( KW_When ); };
 879                 'inwhen' => { token( KW_InWhen ); };
 880                 'outwhen' => { token( KW_OutWhen ); };
 881                 'eof' => { token( KW_Eof ); };
 882                 'err' => { token( KW_Err ); };
 883                 'lerr' => { token( KW_Lerr ); };
 884                 'to' => { token( KW_To ); };
 885                 'from' => { token( KW_From ); };
 886                 'export' => { token( KW_Export ); };
 887
 888                 # Identifiers.
 889                 ident => { token( TK_Word, ts, te ); } ;
 890
 891                 # Numbers
 892                 number => { token( TK_UInt, ts, te ); };
 893                 hex_number => { token( TK_Hex, ts, te ); };
 894
 895                 # Literals, with optionals. The optional trailing 'i' is part of the
                     # text handed to token().
 896                 ( s_literal | d_literal ) [i]?
 897                         => { token( TK_Literal, ts, te ); };
 898
                     # Opening of an OR expression. or_literal is entered with fcall and
                     # comes back here with fret when it sees the closing ']'.
 899                 '[' => { token( RE_SqOpen ); fcall or_literal; };
 900                 '[^' => { token( RE_SqOpenNeg ); fcall or_literal; };
 901
                     # Opening of a regular expression. ragel_re_literal is entered with
                     # fgoto and jumps back here itself at the closing '/'.
 902                 '/' => { token( RE_Slash ); fgoto ragel_re_literal; };
 903
 904                 # Ignore.
 905                 pound_comment => { updateCol(); };
 906
 907                 ':=' => { token( TK_ColonEquals ); };
 908
 909                 # To State Actions.
 910                 ">~" => { token( TK_StartToState ); };
 911                 "$~" => { token( TK_AllToState ); };
 912                 "%~" => { token( TK_FinalToState ); };
 913                 "<~" => { token( TK_NotStartToState ); };
 914                 "@~" => { token( TK_NotFinalToState ); };
 915                 "<>~" => { token( TK_MiddleToState ); };
 916
 917                 # From State actions
 918                 ">*" => { token( TK_StartFromState ); };
 919                 "$*" => { token( TK_AllFromState ); };
 920                 "%*" => { token( TK_FinalFromState ); };
 921                 "<*" => { token( TK_NotStartFromState ); };
 922                 "@*" => { token( TK_NotFinalFromState ); };
 923                 "<>*" => { token( TK_MiddleFromState ); };
 924
 925                 # EOF Actions.
 926                 ">/" => { token( TK_StartEOF ); };
 927                 "$/" => { token( TK_AllEOF ); };
 928                 "%/" => { token( TK_FinalEOF ); };
 929                 "</" => { token( TK_NotStartEOF ); };
 930                 "@/" => { token( TK_NotFinalEOF ); };
 931                 "<>/" => { token( TK_MiddleEOF ); };
 932
 933                 # Global Error actions.
 934                 ">!" => { token( TK_StartGblError ); };
 935                 "$!" => { token( TK_AllGblError ); };
 936                 "%!" => { token( TK_FinalGblError ); };
 937                 "<!" => { token( TK_NotStartGblError ); };
 938                 "@!" => { token( TK_NotFinalGblError ); };
 939                 "<>!" => { token( TK_MiddleGblError ); };
 940
 941                 # Local error actions.
 942                 ">^" => { token( TK_StartLocalError ); };
 943                 "$^" => { token( TK_AllLocalError ); };
 944                 "%^" => { token( TK_FinalLocalError ); };
 945                 "<^" => { token( TK_NotStartLocalError ); };
 946                 "@^" => { token( TK_NotFinalLocalError ); };
 947                 "<>^" => { token( TK_MiddleLocalError ); };
 948
 949                 # Middle.
 950                 "<>" => { token( TK_Middle ); };
 951
 952                 # Conditions.
 953                 '>?' => { token( TK_StartCond ); };
 954                 '$?' => { token( TK_AllCond ); };
 955                 '%?' => { token( TK_LeavingCond ); };
 956
 957                 '..' => { token( TK_DotDot ); };
 958                 '**' => { token( TK_StarStar ); };
 959                 '--' => { token( TK_DashDash ); };
 960                 '->' => { token( TK_Arrow ); };
 961                 '=>' => { token( TK_DoubleArrow ); };
 962
 963                 ":>"  => { token( TK_ColonGt ); };
 964                 ":>>" => { token( TK_ColonGtGt ); };
 965                 "<:"  => { token( TK_LtColon ); };
 966
 967                 # Opening of longest match.
 968                 "|*" => { token( TK_BarStar ); };
 969
 970                 # Separator for name references.
 971                 "::" => { token( TK_NameSep, ts, te ); };
 972
                     # End of a ragel section: close it with endSection() and return to
                     # the scanner that called parser_def.
 973                 '}%%' => {
 974                         updateCol();
 975                         endSection();
 976                         fret;
 977                 };
 978
 979                 [ \t\r]+ => { updateCol(); };
 980
 981                 # If we are in a single line machine then newline may end the spec.
 982                 NL => {
 983                         updateCol();
 984                         if ( singleLineSpec ) {
 985                                 endSection();
 986                                 fret;
 987                         }
 988                 };
 989
                     # An opening curly. Directly after KW_Export or KW_Entry only the
                     # '{' token is passed on. Anywhere else the '{' token is passed on
                     # and the inline code scanner for the host language is called, with
                     # curly_count starting at 1 and the block marked CurlyDelimited.
 990                 '{' => {
 991                         if ( lastToken == KW_Export || lastToken == KW_Entry )
 992                                 token( '{' );
 993                         else {
 994                                 token( '{' );
 995                                 curly_count = 1;
 996                                 inlineBlockType = CurlyDelimited;
 997                                 if ( hostLang->lang == HostLang::Ruby )
 998                                         fcall inline_code_ruby;
 999                                 else
1000                                         fcall inline_code;
1001                         }
1002                 };
1003
1004                 EOF => {
1005                         scan_error() << "unterminated ragel section" << endl;
1006                 };
1007
                     # Any other single character is passed to token() as its own value.
1008                 any => { token( *ts ); } ;
1009         *|;
1010
1011         # Outside code scanner. These tokens get passed through.
             # Ruby variant: do_scan starts in this machine when hostLang->lang is
             # HostLang::Ruby. Compared with main it recognizes ruby_comment instead of
             # c_cpp_comment, also treats host_re_literal as a literal, and has no
             # 'define' rule.
1012         main_ruby := |*
                     # Words, numbers and literals are handed to pass() with an IMP_*
                     # token type and the matched text; comments and whitespace are
                     # handed over with no arguments.
1013                 ident => { pass( IMP_Word, ts, te ); };
1014                 number => { pass( IMP_UInt, ts, te ); };
1015                 ruby_comment => { pass(); };
1016                 ( s_literal | d_literal | host_re_literal )
1017                         => { pass( IMP_Literal, ts, te ); };
1018
                     # Multi-line section opener: clear singleLineSpec, start the section
                     # and call parser_def, which comes back here with fret.
1019                 '%%{' => {
1020                         updateCol();
1021                         singleLineSpec = false;
1022                         startSection();
1023                         fcall parser_def;
1024                 };
                     # Single-line section opener: same, but singleLineSpec is set so
                     # that parser_def may end the section at the next newline.
1025                 '%%' => {
1026                         updateCol();
1027                         singleLineSpec = true;
1028                         startSection();
1029                         fcall parser_def;
1030                 };
1031                 whitespace+ => { pass(); };
                     # EOF is matched with no action attached.
1032                 EOF;
                     # Any other single character is passed on as its own token type,
                     # with null text pointers.
1033                 any => { pass( *ts, 0, 0 ); };
1034         *|;
1035
1036         # Outside code scanner. These tokens get passed through.
             # do_scan starts in this machine for every host language other than Ruby.
1037         main := |*
                     # 'define' is passed on as IMP_Define with no text. The IMP_* token
                     # types are the ones the inline_token_scan machine at the top of
                     # this file matches on (for example IMP_Define IMP_Word IMP_UInt).
1038                 'define' => { pass( IMP_Define, 0, 0 ); };
1039                 ident => { pass( IMP_Word, ts, te ); };
1040                 number => { pass( IMP_UInt, ts, te ); };
                     # Comments and whitespace are handed to pass() with no arguments.
1041                 c_cpp_comment => { pass(); };
1042                 ( s_literal | d_literal ) => { pass( IMP_Literal, ts, te ); };
1043
                     # Multi-line section opener: clear singleLineSpec, start the section
                     # and call parser_def, which comes back here with fret.
1044                 '%%{' => {
1045                         updateCol();
1046                         singleLineSpec = false;
1047                         startSection();
1048                         fcall parser_def;
1049                 };
                     # Single-line section opener: same, but singleLineSpec is set so
                     # that parser_def may end the section at the next newline.
1050                 '%%' => {
1051                         updateCol();
1052                         singleLineSpec = true;
1053                         startSection();
1054                         fcall parser_def;
1055                 };
1056                 whitespace+ => { pass(); };
                     # EOF is matched with no action attached.
1057                 EOF;
                     # Any other single character is passed on as its own token type,
                     # with null text pointers.
1058                 any => { pass( *ts, 0, 0 ); };
1059         *|;
1060 }%%
1061
1062 %% write data;
1063
1064 void Scanner::do_scan()
1065 {
             /* Runs the rlscan machine over everything that can be read from `input`.
              * Data is read in chunks into a heap buffer that starts at 8 bytes and
              * is doubled whenever it is completely full at the start of a pass. The
              * bytes from ts onward are moved to the front of the buffer between
              * reads so that an unfinished token stays contiguous. */
1066         int bufsize = 8;
1067         char *buf = new char[bufsize];
             /* have: number of bytes carried over at the front of buf from the
              * previous pass. cs is set and tested below; act and top are not touched
              * by the hand-written code in this function.
              * NOTE(review): presumably act, top and stack are used by the code that
              * the write init / write exec directives expand to -- confirm. */
1068         int cs, act, have = 0;
1069         int top;
1070
1071         /* The stack is two deep, one level for going into ragel defs from the main
1072          * machines which process outside code, and another for going into or literals
1073          * from either a ragel spec, or a regular expression. */
             /* parser_def also uses fcall to enter inline_code / inline_code_ruby.
              * NOTE(review): those machines are defined elsewhere; confirm they do
              * not nest a further fcall beyond the two levels available here. */
1074         int stack[2];
             /* curly_count, singleLineSpec and inlineBlockType are read and written
              * by the scanner actions (see the '{' and NL rules of parser_def and the
              * section openers in main / main_ruby). */
1075         int curly_count = 0;
1076         bool execute = true;
1077         bool singleLineSpec = false;
1078         InlineBlockType inlineBlockType = CurlyDelimited;
1079
1080         /* Init the section parser and the character scanner. */
1081         init();
1082         %% write init;
1083
1084         /* Set up the start state. FIXME: After 5.20 is released the nocs write
1085          * init option should be used, the main machine eliminated and this statement moved
1086          * above the write init. */
1087         if ( hostLang->lang == HostLang::Ruby )
1088                 cs = rlscan_en_main_ruby;
1089         else
1090                 cs = rlscan_en_main;
1091
             /* One pass per read. execute is cleared on the pass that sees end of
              * input, so the machine still runs once with eof set. */
1092         while ( execute ) {
                     /* New data goes in right behind the bytes carried over. */
1093                 char *p = buf + have;
1094                 int space = bufsize - have;
1095
1096                 if ( space == 0 ) {
1097                         /* We filled up the buffer trying to scan a token. Grow it. */
1098                         bufsize = bufsize * 2;
1099                         char *newbuf = new char[bufsize];
1100
1101                         /* Recompute p and space. */
1102                         p = newbuf + have;
1103                         space = bufsize - have;
1104
1105                         /* Patch up pointers possibly in use. */
                             /* NOTE(review): te is rebased without a null check, unlike
                              * ts -- confirm te always points into buf at this point. */
1106                         if ( ts != 0 )
1107                                 ts = newbuf + ( ts - buf );
1108                         te = newbuf + ( te - buf );
1109
1110                         /* Copy the new buffer in. */
1111                         memcpy( newbuf, buf, have );
1112                         delete[] buf;
1113                         buf = newbuf;
1114                 }
1115
                     /* Read at most `space` bytes; gcount() tells how many arrived. */
1116                 input.read( p, space );
1117                 int len = input.gcount();
1118                 char *pe = p + len;
1119
1120                 /* If we see eof then append the eof var. */
                     /* A read that delivers no bytes is taken as end of input: eof is
                      * pointed at pe and this becomes the last pass of the loop. */
1121                 char *eof = 0;
1122                 if ( len == 0 ) {
1123                         eof = pe;
1124                         execute = false;
1125                 }
1126
1127                 %% write exec;
1128
1129                 /* Check if we failed. */
1130                 if ( cs == rlscan_error ) {
1131                         /* Machine failed before finding a token. I'm not yet sure if this
1132                          * is reachable. */
1133                         scan_error() << "scanner error" << endl;
1134                         exit(1);
1135                 }
1136
1137                 /* Decide if we need to preserve anything. */
                     /* NOTE(review): presumably a non-null ts means the machine stopped
                      * in the middle of a token -- confirm against the Ragel manual. */
1138                 char *preserve = ts;
1139
1140                 /* Now set up the prefix. */
1141                 if ( preserve == 0 )
1142                         have = 0;
1143                 else {
1144                         /* There is data that needs to be shifted over. */
1145                         have = pe - preserve;
1146                         memmove( buf, preserve, have );
                             /* Rebase ts and te by the distance the data was moved.
                              * NOTE(review): in this branch preserve is a non-null copy
                              * of ts, so the ts != 0 test below always holds. */
1147                         unsigned int shiftback = preserve - buf;
1148                         if ( ts != 0 )
1149                                 ts -= shiftback;
1150                         te -= shiftback;
1151
                             /* NOTE(review): this value is never read; preserve is
                              * declared anew on the next pass. */
1152                         preserve = buf;
1153                 }
1154         }
1155
1156         delete[] buf;
1157 }