ragel/rlscan.rl

   1 /*
   2  *  Copyright 2006-2007 Adrian Thurston <thurston@cs.queensu.ca>
   3  */
   4
   5 /*  This file is part of Ragel.
   6  *
   7  *  Ragel is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  Ragel is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with Ragel; if not, write to the Free Software
  19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  */
  21
  22 #include <iostream>
  23 #include <fstream>
  24 #include <string.h>
  25
  26 #include "ragel.h"
  27 #include "rlscan.h"
  28
  29 using std::ifstream;
  30 using std::istream;
  31 using std::ostream;
  32 using std::cout;
  33 using std::cerr;
  34 using std::endl;
  35
  36 /*
  37  * The Scanner for Importing
  38  */
  39
  40 #define IMP_Word 128
  41 #define IMP_Literal 129
  42 #define IMP_Number 130
  43 #define IMP_Define 131
     /* Token codes that the character scanner hands to ImportScanner::token().
      * The same four values are repeated as machine constants below, so the
      * two lists have to be kept in sync. */
  44
  45 %%{
  46         machine inline_token_scan;
  47         alphtype int;
  48         access tok_;
  49
             # Same values as the IMP_ macros in the host code above.
  50         IMP_Word = 128;
  51         IMP_Literal = 129;
  52         IMP_Number = 130;
  53         IMP_Define = 131;
  54
             # Longest-match scanner over a stream of integer token codes. The
             # three recognised shapes only print a tag to cerr; any other single
             # token is consumed without an action.
  55         main := |*
  56                 IMP_Define IMP_Word IMP_Number => { cerr << ( "define" ) << endl; };
  57                 IMP_Word '=' IMP_Number => { cerr << ( "const1" ) << endl; };
  58                 IMP_Word '=' IMP_Literal => { cerr << ( "const2" ) << endl; };
  59                 any;
  60         *|;
  61 }%%
  62
  63 %% write data;
  64
  65 void ImportScanner::token( int token, char *start, char *end )
  66 {
  67         if ( cur_token == max_tokens ) {
  68                 int *p = token_data;
  69                 int *pe = token_data + cur_token;
  70
  71                 %% write init;
  72                 %% write exec;
  73
  74                 if ( tok_tokstart == 0 )
  75                         cur_token = 0;
  76                 else {
  77                         cerr << "BLOCK BREAK" << endl;
  78                         cur_token = pe - tok_tokstart;
  79                         memmove( token_data, tok_tokstart, cur_token*sizeof(int) );
  80                 }
  81         }
  82
  83         token_data[cur_token++] = token;
  84 }
  85
  86 %%{
  87         machine inline_scan;
  88         access chr_;
  89
  90         # This is sent by the driver code.
  91         EOF = 0;
  92         NL = '\n';
  93
  94         # Identifiers, numbers, comments, and other common things.
  95         ident = ( alpha | '_' ) ( alpha |digit |'_' )*;
  96         number = digit+;
             # NOTE(review): hex_number is defined here but is not referenced by
             # the main scanner of this machine.
  97         hex_number = '0x' [0-9a-fA-F]+;
  98
  99         c_comment =
 100                 '/*' ( any | NL )* :>> '*/';
 101
 102         cpp_comment =
 103                 '//' [^\n]* NL;
 104
 105         c_cpp_comment = c_comment | cpp_comment;
 106
 107         # These literal forms are common to C-like host code and ragel.
 108         s_literal = "'" ([^'\\] | NL | '\\' (any | NL))* "'";
 109         d_literal = '"' ([^"\\] | NL | '\\' (any | NL))* '"';
 110
 111         whitespace = [ \t] | NL;
 112
 113
 114         # Outside code scanner. These tokens get passed through.
             # Words, numbers and literals are forwarded to ImportScanner::token()
             # with an IMP_ code and their text bounds; any other single character
             # is forwarded with its own value as the code. Comments, whitespace
             # and the EOF marker have no action and are dropped.
 115         main := |*
 116                 'define' => { token( IMP_Define, 0, 0 ); };
 117                 ident => { token( IMP_Word, chr_tokstart, chr_tokend ); };
 118                 number => { token( IMP_Number, chr_tokstart, chr_tokend ); };
 119                 c_cpp_comment;
 120                 s_literal | d_literal => { token( IMP_Literal, chr_tokstart, chr_tokend ); };
 121                 whitespace+;
 122                 EOF;
 123                 any => { token( *chr_tokstart, 0, 0 ); };
 124         *|;
 125 }%%
 126
 127 %% write data;
 128
 129 void ImportScanner::do_scan()
 130 {
 131         int bufsize = 8;
 132         char *buf = new char[bufsize];
 133         const char last_char = 0;
 134         int chr_cs, chr_act, have = 0;
 135         bool execute = true;
 136
 137         /* Init the section parser and the character scanner. */
 138         %% write init;
 139
 140         while ( execute ) {
 141                 char *p = buf + have;
 142                 int space = bufsize - have;
 143
 144                 if ( space == 0 ) {
 145                         /* We filled up the buffer trying to scan a token. Grow it. */
 146                         bufsize = bufsize * 2;
 147                         char *newbuf = new char[bufsize];
 148
 149                         /* Recompute p and space. */
 150                         p = newbuf + have;
 151                         space = bufsize - have;
 152
 153                         /* Patch up pointers possibly in use. */
 154                         if ( chr_tokstart != 0 )
 155                                 chr_tokstart = newbuf + ( chr_tokstart - buf );
 156                         chr_tokend = newbuf + ( chr_tokend - buf );
 157
 158                         /* Copy the new buffer in. */
 159                         memcpy( newbuf, buf, have );
 160                         delete[] buf;
 161                         buf = newbuf;
 162                 }
 163
 164                 input.read( p, space );
 165                 int len = input.gcount();
 166
 167                 /* If we see eof then append the EOF char. */
 168                 if ( len == 0 ) {
 169                         p[0] = last_char, len = 1;
 170                         execute = false;
 171                 }
 172
 173                 char *pe = p + len;
 174                 %% write exec;
 175
 176                 /* Check if we failed. */
 177                 if ( chr_cs == inline_scan_error ) {
 178                         /* Machine failed before finding a token. I'm not yet sure if this
 179                          * is reachable. */
 180                         scan_error() << "scanner error" << endl;
 181                         exit(1);
 182                 }
 183
 184                 /* Decide if we need to preserve anything. */
 185                 char *preserve = chr_tokstart;
 186
 187                 /* Now set up the prefix. */
 188                 if ( preserve == 0 )
 189                         have = 0;
 190                 else {
 191                         /* There is data that needs to be shifted over. */
 192                         have = pe - preserve;
 193                         memmove( buf, preserve, have );
 194                         unsigned int shiftback = preserve - buf;
 195                         if ( chr_tokstart != 0 )
 196                                 chr_tokstart -= shiftback;
 197                         chr_tokend -= shiftback;
 198
 199                         preserve = buf;
 200                 }
 201         }
 202
 203         delete[] buf;
 204 }
 205
 206 ostream &ImportScanner::scan_error()
 207 {
 208         /* Maintain the error count. */
 209         gblErrorCount += 1;
 210         cerr << fileName << ":" << line << ":" << column << ": ";
 211         return cerr;
 212 }
 213
 214
 215 /*
 216  * The Ragel Scanner
 217  */
 218
 219 enum InlineBlockType
 220 {
             /* How the inline_code scanner decides that an inline block is over:
              * CurlyDelimited blocks end at the closing brace that brings
              * curly_count back to zero, SemiTerminated blocks end at the first
              * semicolon. */
 221         CurlyDelimited,
 222         SemiTerminated
 223 };
 224
 225 %%{
             # Writes the data of the section_parse machine at this point of the
             # host file. The grammar itself is given in a later section that
             # names the same machine.
 226         machine section_parse;
 227         alphtype int;
 228         write data;
 229 }%%
 230
 231
 232 void Scanner::init( )
 233 {
             /* The body is the generated initialisation code; the machine named
              * most recently before this point is section_parse. */
 234         %% write init;
 235 }
 236
 237 bool Scanner::active()
 238 {
 239         if ( ignoreSection )
 240                 return false;
 241
 242         if ( parser == 0 && ! parserExistsError ) {
 243                 scan_error() << "there is no previous specification name" << endl;
 244                 parserExistsError = true;
 245         }
 246
 247         if ( parser == 0 )
 248                 return false;
 249
 250         return true;
 251 }
 252
 253 ostream &Scanner::scan_error()
 254 {
 255         /* Maintain the error count. */
 256         gblErrorCount += 1;
 257         cerr << fileName << ":" << line << ":" << column << ": ";
 258         return cerr;
 259 }
 260
 261 bool Scanner::recursiveInclude( char *inclFileName, char *inclSectionName )
 262 {
 263         for ( IncludeStack::Iter si = includeStack; si.lte(); si++ ) {
 264                 if ( strcmp( si->fileName, inclFileName ) == 0 &&
 265                                 strcmp( si->sectionName, inclSectionName ) == 0 )
 266                 {
 267                         return true;
 268                 }
 269         }
 270         return false;
 271 }
 272
 273 void Scanner::updateCol()
 274 {
 275         char *from = lastnl;
 276         if ( from == 0 )
 277                 from = tokstart;
 278         //cerr << "adding " << tokend - from << " to column" << endl;
 279         column += tokend - from;
 280         lastnl = 0;
 281 }
 282
 283 void Scanner::token( int type, char c )
 284 {
 285         token( type, &c, &c + 1 );
 286 }
 287
 288 void Scanner::token( int type )
 289 {
 290         token( type, 0, 0 );
 291 }
 292
 293 %%{
 294         machine section_parse;
 295
 296         # This relies on the kelbt implementation and the order
 297         # that tokens are declared.
 298         KW_Machine = 128;
 299         KW_Include = 129;
 300         KW_Write = 130;
 301         TK_Word = 131;
 302         TK_Literal = 132;
 303
             # word and lit (with their lengths) keep the payload of the last
             # TK_Word and TK_Literal seen, for use by the statement handlers.
 304         action clear_words { word = lit = 0; word_len = lit_len = 0; }
 305         action store_word { word = tokdata; word_len = toklen; }
 306         action store_lit { lit = tokdata; lit_len = toklen; }
 307
             # One error report per statement kind.
 308         action mach_err { scan_error() << "bad machine statement" << endl; }
 309         action incl_err { scan_error() << "bad include statement" << endl; }
 310         action write_err { scan_error() << "bad write statement" << endl; }
 311
 312         action handle_machine
 313         {
 314                 /* Assign a name to the machine. */
 315                 char *machine = word;
 316
                     /* Three cases: no include target is set (look the parser up
                      * by name, creating it on first use), this section is the
                      * include target, or this section is to be ignored. */
 317                 if ( inclSectionTarg == 0 ) {
 318                         ignoreSection = false;
 319
 320                         ParserDictEl *pdEl = parserDict.find( machine );
 321                         if ( pdEl == 0 ) {
 322                                 pdEl = new ParserDictEl( machine );
 323                                 pdEl->value = new Parser( fileName, machine, sectionLoc );
 324                                 pdEl->value->init();
 325                                 parserDict.insert( pdEl );
 326                         }
 327
 328                         parser = pdEl->value;
 329                 }
 330                 else if ( strcmp( inclSectionTarg, machine ) == 0 ) {
 331                         /* found include target */
 332                         ignoreSection = false;
 333                         parser = inclToParser;
 334                 }
 335                 else {
 336                         /* ignoring section */
 337                         ignoreSection = true;
 338                         parser = 0;
 339                 }
 340         }
 341
             # KW_Machine, a word and a semicolon. The handler runs on the
             # semicolon; mach_err is attached for errors and for end of input
             # inside the statement.
 342         machine_stmt =
 343                 ( KW_Machine TK_Word @store_word ';' ) @handle_machine
 344                 <>err mach_err <>eof mach_err;
 345
 346         action handle_include
 347         {
 348                 if ( active() ) {
 349                         char *inclSectionName = word;
 350                         char *inclFileName = 0;
 351
 352                         /* Implement defaults for the input file and section name. */
 353                         if ( inclSectionName == 0 )
 354                                 inclSectionName = parser->sectionName;
 355
 356                         if ( lit != 0 )
 357                                 inclFileName = prepareFileName( lit, lit_len );
 358                         else
 359                                 inclFileName = fileName;
 360
 361                         /* Check for a recursive include structure. Add the current file/section
 362                          * name then check if what we are including is already in the stack. */
 363                         includeStack.append( IncludeStackItem( fileName, parser->sectionName ) );
 364
 365                         if ( recursiveInclude( inclFileName, inclSectionName ) )
 366                                 scan_error() << "include: this is a recursive include operation" << endl;
 367                         else {
 368                                 /* Open the input file for reading. */
 369                                 ifstream *inFile = new ifstream( inclFileName );
 370                                 if ( ! inFile->is_open() ) {
 371                                         scan_error() << "include: could not open " <<
 372                                                         inclFileName << " for reading" << endl;
 373                                 }
 374
 375                                 Scanner scanner( inclFileName, *inFile, output, parser,
 376                                                 inclSectionName, includeDepth+1 );
 377                                 scanner.do_scan( );
 378                                 delete inFile;
 379                         }
 380
 381                         /* Remove the last element (len-1) */
 382                         includeStack.remove( -1 );
 383                 }
 384         }
 385
 386         include_names = (
 387                 TK_Word @store_word ( TK_Literal @store_lit )? |
 388                 TK_Literal @store_lit
 389         ) >clear_words;
 390
 391         include_stmt =
 392                 ( KW_Include include_names ';' ) @handle_include
 393                 <>err incl_err <>eof incl_err;
 394
 395         action write_command
 396         {
                     /* Open a write element carrying the section name and the
                      * current position. Like the two actions below, this only
                      * produces output when the scanner is active and both
                      * machineSpec and machineName are null. */
 397                 if ( active() && machineSpec == 0 && machineName == 0 ) {
 398                         output << "<write"
 399                                         " def_name=\"" << parser->sectionName << "\""
 400                                         " line=\"" << line << "\""
 401                                         " col=\"" << column << "\""
 402                                         ">";
 403                 }
 404         }
 405
 406         action write_arg
 407         {
                     /* One arg element per word of the write statement. */
 408                 if ( active() && machineSpec == 0 && machineName == 0 )
 409                         output << "<arg>" << tokdata << "</arg>";
 410         }
 411
 412         action write_close
 413         {
 414                 if ( active() && machineSpec == 0 && machineName == 0 )
 415                         output << "</write>\n";
 416         }
 417
             # KW_Write, one or more words, then a semicolon.
 418         write_stmt =
 419                 ( KW_Write @write_command
 420                 ( TK_Word @write_arg )+ ';' @write_close )
 421                 <>err write_err <>eof write_err;
 422
 423         action handle_token
 424         {
 425                 /* Send the token off to the parser. */
                     /* type, tokdata and toklen are not declared in this action;
                      * they come from the function into which the exec code of
                      * this machine is written. */
 426                 if ( active() ) {
 427                         InputLoc loc;
 428
 429                         #if 0
 430                         cerr << "scanner:" << line << ":" << column <<
 431                                         ": sending token to the parser " << Parser_lelNames[*p];
 432                         cerr << " " << toklen;
 433                         if ( tokdata != 0 )
 434                                 cerr << " " << tokdata;
 435                         cerr << endl;
 436                         #endif
 437
 438                         loc.fileName = fileName;
 439                         loc.line = line;
 440                         loc.col = column;
 441
 442                         parser->token( loc, type, tokdata, toklen );
 443                 }
 444         }
 445
 446         # Catch everything else.
 447         everything_else = ^( KW_Machine | KW_Include | KW_Write ) @handle_token;
 448
             # A section is any sequence of machine, include and write
             # statements mixed with tokens that go straight to the parser.
 449         main := (
 450                 machine_stmt |
 451                 include_stmt |
 452                 write_stmt |
 453                 everything_else
 454         )*;
 455 }%%
 456
 457 void Scanner::token( int type, char *start, char *end )
 458 {
 459         char *tokdata = 0;
 460         int toklen = 0;
 461         int *p = &type;
 462         int *pe = &type + 1;
 463
 464         if ( start != 0 ) {
 465                 toklen = end-start;
 466                 tokdata = new char[toklen+1];
 467                 memcpy( tokdata, start, toklen );
 468                 tokdata[toklen] = 0;
 469         }
 470
 471         %%{
 472                 machine section_parse;
 473                 write exec;
 474         }%%
 475
 476         updateCol();
 477
 478         /* Record the last token for use in controlling the scan of subsequent
 479          * tokens. */
 480         lastToken = type;
 481 }
 482
 483 void Scanner::startSection( )
 484 {
 485         parserExistsError = false;
 486
 487         if ( includeDepth == 0 ) {
 488                 if ( machineSpec == 0 && machineName == 0 )
 489                         output << "</host>\n";
 490         }
 491
 492         sectionLoc.fileName = fileName;
 493         sectionLoc.line = line;
 494         sectionLoc.col = 0;
 495 }
 496
 497 void Scanner::endSection( )
 498 {
 499         /* Execute the eof actions for the section parser. */
 500         %%{
 501                 machine section_parse;
 502                 write eof;
 503         }%%
 504
 505         /* Close off the section with the parser. */
 506         if ( active() ) {
 507                 InputLoc loc;
 508                 loc.fileName = fileName;
 509                 loc.line = line;
 510                 loc.col = 0;
 511
 512                 parser->token( loc, TK_EndSection, 0, 0 );
 513         }
 514
 515         if ( includeDepth == 0 ) {
 516                 if ( machineSpec == 0 && machineName == 0 ) {
 517                         /* The end section may include a newline on the end, so
 518                          * we use the last line, which will count the newline. */
 519                         output << "<host line=\"" << line << "\">";
 520                 }
 521         }
 522 }
 523
 524 %%{
 525         machine rlscan;
 526
 527         # This is sent by the driver code.
 528         EOF = 0;
 529
             # Runs on every newline matched through NL: remembers where the
             # newline was, restarts the column count and bumps the line count.
 530         action inc_nl {
 531                 lastnl = p;
 532                 column = 0;
 533                 line++;
 534         }
 535         NL = '\n' @inc_nl;
 536
 537         # Identifiers, numbers, comments, and other common things.
 538         ident = ( alpha | '_' ) ( alpha |digit |'_' )*;
 539         number = digit+;
 540         hex_number = '0x' [0-9a-fA-F]+;
 541
 542         c_comment =
 543                 '/*' ( any | NL )* :>> '*/';
 544
 545         cpp_comment =
 546                 '//' [^\n]* NL;
 547
 548         c_cpp_comment = c_comment | cpp_comment;
 549
 550         # These literal forms are common to C-like host code and ragel.
 551         s_literal = "'" ([^'\\] | NL | '\\' (any | NL))* "'";
 552         d_literal = '"' ([^"\\] | NL | '\\' (any | NL))* '"';
 553
 554         whitespace = [ \t] | NL;
 555         pound_comment = '#' [^\n]* NL;
 556
 557         # An inline block of code. This is specified as a scanner, but is sent to
 558         # the parser as one long block. The inline_block pointer is used to handle
 559         # the preservation of the data.
             # NOTE(review): the rules below send individual tokens and no
             # inline_block pointer appears in them; the sentence above may be
             # out of date.
             # Scanning goes back to parser_def at the first semicolon of a
             # SemiTerminated block, or at the closing brace that brings
             # curly_count to zero in a CurlyDelimited block.
 560         inline_code := |*
 561                 # Inline expression keywords.
 562                 "fpc" => { token( KW_PChar ); };
 563                 "fc" => { token( KW_Char ); };
 564                 "fcurs" => { token( KW_CurState ); };
 565                 "ftargs" => { token( KW_TargState ); };
 566                 "fentry" => {
 567                         whitespaceOn = false;
 568                         token( KW_Entry );
 569                 };
 570
 571                 # Inline statement keywords.
 572                 "fhold" => {
 573                         whitespaceOn = false;
 574                         token( KW_Hold );
 575                 };
 576                 "fexec" => { token( KW_Exec, 0, 0 ); };
 577                 "fgoto" => {
 578                         whitespaceOn = false;
 579                         token( KW_Goto );
 580                 };
 581                 "fnext" => {
 582                         whitespaceOn = false;
 583                         token( KW_Next );
 584                 };
 585                 "fcall" => {
 586                         whitespaceOn = false;
 587                         token( KW_Call );
 588                 };
 589                 "fret" => {
 590                         whitespaceOn = false;
 591                         token( KW_Ret );
 592                 };
 593                 "fbreak" => {
 594                         whitespaceOn = false;
 595                         token( KW_Break );
 596                 };
 597
 598                 ident => { token( TK_Word, tokstart, tokend ); };
 599
 600                 number => { token( TK_UInt, tokstart, tokend ); };
 601                 hex_number => { token( TK_Hex, tokstart, tokend ); };
 602
 603                 ( s_literal | d_literal )
 604                         => { token( IL_Literal, tokstart, tokend ); };
 605
                     # Whitespace is only forwarded while whitespaceOn is set.
 606                 whitespace+ => {
 607                         if ( whitespaceOn )
 608                                 token( IL_WhiteSpace, tokstart, tokend );
 609                 };
 610                 c_cpp_comment => { token( IL_Comment, tokstart, tokend ); };
 611
 612                 "::" => { token( TK_NameSep, tokstart, tokend ); };
 613
 614                 # Some symbols need to go to the parser as with their cardinal value as
 615                 # the token type (as opposed to being sent as anonymous symbols)
 616                 # because they are part of the sequences which we interpret. The * ) ;
 617                 # symbols cause whitespace parsing to come back on. This gets turned
 618                 # off by some keywords.
 619
 620                 ";" => {
 621                         whitespaceOn = true;
 622                         token( *tokstart, tokstart, tokend );
 623                         if ( inlineBlockType == SemiTerminated )
 624                                 fgoto parser_def;
 625                 };
 626
 627                 [*)] => {
 628                         whitespaceOn = true;
 629                         token( *tokstart, tokstart, tokend );
 630                 };
 631
 632                 [,(] => { token( *tokstart, tokstart, tokend ); };
 633
                     # Braces are counted so that the closing brace of the block
                     # can be told apart from the braces of inner scopes.
 634                 '{' => {
 635                         token( IL_Symbol, tokstart, tokend );
 636                         curly_count += 1;
 637                 };
 638
 639                 '}' => {
 640                         if ( --curly_count == 0 && inlineBlockType == CurlyDelimited ) {
 641                                 /* Inline code block ends. */
 642                                 token( '}' );
 643                                 fgoto parser_def;
 644                         }
 645                         else {
 646                                 /* Either a semi terminated inline block or only the closing
 647                                  * brace of some inner scope, not the block's closing brace. */
 648                                 token( IL_Symbol, tokstart, tokend );
 649                         }
 650                 };
 651
 652                 EOF => {
 653                         scan_error() << "unterminated code block" << endl;
 654                 };
 655
 656                 # Send every other character as a symbol.
 657                 any => { token( IL_Symbol, tokstart, tokend ); };
 658         *|;
 659
 660         or_literal := |*
                     # Scanner for the inside of a bracketed OR expression. It is
                     # entered with fcall and left with fret at the closing bracket.
 661                 # Escape sequences in OR expressions.
 662                 '\\0' => { token( RE_Char, '\0' ); };
 663                 '\\a' => { token( RE_Char, '\a' ); };
 664                 '\\b' => { token( RE_Char, '\b' ); };
 665                 '\\t' => { token( RE_Char, '\t' ); };
 666                 '\\n' => { token( RE_Char, '\n' ); };
 667                 '\\v' => { token( RE_Char, '\v' ); };
 668                 '\\f' => { token( RE_Char, '\f' ); };
 669                 '\\r' => { token( RE_Char, '\r' ); };
                     # A backslash followed by a newline sends no token.
 670                 '\\\n' => { updateCol(); };
                     # Any other escaped character stands for itself.
 671                 '\\' any => { token( RE_Char, tokstart+1, tokend ); };
 672
 673                 # Range dash in an OR expression.
 674                 '-' => { token( RE_Dash, 0, 0 ); };
 675
 676                 # Terminate an OR expression.
 677                 ']'     => { token( RE_SqClose ); fret; };
 678
 679                 EOF => {
 680                         scan_error() << "unterminated OR literal" << endl;
 681                 };
 682
 683                 # Characters in an OR expression.
 684                 [^\]] => { token( RE_Char, tokstart, tokend ); };
 685
 686         *|;
 687
 688         re_literal := |*
 689                 # Escape sequences in regular expressions.
 690                 '\\0' => { token( RE_Char, '\0' ); };
 691                 '\\a' => { token( RE_Char, '\a' ); };
 692                 '\\b' => { token( RE_Char, '\b' ); };
 693                 '\\t' => { token( RE_Char, '\t' ); };
 694                 '\\n' => { token( RE_Char, '\n' ); };
 695                 '\\v' => { token( RE_Char, '\v' ); };
 696                 '\\f' => { token( RE_Char, '\f' ); };
 697                 '\\r' => { token( RE_Char, '\r' ); };
 698                 '\\\n' => { updateCol(); };
 699                 '\\' any => { token( RE_Char, tokstart+1, tokend ); };
 700
 701                 # Terminate a regular expression. An optional trailing i is
                     # included in the token text. Scanning resumes in parser_def.
 702                 '/' [i]? => {
 703                         token( RE_Slash, tokstart, tokend );
 704                         fgoto parser_def;
 705                 };
 706
 707                 # Special characters.
 708                 '.' => { token( RE_Dot ); };
 709                 '*' => { token( RE_Star ); };
 710
                     # A bracket opens an OR expression, scanned by or_literal.
 711                 '[' => { token( RE_SqOpen ); fcall or_literal; };
 712                 '[^' => { token( RE_SqOpenNeg ); fcall or_literal; };
 713
 714                 EOF => {
 715                         scan_error() << "unterminated regular expression" << endl;
 716                 };
 717
 718                 # Any other character in a regular expression.
 719                 [^\/] => { token( RE_Char, tokstart, tokend ); };
 720         *|;
 721
 722         # We need a separate token space here to avoid the ragel keywords.
             # Every identifier is sent as a plain TK_Word here. The semicolon
             # ends the statement and hands scanning back to parser_def.
 723         write_statement := |*
 724                 ident => { token( TK_Word, tokstart, tokend ); } ;
 725                 [ \t\n]+ => { updateCol(); };
 726                 ';' => { token( ';' ); fgoto parser_def; };
 727
 728                 EOF => {
 729                         scan_error() << "unterminated write statement" << endl;
 730                 };
 731         *|;
 732
 733         # Parser definitions.
 734         parser_def := |*
 735                 'machine' => { token( KW_Machine ); };
 736                 'include' => { token( KW_Include ); };
 737                 'write' => {
 738                         token( KW_Write );
 739                         fgoto write_statement;
 740                 };
 741                 'action' => { token( KW_Action ); };
 742                 'alphtype' => { token( KW_AlphType ); };
 743
 744                 # FIXME: Enable this post 5.17.
 745                 # 'range' => { token( KW_Range ); };
 746
 747                 'getkey' => {
 748                         token( KW_GetKey );
 749                         inlineBlockType = SemiTerminated;
 750                         fgoto inline_code;
 751                 };
 752                 'access' => {
 753                         token( KW_Access );
 754                         inlineBlockType = SemiTerminated;
 755                         fgoto inline_code;
 756                 };
 757                 'variable' => {
 758                         token( KW_Variable );
 759                         inlineBlockType = SemiTerminated;
 760                         fgoto inline_code;
 761                 };
 762                 'when' => { token( KW_When ); };
 763                 'eof' => { token( KW_Eof ); };
 764                 'err' => { token( KW_Err ); };
 765                 'lerr' => { token( KW_Lerr ); };
 766                 'to' => { token( KW_To ); };
 767                 'from' => { token( KW_From ); };
 768                 'export' => { token( KW_Export ); };
 769
 770                 # Identifiers.
 771                 ident => { token( TK_Word, tokstart, tokend ); } ;
 772
 773                 # Numbers
 774                 number => { token( TK_UInt, tokstart, tokend ); };
 775                 hex_number => { token( TK_Hex, tokstart, tokend ); };
 776
 777                 # Literals, with optionals.
 778                 ( s_literal | d_literal ) [i]?
 779                         => { token( TK_Literal, tokstart, tokend ); };
 780
 781                 '[' => { token( RE_SqOpen ); fcall or_literal; };
 782                 '[^' => { token( RE_SqOpenNeg ); fcall or_literal; };
 783
 784                 '/' => { token( RE_Slash ); fgoto re_literal; };
 785
 786                 # Ignore.
 787                 pound_comment => { updateCol(); };
 788
 789                 ':=' => { token( TK_ColonEquals ); };
 790
 791                 # To State Actions.
 792                 ">~" => { token( TK_StartToState ); };
 793                 "$~" => { token( TK_AllToState ); };
 794                 "%~" => { token( TK_FinalToState ); };
 795                 "<~" => { token( TK_NotStartToState ); };
 796                 "@~" => { token( TK_NotFinalToState ); };
 797                 "<>~" => { token( TK_MiddleToState ); };
 798
 799                 # From State actions
 800                 ">*" => { token( TK_StartFromState ); };
 801                 "$*" => { token( TK_AllFromState ); };
 802                 "%*" => { token( TK_FinalFromState ); };
 803                 "<*" => { token( TK_NotStartFromState ); };
 804                 "@*" => { token( TK_NotFinalFromState ); };
 805                 "<>*" => { token( TK_MiddleFromState ); };
 806
 807                 # EOF Actions.
 808                 ">/" => { token( TK_StartEOF ); };
 809                 "$/" => { token( TK_AllEOF ); };
 810                 "%/" => { token( TK_FinalEOF ); };
 811                 "</" => { token( TK_NotStartEOF ); };
 812                 "@/" => { token( TK_NotFinalEOF ); };
 813                 "<>/" => { token( TK_MiddleEOF ); };
 814
 815                 # Global Error actions.
 816                 ">!" => { token( TK_StartGblError ); };
 817                 "$!" => { token( TK_AllGblError ); };
 818                 "%!" => { token( TK_FinalGblError ); };
 819                 "<!" => { token( TK_NotStartGblError ); };
 820                 "@!" => { token( TK_NotFinalGblError ); };
 821                 "<>!" => { token( TK_MiddleGblError ); };
 822
 823                 # Local error actions.
 824                 ">^" => { token( TK_StartLocalError ); };
 825                 "$^" => { token( TK_AllLocalError ); };
 826                 "%^" => { token( TK_FinalLocalError ); };
 827                 "<^" => { token( TK_NotStartLocalError ); };
 828                 "@^" => { token( TK_NotFinalLocalError ); };
 829                 "<>^" => { token( TK_MiddleLocalError ); };
 830
 831                 # Middle.
 832                 "<>" => { token( TK_Middle ); };
 833
 834                 # Conditions.
 835                 '>?' => { token( TK_StartCond ); };
 836                 '$?' => { token( TK_AllCond ); };
 837                 '%?' => { token( TK_LeavingCond ); };
 838
 839                 '..' => { token( TK_DotDot ); };
 840                 '**' => { token( TK_StarStar ); };
 841                 '--' => { token( TK_DashDash ); };
 842                 '->' => { token( TK_Arrow ); };
 843                 '=>' => { token( TK_DoubleArrow ); };
 844
 845                 ":>"  => { token( TK_ColonGt ); };
 846                 ":>>" => { token( TK_ColonGtGt ); };
 847                 "<:"  => { token( TK_LtColon ); };
 848
 849                 # Opening of longest match.
 850                 "|*" => { token( TK_BarStar ); };
 851
 852                 # Separator for name references.
 853                 "::" => { token( TK_NameSep, tokstart, tokend ); };
 854
 855                 '}%%' => {
 856                         updateCol();
 857                         endSection();
 858                         fgoto main;
 859                 };
 860
 861                 [ \t\r]+ => { updateCol(); };
 862
 863                 # If we are in a single line machine then newline may end the spec.
 864                 NL => {
 865                         updateCol();
 866                         if ( singleLineSpec ) {
 867                                 endSection();
 868                                 fgoto main;
 869                         }
 870                 };
 871
 872                 '{' => {
 873                         if ( lastToken == KW_Export || lastToken == KW_Entry )
 874                                 token( '{' );
 875                         else {
 876                                 token( '{' );
 877                                 curly_count = 1;
 878                                 inlineBlockType = CurlyDelimited;
 879                                 fgoto inline_code;
 880                         }
 881                 };
 882
 883                 EOF => {
 884                         scan_error() << "unterminated ragel section" << endl;
 885                 };
 886
 887                 any => { token( *tokstart ); } ;
 888         *|;
 889
 890         action pass {
                     /* Action attached to the pass-through patterns of the main scanner.
                      * It calls updateCol() and then conditionally hands the matched
                      * text [tokstart, tokend) to xmlEscapeHost. */
 891                 updateCol();
 892
 893                 /* Write the token out only when includeDepth, machineSpec and
 894                  * machineName are all zero. includeDepth == 0 is the bottom of the
                      * include stack (the source file listed on the command line).
                      * NOTE(review): nothing in this condition tests for errors, and
                      * machineSpec/machineName are presumably the user-selected
                      * spec/machine filters -- confirm against their declarations. */
 895                 if ( includeDepth == 0 && machineSpec == 0 && machineName == 0 )
 896                         xmlEscapeHost( output, tokstart, tokend-tokstart );
 897         }
 898
 899         # Outside code scanner: handles the text outside of ragel sections.
             # Every token except the two section openers and EOF is handed to
             # the pass action. parser_def returns here with fgoto main once a
             # section ends.
 900         main := |*
                     # Comments and literals are matched as whole tokens, so a '%%'
                     # inside one of them does not open a section.
 901                 ident => pass;
 902                 number => pass;
 903                 c_cpp_comment => pass;
 904                 s_literal | d_literal => pass;
                     # Opener of a multi-line section: continue in parser_def with
                     # singleLineSpec cleared.
 905                 '%%{' => {
 906                         updateCol();
 907                         singleLineSpec = false;
 908                         startSection();
 909                         fgoto parser_def;
 910                 };
                     # Opener of a single-line section: same hand-off, but with
                     # singleLineSpec set, which lets the NL pattern of parser_def
                     # end the section.
 911                 '%%' => {
 912                         updateCol();
 913                         singleLineSpec = true;
 914                         startSection();
 915                         fgoto parser_def;
 916                 };
 917                 whitespace+ => pass;
                     # EOF has no action attached: it is matched and dropped.
 918                 EOF;
                     # Fallback: any other single character is passed through.
 919                 any => pass;
 920         *|;
 921
 922 }%%
 923
 924 %% write data;
 925
 926 void Scanner::do_scan()
 927 {
             /* Run the scanner over all of `input`.
              *
              * The stream is read in chunks into a heap buffer. Each chunk is fed to
              * the Ragel-generated machine (`%% write exec`). A token still in
              * progress when the chunk ends (tokstart != 0) is moved to the front of
              * the buffer and completed on the next pass. The buffer doubles whenever
              * the carried-over data fills it. Takes no arguments and returns
              * nothing; if the machine ends up in rlscan_error the error is reported
              * and the process exits with status 1.
              *
              * Locals:
              *   buf, bufsize   scan buffer and its capacity (starts at 8 bytes).
              *   have           bytes at the front of buf carried over from the
              *                  previous pass.
              *   last_char      the byte fed to the machine once input is exhausted.
              *   cs, act, top, stack
              *                  state for the Ragel-generated code; only cs is read
              *                  by the hand-written code below.
              *   curly_count, singleLineSpec, inlineBlockType
              *                  not touched by the hand-written code here; the
              *                  scanner actions read and write them, so they must be
              *                  in scope where `%% write exec` is expanded. */
 928         int bufsize = 8;
 929         char *buf = new char[bufsize];
 930         const char last_char = 0;
 931         int cs, act, have = 0;
 932         int top, stack[1];
 933         int curly_count = 0;
 934         bool execute = true;
 935         bool singleLineSpec = false;
 936         InlineBlockType inlineBlockType = CurlyDelimited;
 937
 938         /* Init the section parser and the character scanner. */
 939         init();
 940         %% write init;
 941
             /* One iteration per chunk read. execute is cleared once a read returns
              * no data, so the pass that consumes last_char is the final one. */
 942         while ( execute ) {
                     /* New data goes right after the carried-over prefix. */
 943                 char *p = buf + have;
 944                 int space = bufsize - have;
 945
 946                 if ( space == 0 ) {
 947                         /* The carried-over data occupies the whole buffer (have ==
                              * bufsize), so there is no room to read more. Double it. */
 948                         bufsize = bufsize * 2;
 949                         char *newbuf = new char[bufsize];
 950
 951                         /* Recompute p and space relative to the new buffer. */
 952                         p = newbuf + have;
 953                         space = bufsize - have;
 954
 955                         /* Rebase the token pointers onto the new buffer, keeping their
                              * offsets. tokstart is guarded against null; tokend is rebased
                              * unconditionally. */
 956                         if ( tokstart != 0 )
 957                                 tokstart = newbuf + ( tokstart - buf );
 958                         tokend = newbuf + ( tokend - buf );
 959
 960                         /* Copy the carried-over bytes across and release the old buffer. */
 961                         memcpy( newbuf, buf, have );
 962                         delete[] buf;
 963                         buf = newbuf;
 964                 }
 965
                     /* gcount() is the number of bytes this read actually delivered,
                      * which may be fewer than space. */
 966                 input.read( p, space );
 967                 int len = input.gcount();
 968
 969                 /* A read that delivers nothing is treated as end of input: feed
                      * last_char as a final one-byte chunk and stop after this pass.
                      * space is never 0 at this point, so p[0] lies inside the buffer. */
 970                 if ( len == 0 ) {
 971                         p[0] = last_char, len = 1;
 972                         execute = false;
 973                 }
 974
                     /* Run the generated machine over [p, pe). */
 975                 char *pe = p + len;
 976                 %% write exec;
 977
 978                 /* Check if we failed. */
 979                 if ( cs == rlscan_error ) {
 980                         /* Machine failed before finding a token. I'm not yet sure if this
 981                          * is reachable. */
 982                         scan_error() << "scanner error" << endl;
 983                         exit(1);
 984                 }
 985
 986                 /* A non-null tokstart marks the start of data that must be kept for
                      * the next pass. */
 987                 char *preserve = tokstart;
 988
 989                 /* Now set up the prefix. */
 990                 if ( preserve == 0 )
 991                         have = 0;
 992                 else {
 993                         /* Slide [preserve, pe) down to the start of buf. Source and
                              * destination may overlap, hence memmove. The token pointers
                              * are then moved back by the same distance. */
 994                         have = pe - preserve;
 995                         memmove( buf, preserve, have );
 996                         unsigned int shiftback = preserve - buf;
 997                         if ( tokstart != 0 )
 998                                 tokstart -= shiftback;
 999                         tokend -= shiftback;
1000
                             /* preserve is not read again before it goes out of scope. */
1001                         preserve = buf;
1002                 }
1003         }
1004
1005         delete[] buf;
1006 }
1007
1008 void scan( char *fileName, istream &input, ostream &output )
1009 {
             /* The body is empty: fileName, input and output are unused and a call
              * has no effect.
              * NOTE(review): looks like a leftover or placeholder entry point --
              * confirm whether any caller still depends on it. */
1010 }