ragel/rlscan.rl

   1 /*
   2  *  Copyright 2006 Adrian Thurston <thurston@cs.queensu.ca>
   3  */
   4
   5 /*  This file is part of Ragel.
   6  *
   7  *  Ragel is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  Ragel is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *  GNU General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU General Public License
  18  *  along with Ragel; if not, write to the Free Software
  19  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  */
  21
  22 #include <iostream>
  23 #include <fstream>
  24 #include <string.h>
  25
  26 #include "ragel.h"
  27 #include "rlparse.h"
  28 #include "parsedata.h"
  29 #include "avltree.h"
  30 #include "vector.h"
  31
  32
  33 using std::ifstream;
  34 using std::istream;
  35 using std::ostream;
  36 using std::cout;
  37 using std::cerr;
  38 using std::endl;
  39
  40 /* One entry in the stack of include file/machine pairs that are currently
  41  * being processed. The stack is used to detect a recursive include structure. */
  42 struct IncludeStackItem
  43 {
  44         IncludeStackItem( char *fileName, char *sectionName )
  45                 : fileName(fileName), sectionName(sectionName) {}
  46
  47         char *fileName;         /* Stored as given; the string is not copied. */
  48         char *sectionName;      /* Stored as given; the string is not copied. */
  49 };
  50
  51 typedef Vector<IncludeStackItem> IncludeStack;
  52 IncludeStack includeStack;      /* Global stack; the include statement handler appends to and removes from it. */
  53
  54 enum InlineBlockType            /* How the inline_code scanner decides that an inline block has ended. */
  55 {
  56         CurlyDelimited,         /* Ends at the '}' that brings the brace count back to zero. */
  57         SemiTerminated          /* Ends at the first ';'. */
  58 };
  59
  60 struct Scanner
  61 {
             /* Scanner for one input file.
              *   fileName         name reported in locations and error messages
              *   input            stream this scanner is constructed over
              *   inclToParser     parser that becomes current when the include target
              *                    section is found (unused when inclSectionTarg is null)
              *   inclSectionTarg  when non-null, only sections with this machine name
              *                    are active; when null, every named section is active
              *   include_depth    nesting level; includes are scanned at depth + 1 and
              *                    host output is only produced at depth 0 */
  62         Scanner( char *fileName, istream &input,
  63                         Parser *inclToParser, char *inclSectionTarg,
  64                         int include_depth )
  65         :
  66                 fileName(fileName), input(input),
  67                 inclToParser(inclToParser),
  68                 inclSectionTarg(inclSectionTarg),
  69                 include_depth(include_depth),
  70                 line(1), column(1), lastnl(0),
  71                 parser(0), active(false),
  72                 parserExistsError(false), ragelDefOpen(false),
  73                 whitespaceOn(true)
  74                 {}
  75
             /* True when the given file/section pair is already on includeStack. */
  76         bool recursiveInclude( IncludeStack &includeStack,
  77                         char *inclFileName, char *inclSectionName );
  78
             /* Wraps the raw literal text in a Token and returns the data produced by
              * Token::prepareLitString (presumably the unquoted, unescaped file name --
              * confirm against that function). The caseInsensitive result is ignored. */
  79         char *prepareFileName( char *fileName, int len )
  80         {
  81                 bool caseInsensitive;
  82                 Token tokenFnStr, tokenRes;
  83                 tokenFnStr.data = fileName;
  84                 tokenFnStr.length = len;
  85                 tokenFnStr.prepareLitString( tokenRes, caseInsensitive );
  86                 return tokenRes.data;
  87         }
  88
  89         void init();
  90         void token( int type, char *start, char *end );
  91         void token( int type, char *string );
  92         void token( int type );
  93         void updateCol();
  94         void startSection();
  95         void endSection();
  96         void openRagelDef();
  97         void do_scan();
  98         bool parserExists();
  99         ostream &error();
 100
 101         char *fileName;
 102         istream &input;
 103         Parser *inclToParser;
 104         char *inclSectionTarg;
 105         int include_depth;
 106
 107         int cs;                         /* State of the section_parse machine (do_scan keeps its own local cs). */
 108         int line;
 109         char *word, *lit;               /* Token text saved by store_word / store_lit. */
 110         int word_len, lit_len;
 111         InputLoc sectionLoc;            /* Filled in by startSection(). */
 112         char *tokstart, *tokend;        /* Bounds of the token being scanned. */
 113         int column;
 114         char *lastnl;                   /* Last newline seen in the current token, or 0. */
 115
 116         /* Set by machine statements, these persist from section to section
 117          * allowing for unnamed sections. */
 118         Parser *parser;
 119         bool active;
 120
 121         /* This is set if ragel has already emitted an error stating that
 122          * no section name has been seen and thus no parser exists. */
 123         bool parserExistsError;
 124         bool ragelDefOpen;
 125
 126         /* This is for inline code. By default it is on. It goes off for
 127          * statements and values in inline blocks which are parsed. */
 128         bool whitespaceOn;
 129 };
 130
 131 %%{
             # The section_parse machine runs over token type codes rather than
             # characters (Scanner::token feeds it one int at a time), hence the
             # int alphabet. Its generated data is emitted here.
 132         machine section_parse;
 133         alphtype int;
 134         write data;
 135 }%%
 136
 137 void Scanner::init( )
 138 {
             /* Ragel replaces the directive below with the initialization code
              * for the section_parse machine. */
 139         %% write init;
 140 }
 141
 142 bool Scanner::parserExists()
 143 {
             /* Returns true when a parser is currently selected. Otherwise returns
              * false, reporting the problem through error() only the first time;
              * parserExistsError suppresses repeats until startSection() clears it.
              * The message carries no "include:" prefix because this check also
              * guards ordinary token delivery and the end of a section. */
 144         if ( parser != 0 )
 145                 return true;
 146
 147         if ( ! parserExistsError ) {
 148                 error() << "there is no previous specification name" << endl;
 149                 parserExistsError = true;
 150         }
 151         return false;
 152 }
 153
 154 ostream &Scanner::error()
 155 {
             /* Starts an error message: bumps the global error count, writes the
              * "file:line:column: " prefix to cerr and returns cerr so that the
              * caller can append the message text. */
 156         /* Maintain the error count. */
 157         gblErrorCount += 1;
 158
 159         cerr << fileName << ":" << line << ":" << column << ": ";
 160         return cerr;
 161 }
 162
 163 bool Scanner::recursiveInclude( IncludeStack &includeStack,
 164                         char *inclFileName, char *inclSectionName )
 165 {
             /* Returns true when some entry on the stack has both the same file
              * name and the same section name (compared as strings) as the
              * include about to be performed. */
 166         for ( IncludeStack::Iter si = includeStack; si.lte(); si++ ) {
 167                 if ( strcmp( si->fileName, inclFileName ) == 0 &&
 168                                 strcmp( si->sectionName, inclSectionName ) == 0 )
 169                 {
 170                         return true;
 171                 }
 172         }
 173         return false;
 174 }
 175
 176 void Scanner::updateCol()
 177 {
             /* Advance the column past the current token. If a newline occurred
              * inside the token, inc_nl has already reset column to 0 and recorded
              * the newline in lastnl, so only the text from there on is counted. */
 178         char *from = lastnl;
 179         if ( from == 0 )
 180                 from = tokstart;
 181         //cerr << "adding " << tokend - from << " to column" << endl;
 182         column += tokend - from;
 183         lastnl = 0;
 184 }
 185
 186 void Scanner::token( int type, char *string )
 187 {
             /* Sends a token whose text is a NUL-terminated string. The length is
              * taken with strlen, so the text cannot contain a NUL byte itself. */
 188         token( type, string, string + strlen(string) );
 189 }
 190
 191 void Scanner::token( int type )
 192 {
             /* Sends a token that carries no text (null data, zero length). */
 193         token( type, 0, 0 );
 194 }
 195
 196 %%{
 197         machine section_parse;
 198
 199         # These values rely on the kelbt implementation and on the order
 200         # in which tokens are declared.
 201         KW_Machine = 128;
 202         KW_Include = 129;
 203         KW_Write = 130;
 204         TK_Word = 131;
 205         TK_Literal = 132;
 206
             # Remember (or forget) the text of the most recent word and literal
             # tokens so that the statement handlers can use them.
 207         action clear_words { word = lit = 0; word_len = lit_len = 0; }
 208         action store_word { word = tokdata; word_len = toklen; }
 209         action store_lit { lit = tokdata; lit_len = toklen; }
 210
             # Error reports for malformed machine, include and write statements.
 211         action mach_err { error() << "bad machine statement" << endl; }
 212         action incl_err { error() << "bad include statement" << endl; }
 213         action write_err { error() << "bad write statement" << endl; }
 214
 215         action handle_machine
 216         {
 217                 /* Assign a name to the machine. */
 218                 char *machine = word;
 219                 //cerr << "scanner: machine statement: " << machine << endl;
 220
 221                 if ( inclSectionTarg == 0 ) {
                             /* Not scanning on behalf of an include: every named section
                              * is active. Look the parser up by machine name and create
                              * it on first use. */
 222                         active = true;
 223
 224                         ParserDictEl *pdEl = parserDict.find( machine );
 225                         if ( pdEl != 0 ) {
 226                                 //cerr << "scanner: using existing parser" << endl;
 227                         }
 228                         else {
 229                                 //cerr << "scanner: creating a new parser" << endl;
 230                                 pdEl = new ParserDictEl( machine );
 231                                 pdEl->value = new Parser( fileName, machine, sectionLoc );
 232                                 pdEl->value->init();
 233                                 parserDict.insert( pdEl );
 234                         }
 235
 236                         parser = pdEl->value;
 237                 }
 238                 else if ( strcmp( inclSectionTarg, machine ) == 0 ) {
                             /* This is the section the include asked for: feed it to the
                              * parser of the including section. */
 239                         //cerr << "scanner: found include target" << endl;
 240                         active = true;
 241                         parser = inclToParser;
 242                 }
 243                 else {
                             /* Some other section of the included file: skip it. */
 244                         //cerr << "scanner: ignoring section" << endl;
 245                         active = false;
 246                         parser = 0;
 247                 }
 248         }
 249
             # machine <word> ; -- the handler runs on the closing ';'.
 250         machine_stmt =
 251                 ( KW_Machine TK_Word @store_word ';' ) @handle_machine
 252                 <>err mach_err <>eof mach_err;
 253
 254         action handle_include
 255         {
                     /* Runs on the ';' of an include statement. word holds the optional
                      * section name and lit the optional file name literal; both were
                      * cleared by clear_words when the names began. */
 256                 if ( active && parserExists() ) {
 257                         char *inclSectionName = word;
 258                         char *inclFileName = 0;
 259
 260                         /* Implement defaults for the input file and section name. */
 261                         if ( inclSectionName == 0 )
 262                                 inclSectionName = parser->sectionName;
 263
 264                         if ( lit != 0 )
 265                                 inclFileName = prepareFileName( lit, lit_len );
 266                         else
 267                                 inclFileName = fileName;
 268
 269                         /* Open the file and process it. */
 270                         //cerr << "scanner: include: " << inclSectionName << " " << inclFileName << endl;
 271
 272                         /* Check for a recursive include structure. Add the current file/section
 273                          * name then check if what we are including is already in the stack. */
 274                         includeStack.append( IncludeStackItem( fileName, parser->sectionName ) );
 275
 276                         if ( recursiveInclude( includeStack, inclFileName, inclSectionName ) )
 277                                 error() << "include: this is a recursive include operation" << endl;
 278                         else {
 279                                 /* Open the input file for reading. */
 280                                 ifstream *inFile = new ifstream( inclFileName );
 281                                 if ( ! inFile->is_open() ) {
 282                                         error() << "include: could not open " <<
 283                                                         inclFileName << " for reading" << endl;
 284                                 }
 285
                                     /* The nested scanner sends the tokens of the target section
                                      * to the current parser, one include level deeper.
                                      * NOTE(review): it is started even when the open above
                                      * failed -- confirm do_scan() copes with an unopened stream. */
 286                                 Scanner scanner( inclFileName, *inFile, parser,
 287                                                 inclSectionName, include_depth+1 );
 288                                 scanner.init();
 289                                 scanner.do_scan( );
 290                                 delete inFile;
 291                         }
 292
 293                         /* Remove the last element (len-1) */
 294                         includeStack.remove( -1 );
 295                 }
 296         }
 297
             # Either a section name, optionally followed by a file name literal,
             # or a file name literal alone.
 298         include_names = (
 299                 TK_Word @store_word ( TK_Literal @store_lit )? |
 300                 TK_Literal @store_lit
 301         ) >clear_words;
 302
 303         include_stmt =
 304                 ( KW_Include include_names ';' ) @handle_include
 305                 <>err incl_err <>eof incl_err;
 306
 307         action write_command
 308         {
                     /* First word of a write statement: make sure the <ragel_def>
                      * element is open, then start a <write> element. An unknown
                      * command is reported, but the element is still written. */
 309                 if ( active ) {
 310                         openRagelDef();
 311                         if ( strcmp( tokdata, "data" ) != 0 &&
 312                                         strcmp( tokdata, "init" ) != 0 &&
 313                                         strcmp( tokdata, "exec" ) != 0 &&
 314                                         strcmp( tokdata, "eof" ) != 0 )
 315                         {
 316                                 error() << "unknown write command" << endl;
 317                         }
 318                         *outStream << "  <write what=\"" << tokdata << "\">";
 319                 }
 320         }
 321
 322         action write_option
 323         {
                     /* Each further word becomes an <option> child of the <write>. */
 324                 if ( active )
 325                         *outStream << "<option>" << tokdata << "</option>";
 326         }
 327         action write_close
 328         {
                     /* The ';' closes the <write> element. */
 329                 if ( active )
 330                         *outStream << "</write>\n";
 331         }
 332
             # write <command> <option>* ;
 333         write_stmt =
 334                 ( KW_Write TK_Word @write_command
 335                         ( TK_Word @write_option )* ';' @write_close )
 336                 <>err write_err <>eof write_err;
 337
 338         action handle_token
 339         {
 340                 /* Send the token off to the parser. */
 341                 if ( active && parserExists() ) {
 342                         InputLoc loc;
 343
 344                         //cerr << "scanner:" << line << ":" << column <<
 345                         //              ": sending token to the parser " << lelNames[*p];
 346                         //if ( tokdata != 0 )
 347                         //      cerr << " " << tokdata;
 348                         //cerr << endl;
 349
                             /* The location is the scanner's current position. */
 350                         loc.fileName = fileName;
 351                         loc.line = line;
 352                         loc.col = column;
 353
 354                         parser->token( loc, type, tokdata, toklen );
 355                 }
 356         }
 357
 358         # Catch everything else: any single token other than the three
             # statement keywords is forwarded to the parser.
 359         everything_else = ^( KW_Machine | KW_Include | KW_Write ) @handle_token;
 360
             # A section is any sequence of the three scanner-level statements
             # and ordinary tokens.
 361         main := (
 362                 machine_stmt |
 363                 include_stmt |
 364                 write_stmt |
 365                 everything_else
 366         )*;
 367 }%%
 368
 369 void Scanner::token( int type, char *start, char *end )
 370 {
             /* Runs the section_parse machine over a single token. The token type
              * is the one-element input (p/pe point at the type argument); the text
              * between start and end, if any, is copied into a fresh NUL-terminated
              * buffer that the machine's actions see as tokdata/toklen. The column
              * is advanced afterwards.
              * NOTE(review): tokdata is not freed here and store_word/store_lit
              * keep the pointer -- confirm who owns these buffers. */
 371         char *tokdata = 0;
 372         int toklen = 0;
 373         int *p = &type;
 374         int *pe = &type + 1;
 375
 376         if ( start != 0 ) {
 377                 toklen = end-start;
 378                 tokdata = new char[toklen+1];
 379                 memcpy( tokdata, start, toklen );
 380                 tokdata[toklen] = 0;
 381         }
 382
 383         %%{
 384                 machine section_parse;
 385                 write exec;
 386         }%%
 387
 388         updateCol();
 389 }
 390
 391 void Scanner::startSection( )
 392 {
             /* Called when a ragel section opens. Re-arms the "no parser" error,
              * and in the top-level file closes the <host> element (only written
              * when neither machineSpec nor machineName is set) and marks the
              * <ragel_def> element as not yet open. Finally records where the
              * section starts. */
 393         parserExistsError = false;
 394
 395         if ( include_depth == 0 ) {
 396                 if ( machineSpec == 0 && machineName == 0 )
 397                         *outStream << "</host>\n";
 398                 ragelDefOpen = false;
 399         }
 400
 401         sectionLoc.fileName = fileName;
 402         sectionLoc.line = line;
 403         sectionLoc.col = 0;
 404 }
 405
 406 void Scanner::openRagelDef()
 407 {
             /* Writes the opening <ragel_def> tag for the current parser's section,
              * once per section. Dereferences parser, so a parser must be selected. */
 408         if ( ! ragelDefOpen ) {
 409                 ragelDefOpen = true;
 410                 *outStream << "<ragel_def name=\"" << parser->sectionName << "\">\n";
 411         }
 412 }
 413
 414 void Scanner::endSection( )
 415 {
             /* Called when a ragel section closes: finishes the section_parse
              * machine, tells the parser the section is over and, in the top-level
              * file, closes <ragel_def> and reopens the <host> element. */
 416         /* Execute the eof actions for the section parser. */
 417         %%{
 418                 machine section_parse;
 419                 write eof;
 420         }%%
 421
 422         /* Close off the section with the parser. */
 423         if ( active && parserExists() ) {
 424                 InputLoc loc;
 425                 loc.fileName = fileName;
 426                 loc.line = line;
 427                 loc.col = 0;
 428
 429                 parser->token( loc, TK_EndSection, 0, 0 );
 430         }
 431
 432         if ( include_depth == 0 ) {
 433                 if ( ragelDefOpen ) {
 434                         *outStream << "</ragel_def>\n";
 435                         ragelDefOpen = false;
 436                 }
 437
 438                 if ( machineSpec == 0 && machineName == 0 ) {
 439                         /* The end section may include a newline on the end, so
 440                          * we use the last line, which will count the newline. */
 441                         *outStream << "<host line=\"" << line << "\">";
 442                 }
 443         }
 444 }
 445
 446 %%{
 447         machine rlscan;
 448
 449         # This is sent by the driver code.
 450         EOF = 0;
 451
             # Line bookkeeping for every newline matched through NL: remember where
             # the newline is (updateCol counts the column from there), restart the
             # column and bump the line number.
 452         action inc_nl {
 453                 lastnl = p;
 454                 column = 0;
 455                 line++;
 456         }
 457         NL = '\n' @inc_nl;
 458
 459         # Identifiers, numbers, comments, and other common things.
 460         ident = ( alpha | '_' ) ( alpha |digit |'_' )*;
 461         number = digit+;
 462         hex_number = '0x' [0-9a-fA-F]+;
 463
 464         c_comment =
 465                 '/*' ( any | NL )* :>> '*/';
 466
 467         cpp_comment =
 468                 '//' [^\n]* NL;
 469
 470         c_cpp_comment = c_comment | cpp_comment;
 471
 472         # These literal forms are common to C-like host code and ragel.
 473         s_literal = "'" ([^'\\] | NL | '\\' (any | NL))* "'";
 474         d_literal = '"' ([^"\\] | NL | '\\' (any | NL))* '"';
 475
 476         whitespace = [ \t] | NL;
 477         pound_comment = '#' [^\n]* NL;
 478
 479         # An inline block of code. This is specified as a scanner, but is sent to
 480         # the parser as one long block. The inline_block pointer is used to handle
 481         # the preservation of the data.
             # NOTE(review): no inline_block pointer appears in this scanner and each
             # match below goes through token() on its own -- this comment may be stale.
 482         inline_code := |*
 483                 # Inline expression keywords.
 484                 "fpc" => { token( KW_PChar ); };
 485                 "fc" => { token( KW_Char ); };
 486                 "fcurs" => { token( KW_CurState ); };
 487                 "ftargs" => { token( KW_TargState ); };
 488                 "fentry" => {
 489                         whitespaceOn = false;
 490                         token( KW_Entry );
 491                 };
 492
 493                 # Inline statement keywords.
 494                 "fhold" => {
 495                         whitespaceOn = false;
 496                         token( KW_Hold );
 497                 };
 498                 "fexec" => { token( KW_Exec, 0, 0 ); };
 499                 "fgoto" => {
 500                         whitespaceOn = false;
 501                         token( KW_Goto );
 502                 };
 503                 "fnext" => {
 504                         whitespaceOn = false;
 505                         token( KW_Next );
 506                 };
 507                 "fcall" => {
 508                         whitespaceOn = false;
 509                         token( KW_Call );
 510                 };
 511                 "fret" => {
 512                         whitespaceOn = false;
 513                         token( KW_Ret );
 514                 };
 515                 "fbreak" => {
 516                         whitespaceOn = false;
 517                         token( KW_Break );
 518                 };
 519
 520                 ident => { token( TK_Word, tokstart, tokend ); };
 521
 522                 number => { token( TK_UInt, tokstart, tokend ); };
 523                 hex_number => { token( TK_Hex, tokstart, tokend ); };
 524
 525                 ( s_literal | d_literal )
 526                         => { token( IL_Literal, tokstart, tokend ); };
 527
                     # Whitespace is dropped while whitespaceOn is false.
 528                 whitespace+ => {
 529                         if ( whitespaceOn )
 530                                 token( IL_WhiteSpace, tokstart, tokend );
 531                 };
 532                 c_cpp_comment => { token( IL_Comment, tokstart, tokend ); };
 533
 534                 "::" => { token( TK_NameSep, tokstart, tokend ); };
 535
 536                 # Some symbols need to go to the parser with their cardinal value as
 537                 # the token type (as opposed to being sent as anonymous symbols)
 538                 # because they are part of the sequences which we interpret. The * ) ;
 539                 # symbols cause whitespace parsing to come back on. This gets turned
 540                 # off by some keywords.
 541
                     # A ';' also ends the block when it is semicolon terminated.
 542                 ";" => {
 543                         whitespaceOn = true;
 544                         token( *tokstart, tokstart, tokend );
 545                         if ( inlineBlockType == SemiTerminated )
 546                                 fgoto parser_def;
 547                 };
 548
 549                 [*)] => {
 550                         whitespaceOn = true;
 551                         token( *tokstart, tokstart, tokend );
 552                 };
 553
 554                 [,(] => { token( *tokstart, tokstart, tokend ); };
 555
                     # Nested braces are passed along as plain symbols and counted.
 556                 '{' => {
 557                         token( IL_Symbol, tokstart, tokend );
 558                         curly_count += 1;
 559                 };
 560
 561                 '}' => {
 562                         if ( --curly_count == 0 && inlineBlockType == CurlyDelimited ) {
 563                                 /* Inline code block ends. */
 564                                 token( '}' );
 565                                 fgoto parser_def;
 566                         }
 567                         else {
 568                                 /* Either a semi terminated inline block or only the closing
 569                                  * brace of some inner scope, not the block's closing brace. */
 570                                 token( IL_Symbol, tokstart, tokend );
 571                         }
 572                 };
 573
 574                 # Send every other character as a symbol.
 575                 any => { token( IL_Symbol, tokstart, tokend ); };
 576         *|;
 577
 578         or_literal := |*
                     # Scanner for the inside of a [...] expression. It is entered with
                     # fcall and returns to the calling scanner on ']'.
 579                 # Escape sequences in OR expressions.
                     # The NUL escape passes explicit bounds: token( type, string ) measures
                     # its argument with strlen, which would give "\0" a length of zero.
 580                 '\\0' => { char nul = 0; token( RE_Char, &nul, &nul + 1 ); };
 581                 '\\a' => { token( RE_Char, "\a" ); };
 582                 '\\b' => { token( RE_Char, "\b" ); };
 583                 '\\t' => { token( RE_Char, "\t" ); };
 584                 '\\n' => { token( RE_Char, "\n" ); };
 585                 '\\v' => { token( RE_Char, "\v" ); };
 586                 '\\f' => { token( RE_Char, "\f" ); };
 587                 '\\r' => { token( RE_Char, "\r" ); };
                     # A backslash-newline sends nothing. The newline is matched through
                     # NL so that the line counter is still advanced.
 588                 '\\' NL => { updateCol(); };
 589                 '\\' any => { token( RE_Char, tokstart+1, tokend ); };
 590
 591                 # Range dash in an OR expression.
 592                 '-' => { token( RE_Dash, 0, 0 ); };
 593
 594                 # Terminate an OR expression.
 595                 ']'     => { token( RE_SqClose ); fret; };
 596
 597                 # Characters in an OR expression.
 598                 [^\]] => { token( RE_Char, tokstart, tokend ); };
 599         *|;
 600
 601         re_literal := |*
                     # Scanner for the inside of a /.../ regular expression. The closing
                     # '/' (with an optional 'i' flag) hands control back to parser_def.
 602                 # Escape sequences in regular expressions.
                     # The NUL escape passes explicit bounds: token( type, string ) measures
                     # its argument with strlen, which would give "\0" a length of zero.
 603                 '\\0' => { char nul = 0; token( RE_Char, &nul, &nul + 1 ); };
 604                 '\\a' => { token( RE_Char, "\a" ); };
 605                 '\\b' => { token( RE_Char, "\b" ); };
 606                 '\\t' => { token( RE_Char, "\t" ); };
 607                 '\\n' => { token( RE_Char, "\n" ); };
 608                 '\\v' => { token( RE_Char, "\v" ); };
 609                 '\\f' => { token( RE_Char, "\f" ); };
 610                 '\\r' => { token( RE_Char, "\r" ); };
                     # A backslash-newline sends nothing. The newline is matched through
                     # NL so that the line counter is still advanced.
 611                 '\\' NL => { updateCol(); };
 612                 '\\' any => { token( RE_Char, tokstart+1, tokend ); };
 613
 614                 # Terminate a regular expression.
 615                 '/' [i]? => {
 616                         token( RE_Slash, tokstart, tokend );
 617                         fgoto parser_def;
 618                 };
 619
 620                 # Special characters.
 621                 '.' => { token( RE_Dot ); };
 622                 '*' => { token( RE_Star ); };
 623
 624                 '[' => { token( RE_SqOpen ); fcall or_literal; };
 625                 '[^' => { token( RE_SqOpenNeg ); fcall or_literal; };
 626
 627                 # Characters in a regular expression.
 628                 [^\/] => { token( RE_Char, tokstart, tokend ); };
 629         *|;
 630
 631         write_statement := |*
                     # Scans the words of a write statement up to the closing ';', which
                     # hands control back to parser_def.
 632                 ident => { token( TK_Word, tokstart, tokend ); } ;
                     # Whitespace is matched through NL so that newlines inside the
                     # statement keep the line counter up to date.
 633                 whitespace+ => { updateCol(); };
 634                 ';' => { token( ';' ); fgoto parser_def; };
 635         *|;
 636
 637         # Parser definitions. This scanner tokenizes the body of a ragel section
             # and switches to the more specific scanners (write_statement,
             # inline_code, or_literal, re_literal) where the syntax changes.
 638         parser_def := |*
 639                 'machine' => { token( KW_Machine ); };
 640                 'include' => { token( KW_Include ); };
 641                 'write' => {
 642                         token( KW_Write );
 643                         fgoto write_statement;
 644                 };
 645                 'action' => { token( KW_Action ); };
 646                 'alphtype' => { token( KW_AlphType ); };
 647
 648                 # FIXME: Enable this post 5.17.
 649                 # 'range' => { token( KW_Range ); };
 650
                     # The remainder of these three statements, up to the ';', is
                     # scanned as inline code.
 651                 'getkey' => {
 652                         token( KW_GetKey );
 653                         inlineBlockType = SemiTerminated;
 654                         fgoto inline_code;
 655                 };
 656                 'access' => {
 657                         token( KW_Access );
 658                         inlineBlockType = SemiTerminated;
 659                         fgoto inline_code;
 660                 };
 661                 'variable' => {
 662                         token( KW_Variable );
 663                         inlineBlockType = SemiTerminated;
 664                         fgoto inline_code;
 665                 };
 666                 'when' => { token( KW_When ); };
 667                 'eof' => { token( KW_Eof ); };
 668                 'err' => { token( KW_Err ); };
 669                 'lerr' => { token( KW_Lerr ); };
 670                 'to' => { token( KW_To ); };
 671                 'from' => { token( KW_From ); };
 672
 673                 # Identifiers.
 674                 ident => { token( TK_Word, tokstart, tokend ); } ;
 675
 676                 # Numbers
 677                 number => { token( TK_UInt, tokstart, tokend ); };
 678                 hex_number => { token( TK_Hex, tokstart, tokend ); };
 679
 680                 # Literals, with optionals.
 681                 ( s_literal | d_literal ) [i]?
 682                         => { token( TK_Literal, tokstart, tokend ); };
 683
                     # Bracket expressions are scanned by or_literal, which returns here.
 684                 '[' => { token( RE_SqOpen ); fcall or_literal; };
 685                 '[^' => { token( RE_SqOpenNeg ); fcall or_literal; };
 686
                     # A slash opens a regular expression.
 687                 '/' => { token( RE_Slash ); fgoto re_literal; };
 688
 689                 # Ignore.
 690                 pound_comment => { updateCol(); };
 691
 692                 ':=' => { token( TK_ColonEquals ); };
 693
 694                 # To State Actions.
 695                 ">~" => { token( TK_StartToState ); };
 696                 "$~" => { token( TK_AllToState ); };
 697                 "%~" => { token( TK_FinalToState ); };
 698                 "<~" => { token( TK_NotStartToState ); };
 699                 "@~" => { token( TK_NotFinalToState ); };
 700                 "<>~" => { token( TK_MiddleToState ); };
 701
 702                 # From State actions
 703                 ">*" => { token( TK_StartFromState ); };
 704                 "$*" => { token( TK_AllFromState ); };
 705                 "%*" => { token( TK_FinalFromState ); };
 706                 "<*" => { token( TK_NotStartFromState ); };
 707                 "@*" => { token( TK_NotFinalFromState ); };
 708                 "<>*" => { token( TK_MiddleFromState ); };
 709
 710                 # EOF Actions.
 711                 ">/" => { token( TK_StartEOF ); };
 712                 "$/" => { token( TK_AllEOF ); };
 713                 "%/" => { token( TK_FinalEOF ); };
 714                 "</" => { token( TK_NotStartEOF ); };
 715                 "@/" => { token( TK_NotFinalEOF ); };
 716                 "<>/" => { token( TK_MiddleEOF ); };
 717
 718                 # Global Error actions.
 719                 ">!" => { token( TK_StartGblError ); };
 720                 "$!" => { token( TK_AllGblError ); };
 721                 "%!" => { token( TK_FinalGblError ); };
 722                 "<!" => { token( TK_NotStartGblError ); };
 723                 "@!" => { token( TK_NotFinalGblError ); };
 724                 "<>!" => { token( TK_MiddleGblError ); };
 725
 726                 # Local error actions.
 727                 ">^" => { token( TK_StartLocalError ); };
 728                 "$^" => { token( TK_AllLocalError ); };
 729                 "%^" => { token( TK_FinalLocalError ); };
 730                 "<^" => { token( TK_NotStartLocalError ); };
 731                 "@^" => { token( TK_NotFinalLocalError ); };
 732                 "<>^" => { token( TK_MiddleLocalError ); };
 733
 734                 # Middle.
 735                 "<>" => { token( TK_Middle ); };
 736
 737                 # Conditions.
 738                 '>?' => { token( TK_StartCond ); };
 739                 '$?' => { token( TK_AllCond ); };
 740                 '%?' => { token( TK_LeavingCond ); };
 741
 742                 '..' => { token( TK_DotDot ); };
 743                 '**' => { token( TK_StarStar ); };
 744                 '--' => { token( TK_DashDash ); };
 745                 '->' => { token( TK_Arrow ); };
 746                 '=>' => { token( TK_DoubleArrow ); };
 747
 748                 ":>"  => { token( TK_ColonGt ); };
 749                 ":>>" => { token( TK_ColonGtGt ); };
 750                 "<:"  => { token( TK_LtColon ); };
 751
 752                 # Opening of longest match.
 753                 "|*" => { token( TK_BarStar ); };
 754
                     # End of a multi-line section: back to scanning host code.
 755                 '}%%' => {
 756                         /* In order to generate anything we must be in the top level file
 757                          * and the current spec must be active and there must not have been
 758                          * any parse errors. */
 759                         updateCol();
 760                         endSection();
 761                         fgoto main;
 762                 };
 763
 764                 [ \t]+ => { updateCol(); };
 765
 766                 # If we are in a single line machine then newline may end the spec.
 767                 NL => {
 768                         updateCol();
 769                         if ( singleLineSpec ) {
 770                                 /* In order to generate anything we must be in the top level file
 771                                  * and the current spec must be active and there must not have been
 772                                  * any parse errors. */
 773                                 endSection();
 774                                 fgoto main;
 775                         }
 776                 };
 777
                     # Opening brace of a curly delimited inline block: start the brace
                     # count at one and scan the body as inline code.
 778                 '{' => {
 779                         token( '{' );
 780                         curly_count = 1;
 781                         inlineBlockType = CurlyDelimited;
 782                         fgoto inline_code;
 783                 };
 784
                     # Any other character is sent with its own value as the token type.
 785                 any => { token( *tokstart ); } ;
 786         *|;
 787
 788         action pass {
                     /* Host text outside of ragel sections: keep the column current and
                      * copy the text, XML-escaped, to the output. */
 789                 updateCol();
 790
 791                 /* If no errors and we are at the bottom of the include stack (the
 792                  * source file listed on the command line) then write out the data. */
 793                 if ( include_depth == 0 && machineSpec == 0 && machineName == 0 )
 794                         xmlEscapeHost( *outStream, tokstart, tokend-tokstart );
 795         }
 796
 797         # Outside code scanner. These tokens get passed through.
             # Comments and literals are matched as whole tokens, so a section marker
             # inside one of them is passed through rather than opening a section.
 798         main := |*
 799                 ident => pass;
 800                 number => pass;
 801                 c_cpp_comment => pass;
 802                 s_literal | d_literal => pass;
                     # Multi-line section: runs until the closing marker.
 803                 '%%{' => {
 804                         updateCol();
 805                         singleLineSpec = false;
 806                         startSection();
 807                         fgoto parser_def;
 808                 };
                     # Single-line section: ended by the next newline.
 809                 '%%' => {
 810                         updateCol();
 811                         singleLineSpec = true;
 812                         startSection();
 813                         fgoto parser_def;
 814                 };
 815                 whitespace+ => pass;
 816                 EOF;
 817                 any => pass;
 818         *|;
 819
 820 }%%
 821
 822 %% write data;
 823
 824 void Scanner::do_scan()
 825 {
             /* Run the Ragel-generated scanner over the entire input stream.
              *
              * Each pass of the loop reads as much input as fits behind the bytes
              * carried over from the previous pass, executes the machine on the new
              * bytes, and then moves any token still in progress (tokstart != 0) to
              * the front of the buffer. The buffer doubles whenever the carried-over
              * data fills it. When a read yields nothing, a single terminating 0 byte
              * is fed to the machine and the loop ends. A machine error calls exit(1). */

             /* Initial capacity of the scan buffer; doubled on demand below. */
 826         int bufsize = 8;
 827         char *buf = new char[bufsize];
             /* Byte appended once the input is exhausted. */
 828         const char last_char = 0;
             /* cs is the machine state (compared to rlscan_error below). have is the
              * number of bytes at the front of buf carried over from the previous pass.
              * NOTE(review): act, top and stack are not used directly in this function;
              * presumably they are referenced by the code that the write init and
              * write exec directives expand to -- confirm against the generated output. */
 829         int cs, act, have = 0;
 830         int top, stack[1];
             /* Assigned by a scanner action in the machine definition. */
 831         int curly_count = 0;
             /* Loop control; cleared once the end of input has been fed to the machine. */
 832         bool execute = true;
             /* Both are assigned by scanner actions in the machine definition.
              * inlineBlockType starts out uninitialized here. */
 833         bool singleLineSpec = false;
 834         InlineBlockType inlineBlockType;
 835
             /* Ragel directive: replaced by the machine initialization code. */
 836         %% write init;
 837
 838         while ( execute ) {
                     /* New input is read in behind the carried-over prefix. */
 839                 char *p = buf + have;
 840                 int space = bufsize - have;
 841
                     /* No room left: the carried-over data occupies the whole buffer. */
 842                 if ( space == 0 ) {
 843                         /* We filled up the buffer trying to scan a token. Grow it. */
 844                         bufsize = bufsize * 2;
 845                         char *newbuf = new char[bufsize];
 846                         //cerr << "FULL BUFFER, NEW SIZE: " << bufsize << endl;
 847
 848                         /* Recompute p and space. */
 849                         p = newbuf + have;
 850                         space = bufsize - have;
 851
 852                         /* Patch up pointers possibly in use. */
                             /* Each pointer keeps its offset relative to the old buffer.
                              * tokstart and tokend are not declared in this function.
                              * NOTE(review): tokend is rebased without the null check applied
                              * to tokstart -- confirm it can never be 0 here. */
 853                         if ( tokstart != 0 )
 854                                 tokstart = newbuf + ( tokstart - buf );
 855                         tokend = newbuf + ( tokend - buf );
 856
 857                         /* Copy the carried-over bytes into the new buffer; free the old one. */
 858                         memcpy( newbuf, buf, have );
 859                         delete[] buf;
 860                         buf = newbuf;
 861                 }
 862
                     /* Fill the free space; gcount() reports how many bytes arrived. */
 863                 input.read( p, space );
 864                 int len = input.gcount();
 865
 866                 /* If we see eof then append the EOF char. */
                     /* Zero bytes read is treated as end of input: feed the machine one
                      * last_char byte and make this the final pass. */
 867                 if ( len == 0 ) {
 868                         p[0] = last_char, len = 1;
 869                         execute = false;
 870                 }
 871
                     /* pe marks the end of the valid data. The write exec directive is
                      * replaced by the generated machine code; presumably it consumes the
                      * bytes from p up to pe and runs the scanner actions -- see the Ragel
                      * manual. */
 872                 char *pe = p + len;
 873                 %% write exec;
 874
 875                 /* Check if we failed. */
 876                 if ( cs == rlscan_error ) {
 877                         /* Machine failed before finding a token. */
                             /* The process exits with status 1 and prints nothing; the
                              * diagnostic below is commented out. */
 878                         //cerr << "PARSE ERROR" << endl;
 879                         exit(1);
 880                 }
 881
 882                 /* Decide if we need to preserve anything. */
                     /* A non-null tokstart means there is data from tokstart up to pe
                      * that must survive into the next pass. */
 883                 char *preserve = tokstart;
 884
 885                 /* Now set up the prefix. */
 886                 if ( preserve == 0 )
 887                         have = 0;
 888                 else {
 889                         /* There is data that needs to be shifted over. */
                             /* Move the bytes from preserve up to pe to the front of buf
                              * (memmove, since the ranges may overlap) and rebase the token
                              * pointers by the same distance, which leaves tokstart equal
                              * to buf. */
 890                         have = pe - preserve;
 891                         memmove( buf, preserve, have );
 892                         unsigned int shiftback = preserve - buf;
                             /* Always true in this branch: preserve was copied from tokstart. */
 893                         if ( tokstart != 0 )
 894                                 tokstart -= shiftback;
 895                         tokend -= shiftback;
 896
                             /* No effect on later passes: preserve is a loop-body local that
                              * is re-initialized from tokstart on the next pass. */
 897                         preserve = buf;
 898                 }
 899         }
 900
 901         delete[] buf;
 902 }
 903
 904 void scan( char *fileName, istream &input )
 905 {
             /* Scan one input stream from start to finish: construct a Scanner for
              * fileName and input, initialize it, and run do_scan().
              *
              * The include-target arguments of the constructor (inclToParser and
              * inclSectionTarg) are passed as 0. NOTE(review): the meaning of the
              * fifth argument is not visible here; presumably the include depth --
              * confirm against the Scanner constructor. */
 906         Scanner scanner( fileName, input, 0, 0, 0 );
 907         scanner.init();
 908         scanner.do_scan();
 909 }
 910