scan.c

   1 /*
   2  * Copyright 1993-2002 Christopher Seiwald and Perforce Software, Inc.
   3  *
   4  * This file is part of Jam - see jam.c for Copyright information.
   5  */
   6
   7 #include "jam.h"
   8 #include "lists.h"
   9 #include "parse.h"
  10 #include "scan.h"
  11 #include "jamgram.h"
  12 #include "jambase.h"
  13 #include "newstr.h"
  14
  15 /*
  16  * scan.c - the jam yacc scanner
  17  *
  18  * 12/26/93 (seiwald) - bump buf in yylex to 10240 - yuk.
  19  * 09/16/94 (seiwald) - check for overflows, unmatched {}'s, etc.
  20  *          Also handle tokens abutting EOF by remembering
   21  *          to return EOF no matter how many times yylex()
  22  *          reinvokes yyline().
  23  * 02/11/95 (seiwald) - honor only punctuation keywords if SCAN_PUNCT.
  24  * 07/27/95 (seiwald) - Include jamgram.h after scan.h, so that YYSTYPE is
  25  *          defined before Linux's yacc tries to redefine it.
  26  */
  27
  28 struct keyword
  29 {
  30     char * word;
  31     int    type;
  32 } keywords[] =
  33 {
  34 #include "jamgramtab.h"
  35     { 0, 0 }
  36 };
  37
/*
 * One input source on the scanner's include chain: either a file read line by
 * line or an in-memory array of strings. yyfparse() pushes a new entry onto
 * incp; yyline() pops and frees the entry once its input is exhausted.
 */
struct include
{
    struct include   * next;       /* entry that becomes current when this one ends */
    char             * string;     /* unread remainder of the current line */
    char           * * strings;    /* remaining in-memory lines; 0 when reading a file */
    FILE             * file;       /* stream being read; 0 until yyline() opens it */
    char             * fname;      /* name passed to yyfparse(), copied with copystr() */
    int                line;       /* number of lines started so far, for messages */
    char               buf[ 512 ]; /* holds the line most recently read by fgets() */
};
  48
/* Input source currently being read; head of the include chain, 0 when none. */
static struct include * incp = 0; /* current file; head of chain */

/* Scanning mode consulted by yylex(); changed through yymode(). */
static int scanmode = SCAN_NORMAL;
/* Number of errors reported through yyerror() so far. */
static int anyerrors = 0;


/* Formats a token value for messages; defined further down in this file. */
static char * symdump( YYSTYPE * );

/* Size of the token buffer in yylex(); symdump() sizes its buffer from it too. */
#define BIGGEST_TOKEN 10240  /* no single token can be larger */
  58
  59
  60 /*
  61  * Set parser mode: normal, string, or keyword.
  62  */
  63
  64 void yymode( int n )
  65 {
  66     scanmode = n;
  67 }
  68
  69
  70 void yyerror( char * s )
  71 {
  72     /* We use yylval instead of incp to access the error location information as
  73      * the incp pointer will already be reset to 0 in case the error occurred at
  74      * EOF.
  75      *
  76      * The two may differ only if we get an error while reading a lexical token
  77      * spanning muliple lines, e.g. a multi-line string literal or action body,
  78      * in which case yylval location information will hold the information about
  79      * where this token started while incp will hold the information about where
  80      * reading it broke.
  81      *
  82      * TODO: Test the theory about when yylval and incp location information are
  83      * the same and when they differ.
  84      */
  85     printf( "%s:%d: %s at %s\n", yylval.file, yylval.line, s, symdump( &yylval ) );
  86     ++anyerrors;
  87 }
  88
  89
  90 int yyanyerrors()
  91 {
  92     return anyerrors != 0;
  93 }
  94
  95
  96 void yyfparse( char * s )
  97 {
  98     struct include * i = (struct include *)BJAM_MALLOC( sizeof( *i ) );
  99
 100     /* Push this onto the incp chain. */
 101     i->string = "";
 102     i->strings = 0;
 103     i->file = 0;
 104     i->fname = copystr( s );
 105     i->line = 0;
 106     i->next = incp;
 107     incp = i;
 108
 109     /* If the filename is "+", it means use the internal jambase. */
 110     if ( !strcmp( s, "+" ) )
 111         i->strings = jambase;
 112 }
 113
 114
 115 /*
 116  * yyline() - read new line and return first character.
 117  *
 118  * Fabricates a continuous stream of characters across include files, returning
 119  * EOF at the bitter end.
 120  */
 121
 122 int yyline()
 123 {
 124     struct include * i = incp;
 125
 126     if ( !incp )
 127         return EOF;
 128
 129     /* Once we start reading from the input stream, we reset the include
 130      * insertion point so that the next include file becomes the head of the
 131      * list.
 132      */
 133
 134     /* If there is more data in this line, return it. */
 135     if ( *i->string )
 136         return *i->string++;
 137
 138     /* If we are reading from an internal string list, go to the next string. */
 139     if ( i->strings )
 140     {
 141         if ( *i->strings )
 142         {
 143             ++i->line;
 144             i->string = *(i->strings++);
 145             return *i->string++;
 146         }
 147     }
 148     else
 149     {
 150         /* If necessary, open the file. */
 151         if ( !i->file )
 152         {
 153             FILE * f = stdin;
 154             if ( strcmp( i->fname, "-" ) && !( f = fopen( i->fname, "r" ) ) )
 155                 perror( i->fname );
 156             i->file = f;
 157         }
 158
 159         /* If there is another line in this file, start it. */
 160         if ( i->file && fgets( i->buf, sizeof( i->buf ), i->file ) )
 161         {
 162             ++i->line;
 163             i->string = i->buf;
 164             return *i->string++;
 165         }
 166     }
 167
 168     /* This include is done. Free it up and return EOF so yyparse() returns to
 169      * parse_file().
 170      */
 171
 172     incp = i->next;
 173
 174     /* Close file, free name. */
 175     if ( i->file && ( i->file != stdin ) )
 176         fclose( i->file );
 177     freestr( i->fname );
 178     BJAM_FREE( (char *)i );
 179
 180     return EOF;
 181 }
 182
 183
 184 /*
 185  * yylex() - set yylval to current token; return its type.
 186  *
 187  * Macros to move things along:
 188  *
 189  *  yychar() - return and advance character; invalid after EOF.
 190  *  yyprev() - back up one character; invalid before yychar().
 191  *
 192  * yychar() returns a continuous stream of characters, until it hits the EOF of
 193  * the current include file.
 194  */
 195
 196 #define yychar() ( *incp->string ? *incp->string++ : yyline() )
 197 #define yyprev() ( incp->string-- )
 198
 199 int yylex()
 200 {
 201     int c;
 202     char buf[ BIGGEST_TOKEN ];
 203     char * b = buf;
 204
 205     if ( !incp )
 206         goto eof;
 207
 208     /* Get first character (whitespace or of token). */
 209     c = yychar();
 210
 211     if ( scanmode == SCAN_STRING )
 212     {
 213         /* If scanning for a string (action's {}'s), look for the closing brace.
 214          * We handle matching braces, if they match.
 215          */
 216
 217         int nest = 1;
 218
 219         while ( ( c != EOF ) && ( b < buf + sizeof( buf ) ) )
 220         {
 221             if ( c == '{' )
 222                 ++nest;
 223
 224             if ( ( c == '}' ) && !--nest )
 225                 break;
 226
 227             *b++ = c;
 228
 229             c = yychar();
 230
 231             /* Turn trailing "\r\n" sequences into plain "\n" for Cygwin. */
 232             if ( ( c == '\n' ) && ( b[ -1 ] == '\r' ) )
 233                 --b;
 234         }
 235
 236         /* We ate the ending brace -- regurgitate it. */
 237         if ( c != EOF )
 238             yyprev();
 239
 240         /* Check for obvious errors. */
 241         if ( b == buf + sizeof( buf ) )
 242         {
 243             yyerror( "action block too big" );
 244             goto eof;
 245         }
 246
 247         if ( nest )
 248         {
 249             yyerror( "unmatched {} in action block" );
 250             goto eof;
 251         }
 252
 253         *b = 0;
 254         yylval.type = STRING;
 255         yylval.string = newstr( buf );
 256         yylval.file = incp->fname;
 257         yylval.line = incp->line;
 258     }
 259     else
 260     {
 261         char * b = buf;
 262         struct keyword * k;
 263         int inquote = 0;
 264         int notkeyword;
 265
 266         /* Eat white space. */
 267         for ( ;; )
 268         {
 269             /* Skip past white space. */
 270             while ( ( c != EOF ) && isspace( c ) )
 271                 c = yychar();
 272
 273             /* Not a comment? */
 274             if ( c != '#' )
 275                 break;
 276
 277             /* Swallow up comment line. */
 278             while ( ( ( c = yychar() ) != EOF ) && ( c != '\n' ) ) ;
 279         }
 280
 281         /* c now points to the first character of a token. */
 282         if ( c == EOF )
 283             goto eof;
 284
 285         yylval.file = incp->fname;
 286         yylval.line = incp->line;
 287
 288         /* While scanning the word, disqualify it for (expensive) keyword lookup
 289          * when we can: $anything, "anything", \anything
 290          */
 291         notkeyword = c == '$';
 292
 293         /* Look for white space to delimit word. "'s get stripped but preserve
 294          * white space. \ protects next character.
 295          */
 296         while
 297         (
 298             ( c != EOF ) &&
 299             ( b < buf + sizeof( buf ) ) &&
 300             ( inquote || !isspace( c ) )
 301         )
 302         {
 303             if ( c == '"' )
 304             {
 305                 /* begin or end " */
 306                 inquote = !inquote;
 307                 notkeyword = 1;
 308             }
 309             else if ( c != '\\' )
 310             {
 311                 /* normal char */
 312                 *b++ = c;
 313             }
 314             else if ( ( c = yychar() ) != EOF )
 315             {
 316                 /* \c */
 317                 if (c == 'n')
 318                     c = '\n';
 319                 else if (c == 'r')
 320                     c = '\r';
 321                 else if (c == 't')
 322                     c = '\t';
 323                 *b++ = c;
 324                 notkeyword = 1;
 325             }
 326             else
 327             {
 328                 /* \EOF */
 329                 break;
 330             }
 331
 332             c = yychar();
 333         }
 334
 335         /* Check obvious errors. */
 336         if ( b == buf + sizeof( buf ) )
 337         {
 338             yyerror( "string too big" );
 339             goto eof;
 340         }
 341
 342         if ( inquote )
 343         {
 344             yyerror( "unmatched \" in string" );
 345             goto eof;
 346         }
 347
 348         /* We looked ahead a character - back up. */
 349         if ( c != EOF )
 350             yyprev();
 351
 352         /* Scan token table. Do not scan if it is obviously not a keyword or if
 353          * it is an alphabetic when were looking for punctuation.
 354          */
 355
 356         *b = 0;
 357         yylval.type = ARG;
 358
 359         if ( !notkeyword && !( isalpha( *buf ) && ( scanmode == SCAN_PUNCT ) ) )
 360             for ( k = keywords; k->word; ++k )
 361                 if ( ( *buf == *k->word ) && !strcmp( k->word, buf ) )
 362                 {
 363                     yylval.type = k->type;
 364                     yylval.string = k->word;  /* used by symdump */
 365                     break;
 366                 }
 367
 368         if ( yylval.type == ARG )
 369             yylval.string = newstr( buf );
 370     }
 371
 372     if ( DEBUG_SCAN )
 373         printf( "scan %s\n", symdump( &yylval ) );
 374
 375     return yylval.type;
 376
 377 eof:
 378     /* We do not reset yylval.file & yylval.line here so unexpected EOF error
 379      * messages would include correct error location information.
 380      */
 381     yylval.type = EOF;
 382     return yylval.type;
 383 }
 384
 385
 386 static char * symdump( YYSTYPE * s )
 387 {
 388     static char buf[ BIGGEST_TOKEN + 20 ];
 389     switch ( s->type )
 390     {
 391         case EOF   : sprintf( buf, "EOF"                          ); break;
 392         case 0     : sprintf( buf, "unknown symbol %s", s->string ); break;
 393         case ARG   : sprintf( buf, "argument %s"      , s->string ); break;
 394         case STRING: sprintf( buf, "string \"%s\""    , s->string ); break;
 395         default    : sprintf( buf, "keyword %s"       , s->string ); break;
 396     }
 397     return buf;
 398 }
 399
 400
 401 /*
 402  * Get information about the current file and line, for those epsilon
 403  * transitions that produce a parse.
 404  */
 405
 406 void yyinput_stream( char * * name, int * line )
 407 {
 408     if ( incp )
 409     {
 410         *name = incp->fname;
 411         *line = incp->line;
 412     }
 413     else
 414     {
 415         *name = "(builtin)";
 416         *line = -1;
 417     }
 418 }