gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
   3    1999, 2000, 2002, 2003
   4    Free Software Foundation, Inc.
   5
   6    This file is part of GAS, the GNU Assembler.
   7
   8    GAS is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 2, or (at your option)
  11    any later version.
  12
  13    GAS is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with GAS; see the file COPYING.  If not, write to the Free
  20    Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  21    02111-1307, USA.  */
  22
  23 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
  24 /* App, the assembler pre-processor.  This pre-processor strips out excess
  25    spaces, turns single-quoted characters into a decimal constant, and turns
  26    # <number> <filename> <garbage> into a .line <number>\n.file <filename>
  27    pair.  This needs better error-handling.  */
  28
  29 #include <stdio.h>
  30 #include "as.h"                 /* For BAD_CASE() only.  */
  31
  32 #if (__STDC__ != 1)
     /* The compiler does not claim ISO C conformance, so it may not know
        the `const' keyword: define it away unless something else already
        has.  */
  33 #ifndef const
  34 #define const  /* empty */
  35 #endif
  36 #endif
  37
  38 #ifdef TC_M68K
  39 /* Whether we are scrubbing in m68k MRI mode.  This is different from
  40    flag_m68k_mri, because the two flags will be affected by the .mri
  41    pseudo-op at different times.  */
  42 static int scrub_m68k_mri;
  43
  44 /* The pseudo-op which switches in and out of MRI mode.  See the
  45    comment in do_scrub_chars.  */
     /* This is a matching template rather than a literal string: the
        recognizer in do_scrub_chars accepts any whitespace character
        where the template has ' ', and accepts '1' as well as '0' for
        the final character.  */
  46 static const char mri_pseudo[] = ".mri 0";
  47 #else
     /* Other targets have no MRI scrubbing mode; the flag is the
        constant 0 so that tests of it never succeed.  */
  48 #define scrub_m68k_mri 0
  49 #endif
  50
  51 #if defined TC_ARM && defined OBJ_ELF
  52 /* The pseudo-op for which we need to special-case `@' characters.
  53    See the comment in do_scrub_chars.  */
  54 static const char   symver_pseudo[] = ".symver";
     /* Progress of the .symver recognizer in do_scrub_chars: points at
        the next character of symver_pseudo that is expected, or is NULL
        when no match is in progress.  */
  55 static const char * symver_state;
  56 #endif
  57
  58 static char lex[256];	/* Class of each input character, indexed by
				   its value as an unsigned char.  Entries
				   hold one of the LEX_IS_* codes below; 0
				   means the character has no special class.
				   Filled in by do_scrub_begin.  */
     /* Characters that do_scrub_begin always marks as
        LEX_IS_SYMBOL_COMPONENT.  */
  59 static const char symbol_chars[] =
  60 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  61
     /* Character classes stored in lex[].  The value 7 is not used.  */
  62 #define LEX_IS_SYMBOL_COMPONENT         1
  63 #define LEX_IS_WHITESPACE               2
  64 #define LEX_IS_LINE_SEPARATOR           3
  65 #define LEX_IS_COMMENT_START            4
  66 #define LEX_IS_LINE_COMMENT_START       5
  67 #define LEX_IS_TWOCHAR_COMMENT_1ST      6
  68 #define LEX_IS_STRINGQUOTE              8
  69 #define LEX_IS_COLON                    9
  70 #define LEX_IS_NEWLINE                  10
  71 #define LEX_IS_ONECHAR_QUOTE            11
  72 #ifdef TC_V850
  73 #define LEX_IS_DOUBLEDASH_1ST           12
  74 #endif
  75 #ifdef TC_M32R
  76 #define DOUBLEBAR_PARALLEL
  77 #endif
  78 #ifdef DOUBLEBAR_PARALLEL
  79 #define LEX_IS_DOUBLEBAR_1ST            13
  80 #endif
  81 #define LEX_IS_PARALLEL_SEPARATOR       14
     /* Class tests.  The argument is used directly as an index into
        lex[], so it must be in the range 0..255; in particular callers
        have to rule out EOF before using these.  */
  82 #define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
  83 #define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
  84 #define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
  85 #define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
  86 #define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
  87 #define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
  88 #define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)
  89
  90 static int process_escape PARAMS ((int));
  91
  92 /* FIXME-soon: The entire lexer/parser thingy should be
  93    built statically at compile time rather than dynamically
  94    each and every time the assembler is run.  xoxorich.  */
  95
  96 void
  97 do_scrub_begin (m68k_mri)
  98      int m68k_mri ATTRIBUTE_UNUSED;
  99 {
 100   const char *p;
 101   int c;
 102
 103   lex[' '] = LEX_IS_WHITESPACE;
 104   lex['\t'] = LEX_IS_WHITESPACE;
 105   lex['\r'] = LEX_IS_WHITESPACE;
 106   lex['\n'] = LEX_IS_NEWLINE;
 107   lex[':'] = LEX_IS_COLON;
 108
 109 #ifdef TC_M68K
 110   scrub_m68k_mri = m68k_mri;
 111
 112   if (! m68k_mri)
 113 #endif
 114     {
 115       lex['"'] = LEX_IS_STRINGQUOTE;
 116
 117 #if ! defined (TC_HPPA) && ! defined (TC_I370)
 118       /* I370 uses single-quotes to delimit integer, float constants.  */
 119       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 120 #endif
 121
 122 #ifdef SINGLE_QUOTE_STRINGS
 123       lex['\''] = LEX_IS_STRINGQUOTE;
 124 #endif
 125     }
 126
 127   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 128      in state 5 of do_scrub_chars must be changed.  */
 129
 130   /* Note that these override the previous defaults, e.g. if ';' is a
 131      comment char, then it isn't a line separator.  */
 132   for (p = symbol_chars; *p; ++p)
 133     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 134
 135   for (c = 128; c < 256; ++c)
 136     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 137
 138 #ifdef tc_symbol_chars
 139   /* This macro permits the processor to specify all characters which
 140      may appears in an operand.  This will prevent the scrubber from
 141      discarding meaningful whitespace in certain cases.  The i386
 142      backend uses this to support prefixes, which can confuse the
 143      scrubber as to whether it is parsing operands or opcodes.  */
 144   for (p = tc_symbol_chars; *p; ++p)
 145     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 146 #endif
 147
 148   /* The m68k backend wants to be able to change comment_chars.  */
 149 #ifndef tc_comment_chars
 150 #define tc_comment_chars comment_chars
 151 #endif
 152   for (p = tc_comment_chars; *p; p++)
 153     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 154
 155   for (p = line_comment_chars; *p; p++)
 156     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 157
 158   for (p = line_separator_chars; *p; p++)
 159     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 160
 161 #ifdef tc_parallel_separator_chars
 162   /* This macro permits the processor to specify all characters which
 163      separate parallel insns on the same line.  */
 164   for (p = tc_parallel_separator_chars; *p; p++)
 165     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 166 #endif
 167
 168   /* Only allow slash-star comments if slash is not in use.
 169      FIXME: This isn't right.  We should always permit them.  */
 170   if (lex['/'] == 0)
 171     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 172
 173 #ifdef TC_M68K
 174   if (m68k_mri)
 175     {
 176       lex['\''] = LEX_IS_STRINGQUOTE;
 177       lex[';'] = LEX_IS_COMMENT_START;
 178       lex['*'] = LEX_IS_LINE_COMMENT_START;
 179       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 180          then it can't be used in an expression.  */
 181       lex['!'] = LEX_IS_LINE_COMMENT_START;
 182     }
 183 #endif
 184
 185 #ifdef TC_V850
 186   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 187 #endif
 188 #ifdef DOUBLEBAR_PARALLEL
 189   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 190 #endif
 191 #ifdef TC_D30V
 192   /* Must do this is we want VLIW instruction with "->" or "<-".  */
 193   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 194 #endif
 195 }
 196
 197 /* Saved state of the scrubber.  */
     /* These persist between calls to do_scrub_chars, which may return
        at any point, and are what app_push and app_pop save and
        restore.  */
 198 static int state;		/* Current state of the do_scrub_chars
				   state machine.  */
 199 static int old_state;	/* State to go back to once one of the
				   temporary states (-1, -2, 5) is done.  */
 200 static char *out_string;	/* Next character to emit in state -1.  */
 201 static char out_buf[20];	/* Small buffer saved and restored with the
				   rest of the state; presumably backing
				   storage for out_string -- its users are
				   outside the code reviewed here.  */
 202 static int add_newlines;	/* Count of newlines swallowed inside
				   comments and continued strings.  */
 203 static char *saved_input;	/* Input not yet consumed when
				   do_scrub_chars last returned, or NULL
				   if there is none.  */
 204 static int saved_input_len;	/* Number of bytes at saved_input.  */
 205 static char input_buffer[32 * 1024];	/* Buffer the GET callback
						   fills with raw input.  */
 206 static const char *mri_state;	/* Progress of the .mri recognizer:
					   next expected character of
					   mri_pseudo, or NULL when idle.  */
 207 static char mri_last_ch;	/* Last character matched by the .mri
				   recognizer.  */
 208
 209 /* Data structure for saving the state of app across #include's.  Note that
 210    app is called asynchronously to the parsing of the .include's, so our
 211    state at the time .include is interpreted is completely unrelated.
 212    That's why we have to save it all.  */
 213
 214 struct app_save
 215 {
 216   int          state;
 217   int          old_state;
 218   char *       out_string;
 219   char         out_buf[sizeof (out_buf)];
 220   int          add_newlines;
 221   char *       saved_input;
 222   int          saved_input_len;
 223 #ifdef TC_M68K
 224   int          scrub_m68k_mri;
 225 #endif
 226   const char * mri_state;
 227   char         mri_last_ch;
 228 #if defined TC_ARM && defined OBJ_ELF
 229   const char * symver_state;
 230 #endif
 231 };
 232
 233 char *
 234 app_push ()
 235 {
 236   register struct app_save *saved;
 237
 238   saved = (struct app_save *) xmalloc (sizeof (*saved));
 239   saved->state = state;
 240   saved->old_state = old_state;
 241   saved->out_string = out_string;
 242   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 243   saved->add_newlines = add_newlines;
 244   if (saved_input == NULL)
 245     saved->saved_input = NULL;
 246   else
 247     {
 248       saved->saved_input = xmalloc (saved_input_len);
 249       memcpy (saved->saved_input, saved_input, saved_input_len);
 250       saved->saved_input_len = saved_input_len;
 251     }
 252 #ifdef TC_M68K
 253   saved->scrub_m68k_mri = scrub_m68k_mri;
 254 #endif
 255   saved->mri_state = mri_state;
 256   saved->mri_last_ch = mri_last_ch;
 257 #if defined TC_ARM && defined OBJ_ELF
 258   saved->symver_state = symver_state;
 259 #endif
 260
 261   /* do_scrub_begin() is not useful, just wastes time.  */
 262
 263   state = 0;
 264   saved_input = NULL;
 265
 266   return (char *) saved;
 267 }
 268
 269 void
 270 app_pop (arg)
 271      char *arg;
 272 {
 273   register struct app_save *saved = (struct app_save *) arg;
 274
 275   /* There is no do_scrub_end ().  */
 276   state = saved->state;
 277   old_state = saved->old_state;
 278   out_string = saved->out_string;
 279   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 280   add_newlines = saved->add_newlines;
 281   if (saved->saved_input == NULL)
 282     saved_input = NULL;
 283   else
 284     {
 285       assert (saved->saved_input_len <= (int) (sizeof input_buffer));
 286       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 287       saved_input = input_buffer;
 288       saved_input_len = saved->saved_input_len;
 289       free (saved->saved_input);
 290     }
 291 #ifdef TC_M68K
 292   scrub_m68k_mri = saved->scrub_m68k_mri;
 293 #endif
 294   mri_state = saved->mri_state;
 295   mri_last_ch = saved->mri_last_ch;
 296 #if defined TC_ARM && defined OBJ_ELF
 297   symver_state = saved->symver_state;
 298 #endif
 299
 300   free (arg);
 301 }
 302
 303 /* @@ This assumes that \n &c are the same on host and target.  This is not
 304    necessarily true.  */
 305
 306 static int
 307 process_escape (ch)
 308      int ch;
 309 {
 310   switch (ch)
 311     {
 312     case 'b':
 313       return '\b';
 314     case 'f':
 315       return '\f';
 316     case 'n':
 317       return '\n';
 318     case 'r':
 319       return '\r';
 320     case 't':
 321       return '\t';
 322     case '\'':
 323       return '\'';
 324     case '"':
 325       return '\"';
 326     default:
 327       return ch;
 328     }
 329 }
 330
 331 /* This function is called to process input characters.  The GET
 332    parameter is used to retrieve more input characters.  GET should
 333    set its parameter to point to a buffer, and return the length of
 334    the buffer; it should return 0 at end of file.  The scrubbed output
 335    characters are put into the buffer starting at TOSTART; the TOSTART
 336    buffer is TOLEN bytes in length.  The function returns the number
 337    of scrubbed characters put into TOSTART.  This will be TOLEN unless
 338    end of file was seen.  This function is arranged as a state
 339    machine, and saves its state so that it may return at any point.
 340    This is the way the old code used to work.  */
 341
 342 int
 343 do_scrub_chars (get, tostart, tolen)
 344      int (*get) PARAMS ((char *, int));
 345      char *tostart;
 346      int tolen;
 347 {
 348   char *to = tostart;
 349   char *toend = tostart + tolen;
 350   char *from;
 351   char *fromend;
 352   int fromlen;
 353   register int ch, ch2 = 0;
 354
 355   /*State 0: beginning of normal line
 356           1: After first whitespace on line (flush more white)
 357           2: After first non-white (opcode) on line (keep 1white)
 358           3: after second white on line (into operands) (flush white)
 359           4: after putting out a .line, put out digits
 360           5: parsing a string, then go to old-state
 361           6: putting out \ escape in a "d string.
 362           7: After putting out a .appfile, put out string.
 363           8: After putting out a .appfile string, flush until newline.
 364           9: After seeing symbol char in state 3 (keep 1white after symchar)
 365          10: After seeing whitespace in state 9 (keep white before symchar)
 366          11: After seeing a symbol character in state 0 (eg a label definition)
 367          -1: output string in out_string and go to the state in old_state
 368          -2: flush text until a '*' '/' is seen, then go to state old_state
 369 #ifdef TC_V850
 370          12: After seeing a dash, looking for a second dash as a start
 371              of comment.
 372 #endif
 373 #ifdef DOUBLEBAR_PARALLEL
 374          13: After seeing a vertical bar, looking for a second
 375              vertical bar as a parallel expression separator.
 376 #endif
 377 #ifdef TC_IA64
 378          14: After seeing a `(' at state 0, looking for a `)' as
 379              predicate.
 380          15: After seeing a `(' at state 1, looking for a `)' as
 381              predicate.
 382 #endif
 383           */
 384
 385   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 386      constructs like ``.loc 1 20''.  This was turning into ``.loc
 387      120''.  States 9 and 10 ensure that a space is never dropped in
 388      between characters which could appear in an identifier.  Ian
 389      Taylor, ian@cygnus.com.
 390
 391      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 392      correctly on the PA (and any other target where colons are optional).
 393      Jeff Law, law@cs.utah.edu.
 394
 395      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 396      get squashed into "cmp r1,r2||trap#1", with the all important space
 397      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 398
 399   /* This macro gets the next input character.  */
 400
 401 #define GET()                                                   \
 402   (from < fromend                                               \
 403    ? * (unsigned char *) (from++)                               \
 404    : (saved_input = NULL,                                       \
 405       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 406       from = input_buffer,                                      \
 407       fromend = from + fromlen,                                 \
 408       (fromlen == 0                                             \
 409        ? EOF                                                    \
 410        : * (unsigned char *) (from++))))
 411
 412   /* This macro pushes a character back on the input stream.  */
 413
 414 #define UNGET(uch) (*--from = (uch))
 415
 416   /* This macro puts a character into the output buffer.  If this
 417      character fills the output buffer, this macro jumps to the label
 418      TOFULL.  We use this rather ugly approach because we need to
 419      handle two different termination conditions: EOF on the input
 420      stream, and a full output buffer.  It would be simpler if we
 421      always read in the entire input stream before processing it, but
 422      I don't want to make such a significant change to the assembler's
 423      memory usage.  */
 424
 425 #define PUT(pch)                                \
 426   do                                            \
 427     {                                           \
 428       *to++ = (pch);                            \
 429       if (to >= toend)                          \
 430         goto tofull;                            \
 431     }                                           \
 432   while (0)
 433
 434   if (saved_input != NULL)
 435     {
 436       from = saved_input;
 437       fromend = from + saved_input_len;
 438     }
 439   else
 440     {
 441       fromlen = (*get) (input_buffer, sizeof input_buffer);
 442       if (fromlen == 0)
 443         return 0;
 444       from = input_buffer;
 445       fromend = from + fromlen;
 446     }
 447
 448   while (1)
 449     {
 450       /* The cases in this switch end with continue, in order to
 451          branch back to the top of this while loop and generate the
 452          next output character in the appropriate state.  */
 453       switch (state)
 454         {
 455         case -1:
 456           ch = *out_string++;
 457           if (*out_string == '\0')
 458             {
 459               state = old_state;
 460               old_state = 3;
 461             }
 462           PUT (ch);
 463           continue;
 464
 465         case -2:
 466           for (;;)
 467             {
 468               do
 469                 {
 470                   ch = GET ();
 471
 472                   if (ch == EOF)
 473                     {
 474                       as_warn (_("end of file in comment"));
 475                       goto fromeof;
 476                     }
 477
 478                   if (ch == '\n')
 479                     PUT ('\n');
 480                 }
 481               while (ch != '*');
 482
 483               while ((ch = GET ()) == '*')
 484                 ;
 485
 486               if (ch == EOF)
 487                 {
 488                   as_warn (_("end of file in comment"));
 489                   goto fromeof;
 490                 }
 491
 492               if (ch == '/')
 493                 break;
 494
 495               UNGET (ch);
 496             }
 497
 498           state = old_state;
 499           UNGET (' ');
 500           continue;
 501
 502         case 4:
 503           ch = GET ();
 504           if (ch == EOF)
 505             goto fromeof;
 506           else if (ch >= '0' && ch <= '9')
 507             PUT (ch);
 508           else
 509             {
 510               while (ch != EOF && IS_WHITESPACE (ch))
 511                 ch = GET ();
 512               if (ch == '"')
 513                 {
 514                   UNGET (ch);
 515                   if (scrub_m68k_mri)
 516                     out_string = "\n\tappfile ";
 517                   else
 518                     out_string = "\n\t.appfile ";
 519                   old_state = 7;
 520                   state = -1;
 521                   PUT (*out_string++);
 522                 }
 523               else
 524                 {
 525                   while (ch != EOF && ch != '\n')
 526                     ch = GET ();
 527                   state = 0;
 528                   PUT (ch);
 529                 }
 530             }
 531           continue;
 532
 533         case 5:
 534           /* We are going to copy everything up to a quote character,
 535              with special handling for a backslash.  We try to
 536              optimize the copying in the simple case without using the
 537              GET and PUT macros.  */
 538           {
 539             char *s;
 540             int len;
 541
 542             for (s = from; s < fromend; s++)
 543               {
 544                 ch = *s;
 545                 /* This condition must be changed if the type of any
 546                    other character can be LEX_IS_STRINGQUOTE.  */
 547                 if (ch == '\\'
 548                     || ch == '"'
 549                     || ch == '\''
 550                     || ch == '\n')
 551                   break;
 552               }
 553             len = s - from;
 554             if (len > toend - to)
 555               len = toend - to;
 556             if (len > 0)
 557               {
 558                 memcpy (to, from, len);
 559                 to += len;
 560                 from += len;
 561               }
 562           }
 563
 564           ch = GET ();
 565           if (ch == EOF)
 566             {
 567               as_warn (_("end of file in string; inserted '\"'"));
 568               state = old_state;
 569               UNGET ('\n');
 570               PUT ('"');
 571             }
 572           else if (lex[ch] == LEX_IS_STRINGQUOTE)
 573             {
 574               state = old_state;
 575               PUT (ch);
 576             }
 577 #ifndef NO_STRING_ESCAPES
 578           else if (ch == '\\')
 579             {
 580               state = 6;
 581               PUT (ch);
 582             }
 583 #endif
 584           else if (scrub_m68k_mri && ch == '\n')
 585             {
 586               /* Just quietly terminate the string.  This permits lines like
 587                    bne  label   loop if we haven't reach end yet.  */
 588               state = old_state;
 589               UNGET (ch);
 590               PUT ('\'');
 591             }
 592           else
 593             {
 594               PUT (ch);
 595             }
 596           continue;
 597
 598         case 6:
 599           state = 5;
 600           ch = GET ();
 601           switch (ch)
 602             {
 603               /* Handle strings broken across lines, by turning '\n' into
 604                  '\\' and 'n'.  */
 605             case '\n':
 606               UNGET ('n');
 607               add_newlines++;
 608               PUT ('\\');
 609               continue;
 610
 611             case EOF:
 612               as_warn (_("end of file in string; '\"' inserted"));
 613               PUT ('"');
 614               continue;
 615
 616             case '"':
 617             case '\\':
 618             case 'b':
 619             case 'f':
 620             case 'n':
 621             case 'r':
 622             case 't':
 623             case 'v':
 624             case 'x':
 625             case 'X':
 626             case '0':
 627             case '1':
 628             case '2':
 629             case '3':
 630             case '4':
 631             case '5':
 632             case '6':
 633             case '7':
 634               break;
 635
 636             default:
 637 #ifdef ONLY_STANDARD_ESCAPES
 638               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 639 #endif
 640               break;
 641             }
 642           PUT (ch);
 643           continue;
 644
 645         case 7:
 646           ch = GET ();
 647           state = 5;
 648           old_state = 8;
 649           if (ch == EOF)
 650             goto fromeof;
 651           PUT (ch);
 652           continue;
 653
 654         case 8:
 655           do
 656             ch = GET ();
 657           while (ch != '\n' && ch != EOF);
 658           if (ch == EOF)
 659             goto fromeof;
 660           state = 0;
 661           PUT (ch);
 662           continue;
 663
 664 #ifdef DOUBLEBAR_PARALLEL
 665         case 13:
 666           ch = GET ();
 667           if (ch != '|')
 668             abort ();
 669
 670           /* Reset back to state 1 and pretend that we are parsing a
 671              line from just after the first white space.  */
 672           state = 1;
 673           PUT ('|');
 674           continue;
 675 #endif
 676         }
 677
 678       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
 679
 680       /* flushchar: */
 681       ch = GET ();
 682
 683 #ifdef TC_IA64
 684       if (ch == '(' && (state == 0 || state == 1))
 685         {
 686           state += 14;
 687           PUT (ch);
 688           continue;
 689         }
 690       else if (state == 14 || state == 15)
 691         {
 692           if (ch == ')')
 693             state -= 14;
 694           else
 695             {
 696               PUT (ch);
 697               continue;
 698             }
 699         }
 700 #endif
 701
 702     recycle:
 703
 704 #if defined TC_ARM && defined OBJ_ELF
 705       /* We need to watch out for .symver directives.  See the comment later
 706          in this function.  */
 707       if (symver_state == NULL)
 708         {
 709           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 710             symver_state = symver_pseudo + 1;
 711         }
 712       else
 713         {
 714           /* We advance to the next state if we find the right
 715              character.  */
 716           if (ch != '\0' && (*symver_state == ch))
 717             ++symver_state;
 718           else if (*symver_state != '\0')
 719             /* We did not get the expected character, or we didn't
 720                get a valid terminating character after seeing the
 721                entire pseudo-op, so we must go back to the beginning.  */
 722             symver_state = NULL;
 723           else
 724             {
 725               /* We've read the entire pseudo-op.  If this is the end
 726                  of the line, go back to the beginning.  */
 727               if (IS_NEWLINE (ch))
 728                 symver_state = NULL;
 729             }
 730         }
 731 #endif /* TC_ARM && OBJ_ELF */
 732
 733 #ifdef TC_M68K
 734       /* We want to have pseudo-ops which control whether we are in
 735          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 736          the scrubber, that means that we need a special purpose
 737          recognizer here.  */
 738       if (mri_state == NULL)
 739         {
 740           if ((state == 0 || state == 1)
 741               && ch == mri_pseudo[0])
 742             mri_state = mri_pseudo + 1;
 743         }
 744       else
 745         {
 746           /* We advance to the next state if we find the right
 747              character, or if we need a space character and we get any
 748              whitespace character, or if we need a '0' and we get a
 749              '1' (this is so that we only need one state to handle
 750              ``.mri 0'' and ``.mri 1'').  */
 751           if (ch != '\0'
 752               && (*mri_state == ch
 753                   || (*mri_state == ' '
 754                       && lex[ch] == LEX_IS_WHITESPACE)
 755                   || (*mri_state == '0'
 756                       && ch == '1')))
 757             {
 758               mri_last_ch = ch;
 759               ++mri_state;
 760             }
 761           else if (*mri_state != '\0'
 762                    || (lex[ch] != LEX_IS_WHITESPACE
 763                        && lex[ch] != LEX_IS_NEWLINE))
 764             {
 765               /* We did not get the expected character, or we didn't
 766                  get a valid terminating character after seeing the
 767                  entire pseudo-op, so we must go back to the
 768                  beginning.  */
 769               mri_state = NULL;
 770             }
 771           else
 772             {
 773               /* We've read the entire pseudo-op.  mri_last_ch is
 774                  either '0' or '1' indicating whether to enter or
 775                  leave MRI mode.  */
 776               do_scrub_begin (mri_last_ch == '1');
 777               mri_state = NULL;
 778
 779               /* We continue handling the character as usual.  The
 780                  main gas reader must also handle the .mri pseudo-op
 781                  to control expression parsing and the like.  */
 782             }
 783         }
 784 #endif
 785
 786       if (ch == EOF)
 787         {
 788           if (state != 0)
 789             {
 790               as_warn (_("end of file not at end of a line; newline inserted"));
 791               state = 0;
 792               PUT ('\n');
 793             }
 794           goto fromeof;
 795         }
 796
 797       switch (lex[ch])
 798         {
 799         case LEX_IS_WHITESPACE:
 800           do
 801             {
 802               ch = GET ();
 803             }
 804           while (ch != EOF && IS_WHITESPACE (ch));
 805           if (ch == EOF)
 806             goto fromeof;
 807
 808           if (state == 0)
 809             {
 810               /* Preserve a single whitespace character at the
 811                  beginning of a line.  */
 812               state = 1;
 813               UNGET (ch);
 814               PUT (' ');
 815               break;
 816             }
 817
 818 #ifdef KEEP_WHITE_AROUND_COLON
 819           if (lex[ch] == LEX_IS_COLON)
 820             {
 821               /* Only keep this white if there's no white *after* the
 822                  colon.  */
 823               ch2 = GET ();
 824               UNGET (ch2);
 825               if (!IS_WHITESPACE (ch2))
 826                 {
 827                   state = 9;
 828                   UNGET (ch);
 829                   PUT (' ');
 830                   break;
 831                 }
 832             }
 833 #endif
 834           if (IS_COMMENT (ch)
 835               || ch == '/'
 836               || IS_LINE_SEPARATOR (ch)
 837               || IS_PARALLEL_SEPARATOR (ch))
 838             {
 839               if (scrub_m68k_mri)
 840                 {
 841                   /* In MRI mode, we keep these spaces.  */
 842                   UNGET (ch);
 843                   PUT (' ');
 844                   break;
 845                 }
 846               goto recycle;
 847             }
 848
 849           /* If we're in state 2 or 11, we've seen a non-white
 850              character followed by whitespace.  If the next character
 851              is ':', this is whitespace after a label name which we
 852              normally must ignore.  In MRI mode, though, spaces are
 853              not permitted between the label and the colon.  */
 854           if ((state == 2 || state == 11)
 855               && lex[ch] == LEX_IS_COLON
 856               && ! scrub_m68k_mri)
 857             {
 858               state = 1;
 859               PUT (ch);
 860               break;
 861             }
 862
 863           switch (state)
 864             {
 865             case 0:
 866               state++;
 867               goto recycle;     /* Punted leading sp */
 868             case 1:
 869               /* We can arrive here if we leave a leading whitespace
 870                  character at the beginning of a line.  */
 871               goto recycle;
 872             case 2:
 873               state = 3;
 874               if (to + 1 < toend)
 875                 {
 876                   /* Optimize common case by skipping UNGET/GET.  */
 877                   PUT (' ');    /* Sp after opco */
 878                   goto recycle;
 879                 }
 880               UNGET (ch);
 881               PUT (' ');
 882               break;
 883             case 3:
 884               if (scrub_m68k_mri)
 885                 {
 886                   /* In MRI mode, we keep these spaces.  */
 887                   UNGET (ch);
 888                   PUT (' ');
 889                   break;
 890                 }
 891               goto recycle;     /* Sp in operands */
 892             case 9:
 893             case 10:
 894               if (scrub_m68k_mri)
 895                 {
 896                   /* In MRI mode, we keep these spaces.  */
 897                   state = 3;
 898                   UNGET (ch);
 899                   PUT (' ');
 900                   break;
 901                 }
 902               state = 10;       /* Sp after symbol char */
 903               goto recycle;
 904             case 11:
 905               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
 906                 state = 1;
 907               else
 908                 {
 909                   /* We know that ch is not ':', since we tested that
 910                      case above.  Therefore this is not a label, so it
 911                      must be the opcode, and we've just seen the
 912                      whitespace after it.  */
 913                   state = 3;
 914                 }
 915               UNGET (ch);
 916               PUT (' ');        /* Sp after label definition.  */
 917               break;
 918             default:
 919               BAD_CASE (state);
 920             }
 921           break;
 922
 923         case LEX_IS_TWOCHAR_COMMENT_1ST:
 924           ch2 = GET ();
 925           if (ch2 == '*')
 926             {
 927               for (;;)
 928                 {
 929                   do
 930                     {
 931                       ch2 = GET ();
 932                       if (ch2 != EOF && IS_NEWLINE (ch2))
 933                         add_newlines++;
 934                     }
 935                   while (ch2 != EOF && ch2 != '*');
 936
 937                   while (ch2 == '*')
 938                     ch2 = GET ();
 939
 940                   if (ch2 == EOF || ch2 == '/')
 941                     break;
 942
 943                   /* This UNGET will ensure that we count newlines
 944                      correctly.  */
 945                   UNGET (ch2);
 946                 }
 947
 948               if (ch2 == EOF)
 949                 as_warn (_("end of file in multiline comment"));
 950
 951               ch = ' ';
 952               goto recycle;
 953             }
 954 #ifdef DOUBLESLASH_LINE_COMMENTS
 955           else if (ch2 == '/')
 956             {
 957               do
 958                 {
 959                   ch = GET ();
 960                 }
 961               while (ch != EOF && !IS_NEWLINE (ch));
 962               if (ch == EOF)
 963                 as_warn ("end of file in comment; newline inserted");
 964               state = 0;
 965               PUT ('\n');
 966               break;
 967             }
 968 #endif
 969           else
 970             {
 971               if (ch2 != EOF)
 972                 UNGET (ch2);
 973               if (state == 9 || state == 10)
 974                 state = 3;
 975               PUT (ch);
 976             }
 977           break;
 978
 979         case LEX_IS_STRINGQUOTE:
 980           if (state == 10)
 981             {
 982               /* Preserve the whitespace in foo "bar".  */
 983               UNGET (ch);
 984               state = 3;
 985               PUT (' ');
 986
 987               /* PUT didn't jump out.  We could just break, but we
 988                  know what will happen, so optimize a bit.  */
 989               ch = GET ();
 990               old_state = 3;
 991             }
 992           else if (state == 9)
 993             old_state = 3;
 994           else
 995             old_state = state;
 996           state = 5;
 997           PUT (ch);
 998           break;
 999
1000 #ifndef IEEE_STYLE
1001         case LEX_IS_ONECHAR_QUOTE:
1002           if (state == 10)
1003             {
1004               /* Preserve the whitespace in foo 'b'.  */
1005               UNGET (ch);
1006               state = 3;
1007               PUT (' ');
1008               break;
1009             }
1010           ch = GET ();
1011           if (ch == EOF)
1012             {
1013               as_warn (_("end of file after a one-character quote; \\0 inserted"));
1014               ch = 0;
1015             }
1016           if (ch == '\\')
1017             {
1018               ch = GET ();
1019               if (ch == EOF)
1020                 {
1021                   as_warn (_("end of file in escape character"));
1022                   ch = '\\';
1023                 }
1024               else
1025                 ch = process_escape (ch);
1026             }
1027           sprintf (out_buf, "%d", (int) (unsigned char) ch);
1028
1029           /* None of these 'x constants for us.  We want 'x'.  */
1030           if ((ch = GET ()) != '\'')
1031             {
1032 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1033               as_warn (_("missing close quote; (assumed)"));
1034 #else
1035               if (ch != EOF)
1036                 UNGET (ch);
1037 #endif
1038             }
1039           if (strlen (out_buf) == 1)
1040             {
1041               PUT (out_buf[0]);
1042               break;
1043             }
1044           if (state == 9)
1045             old_state = 3;
1046           else
1047             old_state = state;
1048           state = -1;
1049           out_string = out_buf;
1050           PUT (*out_string++);
1051           break;
1052 #endif
1053
1054         case LEX_IS_COLON:
1055 #ifdef KEEP_WHITE_AROUND_COLON
1056           state = 9;
1057 #else
1058           if (state == 9 || state == 10)
1059             state = 3;
1060           else if (state != 3)
1061             state = 1;
1062 #endif
1063           PUT (ch);
1064           break;
1065
1066         case LEX_IS_NEWLINE:
1067           /* Roll out a bunch of newlines from inside comments, etc.  */
1068           if (add_newlines)
1069             {
1070               --add_newlines;
1071               UNGET (ch);
1072             }
1073           /* Fall through.  */
1074
1075         case LEX_IS_LINE_SEPARATOR:
1076           state = 0;
1077           PUT (ch);
1078           break;
1079
1080         case LEX_IS_PARALLEL_SEPARATOR:
1081           state = 1;
1082           PUT (ch);
1083           break;
1084
1085 #ifdef TC_V850
1086         case LEX_IS_DOUBLEDASH_1ST:
1087           ch2 = GET ();
1088           if (ch2 != '-')
1089             {
1090               UNGET (ch2);
1091               goto de_fault;
1092             }
1093           /* Read and skip to end of line.  */
1094           do
1095             {
1096               ch = GET ();
1097             }
1098           while (ch != EOF && ch != '\n');
1099
1100           if (ch == EOF)
1101             as_warn (_("end of file in comment; newline inserted"));
1102
1103           state = 0;
1104           PUT ('\n');
1105           break;
1106 #endif
1107 #ifdef DOUBLEBAR_PARALLEL
1108         case LEX_IS_DOUBLEBAR_1ST:
1109           ch2 = GET ();
1110           UNGET (ch2);
1111           if (ch2 != '|')
1112             goto de_fault;
1113
1114           /* Handle '||' in two states as invoking PUT twice might
1115              result in the first one jumping out of this loop.  We'd
1116              then lose track of the state and one '|' char.  */
1117           state = 13;
1118           PUT ('|');
1119           break;
1120 #endif
1121         case LEX_IS_LINE_COMMENT_START:
1122           /* FIXME-someday: The two character comment stuff was badly
1123              thought out.  On i386, we want '/' as line comment start
1124              AND we want C style comments.  hence this hack.  The
1125              whole lexical process should be reworked.  xoxorich.  */
1126           if (ch == '/')
1127             {
1128               ch2 = GET ();
1129               if (ch2 == '*')
1130                 {
1131                   old_state = 3;
1132                   state = -2;
1133                   break;
1134                 }
1135               else
1136                 {
1137                   UNGET (ch2);
1138                 }
1139             }
1140
1141           if (state == 0 || state == 1) /* Only comment at start of line.  */
1142             {
1143               int startch;
1144
1145               startch = ch;
1146
1147               do
1148                 {
1149                   ch = GET ();
1150                 }
1151               while (ch != EOF && IS_WHITESPACE (ch));
1152
1153               if (ch == EOF)
1154                 {
1155                   as_warn (_("end of file in comment; newline inserted"));
1156                   PUT ('\n');
1157                   break;
1158                 }
1159
1160               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1161                 {
1162                   /* Not a cpp line.  */
1163                   while (ch != EOF && !IS_NEWLINE (ch))
1164                     ch = GET ();
1165                   if (ch == EOF)
1166                     as_warn (_("end of file in comment; newline inserted"));
1167                   state = 0;
1168                   PUT ('\n');
1169                   break;
1170                 }
1171               /* Looks like `# 123 "filename"' from cpp.  */
1172               UNGET (ch);
1173               old_state = 4;
1174               state = -1;
1175               if (scrub_m68k_mri)
1176                 out_string = "\tappline ";
1177               else
1178                 out_string = "\t.appline ";
1179               PUT (*out_string++);
1180               break;
1181             }
1182
1183 #ifdef TC_D10V
1184           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1185              Trap is the only short insn that has a first operand that is
1186              neither register nor label.
1187              We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1.
1188              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1189              already LEX_IS_LINE_COMMENT_START.  However, it is the
1190              only character in line_comment_chars for d10v, hence we
1191              can recognize it as such.  */
1192           /* An alternative approach would be to reset the state to 1 when
1193              we see '||', '<-' or '->', but that seems to be overkill.  */
1194           if (state == 10)
1195             PUT (' ');
1196 #endif
1197           /* We have a line comment character which is not at the
1198              start of a line.  If this is also a normal comment
1199              character, fall through.  Otherwise treat it as a default
1200              character.  */
1201           if (strchr (tc_comment_chars, ch) == NULL
1202               && (! scrub_m68k_mri
1203                   || (ch != '!' && ch != '*')))
1204             goto de_fault;
1205           if (scrub_m68k_mri
1206               && (ch == '!' || ch == '*' || ch == '#')
1207               && state != 1
1208               && state != 10)
1209             goto de_fault;
1210           /* Fall through.  */
1211         case LEX_IS_COMMENT_START:
1212 #if defined TC_ARM && defined OBJ_ELF
1213           /* On the ARM, `@' is the comment character.
1214              Unfortunately this is also a special character in ELF .symver
1215              directives (and .type, though we deal with those another way).
1216              So we check if this line is such a directive, and treat
1217              the character as default if so.  This is a hack.  */
1218           if ((symver_state != NULL) && (*symver_state == 0))
1219             goto de_fault;
1220 #endif
1221 #ifdef WARN_COMMENTS
1222           if (!found_comment)
1223             as_where (&found_comment_file, &found_comment);
1224 #endif
1225           do
1226             {
1227               ch = GET ();
1228             }
1229           while (ch != EOF && !IS_NEWLINE (ch));
1230           if (ch == EOF)
1231             as_warn (_("end of file in comment; newline inserted"));
1232           state = 0;
1233           PUT ('\n');
1234           break;
1235
1236         case LEX_IS_SYMBOL_COMPONENT:
1237           if (state == 10)
1238             {
1239               /* This is a symbol character following another symbol
1240                  character, with whitespace in between.  We skipped
1241                  the whitespace earlier, so output it now.  */
1242               UNGET (ch);
1243               state = 3;
1244               PUT (' ');
1245               break;
1246             }
1247
1248           if (state == 3)
1249             state = 9;
1250
1251           /* This is a common case.  Quickly copy CH and all the
1252              following symbol component or normal characters.  */
1253           if (to + 1 < toend
1254               && mri_state == NULL
1255 #if defined TC_ARM && defined OBJ_ELF
1256               && symver_state == NULL
1257 #endif
1258               )
1259             {
1260               char *s;
1261               int len;
1262
1263               for (s = from; s < fromend; s++)
1264                 {
1265                   int type;
1266
1267                   ch2 = *(unsigned char *) s;
1268                   type = lex[ch2];
1269                   if (type != 0
1270                       && type != LEX_IS_SYMBOL_COMPONENT)
1271                     break;
1272                 }
1273
1274               if (s > from)
1275                 /* Handle the last character normally, for
1276                    simplicity.  */
1277                 --s;
1278
1279               len = s - from;
1280
1281               if (len > (toend - to) - 1)
1282                 len = (toend - to) - 1;
1283
1284               if (len > 0)
1285                 {
1286                   PUT (ch);
1287                   if (len > 8)
1288                     {
1289                       memcpy (to, from, len);
1290                       to += len;
1291                       from += len;
1292                     }
1293                   else
1294                     {
1295                       switch (len)
1296                         {
1297                         case 8: *to++ = *from++;
1298                         case 7: *to++ = *from++;
1299                         case 6: *to++ = *from++;
1300                         case 5: *to++ = *from++;
1301                         case 4: *to++ = *from++;
1302                         case 3: *to++ = *from++;
1303                         case 2: *to++ = *from++;
1304                         case 1: *to++ = *from++;
1305                         }
1306                     }
1307                   ch = GET ();
1308                 }
1309             }
1310
1311           /* Fall through.  */
1312         default:
1313         de_fault:
1314           /* Some relatively `normal' character.  */
1315           if (state == 0)
1316             {
1317               if (IS_SYMBOL_COMPONENT (ch))
1318                 state = 11;     /* Now seeing label definition.  */
1319             }
1320           else if (state == 1)
1321             {
1322               if (IS_SYMBOL_COMPONENT (ch))
1323                 state = 2;      /* Ditto.  */
1324             }
1325           else if (state == 9)
1326             {
1327               if (!IS_SYMBOL_COMPONENT (ch))
1328                 state = 3;
1329             }
1330           else if (state == 10)
1331             {
1332               if (ch == '\\')
1333                 {
1334                   /* Special handling for backslash: a backslash may
1335                      be the beginning of a formal parameter (of a
1336                      macro) following another symbol character, with
1337                      whitespace in between.  If that is the case, we
1338                      output a space before the parameter.  Strictly
1339                      speaking, correct handling depends upon what the
1340                      macro parameter expands into; if the parameter
1341                      expands into something which does not start with
1342                      an operand character, then we don't want to keep
1343                      the space.  We don't have enough information to
1344                      make the right choice, so here we are making the
1345                      choice which is more likely to be correct.  */
1346                   PUT (' ');
1347                 }
1348
1349               state = 3;
1350             }
1351           PUT (ch);
1352           break;
1353         }
1354     }
1355
1356   /*NOTREACHED*/
1357
1358  fromeof:
1359   /* We have reached the end of the input.  */
1360   return to - tostart;
1361
1362  tofull:
1363   /* The output buffer is full.  Save any input we have not yet
1364      processed.  */
1365   if (fromend > from)
1366     {
1367       saved_input = from;
1368       saved_input_len = fromend - from;
1369     }
1370   else
1371     saved_input = NULL;
1372
1373   return to - tostart;
1374 }
1375