gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
   3    1999, 2000, 2001, 2002, 2003, 2005, 2006, 2007, 2008, 2009, 2010, 2012
   4    Free Software Foundation, Inc.
   5
   6    This file is part of GAS, the GNU Assembler.
   7
   8    GAS is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 3, or (at your option)
  11    any later version.
  12
  13    GAS is distributed in the hope that it will be useful, but WITHOUT
  14    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  15    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
  16    License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with GAS; see the file COPYING.  If not, write to the Free
  20    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
  21    02110-1301, USA.  */
  22
  23 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
  24 /* App, the assembler pre-processor.  This pre-processor strips out
  25    excess spaces, turns single-quoted characters into a decimal
  26    constant, and turns the # in # <number> <filename> <garbage> into a
  27    .linefile.  This needs better error-handling.  */
  28
  29 #include "as.h"
  30
  31 #if (__STDC__ != 1)  /* __STDC__ is not 1: make `const' expand to nothing unless it is already a macro.  */
  32 #ifndef const
  33 #define const  /* empty */
  34 #endif
  35 #endif
  36
  37 #ifdef H_TICK_HEX
  38 int enable_h_tick_hex = 0;  /* Nonzero makes do_scrub_begin classify 'h' and 'H' as LEX_IS_H.  */
  39 #endif
  40
  41 #ifdef TC_M68K
  42 /* Whether we are scrubbing in m68k MRI mode.  This is different from
  43    flag_m68k_mri, because the two flags will be affected by the .mri
  44    pseudo-op at different times.  */
  45 static int scrub_m68k_mri;
  46
  47 /* The pseudo-op which switches in and out of MRI mode.  See the
  48    comment in do_scrub_chars.  The matcher there lets the space stand
  49    for any whitespace character and the '0' also accept '1'.  */
  49 static const char mri_pseudo[] = ".mri 0";
  50 #else
  51 #define scrub_m68k_mri 0  /* TC_M68K is not defined: every MRI test folds to constant 0.  */
  52 #endif
  53
  54 #if defined TC_ARM && defined OBJ_ELF
  55 /* The pseudo-op for which we need to special-case `@' characters.
  56    See the comment in do_scrub_chars.  */
  57 static const char   symver_pseudo[] = ".symver";
  58 static const char * symver_state;  /* Next character of symver_pseudo that do_scrub_chars expects, or NULL when no match is in progress.  */
  59 #endif
  60
  61 static char lex[256];  /* Lexical class (a LEX_IS_* value, 0 if none) of each input byte; filled in by do_scrub_begin.  */
  62 static const char symbol_chars[] =  /* Characters do_scrub_begin marks LEX_IS_SYMBOL_COMPONENT before the comment and separator sets are applied.  */
  63 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  64
  65 #define LEX_IS_SYMBOL_COMPONENT         1  /* May be part of a symbol (symbol_chars, bytes 128..255, tc_symbol_chars).  */
  66 #define LEX_IS_WHITESPACE               2  /* Space, tab or carriage return.  */
  67 #define LEX_IS_LINE_SEPARATOR           3  /* Member of line_separator_chars.  */
  68 #define LEX_IS_COMMENT_START            4  /* Member of tc_comment_chars (and ';' in m68k MRI mode).  */
  69 #define LEX_IS_LINE_COMMENT_START       5  /* Member of line_comment_chars (and '*', '!' in m68k MRI mode).  */
  70 #define LEX_IS_TWOCHAR_COMMENT_1ST      6  /* '/', but only when no other class claimed it: may open a slash-star comment.  */
  71 #define LEX_IS_STRINGQUOTE              8  /* Delimits a string.  Note that 7 is not assigned to any class here.  */
  72 #define LEX_IS_COLON                    9  /* ':'.  */
  73 #define LEX_IS_NEWLINE                  10  /* '\n'.  */
  74 #define LEX_IS_ONECHAR_QUOTE            11  /* Single quote, except on TC_HPPA and TC_I370 or when it delimits strings.  */
  75 #ifdef TC_V850
  76 #define LEX_IS_DOUBLEDASH_1ST           12  /* '-': a second dash would start a comment (state 12).  */
  77 #endif
  78 #ifdef TC_M32R
  79 #define DOUBLEBAR_PARALLEL  /* TC_M32R turns on the "||" handling (state 13).  */
  80 #endif
  81 #ifdef DOUBLEBAR_PARALLEL
  82 #define LEX_IS_DOUBLEBAR_1ST            13  /* '|': a second bar makes a parallel separator.  */
  83 #endif
  84 #define LEX_IS_PARALLEL_SEPARATOR       14  /* Member of tc_parallel_separator_chars.  */
  85 #ifdef H_TICK_HEX
  86 #define LEX_IS_H                        15  /* 'h' or 'H', only while enable_h_tick_hex is nonzero.  */
  87 #endif
  88 #define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)  /* These predicates index lex[] directly, so C must be 0..255 and never EOF.  */
  89 #define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
  90 #define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
  91 #define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
  92 #define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
  93 #define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
  94 #define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)
  95
  96 static int process_escape (int);  /* Maps the letter after a backslash to the character it denotes; defined after app_pop.  */
  97
  98 /* FIXME-soon: The entire lexer/parser thingy should be
  99    built statically at compile time rather than dynamically
 100    each and every time the assembler is run.  xoxorich.  */
 101
 102 void
 103 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
 104 {
 105   const char *p;
 106   int c;
 107
 108   lex[' '] = LEX_IS_WHITESPACE;
 109   lex['\t'] = LEX_IS_WHITESPACE;
 110   lex['\r'] = LEX_IS_WHITESPACE;
 111   lex['\n'] = LEX_IS_NEWLINE;
 112   lex[':'] = LEX_IS_COLON;
 113
 114 #ifdef TC_M68K
 115   scrub_m68k_mri = m68k_mri;
 116
 117   if (! m68k_mri)
 118 #endif
 119     {
 120       lex['"'] = LEX_IS_STRINGQUOTE;
 121
 122 #if ! defined (TC_HPPA) && ! defined (TC_I370)
 123       /* I370 uses single-quotes to delimit integer, float constants.  */
 124       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 125 #endif
 126
 127 #ifdef SINGLE_QUOTE_STRINGS
 128       lex['\''] = LEX_IS_STRINGQUOTE;
 129 #endif
 130     }
 131
 132   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 133      in state 5 of do_scrub_chars must be changed.  */
 134
 135   /* Note that these override the previous defaults, e.g. if ';' is a
 136      comment char, then it isn't a line separator.  */
 137   for (p = symbol_chars; *p; ++p)
 138     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 139
 140   for (c = 128; c < 256; ++c)
 141     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 142
 143 #ifdef tc_symbol_chars
 144   /* This macro permits the processor to specify all characters which
 145      may appears in an operand.  This will prevent the scrubber from
 146      discarding meaningful whitespace in certain cases.  The i386
 147      backend uses this to support prefixes, which can confuse the
 148      scrubber as to whether it is parsing operands or opcodes.  */
 149   for (p = tc_symbol_chars; *p; ++p)
 150     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 151 #endif
 152
 153   /* The m68k backend wants to be able to change comment_chars.  */
 154 #ifndef tc_comment_chars
 155 #define tc_comment_chars comment_chars
 156 #endif
 157   for (p = tc_comment_chars; *p; p++)
 158     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 159
 160   for (p = line_comment_chars; *p; p++)
 161     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 162
 163   for (p = line_separator_chars; *p; p++)
 164     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 165
 166 #ifdef tc_parallel_separator_chars
 167   /* This macro permits the processor to specify all characters which
 168      separate parallel insns on the same line.  */
 169   for (p = tc_parallel_separator_chars; *p; p++)
 170     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 171 #endif
 172
 173   /* Only allow slash-star comments if slash is not in use.
 174      FIXME: This isn't right.  We should always permit them.  */
 175   if (lex['/'] == 0)
 176     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 177
 178 #ifdef TC_M68K
 179   if (m68k_mri)
 180     {
 181       lex['\''] = LEX_IS_STRINGQUOTE;
 182       lex[';'] = LEX_IS_COMMENT_START;
 183       lex['*'] = LEX_IS_LINE_COMMENT_START;
 184       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 185          then it can't be used in an expression.  */
 186       lex['!'] = LEX_IS_LINE_COMMENT_START;
 187     }
 188 #endif
 189
 190 #ifdef TC_V850
 191   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 192 #endif
 193 #ifdef DOUBLEBAR_PARALLEL
 194   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 195 #endif
 196 #ifdef TC_D30V
 197   /* Must do this is we want VLIW instruction with "->" or "<-".  */
 198   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 199 #endif
 200
 201 #ifdef H_TICK_HEX
 202   if (enable_h_tick_hex)
 203     {
 204       lex['h'] = LEX_IS_H;
 205       lex['H'] = LEX_IS_H;
 206     }
 207 #endif
 208 }
 209
 210 /* Saved state of the scrubber.  */
 211 static int state;  /* Current do_scrub_chars state; see the state table in that function.  */
 212 static int old_state;  /* State to return to when a transient state (-1, -2 or 5) is finished.  */
 213 static char *out_string;  /* Next character to emit while in state -1.  */
 214 static char out_buf[20];  /* NOTE(review): presumably backing storage for out_string; its use is not visible in this part of the file.  */
 215 static int add_newlines;  /* Incremented in state 6 each time a backslash-newline is folded into a string.  */
 216 static char *saved_input;  /* Input do_scrub_chars resumes from on entry, or NULL to fetch fresh input.  */
 217 static size_t saved_input_len;  /* Number of bytes at saved_input; only read when saved_input is not NULL.  */
 218 static char input_buffer[32 * 1024];  /* Buffer handed to the `get' callback of do_scrub_chars.  */
 219 static const char *mri_state;  /* Next character of mri_pseudo to match, or NULL when no match is in progress.  */
 220 static char mri_last_ch;  /* Most recent input character accepted by the mri_pseudo matcher.  */
 221
 222 /* Data structure for saving the state of app across #include's.  Note that
 223    app is called asynchronously to the parsing of the .include's, so our
 224    state at the time .include is interpreted is completely unrelated.
 225    That's why we have to save it all.  */
 226
 227 struct app_save  /* One saved copy of the scrubber state; filled in by app_push and consumed by app_pop.  */
 228 {
 229   int          state;  /* Each member holds the file-scope variable of the same name.  */
 230   int          old_state;
 231   char *       out_string;
 232   char         out_buf[sizeof (out_buf)];  /* Sized from the file-scope out_buf.  */
 233   int          add_newlines;
 234   char *       saved_input;  /* Heap copy made by app_push and freed by app_pop, or NULL.  */
 235   size_t       saved_input_len;  /* Length of that copy; app_pop reads it only when saved_input is not NULL.  */
 236 #ifdef TC_M68K
 237   int          scrub_m68k_mri;
 238 #endif
 239   const char * mri_state;
 240   char         mri_last_ch;
 241 #if defined TC_ARM && defined OBJ_ELF
 242   const char * symver_state;
 243 #endif
 244 };
 245
 246 char *
 247 app_push (void)
 248 {
 249   register struct app_save *saved;
 250
 251   saved = (struct app_save *) xmalloc (sizeof (*saved));
 252   saved->state = state;
 253   saved->old_state = old_state;
 254   saved->out_string = out_string;
 255   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 256   saved->add_newlines = add_newlines;
 257   if (saved_input == NULL)
 258     saved->saved_input = NULL;
 259   else
 260     {
 261       saved->saved_input = (char *) xmalloc (saved_input_len);
 262       memcpy (saved->saved_input, saved_input, saved_input_len);
 263       saved->saved_input_len = saved_input_len;
 264     }
 265 #ifdef TC_M68K
 266   saved->scrub_m68k_mri = scrub_m68k_mri;
 267 #endif
 268   saved->mri_state = mri_state;
 269   saved->mri_last_ch = mri_last_ch;
 270 #if defined TC_ARM && defined OBJ_ELF
 271   saved->symver_state = symver_state;
 272 #endif
 273
 274   /* do_scrub_begin() is not useful, just wastes time.  */
 275
 276   state = 0;
 277   saved_input = NULL;
 278   add_newlines = 0;
 279
 280   return (char *) saved;
 281 }
 282
 283 void
 284 app_pop (char *arg)
 285 {
 286   register struct app_save *saved = (struct app_save *) arg;
 287
 288   /* There is no do_scrub_end ().  */
 289   state = saved->state;
 290   old_state = saved->old_state;
 291   out_string = saved->out_string;
 292   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 293   add_newlines = saved->add_newlines;
 294   if (saved->saved_input == NULL)
 295     saved_input = NULL;
 296   else
 297     {
 298       gas_assert (saved->saved_input_len <= sizeof (input_buffer));
 299       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 300       saved_input = input_buffer;
 301       saved_input_len = saved->saved_input_len;
 302       free (saved->saved_input);
 303     }
 304 #ifdef TC_M68K
 305   scrub_m68k_mri = saved->scrub_m68k_mri;
 306 #endif
 307   mri_state = saved->mri_state;
 308   mri_last_ch = saved->mri_last_ch;
 309 #if defined TC_ARM && defined OBJ_ELF
 310   symver_state = saved->symver_state;
 311 #endif
 312
 313   free (arg);
 314 }
 315
 316 /* @@ This assumes that \n &c are the same on host and target.  This is not
 317    necessarily true.  */
 318
 319 static int
 320 process_escape (int ch)
 321 {
 322   switch (ch)
 323     {
 324     case 'b':
 325       return '\b';
 326     case 'f':
 327       return '\f';
 328     case 'n':
 329       return '\n';
 330     case 'r':
 331       return '\r';
 332     case 't':
 333       return '\t';
 334     case '\'':
 335       return '\'';
 336     case '"':
 337       return '\"';
 338     default:
 339       return ch;
 340     }
 341 }
 342
 343 /* This function is called to process input characters.  The GET
 344    parameter is used to retrieve more input characters.  GET should
 345    set its parameter to point to a buffer, and return the length of
 346    the buffer; it should return 0 at end of file.  The scrubbed output
 347    characters are put into the buffer starting at TOSTART; the TOSTART
 348    buffer is TOLEN bytes in length.  The function returns the number
 349    of scrubbed characters put into TOSTART.  This will be TOLEN unless
 350    end of file was seen.  This function is arranged as a state
 351    machine, and saves its state so that it may return at any point.
 352    This is the way the old code used to work.  */
 353
 354 size_t
 355 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
 356 {
 357   char *to = tostart;
 358   char *toend = tostart + tolen;
 359   char *from;
 360   char *fromend;
 361   size_t fromlen;
 362   register int ch, ch2 = 0;
 363   /* Character that started the string we're working on.  */
 364   static char quotechar;
 365
 366   /*State 0: beginning of normal line
 367           1: After first whitespace on line (flush more white)
 368           2: After first non-white (opcode) on line (keep 1white)
 369           3: after second white on line (into operands) (flush white)
 370           4: after putting out a .linefile, put out digits
 371           5: parsing a string, then go to old-state
 372           6: putting out \ escape in a "d string.
 373           7: no longer used
 374           8: no longer used
 375           9: After seeing symbol char in state 3 (keep 1white after symchar)
 376          10: After seeing whitespace in state 9 (keep white before symchar)
 377          11: After seeing a symbol character in state 0 (eg a label definition)
 378          -1: output string in out_string and go to the state in old_state
 379          -2: flush text until a '*' '/' is seen, then go to state old_state
 380 #ifdef TC_V850
 381          12: After seeing a dash, looking for a second dash as a start
 382              of comment.
 383 #endif
 384 #ifdef DOUBLEBAR_PARALLEL
 385          13: After seeing a vertical bar, looking for a second
 386              vertical bar as a parallel expression separator.
 387 #endif
 388 #ifdef TC_PREDICATE_START_CHAR
 389          14: After seeing a predicate start character at state 0, looking
 390              for a predicate end character as predicate.
 391          15: After seeing a predicate start character at state 1, looking
 392              for a predicate end character as predicate.
 393 #endif
 394 #ifdef TC_Z80
 395          16: After seeing an 'a' or an 'A' at the start of a symbol
 396          17: After seeing an 'f' or an 'F' in state 16
 397 #endif
 398           */
 399
 400   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 401      constructs like ``.loc 1 20''.  This was turning into ``.loc
 402      120''.  States 9 and 10 ensure that a space is never dropped in
 403      between characters which could appear in an identifier.  Ian
 404      Taylor, ian@cygnus.com.
 405
 406      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 407      correctly on the PA (and any other target where colons are optional).
 408      Jeff Law, law@cs.utah.edu.
 409
 410      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 411      get squashed into "cmp r1,r2||trap#1", with the all important space
 412      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 413
 414   /* This macro gets the next input character.  */
 415
 416 #define GET()                                                   \
 417   (from < fromend                                               \
 418    ? * (unsigned char *) (from++)                               \
 419    : (saved_input = NULL,                                       \
 420       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 421       from = input_buffer,                                      \
 422       fromend = from + fromlen,                                 \
 423       (fromlen == 0                                             \
 424        ? EOF                                                    \
 425        : * (unsigned char *) (from++))))
 426
 427   /* This macro pushes a character back on the input stream.  */
 428
 429 #define UNGET(uch) (*--from = (uch))
 430
 431   /* This macro puts a character into the output buffer.  If this
 432      character fills the output buffer, this macro jumps to the label
 433      TOFULL.  We use this rather ugly approach because we need to
 434      handle two different termination conditions: EOF on the input
 435      stream, and a full output buffer.  It would be simpler if we
 436      always read in the entire input stream before processing it, but
 437      I don't want to make such a significant change to the assembler's
 438      memory usage.  */
 439
 440 #define PUT(pch)                                \
 441   do                                            \
 442     {                                           \
 443       *to++ = (pch);                            \
 444       if (to >= toend)                          \
 445         goto tofull;                            \
 446     }                                           \
 447   while (0)
 448
 449   if (saved_input != NULL)
 450     {
 451       from = saved_input;
 452       fromend = from + saved_input_len;
 453     }
 454   else
 455     {
 456       fromlen = (*get) (input_buffer, sizeof input_buffer);
 457       if (fromlen == 0)
 458         return 0;
 459       from = input_buffer;
 460       fromend = from + fromlen;
 461     }
 462
 463   while (1)
 464     {
 465       /* The cases in this switch end with continue, in order to
 466          branch back to the top of this while loop and generate the
 467          next output character in the appropriate state.  */
 468       switch (state)
 469         {
 470         case -1:
 471           ch = *out_string++;
 472           if (*out_string == '\0')
 473             {
 474               state = old_state;
 475               old_state = 3;
 476             }
 477           PUT (ch);
 478           continue;
 479
 480         case -2:
 481           for (;;)
 482             {
 483               do
 484                 {
 485                   ch = GET ();
 486
 487                   if (ch == EOF)
 488                     {
 489                       as_warn (_("end of file in comment"));
 490                       goto fromeof;
 491                     }
 492
 493                   if (ch == '\n')
 494                     PUT ('\n');
 495                 }
 496               while (ch != '*');
 497
 498               while ((ch = GET ()) == '*')
 499                 ;
 500
 501               if (ch == EOF)
 502                 {
 503                   as_warn (_("end of file in comment"));
 504                   goto fromeof;
 505                 }
 506
 507               if (ch == '/')
 508                 break;
 509
 510               UNGET (ch);
 511             }
 512
 513           state = old_state;
 514           UNGET (' ');
 515           continue;
 516
 517         case 4:
 518           ch = GET ();
 519           if (ch == EOF)
 520             goto fromeof;
 521           else if (ch >= '0' && ch <= '9')
 522             PUT (ch);
 523           else
 524             {
 525               while (ch != EOF && IS_WHITESPACE (ch))
 526                 ch = GET ();
 527               if (ch == '"')
 528                 {
 529                   quotechar = ch;
 530                   state = 5;
 531                   old_state = 3;
 532                   PUT (ch);
 533                 }
 534               else
 535                 {
 536                   while (ch != EOF && ch != '\n')
 537                     ch = GET ();
 538                   state = 0;
 539                   PUT (ch);
 540                 }
 541             }
 542           continue;
 543
 544         case 5:
 545           /* We are going to copy everything up to a quote character,
 546              with special handling for a backslash.  We try to
 547              optimize the copying in the simple case without using the
 548              GET and PUT macros.  */
 549           {
 550             char *s;
 551             ptrdiff_t len;
 552
 553             for (s = from; s < fromend; s++)
 554               {
 555                 ch = *s;
 556                 if (ch == '\\'
 557                     || ch == quotechar
 558                     || ch == '\n')
 559                   break;
 560               }
 561             len = s - from;
 562             if (len > toend - to)
 563               len = toend - to;
 564             if (len > 0)
 565               {
 566                 memcpy (to, from, len);
 567                 to += len;
 568                 from += len;
 569                 if (to >= toend)
 570                   goto tofull;
 571               }
 572           }
 573
 574           ch = GET ();
 575           if (ch == EOF)
 576             {
 577               /* This buffer is here specifically so
 578                  that the UNGET below will work.  */
 579               static char one_char_buf[1];
 580
 581               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 582               state = old_state;
 583               from = fromend = one_char_buf + 1;
 584               fromlen = 1;
 585               UNGET ('\n');
 586               PUT (quotechar);
 587             }
 588           else if (ch == quotechar)
 589             {
 590               state = old_state;
 591               PUT (ch);
 592             }
 593 #ifndef NO_STRING_ESCAPES
 594           else if (ch == '\\')
 595             {
 596               state = 6;
 597               PUT (ch);
 598             }
 599 #endif
 600           else if (scrub_m68k_mri && ch == '\n')
 601             {
 602               /* Just quietly terminate the string.  This permits lines like
 603                    bne  label   loop if we haven't reach end yet.  */
 604               state = old_state;
 605               UNGET (ch);
 606               PUT ('\'');
 607             }
 608           else
 609             {
 610               PUT (ch);
 611             }
 612           continue;
 613
 614         case 6:
 615           state = 5;
 616           ch = GET ();
 617           switch (ch)
 618             {
 619               /* Handle strings broken across lines, by turning '\n' into
 620                  '\\' and 'n'.  */
 621             case '\n':
 622               UNGET ('n');
 623               add_newlines++;
 624               PUT ('\\');
 625               continue;
 626
 627             case EOF:
 628               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 629               PUT (quotechar);
 630               continue;
 631
 632             case '"':
 633             case '\\':
 634             case 'b':
 635             case 'f':
 636             case 'n':
 637             case 'r':
 638             case 't':
 639             case 'v':
 640             case 'x':
 641             case 'X':
 642             case '0':
 643             case '1':
 644             case '2':
 645             case '3':
 646             case '4':
 647             case '5':
 648             case '6':
 649             case '7':
 650               break;
 651
 652             default:
 653 #ifdef ONLY_STANDARD_ESCAPES
 654               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 655 #endif
 656               break;
 657             }
 658           PUT (ch);
 659           continue;
 660
 661 #ifdef DOUBLEBAR_PARALLEL
 662         case 13:
 663           ch = GET ();
 664           if (ch != '|')
 665             abort ();
 666
 667           /* Reset back to state 1 and pretend that we are parsing a
 668              line from just after the first white space.  */
 669           state = 1;
 670           PUT ('|');
 671 #ifdef TC_TIC6X
 672           /* "||^" is used for SPMASKed instructions.  */
 673           ch = GET ();
 674           if (ch == EOF)
 675             goto fromeof;
 676           else if (ch == '^')
 677             PUT ('^');
 678           else
 679             UNGET (ch);
 680 #endif
 681           continue;
 682 #endif
 683 #ifdef TC_Z80
 684         case 16:
 685           /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
 686           ch = GET ();
 687           if (ch == 'f' || ch == 'F')
 688             {
 689               state = 17;
 690               PUT (ch);
 691             }
 692           else
 693             {
 694               state = 9;
 695               break;
 696             }
 697         case 17:
 698           /* We have seen "af" at the start of a symbol,
 699              a ' here is a part of that symbol.  */
 700           ch = GET ();
 701           state = 9;
 702           if (ch == '\'')
 703             /* Change to avoid warning about unclosed string.  */
 704             PUT ('`');
 705           else if (ch != EOF)
 706             UNGET (ch);
 707           break;
 708 #endif
 709         }
 710
 711       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
 712
 713       /* flushchar: */
 714       ch = GET ();
 715
 716 #ifdef TC_PREDICATE_START_CHAR
 717       if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
 718         {
 719           state += 14;
 720           PUT (ch);
 721           continue;
 722         }
 723       else if (state == 14 || state == 15)
 724         {
 725           if (ch == TC_PREDICATE_END_CHAR)
 726             {
 727               state -= 14;
 728               PUT (ch);
 729               ch = GET ();
 730             }
 731           else
 732             {
 733               PUT (ch);
 734               continue;
 735             }
 736         }
 737 #endif
 738
 739     recycle:
 740
 741 #if defined TC_ARM && defined OBJ_ELF
 742       /* We need to watch out for .symver directives.  See the comment later
 743          in this function.  */
 744       if (symver_state == NULL)
 745         {
 746           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 747             symver_state = symver_pseudo + 1;
 748         }
 749       else
 750         {
 751           /* We advance to the next state if we find the right
 752              character.  */
 753           if (ch != '\0' && (*symver_state == ch))
 754             ++symver_state;
 755           else if (*symver_state != '\0')
 756             /* We did not get the expected character, or we didn't
 757                get a valid terminating character after seeing the
 758                entire pseudo-op, so we must go back to the beginning.  */
 759             symver_state = NULL;
 760           else
 761             {
 762               /* We've read the entire pseudo-op.  If this is the end
 763                  of the line, go back to the beginning.  */
 764               if (IS_NEWLINE (ch))
 765                 symver_state = NULL;
 766             }
 767         }
 768 #endif /* TC_ARM && OBJ_ELF */
 769
 770 #ifdef TC_M68K
 771       /* We want to have pseudo-ops which control whether we are in
 772          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 773          the scrubber, that means that we need a special purpose
 774          recognizer here.  */
 775       if (mri_state == NULL)
 776         {
 777           if ((state == 0 || state == 1)
 778               && ch == mri_pseudo[0])
 779             mri_state = mri_pseudo + 1;
 780         }
 781       else
 782         {
 783           /* We advance to the next state if we find the right
 784              character, or if we need a space character and we get any
 785              whitespace character, or if we need a '0' and we get a
 786              '1' (this is so that we only need one state to handle
 787              ``.mri 0'' and ``.mri 1'').  */
 788           if (ch != '\0'
 789               && (*mri_state == ch
 790                   || (*mri_state == ' '
 791                       && lex[ch] == LEX_IS_WHITESPACE)
 792                   || (*mri_state == '0'
 793                       && ch == '1')))
 794             {
 795               mri_last_ch = ch;
 796               ++mri_state;
 797             }
 798           else if (*mri_state != '\0'
 799                    || (lex[ch] != LEX_IS_WHITESPACE
 800                        && lex[ch] != LEX_IS_NEWLINE))
 801             {
 802               /* We did not get the expected character, or we didn't
 803                  get a valid terminating character after seeing the
 804                  entire pseudo-op, so we must go back to the
 805                  beginning.  */
 806               mri_state = NULL;
 807             }
 808           else
 809             {
 810               /* We've read the entire pseudo-op.  mri_last_ch is
 811                  either '0' or '1', indicating whether to leave or
 812                  enter MRI mode respectively.  */
 813               do_scrub_begin (mri_last_ch == '1');
 814               mri_state = NULL;
 815
 816               /* We continue handling the character as usual.  The
 817                  main gas reader must also handle the .mri pseudo-op
 818                  to control expression parsing and the like.  */
 819             }
 820         }
 821 #endif
 822
 823       if (ch == EOF)
 824         {
 825           if (state != 0)
 826             {
 827               as_warn (_("end of file not at end of a line; newline inserted"));
 828               state = 0;
 829               PUT ('\n');
 830             }
 831           goto fromeof;
 832         }
 833
 834       switch (lex[ch])
 835         {
 836         case LEX_IS_WHITESPACE:
 837           do
 838             {
 839               ch = GET ();
 840             }
 841           while (ch != EOF && IS_WHITESPACE (ch));
 842           if (ch == EOF)
 843             goto fromeof;
 844
 845           if (state == 0)
 846             {
 847               /* Preserve a single whitespace character at the
 848                  beginning of a line.  */
 849               state = 1;
 850               UNGET (ch);
 851               PUT (' ');
 852               break;
 853             }
 854
 855 #ifdef KEEP_WHITE_AROUND_COLON
 856           if (lex[ch] == LEX_IS_COLON)
 857             {
 858               /* Only keep this white if there's no white *after* the
 859                  colon.  */
 860               ch2 = GET ();
 861               if (ch2 != EOF)
 862                 UNGET (ch2);
 863               if (!IS_WHITESPACE (ch2))
 864                 {
 865                   state = 9;
 866                   UNGET (ch);
 867                   PUT (' ');
 868                   break;
 869                 }
 870             }
 871 #endif
 872           if (IS_COMMENT (ch)
 873               || ch == '/'
 874               || IS_LINE_SEPARATOR (ch)
 875               || IS_PARALLEL_SEPARATOR (ch))
 876             {
 877               if (scrub_m68k_mri)
 878                 {
 879                   /* In MRI mode, we keep these spaces.  */
 880                   UNGET (ch);
 881                   PUT (' ');
 882                   break;
 883                 }
 884               goto recycle;
 885             }
 886
 887           /* If we're in state 2 or 11, we've seen a non-white
 888              character followed by whitespace.  If the next character
 889              is ':', this is whitespace after a label name which we
 890              normally must ignore.  In MRI mode, though, spaces are
 891              not permitted between the label and the colon.  */
 892           if ((state == 2 || state == 11)
 893               && lex[ch] == LEX_IS_COLON
 894               && ! scrub_m68k_mri)
 895             {
 896               state = 1;
 897               PUT (ch);
 898               break;
 899             }
 900
 901           switch (state)
 902             {
 903             case 1:
 904               /* We can arrive here if we leave a leading whitespace
 905                  character at the beginning of a line.  */
 906               goto recycle;
 907             case 2:
 908               state = 3;
 909               if (to + 1 < toend)
 910                 {
 911                   /* Optimize common case by skipping UNGET/GET.  */
 912                   PUT (' ');    /* Sp after opco */
 913                   goto recycle;
 914                 }
 915               UNGET (ch);
 916               PUT (' ');
 917               break;
 918             case 3:
 919 #ifndef TC_KEEP_OPERAND_SPACES
 920               /* For TI C6X, we keep these spaces as they may separate
 921                  functional unit specifiers from operands.  */
 922               if (scrub_m68k_mri)
 923 #endif
 924                 {
 925                   /* In MRI mode, we keep these spaces.  */
 926                   UNGET (ch);
 927                   PUT (' ');
 928                   break;
 929                 }
 930               goto recycle;     /* Sp in operands */
 931             case 9:
 932             case 10:
 933 #ifndef TC_KEEP_OPERAND_SPACES
 934               if (scrub_m68k_mri)
 935 #endif
 936                 {
 937                   /* In MRI mode, we keep these spaces.  */
 938                   state = 3;
 939                   UNGET (ch);
 940                   PUT (' ');
 941                   break;
 942                 }
 943               state = 10;       /* Sp after symbol char */
 944               goto recycle;
 945             case 11:
 946               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
 947                 state = 1;
 948               else
 949                 {
 950                   /* We know that ch is not ':', since we tested that
 951                      case above.  Therefore this is not a label, so it
 952                      must be the opcode, and we've just seen the
 953                      whitespace after it.  */
 954                   state = 3;
 955                 }
 956               UNGET (ch);
 957               PUT (' ');        /* Sp after label definition.  */
 958               break;
 959             default:
 960               BAD_CASE (state);
 961             }
 962           break;
 963
 964         case LEX_IS_TWOCHAR_COMMENT_1ST:
 965           ch2 = GET ();
 966           if (ch2 == '*')
 967             {
 968               for (;;)
 969                 {
 970                   do
 971                     {
 972                       ch2 = GET ();
 973                       if (ch2 != EOF && IS_NEWLINE (ch2))
 974                         add_newlines++;
 975                     }
 976                   while (ch2 != EOF && ch2 != '*');
 977
 978                   while (ch2 == '*')
 979                     ch2 = GET ();
 980
 981                   if (ch2 == EOF || ch2 == '/')
 982                     break;
 983
 984                   /* This UNGET will ensure that we count newlines
 985                      correctly.  */
 986                   UNGET (ch2);
 987                 }
 988
 989               if (ch2 == EOF)
 990                 as_warn (_("end of file in multiline comment"));
 991
 992               ch = ' ';
 993               goto recycle;
 994             }
 995 #ifdef DOUBLESLASH_LINE_COMMENTS
 996           else if (ch2 == '/')
 997             {
 998               do
 999                 {
1000                   ch = GET ();
1001                 }
1002               while (ch != EOF && !IS_NEWLINE (ch));
1003               if (ch == EOF)
1004                 as_warn ("end of file in comment; newline inserted");
1005               state = 0;
1006               PUT ('\n');
1007               break;
1008             }
1009 #endif
1010           else
1011             {
1012               if (ch2 != EOF)
1013                 UNGET (ch2);
1014               if (state == 9 || state == 10)
1015                 state = 3;
1016               PUT (ch);
1017             }
1018           break;
1019
1020         case LEX_IS_STRINGQUOTE:
1021           quotechar = ch;
1022           if (state == 10)
1023             {
1024               /* Preserve the whitespace in foo "bar".  */
1025               UNGET (ch);
1026               state = 3;
1027               PUT (' ');
1028
1029               /* PUT didn't jump out.  We could just break, but we
1030                  know what will happen, so optimize a bit.  */
1031               ch = GET ();
1032               old_state = 3;
1033             }
1034           else if (state == 9)
1035             old_state = 3;
1036           else
1037             old_state = state;
1038           state = 5;
1039           PUT (ch);
1040           break;
1041
1042 #ifndef IEEE_STYLE
1043         case LEX_IS_ONECHAR_QUOTE:
1044 #ifdef H_TICK_HEX
1045           if (state == 9 && enable_h_tick_hex)
1046             {
1047               char c;
1048
1049               c = GET ();
1050               as_warn ("'%c found after symbol", c);
1051               UNGET (c);
1052             }
1053 #endif
1054           if (state == 10)
1055             {
1056               /* Preserve the whitespace in foo 'b'.  */
1057               UNGET (ch);
1058               state = 3;
1059               PUT (' ');
1060               break;
1061             }
1062           ch = GET ();
1063           if (ch == EOF)
1064             {
1065               as_warn (_("end of file after a one-character quote; \\0 inserted"));
1066               ch = 0;
1067             }
1068           if (ch == '\\')
1069             {
1070               ch = GET ();
1071               if (ch == EOF)
1072                 {
1073                   as_warn (_("end of file in escape character"));
1074                   ch = '\\';
1075                 }
1076               else
1077                 ch = process_escape (ch);
1078             }
1079           sprintf (out_buf, "%d", (int) (unsigned char) ch);
1080
1081           /* None of these 'x constants for us.  We want 'x'.  */
1082           if ((ch = GET ()) != '\'')
1083             {
1084 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1085               as_warn (_("missing close quote; (assumed)"));
1086 #else
1087               if (ch != EOF)
1088                 UNGET (ch);
1089 #endif
1090             }
1091           if (strlen (out_buf) == 1)
1092             {
1093               PUT (out_buf[0]);
1094               break;
1095             }
1096           if (state == 9)
1097             old_state = 3;
1098           else
1099             old_state = state;
1100           state = -1;
1101           out_string = out_buf;
1102           PUT (*out_string++);
1103           break;
1104 #endif
1105
1106         case LEX_IS_COLON:
1107 #ifdef KEEP_WHITE_AROUND_COLON
1108           state = 9;
1109 #else
1110           if (state == 9 || state == 10)
1111             state = 3;
1112           else if (state != 3)
1113             state = 1;
1114 #endif
1115           PUT (ch);
1116           break;
1117
1118         case LEX_IS_NEWLINE:
1119           /* Roll out a bunch of newlines from inside comments, etc.  */
1120           if (add_newlines)
1121             {
1122               --add_newlines;
1123               UNGET (ch);
1124             }
1125           /* Fall through.  */
1126
1127         case LEX_IS_LINE_SEPARATOR:
1128           state = 0;
1129           PUT (ch);
1130           break;
1131
1132         case LEX_IS_PARALLEL_SEPARATOR:
1133           state = 1;
1134           PUT (ch);
1135           break;
1136
1137 #ifdef TC_V850
1138         case LEX_IS_DOUBLEDASH_1ST:
1139           ch2 = GET ();
1140           if (ch2 != '-')
1141             {
1142               if (ch2 != EOF)
1143                 UNGET (ch2);
1144               goto de_fault;
1145             }
1146           /* Read and skip to end of line.  */
1147           do
1148             {
1149               ch = GET ();
1150             }
1151           while (ch != EOF && ch != '\n');
1152
1153           if (ch == EOF)
1154             as_warn (_("end of file in comment; newline inserted"));
1155
1156           state = 0;
1157           PUT ('\n');
1158           break;
1159 #endif
1160 #ifdef DOUBLEBAR_PARALLEL
1161         case LEX_IS_DOUBLEBAR_1ST:
1162           ch2 = GET ();
1163           if (ch2 != EOF)
1164             UNGET (ch2);
1165           if (ch2 != '|')
1166             goto de_fault;
1167
1168           /* Handle '||' in two states as invoking PUT twice might
1169              result in the first one jumping out of this loop.  We'd
1170              then lose track of the state and one '|' char.  */
1171           state = 13;
1172           PUT ('|');
1173           break;
1174 #endif
1175         case LEX_IS_LINE_COMMENT_START:
1176           /* FIXME-someday: The two character comment stuff was badly
1177              thought out.  On i386, we want '/' as line comment start
1178              AND we want C style comments.  hence this hack.  The
1179              whole lexical process should be reworked.  xoxorich.  */
1180           if (ch == '/')
1181             {
1182               ch2 = GET ();
1183               if (ch2 == '*')
1184                 {
1185                   old_state = 3;
1186                   state = -2;
1187                   break;
1188                 }
1189               else
1190                 {
1191                   UNGET (ch2);
1192                 }
1193             }
1194
1195           if (state == 0 || state == 1) /* Only comment at start of line.  */
1196             {
1197               int startch;
1198
1199               startch = ch;
1200
1201               do
1202                 {
1203                   ch = GET ();
1204                 }
1205               while (ch != EOF && IS_WHITESPACE (ch));
1206
1207               if (ch == EOF)
1208                 {
1209                   as_warn (_("end of file in comment; newline inserted"));
1210                   PUT ('\n');
1211                   break;
1212                 }
1213
1214               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1215                 {
1216                   /* Not a cpp line.  */
1217                   while (ch != EOF && !IS_NEWLINE (ch))
1218                     ch = GET ();
1219                   if (ch == EOF)
1220                     {
1221                       as_warn (_("end of file in comment; newline inserted"));
1222                       PUT ('\n');
1223                     }
1224                   else /* IS_NEWLINE (ch) */
1225                     {
1226                       /* To process non-zero add_newlines.  */
1227                       UNGET (ch);
1228                     }
1229                   state = 0;
1230                   break;
1231                 }
1232               /* Looks like `# 123 "filename"' from cpp.  */
1233               UNGET (ch);
1234               old_state = 4;
1235               state = -1;
1236               if (scrub_m68k_mri)
1237                 out_string = "\tlinefile ";
1238               else
1239                 out_string = "\t.linefile ";
1240               PUT (*out_string++);
1241               break;
1242             }
1243
1244 #ifdef TC_D10V
1245           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1246              Trap is the only short insn that has a first operand that is
1247              neither register nor label.
1248              We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 .
1249              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1250              already LEX_IS_LINE_COMMENT_START.  However, it is the
1251              only character in line_comment_chars for d10v, hence we
1252              can recognize it as such.  */
1253           /* An alternative approach would be to reset the state to 1 when
1254              we see '||', '<-' or '->', but that seems to be overkill.  */
1255           if (state == 10)
1256             PUT (' ');
1257 #endif
1258           /* We have a line comment character which is not at the
1259              start of a line.  If this is also a normal comment
1260              character, fall through.  Otherwise treat it as a default
1261              character.  */
1262           if (strchr (tc_comment_chars, ch) == NULL
1263               && (! scrub_m68k_mri
1264                   || (ch != '!' && ch != '*')))
1265             goto de_fault;
1266           if (scrub_m68k_mri
1267               && (ch == '!' || ch == '*' || ch == '#')
1268               && state != 1
1269               && state != 10)
1270             goto de_fault;
1271           /* Fall through.  */
1272         case LEX_IS_COMMENT_START:
1273 #if defined TC_ARM && defined OBJ_ELF
1274           /* On the ARM, `@' is the comment character.
1275              Unfortunately this is also a special character in ELF .symver
1276              directives (and .type, though we deal with those another way).
1277              So we check if this line is such a directive, and treat
1278              the character as default if so.  This is a hack.  */
1279           if ((symver_state != NULL) && (*symver_state == 0))
1280             goto de_fault;
1281 #endif
1282
1283 #ifdef TC_ARM
1284           /* For the ARM, care is needed not to damage occurrences of \@
1285              by stripping the @ onwards.  Yuck.  */
1286           if (to > tostart && *(to - 1) == '\\')
1287             /* Do not treat the @ as a start-of-comment.  */
1288             goto de_fault;
1289 #endif
1290
1291 #ifdef WARN_COMMENTS
1292           if (!found_comment)
1293             as_where (&found_comment_file, &found_comment);
1294 #endif
1295           do
1296             {
1297               ch = GET ();
1298             }
1299           while (ch != EOF && !IS_NEWLINE (ch));
1300           if (ch == EOF)
1301             as_warn (_("end of file in comment; newline inserted"));
1302           state = 0;
1303           PUT ('\n');
1304           break;
1305
1306 #ifdef H_TICK_HEX
1307         case LEX_IS_H:
1308           /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1309              the H' with 0x to make them gas-style hex characters.  */
1310           if (enable_h_tick_hex)
1311             {
1312               char quot;
1313
1314               quot = GET ();
1315               if (quot == '\'')
1316                 {
1317                   UNGET ('x');
1318                   ch = '0';
1319                 }
1320               else
1321                 UNGET (quot);
1322             }
1323           /* FALL THROUGH */
1324 #endif
1325
1326         case LEX_IS_SYMBOL_COMPONENT:
1327           if (state == 10)
1328             {
1329               /* This is a symbol character following another symbol
1330                  character, with whitespace in between.  We skipped
1331                  the whitespace earlier, so output it now.  */
1332               UNGET (ch);
1333               state = 3;
1334               PUT (' ');
1335               break;
1336             }
1337
1338 #ifdef TC_Z80
1339           /* "af'" is a symbol containing '\''.  */
1340           if (state == 3 && (ch == 'a' || ch == 'A'))
1341             {
1342               state = 16;
1343               PUT (ch);
1344               ch = GET ();
1345               if (ch == 'f' || ch == 'F')
1346                 {
1347                   state = 17;
1348                   PUT (ch);
1349                   break;
1350                 }
1351               else
1352                 {
1353                   state = 9;
1354                   if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1355                     {
1356                       if (ch != EOF)
1357                         UNGET (ch);
1358                       break;
1359                     }
1360                 }
1361             }
1362 #endif
1363           if (state == 3)
1364             state = 9;
1365
1366           /* This is a common case.  Quickly copy CH and all the
1367              following symbol component or normal characters.  */
1368           if (to + 1 < toend
1369               && mri_state == NULL
1370 #if defined TC_ARM && defined OBJ_ELF
1371               && symver_state == NULL
1372 #endif
1373               )
1374             {
1375               char *s;
1376               ptrdiff_t len;
1377
1378               for (s = from; s < fromend; s++)
1379                 {
1380                   int type;
1381
1382                   ch2 = *(unsigned char *) s;
1383                   type = lex[ch2];
1384                   if (type != 0
1385                       && type != LEX_IS_SYMBOL_COMPONENT)
1386                     break;
1387                 }
1388
1389               if (s > from)
1390                 /* Handle the last character normally, for
1391                    simplicity.  */
1392                 --s;
1393
1394               len = s - from;
1395
1396               if (len > (toend - to) - 1)
1397                 len = (toend - to) - 1;
1398
1399               if (len > 0)
1400                 {
1401                   PUT (ch);
1402                   memcpy (to, from, len);
1403                   to += len;
1404                   from += len;
1405                   if (to >= toend)
1406                     goto tofull;
1407                   ch = GET ();
1408                 }
1409             }
1410
1411           /* Fall through.  */
1412         default:
1413         de_fault:
1414           /* Some relatively `normal' character.  */
1415           if (state == 0)
1416             {
1417               state = 11;       /* Now seeing label definition.  */
1418             }
1419           else if (state == 1)
1420             {
1421               state = 2;        /* Ditto.  */
1422             }
1423           else if (state == 9)
1424             {
1425               if (!IS_SYMBOL_COMPONENT (ch))
1426                 state = 3;
1427             }
1428           else if (state == 10)
1429             {
1430               if (ch == '\\')
1431                 {
1432                   /* Special handling for backslash: a backslash may
1433                      be the beginning of a formal parameter (of a
1434                      macro) following another symbol character, with
1435                      whitespace in between.  If that is the case, we
1436                      output a space before the parameter.  Strictly
1437                      speaking, correct handling depends upon what the
1438                      macro parameter expands into; if the parameter
1439                      expands into something which does not start with
1440                      an operand character, then we don't want to keep
1441                      the space.  We don't have enough information to
1442                      make the right choice, so here we are making the
1443                      choice which is more likely to be correct.  */
1444                   if (to + 1 >= toend)
1445                     {
1446                       /* If we're near the end of the buffer, save the
1447                          character for the next time round.  Otherwise
1448                          we'll lose our state.  */
1449                       UNGET (ch);
1450                       goto tofull;
1451                     }
1452                   *to++ = ' ';
1453                 }
1454
1455               state = 3;
1456             }
1457           PUT (ch);
1458           break;
1459         }
1460     }
1461
1462   /*NOTREACHED*/
1463
1464  fromeof:
1465   /* We have reached the end of the input.  */
1466   return to - tostart;
1467
1468  tofull:
1469   /* The output buffer is full.  Save any input we have not yet
1470      processed.  */
1471   if (fromend > from)
1472     {
1473       saved_input = from;
1474       saved_input_len = fromend - from;
1475     }
1476   else
1477     saved_input = NULL;
1478
1479   return to - tostart;
1480 }