gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright (C) 1987-2017 Free Software Foundation, Inc.
   3
   4    This file is part of GAS, the GNU Assembler.
   5
   6    GAS is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3, or (at your option)
   9    any later version.
  10
  11    GAS is distributed in the hope that it will be useful, but WITHOUT
  12    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
  14    License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with GAS; see the file COPYING.  If not, write to the Free
  18    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
  19    02110-1301, USA.  */
  20
  21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
  22 /* App, the assembler pre-processor.  This pre-processor strips out
  23    excess spaces, turns single-quoted characters into a decimal
  24    constant, and turns the # in # <number> <filename> <garbage> into a
  25    .linefile.  This needs better error-handling.  */
  26
  27 #include "as.h"
  28
  29 #if (__STDC__ != 1)
  30 #ifndef const
  31 #define const  /* empty */
  32 #endif
  33 #endif
  34
  35 #ifdef H_TICK_HEX
  36 int enable_h_tick_hex = 0;
  37 #endif
  38
  39 #ifdef TC_M68K
  40 /* Whether we are scrubbing in m68k MRI mode.  This is different from
  41    flag_m68k_mri, because the two flags will be affected by the .mri
  42    pseudo-op at different times.  */
  43 static int scrub_m68k_mri;
  44
  45 /* The pseudo-op which switches in and out of MRI mode.  See the
  46    comment in do_scrub_chars.  */
  47 static const char mri_pseudo[] = ".mri 0";
  48 #else
  49 #define scrub_m68k_mri 0
  50 #endif
  51
  52 #if defined TC_ARM && defined OBJ_ELF
  53 /* The pseudo-op for which we need to special-case `@' characters.
  54    See the comment in do_scrub_chars.  */
  55 static const char   symver_pseudo[] = ".symver";
  56 static const char * symver_state;
  57 #endif
  58
  59 static char lex[256];
  60 static const char symbol_chars[] =
  61 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  62
  63 #define LEX_IS_SYMBOL_COMPONENT         1
  64 #define LEX_IS_WHITESPACE               2
  65 #define LEX_IS_LINE_SEPARATOR           3
  66 #define LEX_IS_COMMENT_START            4
  67 #define LEX_IS_LINE_COMMENT_START       5
  68 #define LEX_IS_TWOCHAR_COMMENT_1ST      6
  69 #define LEX_IS_STRINGQUOTE              8
  70 #define LEX_IS_COLON                    9
  71 #define LEX_IS_NEWLINE                  10
  72 #define LEX_IS_ONECHAR_QUOTE            11
  73 #ifdef TC_V850
  74 #define LEX_IS_DOUBLEDASH_1ST           12
  75 #endif
  76 #ifdef TC_M32R
  77 #define DOUBLEBAR_PARALLEL
  78 #endif
  79 #ifdef DOUBLEBAR_PARALLEL
  80 #define LEX_IS_DOUBLEBAR_1ST            13
  81 #endif
  82 #define LEX_IS_PARALLEL_SEPARATOR       14
  83 #ifdef H_TICK_HEX
  84 #define LEX_IS_H                        15
  85 #endif
  86 #define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
  87 #define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
  88 #define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
  89 #define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
  90 #define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
  91 #define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
  92 #define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)
  93
  94 static int process_escape (int);
  95
  96 /* FIXME-soon: The entire lexer/parser thingy should be
  97    built statically at compile time rather than dynamically
  98    each and every time the assembler is run.  xoxorich.  */
  99
 100 void
 101 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
 102 {
 103   const char *p;
 104   int c;
 105
 106   lex[' '] = LEX_IS_WHITESPACE;
 107   lex['\t'] = LEX_IS_WHITESPACE;
 108   lex['\r'] = LEX_IS_WHITESPACE;
 109   lex['\n'] = LEX_IS_NEWLINE;
 110   lex[':'] = LEX_IS_COLON;
 111
 112 #ifdef TC_M68K
 113   scrub_m68k_mri = m68k_mri;
 114
 115   if (! m68k_mri)
 116 #endif
 117     {
 118       lex['"'] = LEX_IS_STRINGQUOTE;
 119
 120 #if ! defined (TC_HPPA) && ! defined (TC_I370)
 121       /* I370 uses single-quotes to delimit integer, float constants.  */
 122       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 123 #endif
 124
 125 #ifdef SINGLE_QUOTE_STRINGS
 126       lex['\''] = LEX_IS_STRINGQUOTE;
 127 #endif
 128     }
 129
 130   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 131      in state 5 of do_scrub_chars must be changed.  */
 132
 133   /* Note that these override the previous defaults, e.g. if ';' is a
 134      comment char, then it isn't a line separator.  */
 135   for (p = symbol_chars; *p; ++p)
 136     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 137
 138   for (c = 128; c < 256; ++c)
 139     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 140
 141 #ifdef tc_symbol_chars
 142   /* This macro permits the processor to specify all characters which
 143      may appears in an operand.  This will prevent the scrubber from
 144      discarding meaningful whitespace in certain cases.  The i386
 145      backend uses this to support prefixes, which can confuse the
 146      scrubber as to whether it is parsing operands or opcodes.  */
 147   for (p = tc_symbol_chars; *p; ++p)
 148     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 149 #endif
 150
 151   /* The m68k backend wants to be able to change comment_chars.  */
 152 #ifndef tc_comment_chars
 153 #define tc_comment_chars comment_chars
 154 #endif
 155   for (p = tc_comment_chars; *p; p++)
 156     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 157
 158   for (p = line_comment_chars; *p; p++)
 159     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 160
 161 #ifndef tc_line_separator_chars
 162 #define tc_line_separator_chars line_separator_chars
 163 #endif
 164   for (p = tc_line_separator_chars; *p; p++)
 165     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 166
 167 #ifdef tc_parallel_separator_chars
 168   /* This macro permits the processor to specify all characters which
 169      separate parallel insns on the same line.  */
 170   for (p = tc_parallel_separator_chars; *p; p++)
 171     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 172 #endif
 173
 174   /* Only allow slash-star comments if slash is not in use.
 175      FIXME: This isn't right.  We should always permit them.  */
 176   if (lex['/'] == 0)
 177     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 178
 179 #ifdef TC_M68K
 180   if (m68k_mri)
 181     {
 182       lex['\''] = LEX_IS_STRINGQUOTE;
 183       lex[';'] = LEX_IS_COMMENT_START;
 184       lex['*'] = LEX_IS_LINE_COMMENT_START;
 185       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 186          then it can't be used in an expression.  */
 187       lex['!'] = LEX_IS_LINE_COMMENT_START;
 188     }
 189 #endif
 190
 191 #ifdef TC_V850
 192   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 193 #endif
 194 #ifdef DOUBLEBAR_PARALLEL
 195   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 196 #endif
 197 #ifdef TC_D30V
 198   /* Must do this is we want VLIW instruction with "->" or "<-".  */
 199   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 200 #endif
 201
 202 #ifdef H_TICK_HEX
 203   if (enable_h_tick_hex)
 204     {
 205       lex['h'] = LEX_IS_H;
 206       lex['H'] = LEX_IS_H;
 207     }
 208 #endif
 209 }
 210
 211 /* Saved state of the scrubber.  */
 212 static int state;
 213 static int old_state;
 214 static const char *out_string;
 215 static char out_buf[20];
 216 static int add_newlines;
 217 static char *saved_input;
 218 static size_t saved_input_len;
 219 static char input_buffer[32 * 1024];
 220 static const char *mri_state;
 221 static char mri_last_ch;
 222
 223 /* Data structure for saving the state of app across #include's.  Note that
 224    app is called asynchronously to the parsing of the .include's, so our
 225    state at the time .include is interpreted is completely unrelated.
 226    That's why we have to save it all.  */
 227
 228 struct app_save
 229 {
 230   int          state;
 231   int          old_state;
 232   const char * out_string;
 233   char         out_buf[sizeof (out_buf)];
 234   int          add_newlines;
 235   char *       saved_input;
 236   size_t       saved_input_len;
 237 #ifdef TC_M68K
 238   int          scrub_m68k_mri;
 239 #endif
 240   const char * mri_state;
 241   char         mri_last_ch;
 242 #if defined TC_ARM && defined OBJ_ELF
 243   const char * symver_state;
 244 #endif
 245 };
 246
 247 char *
 248 app_push (void)
 249 {
 250   struct app_save *saved;
 251
 252   saved = XNEW (struct app_save);
 253   saved->state = state;
 254   saved->old_state = old_state;
 255   saved->out_string = out_string;
 256   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 257   saved->add_newlines = add_newlines;
 258   if (saved_input == NULL)
 259     saved->saved_input = NULL;
 260   else
 261     {
 262       saved->saved_input = XNEWVEC (char, saved_input_len);
 263       memcpy (saved->saved_input, saved_input, saved_input_len);
 264       saved->saved_input_len = saved_input_len;
 265     }
 266 #ifdef TC_M68K
 267   saved->scrub_m68k_mri = scrub_m68k_mri;
 268 #endif
 269   saved->mri_state = mri_state;
 270   saved->mri_last_ch = mri_last_ch;
 271 #if defined TC_ARM && defined OBJ_ELF
 272   saved->symver_state = symver_state;
 273 #endif
 274
 275   /* do_scrub_begin() is not useful, just wastes time.  */
 276
 277   state = 0;
 278   saved_input = NULL;
 279   add_newlines = 0;
 280
 281   return (char *) saved;
 282 }
 283
 284 void
 285 app_pop (char *arg)
 286 {
 287   struct app_save *saved = (struct app_save *) arg;
 288
 289   /* There is no do_scrub_end ().  */
 290   state = saved->state;
 291   old_state = saved->old_state;
 292   out_string = saved->out_string;
 293   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 294   add_newlines = saved->add_newlines;
 295   if (saved->saved_input == NULL)
 296     saved_input = NULL;
 297   else
 298     {
 299       gas_assert (saved->saved_input_len <= sizeof (input_buffer));
 300       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 301       saved_input = input_buffer;
 302       saved_input_len = saved->saved_input_len;
 303       free (saved->saved_input);
 304     }
 305 #ifdef TC_M68K
 306   scrub_m68k_mri = saved->scrub_m68k_mri;
 307 #endif
 308   mri_state = saved->mri_state;
 309   mri_last_ch = saved->mri_last_ch;
 310 #if defined TC_ARM && defined OBJ_ELF
 311   symver_state = saved->symver_state;
 312 #endif
 313
 314   free (arg);
 315 }
 316
 317 /* @@ This assumes that \n &c are the same on host and target.  This is not
 318    necessarily true.  */
 319
 320 static int
 321 process_escape (int ch)
 322 {
 323   switch (ch)
 324     {
 325     case 'b':
 326       return '\b';
 327     case 'f':
 328       return '\f';
 329     case 'n':
 330       return '\n';
 331     case 'r':
 332       return '\r';
 333     case 't':
 334       return '\t';
 335     case '\'':
 336       return '\'';
 337     case '"':
 338       return '\"';
 339     default:
 340       return ch;
 341     }
 342 }
 343
 344 /* This function is called to process input characters.  The GET
 345    parameter is used to retrieve more input characters.  GET should
 346    set its parameter to point to a buffer, and return the length of
 347    the buffer; it should return 0 at end of file.  The scrubbed output
 348    characters are put into the buffer starting at TOSTART; the TOSTART
 349    buffer is TOLEN bytes in length.  The function returns the number
 350    of scrubbed characters put into TOSTART.  This will be TOLEN unless
 351    end of file was seen.  This function is arranged as a state
 352    machine, and saves its state so that it may return at any point.
 353    This is the way the old code used to work.  */
 354
 355 size_t
 356 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
 357 {
 358   char *to = tostart;
 359   char *toend = tostart + tolen;
 360   char *from;
 361   char *fromend;
 362   size_t fromlen;
 363   int ch, ch2 = 0;
 364   /* Character that started the string we're working on.  */
 365   static char quotechar;
 366
 367   /*State 0: beginning of normal line
 368           1: After first whitespace on line (flush more white)
 369           2: After first non-white (opcode) on line (keep 1white)
 370           3: after second white on line (into operands) (flush white)
 371           4: after putting out a .linefile, put out digits
 372           5: parsing a string, then go to old-state
 373           6: putting out \ escape in a "d string.
 374           7: no longer used
 375           8: no longer used
 376           9: After seeing symbol char in state 3 (keep 1white after symchar)
 377          10: After seeing whitespace in state 9 (keep white before symchar)
 378          11: After seeing a symbol character in state 0 (eg a label definition)
 379          -1: output string in out_string and go to the state in old_state
 380          -2: flush text until a '*' '/' is seen, then go to state old_state
 381 #ifdef TC_V850
 382          12: After seeing a dash, looking for a second dash as a start
 383              of comment.
 384 #endif
 385 #ifdef DOUBLEBAR_PARALLEL
 386          13: After seeing a vertical bar, looking for a second
 387              vertical bar as a parallel expression separator.
 388 #endif
 389 #ifdef TC_PREDICATE_START_CHAR
 390          14: After seeing a predicate start character at state 0, looking
 391              for a predicate end character as predicate.
 392          15: After seeing a predicate start character at state 1, looking
 393              for a predicate end character as predicate.
 394 #endif
 395 #ifdef TC_Z80
 396          16: After seeing an 'a' or an 'A' at the start of a symbol
 397          17: After seeing an 'f' or an 'F' in state 16
 398 #endif
 399           */
 400
 401   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 402      constructs like ``.loc 1 20''.  This was turning into ``.loc
 403      120''.  States 9 and 10 ensure that a space is never dropped in
 404      between characters which could appear in an identifier.  Ian
 405      Taylor, ian@cygnus.com.
 406
 407      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 408      correctly on the PA (and any other target where colons are optional).
 409      Jeff Law, law@cs.utah.edu.
 410
 411      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 412      get squashed into "cmp r1,r2||trap#1", with the all important space
 413      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 414
 415   /* This macro gets the next input character.  */
 416
 417 #define GET()                                                   \
 418   (from < fromend                                               \
 419    ? * (unsigned char *) (from++)                               \
 420    : (saved_input = NULL,                                       \
 421       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 422       from = input_buffer,                                      \
 423       fromend = from + fromlen,                                 \
 424       (fromlen == 0                                             \
 425        ? EOF                                                    \
 426        : * (unsigned char *) (from++))))
 427
 428   /* This macro pushes a character back on the input stream.  */
 429
 430 #define UNGET(uch) (*--from = (uch))
 431
 432   /* This macro puts a character into the output buffer.  If this
 433      character fills the output buffer, this macro jumps to the label
 434      TOFULL.  We use this rather ugly approach because we need to
 435      handle two different termination conditions: EOF on the input
 436      stream, and a full output buffer.  It would be simpler if we
 437      always read in the entire input stream before processing it, but
 438      I don't want to make such a significant change to the assembler's
 439      memory usage.  */
 440
 441 #define PUT(pch)                                \
 442   do                                            \
 443     {                                           \
 444       *to++ = (pch);                            \
 445       if (to >= toend)                          \
 446         goto tofull;                            \
 447     }                                           \
 448   while (0)
 449
 450   if (saved_input != NULL)
 451     {
 452       from = saved_input;
 453       fromend = from + saved_input_len;
 454     }
 455   else
 456     {
 457       fromlen = (*get) (input_buffer, sizeof input_buffer);
 458       if (fromlen == 0)
 459         return 0;
 460       from = input_buffer;
 461       fromend = from + fromlen;
 462     }
 463
 464   while (1)
 465     {
 466       /* The cases in this switch end with continue, in order to
 467          branch back to the top of this while loop and generate the
 468          next output character in the appropriate state.  */
 469       switch (state)
 470         {
 471         case -1:
 472           ch = *out_string++;
 473           if (*out_string == '\0')
 474             {
 475               state = old_state;
 476               old_state = 3;
 477             }
 478           PUT (ch);
 479           continue;
 480
 481         case -2:
 482           for (;;)
 483             {
 484               do
 485                 {
 486                   ch = GET ();
 487
 488                   if (ch == EOF)
 489                     {
 490                       as_warn (_("end of file in comment"));
 491                       goto fromeof;
 492                     }
 493
 494                   if (ch == '\n')
 495                     PUT ('\n');
 496                 }
 497               while (ch != '*');
 498
 499               while ((ch = GET ()) == '*')
 500                 ;
 501
 502               if (ch == EOF)
 503                 {
 504                   as_warn (_("end of file in comment"));
 505                   goto fromeof;
 506                 }
 507
 508               if (ch == '/')
 509                 break;
 510
 511               UNGET (ch);
 512             }
 513
 514           state = old_state;
 515           UNGET (' ');
 516           continue;
 517
 518         case 4:
 519           ch = GET ();
 520           if (ch == EOF)
 521             goto fromeof;
 522           else if (ch >= '0' && ch <= '9')
 523             PUT (ch);
 524           else
 525             {
 526               while (ch != EOF && IS_WHITESPACE (ch))
 527                 ch = GET ();
 528               if (ch == '"')
 529                 {
 530                   quotechar = ch;
 531                   state = 5;
 532                   old_state = 3;
 533                   PUT (ch);
 534                 }
 535               else
 536                 {
 537                   while (ch != EOF && ch != '\n')
 538                     ch = GET ();
 539                   state = 0;
 540                   PUT (ch);
 541                 }
 542             }
 543           continue;
 544
 545         case 5:
 546           /* We are going to copy everything up to a quote character,
 547              with special handling for a backslash.  We try to
 548              optimize the copying in the simple case without using the
 549              GET and PUT macros.  */
 550           {
 551             char *s;
 552             ptrdiff_t len;
 553
 554             for (s = from; s < fromend; s++)
 555               {
 556                 ch = *s;
 557                 if (ch == '\\'
 558                     || ch == quotechar
 559                     || ch == '\n')
 560                   break;
 561               }
 562             len = s - from;
 563             if (len > toend - to)
 564               len = toend - to;
 565             if (len > 0)
 566               {
 567                 memcpy (to, from, len);
 568                 to += len;
 569                 from += len;
 570                 if (to >= toend)
 571                   goto tofull;
 572               }
 573           }
 574
 575           ch = GET ();
 576           if (ch == EOF)
 577             {
 578               /* This buffer is here specifically so
 579                  that the UNGET below will work.  */
 580               static char one_char_buf[1];
 581
 582               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 583               state = old_state;
 584               from = fromend = one_char_buf + 1;
 585               fromlen = 1;
 586               UNGET ('\n');
 587               PUT (quotechar);
 588             }
 589           else if (ch == quotechar)
 590             {
 591               state = old_state;
 592               PUT (ch);
 593             }
 594 #ifndef NO_STRING_ESCAPES
 595           else if (ch == '\\')
 596             {
 597               state = 6;
 598               PUT (ch);
 599             }
 600 #endif
 601           else if (scrub_m68k_mri && ch == '\n')
 602             {
 603               /* Just quietly terminate the string.  This permits lines like
 604                    bne  label   loop if we haven't reach end yet.  */
 605               state = old_state;
 606               UNGET (ch);
 607               PUT ('\'');
 608             }
 609           else
 610             {
 611               PUT (ch);
 612             }
 613           continue;
 614
 615         case 6:
 616           state = 5;
 617           ch = GET ();
 618           switch (ch)
 619             {
 620               /* Handle strings broken across lines, by turning '\n' into
 621                  '\\' and 'n'.  */
 622             case '\n':
 623               UNGET ('n');
 624               add_newlines++;
 625               PUT ('\\');
 626               continue;
 627
 628             case EOF:
 629               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 630               PUT (quotechar);
 631               continue;
 632
 633             case '"':
 634             case '\\':
 635             case 'b':
 636             case 'f':
 637             case 'n':
 638             case 'r':
 639             case 't':
 640             case 'v':
 641             case 'x':
 642             case 'X':
 643             case '0':
 644             case '1':
 645             case '2':
 646             case '3':
 647             case '4':
 648             case '5':
 649             case '6':
 650             case '7':
 651               break;
 652
 653             default:
 654 #ifdef ONLY_STANDARD_ESCAPES
 655               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 656 #endif
 657               break;
 658             }
 659           PUT (ch);
 660           continue;
 661
 662 #ifdef DOUBLEBAR_PARALLEL
 663         case 13:
 664           ch = GET ();
 665           if (ch != '|')
 666             abort ();
 667
 668           /* Reset back to state 1 and pretend that we are parsing a
 669              line from just after the first white space.  */
 670           state = 1;
 671           PUT ('|');
 672 #ifdef TC_TIC6X
 673           /* "||^" is used for SPMASKed instructions.  */
 674           ch = GET ();
 675           if (ch == EOF)
 676             goto fromeof;
 677           else if (ch == '^')
 678             PUT ('^');
 679           else
 680             UNGET (ch);
 681 #endif
 682           continue;
 683 #endif
 684 #ifdef TC_Z80
 685         case 16:
 686           /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
 687           ch = GET ();
 688           if (ch == 'f' || ch == 'F')
 689             {
 690               state = 17;
 691               PUT (ch);
 692             }
 693           else
 694             {
 695               state = 9;
 696               break;
 697             }
 698           /* Fall through.  */
 699         case 17:
 700           /* We have seen "af" at the start of a symbol,
 701              a ' here is a part of that symbol.  */
 702           ch = GET ();
 703           state = 9;
 704           if (ch == '\'')
 705             /* Change to avoid warning about unclosed string.  */
 706             PUT ('`');
 707           else if (ch != EOF)
 708             UNGET (ch);
 709           break;
 710 #endif
 711         }
 712
 713       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
 714
 715       /* flushchar: */
 716       ch = GET ();
 717
 718 #ifdef TC_PREDICATE_START_CHAR
 719       if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
 720         {
 721           state += 14;
 722           PUT (ch);
 723           continue;
 724         }
 725       else if (state == 14 || state == 15)
 726         {
 727           if (ch == TC_PREDICATE_END_CHAR)
 728             {
 729               state -= 14;
 730               PUT (ch);
 731               ch = GET ();
 732             }
 733           else
 734             {
 735               PUT (ch);
 736               continue;
 737             }
 738         }
 739 #endif
 740
 741     recycle:
 742
 743 #if defined TC_ARM && defined OBJ_ELF
 744       /* We need to watch out for .symver directives.  See the comment later
 745          in this function.  */
 746       if (symver_state == NULL)
 747         {
 748           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 749             symver_state = symver_pseudo + 1;
 750         }
 751       else
 752         {
 753           /* We advance to the next state if we find the right
 754              character.  */
 755           if (ch != '\0' && (*symver_state == ch))
 756             ++symver_state;
 757           else if (*symver_state != '\0')
 758             /* We did not get the expected character, or we didn't
 759                get a valid terminating character after seeing the
 760                entire pseudo-op, so we must go back to the beginning.  */
 761             symver_state = NULL;
 762           else
 763             {
 764               /* We've read the entire pseudo-op.  If this is the end
 765                  of the line, go back to the beginning.  */
 766               if (IS_NEWLINE (ch))
 767                 symver_state = NULL;
 768             }
 769         }
 770 #endif /* TC_ARM && OBJ_ELF */
 771
 772 #ifdef TC_M68K
 773       /* We want to have pseudo-ops which control whether we are in
 774          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 775          the scrubber, that means that we need a special purpose
 776          recognizer here.  */
 777       if (mri_state == NULL)
 778         {
 779           if ((state == 0 || state == 1)
 780               && ch == mri_pseudo[0])
 781             mri_state = mri_pseudo + 1;
 782         }
 783       else
 784         {
 785           /* We advance to the next state if we find the right
 786              character, or if we need a space character and we get any
 787              whitespace character, or if we need a '0' and we get a
 788              '1' (this is so that we only need one state to handle
 789              ``.mri 0'' and ``.mri 1'').  */
 790           if (ch != '\0'
 791               && (*mri_state == ch
 792                   || (*mri_state == ' '
 793                       && lex[ch] == LEX_IS_WHITESPACE)
 794                   || (*mri_state == '0'
 795                       && ch == '1')))
 796             {
 797               mri_last_ch = ch;
 798               ++mri_state;
 799             }
 800           else if (*mri_state != '\0'
 801                    || (lex[ch] != LEX_IS_WHITESPACE
 802                        && lex[ch] != LEX_IS_NEWLINE))
 803             {
 804               /* We did not get the expected character, or we didn't
 805                  get a valid terminating character after seeing the
 806                  entire pseudo-op, so we must go back to the
 807                  beginning.  */
 808               mri_state = NULL;
 809             }
 810           else
 811             {
 812               /* We've read the entire pseudo-op.  mips_last_ch is
 813                  either '0' or '1' indicating whether to enter or
 814                  leave MRI mode.  */
 815               do_scrub_begin (mri_last_ch == '1');
 816               mri_state = NULL;
 817
 818               /* We continue handling the character as usual.  The
 819                  main gas reader must also handle the .mri pseudo-op
 820                  to control expression parsing and the like.  */
 821             }
 822         }
 823 #endif
 824
 825       if (ch == EOF)
 826         {
 827           if (state != 0)
 828             {
 829               as_warn (_("end of file not at end of a line; newline inserted"));
 830               state = 0;
 831               PUT ('\n');
 832             }
 833           goto fromeof;
 834         }
 835
 836       switch (lex[ch])
 837         {
 838         case LEX_IS_WHITESPACE:
 839           do
 840             {
 841               ch = GET ();
 842             }
 843           while (ch != EOF && IS_WHITESPACE (ch));
 844           if (ch == EOF)
 845             goto fromeof;
 846
 847           if (state == 0)
 848             {
 849               /* Preserve a single whitespace character at the
 850                  beginning of a line.  */
 851               state = 1;
 852               UNGET (ch);
 853               PUT (' ');
 854               break;
 855             }
 856
 857 #ifdef KEEP_WHITE_AROUND_COLON
 858           if (lex[ch] == LEX_IS_COLON)
 859             {
 860               /* Only keep this white if there's no white *after* the
 861                  colon.  */
 862               ch2 = GET ();
 863               if (ch2 != EOF)
 864                 UNGET (ch2);
 865               if (!IS_WHITESPACE (ch2))
 866                 {
 867                   state = 9;
 868                   UNGET (ch);
 869                   PUT (' ');
 870                   break;
 871                 }
 872             }
 873 #endif
 874           if (IS_COMMENT (ch)
 875               || ch == '/'
 876               || IS_LINE_SEPARATOR (ch)
 877               || IS_PARALLEL_SEPARATOR (ch))
 878             {
 879               if (scrub_m68k_mri)
 880                 {
 881                   /* In MRI mode, we keep these spaces.  */
 882                   UNGET (ch);
 883                   PUT (' ');
 884                   break;
 885                 }
 886               goto recycle;
 887             }
 888
 889           /* If we're in state 2 or 11, we've seen a non-white
 890              character followed by whitespace.  If the next character
 891              is ':', this is whitespace after a label name which we
 892              normally must ignore.  In MRI mode, though, spaces are
 893              not permitted between the label and the colon.  */
 894           if ((state == 2 || state == 11)
 895               && lex[ch] == LEX_IS_COLON
 896               && ! scrub_m68k_mri)
 897             {
 898               state = 1;
 899               PUT (ch);
 900               break;
 901             }
 902
 903           switch (state)
 904             {
 905             case 1:
 906               /* We can arrive here if we leave a leading whitespace
 907                  character at the beginning of a line.  */
 908               goto recycle;
 909             case 2:
 910               state = 3;
 911               if (to + 1 < toend)
 912                 {
 913                   /* Optimize common case by skipping UNGET/GET.  */
 914                   PUT (' ');    /* Sp after opco */
 915                   goto recycle;
 916                 }
 917               UNGET (ch);
 918               PUT (' ');
 919               break;
 920             case 3:
 921 #ifndef TC_KEEP_OPERAND_SPACES
 922               /* For TI C6X, we keep these spaces as they may separate
 923                  functional unit specifiers from operands.  */
 924               if (scrub_m68k_mri)
 925 #endif
 926                 {
 927                   /* In MRI mode, we keep these spaces.  */
 928                   UNGET (ch);
 929                   PUT (' ');
 930                   break;
 931                 }
 932               goto recycle;     /* Sp in operands */
 933             case 9:
 934             case 10:
 935 #ifndef TC_KEEP_OPERAND_SPACES
 936               if (scrub_m68k_mri)
 937 #endif
 938                 {
 939                   /* In MRI mode, we keep these spaces.  */
 940                   state = 3;
 941                   UNGET (ch);
 942                   PUT (' ');
 943                   break;
 944                 }
 945               state = 10;       /* Sp after symbol char */
 946               goto recycle;
 947             case 11:
 948               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
 949                 state = 1;
 950               else
 951                 {
 952                   /* We know that ch is not ':', since we tested that
 953                      case above.  Therefore this is not a label, so it
 954                      must be the opcode, and we've just seen the
 955                      whitespace after it.  */
 956                   state = 3;
 957                 }
 958               UNGET (ch);
 959               PUT (' ');        /* Sp after label definition.  */
 960               break;
 961             default:
 962               BAD_CASE (state);
 963             }
 964           break;
 965
 966         case LEX_IS_TWOCHAR_COMMENT_1ST:
 967           ch2 = GET ();
 968           if (ch2 == '*')
 969             {
 970               for (;;)
 971                 {
 972                   do
 973                     {
 974                       ch2 = GET ();
 975                       if (ch2 != EOF && IS_NEWLINE (ch2))
 976                         add_newlines++;
 977                     }
 978                   while (ch2 != EOF && ch2 != '*');
 979
 980                   while (ch2 == '*')
 981                     ch2 = GET ();
 982
 983                   if (ch2 == EOF || ch2 == '/')
 984                     break;
 985
 986                   /* This UNGET will ensure that we count newlines
 987                      correctly.  */
 988                   UNGET (ch2);
 989                 }
 990
 991               if (ch2 == EOF)
 992                 as_warn (_("end of file in multiline comment"));
 993
 994               ch = ' ';
 995               goto recycle;
 996             }
 997 #ifdef DOUBLESLASH_LINE_COMMENTS
 998           else if (ch2 == '/')
 999             {
1000               do
1001                 {
1002                   ch = GET ();
1003                 }
1004               while (ch != EOF && !IS_NEWLINE (ch));
1005               if (ch == EOF)
1006                 as_warn ("end of file in comment; newline inserted");
1007               state = 0;
1008               PUT ('\n');
1009               break;
1010             }
1011 #endif
1012           else
1013             {
1014               if (ch2 != EOF)
1015                 UNGET (ch2);
1016               if (state == 9 || state == 10)
1017                 state = 3;
1018               PUT (ch);
1019             }
1020           break;
1021
1022         case LEX_IS_STRINGQUOTE:
1023           quotechar = ch;
1024           if (state == 10)
1025             {
1026               /* Preserve the whitespace in foo "bar".  */
1027               UNGET (ch);
1028               state = 3;
1029               PUT (' ');
1030
1031               /* PUT didn't jump out.  We could just break, but we
1032                  know what will happen, so optimize a bit.  */
1033               ch = GET ();
1034               old_state = 3;
1035             }
1036           else if (state == 9)
1037             old_state = 3;
1038           else
1039             old_state = state;
1040           state = 5;
1041           PUT (ch);
1042           break;
1043
1044 #ifndef IEEE_STYLE
1045         case LEX_IS_ONECHAR_QUOTE:
1046 #ifdef H_TICK_HEX
1047           if (state == 9 && enable_h_tick_hex)
1048             {
1049               char c;
1050
1051               c = GET ();
1052               as_warn ("'%c found after symbol", c);
1053               UNGET (c);
1054             }
1055 #endif
1056           if (state == 10)
1057             {
1058               /* Preserve the whitespace in foo 'b'.  */
1059               UNGET (ch);
1060               state = 3;
1061               PUT (' ');
1062               break;
1063             }
1064           ch = GET ();
1065           if (ch == EOF)
1066             {
1067               as_warn (_("end of file after a one-character quote; \\0 inserted"));
1068               ch = 0;
1069             }
1070           if (ch == '\\')
1071             {
1072               ch = GET ();
1073               if (ch == EOF)
1074                 {
1075                   as_warn (_("end of file in escape character"));
1076                   ch = '\\';
1077                 }
1078               else
1079                 ch = process_escape (ch);
1080             }
1081           sprintf (out_buf, "%d", (int) (unsigned char) ch);
1082
1083           /* None of these 'x constants for us.  We want 'x'.  */
1084           if ((ch = GET ()) != '\'')
1085             {
1086 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1087               as_warn (_("missing close quote; (assumed)"));
1088 #else
1089               if (ch != EOF)
1090                 UNGET (ch);
1091 #endif
1092             }
1093           if (strlen (out_buf) == 1)
1094             {
1095               PUT (out_buf[0]);
1096               break;
1097             }
1098           if (state == 9)
1099             old_state = 3;
1100           else
1101             old_state = state;
1102           state = -1;
1103           out_string = out_buf;
1104           PUT (*out_string++);
1105           break;
1106 #endif
1107
1108         case LEX_IS_COLON:
1109 #ifdef KEEP_WHITE_AROUND_COLON
1110           state = 9;
1111 #else
1112           if (state == 9 || state == 10)
1113             state = 3;
1114           else if (state != 3)
1115             state = 1;
1116 #endif
1117           PUT (ch);
1118           break;
1119
1120         case LEX_IS_NEWLINE:
1121           /* Roll out a bunch of newlines from inside comments, etc.  */
1122           if (add_newlines)
1123             {
1124               --add_newlines;
1125               UNGET (ch);
1126             }
1127           /* Fall through.  */
1128
1129         case LEX_IS_LINE_SEPARATOR:
1130           state = 0;
1131           PUT (ch);
1132           break;
1133
1134         case LEX_IS_PARALLEL_SEPARATOR:
1135           state = 1;
1136           PUT (ch);
1137           break;
1138
1139 #ifdef TC_V850
1140         case LEX_IS_DOUBLEDASH_1ST:
1141           ch2 = GET ();
1142           if (ch2 != '-')
1143             {
1144               if (ch2 != EOF)
1145                 UNGET (ch2);
1146               goto de_fault;
1147             }
1148           /* Read and skip to end of line.  */
1149           do
1150             {
1151               ch = GET ();
1152             }
1153           while (ch != EOF && ch != '\n');
1154
1155           if (ch == EOF)
1156             as_warn (_("end of file in comment; newline inserted"));
1157
1158           state = 0;
1159           PUT ('\n');
1160           break;
1161 #endif
1162 #ifdef DOUBLEBAR_PARALLEL
1163         case LEX_IS_DOUBLEBAR_1ST:
1164           ch2 = GET ();
1165           if (ch2 != EOF)
1166             UNGET (ch2);
1167           if (ch2 != '|')
1168             goto de_fault;
1169
1170           /* Handle '||' in two states as invoking PUT twice might
1171              result in the first one jumping out of this loop.  We'd
1172              then lose track of the state and one '|' char.  */
1173           state = 13;
1174           PUT ('|');
1175           break;
1176 #endif
1177         case LEX_IS_LINE_COMMENT_START:
1178           /* FIXME-someday: The two character comment stuff was badly
1179              thought out.  On i386, we want '/' as line comment start
1180              AND we want C style comments.  hence this hack.  The
1181              whole lexical process should be reworked.  xoxorich.  */
1182           if (ch == '/')
1183             {
1184               ch2 = GET ();
1185               if (ch2 == '*')
1186                 {
1187                   old_state = 3;
1188                   state = -2;
1189                   break;
1190                 }
1191               else if (ch2 != EOF)
1192                 {
1193                   UNGET (ch2);
1194                 }
1195             }
1196
1197           if (state == 0 || state == 1) /* Only comment at start of line.  */
1198             {
1199               int startch;
1200
1201               startch = ch;
1202
1203               do
1204                 {
1205                   ch = GET ();
1206                 }
1207               while (ch != EOF && IS_WHITESPACE (ch));
1208
1209               if (ch == EOF)
1210                 {
1211                   as_warn (_("end of file in comment; newline inserted"));
1212                   PUT ('\n');
1213                   break;
1214                 }
1215
1216               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1217                 {
1218                   /* Not a cpp line.  */
1219                   while (ch != EOF && !IS_NEWLINE (ch))
1220                     ch = GET ();
1221                   if (ch == EOF)
1222                     {
1223                       as_warn (_("end of file in comment; newline inserted"));
1224                       PUT ('\n');
1225                     }
1226                   else /* IS_NEWLINE (ch) */
1227                     {
1228                       /* To process non-zero add_newlines.  */
1229                       UNGET (ch);
1230                     }
1231                   state = 0;
1232                   break;
1233                 }
1234               /* Looks like `# 123 "filename"' from cpp.  */
1235               UNGET (ch);
1236               old_state = 4;
1237               state = -1;
1238               if (scrub_m68k_mri)
1239                 out_string = "\tlinefile ";
1240               else
1241                 out_string = "\t.linefile ";
1242               PUT (*out_string++);
1243               break;
1244             }
1245
1246 #ifdef TC_D10V
1247           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1248              Trap is the only short insn that has a first operand that is
1249              neither register nor label.
1250              We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 .
1251              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1252              already LEX_IS_LINE_COMMENT_START.  However, it is the
1253              only character in line_comment_chars for d10v, hence we
1254              can recognize it as such.  */
1255           /* An alternative approach would be to reset the state to 1 when
1256              we see '||', '<-' or '->', but that seems to be overkill.  */
1257           if (state == 10)
1258             PUT (' ');
1259 #endif
1260           /* We have a line comment character which is not at the
1261              start of a line.  If this is also a normal comment
1262              character, fall through.  Otherwise treat it as a default
1263              character.  */
1264           if (strchr (tc_comment_chars, ch) == NULL
1265               && (! scrub_m68k_mri
1266                   || (ch != '!' && ch != '*')))
1267             goto de_fault;
1268           if (scrub_m68k_mri
1269               && (ch == '!' || ch == '*' || ch == '#')
1270               && state != 1
1271               && state != 10)
1272             goto de_fault;
1273           /* Fall through.  */
1274         case LEX_IS_COMMENT_START:
1275 #if defined TC_ARM && defined OBJ_ELF
1276           /* On the ARM, `@' is the comment character.
1277              Unfortunately this is also a special character in ELF .symver
1278              directives (and .type, though we deal with those another way).
1279              So we check if this line is such a directive, and treat
1280              the character as default if so.  This is a hack.  */
1281           if ((symver_state != NULL) && (*symver_state == 0))
1282             goto de_fault;
1283 #endif
1284
1285 #ifdef TC_ARM
1286           /* For the ARM, care is needed not to damage occurrences of \@
1287              by stripping the @ onwards.  Yuck.  */
1288           if (to > tostart && *(to - 1) == '\\')
1289             /* Do not treat the @ as a start-of-comment.  */
1290             goto de_fault;
1291 #endif
1292
1293 #ifdef WARN_COMMENTS
1294           if (!found_comment)
1295             found_comment_file = as_where (&found_comment);
1296 #endif
1297           do
1298             {
1299               ch = GET ();
1300             }
1301           while (ch != EOF && !IS_NEWLINE (ch));
1302           if (ch == EOF)
1303             as_warn (_("end of file in comment; newline inserted"));
1304           state = 0;
1305           PUT ('\n');
1306           break;
1307
1308 #ifdef H_TICK_HEX
1309         case LEX_IS_H:
1310           /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1311              the H' with 0x to make them gas-style hex characters.  */
1312           if (enable_h_tick_hex)
1313             {
1314               char quot;
1315
1316               quot = GET ();
1317               if (quot == '\'')
1318                 {
1319                   UNGET ('x');
1320                   ch = '0';
1321                 }
1322               else
1323                 UNGET (quot);
1324             }
1325 #endif
1326           /* Fall through.  */
1327
1328         case LEX_IS_SYMBOL_COMPONENT:
1329           if (state == 10)
1330             {
1331               /* This is a symbol character following another symbol
1332                  character, with whitespace in between.  We skipped
1333                  the whitespace earlier, so output it now.  */
1334               UNGET (ch);
1335               state = 3;
1336               PUT (' ');
1337               break;
1338             }
1339
1340 #ifdef TC_Z80
1341           /* "af'" is a symbol containing '\''.  */
1342           if (state == 3 && (ch == 'a' || ch == 'A'))
1343             {
1344               state = 16;
1345               PUT (ch);
1346               ch = GET ();
1347               if (ch == 'f' || ch == 'F')
1348                 {
1349                   state = 17;
1350                   PUT (ch);
1351                   break;
1352                 }
1353               else
1354                 {
1355                   state = 9;
1356                   if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1357                     {
1358                       if (ch != EOF)
1359                         UNGET (ch);
1360                       break;
1361                     }
1362                 }
1363             }
1364 #endif
1365           if (state == 3)
1366             state = 9;
1367
1368           /* This is a common case.  Quickly copy CH and all the
1369              following symbol component or normal characters.  */
1370           if (to + 1 < toend
1371               && mri_state == NULL
1372 #if defined TC_ARM && defined OBJ_ELF
1373               && symver_state == NULL
1374 #endif
1375               )
1376             {
1377               char *s;
1378               ptrdiff_t len;
1379
1380               for (s = from; s < fromend; s++)
1381                 {
1382                   int type;
1383
1384                   ch2 = *(unsigned char *) s;
1385                   type = lex[ch2];
1386                   if (type != 0
1387                       && type != LEX_IS_SYMBOL_COMPONENT)
1388                     break;
1389                 }
1390
1391               if (s > from)
1392                 /* Handle the last character normally, for
1393                    simplicity.  */
1394                 --s;
1395
1396               len = s - from;
1397
1398               if (len > (toend - to) - 1)
1399                 len = (toend - to) - 1;
1400
1401               if (len > 0)
1402                 {
1403                   PUT (ch);
1404                   memcpy (to, from, len);
1405                   to += len;
1406                   from += len;
1407                   if (to >= toend)
1408                     goto tofull;
1409                   ch = GET ();
1410                 }
1411             }
1412
1413           /* Fall through.  */
1414         default:
1415         de_fault:
1416           /* Some relatively `normal' character.  */
1417           if (state == 0)
1418             {
1419               state = 11;       /* Now seeing label definition.  */
1420             }
1421           else if (state == 1)
1422             {
1423               state = 2;        /* Ditto.  */
1424             }
1425           else if (state == 9)
1426             {
1427               if (!IS_SYMBOL_COMPONENT (ch))
1428                 state = 3;
1429             }
1430           else if (state == 10)
1431             {
1432               if (ch == '\\')
1433                 {
1434                   /* Special handling for backslash: a backslash may
1435                      be the beginning of a formal parameter (of a
1436                      macro) following another symbol character, with
1437                      whitespace in between.  If that is the case, we
1438                      output a space before the parameter.  Strictly
1439                      speaking, correct handling depends upon what the
1440                      macro parameter expands into; if the parameter
1441                      expands into something which does not start with
1442                      an operand character, then we don't want to keep
1443                      the space.  We don't have enough information to
1444                      make the right choice, so here we are making the
1445                      choice which is more likely to be correct.  */
1446                   if (to + 1 >= toend)
1447                     {
1448                       /* If we're near the end of the buffer, save the
1449                          character for the next time round.  Otherwise
1450                          we'll lose our state.  */
1451                       UNGET (ch);
1452                       goto tofull;
1453                     }
1454                   *to++ = ' ';
1455                 }
1456
1457               state = 3;
1458             }
1459           PUT (ch);
1460           break;
1461         }
1462     }
1463
1464   /*NOTREACHED*/
1465
1466  fromeof:
1467   /* We have reached the end of the input.  */
1468   return to - tostart;
1469
1470  tofull:
1471   /* The output buffer is full.  Save any input we have not yet
1472      processed.  */
1473   if (fromend > from)
1474     {
1475       saved_input = from;
1476       saved_input_len = fromend - from;
1477     }
1478   else
1479     saved_input = NULL;
1480
1481   return to - tostart;
1482 }