libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010
   3    Free Software Foundation, Inc.
   4    Contributed by Per Bothner, 1994-95.
   5    Based on CCCP program by Paul Rubin, June 1986
   6    Adapted to ANSI C, Richard Stallman, Jan 1987
   7    Broken out to separate file, Zack Weinberg, Mar 2000
   8
   9 This program is free software; you can redistribute it and/or modify it
  10 under the terms of the GNU General Public License as published by the
  11 Free Software Foundation; either version 3, or (at your option) any
  12 later version.
  13
  14 This program is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with this program; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "cpplib.h"
  26 #include "internal.h"
  27
  28 enum spell_type   /* Spelling category recorded for each token type in token_spellings.  */
  29 {
  30   SPELL_OPERATOR = 0,   /* Category given to every OP() entry of TTYPE_TABLE.  */
  31   SPELL_IDENT,          /* The remaining categories are named explicitly by a  */
  32   SPELL_LITERAL,        /* TK() entry, whose second argument is pasted onto  */
  33   SPELL_NONE            /* the SPELL_ prefix.  */
  34 };
  35
  36 struct token_spelling   /* One entry of the token_spellings table.  */
  37 {
  38   enum spell_type category;    /* How tokens of this type are spelled.  */
  39   const unsigned char *name;   /* OP(): the operator text; TK(): the stringized enumerator name.  */
  40 };
  41
  42 static const unsigned char *const digraph_spellings[] =   /* Spellings of the digraphs.  */
  43 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  44
     /* Expand TTYPE_TABLE into one token_spelling initializer per token
        type.  OP(e, s) yields { SPELL_OPERATOR, s }; TK(e, s) yields
        { SPELL_<s>, "<e>" }.  Both macros are only needed for this one
        expansion and are undefined again right after it.  */
  45 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  46 #define TK(e, s) { SPELL_ ## s,    UC #e },
  47 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  48 #undef OP
  49 #undef TK
  50
     /* Accessors for the table entry selected by TOKEN's type.  */
  51 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  52 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  53
  54 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);   /* Prototypes of file-local (static) helpers follow.  */
  55 static int skip_line_comment (cpp_reader *);
  56 static void skip_whitespace (cpp_reader *, cppchar_t);
  57 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  58 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  59 static void store_comment (cpp_reader *, cpp_token *);
  60 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  61                             unsigned int, enum cpp_ttype);
  62 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  63 static int name_p (cpp_reader *, const cpp_string *);
  64 static tokenrun *next_tokenrun (tokenrun *);
  65
  66 static _cpp_buff *new_buff (size_t);
  67
  68
  69 /* Utility routine:
  70
  71    Compares, the token TOKEN to the NUL-terminated string STRING.
  72    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  73 int
  74 cpp_ideq (const cpp_token *token, const char *string)
  75 {
  76   if (token->type != CPP_NAME)
  77     return 0;
  78
  79   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  80 }
  81
  82 /* Record a note TYPE at byte POS into the current cleaned logical
  83    line.  */
  84 static void
  85 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  86 {
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93
  94   buffer->notes[buffer->notes_used].pos = pos;
  95   buffer->notes[buffer->notes_used].type = type;
  96   buffer->notes_used++;
  97 }
  98
  99 \f
 100 /* Fast path to find line special characters using optimized character
 101    scanning algorithms.  Anything complicated falls back to the slow
 102    path below.  Since this loop is very hot it's worth doing these kinds
 103    of optimizations.
 104
 105    One of the paths through the ifdefs should provide
 106
 107      const uchar *search_line_fast (const uchar *s, const uchar *end);
 108
 109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 110    the found character.
 111
 112    Note that the last character of the buffer is *always* a newline,
 113    as forced by _cpp_convert_input.  This fact can be used to avoid
 114    explicitly looking for the end of the buffer.  */
 115
 116 /* Configure gives us an ifdef test.  Give the macro the value 0 when it
        is not defined, so that the code below can test it in ordinary C
        conditions and in #if expressions.  */
 117 #ifndef WORDS_BIGENDIAN
 118 #define WORDS_BIGENDIAN 0
 119 #endif
 120
 121 /* We'd like the largest integer that fits into a register.  There's nothing
 122    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 123    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 124    can get the "real" word size.  WORD_TYPE is the unit in which
        search_line_acc_char reads the buffer.  */
 125 #ifdef __GNUC__
 126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 127 #else
 128 typedef unsigned long word_type;
 129 #endif
 130
 131 /* The code below is only expecting sizes 4 or 8.
 132    Die at compile-time if this expectation is violated: the array
        bound below evaluates to 1 when the size is acceptable and to -1,
        an invalid array size, otherwise.  */
 133 typedef char check_word_type_size
 134   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 135
 136 /* Return X with the first N bytes forced to values that won't match one
 137    of the interesting characters.  Note that NUL is not interesting.  */
 138
 139 static inline word_type
 140 acc_char_mask_misalign (word_type val, unsigned int n)
 141 {
 142   word_type mask = -1;
 143   if (WORDS_BIGENDIAN)
 144     mask >>= n * 8;
 145   else
 146     mask <<= n * 8;
 147   return val & mask;
 148 }
 149
 150 /* Return X replicated to all byte positions within WORD_TYPE.  */
 151
 152 static inline word_type
 153 acc_char_replicate (uchar x)
 154 {
 155   word_type ret;
 156
 157   ret = (x << 24) | (x << 16) | (x << 8) | x;
 158   if (sizeof(word_type) == 8)
 159     ret = (ret << 16 << 16) | ret;
 160   return ret;
 161 }
 162
 163 /* Return non-zero if some byte of VAL is (probably) C.  */
 164
 165 static inline word_type
 166 acc_char_cmp (word_type val, word_type c)
 167 {
 168 #if defined(__GNUC__) && defined(__alpha__)
 169   /* We can get exact results using a compare-bytes instruction.
 170      Get (val == c) via (0 >= (val ^ c)).  */
 171   return __builtin_alpha_cmpbge (0, val ^ c);
 172 #else
 173   word_type magic = 0x7efefefeU;
 174   if (sizeof(word_type) == 8)
 175     magic = (magic << 16 << 16) | 0xfefefefeU;
 176   magic |= 1;
 177
 178   val ^= c;
 179   return ((val + magic) ^ ~val) & ~magic;
 180 #endif
 181 }
 182
 183 /* Given the result of acc_char_cmp is non-zero, return the index of
 184    the found character.  If this was a false positive, return -1.  */
 185
 186 static inline int
 187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 188                 word_type val ATTRIBUTE_UNUSED)
 189 {
 190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 191   /* The cmpbge instruction sets *bits* of the result corresponding to
 192      matches in the bytes with no false positives.  */
 193   return __builtin_ctzl (cmp);
 194 #else
 195   unsigned int i;
 196
 197   /* ??? It would be nice to force unrolling here,
 198      and have all of these constants folded.  */
 199   for (i = 0; i < sizeof(word_type); ++i)
 200     {
 201       uchar c;
 202       if (WORDS_BIGENDIAN)
 203         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 204       else
 205         c = (val >> i * 8) & 0xff;
 206
 207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 208         return i;
 209     }
 210
 211   return -1;
 212 #endif
 213 }
 214
 215 /* A version of the fast scanner using bit fiddling techniques.
 216
 217    For 32-bit words, one would normally perform 16 comparisons and
 218    16 branches.  With this algorithm one performs 24 arithmetic
 219    operations and one branch.  Whether this is faster with a 32-bit
 220    word size is going to be somewhat system dependent.
 221
 222    For 64-bit words, we eliminate twice the number of comparisons
 223    and branches without increasing the number of arithmetic operations.
 224    It's almost certainly going to be a win with 64-bit word size.  */
 225
 226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 227   ATTRIBUTE_UNUSED;
 228
 229 static const uchar *
 230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 231 {
 232   const word_type repl_nl = acc_char_replicate ('\n');
 233   const word_type repl_cr = acc_char_replicate ('\r');
 234   const word_type repl_bs = acc_char_replicate ('\\');
 235   const word_type repl_qm = acc_char_replicate ('?');
 236
 237   unsigned int misalign;
 238   const word_type *p;
 239   word_type val, t;
 240
 241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 243   val = *p;
 244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 245   if (misalign)
 246     val = acc_char_mask_misalign (val, misalign);
 247
 248   /* Main loop.  */
 249   while (1)
 250     {
 251       t  = acc_char_cmp (val, repl_nl);
 252       t |= acc_char_cmp (val, repl_cr);
 253       t |= acc_char_cmp (val, repl_bs);
 254       t |= acc_char_cmp (val, repl_qm);
 255
 256       if (__builtin_expect (t != 0, 0))
 257         {
 258           int i = acc_char_index (t, val);
 259           if (i >= 0)
 260             return (const uchar *)p + i;
 261         }
 262
 263       val = *++p;
 264     }
 265 }
 266
 267 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
 268
 269 /* Replicated character data to be shared between implementations.
 270    Recall that outside of a context with vector support we can't
 271    define compatible vector types, therefore these are all defined
 272    in terms of raw characters.  Row 0 holds '\n', row 1 '\r', row 2
        '\\' and row 3 '?'.  The rows are 16-byte aligned and are read
        through vector-typed pointers by the MMX and SSE2 scanners.  */
 273 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 274   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 275     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 276   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 277     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 278   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 279     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 280   { '?', '?', '?', '?', '?', '?', '?', '?',
 281     '?', '?', '?', '?', '?', '?', '?', '?' },
 282 };
 283
 284 /* A version of the fast scanner using MMX vectorized byte compare insns.
 285
 286    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 287    which was packaged into SSE1; it is also present in the AMD 3dNOW-A
 288    extension.  Mark the function as using "sse" so that we emit a real
 289    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 290
 291 static const uchar *
 292 #ifndef __SSE__
 293 __attribute__((__target__("sse")))
 294 #endif
 295 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 296 {
 297   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 298   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 299
 300   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 301   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 302   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 303   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 304
 305   unsigned int misalign, found, mask;
 306   const v8qi *p;
 307   v8qi data, t, c;
 308
 309   /* Align the source pointer.  While MMX doesn't generate unaligned data
 310      faults, this allows us to safely scan to the end of the buffer without
 311      reading beyond the end of the last page.  */
 312   misalign = (uintptr_t)s & 7;
 313   p = (const v8qi *)((uintptr_t)s & -8);
 314   data = *p;
 315
 316   /* Create a mask for the bytes that are valid within the first
 317      16-byte block.  The Idea here is that the AND with the mask
 318      within the loop is "free", since we need some AND or TEST
 319      insn in order to set the flags for the branch anyway.  */
 320   mask = -1u << misalign;
 321
 322   /* Main loop processing 8 bytes at a time.  */
 323   goto start;
 324   do
 325     {
 326       data = *++p;
 327       mask = -1;
 328
 329     start:
 330       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 331       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 332       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 333       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 334       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 335       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 336       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 337       found = __builtin_ia32_pmovmskb (t);
 338       found &= mask;
 339     }
 340   while (!found);
 341
 342   __builtin_ia32_emms ();
 343
 344   /* FOUND contains 1 in bits for which we matched a relevant
 345      character.  Conversion to the byte index is trivial.  */
 346   found = __builtin_ctz(found);
 347   return (const uchar *)p + found;
 348 }
 349
 350 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 351
 352 static const uchar *
 353 #ifndef __SSE2__
 354 __attribute__((__target__("sse2")))
 355 #endif
 356 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 357 {
 358   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 359
 360   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 361   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 362   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 363   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 364
 365   unsigned int misalign, found, mask;
 366   const v16qi *p;
 367   v16qi data, t;
 368
 369   /* Align the source pointer.  */
 370   misalign = (uintptr_t)s & 15;
 371   p = (const v16qi *)((uintptr_t)s & -16);
 372   data = *p;
 373
 374   /* Create a mask for the bytes that are valid within the first
 375      16-byte block.  The Idea here is that the AND with the mask
 376      within the loop is "free", since we need some AND or TEST
 377      insn in order to set the flags for the branch anyway.  */
 378   mask = -1u << misalign;
 379
 380   /* Main loop processing 16 bytes at a time.  */
 381   goto start;
 382   do
 383     {
 384       data = *++p;
 385       mask = -1;
 386
 387     start:
 388       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 389       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 390       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 391       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 392       found = __builtin_ia32_pmovmskb128 (t);
 393       found &= mask;
 394     }
 395   while (!found);
 396
 397   /* FOUND contains 1 in bits for which we matched a relevant
 398      character.  Conversion to the byte index is trivial.  */
 399   found = __builtin_ctz(found);
 400   return (const uchar *)p + found;
 401 }
 402
 403 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
        Returns a pointer to the first \n, \r, ? or \\ at or after S.  END
        is only used to decide whether an unaligned 16-byte read at S is
        safe.  */
 404
 405 static const uchar *
 406 #ifndef __SSE4_2__
 407 __attribute__((__target__("sse4.2")))
 408 #endif
 409 search_line_sse42 (const uchar *s, const uchar *end)
 410 {
 411   typedef char v16qi __attribute__ ((__vector_size__ (16)));
       /* The set of characters to look for; only the first four elements
          are meaningful, which is why every PCMPESTRI below is given 4 in
          the "a" (EAX) operand.  */
 412   static const v16qi search = { '\n', '\r', '?', '\\' };
 413
 414   uintptr_t si = (uintptr_t)s;
 415   uintptr_t index;
 416
 417   /* Check for unaligned input.  */
 418   if (si & 15)
 419     {
           /* NOTE(review): the 0xfff/0xff0 test presumes 4 KiB pages.  */
 420       if (__builtin_expect (end - s < 16, 0)
 421           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 422         {
 423           /* There are fewer than 16 bytes left in the buffer, and fewer
 424              than 16 bytes left on the page.  Reading 16 bytes at this
 425              point might generate a spurious page fault.  Defer to the
 426              SSE2 implementation, which already handles alignment.  */
 427           return search_line_sse2 (s, end);
 428         }
 429
 430       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 431          memory need not be aligned.  The "d" (EDX) operand gives the
            number of bytes examined at S; INDEX comes back in ECX and is
            below 16 only when one of them matched.  */
 432       __asm ("%vpcmpestri $0, (%1), %2"
 433              : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
 434       if (__builtin_expect (index < 16, 0))
 435         goto found;
 436
 437       /* Advance the pointer to an aligned address.  We will re-scan a
 438          few bytes, but we no longer need to care about reading past the
 439          end of a page, since we're guaranteed a match.  */
 440       s = (const uchar *)((si + 16) & -16);
 441     }
 442
 443   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 444      in inline assembly, we can make proper use of the flags set.  The
          initial SUB cancels the first ADD, so the first block examined is
          the one at S; JNC repeats the loop for as long as PCMPESTRI
          reports no match.  */
 445   __asm (      "sub $16, %1\n"
 446         "       .balign 16\n"
 447         "0:     add $16, %1\n"
 448         "       %vpcmpestri $0, (%1), %2\n"
 449         "       jnc 0b"
 450         : "=&c"(index), "+r"(s)
 451         : "x"(search), "a"(4), "d"(16));
 452
 453  found:
       /* S is the start of the 16 bytes last examined and INDEX the offset
          of the match within them.  */
 454   return s + index;
 455 }
 456
 457 /* Check the CPU capabilities.  (Presumably cpuid.h is what provides
        __get_cpuid and the bit_* feature masks used by
        init_vectorized_lexer.)  */
 458
 459 #include "../gcc/config/i386/cpuid.h"
 460
     /* Signature shared by all of the search_line_* implementations, and
        the pointer through which the implementation selected by
        init_vectorized_lexer is called.  */
 461 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 462 static search_line_fast_type search_line_fast;
 463
 464 static void __attribute__((constructor))
 465 init_vectorized_lexer (void)
 466 {
 467   unsigned dummy, ecx = 0, edx = 0;
 468   search_line_fast_type impl = search_line_acc_char;
 469   int minimum = 0;
 470
 471 #if defined(__SSE4_2__)
 472   minimum = 3;
 473 #elif defined(__SSE2__)
 474   minimum = 2;
 475 #elif defined(__SSE__) || defined(__3dNOW_A__)
 476   minimum = 1;
 477 #endif
 478
 479   if (minimum == 3)
 480     impl = search_line_sse42;
 481   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 482     {
 483       if (minimum == 3 || (ecx & bit_SSE4_2))
 484         impl = search_line_sse42;
 485       else if (minimum == 2 || (edx & bit_SSE2))
 486         impl = search_line_sse2;
 487       else if (minimum == 1 || (edx & bit_SSE))
 488         impl = search_line_mmx;
 489     }
 490   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 491     {
 492       if (minimum == 1 || edx & bit_3DNOWP)
 493         impl = search_line_mmx;
 494     }
 495
 496   search_line_fast = impl;
 497 }
 498
 499 #elif defined(__GNUC__) && defined(__ALTIVEC__)
 500
 501 /* A version of the fast scanner using AltiVec vectorized byte compares.  */
 502 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 503    so we can't compile this function without -maltivec on the command line
 504    (or implied by some other switch).  */
 505
     /* Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
        is not consulted.  */
 506 static const uchar *
 507 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 508 {
 509   typedef __attribute__((altivec(vector))) unsigned char vc;
 510
 511   const vc repl_nl = {
 512     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 513     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 514   };
 515   const vc repl_cr = {
 516     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 517     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 518   };
 519   const vc repl_bs = {
 520     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 521     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 522   };
 523   const vc repl_qm = {
 524     '?', '?', '?', '?', '?', '?', '?', '?',
 525     '?', '?', '?', '?', '?', '?', '?', '?',
 526   };
 527   const vc ones = {
 528     -1, -1, -1, -1, -1, -1, -1, -1,
 529     -1, -1, -1, -1, -1, -1, -1, -1,
 530   };
 531   const vc zero = { 0 };
 532
 533   vc data, mask, t;
 534
 535   /* Altivec loads automatically mask addresses with -16.  This lets us
 536      issue the first load as early as possible.  */
 537   data = __builtin_vec_ld(0, (const vc *)s);
 538
 539   /* Discard bytes before the beginning of the buffer.  Do this by
 540      beginning with all ones and shifting in zeros according to the
 541      mis-alignment.  The LVSR instruction pulls the exact shift we
 542      want from the address.  */
 543   mask = __builtin_vec_lvsr(0, s);
 544   mask = __builtin_vec_perm(zero, ones, mask);
 545   data &= mask;
 546
 547   /* While altivec loads mask addresses, we still need to align S so
 548      that the offset we compute at the end is correct.  */
 549   s = (const uchar *)((uintptr_t)s & -16);
 550
 551   /* Main loop processing 16 bytes at a time.  The first iteration is
          entered at START, so that the block loaded (and masked) above is
          examined before S is advanced.  */
 552   goto start;
 553   do
 554     {
 555       vc m_nl, m_cr, m_bs, m_qm;
 556
 557       s += 16;
 558       data = __builtin_vec_ld(0, (const vc *)s);
 559
 560     start:
 561       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 562       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 563       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 564       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 565       t = (m_nl | m_cr) | (m_bs | m_qm);
 566
 567       /* T now contains 0xff in bytes for which we matched one of the relevant
 568          characters.  We want to exit the loop if any byte in T is non-zero.
 569          Below is the expansion of vec_any_ne(t, zero).  */
 570     }
 571   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 572
 573   {
         /* Number of unsigned longs that make up one vector.  */
 574 #define N  (sizeof(vc) / sizeof(long))
 575
         /* Compile-time check that N is 2 or 4; any other value gives the
            array a negative size.  */
 576     typedef char check_count[(N == 2 || N == 4) * 2 - 1];
 577     union {
 578       vc v;
 579       unsigned long l[N];
 580     } u;
 581     unsigned long l, i = 0;
 582
 583     u.v = t;
 584
 585     /* Find the first word of T that is non-zero.  S is advanced past
            every all-zero word that is skipped.  */
 586     switch (N)
 587       {
 588       case 4:
 589         l = u.l[i++];
 590         if (l != 0)
 591           break;
 592         s += sizeof(unsigned long);
 593         l = u.l[i++];
 594         if (l != 0)
 595           break;
 596         s += sizeof(unsigned long);
             /* Fall through: the last two words are examined by the code
                shared with the N == 2 case.  */
 597       case 2:
 598         l = u.l[i++];
 599         if (l != 0)
 600           break;
 601         s += sizeof(unsigned long);
 602         l = u.l[i];
 603       }
 604
 605     /* L now contains 0xff in bytes for which we matched one of the
 606        relevant characters.  We can find the byte index by finding
 607        its bit index and dividing by 8.  The leading-zero count numbers
            bytes from the most significant end of L.
            NOTE(review): this presumably relies on a big-endian host,
            where that is also the first byte in memory -- confirm.  */
 608     l = __builtin_clzl(l) >> 3;
 609     return s + l;
 610
 611 #undef N
 612   }
 613 }
 614
 615 #else
 616
 617 /* We only have one accelerated alternative.  Use a direct call so that
 618    we encourage inlining.  */
 619
 620 #define search_line_fast  search_line_acc_char
 621
 622 #endif
 623
 624 /* Returns with a logical line that contains no escaped newlines or
 625    trigraphs.  This is a time-critical inner loop.

        The line starts at PFILE->buffer->next_line.  On return the
        buffer's cur and line_base point at its start, the line is
        terminated by a '\n' written into the buffer, next_line points
        past the line ending that terminated it, and the buffer's note
        array describes the escaped newlines and trigraphs that were
        found, followed by a sentinel note.  The buffer is edited in
        place, but nothing is written until an escaped newline or a
        trigraph to be converted is met.  */
 626 void
 627 _cpp_clean_line (cpp_reader *pfile)
 628 {
 629   cpp_buffer *buffer;
 630   const uchar *s;
 631   uchar c, *d, *p;
 632
 633   buffer = pfile->buffer;
 634   buffer->cur_note = buffer->notes_used = 0;
 635   buffer->cur = buffer->line_base = buffer->next_line;
 636   buffer->need_line = false;
 637   s = buffer->next_line;
 638
 639   if (!buffer->from_stage3)
 640     {
           /* Most recent backslash seen on the fast path, if any.  */
 641       const uchar *pbackslash = NULL;
 642
 643       /* Fast path.  This is the common case of an un-escaped line with
 644          no trigraphs.  The primary win here is by not writing any
 645          data back to memory until we have to.  */
 646       while (1)
 647         {
 648           /* Perform an optimized search for \n, \r, \\, ?.  */
 649           s = search_line_fast (s, buffer->rlimit);
 650
 651           c = *s;
 652           if (c == '\\')
 653             {
 654               /* Record the location of the backslash and continue.  */
 655               pbackslash = s++;
 656             }
 657           else if (__builtin_expect (c == '?', 0))
 658             {
 659               if (__builtin_expect (s[1] == '?', false)
 660                    && _cpp_trigraph_map[s[2]])
 661                 {
 662                   /* Have a trigraph.  We may or may not have to convert
 663                      it.  Add a line note regardless, for -Wtrigraphs.  */
 664                   add_line_note (buffer, s, s[2]);
 665                   if (CPP_OPTION (pfile, trigraphs))
 666                     {
 667                       /* We do, and that means we have to switch to the
 668                          slow path.  Overwrite the first '?' with the
                              replacement character and leave S on the last
                              character of the trigraph; the slow path
                              pre-increments both S and D.  */
 669                       d = (uchar *) s;
 670                       *d = _cpp_trigraph_map[s[2]];
 671                       s += 2;
 672                       goto slow_path;
 673                     }
 674                 }
 675               /* Not a trigraph.  Continue on fast-path.  */
 676               s++;
 677             }
 678           else
 679             break;
 680         }
 681
 682       /* This must be \r or \n.  We're either done, or we'll be forced
 683          to write back to the buffer and continue on the slow path.  */
 684       d = (uchar *) s;
 685
 686       if (__builtin_expect (s == buffer->rlimit, false))
 687         goto done;
 688
 689       /* DOS line ending? */
 690       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 691         {
 692           s++;
 693           if (s == buffer->rlimit)
 694             goto done;
 695         }
 696
 697       if (__builtin_expect (pbackslash == NULL, true))
 698         goto done;
 699
 700       /* Check for escaped newline: skip back over any trailing
              whitespace and see whether the character before it is the
              last backslash recorded above.  */
 701       p = d;
 702       while (is_nvspace (p[-1]))
 703         p--;
 704       if (p - 1 != pbackslash)
 705         goto done;
 706
 707       /* Have an escaped newline; process it and proceed to
 708          the slow path.  The note type is ' ' when whitespace separated
              the backslash from the newline and '\\' otherwise.  */
 709       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 710       d = p - 2;
 711       buffer->next_line = p - 1;
 712
           /* From here on S is the read position and D the write position;
              each is advanced before it is used.  */
 713     slow_path:
 714       while (1)
 715         {
 716           c = *++s;
 717           *++d = c;
 718
 719           if (c == '\n' || c == '\r')
 720             {
 721               /* Handle DOS line endings.  */
 722               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 723                 s++;
 724               if (s == buffer->rlimit)
 725                 break;
 726
 727               /* Escaped?  */
 728               p = d;
 729               while (p != buffer->next_line && is_nvspace (p[-1]))
 730                 p--;
 731               if (p == buffer->next_line || p[-1] != '\\')
 732                 break;
 733
 734               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 735               d = p - 2;
 736               buffer->next_line = p - 1;
 737             }
 738           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 739             {
 740               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 741               add_line_note (buffer, d, s[2]);
 742               if (CPP_OPTION (pfile, trigraphs))
 743                 {
 744                   *d = _cpp_trigraph_map[s[2]];
 745                   s += 2;
 746                 }
 747             }
 748         }
 749     }
 750   else
 751     {
           /* With from_stage3 set, only the end of the line is located; no
              escaped newlines or trigraphs are looked for.  */
 752       while (*s != '\n' && *s != '\r')
 753         s++;
 754       d = (uchar *) s;
 755
 756       /* Handle DOS line endings.  */
 757       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 758         s++;
 759     }
 760
 761  done:
       /* D is where the cleaned line ends and S is the last character of
          the line ending that terminated it.  */
 762   *d = '\n';
 763   /* A sentinel note that should never be processed.  */
 764   add_line_note (buffer, d + 1, '\n');
 765   buffer->next_line = s + 1;
 766 }
 767
 768 /* Return true if the trigraph indicated by NOTE should be warned
 769    about in a comment.  */
 770 static bool
 771 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 772 {
 773   const uchar *p;
 774
 775   /* Within comments we don't warn about trigraphs, unless the
 776      trigraph forms an escaped newline, as that may change
 777      behavior.  */
 778   if (note->type != '/')
 779     return false;
 780
 781   /* If -trigraphs, then this was an escaped newline iff the next note
 782      is coincident.  */
 783   if (CPP_OPTION (pfile, trigraphs))
 784     return note[1].pos == note->pos;
 785
 786   /* Otherwise, see if this forms an escaped newline.  */
 787   p = note->pos + 3;
 788   while (is_nvspace (*p))
 789     p++;
 790
 791   /* There might have been escaped newlines between the trigraph and the
 792      newline we found.  Hence the position test.  */
 793   return (*p == '\n' && p < note[1].pos);
 794 }
 795
 796 /* Process the notes created by add_line_note as far as the current
 797    location.  */
 798 void
 799 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 800 {
 801   cpp_buffer *buffer = pfile->buffer;
 802
 803   for (;;)
 804     {
 805       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 806       unsigned int col;
 807
 808       if (note->pos > buffer->cur)
 809         break;
 810
 811       buffer->cur_note++;
 812       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 813
 814       if (note->type == '\\' || note->type == ' ')
 815         {
 816           if (note->type == ' ' && !in_comment)
 817             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 818                                  "backslash and newline separated by space");
 819
 820           if (buffer->next_line > buffer->rlimit)
 821             {
 822               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 823                                    "backslash-newline at end of file");
 824               /* Prevent "no newline at end of file" warning.  */
 825               buffer->next_line = buffer->rlimit;
 826             }
 827
 828           buffer->line_base = note->pos;
 829           CPP_INCREMENT_LINE (pfile, 0);
 830         }
 831       else if (_cpp_trigraph_map[note->type])
 832         {
 833           if (CPP_OPTION (pfile, warn_trigraphs)
 834               && (!in_comment || warn_in_comment (pfile, note)))
 835             {
 836               if (CPP_OPTION (pfile, trigraphs))
 837                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 838                                        pfile->line_table->highest_line, col,
 839                                        "trigraph ??%c converted to %c",
 840                                        note->type,
 841                                        (int) _cpp_trigraph_map[note->type]);
 842               else
 843                 {
 844                   cpp_warning_with_line
 845                     (pfile, CPP_W_TRIGRAPHS,
 846                      pfile->line_table->highest_line, col,
 847                      "trigraph ??%c ignored, use -trigraphs to enable",
 848                      note->type);
 849                 }
 850             }
 851         }
 852       else if (note->type == 0)
 853         /* Already processed in lex_raw_string.  */;
 854       else
 855         abort ();
 856     }
 857 }
 858
 859 /* Skip a C-style block comment.  We find the end of the comment by
 860    seeing if an asterisk is before every '/' we encounter.  Returns
 861    nonzero if comment terminated by EOF, zero otherwise.
 862
 863    Buffer->cur points to the initial asterisk of the comment.  */
 864 bool
 865 _cpp_skip_block_comment (cpp_reader *pfile)
 866 {
 867   cpp_buffer *buffer = pfile->buffer;
 868   const uchar *cur = buffer->cur;
 869   uchar c;
 870
 871   cur++;
 872   if (*cur == '/')
 873     cur++;
 874
 875   for (;;)
 876     {
 877       /* People like decorating comments with '*', so check for '/'
 878          instead for efficiency.  */
 879       c = *cur++;
 880
 881       if (c == '/')
 882         {
 883           if (cur[-2] == '*')
 884             break;
 885
 886           /* Warn about potential nested comments, but not if the '/'
 887              comes immediately before the true comment delimiter.
 888              Don't bother to get it right across escaped newlines.  */
 889           if (CPP_OPTION (pfile, warn_comments)
 890               && cur[0] == '*' && cur[1] != '/')
 891             {
 892               buffer->cur = cur;
 893               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 894                                      pfile->line_table->highest_line,
 895                                      CPP_BUF_COL (buffer),
 896                                      "\"/*\" within comment");
 897             }
 898         }
 899       else if (c == '\n')
 900         {
 901           unsigned int cols;
 902           buffer->cur = cur - 1;
 903           _cpp_process_line_notes (pfile, true);
 904           if (buffer->next_line >= buffer->rlimit)
 905             return true;
 906           _cpp_clean_line (pfile);
 907
 908           cols = buffer->next_line - buffer->line_base;
 909           CPP_INCREMENT_LINE (pfile, cols);
 910
 911           cur = buffer->cur;
 912         }
 913     }
 914
 915   buffer->cur = cur;
 916   _cpp_process_line_notes (pfile, true);
 917   return false;
 918 }
 919
 920 /* Skip a C++ line comment, leaving buffer->cur pointing to the
 921    terminating newline.  Handles escaped newlines.  Returns nonzero
 922    if a multiline comment.  */
 923 static int
 924 skip_line_comment (cpp_reader *pfile)
 925 {
 926   cpp_buffer *buffer = pfile->buffer;
 927   source_location orig_line = pfile->line_table->highest_line;
 928
 929   while (*buffer->cur != '\n')
 930     buffer->cur++;
 931
 932   _cpp_process_line_notes (pfile, true);
 933   return orig_line != pfile->line_table->highest_line;
 934 }
 935
 936 /* Skips whitespace, saving the next non-whitespace character.  */
 937 static void
 938 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 939 {
 940   cpp_buffer *buffer = pfile->buffer;
 941   bool saw_NUL = false;
 942
 943   do
 944     {
 945       /* Horizontal space always OK.  */
 946       if (c == ' ' || c == '\t')
 947         ;
 948       /* Just \f \v or \0 left.  */
 949       else if (c == '\0')
 950         saw_NUL = true;
 951       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
 952         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 953                              CPP_BUF_COL (buffer),
 954                              "%s in preprocessing directive",
 955                              c == '\f' ? "form feed" : "vertical tab");
 956
 957       c = *buffer->cur++;
 958     }
 959   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
 960   while (is_nvspace (c));
 961
 962   if (saw_NUL)
 963     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 964
 965   buffer->cur--;
 966 }
 967
 968 /* See if the characters of a number token are valid in a name (no
 969    '.', '+' or '-').  */
 970 static int
 971 name_p (cpp_reader *pfile, const cpp_string *string)
 972 {
 973   unsigned int i;
 974
 975   for (i = 0; i < string->len; i++)
 976     if (!is_idchar (string->text[i]))
 977       return 0;
 978
 979   return 1;
 980 }
 981
 982 /* After parsing an identifier or other sequence, produce a warning about
 983    sequences not in NFC/NFKC.  */
 984 static void
 985 warn_about_normalization (cpp_reader *pfile,
 986                           const cpp_token *token,
 987                           const struct normalize_state *s)
 988 {
 989   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 990       && !pfile->state.skipping)
 991     {
 992       /* Make sure that the token is printed using UCNs, even
 993          if we'd otherwise happily print UTF-8.  */
 994       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 995       size_t sz;
 996
 997       sz = cpp_spell_token (pfile, token, buf, false) - buf;
 998       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 999         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1000                                "`%.*s' is not in NFKC", (int) sz, buf);
1001       else
1002         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1003                                "`%.*s' is not in NFC", (int) sz, buf);
1004     }
1005 }
1006
1007 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1008    an identifier.  FIRST is TRUE if this starts an identifier.  */
1009 static bool
1010 forms_identifier_p (cpp_reader *pfile, int first,
1011                     struct normalize_state *state)
1012 {
1013   cpp_buffer *buffer = pfile->buffer;
1014
1015   if (*buffer->cur == '$')
1016     {
1017       if (!CPP_OPTION (pfile, dollars_in_ident))
1018         return false;
1019
1020       buffer->cur++;
1021       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1022         {
1023           CPP_OPTION (pfile, warn_dollars) = 0;
1024           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1025         }
1026
1027       return true;
1028     }
1029
1030   /* Is this a syntactically valid UCN?  */
1031   if (CPP_OPTION (pfile, extended_identifiers)
1032       && *buffer->cur == '\\'
1033       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1034     {
1035       buffer->cur += 2;
1036       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1037                           state))
1038         return true;
1039       buffer->cur -= 2;
1040     }
1041
1042   return false;
1043 }
1044
1045 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1046 static cpp_hashnode *
1047 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1048 {
1049   cpp_hashnode *result;
1050   const uchar *cur;
1051   unsigned int len;
1052   unsigned int hash = HT_HASHSTEP (0, *base);
1053
1054   cur = base + 1;
1055   while (ISIDNUM (*cur))
1056     {
1057       hash = HT_HASHSTEP (hash, *cur);
1058       cur++;
1059     }
1060   len = cur - base;
1061   hash = HT_HASHFINISH (hash, len);
1062   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1063                                               base, len, hash, HT_ALLOC));
1064
1065   /* Rarely, identifiers require diagnostics when lexed.  */
1066   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1067                         && !pfile->state.skipping, 0))
1068     {
1069       /* It is allowed to poison the same identifier twice.  */
1070       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1071         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1072                    NODE_NAME (result));
1073
1074       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1075          replacement list of a variadic macro.  */
1076       if (result == pfile->spec_nodes.n__VA_ARGS__
1077           && !pfile->state.va_args_ok)
1078         cpp_error (pfile, CPP_DL_PEDWARN,
1079                    "__VA_ARGS__ can only appear in the expansion"
1080                    " of a C99 variadic macro");
1081
1082       /* For -Wc++-compat, warn about use of C++ named operators.  */
1083       if (result->flags & NODE_WARN_OPERATOR)
1084         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1085                      "identifier \"%s\" is a special operator name in C++",
1086                      NODE_NAME (result));
1087     }
1088
1089   return result;
1090 }
1091
1092 /* Get the cpp_hashnode of an identifier specified by NAME in
1093    the current cpp_reader object.  If none is found, NULL is returned.  */
1094 cpp_hashnode *
1095 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1096 {
1097   cpp_hashnode *result;
1098   result = lex_identifier_intern (pfile, (uchar *) name);
1099   return result;
1100 }
1101
1102 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1103 static cpp_hashnode *
1104 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1105                 struct normalize_state *nst)
1106 {
1107   cpp_hashnode *result;
1108   const uchar *cur;
1109   unsigned int len;
1110   unsigned int hash = HT_HASHSTEP (0, *base);
1111
1112   cur = pfile->buffer->cur;
1113   if (! starts_ucn)
1114     while (ISIDNUM (*cur))
1115       {
1116         hash = HT_HASHSTEP (hash, *cur);
1117         cur++;
1118       }
1119   pfile->buffer->cur = cur;
1120   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1121     {
1122       /* Slower version for identifiers containing UCNs (or $).  */
1123       do {
1124         while (ISIDNUM (*pfile->buffer->cur))
1125           {
1126             pfile->buffer->cur++;
1127             NORMALIZE_STATE_UPDATE_IDNUM (nst);
1128           }
1129       } while (forms_identifier_p (pfile, false, nst));
1130       result = _cpp_interpret_identifier (pfile, base,
1131                                           pfile->buffer->cur - base);
1132     }
1133   else
1134     {
1135       len = cur - base;
1136       hash = HT_HASHFINISH (hash, len);
1137
1138       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1139                                                   base, len, hash, HT_ALLOC));
1140     }
1141
1142   /* Rarely, identifiers require diagnostics when lexed.  */
1143   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1144                         && !pfile->state.skipping, 0))
1145     {
1146       /* It is allowed to poison the same identifier twice.  */
1147       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1148         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1149                    NODE_NAME (result));
1150
1151       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1152          replacement list of a variadic macro.  */
1153       if (result == pfile->spec_nodes.n__VA_ARGS__
1154           && !pfile->state.va_args_ok)
1155         cpp_error (pfile, CPP_DL_PEDWARN,
1156                    "__VA_ARGS__ can only appear in the expansion"
1157                    " of a C99 variadic macro");
1158
1159       /* For -Wc++-compat, warn about use of C++ named operators.  */
1160       if (result->flags & NODE_WARN_OPERATOR)
1161         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1162                      "identifier \"%s\" is a special operator name in C++",
1163                      NODE_NAME (result));
1164     }
1165
1166   return result;
1167 }
1168
1169 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1170 static void
1171 lex_number (cpp_reader *pfile, cpp_string *number,
1172             struct normalize_state *nst)
1173 {
1174   const uchar *cur;
1175   const uchar *base;
1176   uchar *dest;
1177
1178   base = pfile->buffer->cur - 1;
1179   do
1180     {
1181       cur = pfile->buffer->cur;
1182
1183       /* N.B. ISIDNUM does not include $.  */
1184       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1185         {
1186           cur++;
1187           NORMALIZE_STATE_UPDATE_IDNUM (nst);
1188         }
1189
1190       pfile->buffer->cur = cur;
1191     }
1192   while (forms_identifier_p (pfile, false, nst));
1193
1194   number->len = cur - base;
1195   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1196   memcpy (dest, base, number->len);
1197   dest[number->len] = '\0';
1198   number->text = dest;
1199 }
1200
1201 /* Create a token of type TYPE with a literal spelling.  */
1202 static void
1203 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1204                 unsigned int len, enum cpp_ttype type)
1205 {
1206   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1207
1208   memcpy (dest, base, len);
1209   dest[len] = '\0';
1210   token->type = type;
1211   token->val.str.len = len;
1212   token->val.str.text = dest;
1213 }
1214
1215 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1216    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1217
1218 static void
1219 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1220                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1221 {
1222   _cpp_buff *first_buff = *first_buff_p;
1223   _cpp_buff *last_buff = *last_buff_p;
1224
1225   if (first_buff == NULL)
1226     first_buff = last_buff = _cpp_get_buff (pfile, len);
1227   else if (len > BUFF_ROOM (last_buff))
1228     {
1229       size_t room = BUFF_ROOM (last_buff);
1230       memcpy (BUFF_FRONT (last_buff), base, room);
1231       BUFF_FRONT (last_buff) += room;
1232       base += room;
1233       len -= room;
1234       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1235     }
1236
1237   memcpy (BUFF_FRONT (last_buff), base, len);
1238   BUFF_FRONT (last_buff) += len;
1239
1240   *first_buff_p = first_buff;
1241   *last_buff_p = last_buff;
1242 }
1243
1244 /* Lexes a raw string.  The stored string contains the spelling, including
1245    double quotes, delimiter string, '(' and ')', any leading
1246    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1247    literal, or CPP_OTHER if it was not properly terminated.
1248
1249    The spelling is NUL-terminated, but it is not guaranteed that this
1250    is the first NUL since embedded NULs are preserved.  */
1251
1252 static void
1253 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1254                 const uchar *cur)
1255 {
1256   source_location saw_NUL = 0;
1257   const uchar *raw_prefix;
1258   unsigned int raw_prefix_len = 0;
1259   enum cpp_ttype type;
1260   size_t total_len = 0;
1261   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1262   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1263
1264   type = (*base == 'L' ? CPP_WSTRING :
1265           *base == 'U' ? CPP_STRING32 :
1266           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1267           : CPP_STRING);
1268
1269   raw_prefix = cur + 1;
1270   while (raw_prefix_len < 16)
1271     {
1272       switch (raw_prefix[raw_prefix_len])
1273         {
1274         case ' ': case '(': case ')': case '\\': case '\t':
1275         case '\v': case '\f': case '\n': default:
1276           break;
1277         /* Basic source charset except the above chars.  */
1278         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1279         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1280         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1281         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1282         case 'y': case 'z':
1283         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1284         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1285         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1286         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1287         case 'Y': case 'Z':
1288         case '0': case '1': case '2': case '3': case '4': case '5':
1289         case '6': case '7': case '8': case '9':
1290         case '_': case '{': case '}': case '#': case '[': case ']':
1291         case '<': case '>': case '%': case ':': case ';': case '.':
1292         case '?': case '*': case '+': case '-': case '/': case '^':
1293         case '&': case '|': case '~': case '!': case '=': case ',':
1294         case '"': case '\'':
1295           raw_prefix_len++;
1296           continue;
1297         }
1298       break;
1299     }
1300
1301   if (raw_prefix[raw_prefix_len] != '(')
1302     {
1303       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1304                 + 1;
1305       if (raw_prefix_len == 16)
1306         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1307                              "raw string delimiter longer than 16 characters");
1308       else
1309         cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1310                              "invalid character '%c' in raw string delimiter",
1311                              (int) raw_prefix[raw_prefix_len]);
1312       pfile->buffer->cur = raw_prefix - 1;
1313       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1314       return;
1315     }
1316
1317   cur = raw_prefix + raw_prefix_len + 1;
1318   for (;;)
1319     {
1320 #define BUF_APPEND(STR,LEN)                                     \
1321       do {                                                      \
1322         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1323                         &first_buff, &last_buff);               \
1324         total_len += (LEN);                                     \
1325       } while (0);
1326
1327       cppchar_t c;
1328
1329       /* If we previously performed any trigraph or line splicing
1330          transformations, undo them within the body of the raw string.  */
1331       while (note->pos < cur)
1332         ++note;
1333       for (; note->pos == cur; ++note)
1334         {
1335           switch (note->type)
1336             {
1337             case '\\':
1338             case ' ':
1339               /* Restore backslash followed by newline.  */
1340               BUF_APPEND (base, cur - base);
1341               base = cur;
1342               BUF_APPEND ("\\", 1);
1343             after_backslash:
1344               if (note->type == ' ')
1345                 {
1346                   /* GNU backslash whitespace newline extension.  FIXME
1347                      could be any sequence of non-vertical space.  When we
1348                      can properly restore any such sequence, we should mark
1349                      this note as handled so _cpp_process_line_notes
1350                      doesn't warn.  */
1351                   BUF_APPEND (" ", 1);
1352                 }
1353
1354               BUF_APPEND ("\n", 1);
1355               break;
1356
1357             case 0:
1358               /* Already handled.  */
1359               break;
1360
1361             default:
1362               if (_cpp_trigraph_map[note->type])
1363                 {
1364                   /* Don't warn about this trigraph in
1365                      _cpp_process_line_notes, since trigraphs show up as
1366                      trigraphs in raw strings.  */
1367                   uchar type = note->type;
1368                   note->type = 0;
1369
1370                   if (!CPP_OPTION (pfile, trigraphs))
1371                     /* If we didn't convert the trigraph in the first
1372                        place, don't do anything now either.  */
1373                     break;
1374
1375                   BUF_APPEND (base, cur - base);
1376                   base = cur;
1377                   BUF_APPEND ("??", 2);
1378
1379                   /* ??/ followed by newline gets two line notes, one for
1380                      the trigraph and one for the backslash/newline.  */
1381                   if (type == '/' && note[1].pos == cur)
1382                     {
1383                       if (note[1].type != '\\'
1384                           && note[1].type != ' ')
1385                         abort ();
1386                       BUF_APPEND ("/", 1);
1387                       ++note;
1388                       goto after_backslash;
1389                     }
1390                   /* The ) from ??) could be part of the suffix.  */
1391                   else if (type == ')'
1392                            && strncmp ((const char *) cur+1,
1393                                        (const char *) raw_prefix,
1394                                        raw_prefix_len) == 0
1395                            && cur[raw_prefix_len+1] == '"')
1396                     {
1397                       cur += raw_prefix_len+2;
1398                       goto break_outer_loop;
1399                     }
1400                   else
1401                     {
1402                       /* Skip the replacement character.  */
1403                       base = ++cur;
1404                       BUF_APPEND (&type, 1);
1405                     }
1406                 }
1407               else
1408                 abort ();
1409               break;
1410             }
1411         }
1412       c = *cur++;
1413
1414       if (c == ')'
1415           && strncmp ((const char *) cur, (const char *) raw_prefix,
1416                       raw_prefix_len) == 0
1417           && cur[raw_prefix_len] == '"')
1418         {
1419           cur += raw_prefix_len + 1;
1420           break;
1421         }
1422       else if (c == '\n')
1423         {
1424           if (pfile->state.in_directive
1425               || pfile->state.parsing_args
1426               || pfile->state.in_deferred_pragma)
1427             {
1428               cur--;
1429               type = CPP_OTHER;
1430               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1431                                    "unterminated raw string");
1432               break;
1433             }
1434
1435           BUF_APPEND (base, cur - base);
1436
1437           if (pfile->buffer->cur < pfile->buffer->rlimit)
1438             CPP_INCREMENT_LINE (pfile, 0);
1439           pfile->buffer->need_line = true;
1440
1441           pfile->buffer->cur = cur-1;
1442           _cpp_process_line_notes (pfile, false);
1443           if (!_cpp_get_fresh_line (pfile))
1444             {
1445               source_location src_loc = token->src_loc;
1446               token->type = CPP_EOF;
1447               /* Tell the compiler the line number of the EOF token.  */
1448               token->src_loc = pfile->line_table->highest_line;
1449               token->flags = BOL;
1450               if (first_buff != NULL)
1451                 _cpp_release_buff (pfile, first_buff);
1452               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1453                                    "unterminated raw string");
1454               return;
1455             }
1456
1457           cur = base = pfile->buffer->cur;
1458           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1459         }
1460       else if (c == '\0' && !saw_NUL)
1461         LINEMAP_POSITION_FOR_COLUMN (saw_NUL, pfile->line_table,
1462                                      CPP_BUF_COLUMN (pfile->buffer, cur));
1463     }
1464  break_outer_loop:
1465
1466   if (saw_NUL && !pfile->state.skipping)
1467     cpp_error_with_line (pfile, CPP_DL_WARNING, saw_NUL, 0,
1468                "null character(s) preserved in literal");
1469
1470   pfile->buffer->cur = cur;
1471   if (first_buff == NULL)
1472     create_literal (pfile, token, base, cur - base, type);
1473   else
1474     {
1475       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1476
1477       token->type = type;
1478       token->val.str.len = total_len + (cur - base);
1479       token->val.str.text = dest;
1480       last_buff = first_buff;
1481       while (last_buff != NULL)
1482         {
1483           memcpy (dest, last_buff->base,
1484                   BUFF_FRONT (last_buff) - last_buff->base);
1485           dest += BUFF_FRONT (last_buff) - last_buff->base;
1486           last_buff = last_buff->next;
1487         }
1488       _cpp_release_buff (pfile, first_buff);
1489       memcpy (dest, base, cur - base);
1490       dest[cur - base] = '\0';
1491     }
1492 }
1493
1494 /* Lexes a string, character constant, or angle-bracketed header file
1495    name.  The stored string contains the spelling, including opening
1496    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1497    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1498    if it was not properly terminated, or CPP_LESS for an unterminated
1499    header name which must be relexed as normal tokens.
1500
1501    The spelling is NUL-terminated, but it is not guaranteed that this
1502    is the first NUL since embedded NULs are preserved.  */
1503 static void
1504 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1505 {
1506   bool saw_NUL = false;
1507   const uchar *cur;
1508   cppchar_t terminator;
1509   enum cpp_ttype type;
1510
1511   cur = base;
1512   terminator = *cur++;
1513   if (terminator == 'L' || terminator == 'U')
1514     terminator = *cur++;
1515   else if (terminator == 'u')
1516     {
1517       terminator = *cur++;
1518       if (terminator == '8')
1519         terminator = *cur++;
1520     }
1521   if (terminator == 'R')
1522     {
1523       lex_raw_string (pfile, token, base, cur);
1524       return;
1525     }
1526   if (terminator == '"')
1527     type = (*base == 'L' ? CPP_WSTRING :
1528             *base == 'U' ? CPP_STRING32 :
1529             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1530                          : CPP_STRING);
1531   else if (terminator == '\'')
1532     type = (*base == 'L' ? CPP_WCHAR :
1533             *base == 'U' ? CPP_CHAR32 :
1534             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1535   else
1536     terminator = '>', type = CPP_HEADER_NAME;
1537
1538   for (;;)
1539     {
1540       cppchar_t c = *cur++;
1541
1542       /* In #include-style directives, terminators are not escapable.  */
1543       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1544         cur++;
1545       else if (c == terminator)
1546         break;
1547       else if (c == '\n')
1548         {
1549           cur--;
1550           /* Unmatched quotes always yield undefined behavior, but
1551              greedy lexing means that what appears to be an unterminated
1552              header name may actually be a legitimate sequence of tokens.  */
1553           if (terminator == '>')
1554             {
1555               token->type = CPP_LESS;
1556               return;
1557             }
1558           type = CPP_OTHER;
1559           break;
1560         }
1561       else if (c == '\0')
1562         saw_NUL = true;
1563     }
1564
1565   if (saw_NUL && !pfile->state.skipping)
1566     cpp_error (pfile, CPP_DL_WARNING,
1567                "null character(s) preserved in literal");
1568
1569   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1570     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1571                (int) terminator);
1572
1573   pfile->buffer->cur = cur;
1574   create_literal (pfile, token, base, cur - base, type);
1575 }
1576
1577 /* Return the comment table. The client may not make any assumption
1578    about the ordering of the table.  */
1579 cpp_comment_table *
1580 cpp_get_comments (cpp_reader *pfile)
1581 {
1582   return &pfile->comments;
1583 }
1584
1585 /* Append a comment to the end of the comment table. */
1586 static void
1587 store_comment (cpp_reader *pfile, cpp_token *token)
1588 {
1589   int len;
1590
1591   if (pfile->comments.allocated == 0)
1592     {
1593       pfile->comments.allocated = 256;
1594       pfile->comments.entries = (cpp_comment *) xmalloc
1595         (pfile->comments.allocated * sizeof (cpp_comment));
1596     }
1597
1598   if (pfile->comments.count == pfile->comments.allocated)
1599     {
1600       pfile->comments.allocated *= 2;
1601       pfile->comments.entries = (cpp_comment *) xrealloc
1602         (pfile->comments.entries,
1603          pfile->comments.allocated * sizeof (cpp_comment));
1604     }
1605
1606   len = token->val.str.len;
1607
1608   /* Copy comment. Note, token may not be NULL terminated. */
1609   pfile->comments.entries[pfile->comments.count].comment =
1610     (char *) xmalloc (sizeof (char) * (len + 1));
1611   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1612           token->val.str.text, len);
1613   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1614
1615   /* Set source location. */
1616   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1617
1618   /* Increment the count of entries in the comment table. */
1619   pfile->comments.count++;
1620 }
1621
1622 /* The stored comment includes the comment start and any terminator.  */
1623 static void
1624 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1625               cppchar_t type)
1626 {
1627   unsigned char *buffer;
1628   unsigned int len, clen;
1629
1630   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1631
1632   /* C++ comments probably (not definitely) have moved past a new
1633      line, which we don't want to save in the comment.  */
1634   if (is_vspace (pfile->buffer->cur[-1]))
1635     len--;
1636
1637   /* If we are currently in a directive, then we need to store all
1638      C++ comments as C comments internally, and so we need to
1639      allocate a little extra space in that case.
1640
1641      Note that the only time we encounter a directive here is
1642      when we are saving comments in a "#define".  */
1643   clen = (pfile->state.in_directive && type == '/') ? len + 2 : len;
1644
1645   buffer = _cpp_unaligned_alloc (pfile, clen);
1646
1647   token->type = CPP_COMMENT;
1648   token->val.str.len = clen;
1649   token->val.str.text = buffer;
1650
1651   buffer[0] = '/';
1652   memcpy (buffer + 1, from, len - 1);
1653
1654   /* Finish conversion to a C comment, if necessary.  */
1655   if (pfile->state.in_directive && type == '/')
1656     {
1657       buffer[1] = '*';
1658       buffer[clen - 2] = '*';
1659       buffer[clen - 1] = '/';
1660     }
1661
1662   /* Finally store this comment for use by clients of libcpp. */
1663   store_comment (pfile, token);
1664 }
1665
1666 /* Allocate COUNT tokens for RUN.  */
1667 void
1668 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1669 {
1670   run->base = XNEWVEC (cpp_token, count);
1671   run->limit = run->base + count;
1672   run->next = NULL;
1673 }
1674
1675 /* Returns the next tokenrun, or creates one if there is none.  */
1676 static tokenrun *
1677 next_tokenrun (tokenrun *run)
1678 {
1679   if (run->next == NULL)
1680     {
1681       run->next = XNEW (tokenrun);
1682       run->next->prev = run;
1683       _cpp_init_tokenrun (run->next, 250);
1684     }
1685
1686   return run->next;
1687 }
1688
1689 /* Look ahead in the input stream.  */
1690 const cpp_token *
1691 cpp_peek_token (cpp_reader *pfile, int index)
1692 {
1693   cpp_context *context = pfile->context;
1694   const cpp_token *peektok;
1695   int count;
1696
1697   /* First, scan through any pending cpp_context objects.  */
1698   while (context->prev)
1699     {
1700       ptrdiff_t sz = (context->direct_p
1701                       ? LAST (context).token - FIRST (context).token
1702                       : LAST (context).ptoken - FIRST (context).ptoken);
1703
1704       if (index < (int) sz)
1705         return (context->direct_p
1706                 ? FIRST (context).token + index
1707                 : *(FIRST (context).ptoken + index));
1708
1709       index -= (int) sz;
1710       context = context->prev;
1711     }
1712
1713   /* We will have to read some new tokens after all (and do so
1714      without invalidating preceding tokens).  */
1715   count = index;
1716   pfile->keep_tokens++;
1717
1718   do
1719     {
1720       peektok = _cpp_lex_token (pfile);
1721       if (peektok->type == CPP_EOF)
1722         return peektok;
1723     }
1724   while (index--);
1725
1726   _cpp_backup_tokens_direct (pfile, count + 1);
1727   pfile->keep_tokens--;
1728
1729   return peektok;
1730 }
1731
1732 /* Allocate a single token that is invalidated at the same time as the
1733    rest of the tokens on the line.  Has its line and col set to the
1734    same as the last lexed token, so that diagnostics appear in the
1735    right place.  */
1736 cpp_token *
1737 _cpp_temp_token (cpp_reader *pfile)
1738 {
1739   cpp_token *old, *result;
1740   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1741   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1742
1743   old = pfile->cur_token - 1;
1744   /* Any pre-existing lookaheads must not be clobbered.  */
1745   if (la)
1746     {
1747       if (sz <= la)
1748         {
1749           tokenrun *next = next_tokenrun (pfile->cur_run);
1750
1751           if (sz < la)
1752             memmove (next->base + 1, next->base,
1753                      (la - sz) * sizeof (cpp_token));
1754
1755           next->base[0] = pfile->cur_run->limit[-1];
1756         }
1757
1758       if (sz > 1)
1759         memmove (pfile->cur_token + 1, pfile->cur_token,
1760                  MIN (la, sz - 1) * sizeof (cpp_token));
1761     }
1762
1763   if (!sz && pfile->cur_token == pfile->cur_run->limit)
1764     {
1765       pfile->cur_run = next_tokenrun (pfile->cur_run);
1766       pfile->cur_token = pfile->cur_run->base;
1767     }
1768
1769   result = pfile->cur_token++;
1770   result->src_loc = old->src_loc;
1771   return result;
1772 }
1773
1774 /* Lex a token into RESULT (external interface).  Takes care of issues
1775    like directive handling, token lookahead, multiple include
1776    optimization and skipping.  */
1777 const cpp_token *
1778 _cpp_lex_token (cpp_reader *pfile)
1779 {
1780   cpp_token *result;
1781
1782   for (;;)
1783     {
1784       if (pfile->cur_token == pfile->cur_run->limit)
1785         {
1786           pfile->cur_run = next_tokenrun (pfile->cur_run);
1787           pfile->cur_token = pfile->cur_run->base;
1788         }
1789       /* We assume that the current token is somewhere in the current
1790          run.  */
1791       if (pfile->cur_token < pfile->cur_run->base
1792           || pfile->cur_token >= pfile->cur_run->limit)
1793         abort ();
1794
1795       if (pfile->lookaheads)
1796         {
1797           pfile->lookaheads--;
1798           result = pfile->cur_token++;
1799         }
1800       else
1801         result = _cpp_lex_direct (pfile);
1802
1803       if (result->flags & BOL)
1804         {
1805           /* Is this a directive.  If _cpp_handle_directive returns
1806              false, it is an assembler #.  */
1807           if (result->type == CPP_HASH
1808               /* 6.10.3 p 11: Directives in a list of macro arguments
1809                  gives undefined behavior.  This implementation
1810                  handles the directive as normal.  */
1811               && pfile->state.parsing_args != 1)
1812             {
1813               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
1814                 {
1815                   if (pfile->directive_result.type == CPP_PADDING)
1816                     continue;
1817                   result = &pfile->directive_result;
1818                 }
1819             }
1820           else if (pfile->state.in_deferred_pragma)
1821             result = &pfile->directive_result;
1822
1823           if (pfile->cb.line_change && !pfile->state.skipping)
1824             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
1825         }
1826
1827       /* We don't skip tokens in directives.  */
1828       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
1829         break;
1830
1831       /* Outside a directive, invalidate controlling macros.  At file
1832          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
1833          get here and MI optimization works.  */
1834       pfile->mi_valid = false;
1835
1836       if (!pfile->state.skipping || result->type == CPP_EOF)
1837         break;
1838     }
1839
1840   return result;
1841 }
1842
1843 /* Returns true if a fresh line has been loaded.  */
1844 bool
1845 _cpp_get_fresh_line (cpp_reader *pfile)
1846 {
1847   int return_at_eof;
1848
1849   /* We can't get a new line until we leave the current directive.  */
1850   if (pfile->state.in_directive)
1851     return false;
1852
1853   for (;;)
1854     {
1855       cpp_buffer *buffer = pfile->buffer;
1856
1857       if (!buffer->need_line)
1858         return true;
1859
1860       if (buffer->next_line < buffer->rlimit)
1861         {
1862           _cpp_clean_line (pfile);
1863           return true;
1864         }
1865
1866       /* First, get out of parsing arguments state.  */
1867       if (pfile->state.parsing_args)
1868         return false;
1869
1870       /* End of buffer.  Non-empty files should end in a newline.  */
1871       if (buffer->buf != buffer->rlimit
1872           && buffer->next_line > buffer->rlimit
1873           && !buffer->from_stage3)
1874         {
1875           /* Clip to buffer size.  */
1876           buffer->next_line = buffer->rlimit;
1877         }
1878
1879       return_at_eof = buffer->return_at_eof;
1880       _cpp_pop_buffer (pfile);
1881       if (pfile->buffer == NULL || return_at_eof)
1882         return false;
1883     }
1884 }
1885
/* Set result->type to ELSE_TYPE; then, if the next unread character
   of the buffer equals CHAR, consume it and set result->type to
   THEN_TYPE instead.  Expects variables named `result' and `buffer'
   to be in scope at the point of use.  Every macro argument is
   parenthesized so that an argument containing a low-precedence
   operator (such as ?:) is still compared or assigned as a whole.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        buffer->cur++, result->type = (THEN_TYPE);      \
    }                                                   \
  while (0)
1894
1895 /* Lex a token into pfile->cur_token, which is also incremented, to
1896    get diagnostics pointing to the correct location.
1897
1898    Does not handle issues such as token lookahead, multiple-include
1899    optimization, directives, skipping etc.  This function is only
1900    suitable for use by _cpp_lex_token, and in special cases like
1901    lex_expansion_token which doesn't care for any of these issues.
1902
1903    When meeting a newline, returns CPP_EOF if parsing a directive,
1904    otherwise returns to the start of the token buffer if permissible.
1905    Returns the location of the lexed token.  */
1906 cpp_token *
1907 _cpp_lex_direct (cpp_reader *pfile)
1908 {
1909   cppchar_t c;
1910   cpp_buffer *buffer;
1911   const unsigned char *comment_start;
1912   cpp_token *result = pfile->cur_token++;
1913
1914  fresh_line:
1915   result->flags = 0;
1916   buffer = pfile->buffer;
1917   if (buffer->need_line)
1918     {
1919       if (pfile->state.in_deferred_pragma)
1920         {
1921           result->type = CPP_PRAGMA_EOL;
1922           pfile->state.in_deferred_pragma = false;
1923           if (!pfile->state.pragma_allow_expansion)
1924             pfile->state.prevent_expansion--;
1925           return result;
1926         }
1927       if (!_cpp_get_fresh_line (pfile))
1928         {
1929           result->type = CPP_EOF;
1930           if (!pfile->state.in_directive)
1931             {
1932               /* Tell the compiler the line number of the EOF token.  */
1933               result->src_loc = pfile->line_table->highest_line;
1934               result->flags = BOL;
1935             }
1936           return result;
1937         }
1938       if (!pfile->keep_tokens)
1939         {
1940           pfile->cur_run = &pfile->base_run;
1941           result = pfile->base_run.base;
1942           pfile->cur_token = result + 1;
1943         }
1944       result->flags = BOL;
1945       if (pfile->state.parsing_args == 2)
1946         result->flags |= PREV_WHITE;
1947     }
1948   buffer = pfile->buffer;
1949  update_tokens_line:
1950   result->src_loc = pfile->line_table->highest_line;
1951
1952  skipped_white:
1953   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
1954       && !pfile->overlaid_buffer)
1955     {
1956       _cpp_process_line_notes (pfile, false);
1957       result->src_loc = pfile->line_table->highest_line;
1958     }
1959   c = *buffer->cur++;
1960
1961   LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
1962                                CPP_BUF_COLUMN (buffer, buffer->cur));
1963
1964   switch (c)
1965     {
1966     case ' ': case '\t': case '\f': case '\v': case '\0':
1967       result->flags |= PREV_WHITE;
1968       skip_whitespace (pfile, c);
1969       goto skipped_white;
1970
1971     case '\n':
1972       if (buffer->cur < buffer->rlimit)
1973         CPP_INCREMENT_LINE (pfile, 0);
1974       buffer->need_line = true;
1975       goto fresh_line;
1976
1977     case '0': case '1': case '2': case '3': case '4':
1978     case '5': case '6': case '7': case '8': case '9':
1979       {
1980         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1981         result->type = CPP_NUMBER;
1982         lex_number (pfile, &result->val.str, &nst);
1983         warn_about_normalization (pfile, result, &nst);
1984         break;
1985       }
1986
1987     case 'L':
1988     case 'u':
1989     case 'U':
1990     case 'R':
1991       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
1992          wide strings or raw strings.  */
1993       if (c == 'L' || CPP_OPTION (pfile, uliterals))
1994         {
1995           if ((*buffer->cur == '\'' && c != 'R')
1996               || *buffer->cur == '"'
1997               || (*buffer->cur == 'R'
1998                   && c != 'R'
1999                   && buffer->cur[1] == '"'
2000                   && CPP_OPTION (pfile, uliterals))
2001               || (*buffer->cur == '8'
2002                   && c == 'u'
2003                   && (buffer->cur[1] == '"'
2004                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'))))
2005             {
2006               lex_string (pfile, result, buffer->cur - 1);
2007               break;
2008             }
2009         }
2010       /* Fall through.  */
2011
2012     case '_':
2013     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2014     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2015     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2016     case 's': case 't':           case 'v': case 'w': case 'x':
2017     case 'y': case 'z':
2018     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2019     case 'G': case 'H': case 'I': case 'J': case 'K':
2020     case 'M': case 'N': case 'O': case 'P': case 'Q':
2021     case 'S': case 'T':           case 'V': case 'W': case 'X':
2022     case 'Y': case 'Z':
2023       result->type = CPP_NAME;
2024       {
2025         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2026         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2027                                                 &nst);
2028         warn_about_normalization (pfile, result, &nst);
2029       }
2030
2031       /* Convert named operators to their proper types.  */
2032       if (result->val.node.node->flags & NODE_OPERATOR)
2033         {
2034           result->flags |= NAMED_OP;
2035           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2036         }
2037       break;
2038
2039     case '\'':
2040     case '"':
2041       lex_string (pfile, result, buffer->cur - 1);
2042       break;
2043
2044     case '/':
2045       /* A potential block or line comment.  */
2046       comment_start = buffer->cur;
2047       c = *buffer->cur;
2048
2049       if (c == '*')
2050         {
2051           if (_cpp_skip_block_comment (pfile))
2052             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2053         }
2054       else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
2055                             || cpp_in_system_header (pfile)))
2056         {
2057           /* Warn about comments only if pedantically GNUC89, and not
2058              in system headers.  */
2059           if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
2060               && ! buffer->warned_cplusplus_comments)
2061             {
2062               cpp_error (pfile, CPP_DL_PEDWARN,
2063                          "C++ style comments are not allowed in ISO C90");
2064               cpp_error (pfile, CPP_DL_PEDWARN,
2065                          "(this will be reported only once per input file)");
2066               buffer->warned_cplusplus_comments = 1;
2067             }
2068
2069           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2070             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2071         }
2072       else if (c == '=')
2073         {
2074           buffer->cur++;
2075           result->type = CPP_DIV_EQ;
2076           break;
2077         }
2078       else
2079         {
2080           result->type = CPP_DIV;
2081           break;
2082         }
2083
2084       if (!pfile->state.save_comments)
2085         {
2086           result->flags |= PREV_WHITE;
2087           goto update_tokens_line;
2088         }
2089
2090       /* Save the comment as a token in its own right.  */
2091       save_comment (pfile, result, comment_start, c);
2092       break;
2093
2094     case '<':
2095       if (pfile->state.angled_headers)
2096         {
2097           lex_string (pfile, result, buffer->cur - 1);
2098           if (result->type != CPP_LESS)
2099             break;
2100         }
2101
2102       result->type = CPP_LESS;
2103       if (*buffer->cur == '=')
2104         buffer->cur++, result->type = CPP_LESS_EQ;
2105       else if (*buffer->cur == '<')
2106         {
2107           buffer->cur++;
2108           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2109         }
2110       else if (CPP_OPTION (pfile, digraphs))
2111         {
2112           if (*buffer->cur == ':')
2113             {
2114               buffer->cur++;
2115               result->flags |= DIGRAPH;
2116               result->type = CPP_OPEN_SQUARE;
2117             }
2118           else if (*buffer->cur == '%')
2119             {
2120               buffer->cur++;
2121               result->flags |= DIGRAPH;
2122               result->type = CPP_OPEN_BRACE;
2123             }
2124         }
2125       break;
2126
2127     case '>':
2128       result->type = CPP_GREATER;
2129       if (*buffer->cur == '=')
2130         buffer->cur++, result->type = CPP_GREATER_EQ;
2131       else if (*buffer->cur == '>')
2132         {
2133           buffer->cur++;
2134           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2135         }
2136       break;
2137
2138     case '%':
2139       result->type = CPP_MOD;
2140       if (*buffer->cur == '=')
2141         buffer->cur++, result->type = CPP_MOD_EQ;
2142       else if (CPP_OPTION (pfile, digraphs))
2143         {
2144           if (*buffer->cur == ':')
2145             {
2146               buffer->cur++;
2147               result->flags |= DIGRAPH;
2148               result->type = CPP_HASH;
2149               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2150                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2151             }
2152           else if (*buffer->cur == '>')
2153             {
2154               buffer->cur++;
2155               result->flags |= DIGRAPH;
2156               result->type = CPP_CLOSE_BRACE;
2157             }
2158         }
2159       break;
2160
2161     case '.':
2162       result->type = CPP_DOT;
2163       if (ISDIGIT (*buffer->cur))
2164         {
2165           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2166           result->type = CPP_NUMBER;
2167           lex_number (pfile, &result->val.str, &nst);
2168           warn_about_normalization (pfile, result, &nst);
2169         }
2170       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2171         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2172       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2173         buffer->cur++, result->type = CPP_DOT_STAR;
2174       break;
2175
2176     case '+':
2177       result->type = CPP_PLUS;
2178       if (*buffer->cur == '+')
2179         buffer->cur++, result->type = CPP_PLUS_PLUS;
2180       else if (*buffer->cur == '=')
2181         buffer->cur++, result->type = CPP_PLUS_EQ;
2182       break;
2183
2184     case '-':
2185       result->type = CPP_MINUS;
2186       if (*buffer->cur == '>')
2187         {
2188           buffer->cur++;
2189           result->type = CPP_DEREF;
2190           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2191             buffer->cur++, result->type = CPP_DEREF_STAR;
2192         }
2193       else if (*buffer->cur == '-')
2194         buffer->cur++, result->type = CPP_MINUS_MINUS;
2195       else if (*buffer->cur == '=')
2196         buffer->cur++, result->type = CPP_MINUS_EQ;
2197       break;
2198
2199     case '&':
2200       result->type = CPP_AND;
2201       if (*buffer->cur == '&')
2202         buffer->cur++, result->type = CPP_AND_AND;
2203       else if (*buffer->cur == '=')
2204         buffer->cur++, result->type = CPP_AND_EQ;
2205       break;
2206
2207     case '|':
2208       result->type = CPP_OR;
2209       if (*buffer->cur == '|')
2210         buffer->cur++, result->type = CPP_OR_OR;
2211       else if (*buffer->cur == '=')
2212         buffer->cur++, result->type = CPP_OR_EQ;
2213       break;
2214
2215     case ':':
2216       result->type = CPP_COLON;
2217       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2218         buffer->cur++, result->type = CPP_SCOPE;
2219       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2220         {
2221           buffer->cur++;
2222           result->flags |= DIGRAPH;
2223           result->type = CPP_CLOSE_SQUARE;
2224         }
2225       break;
2226
2227     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2228     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2229     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2230     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2231     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2232
2233     case '?': result->type = CPP_QUERY; break;
2234     case '~': result->type = CPP_COMPL; break;
2235     case ',': result->type = CPP_COMMA; break;
2236     case '(': result->type = CPP_OPEN_PAREN; break;
2237     case ')': result->type = CPP_CLOSE_PAREN; break;
2238     case '[': result->type = CPP_OPEN_SQUARE; break;
2239     case ']': result->type = CPP_CLOSE_SQUARE; break;
2240     case '{': result->type = CPP_OPEN_BRACE; break;
2241     case '}': result->type = CPP_CLOSE_BRACE; break;
2242     case ';': result->type = CPP_SEMICOLON; break;
2243
2244       /* @ is a punctuator in Objective-C.  */
2245     case '@': result->type = CPP_ATSIGN; break;
2246
2247     case '$':
2248     case '\\':
2249       {
2250         const uchar *base = --buffer->cur;
2251         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2252
2253         if (forms_identifier_p (pfile, true, &nst))
2254           {
2255             result->type = CPP_NAME;
2256             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2257             warn_about_normalization (pfile, result, &nst);
2258             break;
2259           }
2260         buffer->cur++;
2261       }
2262
2263     default:
2264       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2265       break;
2266     }
2267
2268   return result;
2269 }
2270
2271 /* An upper bound on the number of bytes needed to spell TOKEN.
2272    Does not include preceding whitespace.  */
2273 unsigned int
2274 cpp_token_len (const cpp_token *token)
2275 {
2276   unsigned int len;
2277
2278   switch (TOKEN_SPELL (token))
2279     {
2280     default:            len = 6;                                break;
2281     case SPELL_LITERAL: len = token->val.str.len;               break;
2282     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2283     }
2284
2285   return len;
2286 }
2287
/* Decode the UTF-8 sequence that starts at NAME and store it in BUFFER
   as a \U escape followed by eight lowercase hexadecimal digits.
   Exactly 10 bytes are written to BUFFER; no terminating NUL is added.
   Returns the number of bytes of NAME that made up the sequence.
   Aborts if a continuation byte does not have the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned lead = *name;
  unsigned long code_point;
  int seq_len = 0;
  int consumed;
  int shift;

  /* The count of leading one bits in the first byte is the length of
     the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The remaining low bits of the first byte are payload.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte contributes six more payload bits.  */
  for (consumed = 1; consumed < seq_len; consumed++)
    {
      name++;

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (*name & 0x3F);
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2321
2322 /* Given a token TYPE corresponding to a digraph, return a pointer to
2323    the spelling of the digraph.  */
2324 static const unsigned char *
2325 cpp_digraph2name (enum cpp_ttype type)
2326 {
2327   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2328 }
2329
2330 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2331    already contain the enough space to hold the token's spelling.
2332    Returns a pointer to the character after the last character written.
2333    FORSTRING is true if this is to be the spelling after translation
2334    phase 1 (this is different for UCNs).
2335    FIXME: Would be nice if we didn't need the PFILE argument.  */
2336 unsigned char *
2337 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2338                  unsigned char *buffer, bool forstring)
2339 {
2340   switch (TOKEN_SPELL (token))
2341     {
2342     case SPELL_OPERATOR:
2343       {
2344         const unsigned char *spelling;
2345         unsigned char c;
2346
2347         if (token->flags & DIGRAPH)
2348           spelling = cpp_digraph2name (token->type);
2349         else if (token->flags & NAMED_OP)
2350           goto spell_ident;
2351         else
2352           spelling = TOKEN_NAME (token);
2353
2354         while ((c = *spelling++) != '\0')
2355           *buffer++ = c;
2356       }
2357       break;
2358
2359     spell_ident:
2360     case SPELL_IDENT:
2361       if (forstring)
2362         {
2363           memcpy (buffer, NODE_NAME (token->val.node.node),
2364                   NODE_LEN (token->val.node.node));
2365           buffer += NODE_LEN (token->val.node.node);
2366         }
2367       else
2368         {
2369           size_t i;
2370           const unsigned char * name = NODE_NAME (token->val.node.node);
2371
2372           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2373             if (name[i] & ~0x7F)
2374               {
2375                 i += utf8_to_ucn (buffer, name + i) - 1;
2376                 buffer += 10;
2377               }
2378             else
2379               *buffer++ = NODE_NAME (token->val.node.node)[i];
2380         }
2381       break;
2382
2383     case SPELL_LITERAL:
2384       memcpy (buffer, token->val.str.text, token->val.str.len);
2385       buffer += token->val.str.len;
2386       break;
2387
2388     case SPELL_NONE:
2389       cpp_error (pfile, CPP_DL_ICE,
2390                  "unspellable token %s", TOKEN_NAME (token));
2391       break;
2392     }
2393
2394   return buffer;
2395 }
2396
2397 /* Returns TOKEN spelt as a null-terminated string.  The string is
2398    freed when the reader is destroyed.  Useful for diagnostics.  */
2399 unsigned char *
2400 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2401 {
2402   unsigned int len = cpp_token_len (token) + 1;
2403   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2404
2405   end = cpp_spell_token (pfile, token, start, false);
2406   end[0] = '\0';
2407
2408   return start;
2409 }
2410
2411 /* Returns a pointer to a string which spells the token defined by
2412    TYPE and FLAGS.  Used by C front ends, which really should move to
2413    using cpp_token_as_text.  */
2414 const char *
2415 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2416 {
2417   if (flags & DIGRAPH)
2418     return (const char *) cpp_digraph2name (type);
2419   else if (flags & NAMED_OP)
2420     return cpp_named_operator2name (type);
2421
2422   return (const char *) token_spellings[type].name;
2423 }
2424
2425 /* Writes the spelling of token to FP, without any preceding space.
2426    Separated from cpp_spell_token for efficiency - to avoid stdio
2427    double-buffering.  */
2428 void
2429 cpp_output_token (const cpp_token *token, FILE *fp)
2430 {
2431   switch (TOKEN_SPELL (token))
2432     {
2433     case SPELL_OPERATOR:
2434       {
2435         const unsigned char *spelling;
2436         int c;
2437
2438         if (token->flags & DIGRAPH)
2439           spelling = cpp_digraph2name (token->type);
2440         else if (token->flags & NAMED_OP)
2441           goto spell_ident;
2442         else
2443           spelling = TOKEN_NAME (token);
2444
2445         c = *spelling;
2446         do
2447           putc (c, fp);
2448         while ((c = *++spelling) != '\0');
2449       }
2450       break;
2451
2452     spell_ident:
2453     case SPELL_IDENT:
2454       {
2455         size_t i;
2456         const unsigned char * name = NODE_NAME (token->val.node.node);
2457
2458         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2459           if (name[i] & ~0x7F)
2460             {
2461               unsigned char buffer[10];
2462               i += utf8_to_ucn (buffer, name + i) - 1;
2463               fwrite (buffer, 1, 10, fp);
2464             }
2465           else
2466             fputc (NODE_NAME (token->val.node.node)[i], fp);
2467       }
2468       break;
2469
2470     case SPELL_LITERAL:
2471       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2472       break;
2473
2474     case SPELL_NONE:
2475       /* An error, most probably.  */
2476       break;
2477     }
2478 }
2479
2480 /* Compare two tokens.  */
2481 int
2482 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2483 {
2484   if (a->type == b->type && a->flags == b->flags)
2485     switch (TOKEN_SPELL (a))
2486       {
2487       default:                  /* Keep compiler happy.  */
2488       case SPELL_OPERATOR:
2489         /* token_no is used to track where multiple consecutive ##
2490            tokens were originally located.  */
2491         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2492       case SPELL_NONE:
2493         return (a->type != CPP_MACRO_ARG
2494                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2495       case SPELL_IDENT:
2496         return a->val.node.node == b->val.node.node;
2497       case SPELL_LITERAL:
2498         return (a->val.str.len == b->val.str.len
2499                 && !memcmp (a->val.str.text, b->val.str.text,
2500                             a->val.str.len));
2501       }
2502
2503   return 0;
2504 }
2505
2506 /* Returns nonzero if a space should be inserted to avoid an
2507    accidental token paste for output.  For simplicity, it is
2508    conservative, and occasionally advises a space where one is not
2509    needed, e.g. "." and ".2".  */
2510 int
2511 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2512                  const cpp_token *token2)
2513 {
2514   enum cpp_ttype a = token1->type, b = token2->type;
2515   cppchar_t c;
2516
2517   if (token1->flags & NAMED_OP)
2518     a = CPP_NAME;
2519   if (token2->flags & NAMED_OP)
2520     b = CPP_NAME;
2521
2522   c = EOF;
2523   if (token2->flags & DIGRAPH)
2524     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2525   else if (token_spellings[b].category == SPELL_OPERATOR)
2526     c = token_spellings[b].name[0];
2527
2528   /* Quickly get everything that can paste with an '='.  */
2529   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2530     return 1;
2531
2532   switch (a)
2533     {
2534     case CPP_GREATER:   return c == '>';
2535     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2536     case CPP_PLUS:      return c == '+';
2537     case CPP_MINUS:     return c == '-' || c == '>';
2538     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2539     case CPP_MOD:       return c == ':' || c == '>';
2540     case CPP_AND:       return c == '&';
2541     case CPP_OR:        return c == '|';
2542     case CPP_COLON:     return c == ':' || c == '>';
2543     case CPP_DEREF:     return c == '*';
2544     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2545     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2546     case CPP_NAME:      return ((b == CPP_NUMBER
2547                                  && name_p (pfile, &token2->val.str))
2548                                 || b == CPP_NAME
2549                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2550     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2551                                 || c == '.' || c == '+' || c == '-');
2552                                       /* UCNs */
2553     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2554                                  && b == CPP_NAME)
2555                                 || (CPP_OPTION (pfile, objc)
2556                                     && token1->val.str.text[0] == '@'
2557                                     && (b == CPP_NAME || b == CPP_STRING)));
2558     default:            break;
2559     }
2560
2561   return 0;
2562 }
2563
2564 /* Output all the remaining tokens on the current line, and a newline
2565    character, to FP.  Leading whitespace is removed.  If there are
2566    macros, special token padding is not performed.  */
2567 void
2568 cpp_output_line (cpp_reader *pfile, FILE *fp)
2569 {
2570   const cpp_token *token;
2571
2572   token = cpp_get_token (pfile);
2573   while (token->type != CPP_EOF)
2574     {
2575       cpp_output_token (token, fp);
2576       token = cpp_get_token (pfile);
2577       if (token->flags & PREV_WHITE)
2578         putc (' ', fp);
2579     }
2580
2581   putc ('\n', fp);
2582 }
2583
2584 /* Return a string representation of all the remaining tokens on the
2585    current line.  The result is allocated using xmalloc and must be
2586    freed by the caller.  */
2587 unsigned char *
2588 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2589 {
2590   const cpp_token *token;
2591   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2592   unsigned int alloced = 120 + out;
2593   unsigned char *result = (unsigned char *) xmalloc (alloced);
2594
2595   /* If DIR_NAME is empty, there are no initial contents.  */
2596   if (dir_name)
2597     {
2598       sprintf ((char *) result, "#%s ", dir_name);
2599       out += 2;
2600     }
2601
2602   token = cpp_get_token (pfile);
2603   while (token->type != CPP_EOF)
2604     {
2605       unsigned char *last;
2606       /* Include room for a possible space and the terminating nul.  */
2607       unsigned int len = cpp_token_len (token) + 2;
2608
2609       if (out + len > alloced)
2610         {
2611           alloced *= 2;
2612           if (out + len > alloced)
2613             alloced = out + len;
2614           result = (unsigned char *) xrealloc (result, alloced);
2615         }
2616
2617       last = cpp_spell_token (pfile, token, &result[out], 0);
2618       out = last - result;
2619
2620       token = cpp_get_token (pfile);
2621       if (token->flags & PREV_WHITE)
2622         result[out++] = ' ';
2623     }
2624
2625   result[out] = '\0';
2626   return result;
2627 }
2628
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is MIN_EXTRA plus twice the
   number of bytes between BUFF's cur and limit pointers.  Every macro
   argument is parenthesized so that argument expressions containing
   low-precedence operators are evaluated as a unit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2643
2644 /* Create a new allocation buffer.  Place the control block at the end
2645    of the buffer, so that buffer overflows will cause immediate chaos.  */
2646 static _cpp_buff *
2647 new_buff (size_t len)
2648 {
2649   _cpp_buff *result;
2650   unsigned char *base;
2651
2652   if (len < MIN_BUFF_SIZE)
2653     len = MIN_BUFF_SIZE;
2654   len = CPP_ALIGN (len);
2655
2656   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2657   result = (_cpp_buff *) (base + len);
2658   result->base = base;
2659   result->cur = base;
2660   result->limit = base + len;
2661   result->next = NULL;
2662   return result;
2663 }
2664
2665 /* Place a chain of unwanted allocation buffers on the free list.  */
2666 void
2667 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2668 {
2669   _cpp_buff *end = buff;
2670
2671   while (end->next)
2672     end = end->next;
2673   end->next = pfile->free_buffs;
2674   pfile->free_buffs = buff;
2675 }
2676
2677 /* Return a free buffer of size at least MIN_SIZE.  */
2678 _cpp_buff *
2679 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2680 {
2681   _cpp_buff *result, **p;
2682
2683   for (p = &pfile->free_buffs;; p = &(*p)->next)
2684     {
2685       size_t size;
2686
2687       if (*p == NULL)
2688         return new_buff (min_size);
2689       result = *p;
2690       size = result->limit - result->base;
2691       /* Return a buffer that's big enough, but don't waste one that's
2692          way too big.  */
2693       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2694         break;
2695     }
2696
2697   *p = result->next;
2698   result->next = NULL;
2699   result->cur = result->base;
2700   return result;
2701 }
2702
2703 /* Creates a new buffer with enough space to hold the uncommitted
2704    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2705    the excess bytes to the new buffer.  Chains the new buffer after
2706    BUFF, and returns the new buffer.  */
2707 _cpp_buff *
2708 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2709 {
2710   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2711   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2712
2713   buff->next = new_buff;
2714   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2715   return new_buff;
2716 }
2717
2718 /* Creates a new buffer with enough space to hold the uncommitted
2719    remaining bytes of the buffer pointed to by BUFF, and at least
2720    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2721    Chains the new buffer before the buffer pointed to by BUFF, and
2722    updates the pointer to point to the new buffer.  */
2723 void
2724 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
2725 {
2726   _cpp_buff *new_buff, *old_buff = *pbuff;
2727   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
2728
2729   new_buff = _cpp_get_buff (pfile, size);
2730   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
2731   new_buff->next = old_buff;
2732   *pbuff = new_buff;
2733 }
2734
2735 /* Free a chain of buffers starting at BUFF.  */
2736 void
2737 _cpp_free_buff (_cpp_buff *buff)
2738 {
2739   _cpp_buff *next;
2740
2741   for (; buff; buff = next)
2742     {
2743       next = buff->next;
2744       free (buff->base);
2745     }
2746 }
2747
2748 /* Allocate permanent, unaligned storage of length LEN.  */
2749 unsigned char *
2750 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
2751 {
2752   _cpp_buff *buff = pfile->u_buff;
2753   unsigned char *result = buff->cur;
2754
2755   if (len > (size_t) (buff->limit - result))
2756     {
2757       buff = _cpp_get_buff (pfile, len);
2758       buff->next = pfile->u_buff;
2759       pfile->u_buff = buff;
2760       result = buff->cur;
2761     }
2762
2763   buff->cur = result + len;
2764   return result;
2765 }
2766
2767 /* Allocate permanent, unaligned storage of length LEN from a_buff.
2768    That buffer is used for growing allocations when saving macro
2769    replacement lists in a #define, and when parsing an answer to an
2770    assertion in #assert, #unassert or #if (and therefore possibly
2771    whilst expanding macros).  It therefore must not be used by any
2772    code that they might call: specifically the lexer and the guts of
2773    the macro expander.
2774
2775    All existing other uses clearly fit this restriction: storing
2776    registered pragmas during initialization.  */
2777 unsigned char *
2778 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
2779 {
2780   _cpp_buff *buff = pfile->a_buff;
2781   unsigned char *result = buff->cur;
2782
2783   if (len > (size_t) (buff->limit - result))
2784     {
2785       buff = _cpp_get_buff (pfile, len);
2786       buff->next = pfile->a_buff;
2787       pfile->a_buff = buff;
2788       result = buff->cur;
2789     }
2790
2791   buff->cur = result + len;
2792   return result;
2793 }
2794
2795 /* Say which field of TOK is in use.  */
2796
2797 enum cpp_token_fld_kind
2798 cpp_token_val_index (cpp_token *tok)
2799 {
2800   switch (TOKEN_SPELL (tok))
2801     {
2802     case SPELL_IDENT:
2803       return CPP_TOKEN_FLD_NODE;
2804     case SPELL_LITERAL:
2805       return CPP_TOKEN_FLD_STR;
2806     case SPELL_OPERATOR:
2807       if (tok->type == CPP_PASTE)
2808         return CPP_TOKEN_FLD_TOKEN_NO;
2809       else
2810         return CPP_TOKEN_FLD_NONE;
2811     case SPELL_NONE:
2812       if (tok->type == CPP_MACRO_ARG)
2813         return CPP_TOKEN_FLD_ARG_NO;
2814       else if (tok->type == CPP_PADDING)
2815         return CPP_TOKEN_FLD_SOURCE;
2816       else if (tok->type == CPP_PRAGMA)
2817         return CPP_TOKEN_FLD_PRAGMA;
2818       /* else fall through */
2819     default:
2820       return CPP_TOKEN_FLD_NONE;
2821     }
2822 }