libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2013 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type
  28 {
  29   SPELL_OPERATOR = 0,
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34
  35 struct token_spelling
  36 {
  37   enum spell_type category;
  38   const unsigned char *name;
  39 };
  40
  41 static const unsigned char *const digraph_spellings[] =
  42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
  44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
  45 #define TK(e, s) { SPELL_ ## s,    UC #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void store_comment (cpp_reader *, cpp_token *);
  59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  60                             unsigned int, enum cpp_ttype);
  61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  62 static int name_p (cpp_reader *, const cpp_string *);
  63 static tokenrun *next_tokenrun (tokenrun *);
  64
  65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
 115 /* Configure gives us an ifdef test.  */
 116 #ifndef WORDS_BIGENDIAN
 117 #define WORDS_BIGENDIAN 0
 118 #endif
 119
 120 /* We'd like the largest integer that fits into a register.  There's nothing
 121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
 122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
 123    can get the "real" word size.  */
 124 #ifdef __GNUC__
 125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 126 #else
 127 typedef unsigned long word_type;
 128 #endif
 129
 130 /* The code below is only expecting sizes 4 or 8.
 131    Die at compile-time if this expectation is violated.  */
 132 typedef char check_word_type_size
 133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problems can be properly
 267    autoconfed:
 268
 269    The Solaris 9 assembler cannot assemble SSE4.2 insns.
 270    Before Solaris 9 Update 6, SSE insns cannot be executed.
 271    The Solaris 10+ assembler tags objects with the instruction set
 272    extensions used, so SSE4.2 executables cannot run on machines that
 273    don't support that extension.  */
 274
 275 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 276
 277 /* Replicated character data to be shared between implementations.
 278    Recall that outside of a context with vector support we can't
 279    define compatible vector types, therefore these are all defined
 280    in terms of raw characters.  */
 281 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
 282   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 283     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
 284   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 285     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
 286   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 287     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
 288   { '?', '?', '?', '?', '?', '?', '?', '?',
 289     '?', '?', '?', '?', '?', '?', '?', '?' },
 290 };
 291
 292 /* A version of the fast scanner using MMX vectorized byte compare insns.
 293
 294    This uses the PMOVMSKB instruction which was introduced with "MMX2",
 295    which was packaged into SSE1; it is also present in the AMD MMX
 296    extension.  Mark the function as using "sse" so that we emit a real
 297    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 298
 299 static const uchar *
 300 #ifndef __SSE__
 301 __attribute__((__target__("sse")))
 302 #endif
 303 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 304 {
 305   typedef char v8qi __attribute__ ((__vector_size__ (8)));
 306   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
 307
 308   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
 309   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
 310   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
 311   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
 312
 313   unsigned int misalign, found, mask;
 314   const v8qi *p;
 315   v8qi data, t, c;
 316
 317   /* Align the source pointer.  While MMX doesn't generate unaligned data
 318      faults, this allows us to safely scan to the end of the buffer without
 319      reading beyond the end of the last page.  */
 320   misalign = (uintptr_t)s & 7;
 321   p = (const v8qi *)((uintptr_t)s & -8);
 322   data = *p;
 323
 324   /* Create a mask for the bytes that are valid within the first
 325      16-byte block.  The Idea here is that the AND with the mask
 326      within the loop is "free", since we need some AND or TEST
 327      insn in order to set the flags for the branch anyway.  */
 328   mask = -1u << misalign;
 329
 330   /* Main loop processing 8 bytes at a time.  */
 331   goto start;
 332   do
 333     {
 334       data = *++p;
 335       mask = -1;
 336
 337     start:
 338       t = __builtin_ia32_pcmpeqb(data, repl_nl);
 339       c = __builtin_ia32_pcmpeqb(data, repl_cr);
 340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 341       c = __builtin_ia32_pcmpeqb(data, repl_bs);
 342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 343       c = __builtin_ia32_pcmpeqb(data, repl_qm);
 344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
 345       found = __builtin_ia32_pmovmskb (t);
 346       found &= mask;
 347     }
 348   while (!found);
 349
 350   __builtin_ia32_emms ();
 351
 352   /* FOUND contains 1 in bits for which we matched a relevant
 353      character.  Conversion to the byte index is trivial.  */
 354   found = __builtin_ctz(found);
 355   return (const uchar *)p + found;
 356 }
 357
 358 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 359
 360 static const uchar *
 361 #ifndef __SSE2__
 362 __attribute__((__target__("sse2")))
 363 #endif
 364 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 365 {
 366   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 367
 368   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
 369   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
 370   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
 371   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 372
 373   unsigned int misalign, found, mask;
 374   const v16qi *p;
 375   v16qi data, t;
 376
 377   /* Align the source pointer.  */
 378   misalign = (uintptr_t)s & 15;
 379   p = (const v16qi *)((uintptr_t)s & -16);
 380   data = *p;
 381
 382   /* Create a mask for the bytes that are valid within the first
 383      16-byte block.  The Idea here is that the AND with the mask
 384      within the loop is "free", since we need some AND or TEST
 385      insn in order to set the flags for the branch anyway.  */
 386   mask = -1u << misalign;
 387
 388   /* Main loop processing 16 bytes at a time.  */
 389   goto start;
 390   do
 391     {
 392       data = *++p;
 393       mask = -1;
 394
 395     start:
 396       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
 397       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
 398       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
 399       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
 400       found = __builtin_ia32_pmovmskb128 (t);
 401       found &= mask;
 402     }
 403   while (!found);
 404
 405   /* FOUND contains 1 in bits for which we matched a relevant
 406      character.  Conversion to the byte index is trivial.  */
 407   found = __builtin_ctz(found);
 408   return (const uchar *)p + found;
 409 }
 410
 411 #ifdef HAVE_SSE4
 412 /* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
 413
 414 static const uchar *
 415 #ifndef __SSE4_2__
 416 __attribute__((__target__("sse4.2")))
 417 #endif
 418 search_line_sse42 (const uchar *s, const uchar *end)
 419 {
 420   typedef char v16qi __attribute__ ((__vector_size__ (16)));
 421   static const v16qi search = { '\n', '\r', '?', '\\' };
 422
 423   uintptr_t si = (uintptr_t)s;
 424   uintptr_t index;
 425
 426   /* Check for unaligned input.  */
 427   if (si & 15)
 428     {
 429       v16qi sv;
 430
 431       if (__builtin_expect (end - s < 16, 0)
 432           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 433         {
 434           /* There are less than 16 bytes left in the buffer, and less
 435              than 16 bytes left on the page.  Reading 16 bytes at this
 436              point might generate a spurious page fault.  Defer to the
 437              SSE2 implementation, which already handles alignment.  */
 438           return search_line_sse2 (s, end);
 439         }
 440
 441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
 442          memory need not be aligned.  */
 443       sv = __builtin_ia32_loaddqu ((const char *) s);
 444       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
 445
 446       if (__builtin_expect (index < 16, 0))
 447         goto found;
 448
 449       /* Advance the pointer to an aligned address.  We will re-scan a
 450          few bytes, but we no longer need care for reading past the
 451          end of a page, since we're guaranteed a match.  */
 452       s = (const uchar *)((si + 16) & -16);
 453     }
 454
 455   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
 456      in inline assembly, we can make proper use of the flags set.  */
 457   __asm (      "sub $16, %1\n"
 458         "       .balign 16\n"
 459         "0:     add $16, %1\n"
 460         "       %vpcmpestri $0, (%1), %2\n"
 461         "       jnc 0b"
 462         : "=&c"(index), "+r"(s)
 463         : "x"(search), "a"(4), "d"(16));
 464
 465  found:
 466   return s + index;
 467 }
 468
 469 #else
 470 /* Work around out-dated assemblers without sse4 support.  */
 471 #define search_line_sse42 search_line_sse2
 472 #endif
 473
 474 /* Check the CPU capabilities.  */
 475
 476 #include "../gcc/config/i386/cpuid.h"
 477
 478 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 479 static search_line_fast_type search_line_fast;
 480
 481 #define HAVE_init_vectorized_lexer 1
 482 static inline void
 483 init_vectorized_lexer (void)
 484 {
 485   unsigned dummy, ecx = 0, edx = 0;
 486   search_line_fast_type impl = search_line_acc_char;
 487   int minimum = 0;
 488
 489 #if defined(__SSE4_2__)
 490   minimum = 3;
 491 #elif defined(__SSE2__)
 492   minimum = 2;
 493 #elif defined(__SSE__)
 494   minimum = 1;
 495 #endif
 496
 497   if (minimum == 3)
 498     impl = search_line_sse42;
 499   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 500     {
 501       if (minimum == 3 || (ecx & bit_SSE4_2))
 502         impl = search_line_sse42;
 503       else if (minimum == 2 || (edx & bit_SSE2))
 504         impl = search_line_sse2;
 505       else if (minimum == 1 || (edx & bit_SSE))
 506         impl = search_line_mmx;
 507     }
 508   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 509     {
 510       if (minimum == 1
 511           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 512         impl = search_line_mmx;
 513     }
 514
 515   search_line_fast = impl;
 516 }
 517
 518 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 519
 520 /* A vection of the fast scanner using AltiVec vectorized byte compares.  */
 521 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
 522    so we can't compile this function without -maltivec on the command line
 523    (or implied by some other switch).  */
 524
 525 static const uchar *
 526 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 527 {
 528   typedef __attribute__((altivec(vector))) unsigned char vc;
 529
 530   const vc repl_nl = {
 531     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
 532     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
 533   };
 534   const vc repl_cr = {
 535     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
 536     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
 537   };
 538   const vc repl_bs = {
 539     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
 540     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
 541   };
 542   const vc repl_qm = {
 543     '?', '?', '?', '?', '?', '?', '?', '?',
 544     '?', '?', '?', '?', '?', '?', '?', '?',
 545   };
 546   const vc ones = {
 547     -1, -1, -1, -1, -1, -1, -1, -1,
 548     -1, -1, -1, -1, -1, -1, -1, -1,
 549   };
 550   const vc zero = { 0 };
 551
 552   vc data, mask, t;
 553
 554   /* Altivec loads automatically mask addresses with -16.  This lets us
 555      issue the first load as early as possible.  */
 556   data = __builtin_vec_ld(0, (const vc *)s);
 557
 558   /* Discard bytes before the beginning of the buffer.  Do this by
 559      beginning with all ones and shifting in zeros according to the
 560      mis-alignment.  The LVSR instruction pulls the exact shift we
 561      want from the address.  */
 562   mask = __builtin_vec_lvsr(0, s);
 563   mask = __builtin_vec_perm(zero, ones, mask);
 564   data &= mask;
 565
 566   /* While altivec loads mask addresses, we still need to align S so
 567      that the offset we compute at the end is correct.  */
 568   s = (const uchar *)((uintptr_t)s & -16);
 569
 570   /* Main loop processing 16 bytes at a time.  */
 571   goto start;
 572   do
 573     {
 574       vc m_nl, m_cr, m_bs, m_qm;
 575
 576       s += 16;
 577       data = __builtin_vec_ld(0, (const vc *)s);
 578
 579     start:
 580       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
 581       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
 582       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
 583       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
 584       t = (m_nl | m_cr) | (m_bs | m_qm);
 585
 586       /* T now contains 0xff in bytes for which we matched one of the relevant
 587          characters.  We want to exit the loop if any byte in T is non-zero.
 588          Below is the expansion of vec_any_ne(t, zero).  */
 589     }
 590   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
 591
 592   {
 593 #define N  (sizeof(vc) / sizeof(long))
 594
 595     union {
 596       vc v;
 597       /* Statically assert that N is 2 or 4.  */
 598       unsigned long l[(N == 2 || N == 4) ? N : -1];
 599     } u;
 600     unsigned long l, i = 0;
 601
 602     u.v = t;
 603
 604     /* Find the first word of T that is non-zero.  */
 605     switch (N)
 606       {
 607       case 4:
 608         l = u.l[i++];
 609         if (l != 0)
 610           break;
 611         s += sizeof(unsigned long);
 612         l = u.l[i++];
 613         if (l != 0)
 614           break;
 615         s += sizeof(unsigned long);
 616       case 2:
 617         l = u.l[i++];
 618         if (l != 0)
 619           break;
 620         s += sizeof(unsigned long);
 621         l = u.l[i];
 622       }
 623
 624     /* L now contains 0xff in bytes for which we matched one of the
 625        relevant characters.  We can find the byte index by finding
 626        its bit index and dividing by 8.  */
 627     l = __builtin_clzl(l) >> 3;
 628     return s + l;
 629
 630 #undef N
 631   }
 632 }
 633
 634 #elif defined (__ARM_NEON__)
 635 #include "arm_neon.h"
 636
 637 static const uchar *
 638 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 639 {
 640   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
 641   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
 642   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
 643   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
 644   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
 645
 646   unsigned int misalign, found, mask;
 647   const uint8_t *p;
 648   uint8x16_t data;
 649
 650   /* Align the source pointer.  */
 651   misalign = (uintptr_t)s & 15;
 652   p = (const uint8_t *)((uintptr_t)s & -16);
 653   data = vld1q_u8 (p);
 654
 655   /* Create a mask for the bytes that are valid within the first
 656      16-byte block.  The Idea here is that the AND with the mask
 657      within the loop is "free", since we need some AND or TEST
 658      insn in order to set the flags for the branch anyway.  */
 659   mask = (-1u << misalign) & 0xffff;
 660
 661   /* Main loop, processing 16 bytes at a time.  */
 662   goto start;
 663
 664   do
 665     {
 666       uint8x8_t l;
 667       uint16x4_t m;
 668       uint32x2_t n;
 669       uint8x16_t t, u, v, w;
 670
 671       p += 16;
 672       data = vld1q_u8 (p);
 673       mask = 0xffff;
 674
 675     start:
 676       t = vceqq_u8 (data, repl_nl);
 677       u = vceqq_u8 (data, repl_cr);
 678       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
 679       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
 680       t = vandq_u8 (vorrq_u8 (v, w), xmask);
 681       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
 682       m = vpaddl_u8 (l);
 683       n = vpaddl_u16 (m);
 684
 685       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
 686               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
 687       found &= mask;
 688     }
 689   while (!found);
 690
 691   /* FOUND contains 1 in bits for which we matched a relevant
 692      character.  Conversion to the byte index is trivial.  */
 693   found = __builtin_ctz (found);
 694   return (const uchar *)p + found;
 695 }
 696
 697 #else
 698
 699 /* We only have one accelerated alternative.  Use a direct call so that
 700    we encourage inlining.  */
 701
 702 #define search_line_fast  search_line_acc_char
 703
 704 #endif
 705
 706 /* Initialize the lexer if needed.  */
 707
 708 void
 709 _cpp_init_lexer (void)
 710 {
 711 #ifdef HAVE_init_vectorized_lexer
 712   init_vectorized_lexer ();
 713 #endif
 714 }
 715
 716 /* Returns with a logical line that contains no escaped newlines or
 717    trigraphs.  This is a time-critical inner loop.  */
 718 void
 719 _cpp_clean_line (cpp_reader *pfile)
 720 {
 721   cpp_buffer *buffer;
 722   const uchar *s;
 723   uchar c, *d, *p;
 724
 725   buffer = pfile->buffer;
 726   buffer->cur_note = buffer->notes_used = 0;
 727   buffer->cur = buffer->line_base = buffer->next_line;
 728   buffer->need_line = false;
 729   s = buffer->next_line;
 730
 731   if (!buffer->from_stage3)
 732     {
 733       const uchar *pbackslash = NULL;
 734
 735       /* Fast path.  This is the common case of an un-escaped line with
 736          no trigraphs.  The primary win here is by not writing any
 737          data back to memory until we have to.  */
 738       while (1)
 739         {
 740           /* Perform an optimized search for \n, \r, \\, ?.  */
 741           s = search_line_fast (s, buffer->rlimit);
 742
 743           c = *s;
 744           if (c == '\\')
 745             {
 746               /* Record the location of the backslash and continue.  */
 747               pbackslash = s++;
 748             }
 749           else if (__builtin_expect (c == '?', 0))
 750             {
 751               if (__builtin_expect (s[1] == '?', false)
 752                    && _cpp_trigraph_map[s[2]])
 753                 {
 754                   /* Have a trigraph.  We may or may not have to convert
 755                      it.  Add a line note regardless, for -Wtrigraphs.  */
 756                   add_line_note (buffer, s, s[2]);
 757                   if (CPP_OPTION (pfile, trigraphs))
 758                     {
 759                       /* We do, and that means we have to switch to the
 760                          slow path.  */
 761                       d = (uchar *) s;
 762                       *d = _cpp_trigraph_map[s[2]];
 763                       s += 2;
 764                       goto slow_path;
 765                     }
 766                 }
 767               /* Not a trigraph.  Continue on fast-path.  */
 768               s++;
 769             }
 770           else
 771             break;
 772         }
 773
 774       /* This must be \r or \n.  We're either done, or we'll be forced
 775          to write back to the buffer and continue on the slow path.  */
 776       d = (uchar *) s;
 777
 778       if (__builtin_expect (s == buffer->rlimit, false))
 779         goto done;
 780
 781       /* DOS line ending? */
 782       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 783         {
 784           s++;
 785           if (s == buffer->rlimit)
 786             goto done;
 787         }
 788
 789       if (__builtin_expect (pbackslash == NULL, true))
 790         goto done;
 791
 792       /* Check for escaped newline.  */
 793       p = d;
 794       while (is_nvspace (p[-1]))
 795         p--;
 796       if (p - 1 != pbackslash)
 797         goto done;
 798
 799       /* Have an escaped newline; process it and proceed to
 800          the slow path.  */
 801       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 802       d = p - 2;
 803       buffer->next_line = p - 1;
 804
 805     slow_path:
 806       while (1)
 807         {
 808           c = *++s;
 809           *++d = c;
 810
 811           if (c == '\n' || c == '\r')
 812             {
 813               /* Handle DOS line endings.  */
 814               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 815                 s++;
 816               if (s == buffer->rlimit)
 817                 break;
 818
 819               /* Escaped?  */
 820               p = d;
 821               while (p != buffer->next_line && is_nvspace (p[-1]))
 822                 p--;
 823               if (p == buffer->next_line || p[-1] != '\\')
 824                 break;
 825
 826               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 827               d = p - 2;
 828               buffer->next_line = p - 1;
 829             }
 830           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 831             {
 832               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 833               add_line_note (buffer, d, s[2]);
 834               if (CPP_OPTION (pfile, trigraphs))
 835                 {
 836                   *d = _cpp_trigraph_map[s[2]];
 837                   s += 2;
 838                 }
 839             }
 840         }
 841     }
 842   else
 843     {
 844       while (*s != '\n' && *s != '\r')
 845         s++;
 846       d = (uchar *) s;
 847
 848       /* Handle DOS line endings.  */
 849       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 850         s++;
 851     }
 852
 853  done:
 854   *d = '\n';
 855   /* A sentinel note that should never be processed.  */
 856   add_line_note (buffer, d + 1, '\n');
 857   buffer->next_line = s + 1;
 858 }
 859
 860 /* Return true if the trigraph indicated by NOTE should be warned
 861    about in a comment.  */
 862 static bool
 863 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 864 {
 865   const uchar *p;
 866
 867   /* Within comments we don't warn about trigraphs, unless the
 868      trigraph forms an escaped newline, as that may change
 869      behavior.  */
 870   if (note->type != '/')
 871     return false;
 872
 873   /* If -trigraphs, then this was an escaped newline iff the next note
 874      is coincident.  */
 875   if (CPP_OPTION (pfile, trigraphs))
 876     return note[1].pos == note->pos;
 877
 878   /* Otherwise, see if this forms an escaped newline.  */
 879   p = note->pos + 3;
 880   while (is_nvspace (*p))
 881     p++;
 882
 883   /* There might have been escaped newlines between the trigraph and the
 884      newline we found.  Hence the position test.  */
 885   return (*p == '\n' && p < note[1].pos);
 886 }
 887
 888 /* Act on the notes queued by add_line_note, up to the current
 889    position.  IN_COMMENT is nonzero when called from a comment.  */
 890 void
 891 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 892 {
 893   cpp_buffer *buf = pfile->buffer;
 894   _cpp_line_note *n;
 895
 896   /* Stop at the first note lying beyond the current position.  */
 897   for (n = &buf->notes[buf->cur_note]; n->pos <= buf->cur;
 898        n = &buf->notes[buf->cur_note])
 899     {
 900       unsigned int column;
 901
 902       buf->cur_note++;
 903       column = CPP_BUF_COLUMN (buf, n->pos + 1);
 904
 905       if (n->type == '\\' || n->type == ' ')
 906         {
 907           /* An escaped newline; ' ' means space preceded the newline.  */
 908           if (!in_comment && n->type == ' ')
 909             cpp_error_with_line (pfile, CPP_DL_WARNING,
 910                                  pfile->line_table->highest_line, column,
 911                                  "backslash and newline separated by space");
 912
 913           if (buf->next_line > buf->rlimit)
 914             {
 915               cpp_error_with_line (pfile, CPP_DL_PEDWARN,
 916                                    pfile->line_table->highest_line, column,
 917                                    "backslash-newline at end of file");
 918               /* Prevent "no newline at end of file" warning.  */
 919               buf->next_line = buf->rlimit;
 920             }
 921
 922           buf->line_base = n->pos;
 923           CPP_INCREMENT_LINE (pfile, 0);
 924         }
 925       else if (_cpp_trigraph_map[n->type])
 926         {
 927           /* A trigraph; inside comments warn_in_comment decides.  */
 928           if (!CPP_OPTION (pfile, warn_trigraphs)
 929               || (in_comment && !warn_in_comment (pfile, n)))
 930             continue;
 931
 932           if (CPP_OPTION (pfile, trigraphs))
 933             cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 934                                    pfile->line_table->highest_line, column,
 935                                    "trigraph ??%c converted to %c",
 936                                    n->type,
 937                                    (int) _cpp_trigraph_map[n->type]);
 938           else
 939             cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 940                                    pfile->line_table->highest_line, column,
 941                                    "trigraph ??%c ignored, use -trigraphs to enable",
 942                                    n->type);
 943         }
 944       else if (n->type != 0)
 945         /* Type 0 marks a note already processed in lex_raw_string;
 946            no other kind of note is expected.  */
 947         abort ();
 948     }
 949 }
 950
 951 /* Skip over a C-style block comment.  The end is recognised by
 952    testing whether a '*' precedes each '/' met.  Returns true if the
 953    input ended before the comment closed, false otherwise.
 954
 955    On entry buffer->cur points to the comment's initial asterisk.  */
 956 bool
 957 _cpp_skip_block_comment (cpp_reader *pfile)
 958 {
 959   cpp_buffer *buf = pfile->buffer;
 960   const uchar *p = buf->cur + 1;
 961   uchar ch;
 962
 963   /* A '/' right after the opening '*' cannot end the comment.  */
 964   if (*p == '/')
 965     p++;
 966
 967   for (;;)
 968     {
 969       ch = *p++;
 970
 971       if (ch == '\n')
 972         {
 973           unsigned int width;
 974           /* End of line: process its notes, then fetch the next.  */
 975           buf->cur = p - 1;
 976           _cpp_process_line_notes (pfile, true);
 977           if (buf->next_line >= buf->rlimit)
 978             return true;
 979           _cpp_clean_line (pfile);
 980           width = buf->next_line - buf->line_base;
 981           CPP_INCREMENT_LINE (pfile, width);
 982           p = buf->cur;
 983           continue;
 984         }
 985
 986       /* Comments are often decorated with '*', so it is cheaper to
 987          look for '/' and then check what came before it.  */
 988       if (ch != '/')
 989         continue;
 990       if (p[-2] == '*')
 991         break;
 992
 993       /* Warn about a possibly nested comment, except when this '/'
 994          immediately precedes the real terminator.  Escaped newlines
 995          are deliberately not accounted for.  */
 996       if (CPP_OPTION (pfile, warn_comments)
 997           && p[0] == '*' && p[1] != '/')
 998         {
 999           buf->cur = p;
1000           cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1001                                  pfile->line_table->highest_line,
1002                                  CPP_BUF_COL (buf),
1003                                  "\"/*\" within comment");
1004         }
1005     }
1006
1007   buf->cur = p;
1008   _cpp_process_line_notes (pfile, true);
1009   return false;
1010 }
1011
1012 /* Skip a C++-style line comment.  On return buffer->cur rests on the
1013    newline that ends it.  Escaped newlines are handled through the
1014    line notes; the result is nonzero if the comment spanned lines.  */
1015 static int
1016 skip_line_comment (cpp_reader *pfile)
1017 {
1018   const source_location start_line = pfile->line_table->highest_line;
1019   cpp_buffer *buf = pfile->buffer;
1020
1021   for (; *buf->cur != '\n'; buf->cur++)
1022     continue;
1023
1024   _cpp_process_line_notes (pfile, true);
1025   return pfile->line_table->highest_line != start_line;
1026 }
1027
1028 /* Skip a run of whitespace that starts with C.  On return
1029    buffer->cur points at the character that ended the run.  */
1030 static void
1031 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1032 {
1033   cpp_buffer *buf = pfile->buffer;
1034   bool found_nul = false;
1035
1036   do
1037     {
1038       if (c == '\0')
1039         found_nul = true;
1040       else if (c != ' ' && c != '\t'
1041                && pfile->state.in_directive && CPP_PEDANTIC (pfile))
1042         /* Only \f or \v is left; blanks and tabs are always OK.  */
1043         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1044                              pfile->line_table->highest_line,
1045                              CPP_BUF_COL (buf),
1046                              "%s in preprocessing directive",
1047                              c == '\f' ? "form feed" : "vertical tab");
1048
1049       c = *buf->cur++;
1050     }
1051   /* Carry on while C is non-vertical space: ' ' \t \f \v \0.  */
1052   while (is_nvspace (c));
1053
1054   if (found_nul)
1055     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1056
1057   buf->cur--;
1058 }
1059
1060 /* Return 1 if every character of the number token STRING is also
1061    valid in a name (so it has no '.', '+' or '-'), else 0.  PFILE is
1062    not used directly; NOTE(review): is_idchar may need it -- confirm.  */
1063 static int
1064 name_p (cpp_reader *pfile, const cpp_string *string)
1065 {
1066   unsigned int idx = 0;
1067
1068   while (idx < string->len && is_idchar (string->text[idx]))
1069     idx++;
1070
1071   return idx == string->len;
1072 }
1073
1074 /* Called after lexing an identifier or similar sequence: warn if S
1075    records that TOKEN's spelling is not in NFC/NFKC.  */
1076 static void
1077 warn_about_normalization (cpp_reader *pfile,
1078                           const cpp_token *token,
1079                           const struct normalize_state *s)
1080 {
1081   unsigned char *spelling;
1082   size_t len;
1083
1084   if (pfile->state.skipping
1085       || CPP_OPTION (pfile, warn_normalize) >= NORMALIZE_STATE_RESULT (s))
1086     return;
1087
1088   /* Have the token spelled with UCNs rather than raw UTF-8.  */
1089   spelling = XNEWVEC (unsigned char, cpp_token_len (token));
1090   len = cpp_spell_token (pfile, token, spelling, false) - spelling;
1091   if (NORMALIZE_STATE_RESULT (s) != normalized_C)
1092     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1093                            "`%.*s' is not in NFC", (int) len, spelling);
1094   else
1095     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1096                            "`%.*s' is not in NFKC", (int) len, spelling);
1097   free (spelling);
1098 }
1099
1100 /* Returns TRUE, after advancing buffer->cur, if a '$' or UCN that
1101    is valid in an identifier starts at buffer->cur.  FIRST is TRUE
1102    if this starts an identifier.  */
1103 static bool
1104 forms_identifier_p (cpp_reader *pfile, int first,
1105                     struct normalize_state *state)
1106 {
1107   cpp_buffer *buf = pfile->buffer;
1108
1109   if (*buf->cur == '$')
1110     {
1111       if (!CPP_OPTION (pfile, dollars_in_ident))
1112         return false;
1113
1114       /* Accept the '$'; complain about it at most once.  */
1115       buf->cur++;
1116       if (!pfile->state.skipping && CPP_OPTION (pfile, warn_dollars))
1117         {
1118           CPP_OPTION (pfile, warn_dollars) = 0;
1119           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1120         }
1121       return true;
1122     }
1123
1124   /* Otherwise only a syntactically valid UCN qualifies.  */
1125   if (*buf->cur != '\\'
1126       || !CPP_OPTION (pfile, extended_identifiers)
1127       || (buf->cur[1] != 'u' && buf->cur[1] != 'U'))
1128     return false;
1129
1130   buf->cur += 2;
1131   if (_cpp_valid_ucn (pfile, &buf->cur, buf->rlimit, 1 + !first, state))
1132     return true;
1133   /* Not accepted: undo the adjustment made above.  */
1134   buf->cur -= 2;
1135   return false;
1136 }
1137
1138 /* Helper function: intern the identifier whose spelling starts at
1139    BASE and return its cpp_hashnode.  */
1140 static cpp_hashnode *
1141 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1142 {
1143   cpp_hashnode *node;
1144   const uchar *end;
1145   unsigned int length;
1146   unsigned int h = HT_HASHSTEP (0, *base);
1147
1148   /* The first character is hashed as is; then the ISIDNUM run.  */
1149   for (end = base + 1; ISIDNUM (*end); end++)
1150     h = HT_HASHSTEP (h, *end);
1151
1152   length = end - base;
1153   h = HT_HASHFINISH (h, length);
1154   node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1155                                             base, length, h, HT_ALLOC));
1156
1157   /* Rarely, identifiers require diagnostics when lexed; leave early
1158      in the common case.  */
1159   if (!__builtin_expect ((node->flags & NODE_DIAGNOSTIC)
1160                          && !pfile->state.skipping, 0))
1161     return node;
1162
1163   /* It is allowed to poison the same identifier twice.  */
1164   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1165     cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1166                NODE_NAME (node));
1167
1168   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1169      replacement list of a variadic macro.  */
1170   if (!pfile->state.va_args_ok
1171       && node == pfile->spec_nodes.n__VA_ARGS__)
1172     cpp_error (pfile, CPP_DL_PEDWARN,
1173                "__VA_ARGS__ can only appear in the expansion"
1174                " of a C99 variadic macro");
1175
1176   /* For -Wc++-compat, warn about use of C++ named operators.  */
1177   if (node->flags & NODE_WARN_OPERATOR)
1178     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1179                  "identifier \"%s\" is a special operator name in C++",
1180                  NODE_NAME (node));
1181
1182   return node;
1183 }
1184
1185 /* Return the cpp_hashnode for the identifier spelled NAME in PFILE,
1186    as found by lex_identifier_intern.  NOTE(review): that helper
1187    passes HT_ALLOC to the hash table, which presumably inserts a
1188    missing name rather than yielding NULL -- confirm.  */
1189 cpp_hashnode *
1190 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1191 {
1192   return lex_identifier_intern (pfile, (const uchar *) name);
1193 }
1194
1195 /* Lex an identifier starting at BUFFER->CUR - 1, with its spelling
1196    measured from BASE.  STARTS_UCN forces the slower path below.  */
1197 static cpp_hashnode *
1198 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1199                 struct normalize_state *nst)
1200 {
1201   cpp_buffer *buf = pfile->buffer;
1202   cpp_hashnode *node;
1203   const uchar *p = buf->cur;
1204   unsigned int h = HT_HASHSTEP (0, *base);
1205
1206   /* Fast scan over plain identifier characters, hashing as we go.  */
1207   if (!starts_ucn)
1208     for (; ISIDNUM (*p); p++)
1209       h = HT_HASHSTEP (h, *p);
1210   buf->cur = p;
1211
1212   if (!starts_ucn && !forms_identifier_p (pfile, false, nst))
1213     {
1214       /* Plain identifier: the hash is ready, look it up directly.  */
1215       const unsigned int length = p - base;
1216       h = HT_HASHFINISH (h, length);
1217       node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1218                                                 base, length, h, HT_ALLOC));
1219     }
1220   else
1221     {
1222       /* Slower version for identifiers containing UCNs (or $).  */
1223       do
1224         {
1225           while (ISIDNUM (*buf->cur))
1226             {
1227               buf->cur++;
1228               NORMALIZE_STATE_UPDATE_IDNUM (nst);
1229             }
1230         }
1231       while (forms_identifier_p (pfile, false, nst));
1232       node = _cpp_interpret_identifier (pfile, base, buf->cur - base);
1233     }
1234
1235   /* Rarely, identifiers require diagnostics when lexed.  */
1236   if (!__builtin_expect ((node->flags & NODE_DIAGNOSTIC)
1237                          && !pfile->state.skipping, 0))
1238     return node;
1239
1240   /* It is allowed to poison the same identifier twice.  */
1241   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1242     cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1243                NODE_NAME (node));
1244
1245   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1246      replacement list of a variadic macro.  */
1247   if (!pfile->state.va_args_ok
1248       && node == pfile->spec_nodes.n__VA_ARGS__)
1249     cpp_error (pfile, CPP_DL_PEDWARN,
1250                "__VA_ARGS__ can only appear in the expansion"
1251                " of a C99 variadic macro");
1252
1253   /* For -Wc++-compat, warn about use of C++ named operators.  */
1254   if (node->flags & NODE_WARN_OPERATOR)
1255     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1256                  "identifier \"%s\" is a special operator name in C++",
1257                  NODE_NAME (node));
1258
1259   return node;
1260 }
1261
1262 /* Lex a number starting at BUFFER->CUR - 1, storing a NUL-terminated
1263    copy of its spelling in NUMBER.  */
1264 static void
1265 lex_number (cpp_reader *pfile, cpp_string *number,
1266             struct normalize_state *nst)
1267 {
1268   cpp_buffer *buf = pfile->buffer;
1269   const uchar *const start = buf->cur - 1;
1270   const uchar *p;
1271   uchar *copy;
1272
1273   do
1274     {
1275       /* N.B. ISIDNUM does not include $; that case is left to
1276          forms_identifier_p in the loop condition.  */
1277       for (p = buf->cur;
1278            ISIDNUM (*p) || *p == '.' || VALID_SIGN (*p, p[-1]);
1279            p++)
1280         NORMALIZE_STATE_UPDATE_IDNUM (nst);
1281
1282       buf->cur = p;
1283     }
1284   while (forms_identifier_p (pfile, false, nst));
1285
1286   /* P is where the scan stopped; copy everything from START to it.  */
1287   number->len = p - start;
1288   copy = _cpp_unaligned_alloc (pfile, number->len + 1);
1289   memcpy (copy, start, number->len);
1290   copy[number->len] = '\0';
1291   number->text = copy;
1292 }
1293
1294 /* Make TOKEN a literal of type TYPE whose spelling is a
1295    NUL-terminated copy of the LEN bytes at BASE.  */
1296 static void
1297 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1298                 unsigned int len, enum cpp_ttype type)
1299 {
1300   uchar *copy = _cpp_unaligned_alloc (pfile, len + 1);
1301   copy[len] = '\0';
1302   memcpy (copy, base, len);
1303   token->val.str.text = copy;
1304   token->val.str.len = len;
1305   token->type = type;
1306 }
1307
1308 /* Subroutine of lex_raw_string: append the LEN bytes at BASE to the
1309    chain of buffers from *FIRST_BUFF_P to *LAST_BUFF_P, storing the
1310    chain's (possibly new) first and last buffers back through them.  */
1311 static void
1312 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1313                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1314 {
1315   _cpp_buff *head = *first_buff_p;
1316   _cpp_buff *tail = *last_buff_p;
1317
1318   if (head == NULL)
1319     head = tail = _cpp_get_buff (pfile, len);
1320   else if (BUFF_ROOM (tail) < len)
1321     {
1322       /* Fill the last buffer, then add one for the remainder.  */
1323       const size_t room = BUFF_ROOM (tail);
1324       memcpy (BUFF_FRONT (tail), base, room);
1325       BUFF_FRONT (tail) += room;
1326       base += room;
1327       len -= room;
1328       tail = _cpp_append_extend_buff (pfile, tail, len);
1329     }
1330
1331   memcpy (BUFF_FRONT (tail), base, len);
1332   BUFF_FRONT (tail) += len;
1333   *first_buff_p = head;
1334   *last_buff_p = tail;
1335 }
1336
1337
1338 /* Returns true if the identifier starting at BASE names a node of
1339    type NT_MACRO.  This might not work if compiling with
1340    -save-temps, or preprocessing separately from compilation.  */
1341
1342 static bool
1343 is_macro (cpp_reader *pfile, const uchar *base)
1344 {
1345   const uchar *end;
1346   cpp_hashnode *node;
1347   unsigned int h;
1348
1349   if (!ISIDST (*base))
1350     return false;
1351
1352   h = HT_HASHSTEP (0, *base);
1353   for (end = base + 1; ISIDNUM (*end); end++)
1354     h = HT_HASHSTEP (h, *end);
1355   h = HT_HASHFINISH (h, end - base);
1356
1357   /* With HT_NO_INSERT the lookup may find nothing, hence the test.  */
1358   node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table, base,
1359                                             end - base, h, HT_NO_INSERT));
1360   return node != NULL && node->type == NT_MACRO;
1361 }
1362
1363
1364 /* Lexes a raw string.  BASE is the start of the literal, including any
1365    leading 'L', 'u', 'U' or 'u8', and CUR points just past the 'R'
1366    modifier.  On success the stored string contains the spelling with
1367    those, the double quotes, delimiter string, '(' and ')'.  TOKEN's type
1368    is set to the type of the literal, to CPP_OTHER if it was malformed or
1369    not properly terminated, or to CPP_EOF if the input ran out.  The
1370    spelling is NUL-terminated; embedded NULs are preserved before that.  */
1371
1372 static void
1373 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1374                 const uchar *cur)
1375 {
1376   uchar raw_prefix[17];  /* Delimiter, then the closing quote.  */
1377   uchar temp_buffer[18];  /* Characters under comparison.  */
1378   const uchar *orig_base;  /* BASE as passed in; BASE itself moves.  */
1379   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1380   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1381   raw_str_phase phase = RAW_STR_PREFIX;
1382   enum cpp_ttype type;
1383   size_t total_len = 0;  /* Bytes handed to BUF_APPEND so far.  */
1384   /* Index into temp_buffer during the RAW_STR_PREFIX and
1385      RAW_STR_SUFFIX phases.  During the RAW_STR phase it is held at 17
1386      so that BUF_APPEND appends nothing to temp_buffer.  */
1387   size_t temp_buffer_len = 0;
1388   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1389   size_t raw_prefix_start;  /* Offset from BASE just past entry CUR.  */
1390   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1391
1392   type = (*base == 'L' ? CPP_WSTRING :
1393           *base == 'U' ? CPP_STRING32 :
1394           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1395           : CPP_STRING);
1396
1397 #define BUF_APPEND(STR,LEN)                                     \
1398       do {                                                      \
1399         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1400                         &first_buff, &last_buff);               \
1401         total_len += (LEN);                                     \
1402         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1403             && (const uchar *)(STR) != base                     \
1404             && (LEN) <= 2)                                      \
1405           {                                                     \
1406             memcpy (temp_buffer + temp_buffer_len,              \
1407                     (const uchar *)(STR), (LEN));               \
1408             temp_buffer_len += (LEN);                           \
1409           }                                                     \
1410       } while (0);
1411
1412   orig_base = base;
1413   ++cur;  /* Presumably steps over the opening quote -- confirm.  */
1414   raw_prefix_start = cur - base;
1415   for (;;)
1416     {
1417       cppchar_t c;
1418
1419       /* If we previously performed any trigraph or line splicing
1420          transformations, undo them in between the opening and closing
1421          double quote.  Notes before CUR are skipped first.  */
1422       while (note->pos < cur)
1423         ++note;
1424       for (; note->pos == cur; ++note)
1425         {
1426           switch (note->type)
1427             {
1428             case '\\':
1429             case ' ':
1430               /* Restore backslash followed by newline.  The text read
1431                  so far is flushed first and BASE restarts at CUR.  */
1432               BUF_APPEND (base, cur - base);
1433               base = cur;
1434               BUF_APPEND ("\\", 1);
1435             after_backslash:
1436               if (note->type == ' ')
1437                 {
1438                   /* GNU backslash whitespace newline extension.  FIXME
1439                      could be any sequence of non-vertical space.  When we
1440                      can properly restore any such sequence, we should mark
1441                      this note as handled so _cpp_process_line_notes
1442                      doesn't warn.  */
1443                   BUF_APPEND (" ", 1);
1444                 }
1445
1446               BUF_APPEND ("\n", 1);
1447               break;
1448
1449             case 0:
1450               /* Already handled.  */
1451               break;
1452
1453             default:
1454               if (_cpp_trigraph_map[note->type])
1455                 {
1456                   /* Don't warn about this trigraph in
1457                      _cpp_process_line_notes, since trigraphs show up as
1458                      trigraphs in raw strings.  Clearing the note type
1459                      below is what marks it as handled.  */
1460                   uchar type = note->type;
1461                   note->type = 0;
1462
1463                   if (!CPP_OPTION (pfile, trigraphs))
1464                     /* If we didn't convert the trigraph in the first
1465                        place, don't do anything now either.  */
1466                     break;
1467
1468                   BUF_APPEND (base, cur - base);
1469                   base = cur;
1470                   BUF_APPEND ("??", 2);
1471
1472                   /* ??/ followed by newline gets two line notes, one for
1473                      the trigraph and one for the backslash/newline.  */
1474                   if (type == '/' && note[1].pos == cur)
1475                     {
1476                       if (note[1].type != '\\'
1477                           && note[1].type != ' ')
1478                         abort ();
1479                       BUF_APPEND ("/", 1);
1480                       ++note;
1481                       goto after_backslash;
1482                     }
1483                   else
1484                     {
1485                       /* Skip the replacement character and examine the
1486                          trigraph's third character in its place.  */
1487                       base = ++cur;
1488                       BUF_APPEND (&type, 1);
1489                       c = type;
1490                       goto check_c;
1491                     }
1492                 }
1493               else
1494                 abort ();
1495               break;
1496             }
1497         }
1498       c = *cur++;
1499       if (__builtin_expect (temp_buffer_len < 17, 0))
1500         temp_buffer[temp_buffer_len++] = c;
1501
1502      check_c:
1503       if (phase == RAW_STR_PREFIX)
1504         {
1505           while (raw_prefix_len < temp_buffer_len)
1506             {
1507               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1508               switch (raw_prefix[raw_prefix_len])
1509                 {
1510                 case ' ': case '(': case ')': case '\\': case '\t':
1511                 case '\v': case '\f': case '\n': default:
1512                   break;
1513                 /* Basic source charset except the above chars; these
1514                    are accepted while fewer than 16 have been seen.  */
1515                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1516                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1517                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1518                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1519                 case 'y': case 'z':
1520                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1521                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1522                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1523                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1524                 case 'Y': case 'Z':
1525                 case '0': case '1': case '2': case '3': case '4': case '5':
1526                 case '6': case '7': case '8': case '9':
1527                 case '_': case '{': case '}': case '#': case '[': case ']':
1528                 case '<': case '>': case '%': case ':': case ';': case '.':
1529                 case '?': case '*': case '+': case '-': case '/': case '^':
1530                 case '&': case '|': case '~': case '!': case '=': case ',':
1531                 case '"': case '\'':
1532                   if (raw_prefix_len < 16)
1533                     {
1534                       raw_prefix_len++;
1535                       continue;
1536                     }
1537                   break;
1538                 }
1539
1540               if (raw_prefix[raw_prefix_len] != '(')
1541                 {
1542                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1543                   if (raw_prefix_len == 16)
1544                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1545                                          col, "raw string delimiter longer "
1546                                               "than 16 characters");
1547                   else if (raw_prefix[raw_prefix_len] == '\n')
1548                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1549                                          col, "invalid new-line in raw "
1550                                               "string delimiter");
1551                   else
1552                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1553                                          col, "invalid character '%c' in "
1554                                               "raw string delimiter",
1555                                          (int) raw_prefix[raw_prefix_len]);
1556                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1557                   create_literal (pfile, token, orig_base,
1558                                   raw_prefix_start - 1, CPP_OTHER);
1559                   if (first_buff)
1560                     _cpp_release_buff (pfile, first_buff);
1561                   return;
1562                 }
1563               raw_prefix[raw_prefix_len] = '"';  /* Ends the suffix.  */
1564               phase = RAW_STR;
1565               /* Nothing should be appended to temp_buffer during
1566                  RAW_STR phase.  */
1567               temp_buffer_len = 17;
1568               break;
1569             }
1570           continue;  /* Prefix-phase characters need nothing more.  */
1571         }
1572       else if (phase == RAW_STR_SUFFIX)
1573         {
1574           while (raw_suffix_len <= raw_prefix_len
1575                  && raw_suffix_len < temp_buffer_len
1576                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1577             raw_suffix_len++;
1578           if (raw_suffix_len > raw_prefix_len)
1579             break;  /* Whole suffix matched: the string ends here.  */
1580           if (raw_suffix_len == temp_buffer_len)
1581             continue;  /* Matches so far; more input is needed.  */
1582           phase = RAW_STR;  /* Mismatch: back to the string body.  */
1583           /* Nothing should be appended to temp_buffer during
1584              RAW_STR phase.  */
1585           temp_buffer_len = 17;
1586         }
1587       if (c == ')')
1588         {
1589           phase = RAW_STR_SUFFIX;  /* Possible start of the suffix.  */
1590           raw_suffix_len = 0;
1591           temp_buffer_len = 0;
1592         }
1593       else if (c == '\n')
1594         {
1595           if (pfile->state.in_directive
1596               || (pfile->state.parsing_args
1597                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1598             {
1599               cur--;
1600               type = CPP_OTHER;
1601               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1602                                    "unterminated raw string");
1603               break;
1604             }
1605
1606           BUF_APPEND (base, cur - base);
1607
1608           if (pfile->buffer->cur < pfile->buffer->rlimit)
1609             CPP_INCREMENT_LINE (pfile, 0);
1610           pfile->buffer->need_line = true;
1611
1612           pfile->buffer->cur = cur-1;
1613           _cpp_process_line_notes (pfile, false);
1614           if (!_cpp_get_fresh_line (pfile))
1615             {
1616               source_location src_loc = token->src_loc;
1617               token->type = CPP_EOF;
1618               /* Tell the compiler the line number of the EOF token.  */
1619               token->src_loc = pfile->line_table->highest_line;
1620               token->flags = BOL;
1621               if (first_buff != NULL)
1622                 _cpp_release_buff (pfile, first_buff);
1623               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1624                                    "unterminated raw string");
1625               return;
1626             }
1627
1628           cur = base = pfile->buffer->cur;
1629           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1630         }
1631     }
1632
1633   if (CPP_OPTION (pfile, user_literals))
1634     {
1635       /* If a string format macro, say from inttypes.h, is placed touching
1636          a string literal it could be parsed as a C++11 user-defined string
1637          literal thus breaking the program.
1638          Try to identify macros with is_macro. A warning is issued. */
1639       if (is_macro (pfile, cur))
1640         {
1641           /* Raise a warning, but do not consume subsequent tokens.  */
1642           if (CPP_OPTION (pfile, warn_literal_suffix))
1643             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1644                                    token->src_loc, 0,
1645                                    "invalid suffix on literal; C++11 requires "
1646                                    "a space between literal and string macro");
1647         }
1648       /* Grab user defined literal suffix.  */
1649       else if (ISIDST (*cur))
1650         {
1651           type = cpp_userdef_string_add_type (type);
1652           ++cur;
1653
1654           while (ISIDNUM (*cur))
1655             ++cur;
1656         }
1657     }
1658
1659   pfile->buffer->cur = cur;
1660   if (first_buff == NULL)  /* Nothing was buffered via BUF_APPEND.  */
1661     create_literal (pfile, token, base, cur - base, type);
1662   else
1663     {
1664       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1665
1666       token->type = type;
1667       token->val.str.len = total_len + (cur - base);
1668       token->val.str.text = dest;
1669       last_buff = first_buff;  /* Reused to walk the chain.  */
1670       while (last_buff != NULL)
1671         {
1672           memcpy (dest, last_buff->base,
1673                   BUFF_FRONT (last_buff) - last_buff->base);
1674           dest += BUFF_FRONT (last_buff) - last_buff->base;
1675           last_buff = last_buff->next;
1676         }
1677       _cpp_release_buff (pfile, first_buff);
1678       memcpy (dest, base, cur - base);
1679       dest[cur - base] = '\0';
1680     }
1681 }
1678
1679 /* Lexes a string, character constant, or angle-bracketed header file
1680    name.  The stored string contains the spelling, including opening
1681    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1682    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1683    if it was not properly terminated, or CPP_LESS for an unterminated
1684    header name which must be relexed as normal tokens.
1685
1686    The spelling is NUL-terminated, but it is not guaranteed that this
1687    is the first NUL since embedded NULs are preserved.  */
1688 static void
1689 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1690 {
1691   bool saw_NUL = false;
1692   const uchar *cur;
1693   cppchar_t terminator;
1694   enum cpp_ttype type;
1695
1696   cur = base;
1697   terminator = *cur++;
1698   if (terminator == 'L' || terminator == 'U')
1699     terminator = *cur++;
1700   else if (terminator == 'u')
1701     {
1702       terminator = *cur++;
1703       if (terminator == '8')
1704         terminator = *cur++;
1705     }
1706   if (terminator == 'R')
1707     {
1708       lex_raw_string (pfile, token, base, cur);
1709       return;
1710     }
1711   if (terminator == '"')
1712     type = (*base == 'L' ? CPP_WSTRING :
1713             *base == 'U' ? CPP_STRING32 :
1714             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1715                          : CPP_STRING);
1716   else if (terminator == '\'')
1717     type = (*base == 'L' ? CPP_WCHAR :
1718             *base == 'U' ? CPP_CHAR32 :
1719             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1720   else
1721     terminator = '>', type = CPP_HEADER_NAME;
1722
1723   for (;;)
1724     {
1725       cppchar_t c = *cur++;
1726
1727       /* In #include-style directives, terminators are not escapable.  */
1728       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1729         cur++;
1730       else if (c == terminator)
1731         break;
1732       else if (c == '\n')
1733         {
1734           cur--;
1735           /* Unmatched quotes always yield undefined behavior, but
1736              greedy lexing means that what appears to be an unterminated
1737              header name may actually be a legitimate sequence of tokens.  */
1738           if (terminator == '>')
1739             {
1740               token->type = CPP_LESS;
1741               return;
1742             }
1743           type = CPP_OTHER;
1744           break;
1745         }
1746       else if (c == '\0')
1747         saw_NUL = true;
1748     }
1749
1750   if (saw_NUL && !pfile->state.skipping)
1751     cpp_error (pfile, CPP_DL_WARNING,
1752                "null character(s) preserved in literal");
1753
1754   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1755     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1756                (int) terminator);
1757
1758   if (CPP_OPTION (pfile, user_literals))
1759     {
1760       /* If a string format macro, say from inttypes.h, is placed touching
1761          a string literal it could be parsed as a C++11 user-defined string
1762          literal thus breaking the program.
1763          Try to identify macros with is_macro. A warning is issued. */
1764       if (is_macro (pfile, cur))
1765         {
1766           /* Raise a warning, but do not consume subsequent tokens.  */
1767           if (CPP_OPTION (pfile, warn_literal_suffix))
1768             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1769                                    token->src_loc, 0,
1770                                    "invalid suffix on literal; C++11 requires "
1771                                    "a space between literal and string macro");
1772         }
1773       /* Grab user defined literal suffix.  */
1774       else if (ISIDST (*cur))
1775         {
1776           type = cpp_userdef_char_add_type (type);
1777           type = cpp_userdef_string_add_type (type);
1778           ++cur;
1779
1780           while (ISIDNUM (*cur))
1781             ++cur;
1782         }
1783     }
1784
1785   pfile->buffer->cur = cur;
1786   create_literal (pfile, token, base, cur - base, type);
1787 }
1788
1789 /* Return the comment table. The client may not make any assumption
1790    about the ordering of the table.  */
1791 cpp_comment_table *
1792 cpp_get_comments (cpp_reader *pfile)
1793 {
1794   return &pfile->comments;
1795 }
1796
1797 /* Append a comment to the end of the comment table. */
1798 static void
1799 store_comment (cpp_reader *pfile, cpp_token *token)
1800 {
1801   int len;
1802
1803   if (pfile->comments.allocated == 0)
1804     {
1805       pfile->comments.allocated = 256;
1806       pfile->comments.entries = (cpp_comment *) xmalloc
1807         (pfile->comments.allocated * sizeof (cpp_comment));
1808     }
1809
1810   if (pfile->comments.count == pfile->comments.allocated)
1811     {
1812       pfile->comments.allocated *= 2;
1813       pfile->comments.entries = (cpp_comment *) xrealloc
1814         (pfile->comments.entries,
1815          pfile->comments.allocated * sizeof (cpp_comment));
1816     }
1817
1818   len = token->val.str.len;
1819
1820   /* Copy comment. Note, token may not be NULL terminated. */
1821   pfile->comments.entries[pfile->comments.count].comment =
1822     (char *) xmalloc (sizeof (char) * (len + 1));
1823   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1824           token->val.str.text, len);
1825   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1826
1827   /* Set source location. */
1828   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1829
1830   /* Increment the count of entries in the comment table. */
1831   pfile->comments.count++;
1832 }
1833
1834 /* The stored comment includes the comment start and any terminator.  */
1835 static void
1836 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1837               cppchar_t type)
1838 {
1839   unsigned char *buffer;
1840   unsigned int len, clen, i;
1841
1842   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1843
1844   /* C++ comments probably (not definitely) have moved past a new
1845      line, which we don't want to save in the comment.  */
1846   if (is_vspace (pfile->buffer->cur[-1]))
1847     len--;
1848
1849   /* If we are currently in a directive or in argument parsing, then
1850      we need to store all C++ comments as C comments internally, and
1851      so we need to allocate a little extra space in that case.
1852
1853      Note that the only time we encounter a directive here is
1854      when we are saving comments in a "#define".  */
1855   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1856           && type == '/') ? len + 2 : len;
1857
1858   buffer = _cpp_unaligned_alloc (pfile, clen);
1859
1860   token->type = CPP_COMMENT;
1861   token->val.str.len = clen;
1862   token->val.str.text = buffer;
1863
1864   buffer[0] = '/';
1865   memcpy (buffer + 1, from, len - 1);
1866
1867   /* Finish conversion to a C comment, if necessary.  */
1868   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1869     {
1870       buffer[1] = '*';
1871       buffer[clen - 2] = '*';
1872       buffer[clen - 1] = '/';
1873       /* As there can be in a C++ comments illegal sequences for C comments
1874          we need to filter them out.  */
1875       for (i = 2; i < (clen - 2); i++)
1876         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1877           buffer[i] = '|';
1878     }
1879
1880   /* Finally store this comment for use by clients of libcpp. */
1881   store_comment (pfile, token);
1882 }
1883
1884 /* Allocate COUNT tokens for RUN.  */
1885 void
1886 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1887 {
1888   run->base = XNEWVEC (cpp_token, count);
1889   run->limit = run->base + count;
1890   run->next = NULL;
1891 }
1892
1893 /* Returns the next tokenrun, or creates one if there is none.  */
1894 static tokenrun *
1895 next_tokenrun (tokenrun *run)
1896 {
1897   if (run->next == NULL)
1898     {
1899       run->next = XNEW (tokenrun);
1900       run->next->prev = run;
1901       _cpp_init_tokenrun (run->next, 250);
1902     }
1903
1904   return run->next;
1905 }
1906
1907 /* Return the number of not yet processed token in a given
1908    context.  */
1909 int
1910 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1911 {
1912   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1913     return (LAST (context).token - FIRST (context).token);
1914   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1915            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1916     return (LAST (context).ptoken - FIRST (context).ptoken);
1917   else
1918       abort ();
1919 }
1920
1921 /* Returns the token present at index INDEX in a given context.  If
1922    INDEX is zero, the next token to be processed is returned.  */
1923 static const cpp_token*
1924 _cpp_token_from_context_at (cpp_context *context, int index)
1925 {
1926   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1927     return &(FIRST (context).token[index]);
1928   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1929            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1930     return FIRST (context).ptoken[index];
1931  else
1932    abort ();
1933 }
1934
1935 /* Look ahead in the input stream.  */
1936 const cpp_token *
1937 cpp_peek_token (cpp_reader *pfile, int index)
1938 {
1939   cpp_context *context = pfile->context;
1940   const cpp_token *peektok;
1941   int count;
1942
1943   /* First, scan through any pending cpp_context objects.  */
1944   while (context->prev)
1945     {
1946       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1947
1948       if (index < (int) sz)
1949         return _cpp_token_from_context_at (context, index);
1950       index -= (int) sz;
1951       context = context->prev;
1952     }
1953
1954   /* We will have to read some new tokens after all (and do so
1955      without invalidating preceding tokens).  */
1956   count = index;
1957   pfile->keep_tokens++;
1958
1959   do
1960     {
1961       peektok = _cpp_lex_token (pfile);
1962       if (peektok->type == CPP_EOF)
1963         return peektok;
1964     }
1965   while (index--);
1966
1967   _cpp_backup_tokens_direct (pfile, count + 1);
1968   pfile->keep_tokens--;
1969
1970   return peektok;
1971 }
1972
1973 /* Allocate a single token that is invalidated at the same time as the
1974    rest of the tokens on the line.  Has its line and col set to the
1975    same as the last lexed token, so that diagnostics appear in the
1976    right place.  */
1977 cpp_token *
1978 _cpp_temp_token (cpp_reader *pfile)
1979 {
1980   cpp_token *old, *result;
1981   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
1982   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
1983
1984   old = pfile->cur_token - 1;
1985   /* Any pre-existing lookaheads must not be clobbered.  */
1986   if (la)
1987     {
1988       if (sz <= la)
1989         {
1990           tokenrun *next = next_tokenrun (pfile->cur_run);
1991
1992           if (sz < la)
1993             memmove (next->base + 1, next->base,
1994                      (la - sz) * sizeof (cpp_token));
1995
1996           next->base[0] = pfile->cur_run->limit[-1];
1997         }
1998
1999       if (sz > 1)
2000         memmove (pfile->cur_token + 1, pfile->cur_token,
2001                  MIN (la, sz - 1) * sizeof (cpp_token));
2002     }
2003
2004   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2005     {
2006       pfile->cur_run = next_tokenrun (pfile->cur_run);
2007       pfile->cur_token = pfile->cur_run->base;
2008     }
2009
2010   result = pfile->cur_token++;
2011   result->src_loc = old->src_loc;
2012   return result;
2013 }
2014
2015 /* Lex a token into RESULT (external interface).  Takes care of issues
2016    like directive handling, token lookahead, multiple include
2017    optimization and skipping.  */
2018 const cpp_token *
2019 _cpp_lex_token (cpp_reader *pfile)
2020 {
2021   cpp_token *result;
2022
2023   for (;;)
2024     {
2025       if (pfile->cur_token == pfile->cur_run->limit)
2026         {
2027           pfile->cur_run = next_tokenrun (pfile->cur_run);
2028           pfile->cur_token = pfile->cur_run->base;
2029         }
2030       /* We assume that the current token is somewhere in the current
2031          run.  */
2032       if (pfile->cur_token < pfile->cur_run->base
2033           || pfile->cur_token >= pfile->cur_run->limit)
2034         abort ();
2035
2036       if (pfile->lookaheads)
2037         {
2038           pfile->lookaheads--;
2039           result = pfile->cur_token++;
2040         }
2041       else
2042         result = _cpp_lex_direct (pfile);
2043
2044       if (result->flags & BOL)
2045         {
2046           /* Is this a directive.  If _cpp_handle_directive returns
2047              false, it is an assembler #.  */
2048           if (result->type == CPP_HASH
2049               /* 6.10.3 p 11: Directives in a list of macro arguments
2050                  gives undefined behavior.  This implementation
2051                  handles the directive as normal.  */
2052               && pfile->state.parsing_args != 1)
2053             {
2054               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2055                 {
2056                   if (pfile->directive_result.type == CPP_PADDING)
2057                     continue;
2058                   result = &pfile->directive_result;
2059                 }
2060             }
2061           else if (pfile->state.in_deferred_pragma)
2062             result = &pfile->directive_result;
2063
2064           if (pfile->cb.line_change && !pfile->state.skipping)
2065             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2066         }
2067
2068       /* We don't skip tokens in directives.  */
2069       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2070         break;
2071
2072       /* Outside a directive, invalidate controlling macros.  At file
2073          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2074          get here and MI optimization works.  */
2075       pfile->mi_valid = false;
2076
2077       if (!pfile->state.skipping || result->type == CPP_EOF)
2078         break;
2079     }
2080
2081   return result;
2082 }
2083
2084 /* Returns true if a fresh line has been loaded.  */
2085 bool
2086 _cpp_get_fresh_line (cpp_reader *pfile)
2087 {
2088   int return_at_eof;
2089
2090   /* We can't get a new line until we leave the current directive.  */
2091   if (pfile->state.in_directive)
2092     return false;
2093
2094   for (;;)
2095     {
2096       cpp_buffer *buffer = pfile->buffer;
2097
2098       if (!buffer->need_line)
2099         return true;
2100
2101       if (buffer->next_line < buffer->rlimit)
2102         {
2103           _cpp_clean_line (pfile);
2104           return true;
2105         }
2106
2107       /* First, get out of parsing arguments state.  */
2108       if (pfile->state.parsing_args)
2109         return false;
2110
2111       /* End of buffer.  Non-empty files should end in a newline.  */
2112       if (buffer->buf != buffer->rlimit
2113           && buffer->next_line > buffer->rlimit
2114           && !buffer->from_stage3)
2115         {
2116           /* Clip to buffer size.  */
2117           buffer->next_line = buffer->rlimit;
2118         }
2119
2120       return_at_eof = buffer->return_at_eof;
2121       _cpp_pop_buffer (pfile);
2122       if (pfile->buffer == NULL || return_at_eof)
2123         return false;
2124     }
2125 }
2126
/* Token-type selection on one character of lookahead.  Sets
   result->type to ELSE_TYPE; if the next character in the buffer is
   CHAR, consumes it and sets result->type to THEN_TYPE instead.
   Relies on variables named `result' and `buffer' being in scope at
   the point of use.  The parameters are parenthesized so that any
   expression can be passed safely.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
    }                                                   \
  while (0)
2135
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When a fresh line is needed but cannot be obtained (for instance at
   the end of a directive), a CPP_EOF token is returned; in a deferred
   pragma a CPP_PRAGMA_EOL token is returned instead.  When a fresh
   line is obtained and pfile->keep_tokens is zero, lexing restarts at
   the beginning of the base token run.

   Returns a pointer to the lexed token.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  cpp_token *result = pfile->cur_token++;

  /* Entered initially and again after every newline.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The end of the line ends a deferred pragma.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless earlier tokens must be kept, reuse the token buffer
         from its start for the new line.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* _cpp_get_fresh_line may have changed the current buffer.  */
  buffer = pfile->buffer;
  /* Also re-entered after a comment that is not being saved.  */
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after horizontal whitespace has been skipped.  */
 skipped_white:
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  if (pfile->forced_token_location_p)
    result->src_loc = *pfile->forced_token_location_p;
  else
    result->src_loc = linemap_position_for_column (pfile->line_table,
                                          CPP_BUF_COLUMN (buffer, buffer->cur));

  /* Dispatch on the first character of the token.  */
  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
         wide strings or raw strings.  */
      if (c == 'L' || CPP_OPTION (pfile, rliterals)
          || (c != 'R' && CPP_OPTION (pfile, uliterals)))
        {
          if ((*buffer->cur == '\'' && c != 'R')
              || *buffer->cur == '"'
              || (*buffer->cur == 'R'
                  && c != 'R'
                  && buffer->cur[1] == '"'
                  && CPP_OPTION (pfile, rliterals))
              || (*buffer->cur == '8'
                  && c == 'u'
                  && (buffer->cur[1] == '"'
                      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
                          && CPP_OPTION (pfile, rliterals)))))
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Fall through.  */

    /* The letters 'L', 'R', 'U' and 'u' are absent from the lists
       below because they are handled by the case above.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
                                                &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only comments reach this point.  Unless comments are being
         saved, treat the comment as whitespace and lex the next
         token.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* lex_string sets the type to CPP_LESS when the header name is
         unterminated; the '<' is then lexed as an operator below.  */
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
                 three characters are <:: and the subsequent character
                 is neither : nor >, the < is treated as a preprocessor
                 token by itself".  */
              if (CPP_OPTION (pfile, cplusplus)
                  && CPP_OPTION (pfile, lang) != CLK_CXX98
                  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
                  && buffer->cur[1] == ':'
                  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
                break;

              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* "%:" spells '#', and "%:%:" spells "##".  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Back up so that the character itself is looked at again
           when testing whether it starts an identifier.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        /* Not an identifier: step over the character again.  */
        buffer->cur++;
      }
      /* Fall through: the character becomes a one-character
         CPP_OTHER token.  */

    default:
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
2527
2528 /* An upper bound on the number of bytes needed to spell TOKEN.
2529    Does not include preceding whitespace.  */
2530 unsigned int
2531 cpp_token_len (const cpp_token *token)
2532 {
2533   unsigned int len;
2534
2535   switch (TOKEN_SPELL (token))
2536     {
2537     default:            len = 6;                                break;
2538     case SPELL_LITERAL: len = token->val.str.len;               break;
2539     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2540     }
2541
2542   return len;
2543 }
2544
/* Decode the UTF-8 sequence that starts at NAME and write it to
   BUFFER as a universal character name of the form \UXXXXXXXX, using
   lower-case hexadecimal digits.  Exactly 10 bytes are written and no
   terminating NUL is added.  Returns the number of bytes of NAME that
   make up the sequence.  Aborts if a continuation byte is
   ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *src = name;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int remaining;
  int shift;

  /* The number of leading one bits in the first byte is the length
     of the sequence.  */
  for (lead = *src; lead & 0x80; lead <<= 1)
    seq_len++;

  /* The payload of the first byte is whatever follows those bits.  */
  code_point = *src & (0x7F >> seq_len);

  /* Each continuation byte contributes six more bits.  */
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      unsigned char trail = *++src;

      code_point = (code_point << 6) | (trail & 0x3F);

      /* Ill-formed UTF-8: a continuation byte must look like
         10xxxxxx.  */
      if ((trail & 0xC0) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2578
2579 /* Given a token TYPE corresponding to a digraph, return a pointer to
2580    the spelling of the digraph.  */
2581 static const unsigned char *
2582 cpp_digraph2name (enum cpp_ttype type)
2583 {
2584   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2585 }
2586
2587 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2588    already contain the enough space to hold the token's spelling.
2589    Returns a pointer to the character after the last character written.
2590    FORSTRING is true if this is to be the spelling after translation
2591    phase 1 (this is different for UCNs).
2592    FIXME: Would be nice if we didn't need the PFILE argument.  */
2593 unsigned char *
2594 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2595                  unsigned char *buffer, bool forstring)
2596 {
2597   switch (TOKEN_SPELL (token))
2598     {
2599     case SPELL_OPERATOR:
2600       {
2601         const unsigned char *spelling;
2602         unsigned char c;
2603
2604         if (token->flags & DIGRAPH)
2605           spelling = cpp_digraph2name (token->type);
2606         else if (token->flags & NAMED_OP)
2607           goto spell_ident;
2608         else
2609           spelling = TOKEN_NAME (token);
2610
2611         while ((c = *spelling++) != '\0')
2612           *buffer++ = c;
2613       }
2614       break;
2615
2616     spell_ident:
2617     case SPELL_IDENT:
2618       if (forstring)
2619         {
2620           memcpy (buffer, NODE_NAME (token->val.node.node),
2621                   NODE_LEN (token->val.node.node));
2622           buffer += NODE_LEN (token->val.node.node);
2623         }
2624       else
2625         {
2626           size_t i;
2627           const unsigned char * name = NODE_NAME (token->val.node.node);
2628
2629           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2630             if (name[i] & ~0x7F)
2631               {
2632                 i += utf8_to_ucn (buffer, name + i) - 1;
2633                 buffer += 10;
2634               }
2635             else
2636               *buffer++ = NODE_NAME (token->val.node.node)[i];
2637         }
2638       break;
2639
2640     case SPELL_LITERAL:
2641       memcpy (buffer, token->val.str.text, token->val.str.len);
2642       buffer += token->val.str.len;
2643       break;
2644
2645     case SPELL_NONE:
2646       cpp_error (pfile, CPP_DL_ICE,
2647                  "unspellable token %s", TOKEN_NAME (token));
2648       break;
2649     }
2650
2651   return buffer;
2652 }
2653
2654 /* Returns TOKEN spelt as a null-terminated string.  The string is
2655    freed when the reader is destroyed.  Useful for diagnostics.  */
2656 unsigned char *
2657 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2658 {
2659   unsigned int len = cpp_token_len (token) + 1;
2660   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2661
2662   end = cpp_spell_token (pfile, token, start, false);
2663   end[0] = '\0';
2664
2665   return start;
2666 }
2667
2668 /* Returns a pointer to a string which spells the token defined by
2669    TYPE and FLAGS.  Used by C front ends, which really should move to
2670    using cpp_token_as_text.  */
2671 const char *
2672 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2673 {
2674   if (flags & DIGRAPH)
2675     return (const char *) cpp_digraph2name (type);
2676   else if (flags & NAMED_OP)
2677     return cpp_named_operator2name (type);
2678
2679   return (const char *) token_spellings[type].name;
2680 }
2681
2682 /* Writes the spelling of token to FP, without any preceding space.
2683    Separated from cpp_spell_token for efficiency - to avoid stdio
2684    double-buffering.  */
2685 void
2686 cpp_output_token (const cpp_token *token, FILE *fp)
2687 {
2688   switch (TOKEN_SPELL (token))
2689     {
2690     case SPELL_OPERATOR:
2691       {
2692         const unsigned char *spelling;
2693         int c;
2694
2695         if (token->flags & DIGRAPH)
2696           spelling = cpp_digraph2name (token->type);
2697         else if (token->flags & NAMED_OP)
2698           goto spell_ident;
2699         else
2700           spelling = TOKEN_NAME (token);
2701
2702         c = *spelling;
2703         do
2704           putc (c, fp);
2705         while ((c = *++spelling) != '\0');
2706       }
2707       break;
2708
2709     spell_ident:
2710     case SPELL_IDENT:
2711       {
2712         size_t i;
2713         const unsigned char * name = NODE_NAME (token->val.node.node);
2714
2715         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2716           if (name[i] & ~0x7F)
2717             {
2718               unsigned char buffer[10];
2719               i += utf8_to_ucn (buffer, name + i) - 1;
2720               fwrite (buffer, 1, 10, fp);
2721             }
2722           else
2723             fputc (NODE_NAME (token->val.node.node)[i], fp);
2724       }
2725       break;
2726
2727     case SPELL_LITERAL:
2728       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2729       break;
2730
2731     case SPELL_NONE:
2732       /* An error, most probably.  */
2733       break;
2734     }
2735 }
2736
2737 /* Compare two tokens.  */
2738 int
2739 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2740 {
2741   if (a->type == b->type && a->flags == b->flags)
2742     switch (TOKEN_SPELL (a))
2743       {
2744       default:                  /* Keep compiler happy.  */
2745       case SPELL_OPERATOR:
2746         /* token_no is used to track where multiple consecutive ##
2747            tokens were originally located.  */
2748         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2749       case SPELL_NONE:
2750         return (a->type != CPP_MACRO_ARG
2751                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2752       case SPELL_IDENT:
2753         return a->val.node.node == b->val.node.node;
2754       case SPELL_LITERAL:
2755         return (a->val.str.len == b->val.str.len
2756                 && !memcmp (a->val.str.text, b->val.str.text,
2757                             a->val.str.len));
2758       }
2759
2760   return 0;
2761 }
2762
2763 /* Returns nonzero if a space should be inserted to avoid an
2764    accidental token paste for output.  For simplicity, it is
2765    conservative, and occasionally advises a space where one is not
2766    needed, e.g. "." and ".2".  */
2767 int
2768 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2769                  const cpp_token *token2)
2770 {
2771   enum cpp_ttype a = token1->type, b = token2->type;
2772   cppchar_t c;
2773
2774   if (token1->flags & NAMED_OP)
2775     a = CPP_NAME;
2776   if (token2->flags & NAMED_OP)
2777     b = CPP_NAME;
2778
2779   c = EOF;
2780   if (token2->flags & DIGRAPH)
2781     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2782   else if (token_spellings[b].category == SPELL_OPERATOR)
2783     c = token_spellings[b].name[0];
2784
2785   /* Quickly get everything that can paste with an '='.  */
2786   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2787     return 1;
2788
2789   switch (a)
2790     {
2791     case CPP_GREATER:   return c == '>';
2792     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2793     case CPP_PLUS:      return c == '+';
2794     case CPP_MINUS:     return c == '-' || c == '>';
2795     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2796     case CPP_MOD:       return c == ':' || c == '>';
2797     case CPP_AND:       return c == '&';
2798     case CPP_OR:        return c == '|';
2799     case CPP_COLON:     return c == ':' || c == '>';
2800     case CPP_DEREF:     return c == '*';
2801     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2802     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2803     case CPP_NAME:      return ((b == CPP_NUMBER
2804                                  && name_p (pfile, &token2->val.str))
2805                                 || b == CPP_NAME
2806                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2807     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2808                                 || c == '.' || c == '+' || c == '-');
2809                                       /* UCNs */
2810     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2811                                  && b == CPP_NAME)
2812                                 || (CPP_OPTION (pfile, objc)
2813                                     && token1->val.str.text[0] == '@'
2814                                     && (b == CPP_NAME || b == CPP_STRING)));
2815     case CPP_STRING:
2816     case CPP_WSTRING:
2817     case CPP_UTF8STRING:
2818     case CPP_STRING16:
2819     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
2820                                 && (b == CPP_NAME
2821                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
2822                                         && ISIDST (token2->val.str.text[0]))));
2823
2824     default:            break;
2825     }
2826
2827   return 0;
2828 }
2829
2830 /* Output all the remaining tokens on the current line, and a newline
2831    character, to FP.  Leading whitespace is removed.  If there are
2832    macros, special token padding is not performed.  */
2833 void
2834 cpp_output_line (cpp_reader *pfile, FILE *fp)
2835 {
2836   const cpp_token *token;
2837
2838   token = cpp_get_token (pfile);
2839   while (token->type != CPP_EOF)
2840     {
2841       cpp_output_token (token, fp);
2842       token = cpp_get_token (pfile);
2843       if (token->flags & PREV_WHITE)
2844         putc (' ', fp);
2845     }
2846
2847   putc ('\n', fp);
2848 }
2849
2850 /* Return a string representation of all the remaining tokens on the
2851    current line.  The result is allocated using xmalloc and must be
2852    freed by the caller.  */
2853 unsigned char *
2854 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2855 {
2856   const cpp_token *token;
2857   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2858   unsigned int alloced = 120 + out;
2859   unsigned char *result = (unsigned char *) xmalloc (alloced);
2860
2861   /* If DIR_NAME is empty, there are no initial contents.  */
2862   if (dir_name)
2863     {
2864       sprintf ((char *) result, "#%s ", dir_name);
2865       out += 2;
2866     }
2867
2868   token = cpp_get_token (pfile);
2869   while (token->type != CPP_EOF)
2870     {
2871       unsigned char *last;
2872       /* Include room for a possible space and the terminating nul.  */
2873       unsigned int len = cpp_token_len (token) + 2;
2874
2875       if (out + len > alloced)
2876         {
2877           alloced *= 2;
2878           if (out + len > alloced)
2879             alloced = out + len;
2880           result = (unsigned char *) xrealloc (result, alloced);
2881         }
2882
2883       last = cpp_spell_token (pfile, token, &result[out], 0);
2884       out = last - result;
2885
2886       token = cpp_get_token (pfile);
2887       if (token->flags & PREV_WHITE)
2888         result[out++] = ' ';
2889     }
2890
2891   result[out] = '\0';
2892   return result;
2893 }
2894
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that an argument containing a
   low-precedence operator expands as intended.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2909
2910 /* Create a new allocation buffer.  Place the control block at the end
2911    of the buffer, so that buffer overflows will cause immediate chaos.  */
2912 static _cpp_buff *
2913 new_buff (size_t len)
2914 {
2915   _cpp_buff *result;
2916   unsigned char *base;
2917
2918   if (len < MIN_BUFF_SIZE)
2919     len = MIN_BUFF_SIZE;
2920   len = CPP_ALIGN (len);
2921
2922 #ifdef ENABLE_VALGRIND_CHECKING
2923   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
2924      struct first.  */
2925   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
2926   base = XNEWVEC (unsigned char, len + slen);
2927   result = (_cpp_buff *) base;
2928   base += slen;
2929 #else
2930   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2931   result = (_cpp_buff *) (base + len);
2932 #endif
2933   result->base = base;
2934   result->cur = base;
2935   result->limit = base + len;
2936   result->next = NULL;
2937   return result;
2938 }
2939
2940 /* Place a chain of unwanted allocation buffers on the free list.  */
2941 void
2942 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2943 {
2944   _cpp_buff *end = buff;
2945
2946   while (end->next)
2947     end = end->next;
2948   end->next = pfile->free_buffs;
2949   pfile->free_buffs = buff;
2950 }
2951
2952 /* Return a free buffer of size at least MIN_SIZE.  */
2953 _cpp_buff *
2954 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2955 {
2956   _cpp_buff *result, **p;
2957
2958   for (p = &pfile->free_buffs;; p = &(*p)->next)
2959     {
2960       size_t size;
2961
2962       if (*p == NULL)
2963         return new_buff (min_size);
2964       result = *p;
2965       size = result->limit - result->base;
2966       /* Return a buffer that's big enough, but don't waste one that's
2967          way too big.  */
2968       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2969         break;
2970     }
2971
2972   *p = result->next;
2973   result->next = NULL;
2974   result->cur = result->base;
2975   return result;
2976 }
2977
2978 /* Creates a new buffer with enough space to hold the uncommitted
2979    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
2980    the excess bytes to the new buffer.  Chains the new buffer after
2981    BUFF, and returns the new buffer.  */
2982 _cpp_buff *
2983 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
2984 {
2985   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
2986   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
2987
2988   buff->next = new_buff;
2989   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
2990   return new_buff;
2991 }
2992
2993 /* Creates a new buffer with enough space to hold the uncommitted
2994    remaining bytes of the buffer pointed to by BUFF, and at least
2995    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
2996    Chains the new buffer before the buffer pointed to by BUFF, and
2997    updates the pointer to point to the new buffer.  */
2998 void
2999 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3000 {
3001   _cpp_buff *new_buff, *old_buff = *pbuff;
3002   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3003
3004   new_buff = _cpp_get_buff (pfile, size);
3005   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3006   new_buff->next = old_buff;
3007   *pbuff = new_buff;
3008 }
3009
3010 /* Free a chain of buffers starting at BUFF.  */
3011 void
3012 _cpp_free_buff (_cpp_buff *buff)
3013 {
3014   _cpp_buff *next;
3015
3016   for (; buff; buff = next)
3017     {
3018       next = buff->next;
3019 #ifdef ENABLE_VALGRIND_CHECKING
3020       free (buff);
3021 #else
3022       free (buff->base);
3023 #endif
3024     }
3025 }
3026
3027 /* Allocate permanent, unaligned storage of length LEN.  */
3028 unsigned char *
3029 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3030 {
3031   _cpp_buff *buff = pfile->u_buff;
3032   unsigned char *result = buff->cur;
3033
3034   if (len > (size_t) (buff->limit - result))
3035     {
3036       buff = _cpp_get_buff (pfile, len);
3037       buff->next = pfile->u_buff;
3038       pfile->u_buff = buff;
3039       result = buff->cur;
3040     }
3041
3042   buff->cur = result + len;
3043   return result;
3044 }
3045
3046 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3047    That buffer is used for growing allocations when saving macro
3048    replacement lists in a #define, and when parsing an answer to an
3049    assertion in #assert, #unassert or #if (and therefore possibly
3050    whilst expanding macros).  It therefore must not be used by any
3051    code that they might call: specifically the lexer and the guts of
3052    the macro expander.
3053
3054    All existing other uses clearly fit this restriction: storing
3055    registered pragmas during initialization.  */
3056 unsigned char *
3057 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3058 {
3059   _cpp_buff *buff = pfile->a_buff;
3060   unsigned char *result = buff->cur;
3061
3062   if (len > (size_t) (buff->limit - result))
3063     {
3064       buff = _cpp_get_buff (pfile, len);
3065       buff->next = pfile->a_buff;
3066       pfile->a_buff = buff;
3067       result = buff->cur;
3068     }
3069
3070   buff->cur = result + len;
3071   return result;
3072 }
3073
3074 /* Say which field of TOK is in use.  */
3075
3076 enum cpp_token_fld_kind
3077 cpp_token_val_index (const cpp_token *tok)
3078 {
3079   switch (TOKEN_SPELL (tok))
3080     {
3081     case SPELL_IDENT:
3082       return CPP_TOKEN_FLD_NODE;
3083     case SPELL_LITERAL:
3084       return CPP_TOKEN_FLD_STR;
3085     case SPELL_OPERATOR:
3086       if (tok->type == CPP_PASTE)
3087         return CPP_TOKEN_FLD_TOKEN_NO;
3088       else
3089         return CPP_TOKEN_FLD_NONE;
3090     case SPELL_NONE:
3091       if (tok->type == CPP_MACRO_ARG)
3092         return CPP_TOKEN_FLD_ARG_NO;
3093       else if (tok->type == CPP_PADDING)
3094         return CPP_TOKEN_FLD_SOURCE;
3095       else if (tok->type == CPP_PRAGMA)
3096         return CPP_TOKEN_FLD_PRAGMA;
3097       /* else fall through */
3098     default:
3099       return CPP_TOKEN_FLD_NONE;
3100     }
3101 }
3102
3103 /* All tokens lexed in R after calling this function will be forced to have
3104    their source_location the same as the location referenced by P, until
3105    cpp_stop_forcing_token_locations is called for R.  */
3106
3107 void
3108 cpp_force_token_locations (cpp_reader *r, source_location *p)
3109 {
3110   r->forced_token_location_p = p;
3111 }
3112
3113 /* Go back to assigning locations naturally for lexed tokens.  */
3114
3115 void
3116 cpp_stop_forcing_token_locations (cpp_reader *r)
3117 {
3118   r->forced_token_location_p = NULL;
3119 }