gettext-tools/src/x-python.c

   1 /* xgettext Python backend.
   2    Copyright (C) 2002-2003, 2005-2015 Free Software Foundation,
   3    Inc.
   4
   5    This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
   6
   7    This program is free software: you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 /* Specification.  */
  25 #include "x-python.h"
  26
  27 #include <assert.h>
  28 #include <errno.h>
  29 #include <stdbool.h>
  30 #include <stdio.h>
  31 #include <stdlib.h>
  32 #include <string.h>
  33
  34 #include "message.h"
  35 #include "xgettext.h"
  36 #include "error.h"
  37 #include "error-progname.h"
  38 #include "progname.h"
  39 #include "basename.h"
  40 #include "xerror.h"
  41 #include "xvasprintf.h"
  42 #include "xalloc.h"
  43 #include "c-strstr.h"
  44 #include "c-ctype.h"
  45 #include "po-charset.h"
  46 #include "uniname.h"
  47 #include "unistr.h"
  48 #include "gettext.h"
  49
  50 #define _(s) gettext(s)
  51
  52 #define max(a,b) ((a) > (b) ? (a) : (b))
  53
  54 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  55
  56
  57 /* The Python syntax is defined in the Python Reference Manual
  58    /usr/share/doc/packages/python/html/ref/index.html.
  59    See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
  60    Python-2.0/Objects/unicodeobject.c.  */
  61
  62
  63 /* ====================== Keyword set customization.  ====================== */
  64
  65 /* If true extract all strings.  */
  66 static bool extract_all = false;
  67
  68 static hash_table keywords;
  69 static bool default_keywords = true;
  70
  71
  72 void
  73 x_python_extract_all ()
  74 {
  75   extract_all = true;
  76 }
  77
  78
  79 void
  80 x_python_keyword (const char *name)
  81 {
  82   if (name == NULL)
  83     default_keywords = false;
  84   else
  85     {
  86       const char *end;
  87       struct callshape shape;
  88       const char *colon;
  89
  90       if (keywords.table == NULL)
  91         hash_init (&keywords, 100);
  92
  93       split_keywordspec (name, &end, &shape);
  94
  95       /* The characters between name and end should form a valid C identifier.
  96          A colon means an invalid parse in split_keywordspec().  */
  97       colon = strchr (name, ':');
  98       if (colon == NULL || colon >= end)
  99         insert_keyword_callshape (&keywords, name, end - name, &shape);
 100     }
 101 }
 102
 103 /* Finish initializing the keywords hash table.
 104    Called after argument processing, before each file is processed.  */
 105 static void
 106 init_keywords ()
 107 {
 108   if (default_keywords)
 109     {
 110       /* When adding new keywords here, also update the documentation in
 111          xgettext.texi!  */
 112       x_python_keyword ("gettext");
 113       x_python_keyword ("ugettext");
 114       x_python_keyword ("dgettext:2");
 115       x_python_keyword ("ngettext:1,2");
 116       x_python_keyword ("ungettext:1,2");
 117       x_python_keyword ("dngettext:2,3");
 118       x_python_keyword ("_");
 119       default_keywords = false;
 120     }
 121 }
 122
 123 void
 124 init_flag_table_python ()
 125 {
 126   xgettext_record_flag ("gettext:1:pass-python-format");
 127   xgettext_record_flag ("ugettext:1:pass-python-format");
 128   xgettext_record_flag ("dgettext:2:pass-python-format");
 129   xgettext_record_flag ("ngettext:1:pass-python-format");
 130   xgettext_record_flag ("ngettext:2:pass-python-format");
 131   xgettext_record_flag ("ungettext:1:pass-python-format");
 132   xgettext_record_flag ("ungettext:2:pass-python-format");
 133   xgettext_record_flag ("dngettext:2:pass-python-format");
 134   xgettext_record_flag ("dngettext:3:pass-python-format");
 135   xgettext_record_flag ("_:1:pass-python-format");
 136   /* xgettext_record_flag ("%:1:python-format"); // % is an infix operator! */
 137
 138   xgettext_record_flag ("gettext:1:pass-python-brace-format");
 139   xgettext_record_flag ("ugettext:1:pass-python-brace-format");
 140   xgettext_record_flag ("dgettext:2:pass-python-brace-format");
 141   xgettext_record_flag ("ngettext:1:pass-python-brace-format");
 142   xgettext_record_flag ("ngettext:2:pass-python-brace-format");
 143   xgettext_record_flag ("ungettext:1:pass-python-brace-format");
 144   xgettext_record_flag ("ungettext:2:pass-python-brace-format");
 145   xgettext_record_flag ("dngettext:2:pass-python-brace-format");
 146   xgettext_record_flag ("dngettext:3:pass-python-brace-format");
 147   xgettext_record_flag ("_:1:pass-python-brace-format");
 148   /* xgettext_record_flag ("format:1:python-brace-format"); */
 149 }
 150
 151
 152 /* ======================== Reading of characters.  ======================== */
 153
 154 /* Real filename, used in error messages about the input file.  */
 155 static const char *real_file_name;
 156
 157 /* Logical filename and line number, used to label the extracted messages.  */
 158 static char *logical_file_name;
 159 static int line_number;
 160
 161 /* The input file stream.  */
 162 static FILE *fp;
 163
 164
 165 /* 0. Terminate line by \n, regardless whether the external
 166    representation of a line terminator is CR (Mac), and CR/LF
 167    (DOS/Windows), as Python treats them equally.  */
 168 static int
 169 phase0_getc ()
 170 {
 171   int c;
 172
 173   c = getc (fp);
 174   if (c == EOF)
 175     {
 176       if (ferror (fp))
 177         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 178                real_file_name);
 179       return EOF;
 180     }
 181
 182   if (c == '\r')
 183     {
 184       int c1 = getc (fp);
 185
 186       if (c1 != EOF && c1 != '\n')
 187         ungetc (c1, fp);
 188
 189       /* Seen line terminator CR or CR/LF.  */
 190       return '\n';
 191     }
 192
 193   return c;
 194 }
 195
 196 /* Supports only one pushback character, and not '\n'.  */
 197 static inline void
 198 phase0_ungetc (int c)
 199 {
 200   if (c != EOF)
 201     ungetc (c, fp);
 202 }
 203
 204
 205 /* 1. line_number handling.  */
 206
 207 /* Maximum used, roughly a safer MB_LEN_MAX.  */
 208 #define MAX_PHASE1_PUSHBACK 16
 209 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
 210 static int phase1_pushback_length;
 211
 212 /* Read the next single byte from the input file.  */
 213 static int
 214 phase1_getc ()
 215 {
 216   int c;
 217
 218   if (phase1_pushback_length)
 219     c = phase1_pushback[--phase1_pushback_length];
 220   else
 221     c = phase0_getc ();
 222
 223   if (c == '\n')
 224     ++line_number;
 225
 226   return c;
 227 }
 228
 229 /* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
 230 static void
 231 phase1_ungetc (int c)
 232 {
 233   if (c != EOF)
 234     {
 235       if (c == '\n')
 236         --line_number;
 237
 238       if (phase1_pushback_length == SIZEOF (phase1_pushback))
 239         abort ();
 240       phase1_pushback[phase1_pushback_length++] = c;
 241     }
 242 }
 243
 244
 245 /* Phase 2: Conversion to Unicode.
 246    This is done early because PEP 0263 specifies that conversion to Unicode
 247    conceptually occurs before tokenization.  A test case where it matters
 248    is with encodings like BIG5: when a double-byte character ending in 0x5C
 249    is followed by '\' or 'u0021', the tokenizer must not treat the second
 250    half of the double-byte character as a backslash.  */
 251
 252 /* End-of-file indicator for functions returning an UCS-4 character.  */
 253 #define UEOF -1
 254
 255 static lexical_context_ty lexical_context;
 256
 257 static int phase2_pushback[max (9, UNINAME_MAX + 3)];
 258 static int phase2_pushback_length;
 259
 260 /* Read the next Unicode UCS-4 character from the input file.  */
 261 static int
 262 phase2_getc ()
 263 {
 264   if (phase2_pushback_length)
 265     return phase2_pushback[--phase2_pushback_length];
 266
 267   if (xgettext_current_source_encoding == po_charset_ascii)
 268     {
 269       int c = phase1_getc ();
 270       if (c == EOF)
 271         return UEOF;
 272       if (!c_isascii (c))
 273         {
 274           multiline_error (xstrdup (""),
 275                            xasprintf ("%s\n%s\n",
 276                                       non_ascii_error_message (lexical_context,
 277                                                                real_file_name,
 278                                                                line_number),
 279                                       _("\
 280 Please specify the source encoding through --from-code or through a comment\n\
 281 as specified in http://www.python.org/peps/pep-0263.html.\n")));
 282           exit (EXIT_FAILURE);
 283         }
 284       return c;
 285     }
 286   else if (xgettext_current_source_encoding != po_charset_utf8)
 287     {
 288 #if HAVE_ICONV
 289       /* Use iconv on an increasing number of bytes.  Read only as many bytes
 290          through phase1_getc as needed.  This is needed to give reasonable
 291          interactive behaviour when fp is connected to an interactive tty.  */
 292       unsigned char buf[MAX_PHASE1_PUSHBACK];
 293       size_t bufcount;
 294       int c = phase1_getc ();
 295       if (c == EOF)
 296         return UEOF;
 297       buf[0] = (unsigned char) c;
 298       bufcount = 1;
 299
 300       for (;;)
 301         {
 302           unsigned char scratchbuf[6];
 303           const char *inptr = (const char *) &buf[0];
 304           size_t insize = bufcount;
 305           char *outptr = (char *) &scratchbuf[0];
 306           size_t outsize = sizeof (scratchbuf);
 307
 308           size_t res = iconv (xgettext_current_source_iconv,
 309                               (ICONV_CONST char **) &inptr, &insize,
 310                               &outptr, &outsize);
 311           /* We expect that a character has been produced if and only if
 312              some input bytes have been consumed.  */
 313           if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
 314             abort ();
 315           if (outsize == sizeof (scratchbuf))
 316             {
 317               /* No character has been produced.  Must be an error.  */
 318               if (res != (size_t)(-1))
 319                 abort ();
 320
 321               if (errno == EILSEQ)
 322                 {
 323                   /* An invalid multibyte sequence was encountered.  */
 324                   multiline_error (xstrdup (""),
 325                                    xasprintf (_("\
 326 %s:%d: Invalid multibyte sequence.\n\
 327 Please specify the correct source encoding through --from-code or through a\n\
 328 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
 329                                    real_file_name, line_number));
 330                   exit (EXIT_FAILURE);
 331                 }
 332               else if (errno == EINVAL)
 333                 {
 334                   /* An incomplete multibyte character.  */
 335                   int c;
 336
 337                   if (bufcount == MAX_PHASE1_PUSHBACK)
 338                     {
 339                       /* An overlong incomplete multibyte sequence was
 340                          encountered.  */
 341                       multiline_error (xstrdup (""),
 342                                        xasprintf (_("\
 343 %s:%d: Long incomplete multibyte sequence.\n\
 344 Please specify the correct source encoding through --from-code or through a\n\
 345 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
 346                                        real_file_name, line_number));
 347                       exit (EXIT_FAILURE);
 348                     }
 349
 350                   /* Read one more byte and retry iconv.  */
 351                   c = phase1_getc ();
 352                   if (c == EOF)
 353                     {
 354                       multiline_error (xstrdup (""),
 355                                        xasprintf (_("\
 356 %s:%d: Incomplete multibyte sequence at end of file.\n\
 357 Please specify the correct source encoding through --from-code or through a\n\
 358 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
 359                                        real_file_name, line_number));
 360                       exit (EXIT_FAILURE);
 361                     }
 362                   if (c == '\n')
 363                     {
 364                       multiline_error (xstrdup (""),
 365                                        xasprintf (_("\
 366 %s:%d: Incomplete multibyte sequence at end of line.\n\
 367 Please specify the correct source encoding through --from-code or through a\n\
 368 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
 369                                        real_file_name, line_number - 1));
 370                       exit (EXIT_FAILURE);
 371                     }
 372                   buf[bufcount++] = (unsigned char) c;
 373                 }
 374               else
 375                 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
 376                        real_file_name, line_number);
 377             }
 378           else
 379             {
 380               size_t outbytes = sizeof (scratchbuf) - outsize;
 381               size_t bytes = bufcount - insize;
 382               ucs4_t uc;
 383
 384               /* We expect that one character has been produced.  */
 385               if (bytes == 0)
 386                 abort ();
 387               if (outbytes == 0)
 388                 abort ();
 389               /* Push back the unused bytes.  */
 390               while (insize > 0)
 391                 phase1_ungetc (buf[--insize]);
 392               /* Convert the character from UTF-8 to UCS-4.  */
 393               if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
 394                 {
 395                   /* scratchbuf contains an out-of-range Unicode character
 396                      (> 0x10ffff).  */
 397                   multiline_error (xstrdup (""),
 398                                    xasprintf (_("\
 399 %s:%d: Invalid multibyte sequence.\n\
 400 Please specify the source encoding through --from-code or through a comment\n\
 401 as specified in http://www.python.org/peps/pep-0263.html.\n"),
 402                                    real_file_name, line_number));
 403                   exit (EXIT_FAILURE);
 404                 }
 405               return uc;
 406             }
 407         }
 408 #else
 409       /* If we don't have iconv(), the only supported values for
 410          xgettext_global_source_encoding and thus also for
 411          xgettext_current_source_encoding are ASCII and UTF-8.  */
 412       abort ();
 413 #endif
 414     }
 415   else
 416     {
 417       /* Read an UTF-8 encoded character.  */
 418       unsigned char buf[6];
 419       unsigned int count;
 420       int c;
 421       ucs4_t uc;
 422
 423       c = phase1_getc ();
 424       if (c == EOF)
 425         return UEOF;
 426       buf[0] = c;
 427       count = 1;
 428
 429       if (buf[0] >= 0xc0)
 430         {
 431           c = phase1_getc ();
 432           if (c == EOF)
 433             return UEOF;
 434           buf[1] = c;
 435           count = 2;
 436         }
 437
 438       if (buf[0] >= 0xe0
 439           && ((buf[1] ^ 0x80) < 0x40))
 440         {
 441           c = phase1_getc ();
 442           if (c == EOF)
 443             return UEOF;
 444           buf[2] = c;
 445           count = 3;
 446         }
 447
 448       if (buf[0] >= 0xf0
 449           && ((buf[1] ^ 0x80) < 0x40)
 450           && ((buf[2] ^ 0x80) < 0x40))
 451         {
 452           c = phase1_getc ();
 453           if (c == EOF)
 454             return UEOF;
 455           buf[3] = c;
 456           count = 4;
 457         }
 458
 459       if (buf[0] >= 0xf8
 460           && ((buf[1] ^ 0x80) < 0x40)
 461           && ((buf[2] ^ 0x80) < 0x40)
 462           && ((buf[3] ^ 0x80) < 0x40))
 463         {
 464           c = phase1_getc ();
 465           if (c == EOF)
 466             return UEOF;
 467           buf[4] = c;
 468           count = 5;
 469         }
 470
 471       if (buf[0] >= 0xfc
 472           && ((buf[1] ^ 0x80) < 0x40)
 473           && ((buf[2] ^ 0x80) < 0x40)
 474           && ((buf[3] ^ 0x80) < 0x40)
 475           && ((buf[4] ^ 0x80) < 0x40))
 476         {
 477           c = phase1_getc ();
 478           if (c == EOF)
 479             return UEOF;
 480           buf[5] = c;
 481           count = 6;
 482         }
 483
 484       u8_mbtouc (&uc, buf, count);
 485       return uc;
 486     }
 487 }
 488
 489 /* Supports max (9, UNINAME_MAX + 3) pushback characters.  */
 490 static void
 491 phase2_ungetc (int c)
 492 {
 493   if (c != UEOF)
 494     {
 495       if (phase2_pushback_length == SIZEOF (phase2_pushback))
 496         abort ();
 497       phase2_pushback[phase2_pushback_length++] = c;
 498     }
 499 }
 500
 501
 502 /* ========================= Accumulating strings.  ======================== */
 503
 504 /* A string buffer type that allows appending Unicode characters.
 505    Returns the entire string in UTF-8 encoding.  */
 506
 507 struct unicode_string_buffer
 508 {
 509   /* The part of the string that has already been converted to UTF-8.  */
 510   char *utf8_buffer;
 511   size_t utf8_buflen;
 512   size_t utf8_allocated;
 513 };
 514
 515 /* Initialize a 'struct unicode_string_buffer' to empty.  */
 516 static inline void
 517 init_unicode_string_buffer (struct unicode_string_buffer *bp)
 518 {
 519   bp->utf8_buffer = NULL;
 520   bp->utf8_buflen = 0;
 521   bp->utf8_allocated = 0;
 522 }
 523
 524 /* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
 525 static inline void
 526 unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
 527                                            size_t count)
 528 {
 529   if (bp->utf8_buflen + count > bp->utf8_allocated)
 530     {
 531       size_t new_allocated = 2 * bp->utf8_allocated + 10;
 532       if (new_allocated < bp->utf8_buflen + count)
 533         new_allocated = bp->utf8_buflen + count;
 534       bp->utf8_allocated = new_allocated;
 535       bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
 536     }
 537 }
 538
 539 /* Auxiliary function: Append a Unicode character to bp->utf8.
 540    uc must be < 0x110000.  */
 541 static inline void
 542 unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
 543                                       unsigned int uc)
 544 {
 545   unsigned char utf8buf[6];
 546   int count = u8_uctomb (utf8buf, uc, 6);
 547
 548   if (count < 0)
 549     /* The caller should have ensured that uc is not out-of-range.  */
 550     abort ();
 551
 552   unicode_string_buffer_append_unicode_grow (bp, count);
 553   memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
 554   bp->utf8_buflen += count;
 555 }
 556
 557 /* Return the string buffer's contents.  */
 558 static char *
 559 unicode_string_buffer_result (struct unicode_string_buffer *bp)
 560 {
 561   /* NUL-terminate it.  */
 562   unicode_string_buffer_append_unicode_grow (bp, 1);
 563   bp->utf8_buffer[bp->utf8_buflen] = '\0';
 564   /* Return it.  */
 565   return bp->utf8_buffer;
 566 }
 567
 568 /* Free the memory pointed to by a 'struct unicode_string_buffer'.  */
 569 static inline void
 570 free_unicode_string_buffer (struct unicode_string_buffer *bp)
 571 {
 572   free (bp->utf8_buffer);
 573 }
 574
 575
 576 /* ======================== Accumulating comments.  ======================== */
 577
 578
 579 /* Accumulating a single comment line.  */
 580
 581 static struct unicode_string_buffer comment_buffer;
 582
 583 static inline void
 584 comment_start ()
 585 {
 586   lexical_context = lc_comment;
 587   comment_buffer.utf8_buflen = 0;
 588 }
 589
 590 static inline bool
 591 comment_at_start ()
 592 {
 593   return (comment_buffer.utf8_buflen == 0);
 594 }
 595
 596 static inline void
 597 comment_add (int c)
 598 {
 599   unicode_string_buffer_append_unicode (&comment_buffer, c);
 600 }
 601
 602 static inline const char *
 603 comment_line_end ()
 604 {
 605   char *buffer = unicode_string_buffer_result (&comment_buffer);
 606   size_t buflen = strlen (buffer);
 607
 608   while (buflen >= 1
 609          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 610     --buflen;
 611   buffer[buflen] = '\0';
 612   savable_comment_add (buffer);
 613   lexical_context = lc_outside;
 614   return buffer;
 615 }
 616
 617
 618 /* These are for tracking whether comments count as immediately before
 619    keyword.  */
 620 static int last_comment_line;
 621 static int last_non_comment_line;
 622
 623
 624 /* ======================== Recognizing comments.  ======================== */
 625
 626
/* Recognizing the "coding" comment.
   As specified in PEP 0263, it takes the form
     "coding" [":"|"="] {alphanumeric or "-" or "_" or "."}*
   or
     "set" "fileencoding" "=" {alphanumeric or "-" or "_" or "."}*
   and is located in a comment in a line that
     - is either the first or second line,
     - is not a continuation line,
     - in the first form, contains no other tokens except this comment.  */
 636
 637 /* Canonicalized encoding name for the current input file.  */
 638 static const char *xgettext_current_file_source_encoding;
 639
 640 #if HAVE_ICONV
 641 /* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
 642    ASCII or UTF-8, when this conversion is a no-op).  */
 643 static iconv_t xgettext_current_file_source_iconv;
 644 #endif
 645
 646 static inline void
 647 set_current_file_source_encoding (const char *canon_encoding)
 648 {
 649   xgettext_current_file_source_encoding = canon_encoding;
 650
 651   if (xgettext_current_file_source_encoding != po_charset_ascii
 652       && xgettext_current_file_source_encoding != po_charset_utf8)
 653     {
 654 #if HAVE_ICONV
 655       iconv_t cd;
 656
 657       /* Avoid glibc-2.1 bug with EUC-KR.  */
 658 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 659      && !defined _LIBICONV_VERSION
 660       if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
 661         cd = (iconv_t)(-1);
 662       else
 663 # endif
 664       cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
 665       if (cd == (iconv_t)(-1))
 666         error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
 667 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
 668 and iconv() does not support this conversion."),
 669                xgettext_current_file_source_encoding, po_charset_utf8,
 670                basename (program_name));
 671       xgettext_current_file_source_iconv = cd;
 672 #else
 673       error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
 674 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
 675 This version was built without iconv()."),
 676              xgettext_global_source_encoding, po_charset_utf8,
 677              basename (program_name));
 678 #endif
 679     }
 680
 681   xgettext_current_source_encoding = xgettext_current_file_source_encoding;
 682 #if HAVE_ICONV
 683   xgettext_current_source_iconv = xgettext_current_file_source_iconv;
 684 #endif
 685 }
 686
 687 static inline void
 688 try_to_extract_coding (const char *comment)
 689 {
 690   const char *p = c_strstr (comment, "coding");
 691
 692   if (p != NULL)
 693     {
 694       p += 6;
 695       if (*p == ':' || *p == '=')
 696         {
 697           p++;
 698           while (*p == ' ' || *p == '\t')
 699             p++;
 700           {
 701             const char *encoding_start = p;
 702
 703             while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
 704               p++;
 705             {
 706               const char *encoding_end = p;
 707
 708               if (encoding_end > encoding_start)
 709                 {
 710                   /* Extract the encoding string.  */
 711                   size_t encoding_len = encoding_end - encoding_start;
 712                   char *encoding = XNMALLOC (encoding_len + 1, char);
 713
 714                   memcpy (encoding, encoding_start, encoding_len);
 715                   encoding[encoding_len] = '\0';
 716
 717                   {
 718                     /* Canonicalize it.  */
 719                     const char *canon_encoding = po_charset_canonicalize (encoding);
 720                     if (canon_encoding == NULL)
 721                       {
 722                         error_at_line (0, 0,
 723                                        logical_file_name, line_number - 1, _("\
 724 Unknown encoding \"%s\". Proceeding with ASCII instead."),
 725                                        encoding);
 726                         canon_encoding = po_charset_ascii;
 727                       }
 728
 729                     /* Activate it.  */
 730                     set_current_file_source_encoding (canon_encoding);
 731                   }
 732
 733                   free (encoding);
 734                 }
 735             }
 736           }
 737         }
 738     }
 739 }
 740
 741 /* Tracking whether the current line is a continuation line or contains a
 742    non-blank character.  */
 743 static bool continuation_or_nonblank_line = false;
 744
 745
 746 /* Phase 3: Outside strings, replace backslash-newline with nothing and a
 747    comment with nothing.  */
 748
 749 static int
 750 phase3_getc ()
 751 {
 752   int c;
 753
 754   for (;;)
 755     {
 756       c = phase2_getc ();
 757       if (c == '\\')
 758         {
 759           c = phase2_getc ();
 760           if (c != '\n')
 761             {
 762               phase2_ungetc (c);
 763               /* This shouldn't happen usually, because "A backslash is
 764                  illegal elsewhere on a line outside a string literal."  */
 765               return '\\';
 766             }
 767           /* Eat backslash-newline.  */
 768           continuation_or_nonblank_line = true;
 769         }
 770       else if (c == '#')
 771         {
 772           /* Eat a comment.  */
 773           const char *comment;
 774
 775           last_comment_line = line_number;
 776           comment_start ();
 777           for (;;)
 778             {
 779               c = phase2_getc ();
 780               if (c == UEOF || c == '\n')
 781                 break;
 782               /* We skip all leading white space, but not EOLs.  */
 783               if (!(comment_at_start () && (c == ' ' || c == '\t')))
 784                 comment_add (c);
 785             }
 786           comment = comment_line_end ();
 787           if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
 788             try_to_extract_coding (comment);
 789           continuation_or_nonblank_line = false;
 790           return c;
 791         }
 792       else
 793         {
 794           if (c == '\n')
 795             continuation_or_nonblank_line = false;
 796           else if (!(c == ' ' || c == '\t' || c == '\f'))
 797             continuation_or_nonblank_line = true;
 798           return c;
 799         }
 800     }
 801 }
 802
 803 /* Supports only one pushback character.  */
 804 static void
 805 phase3_ungetc (int c)
 806 {
 807   phase2_ungetc (c);
 808 }
 809
 810
 811 /* ========================= Accumulating strings.  ======================== */
 812
 813 /* Return value of phase7_getuc when EOF is reached.  */
 814 #define P7_EOF (-1)
 815 #define P7_STRING_END (-2)
 816
 817 /* Convert an UTF-16 or UTF-32 code point to a return value that can be
 818    distinguished from a single-byte return value.  */
 819 #define UNICODE(code) (0x100 + (code))
 820
 821 /* Test a return value of phase7_getuc whether it designates an UTF-16 or
 822    UTF-32 code point.  */
 823 #define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
 824
 825 /* Extract the UTF-16 or UTF-32 code of a return value that satisfies
 826    IS_UNICODE.  */
 827 #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
 828
 829
 830 /* ========================== Reading of tokens.  ========================== */
 831
 832
 833 enum token_type_ty
 834 {
 835   token_type_eof,
 836   token_type_lparen,            /* ( */
 837   token_type_rparen,            /* ) */
 838   token_type_comma,             /* , */
 839   token_type_lbracket,          /* [ */
 840   token_type_rbracket,          /* ] */
 841   token_type_string,            /* "abc", 'abc', """abc""", '''abc''' */
 842   token_type_symbol,            /* symbol, number */
 843   token_type_plus,              /* + */
 844   token_type_other              /* misc. operator */
 845 };
 846 typedef enum token_type_ty token_type_ty;
 847
 848 typedef struct token_ty token_ty;
 849 struct token_ty
 850 {
 851   token_type_ty type;
 852   char *string;         /* for token_type_string, token_type_symbol */
 853   refcounted_string_list_ty *comment;   /* for token_type_string */
 854   int line_number;
 855 };
 856
 857 /* Free the memory pointed to by a 'struct token_ty'.  */
 858 static inline void
 859 free_token (token_ty *tp)
 860 {
 861   if (tp->type == token_type_string || tp->type == token_type_symbol)
 862     free (tp->string);
 863   if (tp->type == token_type_string)
 864     drop_reference (tp->comment);
 865 }
 866
 867
 868 /* There are two different input syntaxes for strings, "abc" and r"abc",
 869    and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
 870    Which escape sequences are understood, i.e. what is interpreted specially
 871    after backslash?
 872     "abc"     \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
 873     r"abc"
 874     u"abc"    \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
 875     ur"abc"                                           \unnnn
 876    The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
 877    \unnnn items.  The \ooo and \xnn values are in the current source encoding
 878    for byte strings, and Unicode code points for Unicode strings.
 879  */
 880
 881 static int
 882 phase7_getuc (int quote_char,
 883               bool triple, bool interpret_ansic, bool interpret_unicode,
 884               unsigned int *backslash_counter)
 885 {
 886   int c;
 887
 888   for (;;)
 889     {
 890       /* Use phase 2, because phase 3 elides comments.  */
 891       c = phase2_getc ();
 892
 893       if (c == UEOF)
 894         return P7_EOF;
 895
 896       if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
 897         {
 898           if (triple)
 899             {
 900               int c1 = phase2_getc ();
 901               if (c1 == quote_char)
 902                 {
 903                   int c2 = phase2_getc ();
 904                   if (c2 == quote_char)
 905                     return P7_STRING_END;
 906                   phase2_ungetc (c2);
 907                 }
 908               phase2_ungetc (c1);
 909               return UNICODE (c);
 910             }
 911           else
 912             return P7_STRING_END;
 913         }
 914
 915       if (c == '\n')
 916         {
 917           if (triple)
 918             {
 919               *backslash_counter = 0;
 920               return UNICODE ('\n');
 921             }
 922           /* In r"..." and ur"..." strings, newline is only allowed
 923              immediately after an odd number of backslashes (although the
 924              backslashes are not interpreted!).  */
 925           if (!(interpret_ansic || (*backslash_counter & 1) == 0))
 926             {
 927               *backslash_counter = 0;
 928               return UNICODE ('\n');
 929             }
 930           phase2_ungetc (c);
 931           error_with_progname = false;
 932           error (0, 0, _("%s:%d: warning: unterminated string"),
 933                  logical_file_name, line_number);
 934           error_with_progname = true;
 935           return P7_STRING_END;
 936         }
 937
 938       if (c != '\\')
 939         {
 940           *backslash_counter = 0;
 941           return UNICODE (c);
 942         }
 943
 944       /* Backslash handling.  */
 945
 946       if (!interpret_ansic && !interpret_unicode)
 947         {
 948           ++*backslash_counter;
 949           return UNICODE ('\\');
 950         }
 951
 952       /* Dispatch according to the character following the backslash.  */
 953       c = phase2_getc ();
 954       if (c == UEOF)
 955         {
 956           ++*backslash_counter;
 957           return UNICODE ('\\');
 958         }
 959
 960       if (interpret_ansic)
 961         switch (c)
 962           {
 963           case '\n':
 964             continue;
 965           case '\\':
 966             ++*backslash_counter;
 967             return UNICODE (c);
 968           case '\'': case '"':
 969             *backslash_counter = 0;
 970             return UNICODE (c);
 971           case 'a':
 972             *backslash_counter = 0;
 973             return UNICODE ('\a');
 974           case 'b':
 975             *backslash_counter = 0;
 976             return UNICODE ('\b');
 977           case 'f':
 978             *backslash_counter = 0;
 979             return UNICODE ('\f');
 980           case 'n':
 981             *backslash_counter = 0;
 982             return UNICODE ('\n');
 983           case 'r':
 984             *backslash_counter = 0;
 985             return UNICODE ('\r');
 986           case 't':
 987             *backslash_counter = 0;
 988             return UNICODE ('\t');
 989           case 'v':
 990             *backslash_counter = 0;
 991             return UNICODE ('\v');
 992           case '0': case '1': case '2': case '3': case '4':
 993           case '5': case '6': case '7':
 994             {
 995               int n = c - '0';
 996
 997               c = phase2_getc ();
 998               if (c != UEOF)
 999                 {
1000                   if (c >= '0' && c <= '7')
1001                     {
1002                       n = (n << 3) + (c - '0');
1003                       c = phase2_getc ();
1004                       if (c != UEOF)
1005                         {
1006                           if (c >= '0' && c <= '7')
1007                             n = (n << 3) + (c - '0');
1008                           else
1009                             phase2_ungetc (c);
1010                         }
1011                     }
1012                   else
1013                     phase2_ungetc (c);
1014                 }
1015               *backslash_counter = 0;
1016               if (interpret_unicode)
1017                 return UNICODE (n);
1018               else
1019                 return (unsigned char) n;
1020             }
1021           case 'x':
1022             {
1023               int c1 = phase2_getc ();
1024               int n1;
1025
1026               if (c1 >= '0' && c1 <= '9')
1027                 n1 = c1 - '0';
1028               else if (c1 >= 'A' && c1 <= 'F')
1029                 n1 = c1 - 'A' + 10;
1030               else if (c1 >= 'a' && c1 <= 'f')
1031                 n1 = c1 - 'a' + 10;
1032               else
1033                 n1 = -1;
1034
1035               if (n1 >= 0)
1036                 {
1037                   int c2 = phase2_getc ();
1038                   int n2;
1039
1040                   if (c2 >= '0' && c2 <= '9')
1041                     n2 = c2 - '0';
1042                   else if (c2 >= 'A' && c2 <= 'F')
1043                     n2 = c2 - 'A' + 10;
1044                   else if (c2 >= 'a' && c2 <= 'f')
1045                     n2 = c2 - 'a' + 10;
1046                   else
1047                     n2 = -1;
1048
1049                   if (n2 >= 0)
1050                     {
1051                       int n = (n1 << 4) + n2;
1052                       *backslash_counter = 0;
1053                       if (interpret_unicode)
1054                         return UNICODE (n);
1055                       else
1056                         return (unsigned char) n;
1057                     }
1058
1059                   phase2_ungetc (c2);
1060                 }
1061               phase2_ungetc (c1);
1062               phase2_ungetc (c);
1063               ++*backslash_counter;
1064               return UNICODE ('\\');
1065             }
1066           }
1067
1068       if (interpret_unicode)
1069         {
1070           if (c == 'u')
1071             {
1072               unsigned char buf[4];
1073               unsigned int n = 0;
1074               int i;
1075
1076               for (i = 0; i < 4; i++)
1077                 {
1078                   int c1 = phase2_getc ();
1079
1080                   if (c1 >= '0' && c1 <= '9')
1081                     n = (n << 4) + (c1 - '0');
1082                   else if (c1 >= 'A' && c1 <= 'F')
1083                     n = (n << 4) + (c1 - 'A' + 10);
1084                   else if (c1 >= 'a' && c1 <= 'f')
1085                     n = (n << 4) + (c1 - 'a' + 10);
1086                   else
1087                     {
1088                       phase2_ungetc (c1);
1089                       while (--i >= 0)
1090                         phase2_ungetc (buf[i]);
1091                       phase2_ungetc (c);
1092                       ++*backslash_counter;
1093                       return UNICODE ('\\');
1094                     }
1095
1096                   buf[i] = c1;
1097                 }
1098               *backslash_counter = 0;
1099               return UNICODE (n);
1100             }
1101
1102           if (interpret_ansic)
1103             {
1104               if (c == 'U')
1105                 {
1106                   unsigned char buf[8];
1107                   unsigned int n = 0;
1108                   int i;
1109
1110                   for (i = 0; i < 8; i++)
1111                     {
1112                       int c1 = phase2_getc ();
1113
1114                       if (c1 >= '0' && c1 <= '9')
1115                         n = (n << 4) + (c1 - '0');
1116                       else if (c1 >= 'A' && c1 <= 'F')
1117                         n = (n << 4) + (c1 - 'A' + 10);
1118                       else if (c1 >= 'a' && c1 <= 'f')
1119                         n = (n << 4) + (c1 - 'a' + 10);
1120                       else
1121                         {
1122                           phase2_ungetc (c1);
1123                           while (--i >= 0)
1124                             phase2_ungetc (buf[i]);
1125                           phase2_ungetc (c);
1126                           ++*backslash_counter;
1127                           return UNICODE ('\\');
1128                         }
1129
1130                       buf[i] = c1;
1131                     }
1132                   if (n < 0x110000)
1133                     {
1134                       *backslash_counter = 0;
1135                       return UNICODE (n);
1136                     }
1137
1138                   error_with_progname = false;
1139                   error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1140                          logical_file_name, line_number);
1141                   error_with_progname = true;
1142
1143                   while (--i >= 0)
1144                     phase2_ungetc (buf[i]);
1145                   phase2_ungetc (c);
1146                   ++*backslash_counter;
1147                   return UNICODE ('\\');
1148                 }
1149
1150               if (c == 'N')
1151                 {
1152                   int c1 = phase2_getc ();
1153                   if (c1 == '{')
1154                     {
1155                       unsigned char buf[UNINAME_MAX + 1];
1156                       int i;
1157                       unsigned int n;
1158
1159                       for (i = 0; i < UNINAME_MAX; i++)
1160                         {
1161                           int c2 = phase2_getc ();
1162                           if (!(c2 >= ' ' && c2 <= '~'))
1163                             {
1164                               phase2_ungetc (c2);
1165                               while (--i >= 0)
1166                                 phase2_ungetc (buf[i]);
1167                               phase2_ungetc (c1);
1168                               phase2_ungetc (c);
1169                               ++*backslash_counter;
1170                               return UNICODE ('\\');
1171                             }
1172                           if (c2 == '}')
1173                             break;
1174                           buf[i] = c2;
1175                         }
1176                       buf[i] = '\0';
1177
1178                       n = unicode_name_character ((char *) buf);
1179                       if (n != UNINAME_INVALID)
1180                         {
1181                           *backslash_counter = 0;
1182                           return UNICODE (n);
1183                         }
1184
1185                       phase2_ungetc ('}');
1186                       while (--i >= 0)
1187                         phase2_ungetc (buf[i]);
1188                     }
1189                   phase2_ungetc (c1);
1190                   phase2_ungetc (c);
1191                   ++*backslash_counter;
1192                   return UNICODE ('\\');
1193                 }
1194             }
1195         }
1196
1197       phase2_ungetc (c);
1198       ++*backslash_counter;
1199       return UNICODE ('\\');
1200     }
1201 }
1202
1203
1204 /* Combine characters into tokens.  Discard whitespace except newlines at
1205    the end of logical lines.  */
1206
/* Number of pending open parentheses/braces/brackets.  While it is
   positive, newlines are ignored by phase5_get (implicit line joining).  */
static int open_pbb;

/* Stack of tokens pushed back by phase5_unget, and the number of entries
   currently in use.  phase5_get pops from the top before reading new
   input.  */
static token_ty phase5_pushback[2];
static int phase5_pushback_length;
1212
1213 static void
1214 phase5_get (token_ty *tp)
1215 {
1216   int c;
1217
1218   if (phase5_pushback_length)
1219     {
1220       *tp = phase5_pushback[--phase5_pushback_length];
1221       return;
1222     }
1223
1224   for (;;)
1225     {
1226       tp->line_number = line_number;
1227       c = phase3_getc ();
1228
1229       switch (c)
1230         {
1231         case UEOF:
1232           tp->type = token_type_eof;
1233           return;
1234
1235         case ' ':
1236         case '\t':
1237         case '\f':
1238           /* Ignore whitespace and comments.  */
1239           continue;
1240
1241         case '\n':
1242           if (last_non_comment_line > last_comment_line)
1243             savable_comment_reset ();
1244           /* Ignore newline if and only if it is used for implicit line
1245              joining.  */
1246           if (open_pbb > 0)
1247             continue;
1248           tp->type = token_type_other;
1249           return;
1250         }
1251
1252       last_non_comment_line = tp->line_number;
1253
1254       switch (c)
1255         {
1256         case '.':
1257           {
1258             int c1 = phase3_getc ();
1259             phase3_ungetc (c1);
1260             if (!(c1 >= '0' && c1 <= '9'))
1261               {
1262
1263                 tp->type = token_type_other;
1264                 return;
1265               }
1266           }
1267           /* FALLTHROUGH */
1268         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1269         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1270         case 'M': case 'N': case 'O': case 'P': case 'Q':
1271         case 'S': case 'T':           case 'V': case 'W': case 'X':
1272         case 'Y': case 'Z':
1273         case '_':
1274         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1275         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1276         case 'm': case 'n': case 'o': case 'p': case 'q':
1277         case 's': case 't':           case 'v': case 'w': case 'x':
1278         case 'y': case 'z':
1279         case '0': case '1': case '2': case '3': case '4':
1280         case '5': case '6': case '7': case '8': case '9':
1281         symbol:
1282           /* Symbol, or part of a number.  */
1283           {
1284             static char *buffer;
1285             static int bufmax;
1286             int bufpos;
1287
1288             bufpos = 0;
1289             for (;;)
1290               {
1291                 if (bufpos >= bufmax)
1292                   {
1293                     bufmax = 2 * bufmax + 10;
1294                     buffer = xrealloc (buffer, bufmax);
1295                   }
1296                 buffer[bufpos++] = c;
1297                 c = phase3_getc ();
1298                 switch (c)
1299                   {
1300                   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1301                   case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1302                   case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1303                   case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1304                   case 'Y': case 'Z':
1305                   case '_':
1306                   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1307                   case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1308                   case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1309                   case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1310                   case 'y': case 'z':
1311                   case '0': case '1': case '2': case '3': case '4':
1312                   case '5': case '6': case '7': case '8': case '9':
1313                     continue;
1314                   default:
1315                     phase3_ungetc (c);
1316                     break;
1317                   }
1318                 break;
1319               }
1320             if (bufpos >= bufmax)
1321               {
1322                 bufmax = 2 * bufmax + 10;
1323                 buffer = xrealloc (buffer, bufmax);
1324               }
1325             buffer[bufpos] = '\0';
1326             tp->string = xstrdup (buffer);
1327             tp->type = token_type_symbol;
1328             return;
1329           }
1330
1331         /* Strings.  */
1332           {
1333             struct mixed_string_buffer *bp;
1334             int quote_char;
1335             bool interpret_ansic;
1336             bool interpret_unicode;
1337             bool triple;
1338             unsigned int backslash_counter;
1339
1340             case 'R': case 'r':
1341               {
1342                 int c1 = phase2_getc ();
1343                 if (c1 == '"' || c1 == '\'')
1344                   {
1345                     quote_char = c1;
1346                     interpret_ansic = false;
1347                     interpret_unicode = false;
1348                     goto string;
1349                   }
1350                 phase2_ungetc (c1);
1351                 goto symbol;
1352               }
1353
1354             case 'U': case 'u':
1355               {
1356                 int c1 = phase2_getc ();
1357                 if (c1 == '"' || c1 == '\'')
1358                   {
1359                     quote_char = c1;
1360                     interpret_ansic = true;
1361                     interpret_unicode = true;
1362                     goto string;
1363                   }
1364                 if (c1 == 'R' || c1 == 'r')
1365                   {
1366                     int c2 = phase2_getc ();
1367                     if (c2 == '"' || c2 == '\'')
1368                       {
1369                         quote_char = c2;
1370                         interpret_ansic = false;
1371                         interpret_unicode = true;
1372                         goto string;
1373                       }
1374                     phase2_ungetc (c2);
1375                   }
1376                 phase2_ungetc (c1);
1377                 goto symbol;
1378               }
1379
1380             case '"': case '\'':
1381               quote_char = c;
1382               interpret_ansic = true;
1383               interpret_unicode = false;
1384             string:
1385               triple = false;
1386               lexical_context = lc_string;
1387               {
1388                 int c1 = phase2_getc ();
1389                 if (c1 == quote_char)
1390                   {
1391                     int c2 = phase2_getc ();
1392                     if (c2 == quote_char)
1393                       triple = true;
1394                     else
1395                       {
1396                         phase2_ungetc (c2);
1397                         phase2_ungetc (c1);
1398                       }
1399                   }
1400                 else
1401                   phase2_ungetc (c1);
1402               }
1403               backslash_counter = 0;
1404               /* Start accumulating the string.  */
1405               bp = mixed_string_buffer_alloc (lexical_context,
1406                                               logical_file_name,
1407                                               line_number);
1408               for (;;)
1409                 {
1410                   int uc = phase7_getuc (quote_char, triple, interpret_ansic,
1411                                          interpret_unicode, &backslash_counter);
1412
1413                   /* Keep line_number in sync.  */
1414                   bp->line_number = line_number;
1415
1416                   if (uc == P7_EOF || uc == P7_STRING_END)
1417                     break;
1418
1419                   if (IS_UNICODE (uc))
1420                     {
1421                       assert (UNICODE_VALUE (uc) >= 0
1422                               && UNICODE_VALUE (uc) < 0x110000);
1423                       mixed_string_buffer_append_unicode (bp,
1424                                                           UNICODE_VALUE (uc));
1425                     }
1426                   else
1427                     mixed_string_buffer_append_char (bp, uc);
1428                 }
1429               tp->string = mixed_string_buffer_done (bp);
1430               tp->comment = add_reference (savable_comment);
1431               lexical_context = lc_outside;
1432               tp->type = token_type_string;
1433               return;
1434           }
1435
1436         case '(':
1437           open_pbb++;
1438           tp->type = token_type_lparen;
1439           return;
1440
1441         case ')':
1442           if (open_pbb > 0)
1443             open_pbb--;
1444           tp->type = token_type_rparen;
1445           return;
1446
1447         case ',':
1448           tp->type = token_type_comma;
1449           return;
1450
1451         case '[': case '{':
1452           open_pbb++;
1453           tp->type = (c == '[' ? token_type_lbracket : token_type_other);
1454           return;
1455
1456         case ']': case '}':
1457           if (open_pbb > 0)
1458             open_pbb--;
1459           tp->type = (c == ']' ? token_type_rbracket : token_type_other);
1460           return;
1461
1462         case '+':
1463           tp->type = token_type_plus;
1464           return;
1465
1466         default:
1467           /* We could carefully recognize each of the 2 and 3 character
1468              operators, but it is not necessary, as we only need to recognize
1469              gettext invocations.  Don't bother.  */
1470           tp->type = token_type_other;
1471           return;
1472         }
1473     }
1474 }
1475
1476 /* Supports only one pushback token.  */
1477 static void
1478 phase5_unget (token_ty *tp)
1479 {
1480   if (tp->type != token_type_eof)
1481     {
1482       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1483         abort ();
1484       phase5_pushback[phase5_pushback_length++] = *tp;
1485     }
1486 }
1487
1488
1489 /* Combine adjacent strings to form a single string.  Note that the end
1490    of a logical line appears as a token of its own, therefore strings that
1491    belong to different logical lines will not be concatenated.  */
1492
1493 static void
1494 x_python_lex (token_ty *tp)
1495 {
1496   phase5_get (tp);
1497   if (tp->type == token_type_string)
1498     {
1499       char *sum = tp->string;
1500       size_t sum_len = strlen (sum);
1501
1502       for (;;)
1503         {
1504           token_ty token2, *tp2 = NULL;
1505           token_ty token3;
1506
1507           phase5_get (&token2);
1508           switch (token2.type)
1509             {
1510             case token_type_plus:
1511               {
1512                 phase5_get (&token3);
1513                 if (token3.type == token_type_string)
1514                   {
1515                     free_token (&token2);
1516                     tp2 = &token3;
1517                   }
1518                 else
1519                   phase5_unget (&token3);
1520               }
1521               break;
1522             case token_type_string:
1523               tp2 = &token2;
1524               break;
1525             default:
1526               break;
1527             }
1528
1529           if (tp2)
1530             {
1531               char *addend = tp2->string;
1532               size_t addend_len = strlen (addend);
1533
1534               sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1535               memcpy (sum + sum_len, addend, addend_len + 1);
1536               sum_len += addend_len;
1537
1538               free_token (tp2);
1539               continue;
1540             }
1541           phase5_unget (&token2);
1542           break;
1543         }
1544       tp->string = sum;
1545     }
1546 }
1547
1548
1549 /* ========================= Extracting strings.  ========================== */
1550
1551
/* Context lookup table.  Set by extract_python from its flag_table
   argument; extract_balanced looks up each symbol in it.  */
static flag_context_list_table_ty *flag_context_list_table;
1554
1555
1556 /* The file is broken into tokens.  Scan the token stream, looking for
1557    a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Python program, and leave the complaints about
   the grammar to the Python interpreter.
1561
1562      Normal handling: Look for
1563        keyword ( ... msgid ... )
1564      Plural handling: Look for
1565        keyword ( ... msgid ... msgid_plural ... )
1566
1567    We use recursion because the arguments before msgid or between msgid
1568    and msgid_plural can contain subexpressions of the same form.  */
1569
1570
1571 /* Extract messages until the next balanced closing parenthesis or bracket.
1572    Extracted messages are added to MLP.
1573    DELIM can be either token_type_rparen or token_type_rbracket, or
1574    token_type_eof to accept both.
1575    Return true upon eof, false upon closing parenthesis or bracket.  */
1576 static bool
1577 extract_balanced (message_list_ty *mlp,
1578                   token_type_ty delim,
1579                   flag_context_ty outer_context,
1580                   flag_context_list_iterator_ty context_iter,
1581                   struct arglist_parser *argparser)
1582 {
1583   /* Current argument number.  */
1584   int arg = 1;
1585   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1586   int state;
1587   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1588   const struct callshapes *next_shapes = NULL;
1589   /* Context iterator that will be used if the next token is a '('.  */
1590   flag_context_list_iterator_ty next_context_iter =
1591     passthrough_context_list_iterator;
1592   /* Current context.  */
1593   flag_context_ty inner_context =
1594     inherited_context (outer_context,
1595                        flag_context_list_iterator_advance (&context_iter));
1596
1597   /* Start state is 0.  */
1598   state = 0;
1599
1600   for (;;)
1601     {
1602       token_ty token;
1603
1604       x_python_lex (&token);
1605       switch (token.type)
1606         {
1607         case token_type_symbol:
1608           {
1609             void *keyword_value;
1610
1611             if (hash_find_entry (&keywords, token.string, strlen (token.string),
1612                                  &keyword_value)
1613                 == 0)
1614               {
1615                 next_shapes = (const struct callshapes *) keyword_value;
1616                 state = 1;
1617               }
1618             else
1619               state = 0;
1620           }
1621           next_context_iter =
1622             flag_context_list_iterator (
1623               flag_context_list_table_lookup (
1624                 flag_context_list_table,
1625                 token.string, strlen (token.string)));
1626           free (token.string);
1627           continue;
1628
1629         case token_type_lparen:
1630           if (extract_balanced (mlp, token_type_rparen,
1631                                 inner_context, next_context_iter,
1632                                 arglist_parser_alloc (mlp,
1633                                                       state ? next_shapes : NULL)))
1634             {
1635               xgettext_current_source_encoding = po_charset_utf8;
1636               arglist_parser_done (argparser, arg);
1637               xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1638               return true;
1639             }
1640           next_context_iter = null_context_list_iterator;
1641           state = 0;
1642           continue;
1643
1644         case token_type_rparen:
1645           if (delim == token_type_rparen || delim == token_type_eof)
1646             {
1647               xgettext_current_source_encoding = po_charset_utf8;
1648               arglist_parser_done (argparser, arg);
1649               xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1650               return false;
1651             }
1652           next_context_iter = null_context_list_iterator;
1653           state = 0;
1654           continue;
1655
1656         case token_type_comma:
1657           arg++;
1658           inner_context =
1659             inherited_context (outer_context,
1660                                flag_context_list_iterator_advance (
1661                                  &context_iter));
1662           next_context_iter = passthrough_context_list_iterator;
1663           state = 0;
1664           continue;
1665
1666         case token_type_lbracket:
1667           if (extract_balanced (mlp, token_type_rbracket,
1668                                 null_context, null_context_list_iterator,
1669                                 arglist_parser_alloc (mlp, NULL)))
1670             {
1671               xgettext_current_source_encoding = po_charset_utf8;
1672               arglist_parser_done (argparser, arg);
1673               xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1674               return true;
1675             }
1676           next_context_iter = null_context_list_iterator;
1677           state = 0;
1678           continue;
1679
1680         case token_type_rbracket:
1681           if (delim == token_type_rbracket || delim == token_type_eof)
1682             {
1683               xgettext_current_source_encoding = po_charset_utf8;
1684               arglist_parser_done (argparser, arg);
1685               xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1686               return false;
1687             }
1688           next_context_iter = null_context_list_iterator;
1689           state = 0;
1690           continue;
1691
1692         case token_type_string:
1693           {
1694             lex_pos_ty pos;
1695             pos.file_name = logical_file_name;
1696             pos.line_number = token.line_number;
1697
1698             xgettext_current_source_encoding = po_charset_utf8;
1699             if (extract_all)
1700               remember_a_message (mlp, NULL, token.string, inner_context,
1701                                   &pos, NULL, token.comment);
1702             else
1703               arglist_parser_remember (argparser, arg, token.string,
1704                                        inner_context,
1705                                        pos.file_name, pos.line_number,
1706                                        token.comment);
1707             xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1708           }
1709           drop_reference (token.comment);
1710           next_context_iter = null_context_list_iterator;
1711           state = 0;
1712           continue;
1713
1714         case token_type_eof:
1715           xgettext_current_source_encoding = po_charset_utf8;
1716           arglist_parser_done (argparser, arg);
1717           xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1718           return true;
1719
1720         case token_type_plus:
1721         case token_type_other:
1722           next_context_iter = null_context_list_iterator;
1723           state = 0;
1724           continue;
1725
1726         default:
1727           abort ();
1728         }
1729     }
1730 }
1731
1732
1733 void
1734 extract_python (FILE *f,
1735                 const char *real_filename, const char *logical_filename,
1736                 flag_context_list_table_ty *flag_table,
1737                 msgdomain_list_ty *mdlp)
1738 {
1739   message_list_ty *mlp = mdlp->item[0]->messages;
1740
1741   fp = f;
1742   real_file_name = real_filename;
1743   logical_file_name = xstrdup (logical_filename);
1744   line_number = 1;
1745
1746   lexical_context = lc_outside;
1747
1748   last_comment_line = -1;
1749   last_non_comment_line = -1;
1750
1751   xgettext_current_file_source_encoding = xgettext_global_source_encoding;
1752 #if HAVE_ICONV
1753   xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1754 #endif
1755
1756   xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1757 #if HAVE_ICONV
1758   xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1759 #endif
1760
1761   continuation_or_nonblank_line = false;
1762
1763   open_pbb = 0;
1764
1765   flag_context_list_table = flag_table;
1766
1767   init_keywords ();
1768
1769   /* Eat tokens until eof is seen.  When extract_balanced returns
1770      due to an unbalanced closing parenthesis, just restart it.  */
1771   while (!extract_balanced (mlp, token_type_eof,
1772                             null_context, null_context_list_iterator,
1773                             arglist_parser_alloc (mlp, NULL)))
1774     ;
1775
1776   fp = NULL;
1777   real_file_name = NULL;
1778   logical_file_name = NULL;
1779   line_number = 0;
1780 }