gettext-tools/src/x-python.c

   1 /* xgettext Python backend.
   2    Copyright (C) 2002-2003, 2005-2013 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22
  23 /* Specification.  */
  24 #include "x-python.h"
  25
  26 #include <assert.h>
  27 #include <errno.h>
  28 #include <stdbool.h>
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32
  33 #include "message.h"
  34 #include "xgettext.h"
  35 #include "error.h"
  36 #include "error-progname.h"
  37 #include "progname.h"
  38 #include "basename.h"
  39 #include "xerror.h"
  40 #include "xvasprintf.h"
  41 #include "xalloc.h"
  42 #include "c-strstr.h"
  43 #include "c-ctype.h"
  44 #include "po-charset.h"
  45 #include "uniname.h"
  46 #include "unistr.h"
  47 #include "gettext.h"
  48
  49 #define _(s) gettext(s)
  50
  51 #define max(a,b) ((a) > (b) ? (a) : (b))
  52
  53 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  54
  55
  56 /* The Python syntax is defined in the Python Reference Manual
  57    /usr/share/doc/packages/python/html/ref/index.html.
  58    See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
  59    Python-2.0/Objects/unicodeobject.c.  */
  60
  61
  62 /* ====================== Keyword set customization.  ====================== */
  63
  64 /* If true extract all strings.  */
  65 static bool extract_all = false;
  66
  67 static hash_table keywords;
  68 static bool default_keywords = true;
  69
  70
  71 void
  72 x_python_extract_all ()
  73 {
  74   extract_all = true;
  75 }
  76
  77
  78 void
  79 x_python_keyword (const char *name)
  80 {
  81   if (name == NULL)
  82     default_keywords = false;
  83   else
  84     {
  85       const char *end;
  86       struct callshape shape;
  87       const char *colon;
  88
  89       if (keywords.table == NULL)
  90         hash_init (&keywords, 100);
  91
  92       split_keywordspec (name, &end, &shape);
  93
  94       /* The characters between name and end should form a valid C identifier.
  95          A colon means an invalid parse in split_keywordspec().  */
  96       colon = strchr (name, ':');
  97       if (colon == NULL || colon >= end)
  98         insert_keyword_callshape (&keywords, name, end - name, &shape);
  99     }
 100 }
 101
 102 /* Finish initializing the keywords hash table.
 103    Called after argument processing, before each file is processed.  */
 104 static void
 105 init_keywords ()
 106 {
 107   if (default_keywords)
 108     {
 109       /* When adding new keywords here, also update the documentation in
 110          xgettext.texi!  */
 111       x_python_keyword ("gettext");
 112       x_python_keyword ("ugettext");
 113       x_python_keyword ("dgettext:2");
 114       x_python_keyword ("ngettext:1,2");
 115       x_python_keyword ("ungettext:1,2");
 116       x_python_keyword ("dngettext:2,3");
 117       x_python_keyword ("_");
 118       default_keywords = false;
 119     }
 120 }
 121
 122 void
 123 init_flag_table_python ()
 124 {
 125   xgettext_record_flag ("gettext:1:pass-python-format");
 126   xgettext_record_flag ("ugettext:1:pass-python-format");
 127   xgettext_record_flag ("dgettext:2:pass-python-format");
 128   xgettext_record_flag ("ngettext:1:pass-python-format");
 129   xgettext_record_flag ("ngettext:2:pass-python-format");
 130   xgettext_record_flag ("ungettext:1:pass-python-format");
 131   xgettext_record_flag ("ungettext:2:pass-python-format");
 132   xgettext_record_flag ("dngettext:2:pass-python-format");
 133   xgettext_record_flag ("dngettext:3:pass-python-format");
 134   xgettext_record_flag ("_:1:pass-python-format");
 135   /* xgettext_record_flag ("%:1:python-format"); // % is an infix operator! */
 136
 137   xgettext_record_flag ("gettext:1:pass-python-brace-format");
 138   xgettext_record_flag ("ugettext:1:pass-python-brace-format");
 139   xgettext_record_flag ("dgettext:2:pass-python-brace-format");
 140   xgettext_record_flag ("ngettext:1:pass-python-brace-format");
 141   xgettext_record_flag ("ngettext:2:pass-python-brace-format");
 142   xgettext_record_flag ("ungettext:1:pass-python-brace-format");
 143   xgettext_record_flag ("ungettext:2:pass-python-brace-format");
 144   xgettext_record_flag ("dngettext:2:pass-python-brace-format");
 145   xgettext_record_flag ("dngettext:3:pass-python-brace-format");
 146   xgettext_record_flag ("_:1:pass-python-brace-format");
 147   /* xgettext_record_flag ("format:1:python-brace-format"); */
 148 }
 149
 150
 151 /* ======================== Reading of characters.  ======================== */
 152
 153 /* Real filename, used in error messages about the input file.  */
 154 static const char *real_file_name;
 155
 156 /* Logical filename and line number, used to label the extracted messages.  */
 157 static char *logical_file_name;
 158 static int line_number;
 159
 160 /* The input file stream.  */
 161 static FILE *fp;
 162
 163
 164 /* 0. Terminate line by \n, regardless whether the external
 165    representation of a line terminator is CR (Mac), and CR/LF
 166    (DOS/Windows), as Python treats them equally.  */
 167 static int
 168 phase0_getc ()
 169 {
 170   int c;
 171
 172   c = getc (fp);
 173   if (c == EOF)
 174     {
 175       if (ferror (fp))
 176         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 177                real_file_name);
 178       return EOF;
 179     }
 180
 181   if (c == '\r')
 182     {
 183       int c1 = getc (fp);
 184
 185       if (c1 != EOF && c1 != '\n')
 186         ungetc (c1, fp);
 187
 188       /* Seen line terminator CR or CR/LF.  */
 189       return '\n';
 190     }
 191
 192   return c;
 193 }
 194
 195 /* Supports only one pushback character, and not '\n'.  */
 196 static inline void
 197 phase0_ungetc (int c)
 198 {
 199   if (c != EOF)
 200     ungetc (c, fp);
 201 }
 202
 203
 204 /* 1. line_number handling.  */
 205
 206 /* Maximum used, roughly a safer MB_LEN_MAX.  */
 207 #define MAX_PHASE1_PUSHBACK 16
 208 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
 209 static int phase1_pushback_length;
 210
 211 /* Read the next single byte from the input file.  */
 212 static int
 213 phase1_getc ()
 214 {
 215   int c;
 216
 217   if (phase1_pushback_length)
 218     c = phase1_pushback[--phase1_pushback_length];
 219   else
 220     c = phase0_getc ();
 221
 222   if (c == '\n')
 223     ++line_number;
 224
 225   return c;
 226 }
 227
 228 /* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
 229 static void
 230 phase1_ungetc (int c)
 231 {
 232   if (c != EOF)
 233     {
 234       if (c == '\n')
 235         --line_number;
 236
 237       if (phase1_pushback_length == SIZEOF (phase1_pushback))
 238         abort ();
 239       phase1_pushback[phase1_pushback_length++] = c;
 240     }
 241 }
 242
 243
 244 /* Phase 2: Conversion to Unicode.
 245    This is done early because PEP 0263 specifies that conversion to Unicode
 246    conceptually occurs before tokenization.  A test case where it matters
 247    is with encodings like BIG5: when a double-byte character ending in 0x5C
 248    is followed by '\' or 'u0021', the tokenizer must not treat the second
 249    half of the double-byte character as a backslash.  */
 250
 251 /* End-of-file indicator for functions returning an UCS-4 character.  */
 252 #define UEOF -1
 253
 254 static lexical_context_ty lexical_context;
 255
 256 static int phase2_pushback[max (9, UNINAME_MAX + 3)];
 257 static int phase2_pushback_length;
 258
 259 /* Read the next Unicode UCS-4 character from the input file.  */
 260 static int
 261 phase2_getc ()
 262 {
 263   if (phase2_pushback_length)
 264     return phase2_pushback[--phase2_pushback_length];
 265
 266   if (xgettext_current_source_encoding == po_charset_ascii)
 267     {
 268       int c = phase1_getc ();
 269       if (c == EOF)
 270         return UEOF;
 271       if (!c_isascii (c))
 272         {
 273           multiline_error (xstrdup (""),
 274                            xasprintf ("%s\n%s\n",
 275                                       non_ascii_error_message (lexical_context,
 276                                                                real_file_name,
 277                                                                line_number),
 278                                       _("\
 279 Please specify the source encoding through --from-code or through a comment\n\
 280 as specified in http://www.python.org/peps/pep-0263.html.\n")));
 281           exit (EXIT_FAILURE);
 282         }
 283       return c;
 284     }
 285   else if (xgettext_current_source_encoding != po_charset_utf8)
 286     {
 287 #if HAVE_ICONV
 288       /* Use iconv on an increasing number of bytes.  Read only as many bytes
 289          through phase1_getc as needed.  This is needed to give reasonable
 290          interactive behaviour when fp is connected to an interactive tty.  */
 291       unsigned char buf[MAX_PHASE1_PUSHBACK];
 292       size_t bufcount;
 293       int c = phase1_getc ();
 294       if (c == EOF)
 295         return UEOF;
 296       buf[0] = (unsigned char) c;
 297       bufcount = 1;
 298
 299       for (;;)
 300         {
 301           unsigned char scratchbuf[6];
 302           const char *inptr = (const char *) &buf[0];
 303           size_t insize = bufcount;
 304           char *outptr = (char *) &scratchbuf[0];
 305           size_t outsize = sizeof (scratchbuf);
 306
 307           size_t res = iconv (xgettext_current_source_iconv,
 308                               (ICONV_CONST char **) &inptr, &insize,
 309                               &outptr, &outsize);
 310           /* We expect that a character has been produced if and only if
 311              some input bytes have been consumed.  */
 312           if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
 313             abort ();
 314           if (outsize == sizeof (scratchbuf))
 315             {
 316               /* No character has been produced.  Must be an error.  */
 317               if (res != (size_t)(-1))
 318                 abort ();
 319
 320               if (errno == EILSEQ)
 321                 {
 322                   /* An invalid multibyte sequence was encountered.  */
 323                   multiline_error (xstrdup (""),
 324                                    xasprintf (_("\
 325 %s:%d: Invalid multibyte sequence.\n\
 326 Please specify the correct source encoding through --from-code or through a\n\
 327 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
 328                                    real_file_name, line_number));
 329                   exit (EXIT_FAILURE);
 330                 }
 331               else if (errno == EINVAL)
 332                 {
 333                   /* An incomplete multibyte character.  */
 334                   int c;
 335
 336                   if (bufcount == MAX_PHASE1_PUSHBACK)
 337                     {
 338                       /* An overlong incomplete multibyte sequence was
 339                          encountered.  */
 340                       multiline_error (xstrdup (""),
 341                                        xasprintf (_("\
 342 %s:%d: Long incomplete multibyte sequence.\n\
 343 Please specify the correct source encoding through --from-code or through a\n\
 344 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
 345                                        real_file_name, line_number));
 346                       exit (EXIT_FAILURE);
 347                     }
 348
 349                   /* Read one more byte and retry iconv.  */
 350                   c = phase1_getc ();
 351                   if (c == EOF)
 352                     {
 353                       multiline_error (xstrdup (""),
 354                                        xasprintf (_("\
 355 %s:%d: Incomplete multibyte sequence at end of file.\n\
 356 Please specify the correct source encoding through --from-code or through a\n\
 357 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
 358                                        real_file_name, line_number));
 359                       exit (EXIT_FAILURE);
 360                     }
 361                   if (c == '\n')
 362                     {
 363                       multiline_error (xstrdup (""),
 364                                        xasprintf (_("\
 365 %s:%d: Incomplete multibyte sequence at end of line.\n\
 366 Please specify the correct source encoding through --from-code or through a\n\
 367 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
 368                                        real_file_name, line_number - 1));
 369                       exit (EXIT_FAILURE);
 370                     }
 371                   buf[bufcount++] = (unsigned char) c;
 372                 }
 373               else
 374                 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
 375                        real_file_name, line_number);
 376             }
 377           else
 378             {
 379               size_t outbytes = sizeof (scratchbuf) - outsize;
 380               size_t bytes = bufcount - insize;
 381               ucs4_t uc;
 382
 383               /* We expect that one character has been produced.  */
 384               if (bytes == 0)
 385                 abort ();
 386               if (outbytes == 0)
 387                 abort ();
 388               /* Push back the unused bytes.  */
 389               while (insize > 0)
 390                 phase1_ungetc (buf[--insize]);
 391               /* Convert the character from UTF-8 to UCS-4.  */
 392               if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
 393                 {
 394                   /* scratchbuf contains an out-of-range Unicode character
 395                      (> 0x10ffff).  */
 396                   multiline_error (xstrdup (""),
 397                                    xasprintf (_("\
 398 %s:%d: Invalid multibyte sequence.\n\
 399 Please specify the source encoding through --from-code or through a comment\n\
 400 as specified in http://www.python.org/peps/pep-0263.html.\n"),
 401                                    real_file_name, line_number));
 402                   exit (EXIT_FAILURE);
 403                 }
 404               return uc;
 405             }
 406         }
 407 #else
 408       /* If we don't have iconv(), the only supported values for
 409          xgettext_global_source_encoding and thus also for
 410          xgettext_current_source_encoding are ASCII and UTF-8.  */
 411       abort ();
 412 #endif
 413     }
 414   else
 415     {
 416       /* Read an UTF-8 encoded character.  */
 417       unsigned char buf[6];
 418       unsigned int count;
 419       int c;
 420       ucs4_t uc;
 421
 422       c = phase1_getc ();
 423       if (c == EOF)
 424         return UEOF;
 425       buf[0] = c;
 426       count = 1;
 427
 428       if (buf[0] >= 0xc0)
 429         {
 430           c = phase1_getc ();
 431           if (c == EOF)
 432             return UEOF;
 433           buf[1] = c;
 434           count = 2;
 435         }
 436
 437       if (buf[0] >= 0xe0
 438           && ((buf[1] ^ 0x80) < 0x40))
 439         {
 440           c = phase1_getc ();
 441           if (c == EOF)
 442             return UEOF;
 443           buf[2] = c;
 444           count = 3;
 445         }
 446
 447       if (buf[0] >= 0xf0
 448           && ((buf[1] ^ 0x80) < 0x40)
 449           && ((buf[2] ^ 0x80) < 0x40))
 450         {
 451           c = phase1_getc ();
 452           if (c == EOF)
 453             return UEOF;
 454           buf[3] = c;
 455           count = 4;
 456         }
 457
 458       if (buf[0] >= 0xf8
 459           && ((buf[1] ^ 0x80) < 0x40)
 460           && ((buf[2] ^ 0x80) < 0x40)
 461           && ((buf[3] ^ 0x80) < 0x40))
 462         {
 463           c = phase1_getc ();
 464           if (c == EOF)
 465             return UEOF;
 466           buf[4] = c;
 467           count = 5;
 468         }
 469
 470       if (buf[0] >= 0xfc
 471           && ((buf[1] ^ 0x80) < 0x40)
 472           && ((buf[2] ^ 0x80) < 0x40)
 473           && ((buf[3] ^ 0x80) < 0x40)
 474           && ((buf[4] ^ 0x80) < 0x40))
 475         {
 476           c = phase1_getc ();
 477           if (c == EOF)
 478             return UEOF;
 479           buf[5] = c;
 480           count = 6;
 481         }
 482
 483       u8_mbtouc (&uc, buf, count);
 484       return uc;
 485     }
 486 }
 487
 488 /* Supports max (9, UNINAME_MAX + 3) pushback characters.  */
 489 static void
 490 phase2_ungetc (int c)
 491 {
 492   if (c != UEOF)
 493     {
 494       if (phase2_pushback_length == SIZEOF (phase2_pushback))
 495         abort ();
 496       phase2_pushback[phase2_pushback_length++] = c;
 497     }
 498 }
 499
 500
 501 /* ========================= Accumulating strings.  ======================== */
 502
 503 /* A string buffer type that allows appending Unicode characters.
 504    Returns the entire string in UTF-8 encoding.  */
 505
 506 struct unicode_string_buffer
 507 {
 508   /* The part of the string that has already been converted to UTF-8.  */
 509   char *utf8_buffer;
 510   size_t utf8_buflen;
 511   size_t utf8_allocated;
 512 };
 513
 514 /* Initialize a 'struct unicode_string_buffer' to empty.  */
 515 static inline void
 516 init_unicode_string_buffer (struct unicode_string_buffer *bp)
 517 {
 518   bp->utf8_buffer = NULL;
 519   bp->utf8_buflen = 0;
 520   bp->utf8_allocated = 0;
 521 }
 522
 523 /* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
 524 static inline void
 525 unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
 526                                            size_t count)
 527 {
 528   if (bp->utf8_buflen + count > bp->utf8_allocated)
 529     {
 530       size_t new_allocated = 2 * bp->utf8_allocated + 10;
 531       if (new_allocated < bp->utf8_buflen + count)
 532         new_allocated = bp->utf8_buflen + count;
 533       bp->utf8_allocated = new_allocated;
 534       bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
 535     }
 536 }
 537
 538 /* Auxiliary function: Append a Unicode character to bp->utf8.
 539    uc must be < 0x110000.  */
 540 static inline void
 541 unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
 542                                       unsigned int uc)
 543 {
 544   unsigned char utf8buf[6];
 545   int count = u8_uctomb (utf8buf, uc, 6);
 546
 547   if (count < 0)
 548     /* The caller should have ensured that uc is not out-of-range.  */
 549     abort ();
 550
 551   unicode_string_buffer_append_unicode_grow (bp, count);
 552   memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
 553   bp->utf8_buflen += count;
 554 }
 555
 556 /* Return the string buffer's contents.  */
 557 static char *
 558 unicode_string_buffer_result (struct unicode_string_buffer *bp)
 559 {
 560   /* NUL-terminate it.  */
 561   unicode_string_buffer_append_unicode_grow (bp, 1);
 562   bp->utf8_buffer[bp->utf8_buflen] = '\0';
 563   /* Return it.  */
 564   return bp->utf8_buffer;
 565 }
 566
 567 /* Free the memory pointed to by a 'struct unicode_string_buffer'.  */
 568 static inline void
 569 free_unicode_string_buffer (struct unicode_string_buffer *bp)
 570 {
 571   free (bp->utf8_buffer);
 572 }
 573
 574
 575 /* ======================== Accumulating comments.  ======================== */
 576
 577
 578 /* Accumulating a single comment line.  */
 579
 580 static struct unicode_string_buffer comment_buffer;
 581
 582 static inline void
 583 comment_start ()
 584 {
 585   lexical_context = lc_comment;
 586   comment_buffer.utf8_buflen = 0;
 587 }
 588
 589 static inline bool
 590 comment_at_start ()
 591 {
 592   return (comment_buffer.utf8_buflen == 0);
 593 }
 594
 595 static inline void
 596 comment_add (int c)
 597 {
 598   unicode_string_buffer_append_unicode (&comment_buffer, c);
 599 }
 600
 601 static inline const char *
 602 comment_line_end ()
 603 {
 604   char *buffer = unicode_string_buffer_result (&comment_buffer);
 605   size_t buflen = strlen (buffer);
 606
 607   while (buflen >= 1
 608          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 609     --buflen;
 610   buffer[buflen] = '\0';
 611   savable_comment_add (buffer);
 612   lexical_context = lc_outside;
 613   return buffer;
 614 }
 615
 616
 617 /* These are for tracking whether comments count as immediately before
 618    keyword.  */
 619 static int last_comment_line;
 620 static int last_non_comment_line;
 621
 622
 623 /* ======================== Recognizing comments.  ======================== */
 624
 625
/* Recognizing the "coding" comment.
   As specified in PEP 0263, it takes the form
     "coding" [":"|"="] {alphanumeric or "-" or "_" or "."}*
   or
     "set" "fileencoding" "=" {alphanumeric or "-" or "_" or "."}*
   and is located in a comment in a line that
     - is either the first or second line,
     - is not a continuation line,
     - in the first form, contains no other tokens except this comment.  */
 635
 636 /* Canonicalized encoding name for the current input file.  */
 637 static const char *xgettext_current_file_source_encoding;
 638
 639 #if HAVE_ICONV
 640 /* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
 641    ASCII or UTF-8, when this conversion is a no-op).  */
 642 static iconv_t xgettext_current_file_source_iconv;
 643 #endif
 644
 645 static inline void
 646 set_current_file_source_encoding (const char *canon_encoding)
 647 {
 648   xgettext_current_file_source_encoding = canon_encoding;
 649
 650   if (xgettext_current_file_source_encoding != po_charset_ascii
 651       && xgettext_current_file_source_encoding != po_charset_utf8)
 652     {
 653 #if HAVE_ICONV
 654       iconv_t cd;
 655
 656       /* Avoid glibc-2.1 bug with EUC-KR.  */
 657 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 658      && !defined _LIBICONV_VERSION
 659       if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
 660         cd = (iconv_t)(-1);
 661       else
 662 # endif
 663       cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
 664       if (cd == (iconv_t)(-1))
 665         error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
 666 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
 667 and iconv() does not support this conversion."),
 668                xgettext_current_file_source_encoding, po_charset_utf8,
 669                basename (program_name));
 670       xgettext_current_file_source_iconv = cd;
 671 #else
 672       error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
 673 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
 674 This version was built without iconv()."),
 675              xgettext_global_source_encoding, po_charset_utf8,
 676              basename (program_name));
 677 #endif
 678     }
 679
 680   xgettext_current_source_encoding = xgettext_current_file_source_encoding;
 681 #if HAVE_ICONV
 682   xgettext_current_source_iconv = xgettext_current_file_source_iconv;
 683 #endif
 684 }
 685
 686 static inline void
 687 try_to_extract_coding (const char *comment)
 688 {
 689   const char *p = c_strstr (comment, "coding");
 690
 691   if (p != NULL)
 692     {
 693       p += 6;
 694       if (*p == ':' || *p == '=')
 695         {
 696           p++;
 697           while (*p == ' ' || *p == '\t')
 698             p++;
 699           {
 700             const char *encoding_start = p;
 701
 702             while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
 703               p++;
 704             {
 705               const char *encoding_end = p;
 706
 707               if (encoding_end > encoding_start)
 708                 {
 709                   /* Extract the encoding string.  */
 710                   size_t encoding_len = encoding_end - encoding_start;
 711                   char *encoding = XNMALLOC (encoding_len + 1, char);
 712
 713                   memcpy (encoding, encoding_start, encoding_len);
 714                   encoding[encoding_len] = '\0';
 715
 716                   {
 717                     /* Canonicalize it.  */
 718                     const char *canon_encoding = po_charset_canonicalize (encoding);
 719                     if (canon_encoding == NULL)
 720                       {
 721                         error_at_line (0, 0,
 722                                        logical_file_name, line_number - 1, _("\
 723 Unknown encoding \"%s\". Proceeding with ASCII instead."),
 724                                        encoding);
 725                         canon_encoding = po_charset_ascii;
 726                       }
 727
 728                     /* Activate it.  */
 729                     set_current_file_source_encoding (canon_encoding);
 730                   }
 731
 732                   free (encoding);
 733                 }
 734             }
 735           }
 736         }
 737     }
 738 }
 739
 740 /* Tracking whether the current line is a continuation line or contains a
 741    non-blank character.  */
 742 static bool continuation_or_nonblank_line = false;
 743
 744
 745 /* Phase 3: Outside strings, replace backslash-newline with nothing and a
 746    comment with nothing.  */
 747
 748 static int
 749 phase3_getc ()
 750 {
 751   int c;
 752
 753   for (;;)
 754     {
 755       c = phase2_getc ();
 756       if (c == '\\')
 757         {
 758           c = phase2_getc ();
 759           if (c != '\n')
 760             {
 761               phase2_ungetc (c);
 762               /* This shouldn't happen usually, because "A backslash is
 763                  illegal elsewhere on a line outside a string literal."  */
 764               return '\\';
 765             }
 766           /* Eat backslash-newline.  */
 767           continuation_or_nonblank_line = true;
 768         }
 769       else if (c == '#')
 770         {
 771           /* Eat a comment.  */
 772           const char *comment;
 773
 774           last_comment_line = line_number;
 775           comment_start ();
 776           for (;;)
 777             {
 778               c = phase2_getc ();
 779               if (c == UEOF || c == '\n')
 780                 break;
 781               /* We skip all leading white space, but not EOLs.  */
 782               if (!(comment_at_start () && (c == ' ' || c == '\t')))
 783                 comment_add (c);
 784             }
 785           comment = comment_line_end ();
 786           if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
 787             try_to_extract_coding (comment);
 788           continuation_or_nonblank_line = false;
 789           return c;
 790         }
 791       else
 792         {
 793           if (c == '\n')
 794             continuation_or_nonblank_line = false;
 795           else if (!(c == ' ' || c == '\t' || c == '\f'))
 796             continuation_or_nonblank_line = true;
 797           return c;
 798         }
 799     }
 800 }
 801
 802 /* Supports only one pushback character.  */
 803 static void
 804 phase3_ungetc (int c)
 805 {
 806   phase2_ungetc (c);
 807 }
 808
 809
 810 /* ========================= Accumulating strings.  ======================== */
 811
 812 /* Return value of phase7_getuc when EOF is reached.  */
 813 #define P7_EOF (-1)
 814 #define P7_STRING_END (-2)
 815
 816 /* Convert an UTF-16 or UTF-32 code point to a return value that can be
 817    distinguished from a single-byte return value.  */
 818 #define UNICODE(code) (0x100 + (code))
 819
 820 /* Test a return value of phase7_getuc whether it designates an UTF-16 or
 821    UTF-32 code point.  */
 822 #define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
 823
 824 /* Extract the UTF-16 or UTF-32 code of a return value that satisfies
 825    IS_UNICODE.  */
 826 #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
 827
 828 /* A string buffer type that allows appending bytes (in the
 829    xgettext_current_source_encoding) or Unicode characters.
 830    Returns the entire string in UTF-8 encoding.  */
 831
 832 struct mixed_string_buffer
 833 {
 834   /* The part of the string that has already been converted to UTF-8.  */
 835   char *utf8_buffer;
 836   size_t utf8_buflen;
 837   size_t utf8_allocated;
 838   /* The first half of an UTF-16 surrogate character.  */
 839   unsigned short utf16_surr;
 840   /* The part of the string that is still in the source encoding.  */
 841   char *curr_buffer;
 842   size_t curr_buflen;
 843   size_t curr_allocated;
 844   /* The lexical context.  Used only for error message purposes.  */
 845   lexical_context_ty lcontext;
 846 };
 847
 848 /* Initialize a 'struct mixed_string_buffer' to empty.  */
 849 static inline void
 850 init_mixed_string_buffer (struct mixed_string_buffer *bp, lexical_context_ty lcontext)
 851 {
 852   bp->utf8_buffer = NULL;
 853   bp->utf8_buflen = 0;
 854   bp->utf8_allocated = 0;
 855   bp->utf16_surr = 0;
 856   bp->curr_buffer = NULL;
 857   bp->curr_buflen = 0;
 858   bp->curr_allocated = 0;
 859   bp->lcontext = lcontext;
 860 }
 861
 862 /* Auxiliary function: Append a byte to bp->curr.  */
 863 static inline void
 864 mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c)
 865 {
 866   if (bp->curr_buflen == bp->curr_allocated)
 867     {
 868       bp->curr_allocated = 2 * bp->curr_allocated + 10;
 869       bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
 870     }
 871   bp->curr_buffer[bp->curr_buflen++] = c;
 872 }
 873
 874 /* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
 875 static inline void
 876 mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count)
 877 {
 878   if (bp->utf8_buflen + count > bp->utf8_allocated)
 879     {
 880       size_t new_allocated = 2 * bp->utf8_allocated + 10;
 881       if (new_allocated < bp->utf8_buflen + count)
 882         new_allocated = bp->utf8_buflen + count;
 883       bp->utf8_allocated = new_allocated;
 884       bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
 885     }
 886 }
 887
 888 /* Auxiliary function: Append a Unicode character to bp->utf8.
 889    uc must be < 0x110000.  */
 890 static inline void
 891 mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, ucs4_t uc)
 892 {
 893   unsigned char utf8buf[6];
 894   int count = u8_uctomb (utf8buf, uc, 6);
 895
 896   if (count < 0)
 897     /* The caller should have ensured that uc is not out-of-range.  */
 898     abort ();
 899
 900   mixed_string_buffer_append_unicode_grow (bp, count);
 901   memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
 902   bp->utf8_buflen += count;
 903 }
 904
 905 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer.  */
 906 static inline void
 907 mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
 908 {
 909   if (bp->utf16_surr != 0)
 910     {
 911       /* A half surrogate is invalid, therefore use U+FFFD instead.  */
 912       mixed_string_buffer_append_unicode (bp, 0xfffd);
 913       bp->utf16_surr = 0;
 914     }
 915 }
 916
 917 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer.  */
 918 static inline void
 919 mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno)
 920 {
 921   if (bp->curr_buflen > 0)
 922     {
 923       char *curr;
 924       size_t count;
 925
 926       mixed_string_buffer_append_byte (bp, '\0');
 927
 928       /* Convert from the source encoding to UTF-8.  */
 929       curr = from_current_source_encoding (bp->curr_buffer, bp->lcontext,
 930                                            logical_file_name, lineno);
 931
 932       /* Append it to bp->utf8_buffer.  */
 933       count = strlen (curr);
 934       mixed_string_buffer_append_unicode_grow (bp, count);
 935       memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
 936       bp->utf8_buflen += count;
 937
 938       if (curr != bp->curr_buffer)
 939         free (curr);
 940       bp->curr_buflen = 0;
 941     }
 942 }
 943
 944 /* Append a character or Unicode character to a 'struct mixed_string_buffer'.  */
 945 static void
 946 mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
 947 {
 948   if (IS_UNICODE (c))
 949     {
 950       /* Append a Unicode character.  */
 951
 952       /* Switch from multibyte character mode to Unicode character mode.  */
 953       mixed_string_buffer_flush_curr_buffer (bp, line_number);
 954
 955       /* Test whether this character and the previous one form a Unicode
 956          surrogate character pair.  */
 957       if (bp->utf16_surr != 0
 958           && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
 959         {
 960           unsigned short utf16buf[2];
 961           ucs4_t uc;
 962
 963           utf16buf[0] = bp->utf16_surr;
 964           utf16buf[1] = UNICODE_VALUE (c);
 965           if (u16_mbtouc (&uc, utf16buf, 2) != 2)
 966             abort ();
 967
 968           mixed_string_buffer_append_unicode (bp, uc);
 969           bp->utf16_surr = 0;
 970         }
 971       else
 972         {
 973           mixed_string_buffer_flush_utf16_surr (bp);
 974
 975           if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
 976             bp->utf16_surr = UNICODE_VALUE (c);
 977           else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
 978             {
 979               /* A half surrogate is invalid, therefore use U+FFFD instead.  */
 980               mixed_string_buffer_append_unicode (bp, 0xfffd);
 981             }
 982           else
 983             mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c));
 984         }
 985     }
 986   else
 987     {
 988       /* Append a single byte.  */
 989
 990       /* Switch from Unicode character mode to multibyte character mode.  */
 991       mixed_string_buffer_flush_utf16_surr (bp);
 992
 993       /* When a newline is seen, convert the accumulated multibyte sequence.
 994          This ensures a correct line number in the error message in case of
 995          a conversion error.  The "- 1" is to account for the newline.  */
 996       if (c == '\n')
 997         mixed_string_buffer_flush_curr_buffer (bp, line_number - 1);
 998
 999       mixed_string_buffer_append_byte (bp, (unsigned char) c);
1000     }
1001 }
1002
1003 /* Return the string buffer's contents.  */
1004 static char *
1005 mixed_string_buffer_result (struct mixed_string_buffer *bp)
1006 {
1007   /* Flush all into bp->utf8_buffer.  */
1008   mixed_string_buffer_flush_utf16_surr (bp);
1009   mixed_string_buffer_flush_curr_buffer (bp, line_number);
1010   /* NUL-terminate it.  */
1011   mixed_string_buffer_append_unicode_grow (bp, 1);
1012   bp->utf8_buffer[bp->utf8_buflen] = '\0';
1013   /* Return it.  */
1014   return bp->utf8_buffer;
1015 }
1016
1017 /* Free the memory pointed to by a 'struct mixed_string_buffer'.  */
1018 static inline void
1019 free_mixed_string_buffer (struct mixed_string_buffer *bp)
1020 {
1021   free (bp->utf8_buffer);
1022   free (bp->curr_buffer);
1023 }
1024
1025
1026 /* ========================== Reading of tokens.  ========================== */
1027
1028
/* The kinds of tokens distinguished by the tokenizer.  */
typedef enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_lbracket,          /* [ */
  token_type_rbracket,          /* ] */
  token_type_string,            /* "abc", 'abc', """abc""", '''abc''' */
  token_type_symbol,            /* symbol, number */
  token_type_plus,              /* + */
  token_type_other              /* misc. operator */
} token_type_ty;
1043
1044 typedef struct token_ty token_ty;
1045 struct token_ty
1046 {
1047   token_type_ty type;
1048   char *string;         /* for token_type_string, token_type_symbol */
1049   refcounted_string_list_ty *comment;   /* for token_type_string */
1050   int line_number;
1051 };
1052
1053 /* Free the memory pointed to by a 'struct token_ty'.  */
1054 static inline void
1055 free_token (token_ty *tp)
1056 {
1057   if (tp->type == token_type_string || tp->type == token_type_symbol)
1058     free (tp->string);
1059   if (tp->type == token_type_string)
1060     drop_reference (tp->comment);
1061 }
1062
1063
1064 /* There are two different input syntaxes for strings, "abc" and r"abc",
1065    and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
1066    Which escape sequences are understood, i.e. what is interpreted specially
1067    after backslash?
1068     "abc"     \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
1069     r"abc"
1070     u"abc"    \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
1071     ur"abc"                                           \unnnn
1072    The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
1073    \unnnn items.  The \ooo and \xnn values are in the current source encoding
1074    for byte strings, and Unicode code points for Unicode strings.
1075  */
1076
1077 static int
1078 phase7_getuc (int quote_char,
1079               bool triple, bool interpret_ansic, bool interpret_unicode,
1080               unsigned int *backslash_counter)
1081 {
1082   int c;
1083
1084   for (;;)
1085     {
1086       /* Use phase 2, because phase 3 elides comments.  */
1087       c = phase2_getc ();
1088
1089       if (c == UEOF)
1090         return P7_EOF;
1091
1092       if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
1093         {
1094           if (triple)
1095             {
1096               int c1 = phase2_getc ();
1097               if (c1 == quote_char)
1098                 {
1099                   int c2 = phase2_getc ();
1100                   if (c2 == quote_char)
1101                     return P7_STRING_END;
1102                   phase2_ungetc (c2);
1103                 }
1104               phase2_ungetc (c1);
1105               return UNICODE (c);
1106             }
1107           else
1108             return P7_STRING_END;
1109         }
1110
1111       if (c == '\n')
1112         {
1113           if (triple)
1114             {
1115               *backslash_counter = 0;
1116               return UNICODE ('\n');
1117             }
1118           /* In r"..." and ur"..." strings, newline is only allowed
1119              immediately after an odd number of backslashes (although the
1120              backslashes are not interpreted!).  */
1121           if (!(interpret_ansic || (*backslash_counter & 1) == 0))
1122             {
1123               *backslash_counter = 0;
1124               return UNICODE ('\n');
1125             }
1126           phase2_ungetc (c);
1127           error_with_progname = false;
1128           error (0, 0, _("%s:%d: warning: unterminated string"),
1129                  logical_file_name, line_number);
1130           error_with_progname = true;
1131           return P7_STRING_END;
1132         }
1133
1134       if (c != '\\')
1135         {
1136           *backslash_counter = 0;
1137           return UNICODE (c);
1138         }
1139
1140       /* Backslash handling.  */
1141
1142       if (!interpret_ansic && !interpret_unicode)
1143         {
1144           ++*backslash_counter;
1145           return UNICODE ('\\');
1146         }
1147
1148       /* Dispatch according to the character following the backslash.  */
1149       c = phase2_getc ();
1150       if (c == UEOF)
1151         {
1152           ++*backslash_counter;
1153           return UNICODE ('\\');
1154         }
1155
1156       if (interpret_ansic)
1157         switch (c)
1158           {
1159           case '\n':
1160             continue;
1161           case '\\':
1162             ++*backslash_counter;
1163             return UNICODE (c);
1164           case '\'': case '"':
1165             *backslash_counter = 0;
1166             return UNICODE (c);
1167           case 'a':
1168             *backslash_counter = 0;
1169             return UNICODE ('\a');
1170           case 'b':
1171             *backslash_counter = 0;
1172             return UNICODE ('\b');
1173           case 'f':
1174             *backslash_counter = 0;
1175             return UNICODE ('\f');
1176           case 'n':
1177             *backslash_counter = 0;
1178             return UNICODE ('\n');
1179           case 'r':
1180             *backslash_counter = 0;
1181             return UNICODE ('\r');
1182           case 't':
1183             *backslash_counter = 0;
1184             return UNICODE ('\t');
1185           case 'v':
1186             *backslash_counter = 0;
1187             return UNICODE ('\v');
1188           case '0': case '1': case '2': case '3': case '4':
1189           case '5': case '6': case '7':
1190             {
1191               int n = c - '0';
1192
1193               c = phase2_getc ();
1194               if (c != UEOF)
1195                 {
1196                   if (c >= '0' && c <= '7')
1197                     {
1198                       n = (n << 3) + (c - '0');
1199                       c = phase2_getc ();
1200                       if (c != UEOF)
1201                         {
1202                           if (c >= '0' && c <= '7')
1203                             n = (n << 3) + (c - '0');
1204                           else
1205                             phase2_ungetc (c);
1206                         }
1207                     }
1208                   else
1209                     phase2_ungetc (c);
1210                 }
1211               *backslash_counter = 0;
1212               if (interpret_unicode)
1213                 return UNICODE (n);
1214               else
1215                 return (unsigned char) n;
1216             }
1217           case 'x':
1218             {
1219               int c1 = phase2_getc ();
1220               int n1;
1221
1222               if (c1 >= '0' && c1 <= '9')
1223                 n1 = c1 - '0';
1224               else if (c1 >= 'A' && c1 <= 'F')
1225                 n1 = c1 - 'A' + 10;
1226               else if (c1 >= 'a' && c1 <= 'f')
1227                 n1 = c1 - 'a' + 10;
1228               else
1229                 n1 = -1;
1230
1231               if (n1 >= 0)
1232                 {
1233                   int c2 = phase2_getc ();
1234                   int n2;
1235
1236                   if (c2 >= '0' && c2 <= '9')
1237                     n2 = c2 - '0';
1238                   else if (c2 >= 'A' && c2 <= 'F')
1239                     n2 = c2 - 'A' + 10;
1240                   else if (c2 >= 'a' && c2 <= 'f')
1241                     n2 = c2 - 'a' + 10;
1242                   else
1243                     n2 = -1;
1244
1245                   if (n2 >= 0)
1246                     {
1247                       int n = (n1 << 4) + n2;
1248                       *backslash_counter = 0;
1249                       if (interpret_unicode)
1250                         return UNICODE (n);
1251                       else
1252                         return (unsigned char) n;
1253                     }
1254
1255                   phase2_ungetc (c2);
1256                 }
1257               phase2_ungetc (c1);
1258               phase2_ungetc (c);
1259               ++*backslash_counter;
1260               return UNICODE ('\\');
1261             }
1262           }
1263
1264       if (interpret_unicode)
1265         {
1266           if (c == 'u')
1267             {
1268               unsigned char buf[4];
1269               unsigned int n = 0;
1270               int i;
1271
1272               for (i = 0; i < 4; i++)
1273                 {
1274                   int c1 = phase2_getc ();
1275
1276                   if (c1 >= '0' && c1 <= '9')
1277                     n = (n << 4) + (c1 - '0');
1278                   else if (c1 >= 'A' && c1 <= 'F')
1279                     n = (n << 4) + (c1 - 'A' + 10);
1280                   else if (c1 >= 'a' && c1 <= 'f')
1281                     n = (n << 4) + (c1 - 'a' + 10);
1282                   else
1283                     {
1284                       phase2_ungetc (c1);
1285                       while (--i >= 0)
1286                         phase2_ungetc (buf[i]);
1287                       phase2_ungetc (c);
1288                       ++*backslash_counter;
1289                       return UNICODE ('\\');
1290                     }
1291
1292                   buf[i] = c1;
1293                 }
1294               *backslash_counter = 0;
1295               return UNICODE (n);
1296             }
1297
1298           if (interpret_ansic)
1299             {
1300               if (c == 'U')
1301                 {
1302                   unsigned char buf[8];
1303                   unsigned int n = 0;
1304                   int i;
1305
1306                   for (i = 0; i < 8; i++)
1307                     {
1308                       int c1 = phase2_getc ();
1309
1310                       if (c1 >= '0' && c1 <= '9')
1311                         n = (n << 4) + (c1 - '0');
1312                       else if (c1 >= 'A' && c1 <= 'F')
1313                         n = (n << 4) + (c1 - 'A' + 10);
1314                       else if (c1 >= 'a' && c1 <= 'f')
1315                         n = (n << 4) + (c1 - 'a' + 10);
1316                       else
1317                         {
1318                           phase2_ungetc (c1);
1319                           while (--i >= 0)
1320                             phase2_ungetc (buf[i]);
1321                           phase2_ungetc (c);
1322                           ++*backslash_counter;
1323                           return UNICODE ('\\');
1324                         }
1325
1326                       buf[i] = c1;
1327                     }
1328                   if (n < 0x110000)
1329                     {
1330                       *backslash_counter = 0;
1331                       return UNICODE (n);
1332                     }
1333
1334                   error_with_progname = false;
1335                   error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1336                          logical_file_name, line_number);
1337                   error_with_progname = true;
1338
1339                   while (--i >= 0)
1340                     phase2_ungetc (buf[i]);
1341                   phase2_ungetc (c);
1342                   ++*backslash_counter;
1343                   return UNICODE ('\\');
1344                 }
1345
1346               if (c == 'N')
1347                 {
1348                   int c1 = phase2_getc ();
1349                   if (c1 == '{')
1350                     {
1351                       unsigned char buf[UNINAME_MAX + 1];
1352                       int i;
1353                       unsigned int n;
1354
1355                       for (i = 0; i < UNINAME_MAX; i++)
1356                         {
1357                           int c2 = phase2_getc ();
1358                           if (!(c2 >= ' ' && c2 <= '~'))
1359                             {
1360                               phase2_ungetc (c2);
1361                               while (--i >= 0)
1362                                 phase2_ungetc (buf[i]);
1363                               phase2_ungetc (c1);
1364                               phase2_ungetc (c);
1365                               ++*backslash_counter;
1366                               return UNICODE ('\\');
1367                             }
1368                           if (c2 == '}')
1369                             break;
1370                           buf[i] = c2;
1371                         }
1372                       buf[i] = '\0';
1373
1374                       n = unicode_name_character ((char *) buf);
1375                       if (n != UNINAME_INVALID)
1376                         {
1377                           *backslash_counter = 0;
1378                           return UNICODE (n);
1379                         }
1380
1381                       phase2_ungetc ('}');
1382                       while (--i >= 0)
1383                         phase2_ungetc (buf[i]);
1384                     }
1385                   phase2_ungetc (c1);
1386                   phase2_ungetc (c);
1387                   ++*backslash_counter;
1388                   return UNICODE ('\\');
1389                 }
1390             }
1391         }
1392
1393       phase2_ungetc (c);
1394       ++*backslash_counter;
1395       return UNICODE ('\\');
1396     }
1397 }
1398
1399
1400 /* Combine characters into tokens.  Discard whitespace except newlines at
1401    the end of logical lines.  */
1402
1403 /* Number of pending open parentheses/braces/brackets.  */
1404 static int open_pbb;
1405
1406 static token_ty phase5_pushback[2];
1407 static int phase5_pushback_length;
1408
1409 static void
1410 phase5_get (token_ty *tp)
1411 {
1412   int c;
1413
1414   if (phase5_pushback_length)
1415     {
1416       *tp = phase5_pushback[--phase5_pushback_length];
1417       return;
1418     }
1419
1420   for (;;)
1421     {
1422       tp->line_number = line_number;
1423       c = phase3_getc ();
1424
1425       switch (c)
1426         {
1427         case UEOF:
1428           tp->type = token_type_eof;
1429           return;
1430
1431         case ' ':
1432         case '\t':
1433         case '\f':
1434           /* Ignore whitespace and comments.  */
1435           continue;
1436
1437         case '\n':
1438           if (last_non_comment_line > last_comment_line)
1439             savable_comment_reset ();
1440           /* Ignore newline if and only if it is used for implicit line
1441              joining.  */
1442           if (open_pbb > 0)
1443             continue;
1444           tp->type = token_type_other;
1445           return;
1446         }
1447
1448       last_non_comment_line = tp->line_number;
1449
1450       switch (c)
1451         {
1452         case '.':
1453           {
1454             int c1 = phase3_getc ();
1455             phase3_ungetc (c1);
1456             if (!(c1 >= '0' && c1 <= '9'))
1457               {
1458
1459                 tp->type = token_type_other;
1460                 return;
1461               }
1462           }
1463           /* FALLTHROUGH */
1464         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1465         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1466         case 'M': case 'N': case 'O': case 'P': case 'Q':
1467         case 'S': case 'T':           case 'V': case 'W': case 'X':
1468         case 'Y': case 'Z':
1469         case '_':
1470         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1471         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1472         case 'm': case 'n': case 'o': case 'p': case 'q':
1473         case 's': case 't':           case 'v': case 'w': case 'x':
1474         case 'y': case 'z':
1475         case '0': case '1': case '2': case '3': case '4':
1476         case '5': case '6': case '7': case '8': case '9':
1477         symbol:
1478           /* Symbol, or part of a number.  */
1479           {
1480             static char *buffer;
1481             static int bufmax;
1482             int bufpos;
1483
1484             bufpos = 0;
1485             for (;;)
1486               {
1487                 if (bufpos >= bufmax)
1488                   {
1489                     bufmax = 2 * bufmax + 10;
1490                     buffer = xrealloc (buffer, bufmax);
1491                   }
1492                 buffer[bufpos++] = c;
1493                 c = phase3_getc ();
1494                 switch (c)
1495                   {
1496                   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1497                   case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1498                   case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1499                   case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1500                   case 'Y': case 'Z':
1501                   case '_':
1502                   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1503                   case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1504                   case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1505                   case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1506                   case 'y': case 'z':
1507                   case '0': case '1': case '2': case '3': case '4':
1508                   case '5': case '6': case '7': case '8': case '9':
1509                     continue;
1510                   default:
1511                     phase3_ungetc (c);
1512                     break;
1513                   }
1514                 break;
1515               }
1516             if (bufpos >= bufmax)
1517               {
1518                 bufmax = 2 * bufmax + 10;
1519                 buffer = xrealloc (buffer, bufmax);
1520               }
1521             buffer[bufpos] = '\0';
1522             tp->string = xstrdup (buffer);
1523             tp->type = token_type_symbol;
1524             return;
1525           }
1526
1527         /* Strings.  */
1528           {
1529             struct mixed_string_buffer literal;
1530             int quote_char;
1531             bool interpret_ansic;
1532             bool interpret_unicode;
1533             bool triple;
1534             unsigned int backslash_counter;
1535
1536             case 'R': case 'r':
1537               {
1538                 int c1 = phase2_getc ();
1539                 if (c1 == '"' || c1 == '\'')
1540                   {
1541                     quote_char = c1;
1542                     interpret_ansic = false;
1543                     interpret_unicode = false;
1544                     goto string;
1545                   }
1546                 phase2_ungetc (c1);
1547                 goto symbol;
1548               }
1549
1550             case 'U': case 'u':
1551               {
1552                 int c1 = phase2_getc ();
1553                 if (c1 == '"' || c1 == '\'')
1554                   {
1555                     quote_char = c1;
1556                     interpret_ansic = true;
1557                     interpret_unicode = true;
1558                     goto string;
1559                   }
1560                 if (c1 == 'R' || c1 == 'r')
1561                   {
1562                     int c2 = phase2_getc ();
1563                     if (c2 == '"' || c2 == '\'')
1564                       {
1565                         quote_char = c2;
1566                         interpret_ansic = false;
1567                         interpret_unicode = true;
1568                         goto string;
1569                       }
1570                     phase2_ungetc (c2);
1571                   }
1572                 phase2_ungetc (c1);
1573                 goto symbol;
1574               }
1575
1576             case '"': case '\'':
1577               quote_char = c;
1578               interpret_ansic = true;
1579               interpret_unicode = false;
1580             string:
1581               triple = false;
1582               lexical_context = lc_string;
1583               {
1584                 int c1 = phase2_getc ();
1585                 if (c1 == quote_char)
1586                   {
1587                     int c2 = phase2_getc ();
1588                     if (c2 == quote_char)
1589                       triple = true;
1590                     else
1591                       {
1592                         phase2_ungetc (c2);
1593                         phase2_ungetc (c1);
1594                       }
1595                   }
1596                 else
1597                   phase2_ungetc (c1);
1598               }
1599               backslash_counter = 0;
1600               /* Start accumulating the string.  */
1601               init_mixed_string_buffer (&literal, lc_string);
1602               for (;;)
1603                 {
1604                   int uc = phase7_getuc (quote_char, triple, interpret_ansic,
1605                                          interpret_unicode, &backslash_counter);
1606
1607                   if (uc == P7_EOF || uc == P7_STRING_END)
1608                     break;
1609
1610                   if (IS_UNICODE (uc))
1611                     assert (UNICODE_VALUE (uc) >= 0
1612                             && UNICODE_VALUE (uc) < 0x110000);
1613
1614                   mixed_string_buffer_append (&literal, uc);
1615                 }
1616               tp->string = xstrdup (mixed_string_buffer_result (&literal));
1617               free_mixed_string_buffer (&literal);
1618               tp->comment = add_reference (savable_comment);
1619               lexical_context = lc_outside;
1620               tp->type = token_type_string;
1621               return;
1622           }
1623
1624         case '(':
1625           open_pbb++;
1626           tp->type = token_type_lparen;
1627           return;
1628
1629         case ')':
1630           if (open_pbb > 0)
1631             open_pbb--;
1632           tp->type = token_type_rparen;
1633           return;
1634
1635         case ',':
1636           tp->type = token_type_comma;
1637           return;
1638
1639         case '[': case '{':
1640           open_pbb++;
1641           tp->type = (c == '[' ? token_type_lbracket : token_type_other);
1642           return;
1643
1644         case ']': case '}':
1645           if (open_pbb > 0)
1646             open_pbb--;
1647           tp->type = (c == ']' ? token_type_rbracket : token_type_other);
1648           return;
1649
1650         case '+':
1651           tp->type = token_type_plus;
1652           return;
1653
1654         default:
1655           /* We could carefully recognize each of the 2 and 3 character
1656              operators, but it is not necessary, as we only need to recognize
1657              gettext invocations.  Don't bother.  */
1658           tp->type = token_type_other;
1659           return;
1660         }
1661     }
1662 }
1663
1664 /* Supports only one pushback token.  */
1665 static void
1666 phase5_unget (token_ty *tp)
1667 {
1668   if (tp->type != token_type_eof)
1669     {
1670       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1671         abort ();
1672       phase5_pushback[phase5_pushback_length++] = *tp;
1673     }
1674 }
1675
1676
1677 /* Combine adjacent strings to form a single string.  Note that the end
1678    of a logical line appears as a token of its own, therefore strings that
1679    belong to different logical lines will not be concatenated.  */
1680
1681 static void
1682 x_python_lex (token_ty *tp)
1683 {
1684   phase5_get (tp);
1685   if (tp->type == token_type_string)
1686     {
1687       char *sum = tp->string;
1688       size_t sum_len = strlen (sum);
1689
1690       for (;;)
1691         {
1692           token_ty token2, *tp2 = NULL;
1693
1694           phase5_get (&token2);
1695           switch (token2.type)
1696             {
1697             case token_type_plus:
1698               {
1699                 token_ty token3;
1700
1701                 phase5_get (&token3);
1702                 if (token3.type == token_type_string)
1703                   {
1704                     free_token (&token2);
1705                     tp2 = &token3;
1706                   }
1707                 else
1708                   phase5_unget (&token3);
1709               }
1710               break;
1711             case token_type_string:
1712               tp2 = &token2;
1713               break;
1714             default:
1715               break;
1716             }
1717
1718           if (tp2)
1719             {
1720               char *addend = tp2->string;
1721               size_t addend_len = strlen (addend);
1722
1723               sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1724               memcpy (sum + sum_len, addend, addend_len + 1);
1725               sum_len += addend_len;
1726
1727               free_token (tp2);
1728               continue;
1729             }
1730           phase5_unget (&token2);
1731           break;
1732         }
1733       tp->string = sum;
1734     }
1735 }
1736
1737
1738 /* ========================= Extracting strings.  ========================== */
1739
1740
/* Context lookup table.  extract_balanced looks up the name of every
   symbol token in it, to obtain the flag context list that applies if the
   symbol is followed by a '('.  */
static flag_context_list_table_ty *flag_context_list_table;
1743
1744
1745 /* The file is broken into tokens.  Scan the token stream, looking for
1746    a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Python program, and leave the complaints about
   the grammar to the Python interpreter.
1750
1751      Normal handling: Look for
1752        keyword ( ... msgid ... )
1753      Plural handling: Look for
1754        keyword ( ... msgid ... msgid_plural ... )
1755
1756    We use recursion because the arguments before msgid or between msgid
1757    and msgid_plural can contain subexpressions of the same form.  */
1758
1759
1760 /* Extract messages until the next balanced closing parenthesis or bracket.
1761    Extracted messages are added to MLP.
1762    DELIM can be either token_type_rparen or token_type_rbracket, or
1763    token_type_eof to accept both.
1764    Return true upon eof, false upon closing parenthesis or bracket.  */
1765 static bool
1766 extract_balanced (message_list_ty *mlp,
1767                   token_type_ty delim,
1768                   flag_context_ty outer_context,
1769                   flag_context_list_iterator_ty context_iter,
1770                   struct arglist_parser *argparser)
1771 {
1772   /* Current argument number.  */
1773   int arg = 1;
1774   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1775   int state;
1776   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1777   const struct callshapes *next_shapes = NULL;
1778   /* Context iterator that will be used if the next token is a '('.  */
1779   flag_context_list_iterator_ty next_context_iter =
1780     passthrough_context_list_iterator;
1781   /* Current context.  */
1782   flag_context_ty inner_context =
1783     inherited_context (outer_context,
1784                        flag_context_list_iterator_advance (&context_iter));
1785
1786   /* Start state is 0.  */
1787   state = 0;
1788
1789   for (;;)
1790     {
1791       token_ty token;
1792
1793       x_python_lex (&token);
1794       switch (token.type)
1795         {
1796         case token_type_symbol:
1797           {
1798             void *keyword_value;
1799
1800             if (hash_find_entry (&keywords, token.string, strlen (token.string),
1801                                  &keyword_value)
1802                 == 0)
1803               {
1804                 next_shapes = (const struct callshapes *) keyword_value;
1805                 state = 1;
1806               }
1807             else
1808               state = 0;
1809           }
1810           next_context_iter =
1811             flag_context_list_iterator (
1812               flag_context_list_table_lookup (
1813                 flag_context_list_table,
1814                 token.string, strlen (token.string)));
1815           free (token.string);
1816           continue;
1817
1818         case token_type_lparen:
1819           if (extract_balanced (mlp, token_type_rparen,
1820                                 inner_context, next_context_iter,
1821                                 arglist_parser_alloc (mlp,
1822                                                       state ? next_shapes : NULL)))
1823             {
1824               xgettext_current_source_encoding = po_charset_utf8;
1825               arglist_parser_done (argparser, arg);
1826               xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1827               return true;
1828             }
1829           next_context_iter = null_context_list_iterator;
1830           state = 0;
1831           continue;
1832
1833         case token_type_rparen:
1834           if (delim == token_type_rparen || delim == token_type_eof)
1835             {
1836               xgettext_current_source_encoding = po_charset_utf8;
1837               arglist_parser_done (argparser, arg);
1838               xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1839               return false;
1840             }
1841           next_context_iter = null_context_list_iterator;
1842           state = 0;
1843           continue;
1844
1845         case token_type_comma:
1846           arg++;
1847           inner_context =
1848             inherited_context (outer_context,
1849                                flag_context_list_iterator_advance (
1850                                  &context_iter));
1851           next_context_iter = passthrough_context_list_iterator;
1852           state = 0;
1853           continue;
1854
1855         case token_type_lbracket:
1856           if (extract_balanced (mlp, token_type_rbracket,
1857                                 null_context, null_context_list_iterator,
1858                                 arglist_parser_alloc (mlp, NULL)))
1859             {
1860               xgettext_current_source_encoding = po_charset_utf8;
1861               arglist_parser_done (argparser, arg);
1862               xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1863               return true;
1864             }
1865           next_context_iter = null_context_list_iterator;
1866           state = 0;
1867           continue;
1868
1869         case token_type_rbracket:
1870           if (delim == token_type_rbracket || delim == token_type_eof)
1871             {
1872               xgettext_current_source_encoding = po_charset_utf8;
1873               arglist_parser_done (argparser, arg);
1874               xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1875               return false;
1876             }
1877           next_context_iter = null_context_list_iterator;
1878           state = 0;
1879           continue;
1880
1881         case token_type_string:
1882           {
1883             lex_pos_ty pos;
1884             pos.file_name = logical_file_name;
1885             pos.line_number = token.line_number;
1886
1887             xgettext_current_source_encoding = po_charset_utf8;
1888             if (extract_all)
1889               remember_a_message (mlp, NULL, token.string, inner_context,
1890                                   &pos, NULL, token.comment);
1891             else
1892               arglist_parser_remember (argparser, arg, token.string,
1893                                        inner_context,
1894                                        pos.file_name, pos.line_number,
1895                                        token.comment);
1896             xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1897           }
1898           drop_reference (token.comment);
1899           next_context_iter = null_context_list_iterator;
1900           state = 0;
1901           continue;
1902
1903         case token_type_eof:
1904           xgettext_current_source_encoding = po_charset_utf8;
1905           arglist_parser_done (argparser, arg);
1906           xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1907           return true;
1908
1909         case token_type_plus:
1910         case token_type_other:
1911           next_context_iter = null_context_list_iterator;
1912           state = 0;
1913           continue;
1914
1915         default:
1916           abort ();
1917         }
1918     }
1919 }
1920
1921
1922 void
1923 extract_python (FILE *f,
1924                 const char *real_filename, const char *logical_filename,
1925                 flag_context_list_table_ty *flag_table,
1926                 msgdomain_list_ty *mdlp)
1927 {
1928   message_list_ty *mlp = mdlp->item[0]->messages;
1929
1930   fp = f;
1931   real_file_name = real_filename;
1932   logical_file_name = xstrdup (logical_filename);
1933   line_number = 1;
1934
1935   lexical_context = lc_outside;
1936
1937   last_comment_line = -1;
1938   last_non_comment_line = -1;
1939
1940   xgettext_current_file_source_encoding = xgettext_global_source_encoding;
1941 #if HAVE_ICONV
1942   xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1943 #endif
1944
1945   xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1946 #if HAVE_ICONV
1947   xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1948 #endif
1949
1950   continuation_or_nonblank_line = false;
1951
1952   open_pbb = 0;
1953
1954   flag_context_list_table = flag_table;
1955
1956   init_keywords ();
1957
1958   /* Eat tokens until eof is seen.  When extract_balanced returns
1959      due to an unbalanced closing parenthesis, just restart it.  */
1960   while (!extract_balanced (mlp, token_type_eof,
1961                             null_context, null_context_list_iterator,
1962                             arglist_parser_alloc (mlp, NULL)))
1963     ;
1964
1965   fp = NULL;
1966   real_file_name = NULL;
1967   logical_file_name = NULL;
1968   line_number = 0;
1969 }