gettext-tools/src/x-perl.c

   1 /* xgettext Perl backend.
   2    Copyright (C) 2002-2010, 2015 Free Software Foundation, Inc.
   3
   4    This file was written by Guido Flohr <guido@imperia.net>, 2002-2010.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22
  23 /* Specification.  */
  24 #include "x-perl.h"
  25
  26 #include <errno.h>
  27 #include <stdbool.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31
  32 #include "message.h"
  33 #include "xgettext.h"
  34 #include "error.h"
  35 #include "error-progname.h"
  36 #include "xalloc.h"
  37 #include "po-charset.h"
  38 #include "unistr.h"
  39 #include "uniname.h"
  40 #include "gettext.h"
  41
  42 #define _(s) gettext(s)
  43
  44 /* The Perl syntax is defined in perlsyn.pod.  Try the command
  45    "man perlsyn" or "perldoc perlsyn".
  46    Also, the syntax after the 'sub' keyword is specified in perlsub.pod.
  47    Try the command "man perlsub" or "perldoc perlsub".
  48    Perl 5.10 has new operators '//' and '//=', see
  49    <http://perldoc.perl.org/perldelta.html#Defined-or-operator>.  */
  50
  51 #define DEBUG_PERL 0
  52
  53
  54 /* ====================== Keyword set customization.  ====================== */
  55
  56 /* If true extract all strings.  */
  57 static bool extract_all = false;
  58
  59 static hash_table keywords;
  60 static bool default_keywords = true;
  61
  62
  63 void
  64 x_perl_extract_all ()
  65 {
  66   extract_all = true;
  67 }
  68
  69
  70 void
  71 x_perl_keyword (const char *name)
  72 {
  73   if (name == NULL)
  74     default_keywords = false;
  75   else
  76     {
  77       const char *end;
  78       struct callshape shape;
  79       const char *colon;
  80
  81       if (keywords.table == NULL)
  82         hash_init (&keywords, 100);
  83
  84       split_keywordspec (name, &end, &shape);
  85
  86       /* The characters between name and end should form a valid C identifier.
  87          A colon means an invalid parse in split_keywordspec().  */
  88       colon = strchr (name, ':');
  89       if (colon == NULL || colon >= end)
  90         insert_keyword_callshape (&keywords, name, end - name, &shape);
  91     }
  92 }
  93
  94 /* Finish initializing the keywords hash table.
  95    Called after argument processing, before each file is processed.  */
  96 static void
  97 init_keywords ()
  98 {
  99   if (default_keywords)
 100     {
 101       /* When adding new keywords here, also update the documentation in
 102          xgettext.texi!  */
 103       x_perl_keyword ("gettext");
 104       x_perl_keyword ("%gettext");
 105       x_perl_keyword ("$gettext");
 106       x_perl_keyword ("dgettext:2");
 107       x_perl_keyword ("dcgettext:2");
 108       x_perl_keyword ("ngettext:1,2");
 109       x_perl_keyword ("dngettext:2,3");
 110       x_perl_keyword ("dcngettext:2,3");
 111       x_perl_keyword ("gettext_noop");
 112 #if 0
 113       x_perl_keyword ("__");
 114       x_perl_keyword ("$__");
 115       x_perl_keyword ("%__");
 116       x_perl_keyword ("__x");
 117       x_perl_keyword ("__n:1,2");
 118       x_perl_keyword ("__nx:1,2");
 119       x_perl_keyword ("__xn:1,2");
 120       x_perl_keyword ("N__");
 121 #endif
 122       default_keywords = false;
 123     }
 124 }
 125
 126 void
 127 init_flag_table_perl ()
 128 {
 129   xgettext_record_flag ("gettext:1:pass-perl-format");
 130   xgettext_record_flag ("gettext:1:pass-perl-brace-format");
 131   xgettext_record_flag ("%gettext:1:pass-perl-format");
 132   xgettext_record_flag ("%gettext:1:pass-perl-brace-format");
 133   xgettext_record_flag ("$gettext:1:pass-perl-format");
 134   xgettext_record_flag ("$gettext:1:pass-perl-brace-format");
 135   xgettext_record_flag ("dgettext:2:pass-perl-format");
 136   xgettext_record_flag ("dgettext:2:pass-perl-brace-format");
 137   xgettext_record_flag ("dcgettext:2:pass-perl-format");
 138   xgettext_record_flag ("dcgettext:2:pass-perl-brace-format");
 139   xgettext_record_flag ("ngettext:1:pass-perl-format");
 140   xgettext_record_flag ("ngettext:2:pass-perl-format");
 141   xgettext_record_flag ("ngettext:1:pass-perl-brace-format");
 142   xgettext_record_flag ("ngettext:2:pass-perl-brace-format");
 143   xgettext_record_flag ("dngettext:2:pass-perl-format");
 144   xgettext_record_flag ("dngettext:3:pass-perl-format");
 145   xgettext_record_flag ("dngettext:2:pass-perl-brace-format");
 146   xgettext_record_flag ("dngettext:3:pass-perl-brace-format");
 147   xgettext_record_flag ("dcngettext:2:pass-perl-format");
 148   xgettext_record_flag ("dcngettext:3:pass-perl-format");
 149   xgettext_record_flag ("dcngettext:2:pass-perl-brace-format");
 150   xgettext_record_flag ("dcngettext:3:pass-perl-brace-format");
 151   xgettext_record_flag ("gettext_noop:1:pass-perl-format");
 152   xgettext_record_flag ("gettext_noop:1:pass-perl-brace-format");
 153   xgettext_record_flag ("printf:1:perl-format"); /* argument 1 or 2 ?? */
 154   xgettext_record_flag ("sprintf:1:perl-format");
 155 #if 0
 156   xgettext_record_flag ("__:1:pass-perl-format");
 157   xgettext_record_flag ("__:1:pass-perl-brace-format");
 158   xgettext_record_flag ("%__:1:pass-perl-format");
 159   xgettext_record_flag ("%__:1:pass-perl-brace-format");
 160   xgettext_record_flag ("$__:1:pass-perl-format");
 161   xgettext_record_flag ("$__:1:pass-perl-brace-format");
 162   xgettext_record_flag ("__x:1:perl-brace-format");
 163   xgettext_record_flag ("__n:1:pass-perl-format");
 164   xgettext_record_flag ("__n:2:pass-perl-format");
 165   xgettext_record_flag ("__n:1:pass-perl-brace-format");
 166   xgettext_record_flag ("__n:2:pass-perl-brace-format");
 167   xgettext_record_flag ("__nx:1:perl-brace-format");
 168   xgettext_record_flag ("__nx:2:perl-brace-format");
 169   xgettext_record_flag ("__xn:1:perl-brace-format");
 170   xgettext_record_flag ("__xn:2:perl-brace-format");
 171   xgettext_record_flag ("N__:1:pass-perl-format");
 172   xgettext_record_flag ("N__:1:pass-perl-brace-format");
 173 #endif
 174 }
 175
 176
 177 /* ======================== Reading of characters.  ======================== */
 178
 179 /* Real filename, used in error messages about the input file.  */
 180 static const char *real_file_name;
 181
 182 /* Logical filename and line number, used to label the extracted messages.  */
 183 static char *logical_file_name;
 184 static int line_number;
 185
 186 /* The input file stream.  */
 187 static FILE *fp;
 188
 189 /* The current line buffer.  */
 190 static char *linebuf;
 191
 192 /* The size of the current line.  */
 193 static int linesize;
 194
 195 /* The position in the current line.  */
 196 static int linepos;
 197
 198 /* The size of the input buffer.  */
 199 static size_t linebuf_size;
 200
 201 /* Number of lines eaten for here documents.  */
 202 static int eaten_here;
 203
 204 /* Paranoia: EOF marker for __END__ or __DATA__.  */
 205 static bool end_of_file;
 206
 207
 208 /* 1. line_number handling.  */
 209
 210 /* Returns the next character from the input stream or EOF.  */
 211 static int
 212 phase1_getc ()
 213 {
 214   line_number += eaten_here;
 215   eaten_here = 0;
 216
 217   if (end_of_file)
 218     return EOF;
 219
 220   if (linepos >= linesize)
 221     {
 222       linesize = getline (&linebuf, &linebuf_size, fp);
 223
 224       if (linesize < 0)
 225         {
 226           if (ferror (fp))
 227             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 228                    real_file_name);
 229           end_of_file = true;
 230           return EOF;
 231         }
 232
 233       linepos = 0;
 234       ++line_number;
 235
 236       /* Undosify.  This is important for catching the end of <<EOF and
 237          <<'EOF'.  We could rely on stdio doing this for us but
 238          it is not uncommon to to come across Perl scripts with CRLF
 239          newline conventions on systems that do not follow this
 240          convention.  */
 241       if (linesize >= 2 && linebuf[linesize - 1] == '\n'
 242           && linebuf[linesize - 2] == '\r')
 243         {
 244           linebuf[linesize - 2] = '\n';
 245           linebuf[linesize - 1] = '\0';
 246           --linesize;
 247         }
 248     }
 249
 250   return linebuf[linepos++];
 251 }
 252
 253 /* Supports only one pushback character.  */
 254 static void
 255 phase1_ungetc (int c)
 256 {
 257   if (c != EOF)
 258     {
 259       if (linepos == 0)
 260         /* Attempt to ungetc across line boundary.  Shouldn't happen.
 261            No two phase1_ungetc calls are permitted in a row.  */
 262         abort ();
 263
 264       --linepos;
 265     }
 266 }
 267
 268 /* Read a here document and return its contents.
 269    The delimiter is an UTF-8 encoded string; the resulting string is UTF-8
 270    encoded as well.  */
 271
 272 static char *
 273 get_here_document (const char *delimiter)
 274 {
 275   /* Accumulator for the entire here document, including a NUL byte
 276      at the end.  */
 277   static char *buffer;
 278   static size_t bufmax = 0;
 279   size_t bufpos = 0;
 280   /* Current line being appended.  */
 281   static char *my_linebuf = NULL;
 282   static size_t my_linebuf_size = 0;
 283
 284   /* Allocate the initial buffer.  Later on, bufmax > 0.  */
 285   if (bufmax == 0)
 286     {
 287       buffer = XNMALLOC (1, char);
 288       buffer[0] = '\0';
 289       bufmax = 1;
 290     }
 291
 292   for (;;)
 293     {
 294       int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
 295       char *my_line_utf8;
 296       bool chomp;
 297
 298       if (read_bytes < 0)
 299         {
 300           if (ferror (fp))
 301             {
 302               error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 303                      real_file_name);
 304             }
 305           else
 306             {
 307               error_with_progname = false;
 308               error (EXIT_SUCCESS, 0, _("\
 309 %s:%d: can't find string terminator \"%s\" anywhere before EOF"),
 310                      real_file_name, line_number, delimiter);
 311               error_with_progname = true;
 312
 313               break;
 314             }
 315         }
 316
 317       ++eaten_here;
 318
 319       /* Convert to UTF-8.  */
 320       my_line_utf8 =
 321         from_current_source_encoding (my_linebuf, lc_string, logical_file_name,
 322                                       line_number + eaten_here);
 323       if (my_line_utf8 != my_linebuf)
 324         {
 325           if (strlen (my_line_utf8) >= my_linebuf_size)
 326             {
 327               my_linebuf_size = strlen (my_line_utf8) + 1;
 328               my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
 329             }
 330           strcpy (my_linebuf, my_line_utf8);
 331           free (my_line_utf8);
 332         }
 333
 334       /* Undosify.  This is important for catching the end of <<EOF and
 335          <<'EOF'.  We could rely on stdio doing this for us but you
 336          it is not uncommon to to come across Perl scripts with CRLF
 337          newline conventions on systems that do not follow this
 338          convention.  */
 339       if (read_bytes >= 2 && my_linebuf[read_bytes - 1] == '\n'
 340           && my_linebuf[read_bytes - 2] == '\r')
 341         {
 342           my_linebuf[read_bytes - 2] = '\n';
 343           my_linebuf[read_bytes - 1] = '\0';
 344           --read_bytes;
 345         }
 346
 347       /* Temporarily remove the trailing newline from my_linebuf.  */
 348       chomp = false;
 349       if (read_bytes >= 1 && my_linebuf[read_bytes - 1] == '\n')
 350         {
 351           chomp = true;
 352           my_linebuf[read_bytes - 1] = '\0';
 353         }
 354
 355       /* See whether this line terminates the here document.  */
 356       if (strcmp (my_linebuf, delimiter) == 0)
 357         break;
 358
 359       /* Add back the trailing newline to my_linebuf.  */
 360       if (chomp)
 361         my_linebuf[read_bytes - 1] = '\n';
 362
 363       /* Ensure room for read_bytes + 1 bytes.  */
 364       if (bufpos + read_bytes >= bufmax)
 365         {
 366           do
 367             bufmax = 2 * bufmax + 10;
 368           while (bufpos + read_bytes >= bufmax);
 369           buffer = xrealloc (buffer, bufmax);
 370         }
 371       /* Append this line to the accumulator.  */
 372       strcpy (buffer + bufpos, my_linebuf);
 373       bufpos += read_bytes;
 374     }
 375
 376   /* Done accumulating the here document.  */
 377   return xstrdup (buffer);
 378 }
 379
 380 /* Skips pod sections.  */
 381 static void
 382 skip_pod ()
 383 {
 384   line_number += eaten_here;
 385   eaten_here = 0;
 386   linepos = 0;
 387
 388   for (;;)
 389     {
 390       linesize = getline (&linebuf, &linebuf_size, fp);
 391
 392       if (linesize < 0)
 393         {
 394           if (ferror (fp))
 395             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 396                    real_file_name);
 397           return;
 398         }
 399
 400       ++line_number;
 401
 402       if (strncmp ("=cut", linebuf, 4) == 0)
 403         {
 404           /* Force reading of a new line on next call to phase1_getc().  */
 405           linepos = linesize;
 406           return;
 407         }
 408     }
 409 }
 410
 411
 412 /* These are for tracking whether comments count as immediately before
 413    keyword.  */
 414 static int last_comment_line;
 415 static int last_non_comment_line;
 416
 417
 418 /* 2. Replace each comment that is not inside a string literal or regular
 419    expression with a newline character.  We need to remember the comment
 420    for later, because it may be attached to a keyword string.  */
 421
 422 static int
 423 phase2_getc ()
 424 {
 425   static char *buffer;
 426   static size_t bufmax;
 427   size_t buflen;
 428   int lineno;
 429   int c;
 430   char *utf8_string;
 431
 432   c = phase1_getc ();
 433   if (c == '#')
 434     {
 435       buflen = 0;
 436       lineno = line_number;
 437       /* Skip leading whitespace.  */
 438       for (;;)
 439         {
 440           c = phase1_getc ();
 441           if (c == EOF)
 442             break;
 443           if (c != ' ' && c != '\t' && c != '\r' && c != '\f')
 444             {
 445               phase1_ungetc (c);
 446               break;
 447             }
 448         }
 449       /* Accumulate the comment.  */
 450       for (;;)
 451         {
 452           c = phase1_getc ();
 453           if (c == '\n' || c == EOF)
 454             break;
 455           if (buflen >= bufmax)
 456             {
 457               bufmax = 2 * bufmax + 10;
 458               buffer = xrealloc (buffer, bufmax);
 459             }
 460           buffer[buflen++] = c;
 461         }
 462       if (buflen >= bufmax)
 463         {
 464           bufmax = 2 * bufmax + 10;
 465           buffer = xrealloc (buffer, bufmax);
 466         }
 467       buffer[buflen] = '\0';
 468       /* Convert it to UTF-8.  */
 469       utf8_string =
 470         from_current_source_encoding (buffer, lc_comment, logical_file_name,
 471                                       lineno);
 472       /* Save it until we encounter the corresponding string.  */
 473       savable_comment_add (utf8_string);
 474       last_comment_line = lineno;
 475     }
 476   return c;
 477 }
 478
 479 /* Supports only one pushback character.  */
 480 static void
 481 phase2_ungetc (int c)
 482 {
 483   if (c != EOF)
 484     phase1_ungetc (c);
 485 }
 486
 487 /* Whitespace recognition.  */
 488
 489 #define case_whitespace \
 490   case ' ': case '\t': case '\r': case '\n': case '\f'
 491
 492 static inline bool
 493 is_whitespace (int c)
 494 {
 495   return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f');
 496 }
 497
 498
 499 /* ========================== Reading of tokens.  ========================== */
 500
 501
 502 enum token_type_ty
 503 {
 504   token_type_eof,
 505   token_type_lparen,            /* ( */
 506   token_type_rparen,            /* ) */
 507   token_type_comma,             /* , */
 508   token_type_fat_comma,         /* => */
 509   token_type_dereference,       /* -> */
 510   token_type_semicolon,         /* ; */
 511   token_type_lbrace,            /* { */
 512   token_type_rbrace,            /* } */
 513   token_type_lbracket,          /* [ */
 514   token_type_rbracket,          /* ] */
 515   token_type_string,            /* quote-like */
 516   token_type_number,            /* starting with a digit o dot */
 517   token_type_named_op,          /* if, unless, while, ... */
 518   token_type_variable,          /* $... */
 519   token_type_object,            /* A dereferenced variable, maybe a blessed
 520                                    object.  */
 521   token_type_symbol,            /* symbol, number */
 522   token_type_regex_op,          /* s, tr, y, m.  */
 523   token_type_dot,               /* . */
 524   token_type_other,             /* regexp, misc. operator */
 525   /* The following are not really token types, but variants used by
 526      the parser.  */
 527   token_type_keyword_symbol     /* keyword symbol */
 528 };
 529 typedef enum token_type_ty token_type_ty;
 530
 531 /* Subtypes for strings, important for interpolation.  */
 532 enum string_type_ty
 533 {
 534   string_type_verbatim,     /* "<<'EOF'", "m'...'", "s'...''...'",
 535                                "tr/.../.../", "y/.../.../".  */
 536   string_type_q,            /* "'..'", "q/.../".  */
 537   string_type_qq,           /* '"..."', "`...`", "qq/.../", "qx/.../",
 538                                "<file*glob>".  */
 539   string_type_qr            /* Not supported.  */
 540 };
 541
 542 /* Subtypes for symbols, important for dollar interpretation.  */
 543 enum symbol_type_ty
 544 {
 545   symbol_type_none,         /* Nothing special.  */
 546   symbol_type_sub,          /* 'sub'.  */
 547   symbol_type_function      /* Function name after 'sub'.  */
 548 };
 549
 550 typedef struct token_ty token_ty;
 551 struct token_ty
 552 {
 553   token_type_ty type;
 554   token_type_ty last_type;
 555   int sub_type;                 /* for token_type_string, token_type_symbol */
 556   char *string;                 /* for:                 in encoding:
 557                                    token_type_named_op  ASCII
 558                                    token_type_string    UTF-8
 559                                    token_type_symbol    ASCII
 560                                    token_type_variable  global_source_encoding
 561                                    token_type_object    global_source_encoding
 562                                  */
 563   refcounted_string_list_ty *comment; /* for token_type_string */
 564   int line_number;
 565 };
 566
 567 #if DEBUG_PERL
 568 static const char *
 569 token2string (const token_ty *token)
 570 {
 571   switch (token->type)
 572     {
 573     case token_type_eof:
 574       return "token_type_eof";
 575     case token_type_lparen:
 576       return "token_type_lparen";
 577     case token_type_rparen:
 578       return "token_type_rparen";
 579     case token_type_comma:
 580       return "token_type_comma";
 581     case token_type_fat_comma:
 582       return "token_type_fat_comma";
 583     case token_type_dereference:
 584       return "token_type_dereference";
 585     case token_type_semicolon:
 586       return "token_type_semicolon";
 587     case token_type_lbrace:
 588       return "token_type_lbrace";
 589     case token_type_rbrace:
 590       return "token_type_rbrace";
 591     case token_type_lbracket:
 592       return "token_type_lbracket";
 593     case token_type_rbracket:
 594       return "token_type_rbracket";
 595     case token_type_string:
 596       return "token_type_string";
 597     case token_type_number:
 598       return "token type number";
 599     case token_type_named_op:
 600       return "token_type_named_op";
 601     case token_type_variable:
 602       return "token_type_variable";
 603     case token_type_object:
 604       return "token_type_object";
 605     case token_type_symbol:
 606       return "token_type_symbol";
 607     case token_type_regex_op:
 608       return "token_type_regex_op";
 609     case token_type_dot:
 610       return "token_type_dot";
 611     case token_type_other:
 612       return "token_type_other";
 613     default:
 614       return "unknown";
 615     }
 616 }
 617 #endif
 618
 619 /* Free the memory pointed to by a 'struct token_ty'.  */
 620 static inline void
 621 free_token (token_ty *tp)
 622 {
 623   switch (tp->type)
 624     {
 625     case token_type_named_op:
 626     case token_type_string:
 627     case token_type_symbol:
 628     case token_type_variable:
 629     case token_type_object:
 630       free (tp->string);
 631       break;
 632     default:
 633       break;
 634     }
 635   if (tp->type == token_type_string)
 636     drop_reference (tp->comment);
 637   free (tp);
 638 }
 639
 640 /* Pass 1 of extracting quotes: Find the end of the string, regardless
 641    of the semantics of the construct.  Return the complete string,
 642    including the starting and the trailing delimiter, with backslashes
 643    removed where appropriate.  */
 644 static char *
 645 extract_quotelike_pass1 (int delim)
 646 {
 647   /* This function is called recursively.  No way to allocate stuff
 648      statically.  Also alloca() is inappropriate due to limited stack
 649      size on some platforms.  So we use malloc().  */
 650   int bufmax = 10;
 651   char *buffer = XNMALLOC (bufmax, char);
 652   int bufpos = 0;
 653   bool nested = true;
 654   int counter_delim;
 655
 656   buffer[bufpos++] = delim;
 657
 658   /* Find the closing delimiter.  */
 659   switch (delim)
 660     {
 661     case '(':
 662       counter_delim = ')';
 663       break;
 664     case '{':
 665       counter_delim = '}';
 666       break;
 667     case '[':
 668       counter_delim = ']';
 669       break;
 670     case '<':
 671       counter_delim = '>';
 672       break;
 673     default: /* "..." or '...' or |...| etc. */
 674       nested = false;
 675       counter_delim = delim;
 676       break;
 677     }
 678
 679   for (;;)
 680     {
 681       int c = phase1_getc ();
 682
 683       /* This round can produce 1 or 2 bytes.  Ensure room for 2 bytes.  */
 684       if (bufpos + 2 > bufmax)
 685         {
 686           bufmax = 2 * bufmax + 10;
 687           buffer = xrealloc (buffer, bufmax);
 688         }
 689
 690       if (c == counter_delim || c == EOF)
 691         {
 692           buffer[bufpos++] = counter_delim; /* will be stripped off later */
 693           buffer[bufpos++] = '\0';
 694 #if DEBUG_PERL
 695           fprintf (stderr, "PASS1: %s\n", buffer);
 696 #endif
 697           return buffer;
 698         }
 699
 700       if (nested && c == delim)
 701         {
 702           char *inner = extract_quotelike_pass1 (delim);
 703           size_t len = strlen (inner);
 704
 705           /* Ensure room for len + 1 bytes.  */
 706           if (bufpos + len >= bufmax)
 707             {
 708               do
 709                 bufmax = 2 * bufmax + 10;
 710               while (bufpos + len >= bufmax);
 711               buffer = xrealloc (buffer, bufmax);
 712             }
 713           strcpy (buffer + bufpos, inner);
 714           free (inner);
 715           bufpos += len;
 716         }
 717       else if (c == '\\')
 718         {
 719           c = phase1_getc ();
 720           if (c == '\\')
 721             {
 722               buffer[bufpos++] = '\\';
 723               buffer[bufpos++] = '\\';
 724             }
 725           else if (c == delim || c == counter_delim)
 726             {
 727               /* This is pass2 in Perl.  */
 728               buffer[bufpos++] = c;
 729             }
 730           else
 731             {
 732               buffer[bufpos++] = '\\';
 733               phase1_ungetc (c);
 734             }
 735         }
 736       else
 737         {
 738           buffer[bufpos++] = c;
 739         }
 740     }
 741 }
 742
 743 /* Like extract_quotelike_pass1, but return the complete string in UTF-8
 744    encoding.  */
 745 static char *
 746 extract_quotelike_pass1_utf8 (int delim)
 747 {
 748   char *string = extract_quotelike_pass1 (delim);
 749   char *utf8_string =
 750     from_current_source_encoding (string, lc_string, logical_file_name,
 751                                   line_number);
 752   if (utf8_string != string)
 753     free (string);
 754   return utf8_string;
 755 }
 756
 757
 758 /* ========= Reading of tokens and commands.  Extracting strings.  ========= */
 759
 760
 761 /* Context lookup table.  */
 762 static flag_context_list_table_ty *flag_context_list_table;
 763
 764
 765 /* Forward declaration of local functions.  */
 766 static void interpolate_keywords (message_list_ty *mlp, const char *string,
 767                                   int lineno);
 768 static token_ty *x_perl_lex (message_list_ty *mlp);
 769 static void x_perl_unlex (token_ty *tp);
 770 static bool extract_balanced (message_list_ty *mlp,
 771                               token_type_ty delim, bool eat_delim,
 772                               bool comma_delim,
 773                               flag_context_ty outer_context,
 774                               flag_context_list_iterator_ty context_iter,
 775                               int arg, struct arglist_parser *argparser);
 776
 777
 778 /* Extract an unsigned hexadecimal number from STRING, considering at
 779    most LEN bytes and place the result in *RESULT.  Returns a pointer
 780    to the first character past the hexadecimal number.  */
 781 static const char *
 782 extract_hex (const char *string, size_t len, unsigned int *result)
 783 {
 784   size_t i;
 785
 786   *result = 0;
 787
 788   for (i = 0; i < len; i++)
 789     {
 790       char c = string[i];
 791       int number;
 792
 793       if (c >= 'A' && c <= 'F')
 794         number = c - 'A' + 10;
 795       else if (c >= 'a' && c <= 'f')
 796         number = c - 'a' + 10;
 797       else if (c >= '0' && c <= '9')
 798         number = c - '0';
 799       else
 800         break;
 801
 802       *result <<= 4;
 803       *result |= number;
 804     }
 805
 806   return string + i;
 807 }
 808
 809 /* Extract an unsigned octal number from STRING, considering at
 810    most LEN bytes and place the result in *RESULT.  Returns a pointer
 811    to the first character past the octal number.  */
 812 static const char *
 813 extract_oct (const char *string, size_t len, unsigned int *result)
 814 {
 815   size_t i;
 816
 817   *result = 0;
 818
 819   for (i = 0; i < len; i++)
 820     {
 821       char c = string[i];
 822       int number;
 823
 824       if (c >= '0' && c <= '7')
 825         number = c - '0';
 826       else
 827         break;
 828
 829       *result <<= 3;
 830       *result |= number;
 831     }
 832
 833   return string + i;
 834 }
 835
 836 /* Extract the various quotelike constructs except for <<EOF.  See the
 837    section "Gory details of parsing quoted constructs" in perlop.pod.
 838    Return the resulting token in *tp; tp->type == token_type_string.  */
 839 static void
 840 extract_quotelike (token_ty *tp, int delim)
 841 {
 842   char *string = extract_quotelike_pass1_utf8 (delim);
 843   size_t len = strlen (string);
 844
 845   tp->type = token_type_string;
 846   /* Take the string without the delimiters at the start and at the end.  */
 847   if (!(len >= 2))
 848     abort ();
 849   string[len - 1] = '\0';
 850   tp->string = xstrdup (string + 1);
 851   free (string);
 852   tp->comment = add_reference (savable_comment);
 853 }
 854
 855 /* Extract the quotelike constructs with double delimiters, like
 856    s/[SEARCH]/[REPLACE]/.  This function does not eat up trailing
 857    modifiers (left to the caller).
 858    Return the resulting token in *tp; tp->type == token_type_regex_op.  */
 859 static void
 860 extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
 861                           bool interpolate)
 862 {
 863   char *string;
 864
 865   tp->type = token_type_regex_op;
 866
 867   string = extract_quotelike_pass1_utf8 (delim);
 868   if (interpolate)
 869     interpolate_keywords (mlp, string, line_number);
 870   free (string);
 871
 872   if (delim == '(' || delim == '<' || delim == '{' || delim == '[')
 873     {
 874       /* The delimiter for the second string can be different, e.g.
 875          s{SEARCH}{REPLACE} or s{SEARCH}/REPLACE/.  See "man perlrequick".  */
 876       delim = phase1_getc ();
 877       while (is_whitespace (delim))
 878         {
 879           /* The hash-sign is not a valid delimiter after whitespace, ergo
 880              use phase2_getc() and not phase1_getc() now.  */
 881           delim = phase2_getc ();
 882         }
 883     }
 884   string = extract_quotelike_pass1_utf8 (delim);
 885   if (interpolate)
 886     interpolate_keywords (mlp, string, line_number);
 887   free (string);
 888 }
 889
 890 /* Perform pass 3 of quotelike extraction (interpolation).
 891    *tp is a token of type token_type_string.
 892    This function replaces tp->string.
 893    This function does not access tp->comment.  */
 894 /* FIXME: Currently may writes null-bytes into the string.  */
 895 static void
 896 extract_quotelike_pass3 (token_ty *tp, int error_level)
 897 {
 898   static char *buffer;
 899   static int bufmax = 0;
 900   int bufpos = 0;
 901   const char *crs;
 902   bool uppercase;
 903   bool lowercase;
 904   bool quotemeta;
 905
 906 #if DEBUG_PERL
 907   switch (tp->sub_type)
 908     {
 909     case string_type_verbatim:
 910       fprintf (stderr, "Interpolating string_type_verbatim:\n");
 911       break;
 912     case string_type_q:
 913       fprintf (stderr, "Interpolating string_type_q:\n");
 914       break;
 915     case string_type_qq:
 916       fprintf (stderr, "Interpolating string_type_qq:\n");
 917       break;
 918     case string_type_qr:
 919       fprintf (stderr, "Interpolating string_type_qr:\n");
 920       break;
 921     }
 922   fprintf (stderr, "%s\n", tp->string);
 923   if (tp->sub_type == string_type_verbatim)
 924     fprintf (stderr, "---> %s\n", tp->string);
 925 #endif
 926
 927   if (tp->sub_type == string_type_verbatim)
 928     return;
 929
 930   /* Loop over tp->string, accumulating the expansion in buffer.  */
 931   crs = tp->string;
 932   uppercase = false;
 933   lowercase = false;
 934   quotemeta = false;
 935   while (*crs)
 936     {
 937       bool backslashed;
 938
 939       /* Ensure room for 7 bytes, 6 (multi-)bytes plus a leading backslash
 940          if \Q modifier is present.  */
 941       if (bufpos + 7 > bufmax)
 942         {
 943           bufmax = 2 * bufmax + 10;
 944           buffer = xrealloc (buffer, bufmax);
 945         }
 946
 947       if (tp->sub_type == string_type_q)
 948         {
 949           switch (*crs)
 950             {
 951             case '\\':
 952               if (crs[1] == '\\')
 953                 {
 954                   crs += 2;
 955                   buffer[bufpos++] = '\\';
 956                   break;
 957                 }
 958               /* FALLTHROUGH */
 959             default:
 960               buffer[bufpos++] = *crs++;
 961               break;
 962             }
 963           continue;
 964         }
 965
 966       /* We only get here for double-quoted strings or regular expressions.
 967          Unescape escape sequences.  */
 968       if (*crs == '\\')
 969         {
 970           switch (crs[1])
 971             {
 972             case 't':
 973               crs += 2;
 974               buffer[bufpos++] = '\t';
 975               continue;
 976             case 'n':
 977               crs += 2;
 978               buffer[bufpos++] = '\n';
 979               continue;
 980             case 'r':
 981               crs += 2;
 982               buffer[bufpos++] = '\r';
 983               continue;
 984             case 'f':
 985               crs += 2;
 986               buffer[bufpos++] = '\f';
 987               continue;
 988             case 'b':
 989               crs += 2;
 990               buffer[bufpos++] = '\b';
 991               continue;
 992             case 'a':
 993               crs += 2;
 994               buffer[bufpos++] = '\a';
 995               continue;
 996             case 'e':
 997               crs += 2;
 998               buffer[bufpos++] = 0x1b;
 999               continue;
1000             case '0': case '1': case '2': case '3':
1001             case '4': case '5': case '6': case '7':
1002               {
1003                 unsigned int oct_number;
1004                 int length;
1005
1006                 crs = extract_oct (crs + 1, 3, &oct_number);
1007
1008                 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1009                    true, the character should be converted to its uppercase
1010                    resp. lowercase equivalent.  I don't know if the necessary
1011                    facilities are already included in gettext.  For US-Ascii
1012                    the conversion can be already be done, however.  */
1013                 if (uppercase && oct_number >= 'a' && oct_number <= 'z')
1014                   {
1015                     oct_number = oct_number - 'a' + 'A';
1016                   }
1017                 else if (lowercase && oct_number >= 'A' && oct_number <= 'Z')
1018                   {
1019                     oct_number = oct_number - 'A' + 'a';
1020                   }
1021
1022
1023                 /* Yes, octal escape sequences in the range 0x100..0x1ff are
1024                    valid.  */
1025                 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1026                                     oct_number, 2);
1027                 if (length > 0)
1028                   bufpos += length;
1029               }
1030               continue;
1031             case 'x':
1032               {
1033                 unsigned int hex_number = 0;
1034                 int length;
1035
1036                 crs += 2;
1037                 if (*crs == '{')
1038                   {
1039                     const char *end = strchr (crs, '}');
1040                     if (end == NULL)
1041                       {
1042                         error_with_progname = false;
1043                         error (error_level, 0, _("\
1044 %s:%d: missing right brace on \\x{HEXNUMBER}"), real_file_name, line_number);
1045                         error_with_progname = true;
1046                         ++crs;
1047                         continue;
1048                       }
1049                     else
1050                       {
1051                         ++crs;
1052                         (void) extract_hex (crs, end - crs, &hex_number);
1053                         crs = end + 1;
1054                       }
1055                   }
1056                 else
1057                   {
1058                     crs = extract_hex (crs, 2, &hex_number);
1059                   }
1060
1061                 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1062                    true, the character should be converted to its uppercase
1063                    resp. lowercase equivalent.  I don't know if the necessary
1064                    facilities are already included in gettext.  For US-Ascii
1065                    the conversion can be already be done, however.  */
1066                 if (uppercase && hex_number >= 'a' && hex_number <= 'z')
1067                   {
1068                     hex_number = hex_number - 'a' + 'A';
1069                   }
1070                 else if (lowercase && hex_number >= 'A' && hex_number <= 'Z')
1071                   {
1072                     hex_number = hex_number - 'A' + 'a';
1073                   }
1074
1075                 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1076                                     hex_number, 6);
1077
1078                 if (length > 0)
1079                   bufpos += length;
1080               }
1081               continue;
1082             case 'c':
1083               /* Perl's notion of control characters.  */
1084               crs += 2;
1085               if (*crs)
1086                 {
1087                   int the_char = (unsigned char) *crs;
1088                   if (the_char >= 'a' && the_char <= 'z')
1089                     the_char = the_char - 'a' + 'A';
1090                   buffer[bufpos++] = the_char ^ 0x40;
1091                 }
1092               continue;
1093             case 'N':
1094               crs += 2;
1095               if (*crs == '{')
1096                 {
1097                   const char *end = strchr (crs + 1, '}');
1098                   if (end != NULL)
1099                     {
1100                       char *name;
1101                       unsigned int unicode;
1102
1103                       name = XNMALLOC (end - (crs + 1) + 1, char);
1104                       memcpy (name, crs + 1, end - (crs + 1));
1105                       name[end - (crs + 1)] = '\0';
1106
1107                       unicode = unicode_name_character (name);
1108                       if (unicode != UNINAME_INVALID)
1109                         {
1110                           /* FIXME: Convert to upper/lowercase if the
1111                              corresponding flag is set to true.  */
1112                           int length =
1113                             u8_uctomb ((unsigned char *) (buffer + bufpos),
1114                                        unicode, 6);
1115                           if (length > 0)
1116                             bufpos += length;
1117                         }
1118
1119                       free (name);
1120
1121                       crs = end + 1;
1122                     }
1123                 }
1124               continue;
1125             }
1126         }
1127
1128       /* No escape sequence, go on.  */
1129       if (*crs == '\\')
1130         {
1131           ++crs;
1132           switch (*crs)
1133             {
1134             case 'E':
1135               uppercase = false;
1136               lowercase = false;
1137               quotemeta = false;
1138               ++crs;
1139               continue;
1140             case 'L':
1141               uppercase = false;
1142               lowercase = true;
1143               ++crs;
1144               continue;
1145             case 'U':
1146               uppercase = true;
1147               lowercase = false;
1148               ++crs;
1149               continue;
1150             case 'Q':
1151               quotemeta = true;
1152               ++crs;
1153               continue;
1154             case 'l':
1155               ++crs;
1156               if (*crs >= 'A' && *crs <= 'Z')
1157                 {
1158                   buffer[bufpos++] = *crs - 'A' + 'a';
1159                 }
1160               else if ((unsigned char) *crs >= 0x80)
1161                 {
1162                   error_with_progname = false;
1163                   error (error_level, 0, _("\
1164 %s:%d: invalid interpolation (\"\\l\") of 8bit character \"%c\""),
1165                          real_file_name, line_number, *crs);
1166                   error_with_progname = true;
1167                 }
1168               else
1169                 {
1170                   buffer[bufpos++] = *crs;
1171                 }
1172               ++crs;
1173               continue;
1174             case 'u':
1175               ++crs;
1176               if (*crs >= 'a' && *crs <= 'z')
1177                 {
1178                   buffer[bufpos++] = *crs - 'a' + 'A';
1179                 }
1180               else if ((unsigned char) *crs >= 0x80)
1181                 {
1182                   error_with_progname = false;
1183                   error (error_level, 0, _("\
1184 %s:%d: invalid interpolation (\"\\u\") of 8bit character \"%c\""),
1185                          real_file_name, line_number, *crs);
1186                   error_with_progname = true;
1187                 }
1188               else
1189                 {
1190                   buffer[bufpos++] = *crs;
1191                 }
1192               ++crs;
1193               continue;
1194             case '\\':
1195               buffer[bufpos++] = *crs;
1196               ++crs;
1197               continue;
1198             default:
1199               backslashed = true;
1200               break;
1201             }
1202         }
1203       else
1204         backslashed = false;
1205
1206       if (quotemeta
1207           && !((*crs >= 'A' && *crs <= 'Z') || (*crs >= 'A' && *crs <= 'z')
1208                || (*crs >= '0' && *crs <= '9') || *crs == '_'))
1209         {
1210           buffer[bufpos++] = '\\';
1211           backslashed = true;
1212         }
1213
1214       if (!backslashed && !extract_all && (*crs == '$' || *crs == '@'))
1215         {
1216           error_with_progname = false;
1217           error (error_level, 0, _("\
1218 %s:%d: invalid variable interpolation at \"%c\""),
1219                  real_file_name, line_number, *crs);
1220           error_with_progname = true;
1221           ++crs;
1222         }
1223       else if (lowercase)
1224         {
1225           if (*crs >= 'A' && *crs <= 'Z')
1226             buffer[bufpos++] = *crs - 'A' + 'a';
1227           else if ((unsigned char) *crs >= 0x80)
1228             {
1229               error_with_progname = false;
1230               error (error_level, 0, _("\
1231 %s:%d: invalid interpolation (\"\\L\") of 8bit character \"%c\""),
1232                      real_file_name, line_number, *crs);
1233               error_with_progname = true;
1234               buffer[bufpos++] = *crs;
1235             }
1236           else
1237             buffer[bufpos++] = *crs;
1238           ++crs;
1239         }
1240       else if (uppercase)
1241         {
1242           if (*crs >= 'a' && *crs <= 'z')
1243             buffer[bufpos++] = *crs - 'a' + 'A';
1244           else if ((unsigned char) *crs >= 0x80)
1245             {
1246               error_with_progname = false;
1247               error (error_level, 0, _("\
1248 %s:%d: invalid interpolation (\"\\U\") of 8bit character \"%c\""),
1249                      real_file_name, line_number, *crs);
1250               error_with_progname = true;
1251               buffer[bufpos++] = *crs;
1252             }
1253           else
1254             buffer[bufpos++] = *crs;
1255           ++crs;
1256         }
1257       else
1258         {
1259           buffer[bufpos++] = *crs++;
1260         }
1261     }
1262
1263   /* Ensure room for 1 more byte.  */
1264   if (bufpos >= bufmax)
1265     {
1266       bufmax = 2 * bufmax + 10;
1267       buffer = xrealloc (buffer, bufmax);
1268     }
1269
1270   buffer[bufpos++] = '\0';
1271
1272 #if DEBUG_PERL
1273   fprintf (stderr, "---> %s\n", buffer);
1274 #endif
1275
1276   /* Replace tp->string.  */
1277   free (tp->string);
1278   tp->string = xstrdup (buffer);
1279 }
1280
1281 /* Parse a variable.  This is done in several steps:
1282      1) Consume all leading occurencies of '$', '@', '%', and '*'.
1283      2) Determine the name of the variable from the following input.
1284      3) Parse possible following hash keys or array indexes.
1285  */
1286 static void
1287 extract_variable (message_list_ty *mlp, token_ty *tp, int first)
1288 {
1289   static char *buffer;
1290   static int bufmax = 0;
1291   int bufpos = 0;
1292   int c = first;
1293   size_t varbody_length = 0;
1294   bool maybe_hash_deref = false;
1295   bool maybe_hash_value = false;
1296
1297   tp->type = token_type_variable;
1298
1299 #if DEBUG_PERL
1300   fprintf (stderr, "%s:%d: extracting variable type '%c'\n",
1301            real_file_name, line_number, first);
1302 #endif
1303
1304   /*
1305    * 1) Consume dollars and so on (not euros ...).  Unconditionally
1306    *    accepting the hash sign (#) will maybe lead to inaccurate
1307    *    results.  FIXME!
1308    */
1309   while (c == '$' || c == '*' || c == '#' || c == '@' || c == '%')
1310     {
1311       if (bufpos >= bufmax)
1312         {
1313           bufmax = 2 * bufmax + 10;
1314           buffer = xrealloc (buffer, bufmax);
1315         }
1316       buffer[bufpos++] = c;
1317       c = phase1_getc ();
1318     }
1319
1320   if (c == EOF)
1321     {
1322       tp->type = token_type_eof;
1323       return;
1324     }
1325
1326   /* Hash references are treated in a special way, when looking for
1327      our keywords.  */
1328   if (buffer[0] == '$')
1329     {
1330       if (bufpos == 1)
1331         maybe_hash_value = true;
1332       else if (bufpos == 2 && buffer[1] == '$')
1333         {
1334           if (!(c == '{'
1335                 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1336                 || (c >= '0' && c <= '9')
1337                 || c == '_' || c == ':' || c == '\'' || c >= 0x80))
1338             {
1339               /* Special variable $$ for pid.  */
1340               if (bufpos >= bufmax)
1341                 {
1342                   bufmax = 2 * bufmax + 10;
1343                   buffer = xrealloc (buffer, bufmax);
1344                 }
1345               buffer[bufpos++] = '\0';
1346               tp->string = xstrdup (buffer);
1347 #if DEBUG_PERL
1348               fprintf (stderr, "%s:%d: is PID ($$)\n",
1349                        real_file_name, line_number);
1350 #endif
1351
1352               phase1_ungetc (c);
1353               return;
1354             }
1355
1356           maybe_hash_deref = true;
1357           bufpos = 1;
1358         }
1359     }
1360
1361   /*
1362    * 2) Get the name of the variable.  The first character is practically
1363    *    arbitrary.  Punctuation and numbers automagically put a variable
1364    *    in the global namespace but that subtle difference is not interesting
1365    *    for us.
1366    */
1367   if (bufpos >= bufmax)
1368     {
1369       bufmax = 2 * bufmax + 10;
1370       buffer = xrealloc (buffer, bufmax);
1371     }
1372   if (c == '{')
1373     {
1374       /* Yuck, we cannot accept ${gettext} as a keyword...  Except for
1375        * debugging purposes it is also harmless, that we suppress the
1376        * real name of the variable.
1377        */
1378 #if DEBUG_PERL
1379       fprintf (stderr, "%s:%d: braced {variable_name}\n",
1380                real_file_name, line_number);
1381 #endif
1382
1383       if (extract_balanced (mlp, token_type_rbrace, true, false,
1384                             null_context, null_context_list_iterator,
1385                             1, arglist_parser_alloc (mlp, NULL)))
1386         {
1387           tp->type = token_type_eof;
1388           return;
1389         }
1390       buffer[bufpos++] = c;
1391     }
1392   else
1393     {
1394       while ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1395              || (c >= '0' && c <= '9')
1396              || c == '_' || c == ':' || c == '\'' || c >= 0x80)
1397         {
1398           ++varbody_length;
1399           if (bufpos >= bufmax)
1400             {
1401               bufmax = 2 * bufmax + 10;
1402               buffer = xrealloc (buffer, bufmax);
1403             }
1404           buffer[bufpos++] = c;
1405           c = phase1_getc ();
1406         }
1407       phase1_ungetc (c);
1408     }
1409
1410   /* Probably some strange Perl variable like $`.  */
1411   if (varbody_length == 0)
1412     {
1413       c = phase1_getc ();
1414       if (c == EOF || is_whitespace (c))
1415         phase1_ungetc (c);  /* Loser.  */
1416       else
1417         {
1418           if (bufpos >= bufmax)
1419             {
1420               bufmax = 2 * bufmax + 10;
1421               buffer = xrealloc (buffer, bufmax);
1422             }
1423           buffer[bufpos++] = c;
1424         }
1425     }
1426
1427   if (bufpos >= bufmax)
1428     {
1429       bufmax = 2 * bufmax + 10;
1430       buffer = xrealloc (buffer, bufmax);
1431     }
1432   buffer[bufpos++] = '\0';
1433
1434   tp->string = xstrdup (buffer);
1435
1436 #if DEBUG_PERL
1437   fprintf (stderr, "%s:%d: complete variable name: %s\n",
1438            real_file_name, line_number, tp->string);
1439 #endif
1440
1441   /*
1442    * 3) If the following looks strange to you, this is valid Perl syntax:
1443    *
1444    *      $var = $$hashref    # We can place a
1445    *                          # comment here and then ...
1446    *             {key_into_hashref};
1447    *
1448    *    POD sections are not allowed but we leave complaints about
1449    *    that to the compiler/interpreter.
1450    */
1451   /* We only extract strings from the first hash key (if present).  */
1452
1453   if (maybe_hash_deref || maybe_hash_value)
1454     {
1455       bool is_dereference = false;
1456       int c;
1457
1458       do
1459         c = phase2_getc ();
1460       while (is_whitespace (c));
1461
1462       if (c == '-')
1463         {
1464           int c2 = phase1_getc ();
1465
1466           if (c2 == '>')
1467             {
1468               is_dereference = true;
1469
1470               do
1471                 c = phase2_getc ();
1472               while (is_whitespace (c));
1473             }
1474           else if (c2 != '\n')
1475             {
1476               /* Discarding the newline is harmless here.  The only
1477                  special character recognized after a minus is greater-than
1478                  for dereference.  However, the sequence "-\n>" that we
1479                  treat incorrectly here, is a syntax error.  */
1480               phase1_ungetc (c2);
1481             }
1482         }
1483
1484       if (maybe_hash_value && is_dereference)
1485         {
1486           tp->type = token_type_object;
1487 #if DEBUG_PERL
1488           fprintf (stderr, "%s:%d: first keys preceded by \"->\"\n",
1489                    real_file_name, line_number);
1490 #endif
1491         }
1492       else if (maybe_hash_value)
1493         {
1494           /* Fake it into a hash.  */
1495           tp->string[0] = '%';
1496         }
1497
1498       /* Do NOT change that into else if (see above).  */
1499       if ((maybe_hash_value || maybe_hash_deref) && c == '{')
1500         {
1501           void *keyword_value;
1502
1503 #if DEBUG_PERL
1504           fprintf (stderr, "%s:%d: first keys preceded by '{'\n",
1505                    real_file_name, line_number);
1506 #endif
1507
1508           if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
1509                                &keyword_value) == 0)
1510             {
1511               /* TODO: Shouldn't we use the shapes of the keyword, instead
1512                  of hardwiring argnum1 = 1 ?
1513               const struct callshapes *shapes =
1514                 (const struct callshapes *) keyword_value;
1515               */
1516               struct callshapes shapes;
1517               shapes.keyword = tp->string; /* XXX storage duration? */
1518               shapes.keyword_len = strlen (tp->string);
1519               shapes.nshapes = 1;
1520               shapes.shapes[0].argnum1 = 1;
1521               shapes.shapes[0].argnum2 = 0;
1522               shapes.shapes[0].argnumc = 0;
1523               shapes.shapes[0].argnum1_glib_context = false;
1524               shapes.shapes[0].argnum2_glib_context = false;
1525               shapes.shapes[0].argtotal = 0;
1526               string_list_init (&shapes.shapes[0].xcomments);
1527
1528               {
1529                 /* Extract a possible string from the key.  Before proceeding
1530                    we check whether the open curly is followed by a symbol and
1531                    then by a right curly.  */
1532                 flag_context_list_iterator_ty context_iter =
1533                   flag_context_list_iterator (
1534                     flag_context_list_table_lookup (
1535                       flag_context_list_table,
1536                       tp->string, strlen (tp->string)));
1537                 token_ty *t1 = x_perl_lex (mlp);
1538
1539 #if DEBUG_PERL
1540                 fprintf (stderr, "%s:%d: extracting string key\n",
1541                          real_file_name, line_number);
1542 #endif
1543
1544                 if (t1->type == token_type_symbol
1545                     || t1->type == token_type_named_op)
1546                   {
1547                     token_ty *t2 = x_perl_lex (mlp);
1548                     if (t2->type == token_type_rbrace)
1549                       {
1550                         flag_context_ty context;
1551                         lex_pos_ty pos;
1552
1553                         context =
1554                           inherited_context (null_context,
1555                                              flag_context_list_iterator_advance (
1556                                                &context_iter));
1557
1558                         pos.line_number = line_number;
1559                         pos.file_name = logical_file_name;
1560
1561                         xgettext_current_source_encoding = po_charset_utf8;
1562                         remember_a_message (mlp, NULL, xstrdup (t1->string),
1563                                             context, &pos, NULL, savable_comment);
1564                         xgettext_current_source_encoding = xgettext_global_source_encoding;
1565                         free_token (t2);
1566                         free_token (t1);
1567                       }
1568                     else
1569                       {
1570                         x_perl_unlex (t2);
1571                       }
1572                   }
1573                 else
1574                   {
1575                     x_perl_unlex (t1);
1576                     if (extract_balanced (mlp, token_type_rbrace, true, false,
1577                                           null_context, context_iter,
1578                                           1, arglist_parser_alloc (mlp, &shapes)))
1579                       return;
1580                   }
1581               }
1582             }
1583           else
1584             {
1585               phase2_ungetc (c);
1586             }
1587         }
1588       else
1589         {
1590           phase2_ungetc (c);
1591         }
1592     }
1593
1594   /* Now consume "->", "[...]", and "{...}".  */
1595   for (;;)
1596     {
1597       int c = phase2_getc ();
1598       int c2;
1599
1600       switch (c)
1601         {
1602         case '{':
1603 #if DEBUG_PERL
1604           fprintf (stderr, "%s:%d: extracting balanced '{' after varname\n",
1605                    real_file_name, line_number);
1606 #endif
1607           extract_balanced (mlp, token_type_rbrace, true, false,
1608                             null_context, null_context_list_iterator,
1609                             1, arglist_parser_alloc (mlp, NULL));
1610           break;
1611
1612         case '[':
1613 #if DEBUG_PERL
1614           fprintf (stderr, "%s:%d: extracting balanced '[' after varname\n",
1615                    real_file_name, line_number);
1616 #endif
1617           extract_balanced (mlp, token_type_rbracket, true, false,
1618                             null_context, null_context_list_iterator,
1619                             1, arglist_parser_alloc (mlp, NULL));
1620           break;
1621
1622         case '-':
1623           c2 = phase1_getc ();
1624           if (c2 == '>')
1625             {
1626 #if DEBUG_PERL
1627               fprintf (stderr, "%s:%d: another \"->\" after varname\n",
1628                        real_file_name, line_number);
1629 #endif
1630               break;
1631             }
1632           else if (c2 != '\n')
1633             {
1634               /* Discarding the newline is harmless here.  The only
1635                  special character recognized after a minus is greater-than
1636                  for dereference.  However, the sequence "-\n>" that we
1637                  treat incorrectly here, is a syntax error.  */
1638               phase1_ungetc (c2);
1639             }
1640           /* FALLTHROUGH */
1641
1642         default:
1643 #if DEBUG_PERL
1644           fprintf (stderr, "%s:%d: variable finished\n",
1645                    real_file_name, line_number);
1646 #endif
1647           phase2_ungetc (c);
1648           return;
1649         }
1650     }
1651 }
1652
1653 /* Actually a simplified version of extract_variable().  It searches for
1654    variables inside a double-quoted string that may interpolate to
1655    some keyword hash (reference).  The string is UTF-8 encoded.  */
1656 static void
1657 interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
1658 {
1659   static char *buffer;
1660   static int bufmax = 0;
1661   int bufpos = 0;
1662   flag_context_ty context;
1663   int c;
1664   bool maybe_hash_deref = false;
1665   enum parser_state
1666     {
1667       initial,
1668       one_dollar,
1669       two_dollars,
1670       identifier,
1671       minus,
1672       wait_lbrace,
1673       wait_quote,
1674       dquote,
1675       squote,
1676       barekey,
1677       wait_rbrace
1678     } state;
1679   token_ty token;
1680
1681   lex_pos_ty pos;
1682
1683   /* States are:
1684    *
1685    * initial:      initial
1686    * one_dollar:   dollar sign seen in state INITIAL
1687    * two_dollars:  another dollar-sign has been seen in state ONE_DOLLAR
1688    * identifier:   a valid identifier character has been seen in state
1689    *               ONE_DOLLAR or TWO_DOLLARS
1690    * minus:        a minus-sign has been seen in state IDENTIFIER
1691    * wait_lbrace:  a greater-than has been seen in state MINUS
1692    * wait_quote:   a left brace has been seen in state IDENTIFIER or in
1693    *               state WAIT_LBRACE
1694    * dquote:       a double-quote has been seen in state WAIT_QUOTE
1695    * squote:       a single-quote has been seen in state WAIT_QUOTE
1696    * barekey:      an bareword character has been seen in state WAIT_QUOTE
1697    * wait_rbrace:  closing quote has been seen in state DQUOTE or SQUOTE
1698    *
1699    * In the states initial...identifier the context is null_context; in the
1700    * states minus...wait_rbrace the context is the one suitable for the first
1701    * argument of the last seen identifier.
1702    */
1703   state = initial;
1704   context = null_context;
1705
1706   token.type = token_type_string;
1707   token.sub_type = string_type_qq;
1708   token.line_number = line_number;
1709   /* No need for  token.comment = add_reference (savable_comment);  here.
1710      We can let token.comment uninitialized here, and use savable_comment
1711      directly, because this function only parses the given string and does
1712      not call phase2_getc.  */
1713   pos.file_name = logical_file_name;
1714   pos.line_number = lineno;
1715
1716   while ((c = (unsigned char) *string++) != '\0')
1717     {
1718       void *keyword_value;
1719
1720       if (state == initial)
1721         bufpos = 0;
1722
1723       if (c == '\n')
1724         lineno++;
1725
1726       if (bufpos + 1 >= bufmax)
1727         {
1728           bufmax = 2 * bufmax + 10;
1729           buffer = xrealloc (buffer, bufmax);
1730         }
1731
1732       switch (state)
1733         {
1734         case initial:
1735           switch (c)
1736             {
1737             case '\\':
1738               c = (unsigned char) *string++;
1739               if (c == '\0')
1740                 return;
1741               break;
1742             case '$':
1743               buffer[bufpos++] = '$';
1744               maybe_hash_deref = false;
1745               state = one_dollar;
1746               break;
1747             default:
1748               break;
1749             }
1750           break;
1751         case one_dollar:
1752           switch (c)
1753             {
1754             case '$':
1755               /*
1756                * This is enough to make us believe later that we dereference
1757                * a hash reference.
1758                */
1759               maybe_hash_deref = true;
1760               state = two_dollars;
1761               break;
1762             default:
1763               if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1764                   || (c >= 'A' && c <= 'Z')
1765                   || (c >= 'a' && c <= 'z')
1766                   || (c >= '0' && c <= '9'))
1767                 {
1768                   buffer[bufpos++] = c;
1769                   state = identifier;
1770                 }
1771               else
1772                 state = initial;
1773               break;
1774             }
1775           break;
1776         case two_dollars:
1777           if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1778               || (c >= 'A' && c <= 'Z')
1779               || (c >= 'a' && c <= 'z')
1780               || (c >= '0' && c <= '9'))
1781             {
1782               buffer[bufpos++] = c;
1783               state = identifier;
1784             }
1785           else
1786             state = initial;
1787           break;
1788         case identifier:
1789           switch (c)
1790             {
1791             case '-':
1792               if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1793                   == 0)
1794                 {
1795                   flag_context_list_iterator_ty context_iter =
1796                     flag_context_list_iterator (
1797                       flag_context_list_table_lookup (
1798                         flag_context_list_table,
1799                         buffer, bufpos));
1800                   context =
1801                     inherited_context (null_context,
1802                                        flag_context_list_iterator_advance (
1803                                          &context_iter));
1804                   state = minus;
1805                 }
1806               else
1807                 state = initial;
1808               break;
1809             case '{':
1810               if (!maybe_hash_deref)
1811                 buffer[0] = '%';
1812               if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1813                   == 0)
1814                 {
1815                   flag_context_list_iterator_ty context_iter =
1816                     flag_context_list_iterator (
1817                       flag_context_list_table_lookup (
1818                         flag_context_list_table,
1819                         buffer, bufpos));
1820                   context =
1821                     inherited_context (null_context,
1822                                        flag_context_list_iterator_advance (
1823                                          &context_iter));
1824                   state = wait_quote;
1825                 }
1826               else
1827                 state = initial;
1828               break;
1829             default:
1830               if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1831                   || (c >= 'A' && c <= 'Z')
1832                   || (c >= 'a' && c <= 'z')
1833                   || (c >= '0' && c <= '9'))
1834                 {
1835                   buffer[bufpos++] = c;
1836                 }
1837               else
1838                 state = initial;
1839               break;
1840             }
1841           break;
1842         case minus:
1843           switch (c)
1844             {
1845             case '>':
1846               state = wait_lbrace;
1847               break;
1848             default:
1849               context = null_context;
1850               state = initial;
1851               break;
1852             }
1853           break;
1854         case wait_lbrace:
1855           switch (c)
1856             {
1857             case '{':
1858               state = wait_quote;
1859               break;
1860             default:
1861               context = null_context;
1862               state = initial;
1863               break;
1864             }
1865           break;
1866         case wait_quote:
1867           switch (c)
1868             {
1869             case_whitespace:
1870               break;
1871             case '\'':
1872               pos.line_number = lineno;
1873               bufpos = 0;
1874               state = squote;
1875               break;
1876             case '"':
1877               pos.line_number = lineno;
1878               bufpos = 0;
1879               state = dquote;
1880               break;
1881             default:
1882               if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1883                   || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1884                 {
1885                   pos.line_number = lineno;
1886                   bufpos = 0;
1887                   buffer[bufpos++] = c;
1888                   state = barekey;
1889                 }
1890               else
1891                 {
1892                   context = null_context;
1893                   state = initial;
1894                 }
1895               break;
1896             }
1897           break;
1898         case dquote:
1899           switch (c)
1900             {
1901             case '"':
1902               /* The resulting string has to be interpolated twice.  */
1903               buffer[bufpos] = '\0';
1904               token.string = xstrdup (buffer);
1905               extract_quotelike_pass3 (&token, EXIT_FAILURE);
1906               /* The string can only shrink with interpolation (because
1907                  we ignore \Q).  */
1908               if (!(strlen (token.string) <= bufpos))
1909                 abort ();
1910               strcpy (buffer, token.string);
1911               free (token.string);
1912               state = wait_rbrace;
1913               break;
1914             case '\\':
1915               if (string[0] == '\"')
1916                 {
1917                   buffer[bufpos++] = string++[0];
1918                 }
1919               else if (string[0])
1920                 {
1921                   buffer[bufpos++] = '\\';
1922                   buffer[bufpos++] = string++[0];
1923                 }
1924               else
1925                 {
1926                   context = null_context;
1927                   state = initial;
1928                 }
1929               break;
1930             default:
1931               buffer[bufpos++] = c;
1932               break;
1933             }
1934           break;
1935         case squote:
1936           switch (c)
1937             {
1938             case '\'':
1939               state = wait_rbrace;
1940               break;
1941             case '\\':
1942               if (string[0] == '\'')
1943                 {
1944                   buffer[bufpos++] = string++[0];
1945                 }
1946               else if (string[0])
1947                 {
1948                   buffer[bufpos++] = '\\';
1949                   buffer[bufpos++] = string++[0];
1950                 }
1951               else
1952                 {
1953                   context = null_context;
1954                   state = initial;
1955                 }
1956               break;
1957             default:
1958               buffer[bufpos++] = c;
1959               break;
1960             }
1961           break;
1962         case barekey:
1963           if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1964               || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1965             {
1966               buffer[bufpos++] = c;
1967               break;
1968             }
1969           else if (is_whitespace (c))
1970             {
1971               state = wait_rbrace;
1972               break;
1973             }
1974           else if (c != '}')
1975             {
1976               context = null_context;
1977               state = initial;
1978               break;
1979             }
1980           /* Must be right brace.  */
1981           /* FALLTHROUGH */
1982         case wait_rbrace:
1983           switch (c)
1984             {
1985             case_whitespace:
1986               break;
1987             case '}':
1988               buffer[bufpos] = '\0';
1989               token.string = xstrdup (buffer);
1990               extract_quotelike_pass3 (&token, EXIT_FAILURE);
1991               xgettext_current_source_encoding = po_charset_utf8;
1992               remember_a_message (mlp, NULL, token.string, context, &pos,
1993                                   NULL, savable_comment);
1994               xgettext_current_source_encoding = xgettext_global_source_encoding;
1995               /* FALLTHROUGH */
1996             default:
1997               context = null_context;
1998               state = initial;
1999               break;
2000             }
2001           break;
2002         }
2003     }
2004 }
2005
2006 /* There is an ambiguity about '/' and '?': They can start an operator
2007    (division operator '/' or '/=' or the conditional operator '?'), or they can
2008    start a regular expression.  The distinction is important because inside
2009    regular expressions, '#' loses its special meaning.  This function helps
2010    making the decision (a heuristic).  See the documentation for details.  */
2011 static bool
2012 prefer_regexp_over_division (token_type_ty type)
2013 {
2014   bool retval = true;
2015
2016   switch (type)
2017     {
2018       case token_type_eof:
2019         retval = true;
2020         break;
2021       case token_type_lparen:
2022         retval = true;
2023         break;
2024       case token_type_rparen:
2025         retval = false;
2026         break;
2027       case token_type_comma:
2028         retval = true;
2029         break;
2030       case token_type_fat_comma:
2031         retval = true;
2032         break;
2033       case token_type_dereference:
2034         retval = true;
2035         break;
2036       case token_type_semicolon:
2037         retval = true;
2038         break;
2039       case token_type_lbrace:
2040         retval = true;
2041         break;
2042       case token_type_rbrace:
2043         retval = false;
2044         break;
2045       case token_type_lbracket:
2046         retval = true;
2047         break;
2048       case token_type_rbracket:
2049         retval = false;
2050         break;
2051       case token_type_string:
2052         retval = false;
2053         break;
2054       case token_type_number:
2055         retval = false;
2056         break;
2057       case token_type_named_op:
2058         retval = true;
2059         break;
2060       case token_type_variable:
2061         retval = false;
2062         break;
2063       case token_type_object:
2064         retval = false;
2065         break;
2066       case token_type_symbol:
2067       case token_type_keyword_symbol:
2068         retval = true;
2069         break;
2070       case token_type_regex_op:
2071         retval = false;
2072         break;
2073       case token_type_dot:
2074         retval = true;
2075         break;
2076       case token_type_other:
2077         retval = true;
2078         break;
2079   }
2080
2081 #if DEBUG_PERL
2082   token_ty ty;
2083   ty.type = type;
2084   fprintf (stderr, "Prefer regexp over division after %s: %s\n",
2085            token2string (&ty), retval ? "true" : "false");
2086 #endif
2087
2088   return retval;
2089 }
2090
2091 /* Last token type seen in the stream.  Important for the interpretation
2092    of slash and question mark.  */
2093 static token_type_ty last_token_type;
2094
2095 /* Combine characters into tokens.  Discard whitespace.  */
2096
2097 static void
2098 x_perl_prelex (message_list_ty *mlp, token_ty *tp)
2099 {
2100   static char *buffer;
2101   static int bufmax;
2102   int bufpos;
2103   int c;
2104
2105   for (;;)
2106     {
2107       c = phase2_getc ();
2108       tp->line_number = line_number;
2109       tp->last_type = last_token_type;
2110
2111       switch (c)
2112         {
2113         case EOF:
2114           tp->type = token_type_eof;
2115           return;
2116
2117         case '\n':
2118           if (last_non_comment_line > last_comment_line)
2119             savable_comment_reset ();
2120           /* FALLTHROUGH */
2121         case '\t':
2122         case ' ':
2123           /* Ignore whitespace.  */
2124           continue;
2125
2126         case '%':
2127         case '@':
2128         case '*':
2129         case '$':
2130           if (!extract_all)
2131             {
2132               extract_variable (mlp, tp, c);
2133               return;
2134             }
2135           break;
2136         }
2137
2138       last_non_comment_line = tp->line_number;
2139
2140       switch (c)
2141         {
2142         case '.':
2143           {
2144             int c2 = phase1_getc ();
2145             phase1_ungetc (c2);
2146             if (c2 == '.')
2147               {
2148                 tp->type = token_type_other;
2149                 return;
2150               }
2151             else if (!(c2 >= '0' && c2 <= '9'))
2152               {
2153                 tp->type = token_type_dot;
2154                 return;
2155               }
2156           }
2157           /* FALLTHROUGH */
2158         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2159         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2160         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2161         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2162         case 'Y': case 'Z':
2163         case '_':
2164         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2165         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2166         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2167         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2168         case 'y': case 'z':
2169         case '0': case '1': case '2': case '3': case '4':
2170         case '5': case '6': case '7': case '8': case '9':
2171           /* Symbol, or part of a number.  */
2172           bufpos = 0;
2173           for (;;)
2174             {
2175               if (bufpos >= bufmax)
2176                 {
2177                   bufmax = 2 * bufmax + 10;
2178                   buffer = xrealloc (buffer, bufmax);
2179                 }
2180               buffer[bufpos++] = c;
2181               c = phase1_getc ();
2182               switch (c)
2183                 {
2184                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2185                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2186                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2187                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2188                 case 'Y': case 'Z':
2189                 case '_':
2190                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2191                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2192                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2193                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2194                 case 'y': case 'z':
2195                 case '0': case '1': case '2': case '3': case '4':
2196                 case '5': case '6': case '7': case '8': case '9':
2197                   continue;
2198
2199                 default:
2200                   phase1_ungetc (c);
2201                   break;
2202                 }
2203               break;
2204             }
2205           if (bufpos >= bufmax)
2206             {
2207               bufmax = 2 * bufmax + 10;
2208               buffer = xrealloc (buffer, bufmax);
2209             }
2210           buffer[bufpos] = '\0';
2211
2212           if (strcmp (buffer, "__END__") == 0
2213               || strcmp (buffer, "__DATA__") == 0)
2214             {
2215               end_of_file = true;
2216               tp->type = token_type_eof;
2217               return;
2218             }
2219           else if (strcmp (buffer, "and") == 0
2220                    || strcmp (buffer, "cmp") == 0
2221                    || strcmp (buffer, "eq") == 0
2222                    || strcmp (buffer, "if") == 0
2223                    || strcmp (buffer, "ge") == 0
2224                    || strcmp (buffer, "gt") == 0
2225                    || strcmp (buffer, "le") == 0
2226                    || strcmp (buffer, "lt") == 0
2227                    || strcmp (buffer, "ne") == 0
2228                    || strcmp (buffer, "not") == 0
2229                    || strcmp (buffer, "or") == 0
2230                    || strcmp (buffer, "unless") == 0
2231                    || strcmp (buffer, "while") == 0
2232                    || strcmp (buffer, "xor") == 0)
2233             {
2234               tp->type = token_type_named_op;
2235               tp->string = xstrdup (buffer);
2236               return;
2237             }
2238           else if (strcmp (buffer, "s") == 0
2239                  || strcmp (buffer, "y") == 0
2240                  || strcmp (buffer, "tr") == 0)
2241             {
2242               int delim = phase1_getc ();
2243
2244               while (is_whitespace (delim))
2245                 delim = phase2_getc ();
2246
2247               if (delim == EOF)
2248                 {
2249                   tp->type = token_type_eof;
2250                   return;
2251                 }
2252               if ((delim >= '0' && delim <= '9')
2253                   || (delim >= 'A' && delim <= 'Z')
2254                   || (delim >= 'a' && delim <= 'z'))
2255                 {
2256                   /* False positive.  */
2257                   phase2_ungetc (delim);
2258                   tp->type = token_type_symbol;
2259                   tp->sub_type = symbol_type_none;
2260                   tp->string = xstrdup (buffer);
2261                   return;
2262                 }
2263               extract_triple_quotelike (mlp, tp, delim,
2264                                         buffer[0] == 's' && delim != '\'');
2265
2266               /* Eat the following modifiers.  */
2267               do
2268                 c = phase1_getc ();
2269               while (c >= 'a' && c <= 'z');
2270               phase1_ungetc (c);
2271               return;
2272             }
2273           else if (strcmp (buffer, "m") == 0)
2274             {
2275               int delim = phase1_getc ();
2276
2277               while (is_whitespace (delim))
2278                 delim = phase2_getc ();
2279
2280               if (delim == EOF)
2281                 {
2282                   tp->type = token_type_eof;
2283                   return;
2284                 }
2285               if ((delim >= '0' && delim <= '9')
2286                   || (delim >= 'A' && delim <= 'Z')
2287                   || (delim >= 'a' && delim <= 'z'))
2288                 {
2289                   /* False positive.  */
2290                   phase2_ungetc (delim);
2291                   tp->type = token_type_symbol;
2292                   tp->sub_type = symbol_type_none;
2293                   tp->string = xstrdup (buffer);
2294                   return;
2295                 }
2296               extract_quotelike (tp, delim);
2297               if (delim != '\'')
2298                 interpolate_keywords (mlp, tp->string, line_number);
2299               free (tp->string);
2300               drop_reference (tp->comment);
2301               tp->type = token_type_regex_op;
2302
2303               /* Eat the following modifiers.  */
2304               do
2305                 c = phase1_getc ();
2306               while (c >= 'a' && c <= 'z');
2307               phase1_ungetc (c);
2308               return;
2309             }
2310           else if (strcmp (buffer, "qq") == 0
2311                    || strcmp (buffer, "q") == 0
2312                    || strcmp (buffer, "qx") == 0
2313                    || strcmp (buffer, "qw") == 0
2314                    || strcmp (buffer, "qr") == 0)
2315             {
2316               /* The qw (...) construct is not really a string but we
2317                  can treat in the same manner and then pretend it is
2318                  a symbol.  Rationale: Saying "qw (foo bar)" is the
2319                  same as "my @list = ('foo', 'bar'); @list;".  */
2320
2321               int delim = phase1_getc ();
2322
2323               while (is_whitespace (delim))
2324                 delim = phase2_getc ();
2325
2326               if (delim == EOF)
2327                 {
2328                   tp->type = token_type_eof;
2329                   return;
2330                 }
2331
2332               if ((delim >= '0' && delim <= '9')
2333                   || (delim >= 'A' && delim <= 'Z')
2334                   || (delim >= 'a' && delim <= 'z'))
2335                 {
2336                   /* False positive.  */
2337                   phase2_ungetc (delim);
2338                   tp->type = token_type_symbol;
2339                   tp->sub_type = symbol_type_none;
2340                   tp->string = xstrdup (buffer);
2341                   return;
2342                 }
2343
2344               extract_quotelike (tp, delim);
2345
2346               switch (buffer[1])
2347                 {
2348                 case 'q':
2349                 case 'x':
2350                   tp->type = token_type_string;
2351                   tp->sub_type = string_type_qq;
2352                   interpolate_keywords (mlp, tp->string, line_number);
2353                   break;
2354                 case 'r':
2355                   drop_reference (tp->comment);
2356                   tp->type = token_type_regex_op;
2357                   break;
2358                 case 'w':
2359                   drop_reference (tp->comment);
2360                   tp->type = token_type_symbol;
2361                   tp->sub_type = symbol_type_none;
2362                   break;
2363                 case '\0':
2364                   tp->type = token_type_string;
2365                   tp->sub_type = string_type_q;
2366                   break;
2367                 default:
2368                   abort ();
2369                 }
2370               return;
2371             }
2372           else if ((buffer[0] >= '0' && buffer[0] <= '9') || buffer[0] == '.')
2373             {
2374               tp->type = token_type_number;
2375               return;
2376             }
2377           tp->type = token_type_symbol;
2378           tp->sub_type = (strcmp (buffer, "sub") == 0
2379                           ? symbol_type_sub
2380                           : symbol_type_none);
2381           tp->string = xstrdup (buffer);
2382           return;
2383
2384         case '"':
2385           extract_quotelike (tp, c);
2386           tp->sub_type = string_type_qq;
2387           interpolate_keywords (mlp, tp->string, line_number);
2388           return;
2389
2390         case '`':
2391           extract_quotelike (tp, c);
2392           tp->sub_type = string_type_qq;
2393           interpolate_keywords (mlp, tp->string, line_number);
2394           return;
2395
2396         case '\'':
2397           extract_quotelike (tp, c);
2398           tp->sub_type = string_type_q;
2399           return;
2400
2401         case '(':
2402           c = phase2_getc ();
2403           if (c == ')')
2404             /* Ignore empty list.  */
2405             continue;
2406           else
2407             phase2_ungetc (c);
2408           tp->type = token_type_lparen;
2409           return;
2410
2411         case ')':
2412           tp->type = token_type_rparen;
2413           return;
2414
2415         case '{':
2416           tp->type = token_type_lbrace;
2417           return;
2418
2419         case '}':
2420           tp->type = token_type_rbrace;
2421           return;
2422
2423         case '[':
2424           tp->type = token_type_lbracket;
2425           return;
2426
2427         case ']':
2428           tp->type = token_type_rbracket;
2429           return;
2430
2431         case ';':
2432           tp->type = token_type_semicolon;
2433           return;
2434
2435         case ',':
2436           tp->type = token_type_comma;
2437           return;
2438
2439         case '=':
2440           /* Check for fat comma.  */
2441           c = phase1_getc ();
2442           if (c == '>')
2443             {
2444               tp->type = token_type_fat_comma;
2445               return;
2446             }
2447           else if (linepos == 2
2448                    && (last_token_type == token_type_semicolon
2449                        || last_token_type == token_type_rbrace)
2450                    && ((c >= 'A' && c <='Z')
2451                        || (c >= 'a' && c <= 'z')))
2452             {
2453 #if DEBUG_PERL
2454               fprintf (stderr, "%s:%d: start pod section\n",
2455                        real_file_name, line_number);
2456 #endif
2457               skip_pod ();
2458 #if DEBUG_PERL
2459               fprintf (stderr, "%s:%d: end pod section\n",
2460                        real_file_name, line_number);
2461 #endif
2462               continue;
2463             }
2464           phase1_ungetc (c);
2465           tp->type = token_type_other;
2466           return;
2467
2468         case '<':
2469           /* Check for <<EOF and friends.  */
2470           c = phase1_getc ();
2471           if (c == '<')
2472             {
2473               c = phase1_getc ();
2474               if (c == '\'')
2475                 {
2476                   char *string;
2477                   extract_quotelike (tp, c);
2478                   string = get_here_document (tp->string);
2479                   free (tp->string);
2480                   tp->string = string;
2481                   tp->type = token_type_string;
2482                   tp->sub_type = string_type_verbatim;
2483                   tp->line_number = line_number + 1;
2484                   return;
2485                 }
2486               else if (c == '"')
2487                 {
2488                   char *string;
2489                   extract_quotelike (tp, c);
2490                   string = get_here_document (tp->string);
2491                   free (tp->string);
2492                   tp->string = string;
2493                   tp->type = token_type_string;
2494                   tp->sub_type = string_type_qq;
2495                   tp->line_number = line_number + 1;
2496                   interpolate_keywords (mlp, tp->string, tp->line_number);
2497                   return;
2498                 }
2499               else if ((c >= 'A' && c <= 'Z')
2500                        || (c >= 'a' && c <= 'z')
2501                        || c == '_')
2502                 {
2503                   bufpos = 0;
2504                   while ((c >= 'A' && c <= 'Z')
2505                          || (c >= 'a' && c <= 'z')
2506                          || (c >= '0' && c <= '9')
2507                          || c == '_' || c >= 0x80)
2508                     {
2509                       if (bufpos >= bufmax)
2510                         {
2511                           bufmax = 2 * bufmax + 10;
2512                           buffer = xrealloc (buffer, bufmax);
2513                         }
2514                       buffer[bufpos++] = c;
2515                       c = phase1_getc ();
2516                     }
2517                   if (c == EOF)
2518                     {
2519                       tp->type = token_type_eof;
2520                       return;
2521                     }
2522                   else
2523                     {
2524                       char *string;
2525                       phase1_ungetc (c);
2526                       if (bufpos >= bufmax)
2527                         {
2528                           bufmax = 2 * bufmax + 10;
2529                           buffer = xrealloc (buffer, bufmax);
2530                         }
2531                       buffer[bufpos++] = '\0';
2532                       string = get_here_document (buffer);
2533                       tp->string = string;
2534                       tp->type = token_type_string;
2535                       tp->sub_type = string_type_qq;
2536                       tp->comment = add_reference (savable_comment);
2537                       tp->line_number = line_number + 1;
2538                       interpolate_keywords (mlp, tp->string, tp->line_number);
2539                       return;
2540                     }
2541                 }
2542               else
2543                 {
2544                   tp->type = token_type_other;
2545                   return;
2546                 }
2547             }
2548           else
2549             {
2550               phase1_ungetc (c);
2551               tp->type = token_type_other;
2552             }
2553           return;  /* End of case '>'.  */
2554
2555         case '-':
2556           /* Check for dereferencing operator.  */
2557           c = phase1_getc ();
2558           if (c == '>')
2559             {
2560               tp->type = token_type_dereference;
2561               return;
2562             }
2563           else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
2564             {
2565               /* One of the -X (filetest) functions.  We play safe
2566                  and accept all alphabetical characters here.  */
2567               tp->type = token_type_other;
2568               return;
2569             }
2570           phase1_ungetc (c);
2571           tp->type = token_type_other;
2572           return;
2573
2574         case '/':
2575         case '?':
2576           if (prefer_regexp_over_division (tp->last_type))
2577             {
2578               extract_quotelike (tp, c);
2579               interpolate_keywords (mlp, tp->string, line_number);
2580               free (tp->string);
2581               drop_reference (tp->comment);
2582               tp->type = token_type_regex_op;
2583               /* Eat the following modifiers.  */
2584               do
2585                 c = phase1_getc ();
2586               while (c >= 'a' && c <= 'z');
2587               phase1_ungetc (c);
2588               return;
2589             }
2590           /* Recognize operator '//'.  */
2591           if (c == '/')
2592             {
2593               c = phase1_getc ();
2594               if (c != '/')
2595                 phase1_ungetc (c);
2596             }
2597           /* FALLTHROUGH */
2598
2599         default:
2600           /* We could carefully recognize each of the 2 and 3 character
2601              operators, but it is not necessary, except for the '//' operator,
2602              as we only need to recognize gettext invocations.  Don't
2603              bother.  */
2604           tp->type = token_type_other;
2605           return;
2606         }
2607     }
2608 }
2609
2610
2611 /* A token stack used as a lookahead buffer.  */
2612
2613 typedef struct token_stack_ty token_stack_ty;
2614 struct token_stack_ty
2615 {
2616   token_ty **items;
2617   size_t nitems;
2618   size_t nitems_max;
2619 };
2620
2621 static struct token_stack_ty token_stack;
2622
2623 #if DEBUG_PERL
2624 /* Dumps all resources allocated by stack STACK.  */
2625 static int
2626 token_stack_dump (token_stack_ty *stack)
2627 {
2628   size_t i;
2629
2630   fprintf (stderr, "BEGIN STACK DUMP\n");
2631   for (i = 0; i < stack->nitems; i++)
2632     {
2633       token_ty *token = stack->items[i];
2634       fprintf (stderr, "  [%s]\n", token2string (token));
2635       switch (token->type)
2636         {
2637         case token_type_named_op:
2638         case token_type_string:
2639         case token_type_symbol:
2640         case token_type_variable:
2641           fprintf (stderr, "    string: %s\n", token->string);
2642           break;
2643         case token_type_object:
2644           fprintf (stderr, "    string: %s->\n", token->string);
2645         default:
2646           break;
2647         }
2648     }
2649   fprintf (stderr, "END STACK DUMP\n");
2650   return 0;
2651 }
2652 #endif
2653
2654 /* Pushes the token TOKEN onto the stack STACK.  */
2655 static inline void
2656 token_stack_push (token_stack_ty *stack, token_ty *token)
2657 {
2658   if (stack->nitems >= stack->nitems_max)
2659     {
2660       size_t nbytes;
2661
2662       stack->nitems_max = 2 * stack->nitems_max + 4;
2663       nbytes = stack->nitems_max * sizeof (token_ty *);
2664       stack->items = xrealloc (stack->items, nbytes);
2665     }
2666   stack->items[stack->nitems++] = token;
2667 }
2668
2669 /* Pops the most recently pushed token from the stack STACK and returns it.
2670    Returns NULL if the stack is empty.  */
2671 static inline token_ty *
2672 token_stack_pop (token_stack_ty *stack)
2673 {
2674   if (stack->nitems > 0)
2675     return stack->items[--(stack->nitems)];
2676   else
2677     return NULL;
2678 }
2679
2680 /* Return the top of the stack without removing it from the stack, or
2681    NULL if the stack is empty.  */
2682 static inline token_ty *
2683 token_stack_peek (const token_stack_ty *stack)
2684 {
2685   if (stack->nitems > 0)
2686     return stack->items[stack->nitems - 1];
2687   else
2688     return NULL;
2689 }
2690
2691 /* Frees all resources allocated by stack STACK.  */
2692 static inline void
2693 token_stack_free (token_stack_ty *stack)
2694 {
2695   size_t i;
2696
2697   for (i = 0; i < stack->nitems; i++)
2698     free_token (stack->items[i]);
2699   free (stack->items);
2700 }
2701
2702
2703 static token_ty *
2704 x_perl_lex (message_list_ty *mlp)
2705 {
2706 #if DEBUG_PERL
2707   int dummy = token_stack_dump (&token_stack);
2708 #endif
2709   token_ty *tp = token_stack_pop (&token_stack);
2710
2711   if (!tp)
2712     {
2713       tp = XMALLOC (token_ty);
2714       x_perl_prelex (mlp, tp);
2715       tp->last_type = last_token_type;
2716       last_token_type = tp->type;
2717
2718 #if DEBUG_PERL
2719       fprintf (stderr, "%s:%d: x_perl_prelex returned %s\n",
2720                real_file_name, line_number, token2string (tp));
2721 #endif
2722
2723       /* The interpretation of a slash or question mark after a function call
2724          depends on the prototype of that function.  If the function expects
2725          at least one argument, a regular expression is preferred, otherwise
2726          an operator.  With our limited means, we can only guess here.  If
2727          the function is a builtin that takes no arguments, we prefer an
2728          operator by silently turning the last symbol into a variable instead
2729          of a symbol.
2730
2731          Method calls without parentheses are not ambiguous.  After them, an
2732          operator must follow.  Due to some ideosyncrasies in this parser
2733          they are treated in two different manners.  If the call is
2734          chained ($foo->bar->baz) the token left of the symbol is a
2735          dereference operator.  If it is not chained ($foo->bar) the
2736          dereference operator is consumed with the extracted variable.  The
2737          latter case is handled below.  */
2738       if (tp->type == token_type_symbol)
2739         {
2740           if (tp->last_type == token_type_dereference)
2741             {
2742               /* Class method call or chained method call (with at least
2743                  two arrow operators).  */
2744               last_token_type = token_type_variable;
2745             }
2746           else if (tp->last_type == token_type_object)
2747             {
2748               /* Instance method, not chained.  */
2749               last_token_type = token_type_variable;
2750             }
2751           else if (strcmp (tp->string, "wantarray") == 0
2752                    || strcmp (tp->string, "fork") == 0
2753                    || strcmp (tp->string, "getlogin") == 0
2754                    || strcmp (tp->string, "getppid") == 0
2755                    || strcmp (tp->string, "getpwent") == 0
2756                    || strcmp (tp->string, "getgrent") == 0
2757                    || strcmp (tp->string, "gethostent") == 0
2758                    || strcmp (tp->string, "getnetent") == 0
2759                    || strcmp (tp->string, "getprotoent") == 0
2760                    || strcmp (tp->string, "getservent") == 0
2761                    || strcmp (tp->string, "setpwent") == 0
2762                    || strcmp (tp->string, "setgrent") == 0
2763                    || strcmp (tp->string, "endpwent") == 0
2764                    || strcmp (tp->string, "endgrent") == 0
2765                    || strcmp (tp->string, "endhostent") == 0
2766                    || strcmp (tp->string, "endnetent") == 0
2767                    || strcmp (tp->string, "endprotoent") == 0
2768                    || strcmp (tp->string, "endservent") == 0
2769                    || strcmp (tp->string, "time") == 0
2770                    || strcmp (tp->string, "times") == 0
2771                    || strcmp (tp->string, "wait") == 0
2772                    || strcmp (tp->string, "wantarray") == 0)
2773             {
2774               /* A Perl built-in function that does not accept arguments.  */
2775               last_token_type = token_type_variable;
2776             }
2777         }
2778     }
2779 #if DEBUG_PERL
2780   else
2781     {
2782       fprintf (stderr, "%s:%d: %s recycled from stack\n",
2783                real_file_name, line_number, token2string (tp));
2784     }
2785 #endif
2786
2787   /* A symbol followed by a fat comma is really a single-quoted string.
2788      Function definitions or forward declarations also need a special
2789      handling because the dollars and at signs inside the parentheses
2790      must not be interpreted as the beginning of a variable ')'.  */
2791   if (tp->type == token_type_symbol || tp->type == token_type_named_op)
2792     {
2793       token_ty *next = token_stack_peek (&token_stack);
2794
2795       if (!next)
2796         {
2797 #if DEBUG_PERL
2798           fprintf (stderr, "%s:%d: pre-fetching next token\n",
2799                    real_file_name, line_number);
2800 #endif
2801           next = x_perl_lex (mlp);
2802           x_perl_unlex (next);
2803 #if DEBUG_PERL
2804           fprintf (stderr, "%s:%d: unshifted next token\n",
2805                    real_file_name, line_number);
2806 #endif
2807         }
2808
2809 #if DEBUG_PERL
2810       fprintf (stderr, "%s:%d: next token is %s\n",
2811                real_file_name, line_number, token2string (next));
2812 #endif
2813
2814       if (next->type == token_type_fat_comma)
2815         {
2816           tp->type = token_type_string;
2817           tp->sub_type = string_type_q;
2818           tp->comment = add_reference (savable_comment);
2819 #if DEBUG_PERL
2820           fprintf (stderr,
2821                    "%s:%d: token %s mutated to token_type_string\n",
2822                    real_file_name, line_number, token2string (tp));
2823 #endif
2824         }
2825       else if (tp->type == token_type_symbol && tp->sub_type == symbol_type_sub
2826                && next->type == token_type_symbol)
2827         {
2828           /* Start of a function declaration or definition.  Mark this
2829              symbol as a function name, so that we can later eat up
2830              possible prototype information.  */
2831 #if DEBUG_PERL
2832           fprintf (stderr, "%s:%d: subroutine declaration/definition '%s'\n",
2833                    real_file_name, line_number, next->string);
2834 #endif
2835           next->sub_type = symbol_type_function;
2836         }
2837       else if (tp->type == token_type_symbol
2838                && (tp->sub_type == symbol_type_sub
2839                    || tp->sub_type == symbol_type_function)
2840                && next->type == token_type_lparen)
2841         {
2842           /* For simplicity we simply consume everything up to the
2843              closing parenthesis.  Actually only a limited set of
2844              characters is allowed inside parentheses but we leave
2845              complaints to the interpreter and are prepared for
2846              future extensions to the Perl syntax.  */
2847           int c;
2848
2849 #if DEBUG_PERL
2850           fprintf (stderr, "%s:%d: consuming prototype information\n",
2851                    real_file_name, line_number);
2852 #endif
2853
2854           do
2855             {
2856               c = phase1_getc ();
2857 #if DEBUG_PERL
2858               fprintf (stderr, "  consuming character '%c'\n", c);
2859 #endif
2860             }
2861           while (c != EOF && c != ')');
2862           phase1_ungetc (c);
2863         }
2864     }
2865
2866   return tp;
2867 }
2868
2869 static void
2870 x_perl_unlex (token_ty *tp)
2871 {
2872   token_stack_push (&token_stack, tp);
2873 }
2874
2875
2876 /* ========================= Extracting strings.  ========================== */
2877
2878 /* Assuming TP is a string token, this function accumulates all subsequent
2879    . string2 . string3 ... to the string.  (String concatenation.)  */
2880
2881 static char *
2882 collect_message (message_list_ty *mlp, token_ty *tp, int error_level)
2883 {
2884   char *string;
2885   size_t len;
2886
2887   extract_quotelike_pass3 (tp, error_level);
2888   string = xstrdup (tp->string);
2889   len = strlen (tp->string) + 1;
2890
2891   for (;;)
2892     {
2893       int c;
2894
2895       do
2896         c = phase2_getc ();
2897       while (is_whitespace (c));
2898
2899       if (c != '.')
2900         {
2901           phase2_ungetc (c);
2902           return string;
2903         }
2904
2905       do
2906         c = phase2_getc ();
2907       while (is_whitespace (c));
2908
2909       phase2_ungetc (c);
2910
2911       if (c == '"' || c == '\'' || c == '`'
2912           || ((c == '/' || c == '?')
2913               && prefer_regexp_over_division (tp->last_type))
2914           || c == 'q')
2915         {
2916           token_ty *qstring = x_perl_lex (mlp);
2917           if (qstring->type != token_type_string)
2918             {
2919               /* assert (qstring->type == token_type_symbol) */
2920               x_perl_unlex (qstring);
2921               return string;
2922             }
2923
2924           extract_quotelike_pass3 (qstring, error_level);
2925           len += strlen (qstring->string);
2926           string = xrealloc (string, len);
2927           strcat (string, qstring->string);
2928           free_token (qstring);
2929         }
2930     }
2931 }
2932
2933 /* The file is broken into tokens.  Scan the token stream, looking for
2934    a keyword, followed by a left paren, followed by a string.  When we
2935    see this sequence, we have something to remember.  We assume we are
2936    looking at a valid Perl program, and leave the complaints about
2937    the grammar to the compiler.
2938
2939      Normal handling: Look for
2940        keyword ( ... msgid ... )
2941      Plural handling: Look for
2942        keyword ( ... msgid ... msgid_plural ... )
2943
2944    We use recursion because the arguments before msgid or between msgid
2945    and msgid_plural can contain subexpressions of the same form.
2946
2947    In Perl, parentheses around function arguments can be omitted.
2948
2949    The general rules are:
2950      1) Functions declared with a prototype take exactly the specified number
2951         of arguments.
2952           sub one_arg ($) { ... }
2953           sub two_args ($$) { ... }
2954      2) When a function name is immediately followed by an opening parenthesis,
2955         the argument list ends at the corresponding closing parenthesis.
2956
2957    If rule 1 and rule 2 are contradictory, i.e. when the program calls a
2958    function with an explicit argument list and the wrong number of arguments,
2959    the program is invalid:
2960      sub two_args ($$) { ... }
2961      foo two_args (x), y             - invalid due to rules 1 and 2
2962
2963    Ambiguities are resolved as follows:
2964      3) Some built-ins, such as 'abs', 'sqrt', 'sin', 'cos', ..., and functions
2965         declared with a prototype of exactly one argument take exactly one
2966         argument:
2967           foo sin x, y  ==>  foo (sin (x), y)
2968           sub one_arg ($) { ... }
2969           foo one_arg x, y, z  ==>  foo (one_arg (x), y, z)
2970      4) Other identifiers, if not immediately followed by an opening
2971         parenthesis, consume the entire remaining argument list:
2972           foo bar x, y  ==>  foo (bar (x, y))
2973           sub two_args ($$) { ... }
2974           foo two_args x, y  ==>  foo (two_args (x, y))
2975
2976    Other series of comma separated expressions without a function name at
2977    the beginning are comma expressions:
2978           sub two_args ($$) { ... }
2979           foo two_args x, (y, z)  ==>  foo (two_args (x, (y, z)))
2980    Note that the evaluation of comma expressions returns a list of values
2981    when in list context (e.g. inside the argument list of a function without
2982    prototype) but only one value when inside the argument list of a function
2983    with a prototype:
2984           sub print3 ($$$) { print @_ }
2985           print3 5, (6, 7), 8  ==>  578
2986           print 5, (6, 7), 8  ==>  5678
2987
2988    Where rule 3 or 4 contradict rule 1 or 2, the program is invalid:
2989      sin (x, y)                      - invalid due to rules 2 and 3
2990      sub one_arg ($) { ... }
2991      one_arg (x, y)                  - invalid due to rules 2 and 3
2992      sub two_args ($$) { ... }
2993      foo two_args x, y, z            - invalid due to rules 1 and 4
2994  */
2995
2996 /* Extract messages until the next balanced closing parenthesis.
2997    Extracted messages are added to MLP.
2998
2999    DELIM can be either token_type_rbrace, token_type_rbracket,
3000    token_type_rparen.  Additionally, if COMMA_DELIM is true, parsing
3001    stops at the next comma outside parentheses.
3002
3003    ARG is the current argument list position, starts with 1.
3004    ARGPARSER is the corresponding argument list parser.
3005
3006    Returns true for EOF, false otherwise.  */
3007
3008 static bool
3009 extract_balanced (message_list_ty *mlp,
3010                   token_type_ty delim, bool eat_delim, bool comma_delim,
3011                   flag_context_ty outer_context,
3012                   flag_context_list_iterator_ty context_iter,
3013                   int arg, struct arglist_parser *argparser)
3014 {
3015   /* Whether to implicitly assume the next tokens are arguments even without
3016      a '('.  */
3017   bool next_is_argument = false;
3018   /* Parameters of the keyword just seen.  Defined only when next_is_argument
3019      is true.  */
3020   const struct callshapes *next_shapes = NULL;
3021   struct arglist_parser *next_argparser = NULL;
3022
3023   /* Whether to not consider strings until the next comma.  */
3024   bool skip_until_comma = false;
3025
3026   /* Context iterator that will be used if the next token is a '('.  */
3027   flag_context_list_iterator_ty next_context_iter =
3028     passthrough_context_list_iterator;
3029   /* Current context.  */
3030   flag_context_ty inner_context =
3031     inherited_context (outer_context,
3032                        flag_context_list_iterator_advance (&context_iter));
3033
3034 #if DEBUG_PERL
3035   static int nesting_level = 0;
3036
3037   ++nesting_level;
3038 #endif
3039
3040   for (;;)
3041     {
3042       /* The current token.  */
3043       token_ty *tp;
3044
3045       tp = x_perl_lex (mlp);
3046
3047       if (delim == tp->type)
3048         {
3049           xgettext_current_source_encoding = po_charset_utf8;
3050           arglist_parser_done (argparser, arg);
3051           xgettext_current_source_encoding = xgettext_global_source_encoding;
3052           if (next_argparser != NULL)
3053             free (next_argparser);
3054 #if DEBUG_PERL
3055           fprintf (stderr, "%s:%d: extract_balanced finished (%d)\n",
3056                    logical_file_name, tp->line_number, --nesting_level);
3057 #endif
3058           if (eat_delim)
3059             free_token (tp);
3060           else
3061             /* Preserve the delimiter for the caller.  */
3062             x_perl_unlex (tp);
3063           return false;
3064         }
3065
3066       if (comma_delim && tp->type == token_type_comma)
3067         {
3068           xgettext_current_source_encoding = po_charset_utf8;
3069           arglist_parser_done (argparser, arg);
3070           xgettext_current_source_encoding = xgettext_global_source_encoding;
3071           if (next_argparser != NULL)
3072             free (next_argparser);
3073 #if DEBUG_PERL
3074           fprintf (stderr, "%s:%d: extract_balanced finished at comma (%d)\n",
3075                    logical_file_name, tp->line_number, --nesting_level);
3076 #endif
3077           x_perl_unlex (tp);
3078           return false;
3079         }
3080
3081       if (next_is_argument && tp->type != token_type_lparen)
3082         {
3083           /* An argument list starts, even though there is no '('.  */
3084           bool next_comma_delim;
3085
3086           x_perl_unlex (tp);
3087
3088           if (next_shapes != NULL)
3089             /* We know something about the function being called.  Assume
3090                that it consumes only one argument if no argument number or
3091                total > 1 is specified.  */
3092             {
3093               size_t i;
3094
3095               next_comma_delim = true;
3096               for (i = 0; i < next_shapes->nshapes; i++)
3097                 {
3098                   const struct callshape *shape = &next_shapes->shapes[i];
3099
3100                   if (shape->argnum1 > 1
3101                       || shape->argnum2 > 1
3102                       || shape->argnumc > 1
3103                       || shape->argtotal > 1)
3104                     next_comma_delim = false;
3105                 }
3106             }
3107           else
3108             /* We know nothing about the function being called.  It could be
3109                a function prototyped to take only one argument, or on the other
3110                hand it could be prototyped to take more than one argument or an
3111                arbitrary argument list or it could be unprototyped.  Due to
3112                the way the parser works, assuming the first case gives the
3113                best results.  */
3114             next_comma_delim = true;
3115
3116           if (extract_balanced (mlp, delim, false, next_comma_delim,
3117                                 inner_context, next_context_iter,
3118                                 1, next_argparser))
3119             {
3120               xgettext_current_source_encoding = po_charset_utf8;
3121               arglist_parser_done (argparser, arg);
3122               xgettext_current_source_encoding = xgettext_global_source_encoding;
3123               return true;
3124             }
3125
3126           next_is_argument = false;
3127           next_argparser = NULL;
3128           next_context_iter = null_context_list_iterator;
3129           continue;
3130         }
3131
3132       switch (tp->type)
3133         {
3134         case token_type_symbol:
3135         case token_type_keyword_symbol:
3136 #if DEBUG_PERL
3137           fprintf (stderr, "%s:%d: type symbol (%d) \"%s\"\n",
3138                    logical_file_name, tp->line_number, nesting_level,
3139                    tp->string);
3140 #endif
3141
3142           {
3143             void *keyword_value;
3144
3145             if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
3146                                  &keyword_value) == 0)
3147               {
3148                 const struct callshapes *shapes =
3149                   (const struct callshapes *) keyword_value;
3150
3151                 next_shapes = shapes;
3152                 next_argparser = arglist_parser_alloc (mlp, shapes);
3153               }
3154             else
3155               {
3156                 next_shapes = NULL;
3157                 next_argparser = arglist_parser_alloc (mlp, NULL);
3158               }
3159           }
3160           next_is_argument = true;
3161           next_context_iter =
3162             flag_context_list_iterator (
3163               flag_context_list_table_lookup (
3164                 flag_context_list_table,
3165                 tp->string, strlen (tp->string)));
3166           break;
3167
3168         case token_type_variable:
3169 #if DEBUG_PERL
3170           fprintf (stderr, "%s:%d: type variable (%d) \"%s\"\n",
3171                    logical_file_name, tp->line_number, nesting_level,
3172                    tp->string);
3173 #endif
3174           next_is_argument = false;
3175           if (next_argparser != NULL)
3176             free (next_argparser);
3177           next_argparser = NULL;
3178           next_context_iter = null_context_list_iterator;
3179           break;
3180
3181         case token_type_object:
3182 #if DEBUG_PERL
3183           fprintf (stderr, "%s:%d: type object (%d) \"%s->\"\n",
3184                    logical_file_name, tp->line_number, nesting_level,
3185                    tp->string);
3186 #endif
3187           next_is_argument = false;
3188           if (next_argparser != NULL)
3189             free (next_argparser);
3190           next_argparser = NULL;
3191           next_context_iter = null_context_list_iterator;
3192           break;
3193
3194         case token_type_lparen:
3195 #if DEBUG_PERL
3196           fprintf (stderr, "%s:%d: type left parenthesis (%d)\n",
3197                    logical_file_name, tp->line_number, nesting_level);
3198 #endif
3199           if (next_is_argument)
3200             {
3201               /* Parse the argument list of a function call.  */
3202               if (extract_balanced (mlp, token_type_rparen, true, false,
3203                                     inner_context, next_context_iter,
3204                                     1, next_argparser))
3205                 {
3206                   xgettext_current_source_encoding = po_charset_utf8;
3207                   arglist_parser_done (argparser, arg);
3208                   xgettext_current_source_encoding = xgettext_global_source_encoding;
3209                   return true;
3210                 }
3211               next_is_argument = false;
3212               next_argparser = NULL;
3213             }
3214           else
3215             {
3216               /* Parse a parenthesized expression or comma expression.  */
3217               if (extract_balanced (mlp, token_type_rparen, true, false,
3218                                     inner_context, next_context_iter,
3219                                     arg, arglist_parser_clone (argparser)))
3220                 {
3221                   xgettext_current_source_encoding = po_charset_utf8;
3222                   arglist_parser_done (argparser, arg);
3223                   xgettext_current_source_encoding = xgettext_global_source_encoding;
3224                   if (next_argparser != NULL)
3225                     free (next_argparser);
3226                   free_token (tp);
3227                   return true;
3228                 }
3229               next_is_argument = false;
3230               if (next_argparser != NULL)
3231                 free (next_argparser);
3232               next_argparser = NULL;
3233             }
3234           skip_until_comma = true;
3235           next_context_iter = null_context_list_iterator;
3236           break;
3237
3238         case token_type_rparen:
3239 #if DEBUG_PERL
3240           fprintf (stderr, "%s:%d: type right parenthesis (%d)\n",
3241                    logical_file_name, tp->line_number, nesting_level);
3242 #endif
3243           next_is_argument = false;
3244           if (next_argparser != NULL)
3245             free (next_argparser);
3246           next_argparser = NULL;
3247           skip_until_comma = true;
3248           next_context_iter = null_context_list_iterator;
3249           break;
3250
3251         case token_type_comma:
3252         case token_type_fat_comma:
3253 #if DEBUG_PERL
3254           fprintf (stderr, "%s:%d: type comma (%d)\n",
3255                    logical_file_name, tp->line_number, nesting_level);
3256 #endif
3257           if (arglist_parser_decidedp (argparser, arg))
3258             {
3259               /* We have missed the argument.  */
3260               xgettext_current_source_encoding = po_charset_utf8;
3261               arglist_parser_done (argparser, arg);
3262               xgettext_current_source_encoding = xgettext_global_source_encoding;
3263               argparser = arglist_parser_alloc (mlp, NULL);
3264               arg = 0;
3265             }
3266           arg++;
3267 #if DEBUG_PERL
3268           fprintf (stderr, "%s:%d: arg: %d\n",
3269                    real_file_name, tp->line_number, arg);
3270 #endif
3271           inner_context =
3272             inherited_context (outer_context,
3273                                flag_context_list_iterator_advance (
3274                                  &context_iter));
3275           next_is_argument = false;
3276           if (next_argparser != NULL)
3277             free (next_argparser);
3278           next_argparser = NULL;
3279           skip_until_comma = false;
3280           next_context_iter = passthrough_context_list_iterator;
3281           break;
3282
3283         case token_type_string:
3284 #if DEBUG_PERL
3285           fprintf (stderr, "%s:%d: type string (%d): \"%s\"\n",
3286                    logical_file_name, tp->line_number, nesting_level,
3287                    tp->string);
3288 #endif
3289
3290           if (extract_all)
3291             {
3292               char *string = collect_message (mlp, tp, EXIT_SUCCESS);
3293               lex_pos_ty pos;
3294
3295               pos.file_name = logical_file_name;
3296               pos.line_number = tp->line_number;
3297               xgettext_current_source_encoding = po_charset_utf8;
3298               remember_a_message (mlp, NULL, string, inner_context, &pos,
3299                                   NULL, tp->comment);
3300               xgettext_current_source_encoding = xgettext_global_source_encoding;
3301             }
3302           else if (!skip_until_comma)
3303             {
3304               /* Need to collect the complete string, with error checking,
3305                  only if the argument ARG is used in ARGPARSER.  */
3306               bool must_collect = false;
3307               {
3308                 size_t nalternatives = argparser->nalternatives;
3309                 size_t i;
3310
3311                 for (i = 0; i < nalternatives; i++)
3312                   {
3313                     struct partial_call *cp = &argparser->alternative[i];
3314
3315                     if (arg == cp->argnumc
3316                         || arg == cp->argnum1 || arg == cp->argnum2)
3317                       must_collect = true;
3318                   }
3319               }
3320
3321               if (must_collect)
3322                 {
3323                   char *string = collect_message (mlp, tp, EXIT_FAILURE);
3324
3325                   xgettext_current_source_encoding = po_charset_utf8;
3326                   arglist_parser_remember (argparser, arg,
3327                                            string, inner_context,
3328                                            logical_file_name, tp->line_number,
3329                                            tp->comment);
3330                   xgettext_current_source_encoding = xgettext_global_source_encoding;
3331                 }
3332             }
3333
3334           if (arglist_parser_decidedp (argparser, arg))
3335             {
3336               xgettext_current_source_encoding = po_charset_utf8;
3337               arglist_parser_done (argparser, arg);
3338               xgettext_current_source_encoding = xgettext_global_source_encoding;
3339               argparser = arglist_parser_alloc (mlp, NULL);
3340             }
3341
3342           next_is_argument = false;
3343           if (next_argparser != NULL)
3344             free (next_argparser);
3345           next_argparser = NULL;
3346           next_context_iter = null_context_list_iterator;
3347           break;
3348
3349         case token_type_number:
3350 #if DEBUG_PERL
3351           fprintf (stderr, "%s:%d: type number (%d)\n",
3352                    logical_file_name, tp->line_number, nesting_level);
3353 #endif
3354           next_is_argument = false;
3355           if (next_argparser != NULL)
3356             free (next_argparser);
3357           next_argparser = NULL;
3358           next_context_iter = null_context_list_iterator;
3359           break;
3360
3361         case token_type_eof:
3362 #if DEBUG_PERL
3363           fprintf (stderr, "%s:%d: type EOF (%d)\n",
3364                    logical_file_name, tp->line_number, nesting_level);
3365 #endif
3366           xgettext_current_source_encoding = po_charset_utf8;
3367           arglist_parser_done (argparser, arg);
3368           xgettext_current_source_encoding = xgettext_global_source_encoding;
3369           if (next_argparser != NULL)
3370             free (next_argparser);
3371           next_argparser = NULL;
3372           free_token (tp);
3373           return true;
3374
3375         case token_type_lbrace:
3376 #if DEBUG_PERL
3377           fprintf (stderr, "%s:%d: type lbrace (%d)\n",
3378                    logical_file_name, tp->line_number, nesting_level);
3379 #endif
3380           if (extract_balanced (mlp, token_type_rbrace, true, false,
3381                                 null_context, null_context_list_iterator,
3382                                 1, arglist_parser_alloc (mlp, NULL)))
3383             {
3384               xgettext_current_source_encoding = po_charset_utf8;
3385               arglist_parser_done (argparser, arg);
3386               xgettext_current_source_encoding = xgettext_global_source_encoding;
3387               if (next_argparser != NULL)
3388                 free (next_argparser);
3389               free_token (tp);
3390               return true;
3391             }
3392           next_is_argument = false;
3393           if (next_argparser != NULL)
3394             free (next_argparser);
3395           next_argparser = NULL;
3396           next_context_iter = null_context_list_iterator;
3397           break;
3398
3399         case token_type_rbrace:
3400 #if DEBUG_PERL
3401           fprintf (stderr, "%s:%d: type rbrace (%d)\n",
3402                    logical_file_name, tp->line_number, nesting_level);
3403 #endif
3404           next_is_argument = false;
3405           if (next_argparser != NULL)
3406             free (next_argparser);
3407           next_argparser = NULL;
3408           next_context_iter = null_context_list_iterator;
3409           break;
3410
3411         case token_type_lbracket:
3412 #if DEBUG_PERL
3413           fprintf (stderr, "%s:%d: type lbracket (%d)\n",
3414                    logical_file_name, tp->line_number, nesting_level);
3415 #endif
3416           if (extract_balanced (mlp, token_type_rbracket, true, false,
3417                                 null_context, null_context_list_iterator,
3418                                 1, arglist_parser_alloc (mlp, NULL)))
3419             {
3420               xgettext_current_source_encoding = po_charset_utf8;
3421               arglist_parser_done (argparser, arg);
3422               xgettext_current_source_encoding = xgettext_global_source_encoding;
3423               if (next_argparser != NULL)
3424                 free (next_argparser);
3425               free_token (tp);
3426               return true;
3427             }
3428           next_is_argument = false;
3429           if (next_argparser != NULL)
3430             free (next_argparser);
3431           next_argparser = NULL;
3432           next_context_iter = null_context_list_iterator;
3433           break;
3434
3435         case token_type_rbracket:
3436 #if DEBUG_PERL
3437           fprintf (stderr, "%s:%d: type rbracket (%d)\n",
3438                    logical_file_name, tp->line_number, nesting_level);
3439 #endif
3440           next_is_argument = false;
3441           if (next_argparser != NULL)
3442             free (next_argparser);
3443           next_argparser = NULL;
3444           next_context_iter = null_context_list_iterator;
3445           break;
3446
3447         case token_type_semicolon:
3448 #if DEBUG_PERL
3449           fprintf (stderr, "%s:%d: type semicolon (%d)\n",
3450                    logical_file_name, tp->line_number, nesting_level);
3451 #endif
3452
3453           /* The ultimate sign.  */
3454           xgettext_current_source_encoding = po_charset_utf8;
3455           arglist_parser_done (argparser, arg);
3456           xgettext_current_source_encoding = xgettext_global_source_encoding;
3457           argparser = arglist_parser_alloc (mlp, NULL);
3458
3459           /* FIXME: Instead of resetting outer_context here, it may be better
3460              to recurse in the next_is_argument handling above, waiting for
3461              the next semicolon or other statement terminator.  */
3462           outer_context = null_context;
3463           context_iter = null_context_list_iterator;
3464           next_is_argument = false;
3465           if (next_argparser != NULL)
3466             free (next_argparser);
3467           next_argparser = NULL;
3468           next_context_iter = passthrough_context_list_iterator;
3469           inner_context =
3470             inherited_context (outer_context,
3471                                flag_context_list_iterator_advance (
3472                                  &context_iter));
3473           break;
3474
3475         case token_type_dereference:
3476 #if DEBUG_PERL
3477           fprintf (stderr, "%s:%d: type dereference (%d)\n",
3478                    logical_file_name, tp->line_number, nesting_level);
3479 #endif
3480           next_is_argument = false;
3481           if (next_argparser != NULL)
3482             free (next_argparser);
3483           next_argparser = NULL;
3484           next_context_iter = null_context_list_iterator;
3485           break;
3486
3487         case token_type_dot:
3488 #if DEBUG_PERL
3489           fprintf (stderr, "%s:%d: type dot (%d)\n",
3490                    logical_file_name, tp->line_number, nesting_level);
3491 #endif
3492           next_is_argument = false;
3493           if (next_argparser != NULL)
3494             free (next_argparser);
3495           next_argparser = NULL;
3496           next_context_iter = null_context_list_iterator;
3497           break;
3498
3499         case token_type_named_op:
3500 #if DEBUG_PERL
3501           fprintf (stderr, "%s:%d: type named operator (%d): %s\n",
3502                    logical_file_name, tp->line_number, nesting_level,
3503                    tp->string);
3504 #endif
3505           next_is_argument = false;
3506           if (next_argparser != NULL)
3507             free (next_argparser);
3508           next_argparser = NULL;
3509           next_context_iter = null_context_list_iterator;
3510           break;
3511
3512         case token_type_regex_op:
3513 #if DEBUG_PERL
3514           fprintf (stderr, "%s:%d: type regex operator (%d)\n",
3515                    logical_file_name, tp->line_number, nesting_level);
3516 #endif
3517           next_is_argument = false;
3518           if (next_argparser != NULL)
3519             free (next_argparser);
3520           next_argparser = NULL;
3521           next_context_iter = null_context_list_iterator;
3522           break;
3523
3524         case token_type_other:
3525 #if DEBUG_PERL
3526           fprintf (stderr, "%s:%d: type other (%d)\n",
3527                    logical_file_name, tp->line_number, nesting_level);
3528 #endif
3529           next_is_argument = false;
3530           if (next_argparser != NULL)
3531             free (next_argparser);
3532           next_argparser = NULL;
3533           next_context_iter = null_context_list_iterator;
3534           break;
3535
3536         default:
3537           fprintf (stderr, "%s:%d: unknown token type %d\n",
3538                    real_file_name, tp->line_number, tp->type);
3539           abort ();
3540         }
3541
3542       free_token (tp);
3543     }
3544 }
3545
3546 void
3547 extract_perl (FILE *f, const char *real_filename, const char *logical_filename,
3548               flag_context_list_table_ty *flag_table,
3549               msgdomain_list_ty *mdlp)
3550 {
3551   message_list_ty *mlp = mdlp->item[0]->messages;
3552
3553   fp = f;
3554   real_file_name = real_filename;
3555   logical_file_name = xstrdup (logical_filename);
3556   line_number = 0;
3557
3558   last_comment_line = -1;
3559   last_non_comment_line = -1;
3560
3561   flag_context_list_table = flag_table;
3562
3563   init_keywords ();
3564
3565   token_stack.items = NULL;
3566   token_stack.nitems = 0;
3567   token_stack.nitems_max = 0;
3568   linesize = 0;
3569   linepos = 0;
3570   eaten_here = 0;
3571   end_of_file = false;
3572
3573   /* Safe assumption.  */
3574   last_token_type = token_type_semicolon;
3575
3576   /* Eat tokens until eof is seen.  When extract_balanced returns
3577      due to an unbalanced closing brace, just restart it.  */
3578   while (!extract_balanced (mlp, token_type_rbrace, true, false,
3579                             null_context, null_context_list_iterator,
3580                             1, arglist_parser_alloc (mlp, NULL)))
3581     ;
3582
3583   fp = NULL;
3584   real_file_name = NULL;
3585   free (logical_file_name);
3586   logical_file_name = NULL;
3587   line_number = 0;
3588   last_token_type = token_type_semicolon;
3589   token_stack_free (&token_stack);
3590   eaten_here = 0;
3591   end_of_file = true;
3592 }