gettext-tools/src/x-php.c

   1 /* xgettext PHP backend.
   2    Copyright (C) 2001-2003, 2005-2010 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <bruno@clisp.org>, 2002.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22
  23 /* Specification.  */
  24 #include "x-php.h"
  25
  26 #include <errno.h>
  27 #include <stdbool.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30
  31 #include "message.h"
  32 #include "xgettext.h"
  33 #include "error.h"
  34 #include "xalloc.h"
  35 #include "gettext.h"
  36
  37 #define _(s) gettext(s)
  38
  39 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  40
  41
  42 /* The PHP syntax is defined in phpdoc/manual/langref.html.
  43    See also php-4.1.0/Zend/zend_language_scanner.l
  44    and      php-4.1.0/Zend/zend_language_parser.y.
  45    Note that variable and function names can contain bytes in the range
  46    0x7f..0xff; see
  47      http://www.php.net/manual/en/language.variables.php
  48      http://www.php.net/manual/en/language.functions.php  */
  49
  50
  51 /* ====================== Keyword set customization.  ====================== */
  52
  53 /* If true extract all strings.  */
  54 static bool extract_all = false;
  55
  56 static hash_table keywords;
  57 static bool default_keywords = true;
  58
  59
  60 void
  61 x_php_extract_all ()
  62 {
  63   extract_all = true;
  64 }
  65
  66
  67 void
  68 x_php_keyword (const char *name)
  69 {
  70   if (name == NULL)
  71     default_keywords = false;
  72   else
  73     {
  74       const char *end;
  75       struct callshape shape;
  76       const char *colon;
  77
  78       if (keywords.table == NULL)
  79         hash_init (&keywords, 100);
  80
  81       split_keywordspec (name, &end, &shape);
  82
  83       /* The characters between name and end should form a valid C identifier.
  84          A colon means an invalid parse in split_keywordspec().  */
  85       colon = strchr (name, ':');
  86       if (colon == NULL || colon >= end)
  87         insert_keyword_callshape (&keywords, name, end - name, &shape);
  88     }
  89 }
  90
  91 /* Finish initializing the keywords hash table.
  92    Called after argument processing, before each file is processed.  */
  93 static void
  94 init_keywords ()
  95 {
  96   if (default_keywords)
  97     {
  98       /* When adding new keywords here, also update the documentation in
  99          xgettext.texi!  */
 100       x_php_keyword ("_");
 101       x_php_keyword ("gettext");
 102       x_php_keyword ("dgettext:2");
 103       x_php_keyword ("dcgettext:2");
 104       /* The following were added in PHP 4.2.0.  */
 105       x_php_keyword ("ngettext:1,2");
 106       x_php_keyword ("dngettext:2,3");
 107       x_php_keyword ("dcngettext:2,3");
 108       default_keywords = false;
 109     }
 110 }
 111
 112 void
 113 init_flag_table_php ()
 114 {
 115   xgettext_record_flag ("_:1:pass-php-format");
 116   xgettext_record_flag ("gettext:1:pass-php-format");
 117   xgettext_record_flag ("dgettext:2:pass-php-format");
 118   xgettext_record_flag ("dcgettext:2:pass-php-format");
 119   xgettext_record_flag ("ngettext:1:pass-php-format");
 120   xgettext_record_flag ("ngettext:2:pass-php-format");
 121   xgettext_record_flag ("dngettext:2:pass-php-format");
 122   xgettext_record_flag ("dngettext:3:pass-php-format");
 123   xgettext_record_flag ("dcngettext:2:pass-php-format");
 124   xgettext_record_flag ("dcngettext:3:pass-php-format");
 125   xgettext_record_flag ("sprintf:1:php-format");
 126   xgettext_record_flag ("printf:1:php-format");
 127 }
 128
 129
 130 /* ======================== Reading of characters.  ======================== */
 131
 132
 133 /* Real filename, used in error messages about the input file.  */
 134 static const char *real_file_name;
 135
 136 /* Logical filename and line number, used to label the extracted messages.  */
 137 static char *logical_file_name;
 138 static int line_number;
 139
 140 /* The input file stream.  */
 141 static FILE *fp;
 142
 143
 144 /* 1. line_number handling.  */
 145
 146 static unsigned char phase1_pushback[2];
 147 static int phase1_pushback_length;
 148
 149 static int
 150 phase1_getc ()
 151 {
 152   int c;
 153
 154   if (phase1_pushback_length)
 155     c = phase1_pushback[--phase1_pushback_length];
 156   else
 157     {
 158       c = getc (fp);
 159
 160       if (c == EOF)
 161         {
 162           if (ferror (fp))
 163             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 164                    real_file_name);
 165           return EOF;
 166         }
 167     }
 168
 169   if (c == '\n')
 170     line_number++;
 171
 172   return c;
 173 }
 174
 175 /* Supports 2 characters of pushback.  */
 176 static void
 177 phase1_ungetc (int c)
 178 {
 179   if (c != EOF)
 180     {
 181       if (c == '\n')
 182         --line_number;
 183
 184       if (phase1_pushback_length == SIZEOF (phase1_pushback))
 185         abort ();
 186       phase1_pushback[phase1_pushback_length++] = c;
 187     }
 188 }
 189
 190
 191 /* 2. Ignore HTML sections.  They are equivalent to PHP echo commands and
 192    therefore don't contain translatable strings.  */
 193
 194 static void
 195 skip_html ()
 196 {
 197   for (;;)
 198     {
 199       int c = phase1_getc ();
 200
 201       if (c == EOF)
 202         return;
 203
 204       if (c == '<')
 205         {
 206           int c2 = phase1_getc ();
 207
 208           if (c2 == EOF)
 209             break;
 210
 211           if (c2 == '?')
 212             {
 213               /* <?php is the normal way to enter PHP mode. <? and <?= are
 214                  recognized by PHP depending on a configuration setting.  */
 215               int c3 = phase1_getc ();
 216
 217               if (c3 != '=')
 218                 phase1_ungetc (c3);
 219
 220               return;
 221             }
 222
 223           if (c2 == '%')
 224             {
 225               /* <% and <%= are recognized by PHP depending on a configuration
 226                  setting.  */
 227               int c3 = phase1_getc ();
 228
 229               if (c3 != '=')
 230                 phase1_ungetc (c3);
 231
 232               return;
 233             }
 234
 235           if (c2 == '<')
 236             {
 237               phase1_ungetc (c2);
 238               continue;
 239             }
 240
 241           /* < script language = php >
 242              < script language = "php" >
 243              < script language = 'php' >
 244              are always recognized.  */
 245           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
 246             c2 = phase1_getc ();
 247           if (c2 != 's' && c2 != 'S')
 248             {
 249               phase1_ungetc (c2);
 250               continue;
 251             }
 252           c2 = phase1_getc ();
 253           if (c2 != 'c' && c2 != 'C')
 254             {
 255               phase1_ungetc (c2);
 256               continue;
 257             }
 258           c2 = phase1_getc ();
 259           if (c2 != 'r' && c2 != 'R')
 260             {
 261               phase1_ungetc (c2);
 262               continue;
 263             }
 264           c2 = phase1_getc ();
 265           if (c2 != 'i' && c2 != 'I')
 266             {
 267               phase1_ungetc (c2);
 268               continue;
 269             }
 270           c2 = phase1_getc ();
 271           if (c2 != 'p' && c2 != 'P')
 272             {
 273               phase1_ungetc (c2);
 274               continue;
 275             }
 276           c2 = phase1_getc ();
 277           if (c2 != 't' && c2 != 'T')
 278             {
 279               phase1_ungetc (c2);
 280               continue;
 281             }
 282           c2 = phase1_getc ();
 283           if (!(c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'))
 284             {
 285               phase1_ungetc (c2);
 286               continue;
 287             }
 288           do
 289             c2 = phase1_getc ();
 290           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
 291           if (c2 != 'l' && c2 != 'L')
 292             {
 293               phase1_ungetc (c2);
 294               continue;
 295             }
 296           c2 = phase1_getc ();
 297           if (c2 != 'a' && c2 != 'A')
 298             {
 299               phase1_ungetc (c2);
 300               continue;
 301             }
 302           c2 = phase1_getc ();
 303           if (c2 != 'n' && c2 != 'N')
 304             {
 305               phase1_ungetc (c2);
 306               continue;
 307             }
 308           c2 = phase1_getc ();
 309           if (c2 != 'g' && c2 != 'G')
 310             {
 311               phase1_ungetc (c2);
 312               continue;
 313             }
 314           c2 = phase1_getc ();
 315           if (c2 != 'u' && c2 != 'U')
 316             {
 317               phase1_ungetc (c2);
 318               continue;
 319             }
 320           c2 = phase1_getc ();
 321           if (c2 != 'a' && c2 != 'A')
 322             {
 323               phase1_ungetc (c2);
 324               continue;
 325             }
 326           c2 = phase1_getc ();
 327           if (c2 != 'g' && c2 != 'G')
 328             {
 329               phase1_ungetc (c2);
 330               continue;
 331             }
 332           c2 = phase1_getc ();
 333           if (c2 != 'e' && c2 != 'E')
 334             {
 335               phase1_ungetc (c2);
 336               continue;
 337             }
 338           c2 = phase1_getc ();
 339           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
 340             c2 = phase1_getc ();
 341           if (c2 != '=')
 342             {
 343               phase1_ungetc (c2);
 344               continue;
 345             }
 346           c2 = phase1_getc ();
 347           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
 348             c2 = phase1_getc ();
 349           if (c2 == '"')
 350             {
 351               c2 = phase1_getc ();
 352               if (c2 != 'p')
 353                 {
 354                   phase1_ungetc (c2);
 355                   continue;
 356                 }
 357               c2 = phase1_getc ();
 358               if (c2 != 'h')
 359                 {
 360                   phase1_ungetc (c2);
 361                   continue;
 362                 }
 363               c2 = phase1_getc ();
 364               if (c2 != 'p')
 365                 {
 366                   phase1_ungetc (c2);
 367                   continue;
 368                 }
 369               c2 = phase1_getc ();
 370               if (c2 != '"')
 371                 {
 372                   phase1_ungetc (c2);
 373                   continue;
 374                 }
 375             }
 376           else if (c2 == '\'')
 377             {
 378               c2 = phase1_getc ();
 379               if (c2 != 'p')
 380                 {
 381                   phase1_ungetc (c2);
 382                   continue;
 383                 }
 384               c2 = phase1_getc ();
 385               if (c2 != 'h')
 386                 {
 387                   phase1_ungetc (c2);
 388                   continue;
 389                 }
 390               c2 = phase1_getc ();
 391               if (c2 != 'p')
 392                 {
 393                   phase1_ungetc (c2);
 394                   continue;
 395                 }
 396               c2 = phase1_getc ();
 397               if (c2 != '\'')
 398                 {
 399                   phase1_ungetc (c2);
 400                   continue;
 401                 }
 402             }
 403           else
 404             {
 405               if (c2 != 'p')
 406                 {
 407                   phase1_ungetc (c2);
 408                   continue;
 409                 }
 410               c2 = phase1_getc ();
 411               if (c2 != 'h')
 412                 {
 413                   phase1_ungetc (c2);
 414                   continue;
 415                 }
 416               c2 = phase1_getc ();
 417               if (c2 != 'p')
 418                 {
 419                   phase1_ungetc (c2);
 420                   continue;
 421                 }
 422             }
 423           c2 = phase1_getc ();
 424           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
 425             c2 = phase1_getc ();
 426           if (c2 != '>')
 427             {
 428               phase1_ungetc (c2);
 429               continue;
 430             }
 431           return;
 432         }
 433     }
 434 }
 435
 436 #if 0
 437
 438 static unsigned char phase2_pushback[1];
 439 static int phase2_pushback_length;
 440
 441 static int
 442 phase2_getc ()
 443 {
 444   int c;
 445
 446   if (phase2_pushback_length)
 447     return phase2_pushback[--phase2_pushback_length];
 448
 449   c = phase1_getc ();
 450   switch (c)
 451     {
 452     case '?':
 453     case '%':
 454       {
 455         int c2 = phase1_getc ();
 456         if (c2 == '>')
 457           {
 458             /* ?> and %> terminate PHP mode and switch back to HTML mode.  */
 459             skip_html ();
 460             return ' ';
 461           }
 462         phase1_ungetc (c2);
 463       }
 464       break;
 465
 466     case '<':
 467       {
 468         int c2 = phase1_getc ();
 469
 470         /* < / script > terminates PHP mode and switches back to HTML mode.  */
 471         while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
 472           c2 = phase1_getc ();
 473         if (c2 == '/')
 474           {
 475             do
 476               c2 = phase1_getc ();
 477             while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
 478             if (c2 == 's' || c2 == 'S')
 479               {
 480                 c2 = phase1_getc ();
 481                 if (c2 == 'c' || c2 == 'C')
 482                   {
 483                     c2 = phase1_getc ();
 484                     if (c2 == 'r' || c2 == 'R')
 485                       {
 486                         c2 = phase1_getc ();
 487                         if (c2 == 'i' || c2 == 'I')
 488                           {
 489                             c2 = phase1_getc ();
 490                             if (c2 == 'p' || c2 == 'P')
 491                               {
 492                                 c2 = phase1_getc ();
 493                                 if (c2 == 't' || c2 == 'T')
 494                                   {
 495                                     do
 496                                       c2 = phase1_getc ();
 497                                     while (c2 == ' ' || c2 == '\t'
 498                                            || c2 == '\n' || c2 == '\r');
 499                                     if (c2 == '>')
 500                                       {
 501                                         skip_html ();
 502                                         return ' ';
 503                                       }
 504                                   }
 505                               }
 506                           }
 507                       }
 508                   }
 509               }
 510           }
 511         phase1_ungetc (c2);
 512       }
 513       break;
 514     }
 515
 516   return c;
 517 }
 518
 519 static void
 520 phase2_ungetc (int c)
 521 {
 522   if (c != EOF)
 523     {
 524       if (phase2_pushback_length == SIZEOF (phase2_pushback))
 525         abort ();
 526       phase2_pushback[phase2_pushback_length++] = c;
 527     }
 528 }
 529
 530 #endif
 531
 532
 533 /* Accumulating comments.  */
 534
 535 static char *buffer;
 536 static size_t bufmax;
 537 static size_t buflen;
 538
 539 static inline void
 540 comment_start ()
 541 {
 542   buflen = 0;
 543 }
 544
 545 static inline void
 546 comment_add (int c)
 547 {
 548   if (buflen >= bufmax)
 549     {
 550       bufmax = 2 * bufmax + 10;
 551       buffer = xrealloc (buffer, bufmax);
 552     }
 553   buffer[buflen++] = c;
 554 }
 555
 556 static inline void
 557 comment_line_end (size_t chars_to_remove)
 558 {
 559   buflen -= chars_to_remove;
 560   while (buflen >= 1
 561          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 562     --buflen;
 563   if (chars_to_remove == 0 && buflen >= bufmax)
 564     {
 565       bufmax = 2 * bufmax + 10;
 566       buffer = xrealloc (buffer, bufmax);
 567     }
 568   buffer[buflen] = '\0';
 569   savable_comment_add (buffer);
 570 }
 571
 572
 573 /* 3. Replace each comment that is not inside a string literal with a
 574    space character.  We need to remember the comment for later, because
 575    it may be attached to a keyword string.  */
 576
 577 /* These are for tracking whether comments count as immediately before
 578    keyword.  */
 579 static int last_comment_line;
 580 static int last_non_comment_line;
 581
 582 static unsigned char phase3_pushback[1];
 583 static int phase3_pushback_length;
 584
 585 static int
 586 phase3_getc ()
 587 {
 588   int lineno;
 589   int c;
 590
 591   if (phase3_pushback_length)
 592     return phase3_pushback[--phase3_pushback_length];
 593
 594   c = phase1_getc ();
 595
 596   if (c == '#')
 597     {
 598       /* sh comment.  */
 599       bool last_was_qmark = false;
 600
 601       comment_start ();
 602       lineno = line_number;
 603       for (;;)
 604         {
 605           c = phase1_getc ();
 606           if (c == '\n' || c == EOF)
 607             {
 608               comment_line_end (0);
 609               break;
 610             }
 611           if (last_was_qmark && c == '>')
 612             {
 613               comment_line_end (1);
 614               skip_html ();
 615               break;
 616             }
 617           /* We skip all leading white space, but not EOLs.  */
 618           if (!(buflen == 0 && (c == ' ' || c == '\t')))
 619             comment_add (c);
 620           last_was_qmark = (c == '?' || c == '%');
 621         }
 622       last_comment_line = lineno;
 623       return '\n';
 624     }
 625   else if (c == '/')
 626     {
 627       c = phase1_getc ();
 628
 629       switch (c)
 630         {
 631         default:
 632           phase1_ungetc (c);
 633           return '/';
 634
 635         case '*':
 636           {
 637             /* C comment.  */
 638             bool last_was_star;
 639
 640             comment_start ();
 641             lineno = line_number;
 642             last_was_star = false;
 643             for (;;)
 644               {
 645                 c = phase1_getc ();
 646                 if (c == EOF)
 647                   break;
 648                 /* We skip all leading white space, but not EOLs.  */
 649                 if (buflen == 0 && (c == ' ' || c == '\t'))
 650                   continue;
 651                 comment_add (c);
 652                 switch (c)
 653                   {
 654                   case '\n':
 655                     comment_line_end (1);
 656                     comment_start ();
 657                     lineno = line_number;
 658                     last_was_star = false;
 659                     continue;
 660
 661                   case '*':
 662                     last_was_star = true;
 663                     continue;
 664
 665                   case '/':
 666                     if (last_was_star)
 667                       {
 668                         comment_line_end (2);
 669                         break;
 670                       }
 671                     /* FALLTHROUGH */
 672
 673                   default:
 674                     last_was_star = false;
 675                     continue;
 676                   }
 677                 break;
 678               }
 679             last_comment_line = lineno;
 680             return ' ';
 681           }
 682
 683         case '/':
 684           {
 685             /* C++ comment.  */
 686             bool last_was_qmark = false;
 687
 688             comment_start ();
 689             lineno = line_number;
 690             for (;;)
 691               {
 692                 c = phase1_getc ();
 693                 if (c == '\n' || c == EOF)
 694                   {
 695                     comment_line_end (0);
 696                     break;
 697                   }
 698                 if (last_was_qmark && c == '>')
 699                   {
 700                     comment_line_end (1);
 701                     skip_html ();
 702                     break;
 703                   }
 704                 /* We skip all leading white space, but not EOLs.  */
 705                 if (!(buflen == 0 && (c == ' ' || c == '\t')))
 706                   comment_add (c);
 707                 last_was_qmark = (c == '?' || c == '%');
 708               }
 709             last_comment_line = lineno;
 710             return '\n';
 711           }
 712         }
 713     }
 714   else
 715     return c;
 716 }
 717
 718 #ifdef unused
 719 static void
 720 phase3_ungetc (int c)
 721 {
 722   if (c != EOF)
 723     {
 724       if (phase3_pushback_length == SIZEOF (phase3_pushback))
 725         abort ();
 726       phase3_pushback[phase3_pushback_length++] = c;
 727     }
 728 }
 729 #endif
 730
 731
 732 /* ========================== Reading of tokens.  ========================== */
 733
 734
 735 enum token_type_ty
 736 {
 737   token_type_eof,
 738   token_type_lparen,            /* ( */
 739   token_type_rparen,            /* ) */
 740   token_type_comma,             /* , */
 741   token_type_lbracket,          /* [ */
 742   token_type_rbracket,          /* ] */
 743   token_type_dot,               /* . */
 744   token_type_operator1,         /* * / % ++ -- */
 745   token_type_operator2,         /* + - ! ~ @ */
 746   token_type_string_literal,    /* "abc" */
 747   token_type_symbol,            /* symbol, number */
 748   token_type_other              /* misc. operator */
 749 };
 750 typedef enum token_type_ty token_type_ty;
 751
 752 typedef struct token_ty token_ty;
 753 struct token_ty
 754 {
 755   token_type_ty type;
 756   char *string;         /* for token_type_string_literal, token_type_symbol */
 757   refcounted_string_list_ty *comment;   /* for token_type_string_literal */
 758   int line_number;
 759 };
 760
 761
 762 /* Free the memory pointed to by a 'struct token_ty'.  */
 763 static inline void
 764 free_token (token_ty *tp)
 765 {
 766   if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
 767     free (tp->string);
 768   if (tp->type == token_type_string_literal)
 769     drop_reference (tp->comment);
 770 }
 771
 772
 773 /* 4. Combine characters into tokens.  Discard whitespace.  */
 774
 775 static token_ty phase4_pushback[3];
 776 static int phase4_pushback_length;
 777
 778 static void
 779 phase4_get (token_ty *tp)
 780 {
 781   static char *buffer;
 782   static int bufmax;
 783   int bufpos;
 784   int c;
 785
 786   if (phase4_pushback_length)
 787     {
 788       *tp = phase4_pushback[--phase4_pushback_length];
 789       return;
 790     }
 791   tp->string = NULL;
 792
 793   for (;;)
 794     {
 795       tp->line_number = line_number;
 796       c = phase3_getc ();
 797       switch (c)
 798         {
 799         case EOF:
 800           tp->type = token_type_eof;
 801           return;
 802
 803         case '\n':
 804           if (last_non_comment_line > last_comment_line)
 805             savable_comment_reset ();
 806           /* FALLTHROUGH */
 807         case ' ':
 808         case '\t':
 809         case '\r':
 810           /* Ignore whitespace.  */
 811           continue;
 812         }
 813
 814       last_non_comment_line = tp->line_number;
 815
 816       switch (c)
 817         {
 818         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
 819         case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
 820         case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
 821         case 'V': case 'W': case 'X': case 'Y': case 'Z':
 822         case '_':
 823         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
 824         case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
 825         case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
 826         case 'v': case 'w': case 'x': case 'y': case 'z':
 827         case 127: case 128: case 129: case 130: case 131: case 132: case 133:
 828         case 134: case 135: case 136: case 137: case 138: case 139: case 140:
 829         case 141: case 142: case 143: case 144: case 145: case 146: case 147:
 830         case 148: case 149: case 150: case 151: case 152: case 153: case 154:
 831         case 155: case 156: case 157: case 158: case 159: case 160: case 161:
 832         case 162: case 163: case 164: case 165: case 166: case 167: case 168:
 833         case 169: case 170: case 171: case 172: case 173: case 174: case 175:
 834         case 176: case 177: case 178: case 179: case 180: case 181: case 182:
 835         case 183: case 184: case 185: case 186: case 187: case 188: case 189:
 836         case 190: case 191: case 192: case 193: case 194: case 195: case 196:
 837         case 197: case 198: case 199: case 200: case 201: case 202: case 203:
 838         case 204: case 205: case 206: case 207: case 208: case 209: case 210:
 839         case 211: case 212: case 213: case 214: case 215: case 216: case 217:
 840         case 218: case 219: case 220: case 221: case 222: case 223: case 224:
 841         case 225: case 226: case 227: case 228: case 229: case 230: case 231:
 842         case 232: case 233: case 234: case 235: case 236: case 237: case 238:
 843         case 239: case 240: case 241: case 242: case 243: case 244: case 245:
 844         case 246: case 247: case 248: case 249: case 250: case 251: case 252:
 845         case 253: case 254: case 255:
 846           bufpos = 0;
 847           for (;;)
 848             {
 849               if (bufpos >= bufmax)
 850                 {
 851                   bufmax = 2 * bufmax + 10;
 852                   buffer = xrealloc (buffer, bufmax);
 853                 }
 854               buffer[bufpos++] = c;
 855               c = phase1_getc ();
 856               switch (c)
 857                 {
 858                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 859                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 860                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 861                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 862                 case 'Y': case 'Z':
 863                 case '_':
 864                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 865                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 866                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 867                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 868                 case 'y': case 'z':
 869                 case '0': case '1': case '2': case '3': case '4':
 870                 case '5': case '6': case '7': case '8': case '9':
 871                 case 127: case 128: case 129: case 130: case 131: case 132:
 872                 case 133: case 134: case 135: case 136: case 137: case 138:
 873                 case 139: case 140: case 141: case 142: case 143: case 144:
 874                 case 145: case 146: case 147: case 148: case 149: case 150:
 875                 case 151: case 152: case 153: case 154: case 155: case 156:
 876                 case 157: case 158: case 159: case 160: case 161: case 162:
 877                 case 163: case 164: case 165: case 166: case 167: case 168:
 878                 case 169: case 170: case 171: case 172: case 173: case 174:
 879                 case 175: case 176: case 177: case 178: case 179: case 180:
 880                 case 181: case 182: case 183: case 184: case 185: case 186:
 881                 case 187: case 188: case 189: case 190: case 191: case 192:
 882                 case 193: case 194: case 195: case 196: case 197: case 198:
 883                 case 199: case 200: case 201: case 202: case 203: case 204:
 884                 case 205: case 206: case 207: case 208: case 209: case 210:
 885                 case 211: case 212: case 213: case 214: case 215: case 216:
 886                 case 217: case 218: case 219: case 220: case 221: case 222:
 887                 case 223: case 224: case 225: case 226: case 227: case 228:
 888                 case 229: case 230: case 231: case 232: case 233: case 234:
 889                 case 235: case 236: case 237: case 238: case 239: case 240:
 890                 case 241: case 242: case 243: case 244: case 245: case 246:
 891                 case 247: case 248: case 249: case 250: case 251: case 252:
 892                 case 253: case 254: case 255:
 893                   continue;
 894
 895                 default:
 896                   phase1_ungetc (c);
 897                   break;
 898                 }
 899               break;
 900             }
 901           if (bufpos >= bufmax)
 902             {
 903               bufmax = 2 * bufmax + 10;
 904               buffer = xrealloc (buffer, bufmax);
 905             }
 906           buffer[bufpos] = 0;
 907           tp->string = xstrdup (buffer);
 908           tp->type = token_type_symbol;
 909           return;
 910
 911         case '\'':
 912           /* Single-quoted string literal.  */
 913           bufpos = 0;
 914           for (;;)
 915             {
 916               c = phase1_getc ();
 917               if (c == EOF || c == '\'')
 918                 break;
 919               if (c == '\\')
 920                 {
 921                   c = phase1_getc ();
 922                   if (c != '\\' && c != '\'')
 923                     {
 924                       phase1_ungetc (c);
 925                       c = '\\';
 926                     }
 927                 }
 928               if (bufpos >= bufmax)
 929                 {
 930                   bufmax = 2 * bufmax + 10;
 931                   buffer = xrealloc (buffer, bufmax);
 932                 }
 933               buffer[bufpos++] = c;
 934             }
 935           if (bufpos >= bufmax)
 936             {
 937               bufmax = 2 * bufmax + 10;
 938               buffer = xrealloc (buffer, bufmax);
 939             }
 940           buffer[bufpos] = 0;
 941           tp->type = token_type_string_literal;
 942           tp->string = xstrdup (buffer);
 943           tp->comment = add_reference (savable_comment);
 944           return;
 945
 946         case '"':
 947           /* Double-quoted string literal.  */
 948           tp->type = token_type_string_literal;
 949           bufpos = 0;
 950           for (;;)
 951             {
 952               c = phase1_getc ();
 953               if (c == EOF || c == '"')
 954                 break;
 955               if (c == '$')
 956                 {
 957                   c = phase1_getc ();
 958                   if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
 959                       || c == '_' || c == '{' || c >= 0x7f)
 960                     {
 961                       /* String with variables.  */
 962                       tp->type = token_type_other;
 963                       continue;
 964                     }
 965                   phase1_ungetc (c);
 966                   c = '$';
 967                 }
 968               if (c == '{')
 969                 {
 970                   c = phase1_getc ();
 971                   if (c == '$')
 972                     {
 973                       /* String with expressions.  */
 974                       tp->type = token_type_other;
 975                       continue;
 976                     }
 977                   phase1_ungetc (c);
 978                   c = '{';
 979                 }
 980               if (c == '\\')
 981                 {
 982                   int n, j;
 983
 984                   c = phase1_getc ();
 985                   switch (c)
 986                     {
 987                     case '"':
 988                     case '\\':
 989                     case '$':
 990                       break;
 991
 992                     case '0': case '1': case '2': case '3':
 993                     case '4': case '5': case '6': case '7':
 994                       n = 0;
 995                       for (j = 0; j < 3; ++j)
 996                         {
 997                           n = n * 8 + c - '0';
 998                           c = phase1_getc ();
 999                           switch (c)
1000                             {
1001                             default:
1002                               break;
1003
1004                             case '0': case '1': case '2': case '3':
1005                             case '4': case '5': case '6': case '7':
1006                               continue;
1007                             }
1008                           break;
1009                         }
1010                       phase1_ungetc (c);
1011                       c = n;
1012                       break;
1013
1014                     case 'x':
1015                       n = 0;
1016                       for (j = 0; j < 2; ++j)
1017                         {
1018                           c = phase1_getc ();
1019                           switch (c)
1020                             {
1021                             case '0': case '1': case '2': case '3': case '4':
1022                             case '5': case '6': case '7': case '8': case '9':
1023                               n = n * 16 + c - '0';
1024                               break;
1025                             case 'A': case 'B': case 'C': case 'D': case 'E':
1026                             case 'F':
1027                               n = n * 16 + 10 + c - 'A';
1028                               break;
1029                             case 'a': case 'b': case 'c': case 'd': case 'e':
1030                             case 'f':
1031                               n = n * 16 + 10 + c - 'a';
1032                               break;
1033                             default:
1034                               phase1_ungetc (c);
1035                               c = 0;
1036                               break;
1037                             }
1038                           if (c == 0)
1039                             break;
1040                         }
1041                       if (j == 0)
1042                         {
1043                           phase1_ungetc ('x');
1044                           c = '\\';
1045                         }
1046                       else
1047                         c = n;
1048                       break;
1049
1050                     case 'n':
1051                       c = '\n';
1052                       break;
1053                     case 't':
1054                       c = '\t';
1055                       break;
1056                     case 'r':
1057                       c = '\r';
1058                       break;
1059
1060                     default:
1061                       phase1_ungetc (c);
1062                       c = '\\';
1063                       break;
1064                     }
1065                 }
1066               if (bufpos >= bufmax)
1067                 {
1068                   bufmax = 2 * bufmax + 10;
1069                   buffer = xrealloc (buffer, bufmax);
1070                 }
1071               buffer[bufpos++] = c;
1072             }
1073           if (bufpos >= bufmax)
1074             {
1075               bufmax = 2 * bufmax + 10;
1076               buffer = xrealloc (buffer, bufmax);
1077             }
1078           buffer[bufpos] = 0;
1079           if (tp->type == token_type_string_literal)
1080             {
1081               tp->string = xstrdup (buffer);
1082               tp->comment = add_reference (savable_comment);
1083             }
1084           return;
1085
1086         case '?':
1087         case '%':
1088           {
1089             int c2 = phase1_getc ();
1090             if (c2 == '>')
1091               {
1092                 /* ?> and %> terminate PHP mode and switch back to HTML
1093                    mode.  */
1094                 skip_html ();
1095                 tp->type = token_type_other;
1096               }
1097             else
1098               {
1099                 phase1_ungetc (c2);
1100                 tp->type = (c == '%' ? token_type_operator1 : token_type_other);
1101               }
1102             return;
1103           }
1104
1105         case '(':
1106           tp->type = token_type_lparen;
1107           return;
1108
1109         case ')':
1110           tp->type = token_type_rparen;
1111           return;
1112
1113         case ',':
1114           tp->type = token_type_comma;
1115           return;
1116
1117         case '[':
1118           tp->type = token_type_lbracket;
1119           return;
1120
1121         case ']':
1122           tp->type = token_type_rbracket;
1123           return;
1124
1125         case '.':
1126           tp->type = token_type_dot;
1127           return;
1128
1129         case '*':
1130         case '/':
1131           tp->type = token_type_operator1;
1132           return;
1133
1134         case '+':
1135         case '-':
1136           {
1137             int c2 = phase1_getc ();
1138             if (c2 == c)
1139               /* ++ or -- */
1140               tp->type = token_type_operator1;
1141             else
1142               /* + or - */
1143               {
1144                 phase1_ungetc (c2);
1145                 tp->type = token_type_operator2;
1146               }
1147             return;
1148           }
1149
1150         case '!':
1151         case '~':
1152         case '@':
1153           tp->type = token_type_operator2;
1154           return;
1155
1156         case '<':
1157           {
1158             int c2 = phase1_getc ();
1159             if (c2 == '<')
1160               {
1161                 int c3 = phase1_getc ();
1162                 if (c3 == '<')
1163                   {
1164                     /* Start of here document.
1165                        Parse whitespace, then label, then newline.  */
1166                     do
1167                       c = phase3_getc ();
1168                     while (c == ' ' || c == '\t' || c == '\n' || c == '\r');
1169
1170                     bufpos = 0;
1171                     do
1172                       {
1173                         if (bufpos >= bufmax)
1174                           {
1175                             bufmax = 2 * bufmax + 10;
1176                             buffer = xrealloc (buffer, bufmax);
1177                           }
1178                         buffer[bufpos++] = c;
1179                         c = phase3_getc ();
1180                       }
1181                     while (c != EOF && c != '\n' && c != '\r');
1182                     /* buffer[0..bufpos-1] now contains the label.  */
1183
1184                     /* Now skip the here document.  */
1185                     for (;;)
1186                       {
1187                         c = phase1_getc ();
1188                         if (c == EOF)
1189                           break;
1190                         if (c == '\n' || c == '\r')
1191                           {
1192                             int bufidx = 0;
1193
1194                             while (bufidx < bufpos)
1195                               {
1196                                 c = phase1_getc ();
1197                                 if (c == EOF)
1198                                   break;
1199                                 if (c != buffer[bufidx])
1200                                   {
1201                                     phase1_ungetc (c);
1202                                     break;
1203                                   }
1204                                 bufidx++;
1205                               }
1206                             if (bufidx == bufpos)
1207                               {
1208                                 c = phase1_getc ();
1209                                 if (c != ';')
1210                                   phase1_ungetc (c);
1211                                 c = phase1_getc ();
1212                                 if (c == '\n' || c == '\r')
1213                                   break;
1214                               }
1215                           }
1216                       }
1217
1218                     /* FIXME: Ideally we should turn the here document into a
1219                        string literal if it didn't contain $ substitution.  And
1220                        we should also respect backslash escape sequences like
1221                        in double-quoted strings.  */
1222                     tp->type = token_type_other;
1223                     return;
1224                   }
1225                 phase1_ungetc (c3);
1226               }
1227
1228             /* < / script > terminates PHP mode and switches back to HTML
1229                mode.  */
1230             while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
1231               c2 = phase1_getc ();
1232             if (c2 == '/')
1233               {
1234                 do
1235                   c2 = phase1_getc ();
1236                 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
1237                 if (c2 == 's' || c2 == 'S')
1238                   {
1239                     c2 = phase1_getc ();
1240                     if (c2 == 'c' || c2 == 'C')
1241                       {
1242                         c2 = phase1_getc ();
1243                         if (c2 == 'r' || c2 == 'R')
1244                           {
1245                             c2 = phase1_getc ();
1246                             if (c2 == 'i' || c2 == 'I')
1247                               {
1248                                 c2 = phase1_getc ();
1249                                 if (c2 == 'p' || c2 == 'P')
1250                                   {
1251                                     c2 = phase1_getc ();
1252                                     if (c2 == 't' || c2 == 'T')
1253                                       {
1254                                         do
1255                                           c2 = phase1_getc ();
1256                                         while (c2 == ' ' || c2 == '\t'
1257                                                || c2 == '\n' || c2 == '\r');
1258                                         if (c2 == '>')
1259                                           {
1260                                             skip_html ();
1261                                           }
1262                                         else
1263                                           phase1_ungetc (c2);
1264                                       }
1265                                     else
1266                                       phase1_ungetc (c2);
1267                                   }
1268                                 else
1269                                   phase1_ungetc (c2);
1270                               }
1271                             else
1272                               phase1_ungetc (c2);
1273                           }
1274                         else
1275                           phase1_ungetc (c2);
1276                       }
1277                     else
1278                       phase1_ungetc (c2);
1279                   }
1280                 else
1281                   phase1_ungetc (c2);
1282               }
1283             else
1284               phase1_ungetc (c2);
1285
1286             tp->type = token_type_other;
1287             return;
1288           }
1289
1290         case '`':
1291           /* Execution operator.  */
1292         default:
1293           /* We could carefully recognize each of the 2 and 3 character
1294              operators, but it is not necessary, as we only need to recognize
1295              gettext invocations.  Don't bother.  */
1296           tp->type = token_type_other;
1297           return;
1298         }
1299     }
1300 }
1301
1302 /* Supports 3 tokens of pushback.  */
1303 static void
1304 phase4_unget (token_ty *tp)
1305 {
1306   if (tp->type != token_type_eof)
1307     {
1308       if (phase4_pushback_length == SIZEOF (phase4_pushback))
1309         abort ();
1310       phase4_pushback[phase4_pushback_length++] = *tp;
1311     }
1312 }
1313
1314
1315 /* 5. Compile-time optimization of string literal concatenation.
1316    Combine "string1" . ... . "stringN" to the concatenated string if
1317      - the token before this expression is none of
1318        '+' '-' '.' '*' '/' '%' '!' '~' '++' '--' ')' '@'
1319        (because then the first string could be part of an expression with
1320        the same or higher precedence as '.', such as an additive,
1321        multiplicative, negation, preincrement, or cast expression),
1322      - the token after this expression is none of
1323        '*' '/' '%' '++' '--'
       (because then the last string could be part of an expression with
       higher precedence than '.', such as a multiplicative or postincrement
       expression).  */
1327
1328 static token_type_ty phase5_last;
1329
1330 static void
1331 x_php_lex (token_ty *tp)
1332 {
1333   phase4_get (tp);
1334   if (tp->type == token_type_string_literal
1335       && !(phase5_last == token_type_dot
1336            || phase5_last == token_type_operator1
1337            || phase5_last == token_type_operator2
1338            || phase5_last == token_type_rparen))
1339     {
1340       char *sum = tp->string;
1341       size_t sum_len = strlen (sum);
1342
1343       for (;;)
1344         {
1345           token_ty token2;
1346
1347           phase4_get (&token2);
1348           if (token2.type == token_type_dot)
1349             {
1350               token_ty token3;
1351
1352               phase4_get (&token3);
1353               if (token3.type == token_type_string_literal)
1354                 {
1355                   token_ty token_after;
1356
1357                   phase4_get (&token_after);
1358                   if (token_after.type != token_type_operator1)
1359                     {
1360                       char *addend = token3.string;
1361                       size_t addend_len = strlen (addend);
1362
1363                       sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1364                       memcpy (sum + sum_len, addend, addend_len + 1);
1365                       sum_len += addend_len;
1366
1367                       phase4_unget (&token_after);
1368                       free_token (&token3);
1369                       free_token (&token2);
1370                       continue;
1371                     }
1372                   phase4_unget (&token_after);
1373                 }
1374               phase4_unget (&token3);
1375             }
1376           phase4_unget (&token2);
1377           break;
1378         }
1379       tp->string = sum;
1380     }
1381   phase5_last = tp->type;
1382 }
1383
1384
1385 /* ========================= Extracting strings.  ========================== */
1386
1387
/* Context lookup table.  It is set by extract_php from its flag_table
   argument and looked up by symbol name in extract_balanced, to obtain the
   context iterator for a following '('.  */
static flag_context_list_table_ty *flag_context_list_table;
1390
1391
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid PHP program, and leave the complaints about
   the grammar to the PHP interpreter.
1397
1398      Normal handling: Look for
1399        keyword ( ... msgid ... )
1400      Plural handling: Look for
1401        keyword ( ... msgid ... msgid_plural ... )
1402
1403    We use recursion because the arguments before msgid or between msgid
1404    and msgid_plural can contain subexpressions of the same form.  */
1405
1406
1407 /* Extract messages until the next balanced closing parenthesis or bracket.
1408    Extracted messages are added to MLP.
1409    DELIM can be either token_type_rparen or token_type_rbracket, or
1410    token_type_eof to accept both.
1411    Return true upon eof, false upon closing parenthesis or bracket.  */
1412 static bool
1413 extract_balanced (message_list_ty *mlp,
1414                   token_type_ty delim,
1415                   flag_context_ty outer_context,
1416                   flag_context_list_iterator_ty context_iter,
1417                   struct arglist_parser *argparser)
1418 {
1419   /* Current argument number.  */
1420   int arg = 1;
1421   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1422   int state;
1423   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1424   const struct callshapes *next_shapes = NULL;
1425   /* Context iterator that will be used if the next token is a '('.  */
1426   flag_context_list_iterator_ty next_context_iter =
1427     passthrough_context_list_iterator;
1428   /* Current context.  */
1429   flag_context_ty inner_context =
1430     inherited_context (outer_context,
1431                        flag_context_list_iterator_advance (&context_iter));
1432
1433   /* Start state is 0.  */
1434   state = 0;
1435
1436   for (;;)
1437     {
1438       token_ty token;
1439
1440       x_php_lex (&token);
1441       switch (token.type)
1442         {
1443         case token_type_symbol:
1444           {
1445             void *keyword_value;
1446
1447             if (hash_find_entry (&keywords, token.string, strlen (token.string),
1448                                  &keyword_value)
1449                 == 0)
1450               {
1451                 next_shapes = (const struct callshapes *) keyword_value;
1452                 state = 1;
1453               }
1454             else
1455               state = 0;
1456           }
1457           next_context_iter =
1458             flag_context_list_iterator (
1459               flag_context_list_table_lookup (
1460                 flag_context_list_table,
1461                 token.string, strlen (token.string)));
1462           free (token.string);
1463           continue;
1464
1465         case token_type_lparen:
1466           if (extract_balanced (mlp, token_type_rparen,
1467                                 inner_context, next_context_iter,
1468                                 arglist_parser_alloc (mlp,
1469                                                       state ? next_shapes : NULL)))
1470             {
1471               arglist_parser_done (argparser, arg);
1472               return true;
1473             }
1474           next_context_iter = null_context_list_iterator;
1475           state = 0;
1476           continue;
1477
1478         case token_type_rparen:
1479           if (delim == token_type_rparen || delim == token_type_eof)
1480             {
1481               arglist_parser_done (argparser, arg);
1482               return false;
1483             }
1484           next_context_iter = null_context_list_iterator;
1485           state = 0;
1486           continue;
1487
1488         case token_type_comma:
1489           arg++;
1490           inner_context =
1491             inherited_context (outer_context,
1492                                flag_context_list_iterator_advance (
1493                                  &context_iter));
1494           next_context_iter = passthrough_context_list_iterator;
1495           state = 0;
1496           continue;
1497
1498         case token_type_lbracket:
1499           if (extract_balanced (mlp, token_type_rbracket,
1500                                 null_context, null_context_list_iterator,
1501                                 arglist_parser_alloc (mlp, NULL)))
1502             {
1503               arglist_parser_done (argparser, arg);
1504               return true;
1505             }
1506           next_context_iter = null_context_list_iterator;
1507           state = 0;
1508           continue;
1509
1510         case token_type_rbracket:
1511           if (delim == token_type_rbracket || delim == token_type_eof)
1512             {
1513               arglist_parser_done (argparser, arg);
1514               return false;
1515             }
1516           next_context_iter = null_context_list_iterator;
1517           state = 0;
1518           continue;
1519
1520         case token_type_string_literal:
1521           {
1522             lex_pos_ty pos;
1523             pos.file_name = logical_file_name;
1524             pos.line_number = token.line_number;
1525
1526             if (extract_all)
1527               remember_a_message (mlp, NULL, token.string, inner_context,
1528                                   &pos, NULL, token.comment);
1529             else
1530               arglist_parser_remember (argparser, arg, token.string,
1531                                        inner_context,
1532                                        pos.file_name, pos.line_number,
1533                                        token.comment);
1534             drop_reference (token.comment);
1535           }
1536           next_context_iter = null_context_list_iterator;
1537           state = 0;
1538           continue;
1539
1540         case token_type_dot:
1541         case token_type_operator1:
1542         case token_type_operator2:
1543         case token_type_other:
1544           next_context_iter = null_context_list_iterator;
1545           state = 0;
1546           continue;
1547
1548         case token_type_eof:
1549           arglist_parser_done (argparser, arg);
1550           return true;
1551
1552         default:
1553           abort ();
1554         }
1555     }
1556 }
1557
1558
1559 void
1560 extract_php (FILE *f,
1561              const char *real_filename, const char *logical_filename,
1562              flag_context_list_table_ty *flag_table,
1563              msgdomain_list_ty *mdlp)
1564 {
1565   message_list_ty *mlp = mdlp->item[0]->messages;
1566
1567   fp = f;
1568   real_file_name = real_filename;
1569   logical_file_name = xstrdup (logical_filename);
1570   line_number = 1;
1571
1572   last_comment_line = -1;
1573   last_non_comment_line = -1;
1574
1575   phase5_last = token_type_eof;
1576
1577   flag_context_list_table = flag_table;
1578
1579   init_keywords ();
1580
1581   /* Initial mode is HTML mode, not PHP mode.  */
1582   skip_html ();
1583
1584   /* Eat tokens until eof is seen.  When extract_balanced returns
1585      due to an unbalanced closing parenthesis, just restart it.  */
1586   while (!extract_balanced (mlp, token_type_eof,
1587                             null_context, null_context_list_iterator,
1588                             arglist_parser_alloc (mlp, NULL)))
1589     ;
1590
1591   /* Close scanner.  */
1592   fp = NULL;
1593   real_file_name = NULL;
1594   logical_file_name = NULL;
1595   line_number = 0;
1596 }