gettext-tools/src/x-c.c

   1 /* xgettext C/C++/ObjectiveC backend.
   2    Copyright (C) 1995-1998, 2000-2009, 2012 Free Software Foundation, Inc.
   3
   4    This file was written by Peter Miller <millerp@canb.auug.org.au>
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22
  23 /* Specification.  */
  24 #include "x-c.h"
  25
  26 #include <errno.h>
  27 #include <stdbool.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31
  32 #include "message.h"
  33 #include "xgettext.h"
  34 #include "error.h"
  35 #include "error-progname.h"
  36 #include "xalloc.h"
  37 #include "xvasprintf.h"
  38 #include "hash.h"
  39 #include "gettext.h"
  40
  41 #define _(s) gettext(s)
  42
  43 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  44
  45
  46 /* The ANSI C standard defines several phases of translation:
  47
  48    1. Terminate line by \n, regardless of the external representation
  49       of a text line.  Stdio does this for us.
  50
  51    2. Convert trigraphs to their single character equivalents.
  52
  53    3. Concatenate each line ending in backslash (\) with the following
  54       line.
  55
  56    4. Replace each comment with a space character.
  57
   58    5. Parse each resulting logical line as preprocessing tokens and
  59       white space.
  60
  61    6. Recognize and carry out directives (it also expands macros on
  62       non-directive lines, which we do not do here).
  63
  64    7. Replaces escape sequences within character strings with their
  65       single character equivalents (we do this in step 5, because we
  66       don't have to worry about the #include argument).
  67
  68    8. Concatenates adjacent string literals to form single string
  69       literals (because we don't expand macros, there are a few things
  70       we will miss).
  71
  72    9. Converts the remaining preprocessing tokens to C tokens and
  73       discards any white space from the translation unit.
  74
  75    This lexer implements the above, and presents the scanner (in
  76    xgettext.c) with a stream of C tokens.  The comments are
  77    accumulated in a buffer, and given to xgettext when asked for.  */
  78
  79
  80 /* ========================= Lexer customization.  ========================= */
  81
  82 static bool trigraphs = false;
  83
  84 void
  85 x_c_trigraphs ()
  86 {
  87   trigraphs = true;
  88 }
  89
  90
  91 /* ====================== Keyword set customization.  ====================== */
  92
  93 /* If true extract all strings.  */
  94 static bool extract_all = false;
  95
  96 static hash_table c_keywords;
  97 static hash_table objc_keywords;
  98 static bool default_keywords = true;
  99
 100
 101 void
 102 x_c_extract_all ()
 103 {
 104   extract_all = true;
 105 }
 106
 107
 108 static void
 109 add_keyword (const char *name, hash_table *keywords)
 110 {
 111   if (name == NULL)
 112     default_keywords = false;
 113   else
 114     {
 115       const char *end;
 116       struct callshape shape;
 117       const char *colon;
 118
 119       if (keywords->table == NULL)
 120         hash_init (keywords, 100);
 121
 122       split_keywordspec (name, &end, &shape);
 123
 124       /* The characters between name and end should form a valid C identifier.
 125          A colon means an invalid parse in split_keywordspec().  */
 126       colon = strchr (name, ':');
 127       if (colon == NULL || colon >= end)
 128         insert_keyword_callshape (keywords, name, end - name, &shape);
 129     }
 130 }
 131
/* Add the keyword specification NAME to the c_keywords table.
   A NULL NAME adds nothing and instead disables the default keyword
   set (see add_keyword).  */
void
x_c_keyword (const char *name)
{
  add_keyword (name, &c_keywords);
}
 137
/* Add the keyword specification NAME to the objc_keywords table.
   A NULL NAME adds nothing and instead disables the default keyword
   set (see add_keyword).  */
void
x_objc_keyword (const char *name)
{
  add_keyword (name, &objc_keywords);
}
 143
 144 /* Finish initializing the keywords hash tables.
 145    Called after argument processing, before each file is processed.  */
 146 static void
 147 init_keywords ()
 148 {
 149   if (default_keywords)
 150     {
 151       /* When adding new keywords here, also update the documentation in
 152          xgettext.texi!  */
 153       x_c_keyword ("gettext");
 154       x_c_keyword ("dgettext:2");
 155       x_c_keyword ("dcgettext:2");
 156       x_c_keyword ("ngettext:1,2");
 157       x_c_keyword ("dngettext:2,3");
 158       x_c_keyword ("dcngettext:2,3");
 159       x_c_keyword ("gettext_noop");
 160       x_c_keyword ("pgettext:1c,2");
 161       x_c_keyword ("dpgettext:2c,3");
 162       x_c_keyword ("dcpgettext:2c,3");
 163       x_c_keyword ("npgettext:1c,2,3");
 164       x_c_keyword ("dnpgettext:2c,3,4");
 165       x_c_keyword ("dcnpgettext:2c,3,4");
 166
 167       x_objc_keyword ("gettext");
 168       x_objc_keyword ("dgettext:2");
 169       x_objc_keyword ("dcgettext:2");
 170       x_objc_keyword ("ngettext:1,2");
 171       x_objc_keyword ("dngettext:2,3");
 172       x_objc_keyword ("dcngettext:2,3");
 173       x_objc_keyword ("gettext_noop");
 174       x_objc_keyword ("pgettext:1c,2");
 175       x_objc_keyword ("dpgettext:2c,3");
 176       x_objc_keyword ("dcpgettext:2c,3");
 177       x_objc_keyword ("npgettext:1c,2,3");
 178       x_objc_keyword ("dnpgettext:2c,3,4");
 179       x_objc_keyword ("dcnpgettext:2c,3,4");
 180       x_objc_keyword ("NSLocalizedString");       /* similar to gettext */
 181       x_objc_keyword ("_");                       /* similar to gettext */
 182       x_objc_keyword ("NSLocalizedStaticString"); /* similar to gettext_noop */
 183       x_objc_keyword ("__");                      /* similar to gettext_noop */
 184
 185       default_keywords = false;
 186     }
 187 }
 188
 189 void
 190 init_flag_table_c ()
 191 {
 192   xgettext_record_flag ("gettext:1:pass-c-format");
 193   xgettext_record_flag ("dgettext:2:pass-c-format");
 194   xgettext_record_flag ("dcgettext:2:pass-c-format");
 195   xgettext_record_flag ("ngettext:1:pass-c-format");
 196   xgettext_record_flag ("ngettext:2:pass-c-format");
 197   xgettext_record_flag ("dngettext:2:pass-c-format");
 198   xgettext_record_flag ("dngettext:3:pass-c-format");
 199   xgettext_record_flag ("dcngettext:2:pass-c-format");
 200   xgettext_record_flag ("dcngettext:3:pass-c-format");
 201   xgettext_record_flag ("gettext_noop:1:pass-c-format");
 202   xgettext_record_flag ("pgettext:2:pass-c-format");
 203   xgettext_record_flag ("dpgettext:3:pass-c-format");
 204   xgettext_record_flag ("dcpgettext:3:pass-c-format");
 205   xgettext_record_flag ("npgettext:2:pass-c-format");
 206   xgettext_record_flag ("npgettext:3:pass-c-format");
 207   xgettext_record_flag ("dnpgettext:3:pass-c-format");
 208   xgettext_record_flag ("dnpgettext:4:pass-c-format");
 209   xgettext_record_flag ("dcnpgettext:3:pass-c-format");
 210   xgettext_record_flag ("dcnpgettext:4:pass-c-format");
 211
 212   /* <stdio.h> */
 213   xgettext_record_flag ("fprintf:2:c-format");
 214   xgettext_record_flag ("vfprintf:2:c-format");
 215   xgettext_record_flag ("printf:1:c-format");
 216   xgettext_record_flag ("vprintf:1:c-format");
 217   xgettext_record_flag ("sprintf:2:c-format");
 218   xgettext_record_flag ("vsprintf:2:c-format");
 219   xgettext_record_flag ("snprintf:3:c-format");
 220   xgettext_record_flag ("vsnprintf:3:c-format");
 221 #if 0 /* These functions are not standard.  */
 222   /* <stdio.h> */
 223   xgettext_record_flag ("asprintf:2:c-format");
 224   xgettext_record_flag ("vasprintf:2:c-format");
 225   xgettext_record_flag ("dprintf:2:c-format");
 226   xgettext_record_flag ("vdprintf:2:c-format");
 227   xgettext_record_flag ("obstack_printf:2:c-format");
 228   xgettext_record_flag ("obstack_vprintf:2:c-format");
 229   /* <error.h> */
 230   xgettext_record_flag ("error:3:c-format");
 231   xgettext_record_flag ("error_at_line:5:c-format");
 232   /* <argp.h> */
 233   xgettext_record_flag ("argp_error:2:c-format");
 234   xgettext_record_flag ("argp_failure:2:c-format");
 235 #endif
 236
 237   xgettext_record_flag ("gettext:1:pass-qt-format");
 238   xgettext_record_flag ("dgettext:2:pass-qt-format");
 239   xgettext_record_flag ("dcgettext:2:pass-qt-format");
 240   xgettext_record_flag ("ngettext:1:pass-qt-format");
 241   xgettext_record_flag ("ngettext:2:pass-qt-format");
 242   xgettext_record_flag ("dngettext:2:pass-qt-format");
 243   xgettext_record_flag ("dngettext:3:pass-qt-format");
 244   xgettext_record_flag ("dcngettext:2:pass-qt-format");
 245   xgettext_record_flag ("dcngettext:3:pass-qt-format");
 246   xgettext_record_flag ("gettext_noop:1:pass-qt-format");
 247   xgettext_record_flag ("pgettext:2:pass-qt-format");
 248   xgettext_record_flag ("dpgettext:3:pass-qt-format");
 249   xgettext_record_flag ("dcpgettext:3:pass-qt-format");
 250   xgettext_record_flag ("npgettext:2:pass-qt-format");
 251   xgettext_record_flag ("npgettext:3:pass-qt-format");
 252   xgettext_record_flag ("dnpgettext:3:pass-qt-format");
 253   xgettext_record_flag ("dnpgettext:4:pass-qt-format");
 254   xgettext_record_flag ("dcnpgettext:3:pass-qt-format");
 255   xgettext_record_flag ("dcnpgettext:4:pass-qt-format");
 256
 257   xgettext_record_flag ("gettext:1:pass-kde-format");
 258   xgettext_record_flag ("dgettext:2:pass-kde-format");
 259   xgettext_record_flag ("dcgettext:2:pass-kde-format");
 260   xgettext_record_flag ("ngettext:1:pass-kde-format");
 261   xgettext_record_flag ("ngettext:2:pass-kde-format");
 262   xgettext_record_flag ("dngettext:2:pass-kde-format");
 263   xgettext_record_flag ("dngettext:3:pass-kde-format");
 264   xgettext_record_flag ("dcngettext:2:pass-kde-format");
 265   xgettext_record_flag ("dcngettext:3:pass-kde-format");
 266   xgettext_record_flag ("gettext_noop:1:pass-kde-format");
 267   xgettext_record_flag ("pgettext:2:pass-kde-format");
 268   xgettext_record_flag ("dpgettext:3:pass-kde-format");
 269   xgettext_record_flag ("dcpgettext:3:pass-kde-format");
 270   xgettext_record_flag ("npgettext:2:pass-kde-format");
 271   xgettext_record_flag ("npgettext:3:pass-kde-format");
 272   xgettext_record_flag ("dnpgettext:3:pass-kde-format");
 273   xgettext_record_flag ("dnpgettext:4:pass-kde-format");
 274   xgettext_record_flag ("dcnpgettext:3:pass-kde-format");
 275   xgettext_record_flag ("dcnpgettext:4:pass-kde-format");
 276
 277   xgettext_record_flag ("gettext:1:pass-boost-format");
 278   xgettext_record_flag ("dgettext:2:pass-boost-format");
 279   xgettext_record_flag ("dcgettext:2:pass-boost-format");
 280   xgettext_record_flag ("ngettext:1:pass-boost-format");
 281   xgettext_record_flag ("ngettext:2:pass-boost-format");
 282   xgettext_record_flag ("dngettext:2:pass-boost-format");
 283   xgettext_record_flag ("dngettext:3:pass-boost-format");
 284   xgettext_record_flag ("dcngettext:2:pass-boost-format");
 285   xgettext_record_flag ("dcngettext:3:pass-boost-format");
 286   xgettext_record_flag ("gettext_noop:1:pass-boost-format");
 287   xgettext_record_flag ("pgettext:2:pass-boost-format");
 288   xgettext_record_flag ("dpgettext:3:pass-boost-format");
 289   xgettext_record_flag ("dcpgettext:3:pass-boost-format");
 290   xgettext_record_flag ("npgettext:2:pass-boost-format");
 291   xgettext_record_flag ("npgettext:3:pass-boost-format");
 292   xgettext_record_flag ("dnpgettext:3:pass-boost-format");
 293   xgettext_record_flag ("dnpgettext:4:pass-boost-format");
 294   xgettext_record_flag ("dcnpgettext:3:pass-boost-format");
 295   xgettext_record_flag ("dcnpgettext:4:pass-boost-format");
 296
 297   /* <boost/format.hpp> */
 298   xgettext_record_flag ("format:1:boost-format");
 299 }
 300
 301 void
 302 init_flag_table_objc ()
 303 {
 304   /* Since the settings done in init_flag_table_c() also have an effect for
 305      the ObjectiveC parser, we don't have to repeat them here.  */
 306   xgettext_record_flag ("gettext:1:pass-objc-format");
 307   xgettext_record_flag ("dgettext:2:pass-objc-format");
 308   xgettext_record_flag ("dcgettext:2:pass-objc-format");
 309   xgettext_record_flag ("ngettext:1:pass-objc-format");
 310   xgettext_record_flag ("ngettext:2:pass-objc-format");
 311   xgettext_record_flag ("dngettext:2:pass-objc-format");
 312   xgettext_record_flag ("dngettext:3:pass-objc-format");
 313   xgettext_record_flag ("dcngettext:2:pass-objc-format");
 314   xgettext_record_flag ("dcngettext:3:pass-objc-format");
 315   xgettext_record_flag ("gettext_noop:1:pass-objc-format");
 316   xgettext_record_flag ("pgettext:2:pass-objc-format");
 317   xgettext_record_flag ("dpgettext:3:pass-objc-format");
 318   xgettext_record_flag ("dcpgettext:3:pass-objc-format");
 319   xgettext_record_flag ("npgettext:2:pass-objc-format");
 320   xgettext_record_flag ("npgettext:3:pass-objc-format");
 321   xgettext_record_flag ("dnpgettext:3:pass-objc-format");
 322   xgettext_record_flag ("dnpgettext:4:pass-objc-format");
 323   xgettext_record_flag ("dcnpgettext:3:pass-objc-format");
 324   xgettext_record_flag ("dcnpgettext:4:pass-objc-format");
 325   xgettext_record_flag ("NSLocalizedString:1:pass-c-format");
 326   xgettext_record_flag ("NSLocalizedString:1:pass-objc-format");
 327   xgettext_record_flag ("_:1:pass-c-format");
 328   xgettext_record_flag ("_:1:pass-objc-format");
 329   xgettext_record_flag ("stringWithFormat::1:objc-format");
 330   xgettext_record_flag ("initWithFormat::1:objc-format");
 331   xgettext_record_flag ("stringByAppendingFormat::1:objc-format");
 332   xgettext_record_flag ("localizedStringWithFormat::1:objc-format");
 333   xgettext_record_flag ("appendFormat::1:objc-format");
 334 }
 335
 336 void
 337 init_flag_table_gcc_internal ()
 338 {
 339   xgettext_record_flag ("gettext:1:pass-gcc-internal-format");
 340   xgettext_record_flag ("dgettext:2:pass-gcc-internal-format");
 341   xgettext_record_flag ("dcgettext:2:pass-gcc-internal-format");
 342   xgettext_record_flag ("ngettext:1:pass-gcc-internal-format");
 343   xgettext_record_flag ("ngettext:2:pass-gcc-internal-format");
 344   xgettext_record_flag ("dngettext:2:pass-gcc-internal-format");
 345   xgettext_record_flag ("dngettext:3:pass-gcc-internal-format");
 346   xgettext_record_flag ("dcngettext:2:pass-gcc-internal-format");
 347   xgettext_record_flag ("dcngettext:3:pass-gcc-internal-format");
 348   xgettext_record_flag ("gettext_noop:1:pass-gcc-internal-format");
 349   xgettext_record_flag ("pgettext:2:pass-gcc-internal-format");
 350   xgettext_record_flag ("dpgettext:3:pass-gcc-internal-format");
 351   xgettext_record_flag ("dcpgettext:3:pass-gcc-internal-format");
 352   xgettext_record_flag ("npgettext:2:pass-gcc-internal-format");
 353   xgettext_record_flag ("npgettext:3:pass-gcc-internal-format");
 354   xgettext_record_flag ("dnpgettext:3:pass-gcc-internal-format");
 355   xgettext_record_flag ("dnpgettext:4:pass-gcc-internal-format");
 356   xgettext_record_flag ("dcnpgettext:3:pass-gcc-internal-format");
 357   xgettext_record_flag ("dcnpgettext:4:pass-gcc-internal-format");
 358 #if 0 /* This should better be done inside GCC.  */
 359   /* grepping for ATTRIBUTE_PRINTF in gcc-3.3/gcc/?*.h */
 360   /* c-format.c */
 361   xgettext_record_flag ("status_warning:2:gcc-internal-format");
 362   /* c-tree.h */
 363   xgettext_record_flag ("pedwarn_c99:1:pass-gcc-internal-format");
 364   /* collect2.h */
 365   //xgettext_record_flag ("error:1:c-format"); // 3 different versions
 366   xgettext_record_flag ("notice:1:c-format");
 367   //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
 368   xgettext_record_flag ("fatal_perror:1:c-format");
 369   /* cpplib.h */
 370   xgettext_record_flag ("cpp_error:3:c-format");
 371   xgettext_record_flag ("cpp_error_with_line:5:c-format");
 372   /* diagnostic.h */
 373   xgettext_record_flag ("diagnostic_set_info:2:pass-gcc-internal-format");
 374   xgettext_record_flag ("output_printf:2:gcc-internal-format");
 375   xgettext_record_flag ("output_verbatim:2:pass-gcc-internal-format");
 376   xgettext_record_flag ("verbatim:1:gcc-internal-format");
 377   xgettext_record_flag ("inform:1:pass-gcc-internal-format");
 378   /* gcc.h */
 379   //xgettext_record_flag ("fatal:1:c-format"); // 2 different versions
 380   //xgettext_record_flag ("error:1:c-format"); // 3 different versions
 381   /* genattrtab.h */
 382   xgettext_record_flag ("attr_printf:2:pass-c-format");
 383   /* gengtype.h */
 384   xgettext_record_flag ("error_at_line:2:pass-c-format");
 385   xgettext_record_flag ("xvasprintf:2:pass-c-format");
 386   xgettext_record_flag ("xasprintf:1:pass-c-format");
 387   xgettext_record_flag ("oprintf:2:pass-c-format");
 388   /* gensupport.h */
 389   xgettext_record_flag ("message_with_line:2:pass-c-format");
 390   /* output.h */
 391   xgettext_record_flag ("output_operand_lossage:1:c-format");
 392   /* ra.h */
 393    xgettext_record_flag ("ra_debug_msg:2:pass-c-format");
 394   /* toplev.h */
 395   xgettext_record_flag ("fnotice:2:c-format");
 396   xgettext_record_flag ("fatal_io_error:2:gcc-internal-format");
 397   xgettext_record_flag ("error_for_asm:2:pass-gcc-internal-format");
 398   xgettext_record_flag ("warning_for_asm:2:pass-gcc-internal-format");
 399   xgettext_record_flag ("error_with_file_and_line:3:pass-gcc-internal-format");
 400   xgettext_record_flag ("error_with_decl:2:pass-gcc-internal-format");
 401   xgettext_record_flag ("pedwarn:1:gcc-internal-format");
 402   xgettext_record_flag ("pedwarn_with_file_and_line:3:gcc-internal-format");
 403   xgettext_record_flag ("pedwarn_with_decl:2:gcc-internal-format");
 404   xgettext_record_flag ("sorry:1:gcc-internal-format");
 405   xgettext_record_flag ("error:1:pass-gcc-internal-format");
 406   xgettext_record_flag ("fatal_error:1:pass-gcc-internal-format");
 407   xgettext_record_flag ("internal_error:1:pass-gcc-internal-format");
 408   xgettext_record_flag ("warning:1:pass-gcc-internal-format");
 409   xgettext_record_flag ("warning_with_file_and_line:3:pass-gcc-internal-format");
 410   xgettext_record_flag ("warning_with_decl:2:pass-gcc-internal-format");
 411   /* f/com.h */
 412   xgettext_record_flag ("ffecom_get_invented_identifier:1:pass-c-format");
 413   /* f/sts.h */
 414   xgettext_record_flag ("ffests_printf:2:pass-c-format");
 415   /* java/java-tree.h */
 416   xgettext_record_flag ("parse_error_context:2:pass-c-format");
 417 #endif
 418
 419   xgettext_record_flag ("gettext:1:pass-gfc-internal-format");
 420   xgettext_record_flag ("dgettext:2:pass-gfc-internal-format");
 421   xgettext_record_flag ("dcgettext:2:pass-gfc-internal-format");
 422   xgettext_record_flag ("ngettext:1:pass-gfc-internal-format");
 423   xgettext_record_flag ("ngettext:2:pass-gfc-internal-format");
 424   xgettext_record_flag ("dngettext:2:pass-gfc-internal-format");
 425   xgettext_record_flag ("dngettext:3:pass-gfc-internal-format");
 426   xgettext_record_flag ("dcngettext:2:pass-gfc-internal-format");
 427   xgettext_record_flag ("dcngettext:3:pass-gfc-internal-format");
 428   xgettext_record_flag ("gettext_noop:1:pass-gfc-internal-format");
 429   xgettext_record_flag ("pgettext:2:pass-gfc-internal-format");
 430   xgettext_record_flag ("dpgettext:3:pass-gfc-internal-format");
 431   xgettext_record_flag ("dcpgettext:3:pass-gfc-internal-format");
 432   xgettext_record_flag ("npgettext:2:pass-gfc-internal-format");
 433   xgettext_record_flag ("npgettext:3:pass-gfc-internal-format");
 434   xgettext_record_flag ("dnpgettext:3:pass-gfc-internal-format");
 435   xgettext_record_flag ("dnpgettext:4:pass-gfc-internal-format");
 436   xgettext_record_flag ("dcnpgettext:3:pass-gfc-internal-format");
 437   xgettext_record_flag ("dcnpgettext:4:pass-gfc-internal-format");
 438 #if 0 /* This should better be done inside GCC.  */
 439   /* fortran/error.c */
 440   xgettext_record_flag ("gfc_error:1:gfc-internal-format");
 441   xgettext_record_flag ("gfc_error_now:1:gfc-internal-format");
 442   xgettext_record_flag ("gfc_fatal_error:1:gfc-internal-format");
 443   xgettext_record_flag ("gfc_internal_error:1:gfc-internal-format");
 444   xgettext_record_flag ("gfc_notify_std:2:gfc-internal-format");
 445   xgettext_record_flag ("gfc_warning:1:gfc-internal-format");
 446   xgettext_record_flag ("gfc_warning_now:1:gfc-internal-format");
 447 #endif
 448 }
 449
 450
 451 /* ======================== Reading of characters.  ======================== */
 452
 453 /* Real filename, used in error messages about the input file.  */
 454 static const char *real_file_name;
 455
 456 /* Logical filename and line number, used to label the extracted messages.  */
 457 static char *logical_file_name;
 458 static int line_number;
 459
 460 /* The input file stream.  */
 461 static FILE *fp;
 462
 463
 464 /* 0. Terminate line by \n, regardless whether the external representation of
 465    a line terminator is LF (Unix), CR (Mac) or CR/LF (DOS/Windows).
 466    It is debatable whether supporting CR/LF line terminators in C sources
 467    on Unix is ISO C or POSIX compliant, but since GCC 3.3 now supports it
 468    unconditionally, it must be OK.
 469    The so-called "text mode" in stdio on DOS/Windows translates CR/LF to \n
 470    automatically, but here we also need this conversion on Unix.  As a side
 471    effect, on DOS/Windows we also parse CR/CR/LF into a single \n, but this
 472    is not a problem.  */
 473
 474
 475 static int
 476 phase0_getc ()
 477 {
 478   int c;
 479
 480   c = getc (fp);
 481   if (c == EOF)
 482     {
 483       if (ferror (fp))
 484         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 485                real_file_name);
 486       return EOF;
 487     }
 488
 489   if (c == '\r')
 490     {
 491       int c1 = getc (fp);
 492
 493       if (c1 != EOF && c1 != '\n')
 494         ungetc (c1, fp);
 495
 496       /* Seen line terminator CR or CR/LF.  */
 497       return '\n';
 498     }
 499
 500   return c;
 501 }
 502
 503
/* Push C back onto the input stream fp so that the next phase0_getc ()
   returns it.  EOF is silently ignored.
   Supports only one pushback character, and not '\n'.  */
static inline void
phase0_ungetc (int c)
{
  if (c != EOF)
    ungetc (c, fp);
}
 511
 512
 513 /* 1. line_number handling.  Combine backslash-newline to nothing.  */
 514
 515 static unsigned char phase1_pushback[2];
 516 static int phase1_pushback_length;
 517
 518
 519 static int
 520 phase1_getc ()
 521 {
 522   int c;
 523
 524   if (phase1_pushback_length)
 525     {
 526       c = phase1_pushback[--phase1_pushback_length];
 527       if (c == '\n')
 528         ++line_number;
 529       return c;
 530     }
 531   for (;;)
 532     {
 533       c = phase0_getc ();
 534       switch (c)
 535         {
 536         case '\n':
 537           ++line_number;
 538           return '\n';
 539
 540         case '\\':
 541           c = phase0_getc ();
 542           if (c != '\n')
 543             {
 544               phase0_ungetc (c);
 545               return '\\';
 546             }
 547           ++line_number;
 548           break;
 549
 550         default:
 551           return c;
 552         }
 553     }
 554 }
 555
 556
 557 /* Supports 2 characters of pushback.  */
 558 static void
 559 phase1_ungetc (int c)
 560 {
 561   switch (c)
 562     {
 563     case EOF:
 564       break;
 565
 566     case '\n':
 567       --line_number;
 568       /* FALLTHROUGH */
 569
 570     default:
 571       if (phase1_pushback_length == SIZEOF (phase1_pushback))
 572         abort ();
 573       phase1_pushback[phase1_pushback_length++] = c;
 574       break;
 575     }
 576 }
 577
 578
 579 /* 2. Convert trigraphs to their single character equivalents.  Most
 580    sane human beings vomit copiously at the mention of trigraphs, which
 581    is why they are an option.  */
 582
 583 static unsigned char phase2_pushback[1];
 584 static int phase2_pushback_length;
 585
 586
 587 static int
 588 phase2_getc ()
 589 {
 590   int c;
 591
 592   if (phase2_pushback_length)
 593     return phase2_pushback[--phase2_pushback_length];
 594   if (!trigraphs)
 595     return phase1_getc ();
 596
 597   c = phase1_getc ();
 598   if (c != '?')
 599     return c;
 600   c = phase1_getc ();
 601   if (c != '?')
 602     {
 603       phase1_ungetc (c);
 604       return '?';
 605     }
 606   c = phase1_getc ();
 607   switch (c)
 608     {
 609     case '(':
 610       return '[';
 611     case '/':
 612       return '\\';
 613     case ')':
 614       return ']';
 615     case '\'':
 616       return '^';
 617     case '<':
 618       return '{';
 619     case '!':
 620       return '|';
 621     case '>':
 622       return '}';
 623     case '-':
 624       return '~';
 625     case '#':
 626       return '=';
 627     }
 628   phase1_ungetc (c);
 629   phase1_ungetc ('?');
 630   return '?';
 631 }
 632
 633
 634 /* Supports only one pushback character.  */
 635 static void
 636 phase2_ungetc (int c)
 637 {
 638   if (c != EOF)
 639     {
 640       if (phase2_pushback_length == SIZEOF (phase2_pushback))
 641         abort ();
 642       phase2_pushback[phase2_pushback_length++] = c;
 643     }
 644 }
 645
 646
 647 /* 3. Concatenate each line ending in backslash (\) with the following
 648    line.  Basically, all you need to do is elide "\\\n" sequences from
 649    the input.  */
 650
 651 static unsigned char phase3_pushback[2];
 652 static int phase3_pushback_length;
 653
 654
 655 static int
 656 phase3_getc ()
 657 {
 658   if (phase3_pushback_length)
 659     return phase3_pushback[--phase3_pushback_length];
 660   for (;;)
 661     {
 662       int c = phase2_getc ();
 663       if (c != '\\')
 664         return c;
 665       c = phase2_getc ();
 666       if (c != '\n')
 667         {
 668           phase2_ungetc (c);
 669           return '\\';
 670         }
 671     }
 672 }
 673
 674
 675 /* Supports 2 characters of pushback.  */
 676 static void
 677 phase3_ungetc (int c)
 678 {
 679   if (c != EOF)
 680     {
 681       if (phase3_pushback_length == SIZEOF (phase3_pushback))
 682         abort ();
 683       phase3_pushback[phase3_pushback_length++] = c;
 684     }
 685 }
 686
 687
 688 /* Accumulating comments.  */
 689
 690 static char *buffer;
 691 static size_t bufmax;
 692 static size_t buflen;
 693
 694 static inline void
 695 comment_start ()
 696 {
 697   buflen = 0;
 698 }
 699
 700 static inline void
 701 comment_add (int c)
 702 {
 703   if (buflen >= bufmax)
 704     {
 705       bufmax = 2 * bufmax + 10;
 706       buffer = xrealloc (buffer, bufmax);
 707     }
 708   buffer[buflen++] = c;
 709 }
 710
 711 static inline void
 712 comment_line_end (size_t chars_to_remove)
 713 {
 714   buflen -= chars_to_remove;
 715   while (buflen >= 1
 716          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 717     --buflen;
 718   if (chars_to_remove == 0 && buflen >= bufmax)
 719     {
 720       bufmax = 2 * bufmax + 10;
 721       buffer = xrealloc (buffer, bufmax);
 722     }
 723   buffer[buflen] = '\0';
 724   savable_comment_add (buffer);
 725 }
 726
 727
 728 /* These are for tracking whether comments count as immediately before
 729    keyword.  */
 730 static int last_comment_line;
 731 static int last_non_comment_line;
 732 static int newline_count;
 733
 734
 735 /* 4. Replace each comment that is not inside a character constant or
 736    string literal with a space character.  We need to remember the
 737    comment for later, because it may be attached to a keyword string.
 738    We also optionally understand C++ comments.  */
 739
 740 static int
 741 phase4_getc ()
 742 {
 743   int c;
 744   bool last_was_star;
 745
 746   c = phase3_getc ();
 747   if (c != '/')
 748     return c;
 749   c = phase3_getc ();
 750   switch (c)
 751     {
 752     default:
 753       phase3_ungetc (c);
 754       return '/';
 755
 756     case '*':
 757       /* C comment.  */
 758       comment_start ();
 759       last_was_star = false;
 760       for (;;)
 761         {
 762           c = phase3_getc ();
 763           if (c == EOF)
 764             break;
 765           /* We skip all leading white space, but not EOLs.  */
 766           if (!(buflen == 0 && (c == ' ' || c == '\t')))
 767             comment_add (c);
 768           switch (c)
 769             {
 770             case '\n':
 771               comment_line_end (1);
 772               comment_start ();
 773               last_was_star = false;
 774               continue;
 775
 776             case '*':
 777               last_was_star = true;
 778               continue;
 779
 780             case '/':
 781               if (last_was_star)
 782                 {
 783                   comment_line_end (2);
 784                   break;
 785                 }
 786               /* FALLTHROUGH */
 787
 788             default:
 789               last_was_star = false;
 790               continue;
 791             }
 792           break;
 793         }
 794       last_comment_line = newline_count;
 795       return ' ';
 796
 797     case '/':
 798       /* C++ or ISO C 99 comment.  */
 799       comment_start ();
 800       for (;;)
 801         {
 802           c = phase3_getc ();
 803           if (c == '\n' || c == EOF)
 804             break;
 805           /* We skip all leading white space, but not EOLs.  */
 806           if (!(buflen == 0 && (c == ' ' || c == '\t')))
 807             comment_add (c);
 808         }
 809       comment_line_end (0);
 810       last_comment_line = newline_count;
 811       return '\n';
 812     }
 813 }
 814
 815
/* Push C back so that the next phase4_getc () returns it.  This simply
   forwards to phase3_ungetc (), so the character shares the phase 3
   pushback buffer.
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
 822
 823
 824 /* ========================== Reading of tokens.  ========================== */
 825
 826
 827 /* True if ObjectiveC extensions are recognized.  */
 828 static bool objc_extensions;
 829
 830 enum token_type_ty
 831 {
 832   token_type_character_constant,        /* 'x' */
 833   token_type_eof,
 834   token_type_eoln,
 835   token_type_hash,                      /* # */
 836   token_type_lparen,                    /* ( */
 837   token_type_rparen,                    /* ) */
 838   token_type_comma,                     /* , */
 839   token_type_colon,                     /* : */
 840   token_type_name,                      /* abc */
 841   token_type_number,                    /* 2.7 */
 842   token_type_string_literal,            /* "abc" */
 843   token_type_symbol,                    /* < > = etc. */
 844   token_type_objc_special,              /* @ */
 845   token_type_white_space
 846 };
 847 typedef enum token_type_ty token_type_ty;
 848
 849 typedef struct token_ty token_ty;
 850 struct token_ty
 851 {
 852   token_type_ty type;
 853   char *string;         /* for token_type_name, token_type_string_literal */
 854   refcounted_string_list_ty *comment;   /* for token_type_string_literal,
 855                                            token_type_objc_special */
 856   long number;
 857   int line_number;
 858 };
 859
 860
 861 /* 7. Replace escape sequences within character strings with their
 862    single character equivalents.  This is called from phase 5, because
 863    we don't have to worry about the #include argument.  There are
 864    pathological cases which could bite us (like the DOS directory
 865    separator), but just pretend it can't happen.  */
 866
 867 #define P7_QUOTES (1000 + '"')
 868 #define P7_QUOTE (1000 + '\'')
 869 #define P7_NEWLINE (1000 + '\n')
 870
 871 static int
 872 phase7_getc ()
 873 {
 874   int c, n, j;
 875
 876   /* Use phase 3, because phase 4 elides comments.  */
 877   c = phase3_getc ();
 878
 879   /* Return a magic newline indicator, so that we can distinguish
 880      between the user requesting a newline in the string (e.g. using
 881      "\n" or "\012") from the user failing to terminate the string or
 882      character constant.  The ANSI C standard says: 3.1.3.4 Character
 883      Constants contain "any character except single quote, backslash or
 884      newline; or an escape sequence" and 3.1.4 String Literals contain
 885      "any character except double quote, backslash or newline; or an
 886      escape sequence".
 887
 888      Most compilers give a fatal error in this case, however gcc is
 889      stupidly silent, even though this is a very common typo.  OK, so
 890      "gcc --pedantic" will tell me, but that gripes about too much other
 891      stuff.  Could I have a "gcc -Wnewline-in-string" option, or
 892      better yet a "gcc -fno-newline-in-string" option, please?  Gcc is
 893      also inconsistent between string literals and character constants:
 894      you may not embed newlines in character constants; try it, you get
 895      a useful diagnostic.  --PMiller  */
 896   if (c == '\n')
 897     return P7_NEWLINE;
 898
 899   if (c == '"')
 900     return P7_QUOTES;
 901   if (c == '\'')
 902     return P7_QUOTE;
 903   if (c != '\\')
 904     return c;
 905   c = phase3_getc ();
 906   switch (c)
 907     {
 908     default:
 909       /* Unknown escape sequences really should be an error, but just
 910          ignore them, and let the real compiler complain.  */
 911       phase3_ungetc (c);
 912       return '\\';
 913
 914     case '"':
 915     case '\'':
 916     case '?':
 917     case '\\':
 918       return c;
 919
 920     case 'a':
 921       return '\a';
 922     case 'b':
 923       return '\b';
 924
 925       /* The \e escape is preculiar to gcc, and assumes an ASCII
 926          character set (or superset).  We don't provide support for it
 927          here.  */
 928
 929     case 'f':
 930       return '\f';
 931     case 'n':
 932       return '\n';
 933     case 'r':
 934       return '\r';
 935     case 't':
 936       return '\t';
 937     case 'v':
 938       return '\v';
 939
 940     case 'x':
 941       c = phase3_getc ();
 942       switch (c)
 943         {
 944         default:
 945           phase3_ungetc (c);
 946           phase3_ungetc ('x');
 947           return '\\';
 948
 949         case '0': case '1': case '2': case '3': case '4':
 950         case '5': case '6': case '7': case '8': case '9':
 951         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 952         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 953           break;
 954         }
 955       n = 0;
 956       for (;;)
 957         {
 958           switch (c)
 959             {
 960             default:
 961               phase3_ungetc (c);
 962               return n;
 963
 964             case '0': case '1': case '2': case '3': case '4':
 965             case '5': case '6': case '7': case '8': case '9':
 966               n = n * 16 + c - '0';
 967               break;
 968
 969             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 970               n = n * 16 + 10 + c - 'A';
 971               break;
 972
 973             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 974               n = n * 16 + 10 + c - 'a';
 975               break;
 976             }
 977           c = phase3_getc ();
 978         }
 979       return n;
 980
 981     case '0': case '1': case '2': case '3':
 982     case '4': case '5': case '6': case '7':
 983       n = 0;
 984       for (j = 0; j < 3; ++j)
 985         {
 986           n = n * 8 + c - '0';
 987           c = phase3_getc ();
 988           switch (c)
 989             {
 990             default:
 991               break;
 992
 993             case '0': case '1': case '2': case '3':
 994             case '4': case '5': case '6': case '7':
 995               continue;
 996             }
 997           break;
 998         }
 999       phase3_ungetc (c);
1000       return n;
1001     }
1002 }
1003
1004
/* Push the character C back onto the input.  Phase 7 reads directly
   from phase 3, so the pushback goes there as well.  */
static void
phase7_ungetc (int c)
{
  phase3_ungetc (c);
}
1010
1011
1012 /* Free the memory pointed to by a 'struct token_ty'.  */
1013 static inline void
1014 free_token (token_ty *tp)
1015 {
1016   if (tp->type == token_type_name || tp->type == token_type_string_literal)
1017     free (tp->string);
1018   if (tp->type == token_type_string_literal
1019       || tp->type == token_type_objc_special)
1020     drop_reference (tp->comment);
1021 }
1022
1023
1024 /* 5. Parse each resulting logical line as preprocessing tokens and
1025    white space.  Preprocessing tokens and C tokens don't always match.  */
1026
1027 static token_ty phase5_pushback[1];
1028 static int phase5_pushback_length;
1029
1030
1031 static void
1032 phase5_get (token_ty *tp)
1033 {
1034   static char *buffer;
1035   static int bufmax;
1036   int bufpos;
1037   int c;
1038
1039   if (phase5_pushback_length)
1040     {
1041       *tp = phase5_pushback[--phase5_pushback_length];
1042       return;
1043     }
1044   tp->string = NULL;
1045   tp->number = 0;
1046   tp->line_number = line_number;
1047   c = phase4_getc ();
1048   switch (c)
1049     {
1050     case EOF:
1051       tp->type = token_type_eof;
1052       return;
1053
1054     case '\n':
1055       tp->type = token_type_eoln;
1056       return;
1057
1058     case ' ':
1059     case '\f':
1060     case '\t':
1061       for (;;)
1062         {
1063           c = phase4_getc ();
1064           switch (c)
1065             {
1066             case ' ':
1067             case '\f':
1068             case '\t':
1069               continue;
1070
1071             default:
1072               phase4_ungetc (c);
1073               break;
1074             }
1075           break;
1076         }
1077       tp->type = token_type_white_space;
1078       return;
1079
1080     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1081     case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
1082     case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1083     case 'V': case 'W': case 'X': case 'Y': case 'Z':
1084     case '_':
1085     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1086     case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1087     case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1088     case 'v': case 'w': case 'x': case 'y': case 'z':
1089       bufpos = 0;
1090       for (;;)
1091         {
1092           if (bufpos >= bufmax)
1093             {
1094               bufmax = 2 * bufmax + 10;
1095               buffer = xrealloc (buffer, bufmax);
1096             }
1097           buffer[bufpos++] = c;
1098           c = phase4_getc ();
1099           switch (c)
1100             {
1101             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1102             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1103             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1104             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1105             case 'Y': case 'Z':
1106             case '_':
1107             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1108             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1109             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1110             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1111             case 'y': case 'z':
1112             case '0': case '1': case '2': case '3': case '4':
1113             case '5': case '6': case '7': case '8': case '9':
1114               continue;
1115
1116             default:
1117               phase4_ungetc (c);
1118               break;
1119             }
1120           break;
1121         }
1122       if (bufpos >= bufmax)
1123         {
1124           bufmax = 2 * bufmax + 10;
1125           buffer = xrealloc (buffer, bufmax);
1126         }
1127       buffer[bufpos] = 0;
1128       tp->string = xstrdup (buffer);
1129       tp->type = token_type_name;
1130       return;
1131
1132     case '.':
1133       c = phase4_getc ();
1134       phase4_ungetc (c);
1135       switch (c)
1136         {
1137         default:
1138           tp->type = token_type_symbol;
1139           return;
1140
1141         case '0': case '1': case '2': case '3': case '4':
1142         case '5': case '6': case '7': case '8': case '9':
1143           c = '.';
1144           break;
1145         }
1146       /* FALLTHROUGH */
1147
1148     case '0': case '1': case '2': case '3': case '4':
1149     case '5': case '6': case '7': case '8': case '9':
1150       /* The preprocessing number token is more "generous" than the C
1151          number tokens.  This is mostly due to token pasting (another
1152          thing we can ignore here).  */
1153       bufpos = 0;
1154       for (;;)
1155         {
1156           if (bufpos >= bufmax)
1157             {
1158               bufmax = 2 * bufmax + 10;
1159               buffer = xrealloc (buffer, bufmax);
1160             }
1161           buffer[bufpos++] = c;
1162           c = phase4_getc ();
1163           switch (c)
1164             {
1165             case 'e':
1166             case 'E':
1167               if (bufpos >= bufmax)
1168                 {
1169                   bufmax = 2 * bufmax + 10;
1170                   buffer = xrealloc (buffer, bufmax);
1171                 }
1172               buffer[bufpos++] = c;
1173               c = phase4_getc ();
1174               if (c != '+' && c != '-')
1175                 {
1176                   phase4_ungetc (c);
1177                   break;
1178                 }
1179               continue;
1180
1181             case 'A': case 'B': case 'C': case 'D':           case 'F':
1182             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1183             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1184             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1185             case 'Y': case 'Z':
1186             case 'a': case 'b': case 'c': case 'd':           case 'f':
1187             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1188             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1189             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1190             case 'y': case 'z':
1191             case '0': case '1': case '2': case '3': case '4':
1192             case '5': case '6': case '7': case '8': case '9':
1193             case '.':
1194               continue;
1195
1196             default:
1197               phase4_ungetc (c);
1198               break;
1199             }
1200           break;
1201         }
1202       if (bufpos >= bufmax)
1203         {
1204           bufmax = 2 * bufmax + 10;
1205           buffer = xrealloc (buffer, bufmax);
1206         }
1207       buffer[bufpos] = 0;
1208       tp->type = token_type_number;
1209       tp->number = atol (buffer);
1210       return;
1211
1212     case '\'':
1213       /* We could worry about the 'L' before wide character constants,
1214          but ignoring it has no effect unless one of the keywords is
1215          "L".  Just pretend it won't happen.  Also, we don't need to
1216          remember the character constant.  */
1217       for (;;)
1218         {
1219           c = phase7_getc ();
1220           if (c == P7_NEWLINE)
1221             {
1222               error_with_progname = false;
1223               error (0, 0, _("%s:%d: warning: unterminated character constant"),
1224                      logical_file_name, line_number - 1);
1225               error_with_progname = true;
1226               phase7_ungetc ('\n');
1227               break;
1228             }
1229           if (c == EOF || c == P7_QUOTE)
1230             break;
1231         }
1232       tp->type = token_type_character_constant;
1233       return;
1234
1235     case '"':
1236       /* We could worry about the 'L' before wide string constants,
1237          but since gettext's argument is not a wide character string,
1238          let the compiler complain about the argument not matching the
1239          prototype.  Just pretend it won't happen.  */
1240       bufpos = 0;
1241       for (;;)
1242         {
1243           c = phase7_getc ();
1244           if (c == P7_NEWLINE)
1245             {
1246               error_with_progname = false;
1247               error (0, 0, _("%s:%d: warning: unterminated string literal"),
1248                      logical_file_name, line_number - 1);
1249               error_with_progname = true;
1250               phase7_ungetc ('\n');
1251               break;
1252             }
1253           if (c == EOF || c == P7_QUOTES)
1254             break;
1255           if (c == P7_QUOTE)
1256             c = '\'';
1257           if (bufpos >= bufmax)
1258             {
1259               bufmax = 2 * bufmax + 10;
1260               buffer = xrealloc (buffer, bufmax);
1261             }
1262           buffer[bufpos++] = c;
1263         }
1264       if (bufpos >= bufmax)
1265         {
1266           bufmax = 2 * bufmax + 10;
1267           buffer = xrealloc (buffer, bufmax);
1268         }
1269       buffer[bufpos] = 0;
1270       tp->type = token_type_string_literal;
1271       tp->string = xstrdup (buffer);
1272       tp->comment = add_reference (savable_comment);
1273       return;
1274
1275     case '(':
1276       tp->type = token_type_lparen;
1277       return;
1278
1279     case ')':
1280       tp->type = token_type_rparen;
1281       return;
1282
1283     case ',':
1284       tp->type = token_type_comma;
1285       return;
1286
1287     case '#':
1288       tp->type = token_type_hash;
1289       return;
1290
1291     case ':':
1292       tp->type = token_type_colon;
1293       return;
1294
1295     case '@':
1296       if (objc_extensions)
1297         {
1298           tp->type = token_type_objc_special;
1299           tp->comment = add_reference (savable_comment);
1300           return;
1301         }
1302       /* FALLTHROUGH */
1303
1304     default:
1305       /* We could carefully recognize each of the 2 and 3 character
1306         operators, but it is not necessary, as we only need to recognize
1307         gettext invocations.  Don't bother.  */
1308       tp->type = token_type_symbol;
1309       return;
1310     }
1311 }
1312
1313
1314 /* Supports only one pushback token.  */
1315 static void
1316 phase5_unget (token_ty *tp)
1317 {
1318   if (tp->type != token_type_eof)
1319     {
1320       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1321         abort ();
1322       phase5_pushback[phase5_pushback_length++] = *tp;
1323     }
1324 }
1325
1326
1327 /* X. Recognize a leading # symbol.  Leave leading hash as a hash, but
1328    turn hash in the middle of a line into a plain symbol token.  This
1329    makes the phase 6 easier.  */
1330
1331 static void
1332 phaseX_get (token_ty *tp)
1333 {
1334   static bool middle;   /* false at the beginning of a line, true otherwise.  */
1335
1336   phase5_get (tp);
1337
1338   if (tp->type == token_type_eoln || tp->type == token_type_eof)
1339     middle = false;
1340   else
1341     {
1342       if (middle)
1343         {
1344           /* Turn hash in the middle of a line into a plain symbol token.  */
1345           if (tp->type == token_type_hash)
1346             tp->type = token_type_symbol;
1347         }
1348       else
1349         {
1350           /* When we see leading whitespace followed by a hash sign,
1351              discard the leading white space token.  The hash is all
1352              phase 6 is interested in.  */
1353           if (tp->type == token_type_white_space)
1354             {
1355               token_ty next;
1356
1357               phase5_get (&next);
1358               if (next.type == token_type_hash)
1359                 *tp = next;
1360               else
1361                 phase5_unget (&next);
1362             }
1363           middle = true;
1364         }
1365     }
1366 }
1367
1368
1369 /* 6. Recognize and carry out directives (it also expands macros on
1370    non-directive lines, which we do not do here).  The only directive
1371    we care about are the #line and #define directive.  We throw all the
1372    others away.  */
1373
1374 static token_ty phase6_pushback[2];
1375 static int phase6_pushback_length;
1376
1377
1378 static void
1379 phase6_get (token_ty *tp)
1380 {
1381   static token_ty *buf;
1382   static int bufmax;
1383   int bufpos;
1384   int j;
1385
1386   if (phase6_pushback_length)
1387     {
1388       *tp = phase6_pushback[--phase6_pushback_length];
1389       return;
1390     }
1391   for (;;)
1392     {
1393       /* Get the next token.  If it is not a '#' at the beginning of a
1394          line (ignoring whitespace), return immediately.  */
1395       phaseX_get (tp);
1396       if (tp->type != token_type_hash)
1397         return;
1398
1399       /* Accumulate the rest of the directive in a buffer, until the
1400          "define" keyword is seen or until end of line.  */
1401       bufpos = 0;
1402       for (;;)
1403         {
1404           phaseX_get (tp);
1405           if (tp->type == token_type_eoln || tp->type == token_type_eof)
1406             break;
1407
1408           /* Before the "define" keyword and inside other directives
1409              white space is irrelevant.  So just throw it away.  */
1410           if (tp->type != token_type_white_space)
1411             {
1412               /* If it is a #define directive, return immediately,
1413                  thus treating the body of the #define directive like
1414                  normal input.  */
1415               if (bufpos == 0
1416                   && tp->type == token_type_name
1417                   && strcmp (tp->string, "define") == 0)
1418                 return;
1419
1420               /* Accumulate.  */
1421               if (bufpos >= bufmax)
1422                 {
1423                   bufmax = 2 * bufmax + 10;
1424                   buf = xrealloc (buf, bufmax * sizeof (buf[0]));
1425                 }
1426               buf[bufpos++] = *tp;
1427             }
1428         }
1429
1430       /* If it is a #line directive, with no macros to expand, act on
1431          it.  Ignore all other directives.  */
1432       if (bufpos >= 3 && buf[0].type == token_type_name
1433           && strcmp (buf[0].string, "line") == 0
1434           && buf[1].type == token_type_number
1435           && buf[2].type == token_type_string_literal)
1436         {
1437           logical_file_name = xstrdup (buf[2].string);
1438           line_number = buf[1].number;
1439         }
1440       if (bufpos >= 2 && buf[0].type == token_type_number
1441           && buf[1].type == token_type_string_literal)
1442         {
1443           logical_file_name = xstrdup (buf[1].string);
1444           line_number = buf[0].number;
1445         }
1446
1447       /* Release the storage held by the directive.  */
1448       for (j = 0; j < bufpos; ++j)
1449         free_token (&buf[j]);
1450
1451       /* We must reset the selected comments.  */
1452       savable_comment_reset ();
1453     }
1454 }
1455
1456
1457 /* Supports 2 tokens of pushback.  */
1458 static void
1459 phase6_unget (token_ty *tp)
1460 {
1461   if (tp->type != token_type_eof)
1462     {
1463       if (phase6_pushback_length == SIZEOF (phase6_pushback))
1464         abort ();
1465       phase6_pushback[phase6_pushback_length++] = *tp;
1466     }
1467 }
1468
1469
/* 8a. Convert ISO C 99 section 7.8.1 format string directives to string
   literal placeholders.  */

/* Return true if NAME is the name of an ISO C 99 section 7.8.1 format
   string directive macro for the printf family, i.e. if it has the form
     P R I { d | i | o | u | x | X }
     { { | LEAST | FAST } { 8 | 16 | 32 | 64 } | MAX | PTR }
   with nothing following.  */
static bool
is_inttypes_macro (const char *name)
{
  static const char *const widths[] = { "8", "16", "32", "64" };
  const char *rest;
  size_t i;

  if (strncmp (name, "PRI", 3) != 0)
    return false;
  rest = name + 3;

  /* The conversion character.  The explicit NUL test is needed because
     strchr would otherwise match the terminator of the set.  */
  if (*rest == '\0' || strchr ("diouxX", *rest) == NULL)
    return false;
  rest++;

  if (strcmp (rest, "MAX") == 0 || strcmp (rest, "PTR") == 0)
    return true;

  /* Optional LEAST or FAST, then a mandatory width.  */
  if (strncmp (rest, "LEAST", 5) == 0)
    rest += 5;
  else if (strncmp (rest, "FAST", 4) == 0)
    rest += 4;

  for (i = 0; i < sizeof (widths) / sizeof (widths[0]); i++)
    if (strcmp (rest, widths[i]) == 0)
      return true;

  return false;
}
1511
1512 static void
1513 phase8a_get (token_ty *tp)
1514 {
1515   phase6_get (tp);
1516   if (tp->type == token_type_name && is_inttypes_macro (tp->string))
1517     {
1518       /* Turn PRIdXXX into "<PRIdXXX>".  */
1519       char *new_string = xasprintf ("<%s>", tp->string);
1520       free (tp->string);
1521       tp->string = new_string;
1522       tp->comment = add_reference (savable_comment);
1523       tp->type = token_type_string_literal;
1524     }
1525 }
1526
1527 /* Supports 2 tokens of pushback.  */
1528 static inline void
1529 phase8a_unget (token_ty *tp)
1530 {
1531   phase6_unget (tp);
1532 }
1533
1534
1535 /* 8b. Drop whitespace.  */
1536 static void
1537 phase8b_get (token_ty *tp)
1538 {
1539   for (;;)
1540     {
1541       phase8a_get (tp);
1542
1543       if (tp->type == token_type_white_space)
1544         continue;
1545       if (tp->type == token_type_eoln)
1546         {
1547           /* We have to track the last occurrence of a string.  One
1548              mode of xgettext allows to group an extracted message
1549              with a comment for documentation.  The rule which states
1550              which comment is assumed to be grouped with the message
1551              says it should immediately precede it.  Our
1552              interpretation: between the last line of the comment and
1553              the line in which the keyword is found must be no line
1554              with non-white space tokens.  */
1555           ++newline_count;
1556           if (last_non_comment_line > last_comment_line)
1557             savable_comment_reset ();
1558           continue;
1559         }
1560       break;
1561     }
1562 }
1563
1564 /* Supports 2 tokens of pushback.  */
1565 static inline void
1566 phase8b_unget (token_ty *tp)
1567 {
1568   phase8a_unget (tp);
1569 }
1570
1571
1572 /* 8c. In ObjectiveC mode, drop '@' before a literal string.  We need to
1573    do this before performing concatenation of adjacent string literals.  */
1574 static void
1575 phase8c_get (token_ty *tp)
1576 {
1577   token_ty tmp;
1578
1579   phase8b_get (tp);
1580   if (tp->type != token_type_objc_special)
1581     return;
1582   phase8b_get (&tmp);
1583   if (tmp.type != token_type_string_literal)
1584     {
1585       phase8b_unget (&tmp);
1586       return;
1587     }
1588   /* Drop the '@' token and return immediately the following string.  */
1589   drop_reference (tmp.comment);
1590   tmp.comment = tp->comment;
1591   *tp = tmp;
1592 }
1593
1594 /* Supports only one pushback token.  */
1595 static inline void
1596 phase8c_unget (token_ty *tp)
1597 {
1598   phase8b_unget (tp);
1599 }
1600
1601
1602 /* 8. Concatenate adjacent string literals to form single string
1603    literals (because we don't expand macros, there are a few things we
1604    will miss).  */
1605
1606 static void
1607 phase8_get (token_ty *tp)
1608 {
1609   phase8c_get (tp);
1610   if (tp->type != token_type_string_literal)
1611     return;
1612   for (;;)
1613     {
1614       token_ty tmp;
1615       size_t len;
1616
1617       phase8c_get (&tmp);
1618       if (tmp.type != token_type_string_literal)
1619         {
1620           phase8c_unget (&tmp);
1621           return;
1622         }
1623       len = strlen (tp->string);
1624       tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
1625       strcpy (tp->string + len, tmp.string);
1626       free_token (&tmp);
1627     }
1628 }
1629
1630
1631 /* ===================== Reading of high-level tokens.  ==================== */
1632
1633
/* The kinds of high-level tokens delivered by x_c_lex.  */
typedef enum xgettext_token_type_ty
{
  xgettext_token_type_eof,              /* end of input */
  xgettext_token_type_keyword,          /* name found in the keyword table */
  xgettext_token_type_symbol,           /* any other name */
  xgettext_token_type_lparen,           /* ( */
  xgettext_token_type_rparen,           /* ) */
  xgettext_token_type_comma,            /* , */
  xgettext_token_type_colon,            /* : */
  xgettext_token_type_string_literal,   /* "abc" */
  xgettext_token_type_other             /* everything else */
} xgettext_token_type_ty;
1647
1648 typedef struct xgettext_token_ty xgettext_token_ty;
1649 struct xgettext_token_ty
1650 {
1651   xgettext_token_type_ty type;
1652
1653   /* This field is used only for xgettext_token_type_keyword.  */
1654   const struct callshapes *shapes;
1655
1656   /* This field is used only for xgettext_token_type_string_literal,
1657      xgettext_token_type_keyword, xgettext_token_type_symbol.  */
1658   char *string;
1659
1660   /* This field is used only for xgettext_token_type_string_literal.  */
1661   refcounted_string_list_ty *comment;
1662
1663   /* These fields are only for
1664        xgettext_token_type_keyword,
1665        xgettext_token_type_string_literal.  */
1666   lex_pos_ty pos;
1667 };
1668
1669
1670 /* 9. Convert the remaining preprocessing tokens to C tokens and
1671    discards any white space from the translation unit.  */
1672
1673 static void
1674 x_c_lex (xgettext_token_ty *tp)
1675 {
1676   for (;;)
1677     {
1678       token_ty token;
1679       void *keyword_value;
1680
1681       phase8_get (&token);
1682       switch (token.type)
1683         {
1684         case token_type_eof:
1685           tp->type = xgettext_token_type_eof;
1686           return;
1687
1688         case token_type_name:
1689           last_non_comment_line = newline_count;
1690
1691           if (hash_find_entry (objc_extensions ? &objc_keywords : &c_keywords,
1692                                token.string, strlen (token.string),
1693                                &keyword_value)
1694               == 0)
1695             {
1696               tp->type = xgettext_token_type_keyword;
1697               tp->shapes = (const struct callshapes *) keyword_value;
1698               tp->pos.file_name = logical_file_name;
1699               tp->pos.line_number = token.line_number;
1700             }
1701           else
1702             tp->type = xgettext_token_type_symbol;
1703           tp->string = token.string;
1704           return;
1705
1706         case token_type_lparen:
1707           last_non_comment_line = newline_count;
1708
1709           tp->type = xgettext_token_type_lparen;
1710           return;
1711
1712         case token_type_rparen:
1713           last_non_comment_line = newline_count;
1714
1715           tp->type = xgettext_token_type_rparen;
1716           return;
1717
1718         case token_type_comma:
1719           last_non_comment_line = newline_count;
1720
1721           tp->type = xgettext_token_type_comma;
1722           return;
1723
1724         case token_type_colon:
1725           last_non_comment_line = newline_count;
1726
1727           tp->type = xgettext_token_type_colon;
1728           return;
1729
1730         case token_type_string_literal:
1731           last_non_comment_line = newline_count;
1732
1733           tp->type = xgettext_token_type_string_literal;
1734           tp->string = token.string;
1735           tp->comment = token.comment;
1736           tp->pos.file_name = logical_file_name;
1737           tp->pos.line_number = token.line_number;
1738           return;
1739
1740         case token_type_objc_special:
1741           drop_reference (token.comment);
1742           /* FALLTHROUGH */
1743
1744         default:
1745           last_non_comment_line = newline_count;
1746
1747           tp->type = xgettext_token_type_other;
1748           return;
1749         }
1750     }
1751 }
1752
1753
1754 /* ========================= Extracting strings.  ========================== */
1755
1756
1757 /* Context lookup table.  */
1758 static flag_context_list_table_ty *flag_context_list_table;
1759
1760
1761 /* The file is broken into tokens.  Scan the token stream, looking for
1762    a keyword, followed by a left paren, followed by a string.  When we
1763    see this sequence, we have something to remember.  We assume we are
1764    looking at a valid C or C++ program, and leave the complaints about
1765    the grammar to the compiler.
1766
1767      Normal handling: Look for
1768        keyword ( ... msgid ... )
1769      Plural handling: Look for
1770        keyword ( ... msgid ... msgid_plural ... )
1771
1772    We use recursion because the arguments before msgid or between msgid
1773    and msgid_plural can contain subexpressions of the same form.  */
1774
1775
1776 /* Extract messages until the next balanced closing parenthesis.
1777    Extracted messages are added to MLP.
1778    Return true upon eof, false upon closing parenthesis.  */
1779 static bool
1780 extract_parenthesized (message_list_ty *mlp,
1781                        flag_context_ty outer_context,
1782                        flag_context_list_iterator_ty context_iter,
1783                        struct arglist_parser *argparser)
1784 {
1785   /* Current argument number.  */
1786   int arg = 1;
1787   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1788   int state;
1789   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1790   const struct callshapes *next_shapes = NULL;
1791   /* Context iterator that will be used if the next token is a '('.  */
1792   flag_context_list_iterator_ty next_context_iter =
1793     passthrough_context_list_iterator;
1794   /* Context iterator that will be used if the next token is a ':'.
1795      (Objective C selector syntax.)  */
1796   flag_context_list_iterator_ty selectorcall_context_iter =
1797     passthrough_context_list_iterator;
1798   /* Current context.  */
1799   flag_context_ty inner_context =
1800     inherited_context (outer_context,
1801                        flag_context_list_iterator_advance (&context_iter));
1802
1803   /* Start state is 0.  */
1804   state = 0;
1805
1806   for (;;)
1807     {
1808       xgettext_token_ty token;
1809
1810       x_c_lex (&token);
1811       switch (token.type)
1812         {
1813         case xgettext_token_type_keyword:
1814           next_shapes = token.shapes;
1815           state = 1;
1816           goto keyword_or_symbol;
1817
1818         case xgettext_token_type_symbol:
1819           state = 0;
1820         keyword_or_symbol:
1821           next_context_iter =
1822             flag_context_list_iterator (
1823               flag_context_list_table_lookup (
1824                 flag_context_list_table,
1825                 token.string, strlen (token.string)));
1826           if (objc_extensions)
1827             {
1828               size_t token_string_len = strlen (token.string);
1829               token.string = xrealloc (token.string, token_string_len + 2);
1830               token.string[token_string_len] = ':';
1831               token.string[token_string_len + 1] = '\0';
1832               selectorcall_context_iter =
1833                 flag_context_list_iterator (
1834                   flag_context_list_table_lookup (
1835                     flag_context_list_table,
1836                     token.string, token_string_len + 1));
1837             }
1838           free (token.string);
1839           continue;
1840
1841         case xgettext_token_type_lparen:
1842           if (extract_parenthesized (mlp, inner_context, next_context_iter,
1843                                      arglist_parser_alloc (mlp,
1844                                                            state ? next_shapes : NULL)))
1845             {
1846               arglist_parser_done (argparser, arg);
1847               return true;
1848             }
1849           next_context_iter = null_context_list_iterator;
1850           selectorcall_context_iter = null_context_list_iterator;
1851           state = 0;
1852           continue;
1853
1854         case xgettext_token_type_rparen:
1855           arglist_parser_done (argparser, arg);
1856           return false;
1857
1858         case xgettext_token_type_comma:
1859           arg++;
1860           inner_context =
1861             inherited_context (outer_context,
1862                                flag_context_list_iterator_advance (
1863                                  &context_iter));
1864           next_context_iter = passthrough_context_list_iterator;
1865           selectorcall_context_iter = passthrough_context_list_iterator;
1866           state = 0;
1867           continue;
1868
1869         case xgettext_token_type_colon:
1870           if (objc_extensions)
1871             {
1872               context_iter = selectorcall_context_iter;
1873               inner_context =
1874                 inherited_context (inner_context,
1875                                    flag_context_list_iterator_advance (
1876                                      &context_iter));
1877               next_context_iter = passthrough_context_list_iterator;
1878               selectorcall_context_iter = passthrough_context_list_iterator;
1879             }
1880           else
1881             {
1882               next_context_iter = null_context_list_iterator;
1883               selectorcall_context_iter = null_context_list_iterator;
1884             }
1885           state = 0;
1886           continue;
1887
1888         case xgettext_token_type_string_literal:
1889           if (extract_all)
1890             remember_a_message (mlp, NULL, token.string, inner_context,
1891                                 &token.pos, NULL, token.comment);
1892           else
1893             arglist_parser_remember (argparser, arg, token.string,
1894                                      inner_context,
1895                                      token.pos.file_name, token.pos.line_number,
1896                                      token.comment);
1897           drop_reference (token.comment);
1898           next_context_iter = null_context_list_iterator;
1899           selectorcall_context_iter = null_context_list_iterator;
1900           state = 0;
1901           continue;
1902
1903         case xgettext_token_type_other:
1904           next_context_iter = null_context_list_iterator;
1905           selectorcall_context_iter = null_context_list_iterator;
1906           state = 0;
1907           continue;
1908
1909         case xgettext_token_type_eof:
1910           arglist_parser_done (argparser, arg);
1911           return true;
1912
1913         default:
1914           abort ();
1915         }
1916     }
1917 }
1918
1919
1920 static void
1921 extract_whole_file (FILE *f,
1922                     const char *real_filename, const char *logical_filename,
1923                     flag_context_list_table_ty *flag_table,
1924                     msgdomain_list_ty *mdlp)
1925 {
1926   message_list_ty *mlp = mdlp->item[0]->messages;
1927
1928   fp = f;
1929   real_file_name = real_filename;
1930   logical_file_name = xstrdup (logical_filename);
1931   line_number = 1;
1932
1933   newline_count = 0;
1934   last_comment_line = -1;
1935   last_non_comment_line = -1;
1936
1937   flag_context_list_table = flag_table;
1938
1939   init_keywords ();
1940
1941   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1942      due to an unbalanced closing parenthesis, just restart it.  */
1943   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1944                                  arglist_parser_alloc (mlp, NULL)))
1945     ;
1946
1947   /* Close scanner.  */
1948   fp = NULL;
1949   real_file_name = NULL;
1950   logical_file_name = NULL;
1951   line_number = 0;
1952 }
1953
1954
1955 void
1956 extract_c (FILE *f,
1957            const char *real_filename, const char *logical_filename,
1958            flag_context_list_table_ty *flag_table,
1959            msgdomain_list_ty *mdlp)
1960 {
1961   objc_extensions = false;
1962   extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
1963 }
1964
1965 void
1966 extract_objc (FILE *f,
1967               const char *real_filename, const char *logical_filename,
1968               flag_context_list_table_ty *flag_table,
1969               msgdomain_list_ty *mdlp)
1970 {
1971   objc_extensions = true;
1972   extract_whole_file (f, real_filename, logical_filename, flag_table, mdlp);
1973 }