gettext-tools/src/x-java.c

   1 /* xgettext Java backend.
   2    Copyright (C) 2003, 2005-2009 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2003.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #ifdef HAVE_CONFIG_H
  19 # include "config.h"
  20 #endif
  21
  22 /* Specification.  */
  23 #include "x-java.h"
  24
  25 #include <errno.h>
  26 #include <stdbool.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30
  31 #include "message.h"
  32 #include "xgettext.h"
  33 #include "error.h"
  34 #include "xalloc.h"
  35 #include "hash.h"
  36 #include "po-charset.h"
  37 #include "unistr.h"
  38 #include "gettext.h"
  39
  40 #define _(s) gettext(s)
  41
  42 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  43
  44
  45 /* The Java syntax is defined in the
  46      Java Language Specification, Second Edition,
  47      (available from http://java.sun.com/),
  48      chapter 3 "Lexical Structure".  */
  49
  50
  51 /* ====================== Keyword set customization.  ====================== */
  52
/* If true extract all strings.  Set by x_java_extract_all().  */
static bool extract_all = false;

/* Hash table of the keywords registered through x_java_keyword().
   Initialized lazily on the first registration.  */
static hash_table keywords;
/* True as long as the built-in keywords still have to be installed by
   init_keywords().  Cleared by x_java_keyword (NULL) and by init_keywords()
   itself once the defaults have been added.  */
static bool default_keywords = true;
  58
  59
/* Switch the Java backend into "extract all strings" mode by setting the
   extract_all flag.  */
void
x_java_extract_all ()
{
  extract_all = true;
}
  65
  66
  67 void
  68 x_java_keyword (const char *name)
  69 {
  70   if (name == NULL)
  71     default_keywords = false;
  72   else
  73     {
  74       const char *end;
  75       struct callshape shape;
  76       const char *colon;
  77
  78       if (keywords.table == NULL)
  79         hash_init (&keywords, 100);
  80
  81       split_keywordspec (name, &end, &shape);
  82
  83       /* The characters between name and end should form a valid Java
  84          identifier sequence with dots.
  85          A colon means an invalid parse in split_keywordspec().  */
  86       colon = strchr (name, ':');
  87       if (colon == NULL || colon >= end)
  88         insert_keyword_callshape (&keywords, name, end - name, &shape);
  89     }
  90 }
  91
  92 /* Finish initializing the keywords hash table.
  93    Called after argument processing, before each file is processed.  */
  94 static void
  95 init_keywords ()
  96 {
  97   if (default_keywords)
  98     {
  99       /* When adding new keywords here, also update the documentation in
 100          xgettext.texi!  */
 101       x_java_keyword ("GettextResource.gettext:2");        /* static method */
 102       x_java_keyword ("GettextResource.ngettext:2,3");     /* static method */
 103       x_java_keyword ("GettextResource.pgettext:2c,3");    /* static method */
 104       x_java_keyword ("GettextResource.npgettext:2c,3,4"); /* static method */
 105       x_java_keyword ("gettext");
 106       x_java_keyword ("ngettext:1,2");
 107       x_java_keyword ("pgettext:1c,2");
 108       x_java_keyword ("npgettext:1c,2,3");
 109       x_java_keyword ("getString");     /* ResourceBundle.getString */
 110       default_keywords = false;
 111     }
 112 }
 113
 114 void
 115 init_flag_table_java ()
 116 {
 117   xgettext_record_flag ("GettextResource.gettext:2:pass-java-format");
 118   xgettext_record_flag ("GettextResource.ngettext:2:pass-java-format");
 119   xgettext_record_flag ("GettextResource.ngettext:3:pass-java-format");
 120   xgettext_record_flag ("GettextResource.pgettext:3:pass-java-format");
 121   xgettext_record_flag ("GettextResource.npgettext:3:pass-java-format");
 122   xgettext_record_flag ("GettextResource.npgettext:4:pass-java-format");
 123   xgettext_record_flag ("gettext:1:pass-java-format");
 124   xgettext_record_flag ("ngettext:1:pass-java-format");
 125   xgettext_record_flag ("ngettext:2:pass-java-format");
 126   xgettext_record_flag ("pgettext:2:pass-java-format");
 127   xgettext_record_flag ("npgettext:2:pass-java-format");
 128   xgettext_record_flag ("npgettext:3:pass-java-format");
 129   xgettext_record_flag ("getString:1:pass-java-format");
 130   xgettext_record_flag ("MessageFormat:1:java-format");
 131   xgettext_record_flag ("MessageFormat.format:1:java-format");
 132 }
 133
 134
 135 /* ======================== Reading of characters.  ======================== */
 136
/* Real filename, used in error messages about the input file
   (e.g. the read error reported by phase1_getc()).  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages
   and to prefix the warnings emitted while scanning.
   line_number is maintained by phase3_getc() and phase3_ungetc().  */
static char *logical_file_name;
static int line_number;

/* The input file stream, read byte by byte by phase1_getc().  */
static FILE *fp;
 146
 147
 148 /* Fetch the next single-byte character from the input file.
 149    Pushback can consist of an unlimited number of 'u' followed by up to 4
 150    other characters.  */
 151
/* Special coding of multiple 'u's in the pushback buffer: a single slot
   holding MULTIPLE_U (count) stands for COUNT consecutive 'u' characters.
   Every slot value >= MULTIPLE_U (0) is interpreted this way.  */
#define MULTIPLE_U(count) (0x1000 + (count))

/* Pushback stack of phase 1.  The most recently pushed entry is at index
   phase1_pushback_length - 1.  */
static int phase1_pushback[5];
static unsigned int phase1_pushback_length;
 157
 158 static int
 159 phase1_getc ()
 160 {
 161   int c;
 162
 163   if (phase1_pushback_length)
 164     {
 165       c = phase1_pushback[--phase1_pushback_length];
 166       if (c >= MULTIPLE_U (0))
 167         {
 168           if (c > MULTIPLE_U (1))
 169             phase1_pushback[phase1_pushback_length++] = c - 1;
 170           return 'u';
 171         }
 172       else
 173         return c;
 174     }
 175
 176   c = getc (fp);
 177
 178   if (c == EOF)
 179     {
 180       if (ferror (fp))
 181         error (EXIT_FAILURE, errno, _("\
 182 error while reading \"%s\""), real_file_name);
 183     }
 184
 185   return c;
 186 }
 187
 188 /* Supports any number of 'u' and up to 4 arbitrary characters of pushback.  */
 189 static void
 190 phase1_ungetc (int c)
 191 {
 192   if (c != EOF)
 193     {
 194       if (c == 'u')
 195         {
 196           if (phase1_pushback_length > 0
 197               && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0))
 198             phase1_pushback[phase1_pushback_length - 1]++;
 199           else
 200             {
 201               if (phase1_pushback_length == SIZEOF (phase1_pushback))
 202                 abort ();
 203               phase1_pushback[phase1_pushback_length++] = MULTIPLE_U (1);
 204             }
 205         }
 206       else
 207         {
 208           if (phase1_pushback_length == SIZEOF (phase1_pushback))
 209             abort ();
 210           phase1_pushback[phase1_pushback_length++] = c;
 211         }
 212     }
 213 }
 214
 215
 216 /* Fetch the next single-byte character or Unicode character from the file.
 217    (Here, as in the Java Language Specification, when we say "Unicode
 218    character", we actually mean "UTF-16 encoding unit".)  */
 219
/* Return value of phase 2, 3, 4 when EOF is reached.
   It lies above every single-byte value and below every UNICODE () value,
   so IS_UNICODE (P2_EOF) is false.  */
#define P2_EOF 0xffff

/* Convert a UTF-16 code unit to a return value that can be distinguished
   from a single-byte return value.  */
#define UNICODE(code) (0x10000 + (code))

/* Test a return value of phase 2, 3, 4 whether it designates a UTF-16 code
   unit, i.e. whether it was built with UNICODE ().  */
#define IS_UNICODE(p2_result) ((p2_result) >= 0x10000)

/* Extract the UTF-16 code of a return value that satisfies IS_UNICODE.  */
#define UTF16_VALUE(p2_result) ((p2_result) - 0x10000)

/* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit,
   so that it can be more easily compared against an ASCII character.
   (RED (c) == 'x')  is equivalent to  (c == 'x' || c == UNICODE ('x')).  */
#define RED(p2_result) ((p2_result) & 0xffff)

/* Pushback stack of phase 2; it holds at most one unit.  */
static int phase2_pushback[1];
static int phase2_pushback_length;
 241
/* Return the next unit of phase 2: a single-byte value, UNICODE (n) for a
   backslash-u escape followed by four hexadecimal digits, or P2_EOF at end
   of input.  A pushed-back unit, if any, is returned first.  */
static int
phase2_getc ()
{
  int c;

  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  c = phase1_getc ();
  if (c == EOF)
    return P2_EOF;
  if (c == '\\')
    {
      /* NOTE(review): the Java Language Specification only lets a backslash
         start a Unicode escape when it is preceded by an even number of
         backslashes.  No such parity is tracked here, so the second
         backslash of a doubled backslash followed by 'u' and four hex
         digits is treated as starting an escape.  Confirm whether this is
         intended.  */
      c = phase1_getc ();
      if (c == 'u')
        {
          unsigned int u_count = 1;
          unsigned char buf[4];
          unsigned int n;
          int i;

          /* Any number of 'u's may follow the backslash.  Count them, so
             that they can be pushed back if the escape is malformed.  */
          for (;;)
            {
              c = phase1_getc ();
              if (c != 'u')
                break;
              u_count++;
            }
          phase1_ungetc (c);

          /* Read exactly four hexadecimal digits, accumulating their value
             in n and remembering them in buf.  */
          n = 0;
          for (i = 0; i < 4; i++)
            {
              c = phase1_getc ();

              if (c >= '0' && c <= '9')
                n = (n << 4) + (c - '0');
              else if (c >= 'A' && c <= 'F')
                n = (n << 4) + (c - 'A' + 10);
              else if (c >= 'a' && c <= 'f')
                n = (n << 4) + (c - 'a' + 10);
              else
                {
                  /* Not an escape after all.  Push everything back in
                     reverse order (offending character, digits read so
                     far, then the 'u's) and return the bare backslash.  */
                  phase1_ungetc (c);
                  while (--i >= 0)
                    phase1_ungetc (buf[i]);
                  for (; u_count > 0; u_count--)
                    phase1_ungetc ('u');
                  return '\\';
                }

              buf[i] = c;
            }
          return UNICODE (n);
        }
      /* A backslash not followed by 'u' is returned as is.  */
      phase1_ungetc (c);
      return '\\';
    }
  return c;
}
 302
 303 /* Supports only one pushback character.  */
 304 static void
 305 phase2_ungetc (int c)
 306 {
 307   if (c != P2_EOF)
 308     {
 309       if (phase2_pushback_length == SIZEOF (phase2_pushback))
 310         abort ();
 311       phase2_pushback[phase2_pushback_length++] = c;
 312     }
 313 }
 314
 315
 316 /* Fetch the next single-byte character or Unicode character from the file.
 317    With line number handling.
 318    Convert line terminators to '\n' or UNICODE ('\n').  */
 319
/* Pushback stack of phase 3; it holds at most two units.  */
static int phase3_pushback[2];
static int phase3_pushback_length;
 322
 323 static int
 324 phase3_getc ()
 325 {
 326   int c;
 327
 328   if (phase3_pushback_length)
 329     {
 330       c = phase3_pushback[--phase3_pushback_length];
 331       if (c == '\n')
 332         ++line_number;
 333       return c;
 334     }
 335
 336   c = phase2_getc ();
 337
 338   /* Handle line terminators.  */
 339   if (RED (c) == '\r')
 340     {
 341       int c1 = phase2_getc ();
 342
 343       if (RED (c1) != '\n')
 344         phase2_ungetc (c1);
 345
 346       /* Seen line terminator CR or CR/LF.  */
 347       if (c == '\r' || c1 == '\n')
 348         {
 349           ++line_number;
 350           return '\n';
 351         }
 352       else
 353         return UNICODE ('\n');
 354     }
 355   else if (RED (c) == '\n')
 356     {
 357       /* Seen line terminator LF.  */
 358       if (c == '\n')
 359         {
 360           ++line_number;
 361           return '\n';
 362         }
 363       else
 364         return UNICODE ('\n');
 365     }
 366
 367   return c;
 368 }
 369
 370 /* Supports 2 characters of pushback.  */
 371 static void
 372 phase3_ungetc (int c)
 373 {
 374   if (c != P2_EOF)
 375     {
 376       if (c == '\n')
 377         --line_number;
 378       if (phase3_pushback_length == SIZEOF (phase3_pushback))
 379         abort ();
 380       phase3_pushback[phase3_pushback_length++] = c;
 381     }
 382 }
 383
 384
 385 /* ========================= Accumulating strings.  ======================== */
 386
 387 /* A string buffer type that allows appending bytes (in the
 388    xgettext_current_source_encoding) or Unicode characters.
 389    Returns the entire string in UTF-8 encoding.  */
 390
struct string_buffer
{
  /* The part of the string that has already been converted to UTF-8.
     utf8_buflen bytes are in use out of utf8_allocated.  The buffer is not
     NUL-terminated until string_buffer_result() is called.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
  /* The first half of an UTF-16 surrogate character, waiting for its
     second half; 0 if none is pending.  */
  unsigned short utf16_surr;
  /* The part of the string that is still in the source encoding.
     curr_buflen bytes are in use out of curr_allocated.  It is converted
     and appended to utf8_buffer by string_buffer_flush_curr_buffer().  */
  char *curr_buffer;
  size_t curr_buflen;
  size_t curr_allocated;
  /* The lexical context.  Used only for error message purposes.  */
  lexical_context_ty lcontext;
};
 406
 407 /* Initialize a 'struct string_buffer' to empty.  */
 408 static inline void
 409 init_string_buffer (struct string_buffer *bp, lexical_context_ty lcontext)
 410 {
 411   bp->utf8_buffer = NULL;
 412   bp->utf8_buflen = 0;
 413   bp->utf8_allocated = 0;
 414   bp->utf16_surr = 0;
 415   bp->curr_buffer = NULL;
 416   bp->curr_buflen = 0;
 417   bp->curr_allocated = 0;
 418   bp->lcontext = lcontext;
 419 }
 420
 421 /* Auxiliary function: Append a byte to bp->curr.  */
 422 static inline void
 423 string_buffer_append_byte (struct string_buffer *bp, unsigned char c)
 424 {
 425   if (bp->curr_buflen == bp->curr_allocated)
 426     {
 427       bp->curr_allocated = 2 * bp->curr_allocated + 10;
 428       bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
 429     }
 430   bp->curr_buffer[bp->curr_buflen++] = c;
 431 }
 432
 433 /* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
 434 static inline void
 435 string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
 436 {
 437   if (bp->utf8_buflen + count > bp->utf8_allocated)
 438     {
 439       size_t new_allocated = 2 * bp->utf8_allocated + 10;
 440       if (new_allocated < bp->utf8_buflen + count)
 441         new_allocated = bp->utf8_buflen + count;
 442       bp->utf8_allocated = new_allocated;
 443       bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
 444     }
 445 }
 446
 447 /* Auxiliary function: Append a Unicode character to bp->utf8.
 448    uc must be < 0x110000.  */
 449 static inline void
 450 string_buffer_append_unicode (struct string_buffer *bp, ucs4_t uc)
 451 {
 452   unsigned char utf8buf[6];
 453   int count = u8_uctomb (utf8buf, uc, 6);
 454
 455   if (count < 0)
 456     /* The caller should have ensured that uc is not out-of-range.  */
 457     abort ();
 458
 459   string_buffer_append_unicode_grow (bp, count);
 460   memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
 461   bp->utf8_buflen += count;
 462 }
 463
 464 /* Auxiliary function: Handle the attempt to append a lone surrogate to
 465    bp->utf8.  */
 466 static void
 467 string_buffer_append_lone_surrogate (struct string_buffer *bp, unsigned int uc)
 468 {
 469   /* A half surrogate is invalid, therefore use U+FFFD instead.
 470      It appears to be valid Java: The Java Language Specification,
 471      3rd ed., says "The Java programming language represents text
 472      in sequences of 16-bit code units, using the UTF-16 encoding."
 473      but does not impose constraints on the use of \uxxxx escape
 474      sequences for surrogates.  And the JDK's javac happily groks
 475      half surrogates.
 476      But a half surrogate is invalid in UTF-8:
 477        - RFC 3629 says
 478            "The definition of UTF-8 prohibits encoding character
 479             numbers between U+D800 and U+DFFF".
 480        - Unicode 4.0 chapter 3
 481          <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
 482          section 3.9, p.77, says
 483            "Because surrogate code points are not Unicode scalar
 484             values, any UTF-8 byte sequence that would otherwise
 485             map to code points D800..DFFF is ill-formed."
 486          and in table 3-6, p. 78, does not mention D800..DFFF.
 487        - The unicode.org FAQ question "How do I convert an unpaired
 488          UTF-16 surrogate to UTF-8?" has the answer
 489            "By representing such an unpaired surrogate on its own
 490             as a 3-byte sequence, the resulting UTF-8 data stream
 491             would become ill-formed."
 492      So use U+FFFD instead.  */
 493   error_with_progname = false;
 494   error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
 495          logical_file_name, line_number, uc);
 496   error_with_progname = true;
 497   string_buffer_append_unicode (bp, 0xfffd);
 498 }
 499
 500 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer.  */
 501 static inline void
 502 string_buffer_flush_utf16_surr (struct string_buffer *bp)
 503 {
 504   if (bp->utf16_surr != 0)
 505     {
 506       string_buffer_append_lone_surrogate (bp, bp->utf16_surr);
 507       bp->utf16_surr = 0;
 508     }
 509 }
 510
 511 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer.  */
 512 static inline void
 513 string_buffer_flush_curr_buffer (struct string_buffer *bp, int lineno)
 514 {
 515   if (bp->curr_buflen > 0)
 516     {
 517       char *curr;
 518       size_t count;
 519
 520       string_buffer_append_byte (bp, '\0');
 521
 522       /* Convert from the source encoding to UTF-8.  */
 523       curr = from_current_source_encoding (bp->curr_buffer, bp->lcontext,
 524                                            logical_file_name, lineno);
 525
 526       /* Append it to bp->utf8_buffer.  */
 527       count = strlen (curr);
 528       string_buffer_append_unicode_grow (bp, count);
 529       memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
 530       bp->utf8_buflen += count;
 531
 532       if (curr != bp->curr_buffer)
 533         free (curr);
 534       bp->curr_buflen = 0;
 535     }
 536 }
 537
 538 /* Append a character or Unicode character to a 'struct string_buffer'.  */
 539 static void
 540 string_buffer_append (struct string_buffer *bp, int c)
 541 {
 542   if (IS_UNICODE (c))
 543     {
 544       /* Append a Unicode character.  */
 545
 546       /* Switch from multibyte character mode to Unicode character mode.  */
 547       string_buffer_flush_curr_buffer (bp, line_number);
 548
 549       /* Test whether this character and the previous one form a Unicode
 550          surrogate character pair.  */
 551       if (bp->utf16_surr != 0
 552           && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
 553         {
 554           unsigned short utf16buf[2];
 555           ucs4_t uc;
 556
 557           utf16buf[0] = bp->utf16_surr;
 558           utf16buf[1] = UTF16_VALUE (c);
 559           if (u16_mbtouc (&uc, utf16buf, 2) != 2)
 560             abort ();
 561
 562           string_buffer_append_unicode (bp, uc);
 563           bp->utf16_surr = 0;
 564         }
 565       else
 566         {
 567           string_buffer_flush_utf16_surr (bp);
 568
 569           if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
 570             bp->utf16_surr = UTF16_VALUE (c);
 571           else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
 572             string_buffer_append_lone_surrogate (bp, UTF16_VALUE (c));
 573           else
 574             string_buffer_append_unicode (bp, UTF16_VALUE (c));
 575         }
 576     }
 577   else
 578     {
 579       /* Append a single byte.  */
 580
 581       /* Switch from Unicode character mode to multibyte character mode.  */
 582       string_buffer_flush_utf16_surr (bp);
 583
 584       /* When a newline is seen, convert the accumulated multibyte sequence.
 585          This ensures a correct line number in the error message in case of
 586          a conversion error.  The "- 1" is to account for the newline.  */
 587       if (c == '\n')
 588         string_buffer_flush_curr_buffer (bp, line_number - 1);
 589
 590       string_buffer_append_byte (bp, (unsigned char) c);
 591     }
 592 }
 593
 594 /* Return the string buffer's contents.  */
 595 static char *
 596 string_buffer_result (struct string_buffer *bp)
 597 {
 598   /* Flush all into bp->utf8_buffer.  */
 599   string_buffer_flush_utf16_surr (bp);
 600   string_buffer_flush_curr_buffer (bp, line_number);
 601   /* NUL-terminate it.  */
 602   string_buffer_append_unicode_grow (bp, 1);
 603   bp->utf8_buffer[bp->utf8_buflen] = '\0';
 604   /* Return it.  */
 605   return bp->utf8_buffer;
 606 }
 607
 608 /* Free the memory pointed to by a 'struct string_buffer'.  */
 609 static inline void
 610 free_string_buffer (struct string_buffer *bp)
 611 {
 612   free (bp->utf8_buffer);
 613   free (bp->curr_buffer);
 614 }
 615
 616
 617 /* ======================== Accumulating comments.  ======================== */
 618
 619
/* Accumulating a single comment line.
   The buffer is reset by comment_start(), filled by comment_add() and
   handed over by comment_line_end().  */

static struct string_buffer comment_buffer;
 623
 624 static inline void
 625 comment_start ()
 626 {
 627   comment_buffer.utf8_buflen = 0;
 628   comment_buffer.utf16_surr = 0;
 629   comment_buffer.curr_buflen = 0;
 630   comment_buffer.lcontext = lc_comment;
 631 }
 632
 633 static inline bool
 634 comment_at_start ()
 635 {
 636   return (comment_buffer.utf8_buflen == 0 && comment_buffer.utf16_surr == 0
 637           && comment_buffer.curr_buflen == 0);
 638 }
 639
/* Append C, a byte or a UNICODE () value, to the current comment line.  */
static inline void
comment_add (int c)
{
  string_buffer_append (&comment_buffer, c);
}
 645
 646 static inline void
 647 comment_line_end (size_t chars_to_remove)
 648 {
 649   char *buffer = string_buffer_result (&comment_buffer);
 650   size_t buflen = strlen (buffer);
 651
 652   buflen -= chars_to_remove;
 653   while (buflen >= 1
 654          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 655     --buflen;
 656   buffer[buflen] = '\0';
 657   savable_comment_add (buffer);
 658 }
 659
 660
/* These are for tracking whether comments count as immediately before
   keyword.
   last_comment_line is the line on which the most recent comment ended; it
   is set by phase4_getc().  last_non_comment_line is the line of the most
   recent token that is neither whitespace nor a comment.  */
static int last_comment_line;
static int last_non_comment_line;
 665
 666
/* Replace each comment that is not inside a character constant or string
   literal with a space or newline character.
   A comment delimited by slash-star is returned as ' ', a comment introduced
   by two slashes as '\n'.  The text of each comment line is handed to
   savable_comment_add() through comment_line_end(), and last_comment_line
   is updated.  Everything else is passed through from phase 3.  */

static int
phase4_getc ()
{
  int c0;
  int c;
  bool last_was_star;

  c0 = phase3_getc ();
  if (RED (c0) != '/')
    return c0;
  c = phase3_getc ();
  switch (RED (c))
    {
    default:
      /* A lone slash, not the start of a comment.  */
      phase3_ungetc (c);
      return c0;

    case '*':
      /* C style comment.  */
      comment_start ();
      last_was_star = false;
      for (;;)
        {
          c = phase3_getc ();
          /* An unterminated comment simply ends at end of input; the
             pending partial line is not saved.  */
          if (c == P2_EOF)
            break;
          /* We skip all leading white space, but not EOLs.  */
          if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
            comment_add (c);
          switch (RED (c))
            {
            case '\n':
              /* End of one comment line: save it without the newline.  */
              comment_line_end (1);
              comment_start ();
              last_was_star = false;
              continue;

            case '*':
              last_was_star = true;
              continue;

            case '/':
              if (last_was_star)
                {
                  /* End of the comment: save the last line without the
                     two closing delimiter characters.  This break leaves
                     only the inner switch; the break after it leaves the
                     loop.  */
                  comment_line_end (2);
                  break;
                }
              /* FALLTHROUGH */

            default:
              last_was_star = false;
              continue;
            }
          break;
        }
      last_comment_line = line_number;
      return ' ';

    case '/':
      /* C++ style comment.  */
      last_comment_line = line_number;
      comment_start ();
      for (;;)
        {
          c = phase3_getc ();
          if (RED (c) == '\n' || c == P2_EOF)
            break;
          /* We skip all leading white space, but not EOLs.  */
          if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
            comment_add (c);
        }
      phase3_ungetc (c); /* push back the newline, to decrement line_number */
      comment_line_end (0);
      phase3_getc (); /* read the newline again */
      return '\n';
    }
}
 747
/* Push C back onto the phase 4 input by delegating to phase3_ungetc().
   Supports only one pushback character.  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
 754
 755
 756 /* ========================== Reading of tokens.  ========================== */
 757
/* The kinds of tokens stored in the 'type' field of a token_ty.  */
enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_lbrace,            /* { */
  token_type_rbrace,            /* } */
  token_type_comma,             /* , */
  token_type_dot,               /* . */
  token_type_string_literal,    /* "abc" */
  token_type_number,            /* 1.23 */
  token_type_symbol,            /* identifier, keyword, null */
  token_type_plus,              /* + */
  token_type_other              /* character literal, misc. operator */
};
typedef enum token_type_ty token_type_ty;
 774
/* A token as produced by the tokenizer.  The 'string' and 'comment' fields
   are released by free_token(), depending on 'type'.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;         /* for token_type_string_literal, token_type_symbol */
  refcounted_string_list_ty *comment;   /* for token_type_string_literal */
  int line_number;      /* value of line_number when the token was started */
};
 783
 784
 785 /* Free the memory pointed to by a 'struct token_ty'.  */
 786 static inline void
 787 free_token (token_ty *tp)
 788 {
 789   if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
 790     free (tp->string);
 791   if (tp->type == token_type_string_literal)
 792     drop_reference (tp->comment);
 793 }
 794
 795
 796 /* Read an escape sequence inside a string literal or character literal.  */
 797 static inline int
 798 do_getc_escaped ()
 799 {
 800   int c;
 801
 802   /* Use phase 3, because phase 4 elides comments.  */
 803   c = phase3_getc ();
 804   if (c == P2_EOF)
 805     return UNICODE ('\\');
 806   switch (RED (c))
 807     {
 808     case 'b':
 809       return UNICODE (0x08);
 810     case 't':
 811       return UNICODE (0x09);
 812     case 'n':
 813       return UNICODE (0x0a);
 814     case 'f':
 815       return UNICODE (0x0c);
 816     case 'r':
 817       return UNICODE (0x0d);
 818     case '"':
 819       return UNICODE ('"');
 820     case '\'':
 821       return UNICODE ('\'');
 822     case '\\':
 823       return UNICODE ('\\');
 824     case '0': case '1': case '2': case '3':
 825     case '4': case '5': case '6': case '7':
 826       {
 827         int n = RED (c) - '0';
 828         bool maybe3digits = (n < 4);
 829
 830         c = phase3_getc ();
 831         if (RED (c) >= '0' && RED (c) <= '7')
 832           {
 833             n = (n << 3) + (RED (c) - '0');
 834             if (maybe3digits)
 835               {
 836                 c = phase3_getc ();
 837                 if (RED (c) >= '0' && RED (c) <= '7')
 838                   n = (n << 3) + (RED (c) - '0');
 839                 else
 840                   phase3_ungetc (c);
 841               }
 842           }
 843         else
 844           phase3_ungetc (c);
 845
 846         return UNICODE (n);
 847       }
 848     default:
 849       /* Invalid escape sequence.  */
 850       phase3_ungetc (c);
 851       return UNICODE ('\\');
 852     }
 853 }
 854
 855 /* Read a string literal or character literal.  */
 856 static void
 857 accumulate_escaped (struct string_buffer *literal, int delimiter)
 858 {
 859   int c;
 860
 861   for (;;)
 862     {
 863       /* Use phase 3, because phase 4 elides comments.  */
 864       c = phase3_getc ();
 865       if (c == P2_EOF || RED (c) == delimiter)
 866         break;
 867       if (RED (c) == '\n')
 868         {
 869           phase3_ungetc (c);
 870           error_with_progname = false;
 871           if (delimiter == '\'')
 872             error (0, 0, _("%s:%d: warning: unterminated character constant"),
 873                    logical_file_name, line_number);
 874           else
 875             error (0, 0, _("%s:%d: warning: unterminated string constant"),
 876                    logical_file_name, line_number);
 877           error_with_progname = true;
 878           break;
 879         }
 880       if (RED (c) == '\\')
 881         c = do_getc_escaped ();
 882       string_buffer_append (literal, c);
 883     }
 884 }
 885
 886
 887 /* Combine characters into tokens.  Discard whitespace.  */
 888
/* Pushback stack of tokens for phase 5; it holds at most three tokens.
   phase5_get() pops from it before reading new input.  */
static token_ty phase5_pushback[3];
static int phase5_pushback_length;
 891
 892 static void
 893 phase5_get (token_ty *tp)
 894 {
 895   int c;
 896
 897   if (phase5_pushback_length)
 898     {
 899       *tp = phase5_pushback[--phase5_pushback_length];
 900       return;
 901     }
 902   tp->string = NULL;
 903
 904   for (;;)
 905     {
 906       tp->line_number = line_number;
 907       c = phase4_getc ();
 908
 909       if (c == P2_EOF)
 910         {
 911           tp->type = token_type_eof;
 912           return;
 913         }
 914
 915       switch (RED (c))
 916         {
 917         case '\n':
 918           if (last_non_comment_line > last_comment_line)
 919             savable_comment_reset ();
 920           /* FALLTHROUGH */
 921         case ' ':
 922         case '\t':
 923         case '\f':
 924           /* Ignore whitespace and comments.  */
 925           continue;
 926         }
 927
 928       last_non_comment_line = tp->line_number;
 929
 930       switch (RED (c))
 931         {
 932         case '(':
 933           tp->type = token_type_lparen;
 934           return;
 935
 936         case ')':
 937           tp->type = token_type_rparen;
 938           return;
 939
 940         case '{':
 941           tp->type = token_type_lbrace;
 942           return;
 943
 944         case '}':
 945           tp->type = token_type_rbrace;
 946           return;
 947
 948         case ',':
 949           tp->type = token_type_comma;
 950           return;
 951
 952         case '.':
 953           c = phase4_getc ();
 954           if (!(RED (c) >= '0' && RED (c) <= '9'))
 955             {
 956               phase4_ungetc (c);
 957               tp->type = token_type_dot;
 958               return;
 959             }
 960           /* FALLTHROUGH */
 961
 962         case '0': case '1': case '2': case '3': case '4':
 963         case '5': case '6': case '7': case '8': case '9':
 964           {
 965             /* Don't need to verify the complicated syntax of integers and
 966                floating-point numbers.  We assume a valid Java input.
 967                The simplified syntax that we recognize as number is: any
 968                sequence of alphanumeric characters, additionally '+' and '-'
 969                immediately after 'e' or 'E' except in hexadecimal numbers.  */
 970             bool hexadecimal = false;
 971
 972             for (;;)
 973               {
 974                 c = phase4_getc ();
 975                 if (RED (c) >= '0' && RED (c) <= '9')
 976                   continue;
 977                 if ((RED (c) >= 'A' && RED (c) <= 'Z')
 978                     || (RED (c) >= 'a' && RED (c) <= 'z'))
 979                   {
 980                     if (RED (c) == 'X' || RED (c) == 'x')
 981                       hexadecimal = true;
 982                     if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
 983                       {
 984                         c = phase4_getc ();
 985                         if (!(RED (c) == '+' || RED (c) == '-'))
 986                           phase4_ungetc (c);
 987                       }
 988                     continue;
 989                   }
 990                 if (RED (c) == '.')
 991                   continue;
 992                 break;
 993               }
 994             phase4_ungetc (c);
 995             tp->type = token_type_number;
 996             return;
 997           }
 998
 999         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1000         case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
1001         case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1002         case 'V': case 'W': case 'X': case 'Y': case 'Z':
1003         case '_':
1004         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1005         case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1006         case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1007         case 'v': case 'w': case 'x': case 'y': case 'z':
1008           /* Although Java allows identifiers containing many Unicode
1009              characters, we recognize only identifiers consisting of ASCII
1010              characters.  This avoids conversion hassles w.r.t. the --keyword
1011              arguments, and shouldn't be a big problem in practice.  */
1012           {
1013             static char *buffer;
1014             static int bufmax;
1015             int bufpos = 0;
1016             for (;;)
1017               {
1018                 if (bufpos >= bufmax)
1019                   {
1020                     bufmax = 2 * bufmax + 10;
1021                     buffer = xrealloc (buffer, bufmax);
1022                   }
1023                 buffer[bufpos++] = RED (c);
1024                 c = phase4_getc ();
1025                 if (!((RED (c) >= 'A' && RED (c) <= 'Z')
1026                       || (RED (c) >= 'a' && RED (c) <= 'z')
1027                       || (RED (c) >= '0' && RED (c) <= '9')
1028                       || RED (c) == '_'))
1029                   break;
1030               }
1031             phase4_ungetc (c);
1032             if (bufpos >= bufmax)
1033               {
1034                 bufmax = 2 * bufmax + 10;
1035                 buffer = xrealloc (buffer, bufmax);
1036               }
1037             buffer[bufpos] = '\0';
1038             tp->string = xstrdup (buffer);
1039             tp->type = token_type_symbol;
1040             return;
1041           }
1042
1043         case '"':
1044           /* String literal.  */
1045           {
1046             struct string_buffer literal;
1047
1048             init_string_buffer (&literal, lc_string);
1049             accumulate_escaped (&literal, '"');
1050             tp->string = xstrdup (string_buffer_result (&literal));
1051             free_string_buffer (&literal);
1052             tp->comment = add_reference (savable_comment);
1053             tp->type = token_type_string_literal;
1054             return;
1055           }
1056
1057         case '\'':
1058           /* Character literal.  */
1059           {
1060             struct string_buffer literal;
1061
1062             init_string_buffer (&literal, lc_outside);
1063             accumulate_escaped (&literal, '\'');
1064             free_string_buffer (&literal);
1065             tp->type = token_type_other;
1066             return;
1067           }
1068
1069         case '+':
1070           c = phase4_getc ();
1071           if (RED (c) == '+')
1072             /* Operator ++ */
1073             tp->type = token_type_other;
1074           else if (RED (c) == '=')
1075             /* Operator += */
1076             tp->type = token_type_other;
1077           else
1078             {
1079               /* Operator + */
1080               phase4_ungetc (c);
1081               tp->type = token_type_plus;
1082             }
1083           return;
1084
1085         default:
1086           /* Misc. operator.  */
1087           tp->type = token_type_other;
1088           return;
1089         }
1090     }
1091 }
1092
1093 /* Supports 3 tokens of pushback.  */
1094 static void
1095 phase5_unget (token_ty *tp)
1096 {
1097   if (tp->type != token_type_eof)
1098     {
1099       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1100         abort ();
1101       phase5_pushback[phase5_pushback_length++] = *tp;
1102     }
1103 }
1104
1105
1106 /* Compile-time optimization of string literal concatenation.
1107    Combine "string1" + ... + "stringN" to the concatenated string if
1108      - the token before this expression is not ')' (because then the first
1109        string could be part of a cast expression),
1110      - the token after this expression is not '.' (because then the last
1111        string could be part of a method call expression).  */
1112
1113 static token_ty phase6_pushback[2];
1114 static int phase6_pushback_length;
1115
1116 static token_type_ty phase6_last;
1117
1118 static void
1119 phase6_get (token_ty *tp)
1120 {
1121   if (phase6_pushback_length)
1122     {
1123       *tp = phase6_pushback[--phase6_pushback_length];
1124       return;
1125     }
1126
1127   phase5_get (tp);
1128   if (tp->type == token_type_string_literal && phase6_last != token_type_rparen)
1129     {
1130       char *sum = tp->string;
1131       size_t sum_len = strlen (sum);
1132
1133       for (;;)
1134         {
1135           token_ty token2;
1136
1137           phase5_get (&token2);
1138           if (token2.type == token_type_plus)
1139             {
1140               token_ty token3;
1141
1142               phase5_get (&token3);
1143               if (token3.type == token_type_string_literal)
1144                 {
1145                   token_ty token_after;
1146
1147                   phase5_get (&token_after);
1148                   if (token_after.type != token_type_dot)
1149                     {
1150                       char *addend = token3.string;
1151                       size_t addend_len = strlen (addend);
1152
1153                       sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1154                       memcpy (sum + sum_len, addend, addend_len + 1);
1155                       sum_len += addend_len;
1156
1157                       phase5_unget (&token_after);
1158                       free_token (&token3);
1159                       free_token (&token2);
1160                       continue;
1161                     }
1162                   phase5_unget (&token_after);
1163                 }
1164               phase5_unget (&token3);
1165             }
1166           phase5_unget (&token2);
1167           break;
1168         }
1169       tp->string = sum;
1170     }
1171   phase6_last = tp->type;
1172 }
1173
1174 /* Supports 2 tokens of pushback.  */
1175 static void
1176 phase6_unget (token_ty *tp)
1177 {
1178   if (tp->type != token_type_eof)
1179     {
1180       if (phase6_pushback_length == SIZEOF (phase6_pushback))
1181         abort ();
1182       phase6_pushback[phase6_pushback_length++] = *tp;
1183     }
1184 }
1185
1186
1187 static void
1188 x_java_lex (token_ty *tp)
1189 {
1190   phase6_get (tp);
1191 }
1192
1193 /* Supports 2 tokens of pushback.  */
1194 static void
1195 x_java_unlex (token_ty *tp)
1196 {
1197   phase6_unget (tp);
1198 }
1199
1200
1201 /* ========================= Extracting strings.  ========================== */
1202
1203
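/* Flag context lookup table for the input being scanned.  It is installed
   by extract_java and consulted for each (possibly dotted) symbol name.  */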
1205 static flag_context_list_table_ty *flag_context_list_table;
1206
1207
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Java program, and leave the complaints about
   the grammar to the compiler.
1213
1214      Normal handling: Look for
1215        keyword ( ... msgid ... )
1216      Plural handling: Look for
1217        keyword ( ... msgid ... msgid_plural ... )
1218
1219    We use recursion because the arguments before msgid or between msgid
1220    and msgid_plural can contain subexpressions of the same form.  */
1221
1222
1223 /* Extract messages until the next balanced closing parenthesis or brace,
1224    depending on TERMINATOR.
1225    Extracted messages are added to MLP.
1226    Return true upon eof, false upon closing parenthesis or brace.  */
1227 static bool
1228 extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1229                        flag_context_ty outer_context,
1230                        flag_context_list_iterator_ty context_iter,
1231                        struct arglist_parser *argparser)
1232 {
1233   /* Current argument number.  */
1234   int arg = 1;
1235   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1236   int state;
1237   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1238   const struct callshapes *next_shapes = NULL;
1239   /* Context iterator that will be used if the next token is a '('.  */
1240   flag_context_list_iterator_ty next_context_iter =
1241     passthrough_context_list_iterator;
1242   /* Current context.  */
1243   flag_context_ty inner_context =
1244     inherited_context (outer_context,
1245                        flag_context_list_iterator_advance (&context_iter));
1246
1247   /* Start state is 0.  */
1248   state = 0;
1249
1250   for (;;)
1251     {
1252       token_ty token;
1253
1254       x_java_lex (&token);
1255       switch (token.type)
1256         {
1257         case token_type_symbol:
1258           {
1259             /* Combine symbol1 . ... . symbolN to a single strings, so that
1260                we can recognize static function calls like
1261                GettextResource.gettext.  The information present for
1262                symbolI.....symbolN has precedence over the information for
1263                symbolJ.....symbolN with J > I.  */
1264             char *sum = token.string;
1265             size_t sum_len = strlen (sum);
1266             const char *dottedname;
1267             flag_context_list_ty *context_list;
1268
1269             for (;;)
1270               {
1271                 token_ty token2;
1272
1273                 x_java_lex (&token2);
1274                 if (token2.type == token_type_dot)
1275                   {
1276                     token_ty token3;
1277
1278                     x_java_lex (&token3);
1279                     if (token3.type == token_type_symbol)
1280                       {
1281                         char *addend = token3.string;
1282                         size_t addend_len = strlen (addend);
1283
1284                         sum =
1285                           (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1286                         sum[sum_len] = '.';
1287                         memcpy (sum + sum_len + 1, addend, addend_len + 1);
1288                         sum_len += 1 + addend_len;
1289
1290                         free_token (&token3);
1291                         free_token (&token2);
1292                         continue;
1293                       }
1294                     x_java_unlex (&token3);
1295                   }
1296                 x_java_unlex (&token2);
1297                 break;
1298               }
1299
1300             for (dottedname = sum;;)
1301               {
1302                 void *keyword_value;
1303
1304                 if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
1305                                      &keyword_value)
1306                     == 0)
1307                   {
1308                     next_shapes = (const struct callshapes *) keyword_value;
1309                     state = 1;
1310                     break;
1311                   }
1312
1313                 dottedname = strchr (dottedname, '.');
1314                 if (dottedname == NULL)
1315                   {
1316                     state = 0;
1317                     break;
1318                   }
1319                 dottedname++;
1320               }
1321
1322             for (dottedname = sum;;)
1323               {
1324                 context_list =
1325                   flag_context_list_table_lookup (
1326                     flag_context_list_table,
1327                     dottedname, strlen (dottedname));
1328                 if (context_list != NULL)
1329                   break;
1330
1331                 dottedname = strchr (dottedname, '.');
1332                 if (dottedname == NULL)
1333                   break;
1334                 dottedname++;
1335               }
1336             next_context_iter = flag_context_list_iterator (context_list);
1337
1338             free (sum);
1339             continue;
1340           }
1341
1342         case token_type_lparen:
1343           if (extract_parenthesized (mlp, token_type_rparen,
1344                                      inner_context, next_context_iter,
1345                                      arglist_parser_alloc (mlp,
1346                                                            state ? next_shapes : NULL)))
1347             {
1348               xgettext_current_source_encoding = po_charset_utf8;
1349               arglist_parser_done (argparser, arg);
1350               xgettext_current_source_encoding = xgettext_global_source_encoding;
1351               return true;
1352             }
1353           next_context_iter = null_context_list_iterator;
1354           state = 0;
1355           continue;
1356
1357         case token_type_rparen:
1358           if (terminator == token_type_rparen)
1359             {
1360               xgettext_current_source_encoding = po_charset_utf8;
1361               arglist_parser_done (argparser, arg);
1362               xgettext_current_source_encoding = xgettext_global_source_encoding;
1363               return false;
1364             }
1365           if (terminator == token_type_rbrace)
1366             {
1367               error_with_progname = false;
1368               error (0, 0,
1369                      _("%s:%d: warning: ')' found where '}' was expected"),
1370                      logical_file_name, token.line_number);
1371               error_with_progname = true;
1372             }
1373           next_context_iter = null_context_list_iterator;
1374           state = 0;
1375           continue;
1376
1377         case token_type_lbrace:
1378           if (extract_parenthesized (mlp, token_type_rbrace,
1379                                      null_context, null_context_list_iterator,
1380                                      arglist_parser_alloc (mlp, NULL)))
1381             {
1382               xgettext_current_source_encoding = po_charset_utf8;
1383               arglist_parser_done (argparser, arg);
1384               xgettext_current_source_encoding = xgettext_global_source_encoding;
1385               return true;
1386             }
1387           next_context_iter = null_context_list_iterator;
1388           state = 0;
1389           continue;
1390
1391         case token_type_rbrace:
1392           if (terminator == token_type_rbrace)
1393             {
1394               xgettext_current_source_encoding = po_charset_utf8;
1395               arglist_parser_done (argparser, arg);
1396               xgettext_current_source_encoding = xgettext_global_source_encoding;
1397               return false;
1398             }
1399           if (terminator == token_type_rparen)
1400             {
1401               error_with_progname = false;
1402               error (0, 0,
1403                      _("%s:%d: warning: '}' found where ')' was expected"),
1404                      logical_file_name, token.line_number);
1405               error_with_progname = true;
1406             }
1407           next_context_iter = null_context_list_iterator;
1408           state = 0;
1409           continue;
1410
1411         case token_type_comma:
1412           arg++;
1413           inner_context =
1414             inherited_context (outer_context,
1415                                flag_context_list_iterator_advance (
1416                                  &context_iter));
1417           next_context_iter = passthrough_context_list_iterator;
1418           state = 0;
1419           continue;
1420
1421         case token_type_string_literal:
1422           {
1423             lex_pos_ty pos;
1424             pos.file_name = logical_file_name;
1425             pos.line_number = token.line_number;
1426
1427             xgettext_current_source_encoding = po_charset_utf8;
1428             if (extract_all)
1429               remember_a_message (mlp, NULL, token.string, inner_context,
1430                                   &pos, NULL, token.comment);
1431             else
1432               arglist_parser_remember (argparser, arg, token.string,
1433                                        inner_context,
1434                                        pos.file_name, pos.line_number,
1435                                        token.comment);
1436             xgettext_current_source_encoding = xgettext_global_source_encoding;
1437           }
1438           drop_reference (token.comment);
1439           next_context_iter = null_context_list_iterator;
1440           state = 0;
1441           continue;
1442
1443         case token_type_eof:
1444           xgettext_current_source_encoding = po_charset_utf8;
1445           arglist_parser_done (argparser, arg);
1446           xgettext_current_source_encoding = xgettext_global_source_encoding;
1447           return true;
1448
1449         case token_type_dot:
1450         case token_type_number:
1451         case token_type_plus:
1452         case token_type_other:
1453           next_context_iter = null_context_list_iterator;
1454           state = 0;
1455           continue;
1456
1457         default:
1458           abort ();
1459         }
1460     }
1461 }
1462
1463
1464 void
1465 extract_java (FILE *f,
1466               const char *real_filename, const char *logical_filename,
1467               flag_context_list_table_ty *flag_table,
1468               msgdomain_list_ty *mdlp)
1469 {
1470   message_list_ty *mlp = mdlp->item[0]->messages;
1471
1472   fp = f;
1473   real_file_name = real_filename;
1474   logical_file_name = xstrdup (logical_filename);
1475   line_number = 1;
1476
1477   last_comment_line = -1;
1478   last_non_comment_line = -1;
1479
1480   phase6_last = token_type_eof;
1481
1482   flag_context_list_table = flag_table;
1483
1484   init_keywords ();
1485
1486   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1487      due to an unbalanced closing parenthesis, just restart it.  */
1488   while (!extract_parenthesized (mlp, token_type_eof,
1489                                  null_context, null_context_list_iterator,
1490                                  arglist_parser_alloc (mlp, NULL)))
1491     ;
1492
1493   fp = NULL;
1494   real_file_name = NULL;
1495   logical_file_name = NULL;
1496   line_number = 0;
1497 }