1 /* xgettext Java backend.
2 Copyright (C) 2003, 2005-2009 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
36 #include "po-charset.h"
40 #define _(s) gettext(s)
42 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
45 /* The Java syntax is defined in the
46 Java Language Specification, Second Edition,
47 (available from http://java.sun.com/),
48 chapter 3 "Lexical Structure". */
51 /* ====================== Keyword set customization. ====================== */
53 /* If true extract all strings. */
54 static bool extract_all = false;
56 static hash_table keywords;
57 static bool default_keywords = true;
68 x_java_keyword (const char *name)
71 default_keywords = false;
75 struct callshape shape;
78 if (keywords.table == NULL)
79 hash_init (&keywords, 100);
81 split_keywordspec (name, &end, &shape);
83 /* The characters between name and end should form a valid Java
84 identifier sequence with dots.
85 A colon means an invalid parse in split_keywordspec(). */
86 colon = strchr (name, ':');
87 if (colon == NULL || colon >= end)
88 insert_keyword_callshape (&keywords, name, end - name, &shape);
92 /* Finish initializing the keywords hash table.
93 Called after argument processing, before each file is processed. */
99 /* When adding new keywords here, also update the documentation in
101 x_java_keyword ("GettextResource.gettext:2"); /* static method */
102 x_java_keyword ("GettextResource.ngettext:2,3"); /* static method */
103 x_java_keyword ("GettextResource.pgettext:2c,3"); /* static method */
104 x_java_keyword ("GettextResource.npgettext:2c,3,4"); /* static method */
105 x_java_keyword ("gettext");
106 x_java_keyword ("ngettext:1,2");
107 x_java_keyword ("pgettext:1c,2");
108 x_java_keyword ("npgettext:1c,2,3");
109 x_java_keyword ("getString"); /* ResourceBundle.getString */
110 default_keywords = false;
115 init_flag_table_java ()
117 xgettext_record_flag ("GettextResource.gettext:2:pass-java-format");
118 xgettext_record_flag ("GettextResource.ngettext:2:pass-java-format");
119 xgettext_record_flag ("GettextResource.ngettext:3:pass-java-format");
120 xgettext_record_flag ("GettextResource.pgettext:3:pass-java-format");
121 xgettext_record_flag ("GettextResource.npgettext:3:pass-java-format");
122 xgettext_record_flag ("GettextResource.npgettext:4:pass-java-format");
123 xgettext_record_flag ("gettext:1:pass-java-format");
124 xgettext_record_flag ("ngettext:1:pass-java-format");
125 xgettext_record_flag ("ngettext:2:pass-java-format");
126 xgettext_record_flag ("pgettext:2:pass-java-format");
127 xgettext_record_flag ("npgettext:2:pass-java-format");
128 xgettext_record_flag ("npgettext:3:pass-java-format");
129 xgettext_record_flag ("getString:1:pass-java-format");
130 xgettext_record_flag ("MessageFormat:1:java-format");
131 xgettext_record_flag ("MessageFormat.format:1:java-format");
135 /* ======================== Reading of characters. ======================== */
137 /* Real filename, used in error messages about the input file. */
138 static const char *real_file_name;
140 /* Logical filename and line number, used to label the extracted messages. */
141 static char *logical_file_name;
142 static int line_number;
144 /* The input file stream. */
148 /* Fetch the next single-byte character from the input file.
149 Pushback can consist of an unlimited number of 'u' followed by up to 4
152 /* Special coding of multiple 'u's in the pushback buffer. */
153 #define MULTIPLE_U(count) (0x1000 + (count))
155 static int phase1_pushback[5];
156 static unsigned int phase1_pushback_length;
163 if (phase1_pushback_length)
165 c = phase1_pushback[--phase1_pushback_length];
166 if (c >= MULTIPLE_U (0))
168 if (c > MULTIPLE_U (1))
169 phase1_pushback[phase1_pushback_length++] = c - 1;
181 error (EXIT_FAILURE, errno, _("\
182 error while reading \"%s\""), real_file_name);
188 /* Supports any number of 'u' and up to 4 arbitrary characters of pushback. */
190 phase1_ungetc (int c)
196 if (phase1_pushback_length > 0
197 && phase1_pushback[phase1_pushback_length - 1] >= MULTIPLE_U (0))
198 phase1_pushback[phase1_pushback_length - 1]++;
201 if (phase1_pushback_length == SIZEOF (phase1_pushback))
203 phase1_pushback[phase1_pushback_length++] = MULTIPLE_U (1);
208 if (phase1_pushback_length == SIZEOF (phase1_pushback))
210 phase1_pushback[phase1_pushback_length++] = c;
216 /* Fetch the next single-byte character or Unicode character from the file.
217 (Here, as in the Java Language Specification, when we say "Unicode
218 character", we actually mean "UTF-16 encoding unit".) */
220 /* Return value of phase 2, 3, 4 when EOF is reached. */
221 #define P2_EOF 0xffff
223 /* Convert a UTF-16 code unit to a return value that can be distinguished
224 from a single-byte return value. */
225 #define UNICODE(code) (0x10000 + (code))
227 /* Test whether a return value of phase 2, 3, 4 designates a UTF-16 code
229 #define IS_UNICODE(p2_result) ((p2_result) >= 0x10000)
231 /* Extract the UTF-16 code of a return value that satisfies IS_UNICODE. */
232 #define UTF16_VALUE(p2_result) ((p2_result) - 0x10000)
234 /* Reduces a return value of phase 2, 3, 4 by unmasking the UNICODE bit,
235 so that it can be more easily compared against an ASCII character.
236 (RED (c) == 'x') is equivalent to (c == 'x' || c == UNICODE ('x')). */
237 #define RED(p2_result) ((p2_result) & 0xffff)
239 static int phase2_pushback[1];
240 static int phase2_pushback_length;
247 if (phase2_pushback_length)
248 return phase2_pushback[--phase2_pushback_length];
258 unsigned int u_count = 1;
259 unsigned char buf[4];
273 for (i = 0; i < 4; i++)
277 if (c >= '0' && c <= '9')
278 n = (n << 4) + (c - '0');
279 else if (c >= 'A' && c <= 'F')
280 n = (n << 4) + (c - 'A' + 10);
281 else if (c >= 'a' && c <= 'f')
282 n = (n << 4) + (c - 'a' + 10);
287 phase1_ungetc (buf[i]);
288 for (; u_count > 0; u_count--)
303 /* Supports only one pushback character. */
305 phase2_ungetc (int c)
309 if (phase2_pushback_length == SIZEOF (phase2_pushback))
311 phase2_pushback[phase2_pushback_length++] = c;
316 /* Fetch the next single-byte character or Unicode character from the file.
317 With line number handling.
318 Convert line terminators to '\n' or UNICODE ('\n'). */
320 static int phase3_pushback[2];
321 static int phase3_pushback_length;
328 if (phase3_pushback_length)
330 c = phase3_pushback[--phase3_pushback_length];
338 /* Handle line terminators. */
341 int c1 = phase2_getc ();
343 if (RED (c1) != '\n')
346 /* Seen line terminator CR or CR/LF. */
347 if (c == '\r' || c1 == '\n')
353 return UNICODE ('\n');
355 else if (RED (c) == '\n')
357 /* Seen line terminator LF. */
364 return UNICODE ('\n');
370 /* Supports 2 characters of pushback. */
372 phase3_ungetc (int c)
378 if (phase3_pushback_length == SIZEOF (phase3_pushback))
380 phase3_pushback[phase3_pushback_length++] = c;
385 /* ========================= Accumulating strings. ======================== */
387 /* A string buffer type that allows appending bytes (in the
388 xgettext_current_source_encoding) or Unicode characters.
389 Returns the entire string in UTF-8 encoding. */
393 /* The part of the string that has already been converted to UTF-8. */
396 size_t utf8_allocated;
397 /* The first half (high surrogate) of a UTF-16 surrogate pair. */
398 unsigned short utf16_surr;
399 /* The part of the string that is still in the source encoding. */
402 size_t curr_allocated;
403 /* The lexical context. Used only for error message purposes. */
404 lexical_context_ty lcontext;
407 /* Initialize a 'struct string_buffer' to empty. */
409 init_string_buffer (struct string_buffer *bp, lexical_context_ty lcontext)
411 bp->utf8_buffer = NULL;
413 bp->utf8_allocated = 0;
415 bp->curr_buffer = NULL;
417 bp->curr_allocated = 0;
418 bp->lcontext = lcontext;
421 /* Auxiliary function: Append a byte to bp->curr. */
423 string_buffer_append_byte (struct string_buffer *bp, unsigned char c)
425 if (bp->curr_buflen == bp->curr_allocated)
427 bp->curr_allocated = 2 * bp->curr_allocated + 10;
428 bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
430 bp->curr_buffer[bp->curr_buflen++] = c;
433 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
435 string_buffer_append_unicode_grow (struct string_buffer *bp, size_t count)
437 if (bp->utf8_buflen + count > bp->utf8_allocated)
439 size_t new_allocated = 2 * bp->utf8_allocated + 10;
440 if (new_allocated < bp->utf8_buflen + count)
441 new_allocated = bp->utf8_buflen + count;
442 bp->utf8_allocated = new_allocated;
443 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
447 /* Auxiliary function: Append a Unicode character to bp->utf8.
448 uc must be < 0x110000. */
450 string_buffer_append_unicode (struct string_buffer *bp, ucs4_t uc)
452 unsigned char utf8buf[6];
453 int count = u8_uctomb (utf8buf, uc, 6);
456 /* The caller should have ensured that uc is not out-of-range. */
459 string_buffer_append_unicode_grow (bp, count);
460 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
461 bp->utf8_buflen += count;
464 /* Auxiliary function: Handle the attempt to append a lone surrogate to
467 string_buffer_append_lone_surrogate (struct string_buffer *bp, unsigned int uc)
469 /* A half surrogate is invalid, therefore use U+FFFD instead.
470 It appears to be valid Java: The Java Language Specification,
471 3rd ed., says "The Java programming language represents text
472 in sequences of 16-bit code units, using the UTF-16 encoding."
473 but does not impose constraints on the use of \uxxxx escape
474 sequences for surrogates. And the JDK's javac happily groks
476 But a half surrogate is invalid in UTF-8:
478 "The definition of UTF-8 prohibits encoding character
479 numbers between U+D800 and U+DFFF".
480 - Unicode 4.0 chapter 3
481 <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
482 section 3.9, p.77, says
483 "Because surrogate code points are not Unicode scalar
484 values, any UTF-8 byte sequence that would otherwise
485 map to code points D800..DFFF is ill-formed."
486 and in table 3-6, p. 78, does not mention D800..DFFF.
487 - The unicode.org FAQ question "How do I convert an unpaired
488 UTF-16 surrogate to UTF-8?" has the answer
489 "By representing such an unpaired surrogate on its own
490 as a 3-byte sequence, the resulting UTF-8 data stream
491 would become ill-formed."
492 So use U+FFFD instead. */
493 error_with_progname = false;
494 error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
495 logical_file_name, line_number, uc);
496 error_with_progname = true;
497 string_buffer_append_unicode (bp, 0xfffd);
500 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */
502 string_buffer_flush_utf16_surr (struct string_buffer *bp)
504 if (bp->utf16_surr != 0)
506 string_buffer_append_lone_surrogate (bp, bp->utf16_surr);
511 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */
513 string_buffer_flush_curr_buffer (struct string_buffer *bp, int lineno)
515 if (bp->curr_buflen > 0)
520 string_buffer_append_byte (bp, '\0');
522 /* Convert from the source encoding to UTF-8. */
523 curr = from_current_source_encoding (bp->curr_buffer, bp->lcontext,
524 logical_file_name, lineno);
526 /* Append it to bp->utf8_buffer. */
527 count = strlen (curr);
528 string_buffer_append_unicode_grow (bp, count);
529 memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
530 bp->utf8_buflen += count;
532 if (curr != bp->curr_buffer)
538 /* Append a character or Unicode character to a 'struct string_buffer'. */
540 string_buffer_append (struct string_buffer *bp, int c)
544 /* Append a Unicode character. */
546 /* Switch from multibyte character mode to Unicode character mode. */
547 string_buffer_flush_curr_buffer (bp, line_number);
549 /* Test whether this character and the previous one form a Unicode
550 surrogate character pair. */
551 if (bp->utf16_surr != 0
552 && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
554 unsigned short utf16buf[2];
557 utf16buf[0] = bp->utf16_surr;
558 utf16buf[1] = UTF16_VALUE (c);
559 if (u16_mbtouc (&uc, utf16buf, 2) != 2)
562 string_buffer_append_unicode (bp, uc);
567 string_buffer_flush_utf16_surr (bp);
569 if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
570 bp->utf16_surr = UTF16_VALUE (c);
571 else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
572 string_buffer_append_lone_surrogate (bp, UTF16_VALUE (c));
574 string_buffer_append_unicode (bp, UTF16_VALUE (c));
579 /* Append a single byte. */
581 /* Switch from Unicode character mode to multibyte character mode. */
582 string_buffer_flush_utf16_surr (bp);
584 /* When a newline is seen, convert the accumulated multibyte sequence.
585 This ensures a correct line number in the error message in case of
586 a conversion error. The "- 1" is to account for the newline. */
588 string_buffer_flush_curr_buffer (bp, line_number - 1);
590 string_buffer_append_byte (bp, (unsigned char) c);
594 /* Return the string buffer's contents. */
596 string_buffer_result (struct string_buffer *bp)
598 /* Flush all into bp->utf8_buffer. */
599 string_buffer_flush_utf16_surr (bp);
600 string_buffer_flush_curr_buffer (bp, line_number);
601 /* NUL-terminate it. */
602 string_buffer_append_unicode_grow (bp, 1);
603 bp->utf8_buffer[bp->utf8_buflen] = '\0';
605 return bp->utf8_buffer;
608 /* Free the memory pointed to by a 'struct string_buffer'. */
610 free_string_buffer (struct string_buffer *bp)
612 free (bp->utf8_buffer);
613 free (bp->curr_buffer);
617 /* ======================== Accumulating comments. ======================== */
620 /* Accumulating a single comment line. */
622 static struct string_buffer comment_buffer;
627 comment_buffer.utf8_buflen = 0;
628 comment_buffer.utf16_surr = 0;
629 comment_buffer.curr_buflen = 0;
630 comment_buffer.lcontext = lc_comment;
636 return (comment_buffer.utf8_buflen == 0 && comment_buffer.utf16_surr == 0
637 && comment_buffer.curr_buflen == 0);
643 string_buffer_append (&comment_buffer, c);
647 comment_line_end (size_t chars_to_remove)
649 char *buffer = string_buffer_result (&comment_buffer);
650 size_t buflen = strlen (buffer);
652 buflen -= chars_to_remove;
654 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
656 buffer[buflen] = '\0';
657 savable_comment_add (buffer);
661 /* These are for tracking whether comments count as immediately before
663 static int last_comment_line;
664 static int last_non_comment_line;
667 /* Replace each comment that is not inside a character constant or string
668 literal with a space or newline character. */
688 /* C style comment. */
690 last_was_star = false;
696 /* We skip all leading white space, but not EOLs. */
697 if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
702 comment_line_end (1);
704 last_was_star = false;
708 last_was_star = true;
714 comment_line_end (2);
720 last_was_star = false;
725 last_comment_line = line_number;
729 /* C++ style comment. */
730 last_comment_line = line_number;
735 if (RED (c) == '\n' || c == P2_EOF)
737 /* We skip all leading white space, but not EOLs. */
738 if (!(comment_at_start () && (RED (c) == ' ' || RED (c) == '\t')))
741 phase3_ungetc (c); /* push back the newline, to decrement line_number */
742 comment_line_end (0);
743 phase3_getc (); /* read the newline again */
748 /* Supports only one pushback character. */
750 phase4_ungetc (int c)
756 /* ========================== Reading of tokens. ========================== */
761 token_type_lparen, /* ( */
762 token_type_rparen, /* ) */
763 token_type_lbrace, /* { */
764 token_type_rbrace, /* } */
765 token_type_comma, /* , */
766 token_type_dot, /* . */
767 token_type_string_literal, /* "abc" */
768 token_type_number, /* 1.23 */
769 token_type_symbol, /* identifier, keyword, null */
770 token_type_plus, /* + */
771 token_type_other /* character literal, misc. operator */
773 typedef enum token_type_ty token_type_ty;
775 typedef struct token_ty token_ty;
779 char *string; /* for token_type_string_literal, token_type_symbol */
780 refcounted_string_list_ty *comment; /* for token_type_string_literal */
785 /* Free the memory pointed to by a 'struct token_ty'. */
787 free_token (token_ty *tp)
789 if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
791 if (tp->type == token_type_string_literal)
792 drop_reference (tp->comment);
796 /* Read an escape sequence inside a string literal or character literal. */
802 /* Use phase 3, because phase 4 elides comments. */
805 return UNICODE ('\\');
809 return UNICODE (0x08);
811 return UNICODE (0x09);
813 return UNICODE (0x0a);
815 return UNICODE (0x0c);
817 return UNICODE (0x0d);
819 return UNICODE ('"');
821 return UNICODE ('\'');
823 return UNICODE ('\\');
824 case '0': case '1': case '2': case '3':
825 case '4': case '5': case '6': case '7':
827 int n = RED (c) - '0';
828 bool maybe3digits = (n < 4);
831 if (RED (c) >= '0' && RED (c) <= '7')
833 n = (n << 3) + (RED (c) - '0');
837 if (RED (c) >= '0' && RED (c) <= '7')
838 n = (n << 3) + (RED (c) - '0');
849 /* Invalid escape sequence. */
851 return UNICODE ('\\');
855 /* Read a string literal or character literal. */
857 accumulate_escaped (struct string_buffer *literal, int delimiter)
863 /* Use phase 3, because phase 4 elides comments. */
865 if (c == P2_EOF || RED (c) == delimiter)
870 error_with_progname = false;
871 if (delimiter == '\'')
872 error (0, 0, _("%s:%d: warning: unterminated character constant"),
873 logical_file_name, line_number);
875 error (0, 0, _("%s:%d: warning: unterminated string constant"),
876 logical_file_name, line_number);
877 error_with_progname = true;
881 c = do_getc_escaped ();
882 string_buffer_append (literal, c);
887 /* Combine characters into tokens. Discard whitespace. */
889 static token_ty phase5_pushback[3];
890 static int phase5_pushback_length;
893 phase5_get (token_ty *tp)
897 if (phase5_pushback_length)
899 *tp = phase5_pushback[--phase5_pushback_length];
906 tp->line_number = line_number;
911 tp->type = token_type_eof;
918 if (last_non_comment_line > last_comment_line)
919 savable_comment_reset ();
924 /* Ignore whitespace and comments. */
928 last_non_comment_line = tp->line_number;
933 tp->type = token_type_lparen;
937 tp->type = token_type_rparen;
941 tp->type = token_type_lbrace;
945 tp->type = token_type_rbrace;
949 tp->type = token_type_comma;
954 if (!(RED (c) >= '0' && RED (c) <= '9'))
957 tp->type = token_type_dot;
962 case '0': case '1': case '2': case '3': case '4':
963 case '5': case '6': case '7': case '8': case '9':
965 /* Don't need to verify the complicated syntax of integers and
966 floating-point numbers. We assume a valid Java input.
967 The simplified syntax that we recognize as number is: any
968 sequence of alphanumeric characters, additionally '+' and '-'
969 immediately after 'e' or 'E' except in hexadecimal numbers. */
970 bool hexadecimal = false;
975 if (RED (c) >= '0' && RED (c) <= '9')
977 if ((RED (c) >= 'A' && RED (c) <= 'Z')
978 || (RED (c) >= 'a' && RED (c) <= 'z'))
980 if (RED (c) == 'X' || RED (c) == 'x')
982 if ((RED (c) == 'E' || RED (c) == 'e') && !hexadecimal)
985 if (!(RED (c) == '+' || RED (c) == '-'))
995 tp->type = token_type_number;
999 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1000 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
1001 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1002 case 'V': case 'W': case 'X': case 'Y': case 'Z':
1004 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1005 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1006 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1007 case 'v': case 'w': case 'x': case 'y': case 'z':
1008 /* Although Java allows identifiers containing many Unicode
1009 characters, we recognize only identifiers consisting of ASCII
1010 characters. This avoids conversion hassles w.r.t. the --keyword
1011 arguments, and shouldn't be a big problem in practice. */
1013 static char *buffer;
1018 if (bufpos >= bufmax)
1020 bufmax = 2 * bufmax + 10;
1021 buffer = xrealloc (buffer, bufmax);
1023 buffer[bufpos++] = RED (c);
1025 if (!((RED (c) >= 'A' && RED (c) <= 'Z')
1026 || (RED (c) >= 'a' && RED (c) <= 'z')
1027 || (RED (c) >= '0' && RED (c) <= '9')
1032 if (bufpos >= bufmax)
1034 bufmax = 2 * bufmax + 10;
1035 buffer = xrealloc (buffer, bufmax);
1037 buffer[bufpos] = '\0';
1038 tp->string = xstrdup (buffer);
1039 tp->type = token_type_symbol;
1044 /* String literal. */
1046 struct string_buffer literal;
1048 init_string_buffer (&literal, lc_string);
1049 accumulate_escaped (&literal, '"');
1050 tp->string = xstrdup (string_buffer_result (&literal));
1051 free_string_buffer (&literal);
1052 tp->comment = add_reference (savable_comment);
1053 tp->type = token_type_string_literal;
1058 /* Character literal. */
1060 struct string_buffer literal;
1062 init_string_buffer (&literal, lc_outside);
1063 accumulate_escaped (&literal, '\'');
1064 free_string_buffer (&literal);
1065 tp->type = token_type_other;
1073 tp->type = token_type_other;
1074 else if (RED (c) == '=')
1076 tp->type = token_type_other;
1081 tp->type = token_type_plus;
1086 /* Misc. operator. */
1087 tp->type = token_type_other;
1093 /* Supports 3 tokens of pushback. */
1095 phase5_unget (token_ty *tp)
1097 if (tp->type != token_type_eof)
1099 if (phase5_pushback_length == SIZEOF (phase5_pushback))
1101 phase5_pushback[phase5_pushback_length++] = *tp;
1106 /* Compile-time optimization of string literal concatenation.
1107 Combine "string1" + ... + "stringN" to the concatenated string if
1108 - the token before this expression is not ')' (because then the first
1109 string could be part of a cast expression),
1110 - the token after this expression is not '.' (because then the last
1111 string could be part of a method call expression). */
1113 static token_ty phase6_pushback[2];
1114 static int phase6_pushback_length;
1116 static token_type_ty phase6_last;
1119 phase6_get (token_ty *tp)
1121 if (phase6_pushback_length)
1123 *tp = phase6_pushback[--phase6_pushback_length];
1128 if (tp->type == token_type_string_literal && phase6_last != token_type_rparen)
1130 char *sum = tp->string;
1131 size_t sum_len = strlen (sum);
1137 phase5_get (&token2);
1138 if (token2.type == token_type_plus)
1142 phase5_get (&token3);
1143 if (token3.type == token_type_string_literal)
1145 token_ty token_after;
1147 phase5_get (&token_after);
1148 if (token_after.type != token_type_dot)
1150 char *addend = token3.string;
1151 size_t addend_len = strlen (addend);
1153 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1154 memcpy (sum + sum_len, addend, addend_len + 1);
1155 sum_len += addend_len;
1157 phase5_unget (&token_after);
1158 free_token (&token3);
1159 free_token (&token2);
1162 phase5_unget (&token_after);
1164 phase5_unget (&token3);
1166 phase5_unget (&token2);
1171 phase6_last = tp->type;
1174 /* Supports 2 tokens of pushback. */
1176 phase6_unget (token_ty *tp)
1178 if (tp->type != token_type_eof)
1180 if (phase6_pushback_length == SIZEOF (phase6_pushback))
1182 phase6_pushback[phase6_pushback_length++] = *tp;
1188 x_java_lex (token_ty *tp)
1193 /* Supports 2 tokens of pushback. */
1195 x_java_unlex (token_ty *tp)
1201 /* ========================= Extracting strings. ========================== */
1204 /* Context lookup table. */
1205 static flag_context_list_table_ty *flag_context_list_table;
1208 /* The file is broken into tokens. Scan the token stream, looking for
1209 a keyword, followed by a left paren, followed by a string. When we
1210 see this sequence, we have something to remember. We assume we are
1211 looking at a valid Java program, and leave the complaints about
1212 the grammar to the compiler.
1214 Normal handling: Look for
1215 keyword ( ... msgid ... )
1216 Plural handling: Look for
1217 keyword ( ... msgid ... msgid_plural ... )
1219 We use recursion because the arguments before msgid or between msgid
1220 and msgid_plural can contain subexpressions of the same form. */
1223 /* Extract messages until the next balanced closing parenthesis or brace,
1224 depending on TERMINATOR.
1225 Extracted messages are added to MLP.
1226 Return true upon eof, false upon closing parenthesis or brace. */
1228 extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1229 flag_context_ty outer_context,
1230 flag_context_list_iterator_ty context_iter,
1231 struct arglist_parser *argparser)
1233 /* Current argument number. */
1235 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1237 /* Parameters of the keyword just seen. Defined only in state 1. */
1238 const struct callshapes *next_shapes = NULL;
1239 /* Context iterator that will be used if the next token is a '('. */
1240 flag_context_list_iterator_ty next_context_iter =
1241 passthrough_context_list_iterator;
1242 /* Current context. */
1243 flag_context_ty inner_context =
1244 inherited_context (outer_context,
1245 flag_context_list_iterator_advance (&context_iter));
1247 /* Start state is 0. */
1254 x_java_lex (&token);
1257 case token_type_symbol:
1259 /* Combine symbol1 . ... . symbolN into a single string, so that
1260 we can recognize static function calls like
1261 GettextResource.gettext. The information present for
1262 symbolI.....symbolN has precedence over the information for
1263 symbolJ.....symbolN with J > I. */
1264 char *sum = token.string;
1265 size_t sum_len = strlen (sum);
1266 const char *dottedname;
1267 flag_context_list_ty *context_list;
1273 x_java_lex (&token2);
1274 if (token2.type == token_type_dot)
1278 x_java_lex (&token3);
1279 if (token3.type == token_type_symbol)
1281 char *addend = token3.string;
1282 size_t addend_len = strlen (addend);
1285 (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1287 memcpy (sum + sum_len + 1, addend, addend_len + 1);
1288 sum_len += 1 + addend_len;
1290 free_token (&token3);
1291 free_token (&token2);
1294 x_java_unlex (&token3);
1296 x_java_unlex (&token2);
1300 for (dottedname = sum;;)
1302 void *keyword_value;
1304 if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
1308 next_shapes = (const struct callshapes *) keyword_value;
1313 dottedname = strchr (dottedname, '.');
1314 if (dottedname == NULL)
1322 for (dottedname = sum;;)
1325 flag_context_list_table_lookup (
1326 flag_context_list_table,
1327 dottedname, strlen (dottedname));
1328 if (context_list != NULL)
1331 dottedname = strchr (dottedname, '.');
1332 if (dottedname == NULL)
1336 next_context_iter = flag_context_list_iterator (context_list);
1342 case token_type_lparen:
1343 if (extract_parenthesized (mlp, token_type_rparen,
1344 inner_context, next_context_iter,
1345 arglist_parser_alloc (mlp,
1346 state ? next_shapes : NULL)))
1348 xgettext_current_source_encoding = po_charset_utf8;
1349 arglist_parser_done (argparser, arg);
1350 xgettext_current_source_encoding = xgettext_global_source_encoding;
1353 next_context_iter = null_context_list_iterator;
1357 case token_type_rparen:
1358 if (terminator == token_type_rparen)
1360 xgettext_current_source_encoding = po_charset_utf8;
1361 arglist_parser_done (argparser, arg);
1362 xgettext_current_source_encoding = xgettext_global_source_encoding;
1365 if (terminator == token_type_rbrace)
1367 error_with_progname = false;
1369 _("%s:%d: warning: ')' found where '}' was expected"),
1370 logical_file_name, token.line_number);
1371 error_with_progname = true;
1373 next_context_iter = null_context_list_iterator;
1377 case token_type_lbrace:
1378 if (extract_parenthesized (mlp, token_type_rbrace,
1379 null_context, null_context_list_iterator,
1380 arglist_parser_alloc (mlp, NULL)))
1382 xgettext_current_source_encoding = po_charset_utf8;
1383 arglist_parser_done (argparser, arg);
1384 xgettext_current_source_encoding = xgettext_global_source_encoding;
1387 next_context_iter = null_context_list_iterator;
1391 case token_type_rbrace:
1392 if (terminator == token_type_rbrace)
1394 xgettext_current_source_encoding = po_charset_utf8;
1395 arglist_parser_done (argparser, arg);
1396 xgettext_current_source_encoding = xgettext_global_source_encoding;
1399 if (terminator == token_type_rparen)
1401 error_with_progname = false;
1403 _("%s:%d: warning: '}' found where ')' was expected"),
1404 logical_file_name, token.line_number);
1405 error_with_progname = true;
1407 next_context_iter = null_context_list_iterator;
1411 case token_type_comma:
1414 inherited_context (outer_context,
1415 flag_context_list_iterator_advance (
1417 next_context_iter = passthrough_context_list_iterator;
1421 case token_type_string_literal:
1424 pos.file_name = logical_file_name;
1425 pos.line_number = token.line_number;
1427 xgettext_current_source_encoding = po_charset_utf8;
1429 remember_a_message (mlp, NULL, token.string, inner_context,
1430 &pos, NULL, token.comment);
1432 arglist_parser_remember (argparser, arg, token.string,
1434 pos.file_name, pos.line_number,
1436 xgettext_current_source_encoding = xgettext_global_source_encoding;
1438 drop_reference (token.comment);
1439 next_context_iter = null_context_list_iterator;
1443 case token_type_eof:
1444 xgettext_current_source_encoding = po_charset_utf8;
1445 arglist_parser_done (argparser, arg);
1446 xgettext_current_source_encoding = xgettext_global_source_encoding;
1449 case token_type_dot:
1450 case token_type_number:
1451 case token_type_plus:
1452 case token_type_other:
1453 next_context_iter = null_context_list_iterator;
1465 extract_java (FILE *f,
1466 const char *real_filename, const char *logical_filename,
1467 flag_context_list_table_ty *flag_table,
1468 msgdomain_list_ty *mdlp)
1470 message_list_ty *mlp = mdlp->item[0]->messages;
1473 real_file_name = real_filename;
1474 logical_file_name = xstrdup (logical_filename);
1477 last_comment_line = -1;
1478 last_non_comment_line = -1;
1480 phase6_last = token_type_eof;
1482 flag_context_list_table = flag_table;
1486 /* Eat tokens until eof is seen. When extract_parenthesized returns
1487 due to an unbalanced closing parenthesis, just restart it. */
1488 while (!extract_parenthesized (mlp, token_type_eof,
1489 null_context, null_context_list_iterator,
1490 arglist_parser_alloc (mlp, NULL)))
1494 real_file_name = NULL;
1495 logical_file_name = NULL;