1 /* xgettext Vala backend.
2 Copyright (C) 2013 Free Software Foundation, Inc.
4 This file was written by Daiki Ueno <ueno@gnu.org>, 2013.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
35 #include "error-progname.h"
37 #include "xvasprintf.h"
41 #define _(s) gettext(s)
43 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
45 /* The Vala syntax is defined in the Vala Reference Manual
46 http://www.vala-project.org/doc/vala/.
47 See also vala/valascanner.vala. */
49 /* ====================== Keyword set customization. ====================== */
51 /* If true extract all strings. */
52 static bool extract_all = false;
54 static hash_table keywords;
55 static bool default_keywords = true;
66 add_keyword (const char *name, hash_table *keywords)
69 default_keywords = false;
73 struct callshape shape;
76 if (keywords->table == NULL)
77 hash_init (keywords, 100);
79 split_keywordspec (name, &end, &shape);
81 /* The characters between name and end should form a valid C identifier.
82 A colon means an invalid parse in split_keywordspec(). */
83 colon = strchr (name, ':');
84 if (colon == NULL || colon >= end)
85 insert_keyword_callshape (keywords, name, end - name, &shape);
90 x_vala_keyword (const char *name)
92 add_keyword (name, &keywords);
100 /* When adding new keywords here, also update the documentation in
102 x_vala_keyword ("dgettext:2");
103 x_vala_keyword ("dcgettext:2");
104 x_vala_keyword ("ngettext:1,2");
105 x_vala_keyword ("dngettext:2,3");
106 x_vala_keyword ("dpgettext:2g");
107 x_vala_keyword ("dpgettext2:2c,3");
108 x_vala_keyword ("_");
109 x_vala_keyword ("Q_");
110 x_vala_keyword ("N_");
111 x_vala_keyword ("NC_:1c,2");
113 default_keywords = false;
118 init_flag_table_vala ()
120 xgettext_record_flag ("dgettext:2:pass-c-format");
121 xgettext_record_flag ("dcgettext:2:pass-c-format");
122 xgettext_record_flag ("ngettext:1:pass-c-format");
123 xgettext_record_flag ("ngettext:2:pass-c-format");
124 xgettext_record_flag ("dngettext:2:pass-c-format");
125 xgettext_record_flag ("dngettext:3:pass-c-format");
126 xgettext_record_flag ("dpgettext:2:pass-c-format");
127 xgettext_record_flag ("dpgettext2:3:pass-c-format");
128 xgettext_record_flag ("_:1:pass-c-format");
129 xgettext_record_flag ("Q_:1:pass-c-format");
130 xgettext_record_flag ("N_:1:pass-c-format");
131 xgettext_record_flag ("NC_:2:pass-c-format");
133 /* Vala leaves string formatting to GLib functions, and thus the
134 format string syntax is exactly the same as in C. See also
135 vapi/glib-2.0.vapi. */
136 xgettext_record_flag ("printf:1:c-format");
137 xgettext_record_flag ("vprintf:1:c-format");
141 /* ======================== Reading of characters. ======================== */
143 /* Real filename, used in error messages about the input file. */
144 static const char *real_file_name;
146 /* Logical filename and line number, used to label the extracted messages. */
147 static char *logical_file_name;
148 static int line_number;
150 /* The input file stream. */
154 /* 1. line_number handling. */
156 #define MAX_PHASE1_PUSHBACK 16
157 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
158 static int phase1_pushback_length;
166 if (phase1_pushback_length)
167 c = phase1_pushback[--phase1_pushback_length];
174 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
186 /* Supports MAX_PHASE1_PUSHBACK characters of pushback. */
188 phase1_ungetc (int c)
195 if (phase1_pushback_length == SIZEOF (phase1_pushback))
197 phase1_pushback[phase1_pushback_length++] = c;
202 /* These are for tracking whether comments count as immediately before
204 static int last_comment_line;
205 static int last_non_comment_line;
207 /* Accumulating comments. */
210 static size_t bufmax;
211 static size_t buflen;
222 if (buflen >= bufmax)
224 bufmax = 2 * bufmax + 10;
225 buffer = xrealloc (buffer, bufmax);
227 buffer[buflen++] = c;
231 comment_line_end (size_t chars_to_remove)
233 buflen -= chars_to_remove;
235 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
237 if (chars_to_remove == 0 && buflen >= bufmax)
239 bufmax = 2 * bufmax + 10;
240 buffer = xrealloc (buffer, bufmax);
242 buffer[buflen] = '\0';
243 savable_comment_add (buffer);
247 /* 2. Replace each comment that is not inside a character constant or
248 string literal with a space character. */
269 last_was_star = false;
275 /* We skip all leading white space, but not EOLs. */
276 if (!(buflen == 0 && (c == ' ' || c == '\t')))
281 comment_line_end (1);
283 last_was_star = false;
287 last_was_star = true;
293 comment_line_end (2);
299 last_was_star = false;
304 last_comment_line = line_number;
308 /* C++ or ISO C 99 comment. */
313 if (c == '\n' || c == EOF)
315 /* We skip all leading white space, but not EOLs. */
316 if (!(buflen == 0 && (c == ' ' || c == '\t')))
319 comment_line_end (0);
320 last_comment_line = line_number;
327 phase2_ungetc (int c)
333 /* ========================== Reading of tokens. ========================== */
337 token_type_character_constant, /* 'x' */
339 token_type_lparen, /* ( */
340 token_type_rparen, /* ) */
341 token_type_lbrace, /* { */
342 token_type_rbrace, /* } */
343 token_type_assign, /* = */
344 token_type_return, /* return */
345 token_type_plus, /* + */
346 token_type_minus, /* - */
347 token_type_equality_test_operator, /* == < > >= <= != */
348 token_type_logic_operator, /* ! && || */
349 token_type_comma, /* , */
350 token_type_colon, /* : */
351 token_type_number, /* 2.7 */
352 token_type_string_literal, /* "abc" */
353 token_type_string_template, /* @"abc" */
354 token_type_regex_literal, /* /.../ */
355 token_type_symbol, /* if else etc. */
358 typedef enum token_type_ty token_type_ty;
360 typedef struct token_ty token_ty;
364 char *string; /* for token_type_symbol, token_type_string_literal */
365 refcounted_string_list_ty *comment; /* for token_type_string_literal */
369 /* Free the memory pointed to by a 'struct token_ty'. */
371 free_token (token_ty *tp)
373 if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
375 if (tp->type == token_type_string_literal)
376 drop_reference (tp->comment);
380 /* Replace escape sequences within character strings with their single
381 character equivalents. */
383 #define P7_QUOTES (1000 + '"')
384 #define P7_QUOTE (1000 + '\'')
385 #define P7_NEWLINE (1000 + '\n')
392 /* Use phase 1, because phase 2 elides comments. */
395 /* Return a magic newline indicator, so that we can distinguish
396 between the user requesting a newline in the string (e.g. using
397 "\n" or "\012") and the user failing to terminate the string or
398 character constant. The ANSI C standard says: 3.1.3.4 Character
399 Constants contain "any character except single quote, backslash or
400 newline; or an escape sequence" and 3.1.4 String Literals contain
401 "any character except double quote, backslash or newline; or an
404 Most compilers give a fatal error in this case, however gcc is
405 stupidly silent, even though this is a very common typo. OK, so
406 "gcc --pedantic" will tell me, but that gripes about too much other
407 stuff. Could I have a "gcc -Wnewline-in-string" option, or
408 better yet a "gcc -fno-newline-in-string" option, please? Gcc is
409 also inconsistent between string literals and character constants:
410 you may not embed newlines in character constants; try it, you get
411 a useful diagnostic. --PMiller */
425 /* Unknown escape sequences really should be an error, but just
426 ignore them, and let the real compiler complain. */
441 /* The \e escape is peculiar to gcc, and assumes an ASCII
442 character set (or superset). We don't provide support for it
465 case '0': case '1': case '2': case '3': case '4':
466 case '5': case '6': case '7': case '8': case '9':
467 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
468 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
480 case '0': case '1': case '2': case '3': case '4':
481 case '5': case '6': case '7': case '8': case '9':
482 n = n * 16 + c - '0';
485 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
486 n = n * 16 + 10 + c - 'A';
489 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
490 n = n * 16 + 10 + c - 'a';
497 case '0': case '1': case '2': case '3':
498 case '4': case '5': case '6': case '7':
500 for (j = 0; j < 3; ++j)
509 case '0': case '1': case '2': case '3':
510 case '4': case '5': case '6': case '7':
522 phase7_ungetc (int c)
528 /* 3. Parse each resulting logical line as preprocessing tokens and
529 white space. Preprocessing tokens and Vala tokens don't always
532 static token_ty phase3_pushback[2];
533 static int phase3_pushback_length;
536 static token_type_ty last_token_type = token_type_other;
556 error_with_progname = false;
558 _("%s:%d: warning: regular expression literal terminated too early"),
559 logical_file_name, line_number);
560 error_with_progname = true;
566 if (!(c == 'i' || c == 's' || c == 'm' || c == 'x'))
571 phase3_get (token_ty *tp)
577 if (phase3_pushback_length)
579 *tp = phase3_pushback[--phase3_pushback_length];
580 last_token_type = tp->type;
590 tp->line_number = line_number;
596 tp->type = last_token_type = token_type_eof;
600 if (last_non_comment_line > last_comment_line)
601 savable_comment_reset ();
606 /* Ignore whitespace and comments. */
612 last_non_comment_line = tp->line_number;
618 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
619 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
620 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
621 case 'V': case 'W': case 'X': case 'Y': case 'Z':
623 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
624 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
625 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
626 case 'v': case 'w': case 'x': case 'y': case 'z':
630 if (bufpos >= bufmax)
632 bufmax = 2 * bufmax + 10;
633 buffer = xrealloc (buffer, bufmax);
635 buffer[bufpos++] = c;
639 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
640 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
641 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
642 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
645 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
646 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
647 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
648 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
650 case '0': case '1': case '2': case '3': case '4':
651 case '5': case '6': case '7': case '8': case '9':
660 if (bufpos >= bufmax)
662 bufmax = 2 * bufmax + 10;
663 buffer = xrealloc (buffer, bufmax);
666 if (strcmp (buffer, "return") == 0)
667 tp->type = last_token_type = token_type_return;
670 tp->string = xstrdup (buffer);
671 tp->type = last_token_type = token_type_symbol;
681 tp->string = xstrdup (".");
682 tp->type = last_token_type = token_type_symbol;
685 case '0': case '1': case '2': case '3': case '4':
686 case '5': case '6': case '7': case '8': case '9':
692 case '0': case '1': case '2': case '3': case '4':
693 case '5': case '6': case '7': case '8': case '9':
694 /* The preprocessing number token is more "generous" than the C
695 number tokens. This is mostly due to token pasting (another
696 thing we can ignore here). */
700 if (bufpos >= bufmax)
702 bufmax = 2 * bufmax + 10;
703 buffer = xrealloc (buffer, bufmax);
705 buffer[bufpos++] = c;
711 if (bufpos >= bufmax)
713 bufmax = 2 * bufmax + 10;
714 buffer = xrealloc (buffer, bufmax);
716 buffer[bufpos++] = c;
718 if (c != '+' && c != '-')
725 case 'A': case 'B': case 'C': case 'D': case 'F':
726 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
727 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
728 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
730 case 'a': case 'b': case 'c': case 'd': case 'f':
731 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
732 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
733 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
735 case '0': case '1': case '2': case '3': case '4':
736 case '5': case '6': case '7': case '8': case '9':
746 if (bufpos >= bufmax)
748 bufmax = 2 * bufmax + 10;
749 buffer = xrealloc (buffer, bufmax);
752 tp->type = last_token_type = token_type_number;
761 error_with_progname = false;
762 error (0, 0, _("%s:%d: warning: unterminated character constant"),
763 logical_file_name, line_number - 1);
764 error_with_progname = true;
765 phase7_ungetc ('\n');
768 if (c == EOF || c == P7_QUOTE)
771 tp->type = last_token_type = token_type_character_constant;
774 /* Vala provides strings in three different formats.
776 Usual string literals:
778 Verbatim string literals:
779 """...""" (where ... can include newlines and double quotes)
783 Note that, with the current implementation, string
784 templates are not subject to translation, because they are
785 inspected at compile time. For example, the following code
788 string foo = _(@"foo $bar");
790 will be translated into C code like:
792 _(g_strconcat ("foo ", "bar", NULL)); */
798 tp->type = last_token_type = token_type_other;
805 int c2 = phase2_getc ();
808 int c3 = phase2_getc ();
831 error_with_progname = false;
832 error (0, 0, _("%s:%d: warning: unterminated string literal"),
833 logical_file_name, line_number - 1);
834 error_with_progname = true;
835 phase7_ungetc ('\n');
843 int c2 = phase2_getc ();
846 int c3 = phase2_getc ();
861 if (bufpos >= bufmax)
863 bufmax = 2 * bufmax + 10;
864 buffer = xrealloc (buffer, bufmax);
866 buffer[bufpos++] = c;
868 if (bufpos >= bufmax)
870 bufmax = 2 * bufmax + 10;
871 buffer = xrealloc (buffer, bufmax);
874 tp->type = last_token_type = template ? token_type_string_template : token_type_string_literal;
875 tp->string = xstrdup (buffer);
876 tp->comment = add_reference (savable_comment);
880 switch (last_token_type)
882 case token_type_lparen:
883 case token_type_lbrace:
884 case token_type_assign:
885 case token_type_return:
886 case token_type_plus:
887 case token_type_minus:
888 case token_type_equality_test_operator:
889 case token_type_logic_operator:
890 case token_type_comma:
891 phase3_scan_regex ();
892 tp->type = last_token_type = token_type_regex_literal;
896 int c2 = phase2_getc ();
902 tp->type = last_token_type = token_type_other;
909 tp->type = last_token_type = token_type_lparen;
913 tp->type = last_token_type = token_type_rparen;
917 tp->type = last_token_type = token_type_lbrace;
921 tp->type = last_token_type = token_type_rbrace;
926 int c2 = phase2_getc ();
930 tp->type = last_token_type = token_type_other;
934 tp->type = last_token_type = token_type_plus;
942 int c2 = phase2_getc ();
946 tp->type = last_token_type = token_type_other;
950 tp->type = last_token_type = token_type_minus;
958 int c2 = phase2_getc ();
962 tp->type = last_token_type = token_type_equality_test_operator;
965 tp->type = last_token_type = token_type_other;
969 tp->type = last_token_type = token_type_assign;
977 int c2 = phase2_getc ();
980 tp->type = last_token_type = token_type_equality_test_operator;
984 tp->type = last_token_type = token_type_logic_operator;
991 int c2 = phase2_getc ();
993 tp->type = last_token_type = token_type_equality_test_operator;
996 int c3 = phase2_getc ();
999 tp->type = last_token_type = token_type_other;
1004 tp->type = last_token_type = token_type_equality_test_operator;
1010 tp->type = last_token_type = token_type_comma;
1014 tp->type = last_token_type = token_type_colon;
1020 int c2 = phase2_getc ();
1022 tp->type = last_token_type = token_type_logic_operator;
1024 tp->type = last_token_type = token_type_other;
1028 tp->type = last_token_type = token_type_other;
1035 int c2 = phase2_getc ();
1038 tp->type = last_token_type = token_type_logic_operator;
1042 tp->type = last_token_type = token_type_other;
1047 tp->type = last_token_type = token_type_other;
1054 phase3_unget (token_ty *tp)
1056 if (tp->type != token_type_eof)
1058 if (phase3_pushback_length == SIZEOF (phase3_pushback))
1060 phase3_pushback[phase3_pushback_length++] = *tp;
1065 /* String concatenation with '+'. */
1068 x_vala_lex (token_ty *tp)
1071 if (tp->type == token_type_string_literal)
1073 char *sum = tp->string;
1074 size_t sum_len = strlen (sum);
1080 phase3_get (&token2);
1081 if (token2.type == token_type_plus)
1085 phase3_get (&token3);
1086 if (token3.type == token_type_string_literal)
1088 char *addend = token3.string;
1089 size_t addend_len = strlen (addend);
1091 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1092 memcpy (sum + sum_len, addend, addend_len + 1);
1093 sum_len += addend_len;
1095 free_token (&token3);
1096 free_token (&token2);
1099 phase3_unget (&token3);
1101 phase3_unget (&token2);
1109 /* ========================= Extracting strings. ========================== */
1112 /* Context lookup table. */
1113 static flag_context_list_table_ty *flag_context_list_table;
1116 /* The file is broken into tokens. Scan the token stream, looking for
1117 a keyword, followed by a left paren, followed by a string. When we
1118 see this sequence, we have something to remember. We assume we are
1119 looking at a valid Vala program, and leave the complaints about the
1120 grammar to the compiler.
1122 Normal handling: Look for
1123 keyword ( ... msgid ... )
1125 Plural handling: Look for
1126 keyword ( ... msgid ... msgid_plural ... )
1128 We use recursion because the arguments before msgid or between msgid
1129 and msgid_plural can contain subexpressions of the same form. */
1131 /* Extract messages until the next balanced closing parenthesis.
1132 Extracted messages are added to MLP.
1133 DELIM can be either token_type_rparen, or token_type_eof to also
1134 stop at an unbalanced closing parenthesis.
1135 Return true upon eof, false upon closing parenthesis. */
1137 extract_balanced (message_list_ty *mlp, token_type_ty delim,
1138 flag_context_ty outer_context,
1139 flag_context_list_iterator_ty context_iter,
1140 struct arglist_parser *argparser)
1142 /* Current argument number. */
1144 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1146 /* Parameters of the keyword just seen. Defined only in state 1. */
1147 const struct callshapes *next_shapes = NULL;
1148 /* Context iterator that will be used if the next token is a '('. */
1149 flag_context_list_iterator_ty next_context_iter =
1150 passthrough_context_list_iterator;
1151 /* Current context. */
1152 flag_context_ty inner_context =
1153 inherited_context (outer_context,
1154 flag_context_list_iterator_advance (&context_iter));
1156 /* Start state is 0. */
1163 x_vala_lex (&token);
1167 case token_type_symbol:
1169 void *keyword_value;
1171 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1175 next_shapes = (const struct callshapes *) keyword_value;
1182 flag_context_list_iterator (
1183 flag_context_list_table_lookup (
1184 flag_context_list_table,
1185 token.string, strlen (token.string)));
1186 free (token.string);
1189 case token_type_lparen:
1190 if (extract_balanced (mlp, token_type_rparen,
1191 inner_context, next_context_iter,
1192 arglist_parser_alloc (mlp,
1193 state ? next_shapes : NULL)))
1195 arglist_parser_done (argparser, arg);
1198 next_context_iter = null_context_list_iterator;
1202 case token_type_rparen:
1203 if (delim == token_type_rparen || delim == token_type_eof)
1205 arglist_parser_done (argparser, arg);
1209 next_context_iter = null_context_list_iterator;
1213 case token_type_comma:
1216 inherited_context (outer_context,
1217 flag_context_list_iterator_advance (
1219 next_context_iter = passthrough_context_list_iterator;
1223 case token_type_eof:
1224 arglist_parser_done (argparser, arg);
1227 case token_type_string_literal:
1230 pos.file_name = logical_file_name;
1231 pos.line_number = token.line_number;
1234 remember_a_message (mlp, NULL, token.string, inner_context,
1235 &pos, NULL, token.comment);
1238 /* A string immediately after a symbol means a function call. */
1241 struct arglist_parser *tmp_argparser;
1242 tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
1244 arglist_parser_remember (tmp_argparser, 1, token.string,
1245 inner_context, pos.file_name,
1246 pos.line_number, token.comment);
1247 arglist_parser_done (tmp_argparser, 1);
1250 arglist_parser_remember (argparser, arg, token.string,
1251 inner_context, pos.file_name,
1252 pos.line_number, token.comment);
1255 drop_reference (token.comment);
1256 next_context_iter = null_context_list_iterator;
1260 case token_type_character_constant:
1261 case token_type_lbrace:
1262 case token_type_rbrace:
1263 case token_type_assign:
1264 case token_type_return:
1265 case token_type_plus:
1266 case token_type_minus:
1267 case token_type_equality_test_operator:
1268 case token_type_logic_operator:
1269 case token_type_colon:
1270 case token_type_number:
1271 case token_type_string_template:
1272 case token_type_regex_literal:
1273 case token_type_other:
1274 next_context_iter = null_context_list_iterator;
1285 extract_vala (FILE *f,
1286 const char *real_filename, const char *logical_filename,
1287 flag_context_list_table_ty *flag_table,
1288 msgdomain_list_ty *mdlp)
1290 message_list_ty *mlp = mdlp->item[0]->messages;
1293 real_file_name = real_filename;
1294 logical_file_name = xstrdup (logical_filename);
1297 last_comment_line = -1;
1298 last_non_comment_line = -1;
1300 flag_context_list_table = flag_table;
1304 /* Eat tokens until eof is seen. When extract_balanced returns
1305 due to an unbalanced closing parenthesis, just restart it. */
1306 while (!extract_balanced (mlp, token_type_eof,
1307 null_context, null_context_list_iterator,
1308 arglist_parser_alloc (mlp, NULL)))
1312 real_file_name = NULL;
1313 logical_file_name = NULL;