1 /* xgettext Vala backend.
2 Copyright (C) 2013, 2015 Free Software Foundation, Inc.
4 This file was written by Daiki Ueno <ueno@gnu.org>, 2013.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
36 #include "error-progname.h"
38 #include "xvasprintf.h"
40 #include "po-charset.h"
43 #define _(s) gettext(s)
45 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
47 /* The Vala syntax is defined in the Vala Reference Manual
48 http://www.vala-project.org/doc/vala/.
49 See also vala/valascanner.vala. */
51 /* ====================== Keyword set customization. ====================== */
53 /* If true extract all strings. */
54 static bool extract_all = false;
56 static hash_table keywords;
57 static bool default_keywords = true;
68 add_keyword (const char *name, hash_table *keywords)
71 default_keywords = false;
75 struct callshape shape;
78 if (keywords->table == NULL)
79 hash_init (keywords, 100);
81 split_keywordspec (name, &end, &shape);
83 /* The characters between name and end should form a valid C identifier.
84 A colon means an invalid parse in split_keywordspec(). */
85 colon = strchr (name, ':');
86 if (colon == NULL || colon >= end)
87 insert_keyword_callshape (keywords, name, end - name, &shape);
92 x_vala_keyword (const char *name)
94 add_keyword (name, &keywords);
100 if (default_keywords)
102 /* When adding new keywords here, also update the documentation in
104 x_vala_keyword ("dgettext:2");
105 x_vala_keyword ("dcgettext:2");
106 x_vala_keyword ("ngettext:1,2");
107 x_vala_keyword ("dngettext:2,3");
108 x_vala_keyword ("dpgettext:2g");
109 x_vala_keyword ("dpgettext2:2c,3");
110 x_vala_keyword ("_");
111 x_vala_keyword ("Q_");
112 x_vala_keyword ("N_");
113 x_vala_keyword ("NC_:1c,2");
115 default_keywords = false;
120 init_flag_table_vala ()
122 xgettext_record_flag ("dgettext:2:pass-c-format");
123 xgettext_record_flag ("dcgettext:2:pass-c-format");
124 xgettext_record_flag ("ngettext:1:pass-c-format");
125 xgettext_record_flag ("ngettext:2:pass-c-format");
126 xgettext_record_flag ("dngettext:2:pass-c-format");
127 xgettext_record_flag ("dngettext:3:pass-c-format");
128 xgettext_record_flag ("dpgettext:2:pass-c-format");
129 xgettext_record_flag ("dpgettext2:3:pass-c-format");
130 xgettext_record_flag ("_:1:pass-c-format");
131 xgettext_record_flag ("Q_:1:pass-c-format");
132 xgettext_record_flag ("N_:1:pass-c-format");
133 xgettext_record_flag ("NC_:2:pass-c-format");
135 /* Vala leaves string formatting to GLib functions and thus the
136 format string is exactly the same as in C. See also
137 vapi/glib-2.0.vapi. */
138 xgettext_record_flag ("printf:1:c-format");
139 xgettext_record_flag ("vprintf:1:c-format");
143 /* ======================== Reading of characters. ======================== */
145 /* Real filename, used in error messages about the input file. */
146 static const char *real_file_name;
148 /* Logical filename and line number, used to label the extracted messages. */
149 static char *logical_file_name;
150 static int line_number;
152 /* The input file stream. */
156 /* 1. line_number handling. */
158 #define MAX_PHASE1_PUSHBACK 16
159 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
160 static int phase1_pushback_length;
168 if (phase1_pushback_length)
169 c = phase1_pushback[--phase1_pushback_length];
176 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
188 /* Supports up to MAX_PHASE1_PUSHBACK characters of pushback. */
190 phase1_ungetc (int c)
197 if (phase1_pushback_length == SIZEOF (phase1_pushback))
199 phase1_pushback[phase1_pushback_length++] = c;
204 /* These are for tracking whether comments count as immediately before
206 static int last_comment_line;
207 static int last_non_comment_line;
209 /* Accumulating comments. */
212 static size_t bufmax;
213 static size_t buflen;
224 if (buflen >= bufmax)
226 bufmax = 2 * bufmax + 10;
227 buffer = xrealloc (buffer, bufmax);
229 buffer[buflen++] = c;
233 comment_line_end (size_t chars_to_remove)
235 buflen -= chars_to_remove;
237 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
239 if (chars_to_remove == 0 && buflen >= bufmax)
241 bufmax = 2 * bufmax + 10;
242 buffer = xrealloc (buffer, bufmax);
244 buffer[buflen] = '\0';
245 savable_comment_add (buffer);
249 /* 2. Replace each comment that is not inside a character constant or
250 string literal with a space character. */
271 last_was_star = false;
277 /* We skip all leading white space, but not EOLs. */
278 if (!(buflen == 0 && (c == ' ' || c == '\t')))
283 comment_line_end (1);
285 last_was_star = false;
289 last_was_star = true;
295 comment_line_end (2);
301 last_was_star = false;
306 last_comment_line = line_number;
310 /* C++ or ISO C 99 comment. */
315 if (c == '\n' || c == EOF)
317 /* We skip all leading white space, but not EOLs. */
318 if (!(buflen == 0 && (c == ' ' || c == '\t')))
321 comment_line_end (0);
322 last_comment_line = line_number;
329 phase2_ungetc (int c)
335 /* ========================== Reading of tokens. ========================== */
339 token_type_character_constant, /* 'x' */
341 token_type_lparen, /* ( */
342 token_type_rparen, /* ) */
343 token_type_lbrace, /* { */
344 token_type_rbrace, /* } */
345 token_type_assign, /* = += -= *= /= %= <<= >>= &= |= ^= */
346 token_type_return, /* return */
347 token_type_plus, /* + */
348 token_type_arithmetic_operator, /* - * / % << >> & | ^ */
349 token_type_equality_test_operator, /* == < > >= <= != */
350 token_type_logic_operator, /* ! && || */
351 token_type_comma, /* , */
352 token_type_question, /* ? */
353 token_type_colon, /* : */
354 token_type_number, /* 2.7 */
355 token_type_string_literal, /* "abc" */
356 token_type_string_template, /* @"abc" */
357 token_type_regex_literal, /* /.../ */
358 token_type_symbol, /* if else etc. */
361 typedef enum token_type_ty token_type_ty;
363 typedef struct token_ty token_ty;
367 char *string; /* for token_type_symbol, token_type_string_literal */
368 refcounted_string_list_ty *comment; /* for token_type_string_literal */
369 enum literalstring_escape_type escape;
373 /* Free the memory pointed to by a 'struct token_ty'. */
375 free_token (token_ty *tp)
377 if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
379 if (tp->type == token_type_string_literal)
380 drop_reference (tp->comment);
384 /* 3. Parse each resulting logical line as preprocessing tokens and
385 white space. Preprocessing tokens and Vala tokens don't always
388 static token_ty phase3_pushback[2];
389 static int phase3_pushback_length;
392 static token_type_ty last_token_type = token_type_other;
412 error_with_progname = false;
414 _("%s:%d: warning: regular expression literal terminated too early"),
415 logical_file_name, line_number);
416 error_with_progname = true;
422 if (!(c == 'i' || c == 's' || c == 'm' || c == 'x'))
427 phase3_get (token_ty *tp)
432 int last_was_backslash;
438 if (bufpos >= bufmax) \
440 bufmax = 2 * bufmax + 10; \
441 buffer = xrealloc (buffer, bufmax); \
443 buffer[bufpos++] = c; \
447 if (phase3_pushback_length)
449 *tp = phase3_pushback[--phase3_pushback_length];
450 last_token_type = tp->type;
460 tp->line_number = line_number;
466 tp->type = last_token_type = token_type_eof;
470 if (last_non_comment_line > last_comment_line)
471 savable_comment_reset ();
476 /* Ignore whitespace and comments. */
482 last_non_comment_line = tp->line_number;
488 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
489 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
490 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
491 case 'V': case 'W': case 'X': case 'Y': case 'Z':
493 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
494 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
495 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
496 case 'v': case 'w': case 'x': case 'y': case 'z':
504 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
505 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
506 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
507 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
510 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
511 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
512 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
513 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
515 case '0': case '1': case '2': case '3': case '4':
516 case '5': case '6': case '7': case '8': case '9':
526 if (strcmp (buffer, "return") == 0)
527 tp->type = last_token_type = token_type_return;
530 tp->string = xstrdup (buffer);
531 tp->type = last_token_type = token_type_symbol;
541 tp->string = xstrdup (".");
542 tp->type = last_token_type = token_type_symbol;
545 case '0': case '1': case '2': case '3': case '4':
546 case '5': case '6': case '7': case '8': case '9':
552 case '0': case '1': case '2': case '3': case '4':
553 case '5': case '6': case '7': case '8': case '9':
554 /* The preprocessing number token is more "generous" than the C
555 number tokens. This is mostly due to token pasting (another
556 thing we can ignore here). */
568 if (c != '+' && c != '-')
575 case 'A': case 'B': case 'C': case 'D': case 'F':
576 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
577 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
578 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
580 case 'a': case 'b': case 'c': case 'd': case 'f':
581 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
582 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
583 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
585 case '0': case '1': case '2': case '3': case '4':
586 case '5': case '6': case '7': case '8': case '9':
597 tp->type = last_token_type = token_type_number;
601 last_was_backslash = false;
605 if (last_was_backslash)
607 last_was_backslash = false;
613 last_was_backslash = true;
618 error_with_progname = false;
619 error (0, 0, _("%s:%d: warning: unterminated character constant"),
620 logical_file_name, line_number - 1);
621 error_with_progname = true;
622 phase2_ungetc ('\n');
629 tp->type = last_token_type = token_type_character_constant;
632 /* Vala provides strings in three different formats.
634 Usual string literals:
636 Verbatim string literals:
637 """...""" (where ... can include newlines and double quotes)
641 Note that, with the current implementation string
642 templates are not subject to translation, because they are
643 inspected at compile time. For example, the following code
646 string foo = _(@"foo $bar");
648 will be translated into the C code, like:
650 _(g_strconcat ("foo ", "bar", NULL)); */
656 tp->type = last_token_type = token_type_other;
663 int c2 = phase2_getc ();
667 int c3 = phase2_getc ();
684 /* Use phase 1, because phase 2 elides comments. */
691 int c2 = phase1_getc ();
694 int c3 = phase1_getc ();
706 last_was_backslash = false;
711 if (last_was_backslash)
713 last_was_backslash = false;
721 last_was_backslash = true;
728 error_with_progname = false;
730 %s:%d: warning: unterminated string literal"),
731 logical_file_name, line_number - 1);
732 error_with_progname = true;
733 phase1_ungetc ('\n');
742 tp->type = last_token_type = template
743 ? token_type_string_template : token_type_string_literal;
744 tp->string = xstrdup (buffer);
745 tp->comment = add_reference (savable_comment);
746 tp->escape = verbatim ? 0 : LET_ANSI_C | LET_UNICODE;
751 switch (last_token_type)
753 case token_type_lparen:
754 case token_type_lbrace:
755 case token_type_assign:
756 case token_type_return:
757 case token_type_plus:
758 case token_type_arithmetic_operator:
759 case token_type_equality_test_operator:
760 case token_type_logic_operator:
761 case token_type_comma:
762 case token_type_question:
763 case token_type_colon:
764 phase3_scan_regex ();
765 tp->type = last_token_type = token_type_regex_literal;
769 int c2 = phase2_getc ();
771 tp->type = last_token_type = token_type_assign;
775 tp->type = last_token_type = token_type_arithmetic_operator;
783 tp->type = last_token_type = token_type_lparen;
787 tp->type = last_token_type = token_type_rparen;
791 tp->type = last_token_type = token_type_lbrace;
795 tp->type = last_token_type = token_type_rbrace;
800 int c2 = phase2_getc ();
804 tp->type = last_token_type = token_type_other;
807 tp->type = last_token_type = token_type_assign;
811 tp->type = last_token_type = token_type_plus;
819 int c2 = phase2_getc ();
823 tp->type = last_token_type = token_type_other;
826 tp->type = last_token_type = token_type_assign;
830 tp->type = last_token_type = token_type_arithmetic_operator;
839 int c2 = phase2_getc ();
841 tp->type = last_token_type = token_type_assign;
845 tp->type = last_token_type = token_type_logic_operator;
852 int c2 = phase2_getc ();
856 tp->type = last_token_type = token_type_equality_test_operator;
859 tp->type = last_token_type = token_type_other;
863 tp->type = last_token_type = token_type_assign;
871 int c2 = phase2_getc ();
873 tp->type = last_token_type = token_type_equality_test_operator;
877 tp->type = last_token_type = token_type_logic_operator;
885 int c2 = phase2_getc ();
887 tp->type = last_token_type = token_type_equality_test_operator;
890 int c3 = phase2_getc ();
892 tp->type = last_token_type = token_type_assign;
897 tp->type = last_token_type = token_type_other;
903 tp->type = last_token_type = token_type_equality_test_operator;
909 tp->type = last_token_type = token_type_comma;
913 tp->type = last_token_type = token_type_colon;
919 int c2 = phase2_getc ();
921 tp->type = last_token_type = token_type_logic_operator;
923 tp->type = last_token_type = token_type_assign;
927 tp->type = last_token_type = token_type_arithmetic_operator;
934 int c2 = phase2_getc ();
936 tp->type = last_token_type = token_type_logic_operator;
940 tp->type = last_token_type = token_type_question;
946 tp->type = last_token_type = token_type_other;
954 phase3_unget (token_ty *tp)
956 if (tp->type != token_type_eof)
958 if (phase3_pushback_length == SIZEOF (phase3_pushback))
960 phase3_pushback[phase3_pushback_length++] = *tp;
965 /* String concatenation with '+'. */
968 x_vala_lex (token_ty *tp)
971 if (tp->type == token_type_string_literal)
973 char *sum = tp->string;
974 size_t sum_len = strlen (sum);
980 phase3_get (&token2);
981 if (token2.type == token_type_plus)
985 phase3_get (&token3);
986 if (token3.type == token_type_string_literal)
988 char *addend = token3.string;
989 size_t addend_len = strlen (addend);
991 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
992 memcpy (sum + sum_len, addend, addend_len + 1);
993 sum_len += addend_len;
995 free_token (&token3);
996 free_token (&token2);
999 phase3_unget (&token3);
1001 phase3_unget (&token2);
1009 /* ========================= Extracting strings. ========================== */
1012 /* Context lookup table. */
1013 static flag_context_list_table_ty *flag_context_list_table;
1015 /* Use the same literalstring_parser provided by the C scanner. */
1016 extern struct literalstring_parser literalstring_c;
1018 /* The file is broken into tokens. Scan the token stream, looking for
1019 a keyword, followed by a left paren, followed by a string. When we
1020 see this sequence, we have something to remember. We assume we are
1021 looking at a valid Vala program, and leave the complaints about the
1022 grammar to the compiler.
1024 Normal handling: Look for
1025 keyword ( ... msgid ... )
1027 Plural handling: Look for
1028 keyword ( ... msgid ... msgid_plural ... )
1030 We use recursion because the arguments before msgid or between msgid
1031 and msgid_plural can contain subexpressions of the same form. */
1033 /* Extract messages until the next balanced closing parenthesis.
1034 Extracted messages are added to MLP.
1035 DELIM can be token_type_rparen, or token_type_eof (used at the top
1036 level); a closing parenthesis ends the extraction in either case.
1037 Return true upon eof, false upon closing parenthesis. */
1039 extract_balanced (message_list_ty *mlp, token_type_ty delim,
1040 flag_context_ty outer_context,
1041 flag_context_list_iterator_ty context_iter,
1042 struct arglist_parser *argparser)
1044 /* Current argument number. */
1046 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1048 /* Parameters of the keyword just seen. Defined only in state 1. */
1049 const struct callshapes *next_shapes = NULL;
1050 /* Context iterator that will be used if the next token is a '('. */
1051 flag_context_list_iterator_ty next_context_iter =
1052 passthrough_context_list_iterator;
1053 /* Current context. */
1054 flag_context_ty inner_context =
1055 inherited_context (outer_context,
1056 flag_context_list_iterator_advance (&context_iter));
1058 /* Start state is 0. */
1065 x_vala_lex (&token);
1069 case token_type_symbol:
1071 void *keyword_value;
1073 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1077 next_shapes = (const struct callshapes *) keyword_value;
1084 flag_context_list_iterator (
1085 flag_context_list_table_lookup (
1086 flag_context_list_table,
1087 token.string, strlen (token.string)));
1088 free (token.string);
1091 case token_type_lparen:
1092 if (extract_balanced (mlp, token_type_rparen,
1093 inner_context, next_context_iter,
1094 arglist_parser_alloc (mlp,
1095 state ? next_shapes : NULL)))
1097 arglist_parser_done (argparser, arg);
1100 next_context_iter = null_context_list_iterator;
1104 case token_type_rparen:
1105 if (delim == token_type_rparen || delim == token_type_eof)
1107 arglist_parser_done (argparser, arg);
1111 next_context_iter = null_context_list_iterator;
1115 case token_type_comma:
1118 inherited_context (outer_context,
1119 flag_context_list_iterator_advance (
1121 next_context_iter = passthrough_context_list_iterator;
1125 case token_type_eof:
1126 arglist_parser_done (argparser, arg);
1129 case token_type_string_literal:
1132 pos.file_name = logical_file_name;
1133 pos.line_number = token.line_number;
1138 refcounted_string_list_ty *comment;
1139 const char *encoding;
1141 string = literalstring_c.parse (token.string, &pos,
1143 free (token.string);
1144 token.string = string;
1146 if (token.comment != NULL)
1148 comment = savable_comment_convert_encoding (token.comment,
1150 drop_reference (token.comment);
1151 token.comment = comment;
1154 /* token.string and token.comment are already converted
1155 to UTF-8. Prevent further conversion in
1156 remember_a_message. */
1157 encoding = xgettext_current_source_encoding;
1158 xgettext_current_source_encoding = po_charset_utf8;
1159 remember_a_message (mlp, NULL, token.string, inner_context,
1160 &pos, NULL, token.comment);
1161 xgettext_current_source_encoding = encoding;
1165 /* A string immediately after a symbol means a
1169 struct arglist_parser *tmp_argparser;
1170 tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
1172 arglist_parser_remember_literal (tmp_argparser, 1,
1179 arglist_parser_done (tmp_argparser, 1);
1182 arglist_parser_remember_literal (argparser, arg, token.string,
1183 inner_context, pos.file_name,
1189 drop_reference (token.comment);
1190 next_context_iter = null_context_list_iterator;
1194 case token_type_character_constant:
1195 case token_type_lbrace:
1196 case token_type_rbrace:
1197 case token_type_assign:
1198 case token_type_return:
1199 case token_type_plus:
1200 case token_type_arithmetic_operator:
1201 case token_type_equality_test_operator:
1202 case token_type_logic_operator:
1203 case token_type_question:
1204 case token_type_colon:
1205 case token_type_number:
1206 case token_type_string_template:
1207 case token_type_regex_literal:
1208 case token_type_other:
1209 next_context_iter = null_context_list_iterator;
1220 extract_vala (FILE *f,
1221 const char *real_filename, const char *logical_filename,
1222 flag_context_list_table_ty *flag_table,
1223 msgdomain_list_ty *mdlp)
1225 message_list_ty *mlp = mdlp->item[0]->messages;
1228 real_file_name = real_filename;
1229 logical_file_name = xstrdup (logical_filename);
1232 last_comment_line = -1;
1233 last_non_comment_line = -1;
1235 flag_context_list_table = flag_table;
1239 /* Eat tokens until eof is seen. When extract_balanced returns
1240 due to an unbalanced closing parenthesis, just restart it. */
1241 while (!extract_balanced (mlp, token_type_eof,
1242 null_context, null_context_list_iterator,
1243 arglist_parser_alloc (mlp, NULL)))
1247 real_file_name = NULL;
1248 logical_file_name = NULL;