1 /* xgettext Lua backend.
2 Copyright (C) 2012-2015 Free Software Foundation, Inc.
4 This file was written by Ľubomír Remák <lubomirr@lubomirr.eu>, 2012.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
36 #include "po-charset.h"
38 #define _(s) gettext(s)
40 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
42 /* The Lua syntax is defined in the Lua manual section 9,
44 http://www.lua.org/manual/5.2/manual.html#9 */
46 /* If true, extract all strings. */
47 static bool extract_all = false;
49 /* A hash table for keywords. */
50 static hash_table keywords;
51 static bool default_keywords = true;
53 /* Set extract_all flag (xgettext will extract all strings). */
60 /* Adds a keyword. Copied from other lexers. */
62 x_lua_keyword (const char *name)
65 default_keywords = false;
69 struct callshape shape;
72 if (keywords.table == NULL)
73 hash_init (&keywords, 100);
75 split_keywordspec (name, &end, &shape);
77 /* The characters between name and end should form a valid C identifier.
78 A colon means an invalid parse in split_keywordspec(). */
79 colon = strchr (name, ':');
80 if (colon == NULL || colon >= end)
81 insert_keyword_callshape (&keywords, name, end - name, &shape);
85 /* Finish initializing the keywords hash table.
86 Called after argument processing, before each file is processed. */
92 /* When adding new keywords here, also update the documentation in
95 x_lua_keyword ("gettext.gettext");
96 x_lua_keyword ("gettext.dgettext:2");
97 x_lua_keyword ("gettext.dcgettext:2");
98 x_lua_keyword ("gettext.ngettext:1,2");
99 x_lua_keyword ("gettext.dngettext:2,3");
100 x_lua_keyword ("gettext.dcngettext:2,3");
101 default_keywords = false;
106 init_flag_table_lua ()
108 xgettext_record_flag ("_:1:pass-lua-format");
109 xgettext_record_flag ("gettext.gettext:1:pass-lua-format");
110 xgettext_record_flag ("gettext.dgettext:2:pass-lua-format");
111 xgettext_record_flag ("gettext.dcgettext:2:pass-lua-format");
112 xgettext_record_flag ("gettext.ngettext:1:pass-lua-format");
113 xgettext_record_flag ("gettext.ngettext:2:pass-lua-format");
114 xgettext_record_flag ("gettext.dngettext:2:pass-lua-format");
115 xgettext_record_flag ("gettext.dngettext:3:pass-lua-format");
116 xgettext_record_flag ("gettext.dcngettext:2:pass-lua-format");
117 xgettext_record_flag ("gettext.dcngettext:3:pass-lua-format");
118 xgettext_record_flag ("string.format:1:lua-format");
121 /* ======================== Reading of characters. ======================== */
124 /* Real filename, used in error messages about the input file. */
125 static const char *real_file_name;
127 /* Logical filename and line number, used to label the extracted messages. */
128 static char *logical_file_name;
129 static int line_number;
131 /* The input file stream. */
135 /* 1. line_number handling. */
137 static unsigned char phase1_pushback[2];
138 static int phase1_pushback_length;
140 static int first_character = 1;
147 if (phase1_pushback_length)
148 c = phase1_pushback[--phase1_pushback_length];
157 /* Ignore shebang line. No pushback required in this case. */
160 while (c != '\n' && c != EOF)
173 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
185 /* Supports 2 characters of pushback. */
188 phase1_ungetc (int c)
195 if (phase1_pushback_length == SIZEOF (phase1_pushback))
197 phase1_pushback[phase1_pushback_length++] = c;
202 /* These are for tracking whether comments count as immediately before
204 static int last_comment_line;
205 static int last_non_comment_line;
207 /* Accumulating comments. */
210 static size_t bufmax;
211 static size_t buflen;
222 if (buflen >= bufmax)
224 bufmax = 2 * bufmax + 10;
225 buffer = xrealloc (buffer, bufmax);
227 buffer[buflen++] = c;
231 comment_line_end (size_t chars_to_remove)
233 buflen -= chars_to_remove;
235 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
237 if (chars_to_remove == 0 && buflen >= bufmax)
239 bufmax = 2 * bufmax + 10;
240 buffer = xrealloc (buffer, bufmax);
242 buffer[buflen] = '\0';
243 savable_comment_add (buffer);
246 /* Eats characters until '\n' and adds them to the comment. */
252 int c = phase1_getc ();
253 if (c == '\n' || c == EOF)
255 comment_line_end (0);
259 if (!(buflen == 0 && (c == ' ' || c == '\t')))
278 /* It starts with '--', so it must be either a short or a long
296 bool right_bracket = false;
300 lineno = line_number;
309 /* Ignore leading spaces and tabs. */
310 if (buflen == 0 && (c == ' ' || c == '\t'))
320 right_bracket = true;
325 if (esigns2 == esigns)
327 comment_line_end (2 + esigns);
339 comment_line_end (1);
341 lineno = line_number;
342 /* Intentionally not breaking. */
345 right_bracket = false;
348 last_comment_line = lineno;
353 /* One line (short) comment, starting with '--[=...='. */
354 lineno = last_comment_line;
361 last_comment_line = lineno;
367 /* One line (short) comment. */
368 lineno = line_number;
372 last_comment_line = lineno;
387 /* ========================== Reading of tokens. ========================== */
392 token_type_lparen, /* ( */
393 token_type_rparen, /* ) */
394 token_type_lbracket, /* [ */
395 token_type_rbracket, /* ] */
396 token_type_comma, /* , */
397 token_type_dot, /* . */
398 token_type_doubledot, /* .. */
399 token_type_operator1, /* + - * / % not # - ^ */
400 token_type_operator2, /* < > <= >= ~= == and or */
407 typedef enum token_type_ty token_type_ty;
409 typedef struct token_ty token_ty;
413 char *string; /* for token_type_string_literal, token_type_symbol */
414 refcounted_string_list_ty *comment; /* for token_type_string_literal */
418 /* Free the memory pointed to by a 'struct token_ty'. */
420 free_token (token_ty *tp)
422 if (tp->type == token_type_string || tp->type == token_type_symbol)
424 if (tp->type == token_type_string)
425 drop_reference (tp->comment);
428 /* Our current string. */
429 static int string_buf_length;
430 static int string_buf_alloc;
431 static char *string_buf;
436 string_buf_length = 0;
442 if (string_buf_length >= string_buf_alloc)
444 string_buf_alloc = 2 * string_buf_alloc + 10;
445 string_buf = xrealloc (string_buf, string_buf_alloc);
448 string_buf[string_buf_length++] = c;
454 string_buf[string_buf_length] = '\0';
458 /* We need 3 pushback tokens for string concatenation. */
459 static int phase3_pushback_length;
460 static token_ty phase3_pushback[3];
464 phase3_unget (token_ty *tp)
466 if (tp->type != token_type_eof)
468 if (phase3_pushback_length == SIZEOF (phase3_pushback))
470 phase3_pushback[phase3_pushback_length++] = *tp;
475 phase3_get (token_ty *tp)
481 if (phase3_pushback_length)
483 *tp = phase3_pushback[--phase3_pushback_length];
491 tp->line_number = line_number;
497 tp->type = token_type_eof;
501 if (last_non_comment_line > last_comment_line)
502 savable_comment_reset ();
503 /* Intentionally not breaking. */
516 tp->type = token_type_operator1;
524 tp->type = token_type_operator2;
530 tp->type = token_type_operator2;
537 tp->type = token_type_lparen;
540 tp->type = token_type_rparen;
543 tp->type = token_type_comma;
547 tp->type = token_type_other;
550 /* There are three operators beginning with a dot. '.',
551 '..' and '...'. The most useful for us is the string
552 concatenation operator ('..'). */
560 tp->type = token_type_other;
566 tp->type = token_type_doubledot;
570 else if (c >= '0' && c <= '9')
572 /* It's a number. We aren't interested in the actual
573 numeric value, so ignore the dot and let next
574 iteration eat the number. */
581 tp->type = token_type_dot;
592 /* We need unprocessed characters from phase 1. */
595 /* We got '\', this is probably an escape sequence. */
627 for (i = 0; i < 2; i++)
630 if (c >= '0' && c <= '9')
632 else if (c >= 'a' && c <= 'f')
634 else if (c >= 'A' && c <= 'F')
652 /* Ignore the following whitespace. */
657 while (c == ' ' || c == '\n' || c == '\t' || c == '\r'
658 || c == '\f' || c == '\v');
664 /* Check if it's a '\ddd' sequence. */
665 if (c >= '0' && c <= '9')
670 while (c >= '0' && c <= '9' && i < 3)
678 /* The last read character is either a
679 non-number or another number after our
680 '\ddd' sequence. We need to ungetc it. */
683 /* The sequence number is too big, this
684 causes a lexical error. Ignore it. */
692 else if (c == c_start || c == EOF || c == '\n')
696 tp->string = xstrdup (string_buf);
697 tp->comment = add_reference (savable_comment);
698 tp->type = token_type_string;
709 /* Count the number of equal signs. */
719 /* We did not find what we were looking for, ungetc it. */
723 /* Our current character isn't '[' and we got 0 equal
724 signs, so the first '[' must have been a left
726 tp->type = token_type_lbracket;
730 /* Lexical error, ignore it. */
744 /* Count the number of equal signs. */
752 if (c == ']' && esigns == esigns2)
754 /* We got ']==...==]', where the number of equal
755 signs matches the number of equal signs in
756 the opening bracket. */
758 tp->string = xstrdup (string_buf);
759 tp->comment = add_reference (savable_comment);
760 tp->type = token_type_string;
765 /* Otherwise we got either ']==' garbage or
766 ']==...==]' with a different number of equal
769 Add ']' and equal signs to the string, and
770 ungetc the current character, because the
771 second ']' might be a part of another closing
772 long bracket, e.g. '==]===]'. */
785 tp->string = xstrdup (string_buf);
786 tp->comment = add_reference (savable_comment);
787 tp->type = token_type_string;
797 tp->type = token_type_rbracket;
801 if (c >= '0' && c <= '9')
803 while (c >= '0' && c <= '9')
809 while (c >= '0' && c <= '9')
813 if (c == 'e' || c == 'E')
815 if (c == '+' || c == '-')
817 while (c >= '0' && c <= '9')
823 tp->type = token_type_number;
826 else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
830 while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
831 || c == '_' || (c >= '0' && c <= '9'))
839 if (strcmp (string_buf, "not") == 0)
840 tp->type = token_type_operator1;
841 else if (strcmp (string_buf, "and") == 0)
842 tp->type = token_type_operator2;
843 else if (strcmp (string_buf, "or") == 0)
844 tp->type = token_type_operator2;
847 tp->string = xstrdup (string_buf);
848 tp->type = token_type_symbol;
853 tp->type = token_type_other;
858 /* String and symbol concatenation. */
860 static token_type_ty phase4_last;
862 /* We need 3 pushback tokens for string and symbol concatenation. */
863 static int phase4_pushback_length;
864 static token_ty phase4_pushback[3];
867 phase4_unget (token_ty *tp)
869 if (tp->type != token_type_eof)
871 if (phase4_pushback_length == SIZEOF (phase4_pushback))
873 phase4_pushback[phase4_pushback_length++] = *tp;
878 phase4_get (token_ty *tp)
880 if (phase4_pushback_length)
882 *tp = phase4_pushback[--phase4_pushback_length];
883 phase4_last = tp->type;
888 if (tp->type == token_type_string
889 && !(phase4_last == token_type_operator1
890 || phase4_last == token_type_dot
891 || phase4_last == token_type_symbol
892 || phase4_last == token_type_doubledot
893 || phase4_last == token_type_rparen))
895 char *sum = tp->string;
896 size_t sum_len = strlen (sum);
902 phase3_get (&token2);
903 if (token2.type == token_type_doubledot)
907 phase3_get (&token3);
908 if (token3.type == token_type_string)
910 token_ty token_after;
912 phase3_get (&token_after);
913 if (token_after.type != token_type_operator1)
915 char *addend = token3.string;
916 size_t addend_len = strlen (addend);
918 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
919 memcpy (sum + sum_len, addend, addend_len + 1);
920 sum_len += addend_len;
922 phase3_unget (&token_after);
923 free_token (&token3);
924 free_token (&token2);
927 phase3_unget (&token_after);
929 phase3_unget (&token3);
931 phase3_unget (&token2);
936 phase4_last = tp->type;
940 phase5_get (token_ty *tp)
944 /* Combine symbol1 . ... . symbolN to a single string, so that
945 we can recognize function calls like
946 gettext.gettext. The information present for
947 symbolI.....symbolN has precedence over the information for
948 symbolJ.....symbolN with J > I. */
949 if (tp->type == token_type_symbol)
951 char *sum = tp->string;
952 size_t sum_len = strlen (sum);
958 phase4_get (&token2);
959 if (token2.type == token_type_dot)
963 phase4_get (&token3);
964 if (token3.type == token_type_symbol)
966 char *addend = token3.string;
967 size_t addend_len = strlen (addend);
969 sum = (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
971 memcpy (sum + sum_len + 1, addend, addend_len + 1);
972 sum_len += 1 + addend_len;
974 free_token (&token2);
975 free_token (&token3);
978 phase4_unget (&token3);
980 phase4_unget (&token2);
988 x_lua_lex (token_ty *tok)
994 /* ========================= Extracting strings. ========================== */
997 /* Context lookup table. */
998 static flag_context_list_table_ty *flag_context_list_table;
1001 /* The file is broken into tokens. Scan the token stream, looking for
1002 a keyword, followed by a left paren, followed by a string. When we
1003 see this sequence, we have something to remember. We assume we are
1004 looking at a valid Lua program, and leave the complaints about the
1005 grammar to the compiler.
1007 Normal handling: Look for
1008 keyword ( ... msgid ... )
1010 Plural handling: Look for
1011 keyword ( ... msgid ... msgid_plural ... )
1013 We use recursion because the arguments before msgid or between msgid
1014 and msgid_plural can contain subexpressions of the same form. */
1016 /* Extract messages until the next balanced closing parenthesis or bracket.
1017 Extracted messages are added to MLP.
1018 DELIM can be either token_type_rparen or token_type_rbracket, or
1019 token_type_eof to accept both.
1020 Return true upon eof, false upon closing parenthesis or bracket. */
1022 extract_balanced (message_list_ty *mlp, token_type_ty delim,
1023 flag_context_ty outer_context,
1024 flag_context_list_iterator_ty context_iter,
1025 struct arglist_parser *argparser)
1027 /* Current argument number. */
1029 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1031 /* Parameters of the keyword just seen. Defined only in state 1. */
1032 const struct callshapes *next_shapes = NULL;
1033 /* Context iterator that will be used if the next token is a '('. */
1034 flag_context_list_iterator_ty next_context_iter =
1035 passthrough_context_list_iterator;
1036 /* Current context. */
1037 flag_context_ty inner_context =
1038 inherited_context (outer_context,
1039 flag_context_list_iterator_advance (&context_iter));
1041 /* Start state is 0. */
1052 case token_type_symbol:
1054 void *keyword_value;
1056 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1060 next_shapes = (const struct callshapes *) keyword_value;
1067 flag_context_list_iterator (
1068 flag_context_list_table_lookup (
1069 flag_context_list_table,
1070 token.string, strlen (token.string)));
1071 free (token.string);
1074 case token_type_lparen:
1075 if (extract_balanced (mlp, token_type_rparen,
1076 inner_context, next_context_iter,
1077 arglist_parser_alloc (mlp,
1078 state ? next_shapes : NULL)))
1080 arglist_parser_done (argparser, arg);
1083 next_context_iter = null_context_list_iterator;
1087 case token_type_rparen:
1088 if (delim == token_type_rparen || delim == token_type_eof)
1090 arglist_parser_done (argparser, arg);
1094 next_context_iter = null_context_list_iterator;
1098 case token_type_lbracket:
1099 if (extract_balanced (mlp, token_type_rbracket,
1100 null_context, null_context_list_iterator,
1101 arglist_parser_alloc (mlp, NULL)))
1103 arglist_parser_done (argparser, arg);
1106 next_context_iter = null_context_list_iterator;
1110 case token_type_rbracket:
1111 if (delim == token_type_rbracket || delim == token_type_eof)
1113 arglist_parser_done (argparser, arg);
1117 next_context_iter = null_context_list_iterator;
1121 case token_type_comma:
1124 inherited_context (outer_context,
1125 flag_context_list_iterator_advance (
1127 next_context_iter = passthrough_context_list_iterator;
1131 case token_type_eof:
1132 arglist_parser_done (argparser, arg);
1135 case token_type_string:
1138 pos.file_name = logical_file_name;
1139 pos.line_number = token.line_number;
1142 remember_a_message (mlp, NULL, token.string, inner_context,
1143 &pos, NULL, token.comment);
1146 /* A string immediately after a symbol means a function call. */
1149 struct arglist_parser *tmp_argparser;
1150 tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
1152 arglist_parser_remember (tmp_argparser, 1, token.string,
1153 inner_context, pos.file_name,
1154 pos.line_number, token.comment);
1155 arglist_parser_done (tmp_argparser, 1);
1158 arglist_parser_remember (argparser, arg, token.string,
1159 inner_context, pos.file_name,
1160 pos.line_number, token.comment);
1163 drop_reference (token.comment);
1164 next_context_iter = null_context_list_iterator;
1168 case token_type_dot:
1169 case token_type_doubledot:
1170 case token_type_operator1:
1171 case token_type_operator2:
1172 case token_type_number:
1173 case token_type_other:
1174 next_context_iter = null_context_list_iterator;
1185 extract_lua (FILE *f,
1186 const char *real_filename, const char *logical_filename,
1187 flag_context_list_table_ty *flag_table,
1188 msgdomain_list_ty *mdlp)
1190 message_list_ty *mlp = mdlp->item[0]->messages;
1193 real_file_name = real_filename;
1194 logical_file_name = xstrdup (logical_filename);
1197 last_comment_line = -1;
1198 last_non_comment_line = -1;
1200 flag_context_list_table = flag_table;
1204 /* Eat tokens until eof is seen. When extract_balanced returns
1205 due to an unbalanced closing parenthesis or bracket, just restart it. */
1206 while (!extract_balanced (mlp, token_type_eof,
1207 null_context, null_context_list_iterator,
1208 arglist_parser_alloc (mlp, NULL)))
1212 real_file_name = NULL;
1213 logical_file_name = NULL;