1 /* xgettext awk backend.
2 Copyright (C) 2002-2003, 2005-2009 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
35 #include "error-progname.h"
39 #define _(s) gettext(s)
42 /* The awk syntax is defined in the gawk manual page and documentation.
43 See also gawk/awkgram.y. */
46 /* ====================== Keyword set customization. ====================== */
48 /* If true extract all strings. */
49 static bool extract_all = false;
51 static hash_table keywords;
52 static bool default_keywords = true;
63 x_awk_keyword (const char *name)
66 default_keywords = false;
70 struct callshape shape;
73 if (keywords.table == NULL)
74 hash_init (&keywords, 100);
76 split_keywordspec (name, &end, &shape);
78 /* The characters between name and end should form a valid C identifier.
79 A colon means an invalid parse in split_keywordspec(). */
80 colon = strchr (name, ':');
81 if (colon == NULL || colon >= end)
82 insert_keyword_callshape (&keywords, name, end - name, &shape);
86 /* Finish initializing the keywords hash table.
87 Called after argument processing, before each file is processed. */
93 /* When adding new keywords here, also update the documentation in
95 x_awk_keyword ("dcgettext");
96 x_awk_keyword ("dcngettext:1,2");
97 default_keywords = false;
102 init_flag_table_awk ()
104 xgettext_record_flag ("dcgettext:1:pass-awk-format");
105 xgettext_record_flag ("dcngettext:1:pass-awk-format");
106 xgettext_record_flag ("dcngettext:2:pass-awk-format");
107 xgettext_record_flag ("printf:1:awk-format");
111 /* ======================== Reading of characters. ======================== */
113 /* Real filename, used in error messages about the input file. */
114 static const char *real_file_name;
116 /* Logical filename and line number, used to label the extracted messages. */
117 static char *logical_file_name;
118 static int line_number;
120 /* The input file stream. */
123 /* These are for tracking whether comments count as immediately before
125 static int last_comment_line;
126 static int last_non_comment_line;
129 /* 1. line_number handling. */
139 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
150 /* Supports only one pushback character. */
152 phase1_ungetc (int c)
164 /* 2. Replace each comment that is not inside a string literal or regular
165 expression with a newline character. We need to remember the comment
166 for later, because it may be attached to a keyword string. */
172 static size_t bufmax;
181 lineno = line_number;
185 if (c == '\n' || c == EOF)
187 /* We skip all leading white space, but not EOLs. */
188 if (!(buflen == 0 && (c == ' ' || c == '\t')))
190 if (buflen >= bufmax)
192 bufmax = 2 * bufmax + 10;
193 buffer = xrealloc (buffer, bufmax);
195 buffer[buflen++] = c;
198 if (buflen >= bufmax)
200 bufmax = 2 * bufmax + 10;
201 buffer = xrealloc (buffer, bufmax);
203 buffer[buflen] = '\0';
204 savable_comment_add (buffer);
205 last_comment_line = lineno;
210 /* Supports only one pushback character. */
212 phase2_ungetc (int c)
219 /* ========================== Reading of tokens. ========================== */
225 token_type_lparen, /* ( */
226 token_type_rparen, /* ) */
227 token_type_comma, /* , */
228 token_type_string, /* "abc" */
229 token_type_i18nstring, /* _"abc" */
230 token_type_symbol, /* symbol, number */
231 token_type_semicolon, /* ; */
232 token_type_other /* regexp, misc. operator */
234 typedef enum token_type_ty token_type_ty;
236 typedef struct token_ty token_ty;
240 char *string; /* for token_type_{symbol,string,i18nstring} */
245 /* 7. Replace escape sequences within character strings with their
246 single character equivalents. */
248 #define P7_QUOTES (1000 + '"')
257 /* Use phase 1, because phase 2 elides comments. */
260 if (c == EOF || c == '\n')
286 case '0': case '1': case '2': case '3': case '4':
287 case '5': case '6': case '7':
294 if (c >= '0' && c <= '7')
296 n = (n << 3) + (c - '0');
300 if (c >= '0' && c <= '7')
301 n = (n << 3) + (c - '0');
309 return (unsigned char) n;
320 else if (c >= '0' && c <= '9')
321 n = (n << 4) + (c - '0');
322 else if (c >= 'A' && c <= 'F')
323 n = (n << 4) + (c - 'A' + 10);
324 else if (c >= 'a' && c <= 'f')
325 n = (n << 4) + (c - 'a' + 10);
332 return (unsigned char) n;
340 error_with_progname = false;
341 error (0, 0, _("%s:%d: warning: unterminated string"), logical_file_name,
343 error_with_progname = true;
348 /* Free the memory pointed to by a 'struct token_ty'. */
350 free_token (token_ty *tp)
354 case token_type_string:
355 case token_type_i18nstring:
356 case token_type_symbol:
365 /* Combine characters into tokens. Discard whitespace. */
367 /* There is an ambiguity about '/': It can start a division operator ('/' or
368 '/=') or it can start a regular expression. The distinction is important
369 because inside regular expressions, '#' and '"' lose their special meanings.
370 If you look at the awk grammar, you see that the operator is only allowed
371 right after a 'variable' or 'simp_exp' nonterminal, and these nonterminals
372 can only end in the NAME, LENGTH, YSTRING, YNUMBER, ')', ']' terminals.
373 So we prefer the division operator interpretation only right after
374 symbol, string, number, ')', ']', with whitespace but no newline allowed
376 static bool prefer_division_over_regexp;
379 x_awk_lex (token_ty *tp)
388 tp->line_number = line_number;
394 tp->type = token_type_eof;
398 if (last_non_comment_line > last_comment_line)
399 savable_comment_reset ();
400 /* Newline is not allowed inside expressions. It usually
401 introduces a fresh statement.
402 FIXME: Newlines after any of ',' '{' '?' ':' '||' '&&' 'do' 'else'
403 do *not* introduce a fresh statement. */
404 prefer_division_over_regexp = false;
408 /* Ignore whitespace and comments. */
412 /* Backslash ought to be immediately followed by a newline. */
416 last_non_comment_line = tp->line_number;
422 int c2 = phase2_getc ();
424 if (!(c2 >= '0' && c2 <= '9'))
427 tp->type = token_type_other;
428 prefer_division_over_regexp = false;
433 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
434 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
435 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
436 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
439 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
440 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
441 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
442 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
444 case '0': case '1': case '2': case '3': case '4':
445 case '5': case '6': case '7': case '8': case '9':
446 /* Symbol, or part of a number. */
450 if (bufpos >= bufmax)
452 bufmax = 2 * bufmax + 10;
453 buffer = xrealloc (buffer, bufmax);
455 buffer[bufpos++] = c;
459 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
460 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
461 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
462 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
465 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
466 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
467 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
468 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
470 case '0': case '1': case '2': case '3': case '4':
471 case '5': case '6': case '7': case '8': case '9':
474 if (bufpos == 1 && buffer[0] == '_' && c == '"')
476 tp->type = token_type_i18nstring;
484 if (bufpos >= bufmax)
486 bufmax = 2 * bufmax + 10;
487 buffer = xrealloc (buffer, bufmax);
489 buffer[bufpos] = '\0';
490 tp->string = xstrdup (buffer);
491 tp->type = token_type_symbol;
492 /* Most identifiers can be variable names; after them we must
493 interpret '/' as division operator. But for awk's builtin
494 keywords we have three cases:
495 (a) Must interpret '/' as division operator. "length".
496 (b) Must interpret '/' as start of a regular expression.
497 "do", "exit", "print", "printf", "return".
498 (c) '/' after this keyword is invalid anyway. All others.
499 I used the following script for the distinction.
500 for k in $awk_keywords; do
501 echo; echo $k; awk "function foo () { $k / 10 }" < /dev/null
504 if (strcmp (buffer, "do") == 0
505 || strcmp (buffer, "exit") == 0
506 || strcmp (buffer, "print") == 0
507 || strcmp (buffer, "printf") == 0
508 || strcmp (buffer, "return") == 0)
509 prefer_division_over_regexp = false;
511 prefer_division_over_regexp = true;
515 tp->type = token_type_string;
521 if (c == EOF || c == P7_QUOTES)
523 if (bufpos >= bufmax)
525 bufmax = 2 * bufmax + 10;
526 buffer = xrealloc (buffer, bufmax);
528 buffer[bufpos++] = c;
530 if (bufpos >= bufmax)
532 bufmax = 2 * bufmax + 10;
533 buffer = xrealloc (buffer, bufmax);
535 buffer[bufpos] = '\0';
536 tp->string = xstrdup (buffer);
537 prefer_division_over_regexp = true;
541 tp->type = token_type_lparen;
542 prefer_division_over_regexp = false;
546 tp->type = token_type_rparen;
547 prefer_division_over_regexp = true;
551 tp->type = token_type_comma;
552 prefer_division_over_regexp = false;
556 tp->type = token_type_semicolon;
557 prefer_division_over_regexp = false;
561 tp->type = token_type_other;
562 prefer_division_over_regexp = true;
566 if (!prefer_division_over_regexp)
568 /* Regular expression.
569 Counting brackets is non-trivial. [[] is balanced, and so is
570 [\]]. Also, /[/]/ is balanced and ends at the third slash.
571 Do not count [ or ] if either one is preceded by a \.
572 A '[' should be counted if
573 a) it is the first one so far (brackets == 0), or
574 b) it is the '[' in '[:'.
575 A ']' should be counted if not preceded by a \.
576 According to POSIX, []] is how you put a ] into a set.
577 Try to handle that too.
580 bool pos0 = true; /* true at start of regexp */
581 bool pos1_open = false; /* true after [ at start of regexp */
582 bool pos2_open_not = false; /* true after [^ at start of regexp */
588 if (c == EOF || c == '\n')
591 error_with_progname = false;
592 error (0, 0, _("%s:%d: warning: unterminated regular expression"),
593 logical_file_name, line_number);
594 error_with_progname = true;
617 if (!(pos1_open || pos2_open_not))
625 pos2_open_not = true;
632 /* Backslash-newline is valid and ignored. */
642 pos2_open_not = false;
645 tp->type = token_type_other;
646 prefer_division_over_regexp = false;
652 /* We could carefully recognize each of the 2 and 3 character
653 operators, but it is not necessary, as we only need to recognize
654 gettext invocations. Don't bother. */
655 tp->type = token_type_other;
656 prefer_division_over_regexp = false;
663 /* ========================= Extracting strings. ========================== */
666 /* Context lookup table. */
667 static flag_context_list_table_ty *flag_context_list_table;
670 /* The file is broken into tokens. Scan the token stream, looking for
671 a keyword, followed by a left paren, followed by a string. When we
672 see this sequence, we have something to remember. We assume we are
673 looking at a valid awk program, and leave the complaints about
674 the grammar to the awk interpreter.
676 Normal handling: Look for
677 keyword ( ... msgid ... )
678 Plural handling: Look for
679 keyword ( ... msgid ... msgid_plural ... )
681 We use recursion because the arguments before msgid or between msgid
682 and msgid_plural can contain subexpressions of the same form. */
685 /* Extract messages until the next balanced closing parenthesis.
686 Extracted messages are added to MLP.
687 Return true upon eof, false upon closing parenthesis. */
689 extract_parenthesized (message_list_ty *mlp,
690 flag_context_ty outer_context,
691 flag_context_list_iterator_ty context_iter,
692 struct arglist_parser *argparser)
694 /* Current argument number. */
696 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
698 /* Parameters of the keyword just seen. Defined only in state 1. */
699 const struct callshapes *next_shapes = NULL;
700 /* Whether to implicitly assume the next tokens are arguments even without
702 bool next_is_argument = false;
703 /* Context iterator that will be used if the next token is a '('. */
704 flag_context_list_iterator_ty next_context_iter =
705 passthrough_context_list_iterator;
706 /* Current context. */
707 flag_context_ty inner_context =
708 inherited_context (outer_context,
709 flag_context_list_iterator_advance (&context_iter));
711 /* Start state is 0. */
720 if (next_is_argument && token.type != token_type_lparen)
722 /* An argument list starts, even though there is no '('. */
723 context_iter = next_context_iter;
724 outer_context = inner_context;
726 inherited_context (outer_context,
727 flag_context_list_iterator_advance (
733 case token_type_symbol:
737 if (hash_find_entry (&keywords, token.string, strlen (token.string),
741 next_shapes = (const struct callshapes *) keyword_value;
748 (strcmp (token.string, "print") == 0
749 || strcmp (token.string, "printf") == 0);
751 flag_context_list_iterator (
752 flag_context_list_table_lookup (
753 flag_context_list_table,
754 token.string, strlen (token.string)));
758 case token_type_lparen:
759 if (extract_parenthesized (mlp, inner_context, next_context_iter,
760 arglist_parser_alloc (mlp,
761 state ? next_shapes : NULL)))
763 arglist_parser_done (argparser, arg);
766 next_is_argument = false;
767 next_context_iter = null_context_list_iterator;
771 case token_type_rparen:
772 arglist_parser_done (argparser, arg);
775 case token_type_comma:
778 inherited_context (outer_context,
779 flag_context_list_iterator_advance (
781 next_is_argument = false;
782 next_context_iter = passthrough_context_list_iterator;
786 case token_type_string:
789 pos.file_name = logical_file_name;
790 pos.line_number = token.line_number;
793 remember_a_message (mlp, NULL, token.string, inner_context, &pos,
794 NULL, savable_comment);
796 arglist_parser_remember (argparser, arg, token.string,
798 pos.file_name, pos.line_number,
801 next_is_argument = false;
802 next_context_iter = null_context_list_iterator;
806 case token_type_i18nstring:
809 pos.file_name = logical_file_name;
810 pos.line_number = token.line_number;
812 remember_a_message (mlp, NULL, token.string, inner_context, &pos,
813 NULL, savable_comment);
815 next_is_argument = false;
816 next_context_iter = null_context_list_iterator;
820 case token_type_semicolon:
821 /* An argument list ends, and a new statement begins. */
822 /* FIXME: Should handle newline that acts as statement separator
824 /* FIXME: Instead of resetting outer_context here, it may be better
825 to recurse in the next_is_argument handling above, waiting for
826 the next semicolon or other statement terminator. */
827 outer_context = null_context;
828 context_iter = null_context_list_iterator;
829 next_is_argument = false;
830 next_context_iter = passthrough_context_list_iterator;
832 inherited_context (outer_context,
833 flag_context_list_iterator_advance (
839 arglist_parser_done (argparser, arg);
842 case token_type_other:
843 next_is_argument = false;
844 next_context_iter = null_context_list_iterator;
856 extract_awk (FILE *f,
857 const char *real_filename, const char *logical_filename,
858 flag_context_list_table_ty *flag_table,
859 msgdomain_list_ty *mdlp)
861 message_list_ty *mlp = mdlp->item[0]->messages;
864 real_file_name = real_filename;
865 logical_file_name = xstrdup (logical_filename);
868 last_comment_line = -1;
869 last_non_comment_line = -1;
871 prefer_division_over_regexp = false;
873 flag_context_list_table = flag_table;
877 /* Eat tokens until eof is seen. When extract_parenthesized returns
878 due to an unbalanced closing parenthesis, just restart it. */
879 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
880 arglist_parser_alloc (mlp, NULL)))
884 real_file_name = NULL;
885 logical_file_name = NULL;