1 /* xgettext awk backend.
2 Copyright (C) 2002-2003, 2005-2009, 2015 Free Software Foundation,
5 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
36 #include "error-progname.h"
40 #define _(s) gettext(s)
43 /* The awk syntax is defined in the gawk manual page and documentation.
44 See also gawk/awkgram.y. */
47 /* ====================== Keyword set customization. ====================== */
49 /* If true, extract all strings. */
50 static bool extract_all = false;
52 static hash_table keywords;
53 static bool default_keywords = true;
64 x_awk_keyword (const char *name)
67 default_keywords = false;
71 struct callshape shape;
74 if (keywords.table == NULL)
75 hash_init (&keywords, 100);
77 split_keywordspec (name, &end, &shape);
79 /* The characters between name and end should form a valid C identifier.
80 A colon means an invalid parse in split_keywordspec(). */
81 colon = strchr (name, ':');
82 if (colon == NULL || colon >= end)
83 insert_keyword_callshape (&keywords, name, end - name, &shape);
87 /* Finish initializing the keywords hash table.
88 Called after argument processing, before each file is processed. */
94 /* When adding new keywords here, also update the documentation in
96 x_awk_keyword ("dcgettext");
97 x_awk_keyword ("dcngettext:1,2");
98 default_keywords = false;
103 init_flag_table_awk ()
105 xgettext_record_flag ("dcgettext:1:pass-awk-format");
106 xgettext_record_flag ("dcngettext:1:pass-awk-format");
107 xgettext_record_flag ("dcngettext:2:pass-awk-format");
108 xgettext_record_flag ("printf:1:awk-format");
112 /* ======================== Reading of characters. ======================== */
114 /* Real filename, used in error messages about the input file. */
115 static const char *real_file_name;
117 /* Logical filename and line number, used to label the extracted messages. */
118 static char *logical_file_name;
119 static int line_number;
121 /* The input file stream. */
124 /* These are for tracking whether comments count as immediately before
126 static int last_comment_line;
127 static int last_non_comment_line;
130 /* 1. line_number handling. */
140 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
151 /* Supports only one pushback character. */
153 phase1_ungetc (int c)
165 /* 2. Replace each comment that is not inside a string literal or regular
166 expression with a newline character. We need to remember the comment
167 for later, because it may be attached to a keyword string. */
173 static size_t bufmax;
182 lineno = line_number;
186 if (c == '\n' || c == EOF)
188 /* We skip all leading white space, but not EOLs. */
189 if (!(buflen == 0 && (c == ' ' || c == '\t')))
191 if (buflen >= bufmax)
193 bufmax = 2 * bufmax + 10;
194 buffer = xrealloc (buffer, bufmax);
196 buffer[buflen++] = c;
199 if (buflen >= bufmax)
201 bufmax = 2 * bufmax + 10;
202 buffer = xrealloc (buffer, bufmax);
204 buffer[buflen] = '\0';
205 savable_comment_add (buffer);
206 last_comment_line = lineno;
211 /* Supports only one pushback character. */
213 phase2_ungetc (int c)
220 /* ========================== Reading of tokens. ========================== */
226 token_type_lparen, /* ( */
227 token_type_rparen, /* ) */
228 token_type_comma, /* , */
229 token_type_string, /* "abc" */
230 token_type_i18nstring, /* _"abc" */
231 token_type_symbol, /* symbol, number */
232 token_type_semicolon, /* ; */
233 token_type_other /* regexp, misc. operator */
235 typedef enum token_type_ty token_type_ty;
237 typedef struct token_ty token_ty;
241 char *string; /* for token_type_{symbol,string,i18nstring} */
246 /* 7. Replace escape sequences within character strings with their
247 single character equivalents. */
249 #define P7_QUOTES (1000 + '"')
258 /* Use phase 1, because phase 2 elides comments. */
261 if (c == EOF || c == '\n')
287 case '0': case '1': case '2': case '3': case '4':
288 case '5': case '6': case '7':
295 if (c >= '0' && c <= '7')
297 n = (n << 3) + (c - '0');
301 if (c >= '0' && c <= '7')
302 n = (n << 3) + (c - '0');
310 return (unsigned char) n;
321 else if (c >= '0' && c <= '9')
322 n = (n << 4) + (c - '0');
323 else if (c >= 'A' && c <= 'F')
324 n = (n << 4) + (c - 'A' + 10);
325 else if (c >= 'a' && c <= 'f')
326 n = (n << 4) + (c - 'a' + 10);
333 return (unsigned char) n;
341 error_with_progname = false;
342 error (0, 0, _("%s:%d: warning: unterminated string"), logical_file_name,
344 error_with_progname = true;
349 /* Free the memory pointed to by a 'struct token_ty'. */
351 free_token (token_ty *tp)
355 case token_type_string:
356 case token_type_i18nstring:
357 case token_type_symbol:
366 /* Combine characters into tokens. Discard whitespace. */
368 /* There is an ambiguity about '/': It can start a division operator ('/' or
369 '/=') or it can start a regular expression. The distinction is important
370 because inside regular expressions, '#' and '"' lose their special meanings.
371 If you look at the awk grammar, you see that the operator is only allowed
372 right after a 'variable' or 'simp_exp' nonterminal, and these nonterminals
373 can only end in the NAME, LENGTH, YSTRING, YNUMBER, ')', ']' terminals.
374 So we prefer the division operator interpretation only right after
375 symbol, string, number, ')', ']', with whitespace but no newline allowed
377 static bool prefer_division_over_regexp;
380 x_awk_lex (token_ty *tp)
389 tp->line_number = line_number;
395 tp->type = token_type_eof;
399 if (last_non_comment_line > last_comment_line)
400 savable_comment_reset ();
401 /* Newline is not allowed inside expressions. It usually
402 introduces a fresh statement.
403 FIXME: Newlines after any of ',' '{' '?' ':' '||' '&&' 'do' 'else'
404 do *not* introduce a fresh statement. */
405 prefer_division_over_regexp = false;
409 /* Ignore whitespace and comments. */
413 /* Backslash ought to be immediately followed by a newline. */
417 last_non_comment_line = tp->line_number;
423 int c2 = phase2_getc ();
425 if (!(c2 >= '0' && c2 <= '9'))
428 tp->type = token_type_other;
429 prefer_division_over_regexp = false;
434 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
435 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
436 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
437 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
440 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
441 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
442 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
443 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
445 case '0': case '1': case '2': case '3': case '4':
446 case '5': case '6': case '7': case '8': case '9':
447 /* Symbol, or part of a number. */
451 if (bufpos >= bufmax)
453 bufmax = 2 * bufmax + 10;
454 buffer = xrealloc (buffer, bufmax);
456 buffer[bufpos++] = c;
460 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
461 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
462 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
463 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
466 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
467 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
468 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
469 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
471 case '0': case '1': case '2': case '3': case '4':
472 case '5': case '6': case '7': case '8': case '9':
475 if (bufpos == 1 && buffer[0] == '_' && c == '"')
477 tp->type = token_type_i18nstring;
485 if (bufpos >= bufmax)
487 bufmax = 2 * bufmax + 10;
488 buffer = xrealloc (buffer, bufmax);
490 buffer[bufpos] = '\0';
491 tp->string = xstrdup (buffer);
492 tp->type = token_type_symbol;
493 /* Most identifiers can be variable names; after them we must
494 interpret '/' as division operator. But for awk's builtin
495 keywords we have three cases:
496 (a) Must interpret '/' as division operator. "length".
497 (b) Must interpret '/' as start of a regular expression.
498 "do", "exit", "print", "printf", "return".
499 (c) '/' after this keyword is invalid anyway. All others.
500 I used the following script for the distinction.
501 for k in $awk_keywords; do
502 echo; echo $k; awk "function foo () { $k / 10 }" < /dev/null
505 if (strcmp (buffer, "do") == 0
506 || strcmp (buffer, "exit") == 0
507 || strcmp (buffer, "print") == 0
508 || strcmp (buffer, "printf") == 0
509 || strcmp (buffer, "return") == 0)
510 prefer_division_over_regexp = false;
512 prefer_division_over_regexp = true;
516 tp->type = token_type_string;
522 if (c == EOF || c == P7_QUOTES)
524 if (bufpos >= bufmax)
526 bufmax = 2 * bufmax + 10;
527 buffer = xrealloc (buffer, bufmax);
529 buffer[bufpos++] = c;
531 if (bufpos >= bufmax)
533 bufmax = 2 * bufmax + 10;
534 buffer = xrealloc (buffer, bufmax);
536 buffer[bufpos] = '\0';
537 tp->string = xstrdup (buffer);
538 prefer_division_over_regexp = true;
542 tp->type = token_type_lparen;
543 prefer_division_over_regexp = false;
547 tp->type = token_type_rparen;
548 prefer_division_over_regexp = true;
552 tp->type = token_type_comma;
553 prefer_division_over_regexp = false;
557 tp->type = token_type_semicolon;
558 prefer_division_over_regexp = false;
562 tp->type = token_type_other;
563 prefer_division_over_regexp = true;
567 if (!prefer_division_over_regexp)
569 /* Regular expression.
570 Counting brackets is non-trivial. [[] is balanced, and so is
571 [\]]. Also, /[/]/ is balanced and ends at the third slash.
572 Do not count [ or ] if either one is preceded by a \.
573 A '[' should be counted if
574 a) it is the first one so far (brackets == 0), or
575 b) it is the '[' in '[:'.
576 A ']' should be counted if not preceded by a \.
577 According to POSIX, []] is how you put a ] into a set.
578 Try to handle that too.
581 bool pos0 = true; /* true at start of regexp */
582 bool pos1_open = false; /* true after [ at start of regexp */
583 bool pos2_open_not = false; /* true after [^ at start of regexp */
589 if (c == EOF || c == '\n')
592 error_with_progname = false;
593 error (0, 0, _("%s:%d: warning: unterminated regular expression"),
594 logical_file_name, line_number);
595 error_with_progname = true;
618 if (!(pos1_open || pos2_open_not))
626 pos2_open_not = true;
633 /* Backslash-newline is valid and ignored. */
643 pos2_open_not = false;
646 tp->type = token_type_other;
647 prefer_division_over_regexp = false;
653 /* We could carefully recognize each of the 2 and 3 character
654 operators, but it is not necessary, as we only need to recognize
655 gettext invocations. Don't bother. */
656 tp->type = token_type_other;
657 prefer_division_over_regexp = false;
664 /* ========================= Extracting strings. ========================== */
667 /* Context lookup table. */
668 static flag_context_list_table_ty *flag_context_list_table;
671 /* The file is broken into tokens. Scan the token stream, looking for
672 a keyword, followed by a left paren, followed by a string. When we
673 see this sequence, we have something to remember. We assume we are
674 looking at a valid awk program, and leave the complaints about
675 the grammar to the awk interpreter.
677 Normal handling: Look for
678 keyword ( ... msgid ... )
679 Plural handling: Look for
680 keyword ( ... msgid ... msgid_plural ... )
682 We use recursion because the arguments before msgid or between msgid
683 and msgid_plural can contain subexpressions of the same form. */
686 /* Extract messages until the next balanced closing parenthesis.
687 Extracted messages are added to MLP.
688 Return true upon eof, false upon closing parenthesis. */
690 extract_parenthesized (message_list_ty *mlp,
691 flag_context_ty outer_context,
692 flag_context_list_iterator_ty context_iter,
693 struct arglist_parser *argparser)
695 /* Current argument number. */
697 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
699 /* Parameters of the keyword just seen. Defined only in state 1. */
700 const struct callshapes *next_shapes = NULL;
701 /* Whether to implicitly assume the next tokens are arguments even without
703 bool next_is_argument = false;
704 /* Context iterator that will be used if the next token is a '('. */
705 flag_context_list_iterator_ty next_context_iter =
706 passthrough_context_list_iterator;
707 /* Current context. */
708 flag_context_ty inner_context =
709 inherited_context (outer_context,
710 flag_context_list_iterator_advance (&context_iter));
712 /* Start state is 0. */
721 if (next_is_argument && token.type != token_type_lparen)
723 /* An argument list starts, even though there is no '('. */
724 context_iter = next_context_iter;
725 outer_context = inner_context;
727 inherited_context (outer_context,
728 flag_context_list_iterator_advance (
734 case token_type_symbol:
738 if (hash_find_entry (&keywords, token.string, strlen (token.string),
742 next_shapes = (const struct callshapes *) keyword_value;
749 (strcmp (token.string, "print") == 0
750 || strcmp (token.string, "printf") == 0);
752 flag_context_list_iterator (
753 flag_context_list_table_lookup (
754 flag_context_list_table,
755 token.string, strlen (token.string)));
759 case token_type_lparen:
760 if (extract_parenthesized (mlp, inner_context, next_context_iter,
761 arglist_parser_alloc (mlp,
762 state ? next_shapes : NULL)))
764 arglist_parser_done (argparser, arg);
767 next_is_argument = false;
768 next_context_iter = null_context_list_iterator;
772 case token_type_rparen:
773 arglist_parser_done (argparser, arg);
776 case token_type_comma:
779 inherited_context (outer_context,
780 flag_context_list_iterator_advance (
782 next_is_argument = false;
783 next_context_iter = passthrough_context_list_iterator;
787 case token_type_string:
790 pos.file_name = logical_file_name;
791 pos.line_number = token.line_number;
794 remember_a_message (mlp, NULL, token.string, inner_context, &pos,
795 NULL, savable_comment);
797 arglist_parser_remember (argparser, arg, token.string,
799 pos.file_name, pos.line_number,
802 next_is_argument = false;
803 next_context_iter = null_context_list_iterator;
807 case token_type_i18nstring:
810 pos.file_name = logical_file_name;
811 pos.line_number = token.line_number;
813 remember_a_message (mlp, NULL, token.string, inner_context, &pos,
814 NULL, savable_comment);
816 next_is_argument = false;
817 next_context_iter = null_context_list_iterator;
821 case token_type_semicolon:
822 /* An argument list ends, and a new statement begins. */
823 /* FIXME: Should handle newline that acts as statement separator
825 /* FIXME: Instead of resetting outer_context here, it may be better
826 to recurse in the next_is_argument handling above, waiting for
827 the next semicolon or other statement terminator. */
828 outer_context = null_context;
829 context_iter = null_context_list_iterator;
830 next_is_argument = false;
831 next_context_iter = passthrough_context_list_iterator;
833 inherited_context (outer_context,
834 flag_context_list_iterator_advance (
840 arglist_parser_done (argparser, arg);
843 case token_type_other:
844 next_is_argument = false;
845 next_context_iter = null_context_list_iterator;
857 extract_awk (FILE *f,
858 const char *real_filename, const char *logical_filename,
859 flag_context_list_table_ty *flag_table,
860 msgdomain_list_ty *mdlp)
862 message_list_ty *mlp = mdlp->item[0]->messages;
865 real_file_name = real_filename;
866 logical_file_name = xstrdup (logical_filename);
869 last_comment_line = -1;
870 last_non_comment_line = -1;
872 prefer_division_over_regexp = false;
874 flag_context_list_table = flag_table;
878 /* Eat tokens until eof is seen. When extract_parenthesized returns
879 due to an unbalanced closing parenthesis, just restart it. */
880 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
881 arglist_parser_alloc (mlp, NULL)))
885 real_file_name = NULL;
886 logical_file_name = NULL;