1 /* xgettext Scheme backend.
2 Copyright (C) 2004-2009, 2011, 2015 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <bruno@clisp.org>, 2004-2005.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
39 #define _(s) gettext(s)
42 /* The Scheme syntax is described in R5RS. It is implemented in
43 guile-2.0.0/libguile/read.c.
44 Since we are interested only in strings and in forms similar to
46 or (ngettext msgid msgid_plural ...)
47 we make the following simplifications:
49 - Assume the keywords and strings are in an ASCII compatible encoding.
50 This means we can read the input file one byte at a time, instead of
51 one character at a time. No need to worry about multibyte characters:
52 If they occur as part of identifiers, they most probably act as
53 constituent characters, and the byte based approach will do the same.
55 - Assume the read-hash-procedures is in the default state.
56 Non-standard reader extensions are mostly used to read data, not programs.
58 The remaining syntax rules are:
60 - The syntax code assigned to each character, and how tokens are built
61 up from characters (single escape, multiple escape etc.).
63 - Comment syntax: ';' and '#! ... !#' and '#| ... |#' (may be nested).
65 - String syntax: "..." with single escapes.
67 - Read macros and dispatch macro character '#'. Needed to be able to
68 tell which is the n-th argument of a function call.
73 /* ====================== Keyword set customization. ====================== */
75 /* If true extract all strings. */
76 static bool extract_all = false;
78 static hash_table keywords;
79 static bool default_keywords = true;
83 x_scheme_extract_all ()
90 x_scheme_keyword (const char *name)
93 default_keywords = false;
97 struct callshape shape;
100 if (keywords.table == NULL)
101 hash_init (&keywords, 100);
103 split_keywordspec (name, &end, &shape);
105 /* The characters between name and end should form a valid Lisp symbol.
106 Extract the symbol name part. */
107 colon = strchr (name, ':');
108 if (colon != NULL && colon < end)
111 if (name < end && *name == ':')
113 colon = strchr (name, ':');
114 if (colon != NULL && colon < end)
118 insert_keyword_callshape (&keywords, name, end - name, &shape);
122 /* Finish initializing the keywords hash table.
123 Called after argument processing, before each file is processed. */
127 if (default_keywords)
129 /* When adding new keywords here, also update the documentation in
131 x_scheme_keyword ("gettext"); /* libguile/i18n.c */
132 x_scheme_keyword ("ngettext:1,2"); /* libguile/i18n.c */
133 x_scheme_keyword ("gettext-noop");
134 default_keywords = false;
139 init_flag_table_scheme ()
141 xgettext_record_flag ("gettext:1:pass-scheme-format");
142 xgettext_record_flag ("ngettext:1:pass-scheme-format");
143 xgettext_record_flag ("ngettext:2:pass-scheme-format");
144 xgettext_record_flag ("gettext-noop:1:pass-scheme-format");
145 xgettext_record_flag ("format:2:scheme-format");
149 /* ======================== Reading of characters. ======================== */
151 /* Real filename, used in error messages about the input file. */
152 static const char *real_file_name;
154 /* Logical filename and line number, used to label the extracted messages. */
155 static char *logical_file_name;
156 static int line_number;
158 /* The input file stream. */
162 /* Fetch the next character from the input file. */
171 error (EXIT_FAILURE, errno, _("\
172 error while reading \"%s\""), real_file_name);
180 /* Put back the last fetched character, not EOF. */
190 /* ========================== Reading of tokens. ========================== */
193 /* A token consists of a sequence of characters. */
196 int allocated; /* number of allocated 'token_char's */
197 int charcount; /* number of used 'token_char's */
198 char *chars; /* the token's constituents */
201 /* Initialize a 'struct token'. */
203 init_token (struct token *tp)
206 tp->chars = XNMALLOC (tp->allocated, char);
210 /* Free the memory pointed to by a 'struct token'. */
212 free_token (struct token *tp)
217 /* Ensure there is enough room in the token for one more character. */
219 grow_token (struct token *tp)
221 if (tp->charcount == tp->allocated)
224 tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
228 /* Read the next token. 'first' is the first character, which has already
231 read_token (struct token *tp, int first)
236 tp->chars[tp->charcount++] = first;
244 if (c == ' ' || c == '\r' || c == '\f' || c == '\t' || c == '\n'
245 || c == '"' || c == '(' || c == ')' || c == ';')
251 tp->chars[tp->charcount++] = c;
255 /* Tests if a token represents an integer.
256 Taken from guile-1.6.4/libguile/numbers.c:scm_istr2int(). */
258 is_integer_syntax (const char *str, int len, int radix)
261 const char *p_end = str + len;
263 /* The accepted syntax is
265 where DIGIT is a hexadecimal digit whose value is below radix. */
269 if (*p == '+' || *p == '-')
279 if (c >= '0' && c <= '9')
281 else if (c >= 'A' && c <= 'F')
283 else if (c >= 'a' && c <= 'f')
294 /* Tests if a token represents a rational, floating-point or complex number.
295 If unconstrained is false, only real numbers are accepted; otherwise,
296 complex numbers are accepted as well.
297 Taken from guile-1.6.4/libguile/numbers.c:scm_istr2flo(). */
299 is_other_number_syntax (const char *str, int len, int radix, bool unconstrained)
302 const char *p_end = str + len;
306 /* The accepted syntaxes are:
307 for a floating-point number:
308 ['+'|'-'] DIGIT+ [EXPONENT]
309 ['+'|'-'] DIGIT* '.' DIGIT+ [EXPONENT]
310 where EXPONENT ::= ['d'|'e'|'f'|'l'|'s'] DIGIT+
311 (Dot and exponent are allowed only if radix is 10.)
312 for a rational number:
313 ['+'|'-'] DIGIT+ '/' DIGIT+
314 for a complex number:
315 REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
316 REAL-NUMBER {'+'|'-'} 'i'
317 {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
319 REAL-NUMBER '@' REAL-NUMBER
323 /* Parse leading sign. */
325 if (*p == '+' || *p == '-')
331 /* Recognize complex number syntax: {'+'|'-'} 'i' */
332 if (unconstrained && (*p == 'I' || *p == 'i') && p + 1 == p_end)
335 /* Parse digits before dot or exponent or slash. */
341 if (c >= '0' && c <= '9')
343 else if (c >= 'A' && c <= 'F')
345 if (c >= 'D' && radix == 10) /* exponent? */
349 else if (c >= 'a' && c <= 'f')
351 if (c >= 'd' && radix == 10) /* exponent? */
363 /* If p == p_end, we know that seen_digits = true, and the number is an
364 integer without exponent. */
367 /* If we have no digits so far, we need a decimal point later. */
368 if (!seen_digits && !(*p == '.' && radix == 10))
370 /* Trailing '#' signs are equivalent to zeroes. */
371 while (p < p_end && *p == '#')
377 /* Parse digits after the slash. */
378 bool all_zeroes = true;
380 for (; p < p_end; p++)
384 if (c >= '0' && c <= '9')
386 else if (c >= 'A' && c <= 'F')
388 else if (c >= 'a' && c <= 'f')
397 /* A zero denominator is not allowed. */
400 /* Trailing '#' signs are equivalent to zeroes. */
401 while (p < p_end && *p == '#')
408 /* Decimal point notation. */
411 /* Parse digits after the decimal point. */
413 for (; p < p_end; p++)
417 if (c >= '0' && c <= '9')
422 /* Digits are required before or after the decimal point. */
425 /* Trailing '#' signs are equivalent to zeroes. */
426 while (p < p_end && *p == '#')
431 /* Parse exponent. */
444 if (*p == '+' || *p == '-')
450 if (!(*p >= '0' && *p <= '9'))
457 if (!(*p >= '0' && *p <= '9'))
470 /* Recognize complex number syntax. */
473 /* Recognize the syntax {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i' */
474 if (seen_sign && (*p == 'I' || *p == 'i') && p + 1 == p_end)
476 /* Recognize the syntaxes
477 REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
478 REAL-NUMBER {'+'|'-'} 'i'
480 if (*p == '+' || *p == '-')
481 return (p_end[-1] == 'I' || p_end[-1] == 'i')
482 && (p + 1 == p_end - 1
483 || is_other_number_syntax (p, p_end - 1 - p, radix, false));
484 /* Recognize the syntax REAL-NUMBER '@' REAL-NUMBER */
488 return is_other_number_syntax (p, p_end - p, radix, false);
494 /* Tests if a token represents a number.
495 Taken from guile-1.6.4/libguile/numbers.c:scm_istring2number(). */
497 is_number (const struct token *tp)
499 const char *str = tp->chars;
500 int len = tp->charcount;
501 enum { unknown, exact, inexact } exactness = unknown;
502 bool seen_radix_prefix = false;
503 bool seen_exactness_prefix = false;
506 if (*str == '+' || *str == '-')
508 while (len >= 2 && *str == '#')
513 if (seen_radix_prefix)
515 seen_radix_prefix = true;
518 if (seen_radix_prefix)
520 seen_radix_prefix = true;
523 if (seen_radix_prefix)
525 seen_radix_prefix = true;
528 if (seen_radix_prefix)
530 seen_radix_prefix = true;
533 if (seen_exactness_prefix)
536 seen_exactness_prefix = true;
539 if (seen_exactness_prefix)
542 seen_exactness_prefix = true;
550 if (exactness != inexact)
552 /* Try to parse an integer. */
553 if (is_integer_syntax (str, len, 10))
555 /* FIXME: Other Scheme implementations support exact rational numbers
556 or exact complex numbers. */
558 if (exactness != exact)
560 /* Try to parse a rational, floating-point or complex number. */
561 if (is_other_number_syntax (str, len, 10, true))
568 /* ========================= Accumulating comments ========================= */
572 static size_t bufmax;
573 static size_t buflen;
584 if (buflen >= bufmax)
586 bufmax = 2 * bufmax + 10;
587 buffer = xrealloc (buffer, bufmax);
589 buffer[buflen++] = c;
593 comment_line_end (size_t chars_to_remove)
595 buflen -= chars_to_remove;
597 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
599 if (chars_to_remove == 0 && buflen >= bufmax)
601 bufmax = 2 * bufmax + 10;
602 buffer = xrealloc (buffer, bufmax);
604 buffer[buflen] = '\0';
605 savable_comment_add (buffer);
609 /* These are for tracking whether comments count as immediately before
611 static int last_comment_line;
612 static int last_non_comment_line;
615 /* ========================= Accumulating messages ========================= */
618 static message_list_ty *mlp;
621 /* ========================== Reading of objects. ========================= */
624 /* We are only interested in symbols (e.g. gettext or ngettext) and strings.
625 Other objects need not to be represented precisely. */
628 t_symbol, /* symbol */
629 t_string, /* string */
630 t_other, /* other kind of real object */
631 t_dot, /* '.' pseudo object */
632 t_close, /* ')' pseudo object */
633 t_eof /* EOF marker */
638 enum object_type type;
639 struct token *token; /* for t_symbol and t_string */
640 int line_number_at_start; /* for t_string */
643 /* Free the memory pointed to by a 'struct object'. */
645 free_object (struct object *op)
647 if (op->type == t_symbol || op->type == t_string)
649 free_token (op->token);
654 /* Convert a t_symbol/t_string token to a char*. */
656 string_of_object (const struct object *op)
661 if (!(op->type == t_symbol || op->type == t_string))
663 n = op->token->charcount;
664 str = XNMALLOC (n + 1, char);
665 memcpy (str, op->token->chars, n);
670 /* Context lookup table. */
671 static flag_context_list_table_ty *flag_context_list_table;
673 /* Read the next object. */
675 read_object (struct object *op, flag_context_ty outer_context)
680 bool seen_underscore_prefix = false;
688 case ' ': case '\r': case '\f': case '\t':
692 /* Comments assumed to be grouped with a message must immediately
693 precede it, with no non-whitespace token on a line between
695 if (last_non_comment_line > last_comment_line)
696 savable_comment_reset ();
701 bool all_semicolons = true;
703 last_comment_line = line_number;
708 if (c == EOF || c == '\n')
711 all_semicolons = false;
714 /* We skip all leading white space, but not EOLs. */
715 if (!(buflen == 0 && (c == ' ' || c == '\t')))
719 comment_line_end (0);
725 int arg = 0; /* Current argument number. */
726 flag_context_list_iterator_ty context_iter;
727 const struct callshapes *shapes = NULL;
728 struct arglist_parser *argparser = NULL;
733 flag_context_ty inner_context;
736 inner_context = null_context;
739 inherited_context (outer_context,
740 flag_context_list_iterator_advance (
743 read_object (&inner, inner_context);
745 /* Recognize end of list. */
746 if (inner.type == t_close)
749 last_non_comment_line = line_number;
750 if (argparser != NULL)
751 arglist_parser_done (argparser, arg);
755 /* Dots are not allowed in every position.
758 /* EOF inside list is illegal.
760 if (inner.type == t_eof)
765 /* This is the function position. */
766 if (inner.type == t_symbol)
768 char *symbol_name = string_of_object (&inner);
771 if (hash_find_entry (&keywords,
772 symbol_name, strlen (symbol_name),
775 shapes = (const struct callshapes *) keyword_value;
777 argparser = arglist_parser_alloc (mlp, shapes);
780 flag_context_list_iterator (
781 flag_context_list_table_lookup (
782 flag_context_list_table,
783 symbol_name, strlen (symbol_name)));
788 context_iter = null_context_list_iterator;
792 /* These are the argument positions. */
793 if (argparser != NULL && inner.type == t_string)
794 arglist_parser_remember (argparser, arg,
795 string_of_object (&inner),
798 inner.line_number_at_start,
802 free_object (&inner);
804 if (argparser != NULL)
805 arglist_parser_done (argparser, arg);
808 last_non_comment_line = line_number;
812 /* Tell the caller about the end of list.
813 Unmatched closing parenthesis is illegal.
816 last_non_comment_line = line_number;
822 /* The ,@ handling inside lists is wrong anyway, because
823 ,@form expands to an unknown number of elements. */
824 if (c != EOF && c != '@')
833 read_object (&inner, null_context);
835 /* Dots and EOF are not allowed here. But be tolerant. */
837 free_object (&inner);
840 last_non_comment_line = line_number;
845 /* Dispatch macro handling. */
849 /* Invalid input. Be tolerant, no error message. */
857 case '(': /* Vector */
861 read_object (&inner, null_context);
862 /* Dots and EOF are not allowed here.
864 free_object (&inner);
866 last_non_comment_line = line_number;
870 case 'T': case 't': /* Boolean true */
871 case 'F': case 'f': /* Boolean false */
873 last_non_comment_line = line_number;
885 read_token (&token, '#');
886 if (is_number (&token))
891 last_non_comment_line = line_number;
896 if (token.charcount == 2
897 && (token.chars[1] == 'e' || token.chars[1] == 'i'))
903 /* Homogenous vector syntax, see arrays.scm. */
904 case 'a': /* Vectors of char */
905 case 'c': /* Vectors of complex */
906 /*case 'e':*/ /* Vectors of long */
907 case 'h': /* Vectors of short */
908 /*case 'i':*/ /* Vectors of double-float */
909 case 'l': /* Vectors of long long */
910 case 's': /* Vectors of single-float */
911 case 'u': /* Vectors of unsigned long */
912 case 'y': /* Vectors of byte */
915 read_object (&inner, null_context);
916 /* Dots and EOF are not allowed here.
919 free_object (&inner);
921 last_non_comment_line = line_number;
925 /* Unknown # object. But be tolerant. */
928 last_non_comment_line = line_number;
934 /* Block comment '#! ... !#'. See
935 <http://www.gnu.org/software/guile/manual/html_node/Block-Comments.html>. */
952 comment_line_end (0);
960 /* We skip all leading white space. */
961 if (!(buflen == 0 && (c == ' ' || c == '\t')))
965 comment_line_end (1);
973 /* EOF not allowed here. But be tolerant. */
977 last_comment_line = line_number;
982 /* Block comment '#| ... |#'. See
983 <http://www.gnu.org/software/guile/manual/html_node/Block-Comments.html>
984 and <http://srfi.schemers.org/srfi-30/srfi-30.html>. */
1004 comment_line_end (0);
1030 /* We skip all leading white space. */
1031 if (!(buflen == 0 && (c == ' ' || c == '\t')))
1035 comment_line_end (1);
1043 /* EOF not allowed here. But be tolerant. */
1047 last_comment_line = line_number;
1055 read_token (&token, c);
1056 /* The token should consists only of '0' and '1', except
1057 for the initial '*'. But be tolerant. */
1058 free_token (&token);
1060 last_non_comment_line = line_number;
1065 /* Symbol with multiple escapes: #{...}# */
1067 op->token = XMALLOC (struct token);
1069 init_token (op->token);
1092 grow_token (op->token);
1093 op->token->chars[op->token->charcount++] = c;
1096 op->type = t_symbol;
1097 last_non_comment_line = line_number;
1108 read_token (&token, c);
1109 free_token (&token);
1112 last_non_comment_line = line_number;
1116 case ':': /* Keyword. */
1117 case '&': /* Deprecated keyword, installed in optargs.scm. */
1120 read_token (&token, '-');
1121 free_token (&token);
1123 last_non_comment_line = line_number;
1127 /* The following are installed through read-hash-extend. */
1130 case '0': case '1': case '2': case '3': case '4':
1131 case '5': case '6': case '7': case '8': case '9':
1132 /* Multidimensional array syntax: #nx(...) where
1134 x ::= {'a'|'b'|'c'|'e'|'i'|'s'|'u'}
1138 while (c >= '0' && c <= '9');
1139 /* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}.
1142 case '\'': /* boot-9.scm */
1143 case '.': /* boot-9.scm */
1144 case ',': /* srfi-10.scm */
1146 struct object inner;
1147 read_object (&inner, null_context);
1148 /* Dots and EOF are not allowed here.
1150 free_object (&inner);
1152 last_non_comment_line = line_number;
1159 last_non_comment_line = line_number;
1167 /* GIMP script-fu extension: '_' before a string literal is
1168 considered a gettext call on the string. */
1172 /* Invalid input. Be tolerant, no error message. */
1181 /* If '_' is not followed by a string literal,
1182 consider it a part of symbol. */
1183 op->token = XMALLOC (struct token);
1184 read_token (op->token, '_');
1185 op->type = t_symbol;
1186 last_non_comment_line = line_number;
1189 seen_underscore_prefix = true;
1195 op->token = XMALLOC (struct token);
1196 init_token (op->token);
1197 op->line_number_at_start = line_number;
1202 /* Invalid input. Be tolerant, no error message. */
1210 /* Invalid input. Be tolerant, no error message. */
1241 grow_token (op->token);
1242 op->token->chars[op->token->charcount++] = c;
1244 op->type = t_string;
1246 if (seen_underscore_prefix || extract_all)
1250 pos.file_name = logical_file_name;
1251 pos.line_number = op->line_number_at_start;
1252 remember_a_message (mlp, NULL, string_of_object (op),
1253 null_context, &pos, NULL, savable_comment);
1255 last_non_comment_line = line_number;
1259 case '0': case '1': case '2': case '3': case '4':
1260 case '5': case '6': case '7': case '8': case '9':
1261 case '+': case '-': case '.':
1262 /* Read a number or symbol token. */
1263 op->token = XMALLOC (struct token);
1264 read_token (op->token, c);
1265 if (op->token->charcount == 1 && op->token->chars[0] == '.')
1267 free_token (op->token);
1271 else if (is_number (op->token))
1274 free_token (op->token);
1281 op->type = t_symbol;
1283 last_non_comment_line = line_number;
1288 /* Read a symbol token. */
1289 op->token = XMALLOC (struct token);
1290 read_token (op->token, c);
1291 op->type = t_symbol;
1292 last_non_comment_line = line_number;
1300 extract_scheme (FILE *f,
1301 const char *real_filename, const char *logical_filename,
1302 flag_context_list_table_ty *flag_table,
1303 msgdomain_list_ty *mdlp)
1305 mlp = mdlp->item[0]->messages;
1308 real_file_name = real_filename;
1309 logical_file_name = xstrdup (logical_filename);
1312 last_comment_line = -1;
1313 last_non_comment_line = -1;
1315 flag_context_list_table = flag_table;
1319 /* Eat tokens until eof is seen. When read_object returns
1320 due to an unbalanced closing parenthesis, just restart it. */
1323 struct object toplevel_object;
1325 read_object (&toplevel_object, null_context);
1327 if (toplevel_object.type == t_eof)
1330 free_object (&toplevel_object);
1334 /* Close scanner. */
1336 real_file_name = NULL;
1337 logical_file_name = NULL;