1 /* xgettext Scheme backend.
2 Copyright (C) 2004-2009 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <bruno@clisp.org>, 2004-2005.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
39 #define _(s) gettext(s)
42 /* The Scheme syntax is described in R5RS. It is implemented in
43 guile-1.6.4/libguile/read.c.
44 Since we are interested only in strings and in forms similar to
46 or (ngettext msgid msgid_plural ...)
47 we make the following simplifications:
49 - Assume the keywords and strings are in an ASCII compatible encoding.
50 This means we can read the input file one byte at a time, instead of
51 one character at a time. No need to worry about multibyte characters:
52 If they occur as part of identifiers, they most probably act as
53 constituent characters, and the byte based approach will do the same.
55 - Assume the read-hash-procedures is in the default state.
56 Non-standard reader extensions are mostly used to read data, not programs.
58 The remaining syntax rules are:
60 - The syntax code assigned to each character, and how tokens are built
61 up from characters (single escape, multiple escape etc.).
63 - Comment syntax: ';' and '#! ... \n!#\n'.
65 - String syntax: "..." with single escapes.
67 - Read macros and dispatch macro character '#'. Needed to be able to
68 tell which is the n-th argument of a function call.
73 /* ====================== Keyword set customization. ====================== */
75 /* If true extract all strings. */
76 static bool extract_all = false;
78 static hash_table keywords;
79 static bool default_keywords = true;
83 x_scheme_extract_all ()
90 x_scheme_keyword (const char *name)
93 default_keywords = false;
97 struct callshape shape;
100 if (keywords.table == NULL)
101 hash_init (&keywords, 100);
103 split_keywordspec (name, &end, &shape);
105 /* The characters between name and end should form a valid Lisp symbol.
106 Extract the symbol name part. */
107 colon = strchr (name, ':');
108 if (colon != NULL && colon < end)
111 if (name < end && *name == ':')
113 colon = strchr (name, ':');
114 if (colon != NULL && colon < end)
118 insert_keyword_callshape (&keywords, name, end - name, &shape);
122 /* Finish initializing the keywords hash table.
123 Called after argument processing, before each file is processed. */
127 if (default_keywords)
129 /* When adding new keywords here, also update the documentation in
131 x_scheme_keyword ("gettext"); /* libguile/i18n.c */
132 x_scheme_keyword ("ngettext:1,2"); /* libguile/i18n.c */
133 x_scheme_keyword ("gettext-noop");
134 default_keywords = false;
139 init_flag_table_scheme ()
141 xgettext_record_flag ("gettext:1:pass-scheme-format");
142 xgettext_record_flag ("ngettext:1:pass-scheme-format");
143 xgettext_record_flag ("ngettext:2:pass-scheme-format");
144 xgettext_record_flag ("gettext-noop:1:pass-scheme-format");
145 xgettext_record_flag ("format:2:scheme-format");
149 /* ======================== Reading of characters. ======================== */
151 /* Real filename, used in error messages about the input file. */
152 static const char *real_file_name;
154 /* Logical filename and line number, used to label the extracted messages. */
155 static char *logical_file_name;
156 static int line_number;
158 /* The input file stream. */
162 /* Fetch the next character from the input file. */
171 error (EXIT_FAILURE, errno, _("\
172 error while reading \"%s\""), real_file_name);
180 /* Put back the last fetched character, not EOF. */
190 /* ========================== Reading of tokens. ========================== */
193 /* A token consists of a sequence of characters. */
196 int allocated; /* number of allocated 'token_char's */
197 int charcount; /* number of used 'token_char's */
198 char *chars; /* the token's constituents */
201 /* Initialize a 'struct token'. */
203 init_token (struct token *tp)
206 tp->chars = XNMALLOC (tp->allocated, char);
210 /* Free the memory pointed to by a 'struct token'. */
212 free_token (struct token *tp)
217 /* Ensure there is enough room in the token for one more character. */
219 grow_token (struct token *tp)
221 if (tp->charcount == tp->allocated)
224 tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
228 /* Read the next token. 'first' is the first character, which has already
231 read_token (struct token *tp, int first)
236 tp->chars[tp->charcount++] = first;
244 if (c == ' ' || c == '\r' || c == '\f' || c == '\t' || c == '\n'
245 || c == '"' || c == '(' || c == ')' || c == ';')
251 tp->chars[tp->charcount++] = c;
255 /* Tests if a token represents an integer.
256 Taken from guile-1.6.4/libguile/numbers.c:scm_istr2int(). */
258 is_integer_syntax (const char *str, int len, int radix)
261 const char *p_end = str + len;
263 /* The accepted syntax is
265 where DIGIT is a hexadecimal digit whose value is below radix. */
269 if (*p == '+' || *p == '-')
279 if (c >= '0' && c <= '9')
281 else if (c >= 'A' && c <= 'F')
283 else if (c >= 'a' && c <= 'f')
294 /* Tests if a token represents a rational, floating-point or complex number.
295 If unconstrained is false, only real numbers are accepted; otherwise,
296 complex numbers are accepted as well.
297 Taken from guile-1.6.4/libguile/numbers.c:scm_istr2flo(). */
299 is_other_number_syntax (const char *str, int len, int radix, bool unconstrained)
302 const char *p_end = str + len;
306 /* The accepted syntaxes are:
307 for a floating-point number:
308 ['+'|'-'] DIGIT+ [EXPONENT]
309 ['+'|'-'] DIGIT* '.' DIGIT+ [EXPONENT]
310 where EXPONENT ::= ['d'|'e'|'f'|'l'|'s'] DIGIT+
311 (Dot and exponent are allowed only if radix is 10.)
312 for a rational number:
313 ['+'|'-'] DIGIT+ '/' DIGIT+
314 for a complex number:
315 REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
316 REAL-NUMBER {'+'|'-'} 'i'
317 {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
319 REAL-NUMBER '@' REAL-NUMBER
323 /* Parse leading sign. */
325 if (*p == '+' || *p == '-')
331 /* Recognize complex number syntax: {'+'|'-'} 'i' */
332 if (unconstrained && (*p == 'I' || *p == 'i') && p + 1 == p_end)
335 /* Parse digits before dot or exponent or slash. */
341 if (c >= '0' && c <= '9')
343 else if (c >= 'A' && c <= 'F')
345 if (c >= 'D' && radix == 10) /* exponent? */
349 else if (c >= 'a' && c <= 'f')
351 if (c >= 'd' && radix == 10) /* exponent? */
363 /* If p == p_end, we know that seen_digits = true, and the number is an
364 integer without exponent. */
367 /* If we have no digits so far, we need a decimal point later. */
368 if (!seen_digits && !(*p == '.' && radix == 10))
370 /* Trailing '#' signs are equivalent to zeroes. */
371 while (p < p_end && *p == '#')
377 /* Parse digits after the slash. */
378 bool all_zeroes = true;
380 for (; p < p_end; p++)
384 if (c >= '0' && c <= '9')
386 else if (c >= 'A' && c <= 'F')
388 else if (c >= 'a' && c <= 'f')
397 /* A zero denominator is not allowed. */
400 /* Trailing '#' signs are equivalent to zeroes. */
401 while (p < p_end && *p == '#')
408 /* Decimal point notation. */
411 /* Parse digits after the decimal point. */
413 for (; p < p_end; p++)
417 if (c >= '0' && c <= '9')
422 /* Digits are required before or after the decimal point. */
425 /* Trailing '#' signs are equivalent to zeroes. */
426 while (p < p_end && *p == '#')
431 /* Parse exponent. */
444 if (*p == '+' || *p == '-')
450 if (!(*p >= '0' && *p <= '9'))
457 if (!(*p >= '0' && *p <= '9'))
470 /* Recognize complex number syntax. */
473 /* Recognize the syntax {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i' */
474 if (seen_sign && (*p == 'I' || *p == 'i') && p + 1 == p_end)
476 /* Recognize the syntaxes
477 REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
478 REAL-NUMBER {'+'|'-'} 'i'
480 if (*p == '+' || *p == '-')
481 return (p_end[-1] == 'I' || p_end[-1] == 'i')
482 && (p + 1 == p_end - 1
483 || is_other_number_syntax (p, p_end - 1 - p, radix, false));
484 /* Recognize the syntax REAL-NUMBER '@' REAL-NUMBER */
488 return is_other_number_syntax (p, p_end - p, radix, false);
494 /* Tests if a token represents a number.
495 Taken from guile-1.6.4/libguile/numbers.c:scm_istring2number(). */
497 is_number (const struct token *tp)
499 const char *str = tp->chars;
500 int len = tp->charcount;
502 enum { unknown, exact, inexact } exactness = unknown;
503 bool seen_radix_prefix = false;
504 bool seen_exactness_prefix = false;
507 if (*str == '+' || *str == '-')
509 while (len >= 2 && *str == '#')
514 if (seen_radix_prefix)
517 seen_radix_prefix = true;
520 if (seen_radix_prefix)
523 seen_radix_prefix = true;
526 if (seen_radix_prefix)
529 seen_radix_prefix = true;
532 if (seen_radix_prefix)
535 seen_radix_prefix = true;
538 if (seen_exactness_prefix)
541 seen_exactness_prefix = true;
544 if (seen_exactness_prefix)
547 seen_exactness_prefix = true;
555 if (exactness != inexact)
557 /* Try to parse an integer. */
558 if (is_integer_syntax (str, len, 10))
560 /* FIXME: Other Scheme implementations support exact rational numbers
561 or exact complex numbers. */
563 if (exactness != exact)
565 /* Try to parse a rational, floating-point or complex number. */
566 if (is_other_number_syntax (str, len, 10, true))
573 /* ========================= Accumulating comments ========================= */
577 static size_t bufmax;
578 static size_t buflen;
589 if (buflen >= bufmax)
591 bufmax = 2 * bufmax + 10;
592 buffer = xrealloc (buffer, bufmax);
594 buffer[buflen++] = c;
598 comment_line_end (size_t chars_to_remove)
600 buflen -= chars_to_remove;
602 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
604 if (chars_to_remove == 0 && buflen >= bufmax)
606 bufmax = 2 * bufmax + 10;
607 buffer = xrealloc (buffer, bufmax);
609 buffer[buflen] = '\0';
610 savable_comment_add (buffer);
614 /* These are for tracking whether comments count as immediately before
616 static int last_comment_line;
617 static int last_non_comment_line;
620 /* ========================= Accumulating messages ========================= */
623 static message_list_ty *mlp;
626 /* ========================== Reading of objects. ========================= */
629 /* We are only interested in symbols (e.g. gettext or ngettext) and strings.
630 Other objects need not to be represented precisely. */
633 t_symbol, /* symbol */
634 t_string, /* string */
635 t_other, /* other kind of real object */
636 t_dot, /* '.' pseudo object */
637 t_close, /* ')' pseudo object */
638 t_eof /* EOF marker */
643 enum object_type type;
644 struct token *token; /* for t_symbol and t_string */
645 int line_number_at_start; /* for t_string */
648 /* Free the memory pointed to by a 'struct object'. */
650 free_object (struct object *op)
652 if (op->type == t_symbol || op->type == t_string)
654 free_token (op->token);
659 /* Convert a t_symbol/t_string token to a char*. */
661 string_of_object (const struct object *op)
666 if (!(op->type == t_symbol || op->type == t_string))
668 n = op->token->charcount;
669 str = XNMALLOC (n + 1, char);
670 memcpy (str, op->token->chars, n);
675 /* Context lookup table. */
676 static flag_context_list_table_ty *flag_context_list_table;
678 /* Read the next object. */
680 read_object (struct object *op, flag_context_ty outer_context)
692 case ' ': case '\r': case '\f': case '\t':
696 /* Comments assumed to be grouped with a message must immediately
697 precede it, with no non-whitespace token on a line between
699 if (last_non_comment_line > last_comment_line)
700 savable_comment_reset ();
705 bool all_semicolons = true;
707 last_comment_line = line_number;
712 if (c == EOF || c == '\n')
715 all_semicolons = false;
718 /* We skip all leading white space, but not EOLs. */
719 if (!(buflen == 0 && (c == ' ' || c == '\t')))
723 comment_line_end (0);
729 int arg = 0; /* Current argument number. */
730 flag_context_list_iterator_ty context_iter;
731 const struct callshapes *shapes = NULL;
732 struct arglist_parser *argparser = NULL;
737 flag_context_ty inner_context;
740 inner_context = null_context;
743 inherited_context (outer_context,
744 flag_context_list_iterator_advance (
747 read_object (&inner, inner_context);
749 /* Recognize end of list. */
750 if (inner.type == t_close)
753 last_non_comment_line = line_number;
754 if (argparser != NULL)
755 arglist_parser_done (argparser, arg);
759 /* Dots are not allowed in every position.
762 /* EOF inside list is illegal.
764 if (inner.type == t_eof)
769 /* This is the function position. */
770 if (inner.type == t_symbol)
772 char *symbol_name = string_of_object (&inner);
775 if (hash_find_entry (&keywords,
776 symbol_name, strlen (symbol_name),
779 shapes = (const struct callshapes *) keyword_value;
781 argparser = arglist_parser_alloc (mlp, shapes);
784 flag_context_list_iterator (
785 flag_context_list_table_lookup (
786 flag_context_list_table,
787 symbol_name, strlen (symbol_name)));
792 context_iter = null_context_list_iterator;
796 /* These are the argument positions. */
797 if (argparser != NULL && inner.type == t_string)
798 arglist_parser_remember (argparser, arg,
799 string_of_object (&inner),
802 inner.line_number_at_start,
806 free_object (&inner);
808 if (argparser != NULL)
809 arglist_parser_done (argparser, arg);
812 last_non_comment_line = line_number;
816 /* Tell the caller about the end of list.
817 Unmatched closing parenthesis is illegal.
820 last_non_comment_line = line_number;
826 /* The ,@ handling inside lists is wrong anyway, because
827 ,@form expands to an unknown number of elements. */
828 if (c != EOF && c != '@')
837 read_object (&inner, null_context);
839 /* Dots and EOF are not allowed here. But be tolerant. */
841 free_object (&inner);
844 last_non_comment_line = line_number;
849 /* Dispatch macro handling. */
853 /* Invalid input. Be tolerant, no error message. */
861 case '(': /* Vector */
865 read_object (&inner, null_context);
866 /* Dots and EOF are not allowed here.
868 free_object (&inner);
870 last_non_comment_line = line_number;
874 case 'T': case 't': /* Boolean true */
875 case 'F': case 'f': /* Boolean false */
877 last_non_comment_line = line_number;
889 read_token (&token, '#');
890 if (is_number (&token))
895 last_non_comment_line = line_number;
900 if (token.charcount == 2
901 && (token.chars[1] == 'e' || token.chars[1] == 'i'))
907 /* Homogenous vector syntax, see arrays.scm. */
908 case 'a': /* Vectors of char */
909 case 'c': /* Vectors of complex */
910 /*case 'e':*/ /* Vectors of long */
911 case 'h': /* Vectors of short */
912 /*case 'i':*/ /* Vectors of double-float */
913 case 'l': /* Vectors of long long */
914 case 's': /* Vectors of single-float */
915 case 'u': /* Vectors of unsigned long */
916 case 'y': /* Vectors of byte */
919 read_object (&inner, null_context);
920 /* Dots and EOF are not allowed here.
923 free_object (&inner);
925 last_non_comment_line = line_number;
929 /* Unknown # object. But be tolerant. */
932 last_non_comment_line = line_number;
938 /* Block comment '#! ... \n!#\n'. We don't extract it
939 because it's only used to introduce scripts on Unix. */
949 /* EOF is not allowed here. But be tolerant. */
951 if (last3 == '\n' && last2 == '!' && last1 == '#'
965 read_token (&token, c);
966 /* The token should consists only of '0' and '1', except
967 for the initial '*'. But be tolerant. */
970 last_non_comment_line = line_number;
975 /* Symbol with multiple escapes: #{...}# */
977 op->token = XMALLOC (struct token);
979 init_token (op->token);
1002 grow_token (op->token);
1003 op->token->chars[op->token->charcount++] = c;
1006 op->type = t_symbol;
1007 last_non_comment_line = line_number;
1018 read_token (&token, c);
1019 free_token (&token);
1022 last_non_comment_line = line_number;
1026 case ':': /* Keyword. */
1027 case '&': /* Deprecated keyword, installed in optargs.scm. */
1030 read_token (&token, '-');
1031 free_token (&token);
1033 last_non_comment_line = line_number;
1037 /* The following are installed through read-hash-extend. */
1040 case '0': case '1': case '2': case '3': case '4':
1041 case '5': case '6': case '7': case '8': case '9':
1042 /* Multidimensional array syntax: #nx(...) where
1044 x ::= {'a'|'b'|'c'|'e'|'i'|'s'|'u'}
1048 while (c >= '0' && c <= '9');
1049 /* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}.
1052 case '\'': /* boot-9.scm */
1053 case '.': /* boot-9.scm */
1054 case ',': /* srfi-10.scm */
1056 struct object inner;
1057 read_object (&inner, null_context);
1058 /* Dots and EOF are not allowed here.
1060 free_object (&inner);
1062 last_non_comment_line = line_number;
1069 last_non_comment_line = line_number;
1078 op->token = XMALLOC (struct token);
1079 init_token (op->token);
1080 op->line_number_at_start = line_number;
1085 /* Invalid input. Be tolerant, no error message. */
1093 /* Invalid input. Be tolerant, no error message. */
1124 grow_token (op->token);
1125 op->token->chars[op->token->charcount++] = c;
1127 op->type = t_string;
1133 pos.file_name = logical_file_name;
1134 pos.line_number = op->line_number_at_start;
1135 remember_a_message (mlp, NULL, string_of_object (op),
1136 null_context, &pos, NULL, savable_comment);
1138 last_non_comment_line = line_number;
1142 case '0': case '1': case '2': case '3': case '4':
1143 case '5': case '6': case '7': case '8': case '9':
1144 case '+': case '-': case '.':
1145 /* Read a number or symbol token. */
1146 op->token = XMALLOC (struct token);
1147 read_token (op->token, c);
1148 if (op->token->charcount == 1 && op->token->chars[0] == '.')
1150 free_token (op->token);
1154 else if (is_number (op->token))
1157 free_token (op->token);
1164 op->type = t_symbol;
1166 last_non_comment_line = line_number;
1171 /* Read a symbol token. */
1172 op->token = XMALLOC (struct token);
1173 read_token (op->token, c);
1174 op->type = t_symbol;
1175 last_non_comment_line = line_number;
1183 extract_scheme (FILE *f,
1184 const char *real_filename, const char *logical_filename,
1185 flag_context_list_table_ty *flag_table,
1186 msgdomain_list_ty *mdlp)
1188 mlp = mdlp->item[0]->messages;
1191 real_file_name = real_filename;
1192 logical_file_name = xstrdup (logical_filename);
1195 last_comment_line = -1;
1196 last_non_comment_line = -1;
1198 flag_context_list_table = flag_table;
1202 /* Eat tokens until eof is seen. When read_object returns
1203 due to an unbalanced closing parenthesis, just restart it. */
1206 struct object toplevel_object;
1208 read_object (&toplevel_object, null_context);
1210 if (toplevel_object.type == t_eof)
1213 free_object (&toplevel_object);
1217 /* Close scanner. */
1219 real_file_name = NULL;
1220 logical_file_name = NULL;