1 /* xgettext Python backend.
2 Copyright (C) 2002-2003, 2005-2015 Free Software Foundation,
5 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
37 #include "error-progname.h"
41 #include "xvasprintf.h"
45 #include "po-charset.h"
/* Shorthand: _(s) expands to gettext (s); used to wrap the user-visible
   message strings in this file. */
50 #define _(s) gettext(s)
/* Larger of A and B. Function-like macro: the argument that is returned
   is evaluated twice, so avoid arguments with side effects. */
52 #define max(a,b) ((a) > (b) ? (a) : (b))
/* Number of elements of A, computed as total size divided by the size of
   the first element. Only meaningful when A is an actual array, not a
   pointer. */
54 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
57 /* The Python syntax is defined in the Python Reference Manual
58 /usr/share/doc/packages/python/html/ref/index.html.
59 See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
60 Python-2.0/Objects/unicodeobject.c. */
63 /* ====================== Keyword set customization. ====================== */
65 /* If true extract all strings. */
66 static bool extract_all = false;
68 static hash_table keywords;
69 static bool default_keywords = true;
73 x_python_extract_all ()
80 x_python_keyword (const char *name)
83 default_keywords = false;
87 struct callshape shape;
90 if (keywords.table == NULL)
91 hash_init (&keywords, 100);
93 split_keywordspec (name, &end, &shape);
95 /* The characters between name and end should form a valid C identifier.
96 A colon means an invalid parse in split_keywordspec(). */
97 colon = strchr (name, ':');
98 if (colon == NULL || colon >= end)
99 insert_keyword_callshape (&keywords, name, end - name, &shape);
103 /* Finish initializing the keywords hash table.
104 Called after argument processing, before each file is processed. */
108 if (default_keywords)
110 /* When adding new keywords here, also update the documentation in
112 x_python_keyword ("gettext");
113 x_python_keyword ("ugettext");
114 x_python_keyword ("dgettext:2");
115 x_python_keyword ("ngettext:1,2");
116 x_python_keyword ("ungettext:1,2");
117 x_python_keyword ("dngettext:2,3");
118 x_python_keyword ("_");
119 default_keywords = false;
124 init_flag_table_python ()
126 xgettext_record_flag ("gettext:1:pass-python-format");
127 xgettext_record_flag ("ugettext:1:pass-python-format");
128 xgettext_record_flag ("dgettext:2:pass-python-format");
129 xgettext_record_flag ("ngettext:1:pass-python-format");
130 xgettext_record_flag ("ngettext:2:pass-python-format");
131 xgettext_record_flag ("ungettext:1:pass-python-format");
132 xgettext_record_flag ("ungettext:2:pass-python-format");
133 xgettext_record_flag ("dngettext:2:pass-python-format");
134 xgettext_record_flag ("dngettext:3:pass-python-format");
135 xgettext_record_flag ("_:1:pass-python-format");
136 /* xgettext_record_flag ("%:1:python-format"); // % is an infix operator! */
138 xgettext_record_flag ("gettext:1:pass-python-brace-format");
139 xgettext_record_flag ("ugettext:1:pass-python-brace-format");
140 xgettext_record_flag ("dgettext:2:pass-python-brace-format");
141 xgettext_record_flag ("ngettext:1:pass-python-brace-format");
142 xgettext_record_flag ("ngettext:2:pass-python-brace-format");
143 xgettext_record_flag ("ungettext:1:pass-python-brace-format");
144 xgettext_record_flag ("ungettext:2:pass-python-brace-format");
145 xgettext_record_flag ("dngettext:2:pass-python-brace-format");
146 xgettext_record_flag ("dngettext:3:pass-python-brace-format");
147 xgettext_record_flag ("_:1:pass-python-brace-format");
148 /* xgettext_record_flag ("format:1:python-brace-format"); */
152 /* ======================== Reading of characters. ======================== */
154 /* Real filename, used in error messages about the input file. */
155 static const char *real_file_name;
157 /* Logical filename and line number, used to label the extracted messages. */
158 static char *logical_file_name;
159 static int line_number;
161 /* The input file stream. */
165 /* 0. Terminate line by \n, regardless of whether the external
166 representation of a line terminator is CR (Mac) or CR/LF
167 (DOS/Windows), as Python treats them equally. */
177 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
186 if (c1 != EOF && c1 != '\n')
189 /* Seen line terminator CR or CR/LF. */
196 /* Supports only one pushback character, and not '\n'. */
198 phase0_ungetc (int c)
205 /* 1. line_number handling. */
207 /* Maximum used, roughly a safer MB_LEN_MAX. */
208 #define MAX_PHASE1_PUSHBACK 16
209 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
210 static int phase1_pushback_length;
212 /* Read the next single byte from the input file. */
218 if (phase1_pushback_length)
219 c = phase1_pushback[--phase1_pushback_length];
229 /* Supports MAX_PHASE1_PUSHBACK characters of pushback. */
231 phase1_ungetc (int c)
238 if (phase1_pushback_length == SIZEOF (phase1_pushback))
240 phase1_pushback[phase1_pushback_length++] = c;
245 /* Phase 2: Conversion to Unicode.
246 This is done early because PEP 0263 specifies that conversion to Unicode
247 conceptually occurs before tokenization. A test case where it matters
248 is with encodings like BIG5: when a double-byte character ending in 0x5C
249 is followed by '\' or 'u0021', the tokenizer must not treat the second
250 half of the double-byte character as a backslash. */
252 /* End-of-file indicator for functions returning an UCS-4 character. */
255 static lexical_context_ty lexical_context;
257 static int phase2_pushback[max (9, UNINAME_MAX + 3)];
258 static int phase2_pushback_length;
260 /* Read the next Unicode UCS-4 character from the input file. */
264 if (phase2_pushback_length)
265 return phase2_pushback[--phase2_pushback_length];
267 if (xgettext_current_source_encoding == po_charset_ascii)
269 int c = phase1_getc ();
274 multiline_error (xstrdup (""),
275 xasprintf ("%s\n%s\n",
276 non_ascii_error_message (lexical_context,
280 Please specify the source encoding through --from-code or through a comment\n\
281 as specified in http://www.python.org/peps/pep-0263.html.\n")));
286 else if (xgettext_current_source_encoding != po_charset_utf8)
289 /* Use iconv on an increasing number of bytes. Read only as many bytes
290 through phase1_getc as needed. This is needed to give reasonable
291 interactive behaviour when fp is connected to an interactive tty. */
292 unsigned char buf[MAX_PHASE1_PUSHBACK];
294 int c = phase1_getc ();
297 buf[0] = (unsigned char) c;
302 unsigned char scratchbuf[6];
303 const char *inptr = (const char *) &buf[0];
304 size_t insize = bufcount;
305 char *outptr = (char *) &scratchbuf[0];
306 size_t outsize = sizeof (scratchbuf);
308 size_t res = iconv (xgettext_current_source_iconv,
309 (ICONV_CONST char **) &inptr, &insize,
311 /* We expect that a character has been produced if and only if
312 some input bytes have been consumed. */
313 if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
315 if (outsize == sizeof (scratchbuf))
317 /* No character has been produced. Must be an error. */
318 if (res != (size_t)(-1))
323 /* An invalid multibyte sequence was encountered. */
324 multiline_error (xstrdup (""),
326 %s:%d: Invalid multibyte sequence.\n\
327 Please specify the correct source encoding through --from-code or through a\n\
328 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
329 real_file_name, line_number));
332 else if (errno == EINVAL)
334 /* An incomplete multibyte character. */
337 if (bufcount == MAX_PHASE1_PUSHBACK)
339 /* An overlong incomplete multibyte sequence was
341 multiline_error (xstrdup (""),
343 %s:%d: Long incomplete multibyte sequence.\n\
344 Please specify the correct source encoding through --from-code or through a\n\
345 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
346 real_file_name, line_number));
350 /* Read one more byte and retry iconv. */
354 multiline_error (xstrdup (""),
356 %s:%d: Incomplete multibyte sequence at end of file.\n\
357 Please specify the correct source encoding through --from-code or through a\n\
358 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
359 real_file_name, line_number));
364 multiline_error (xstrdup (""),
366 %s:%d: Incomplete multibyte sequence at end of line.\n\
367 Please specify the correct source encoding through --from-code or through a\n\
368 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
369 real_file_name, line_number - 1));
372 buf[bufcount++] = (unsigned char) c;
375 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
376 real_file_name, line_number);
380 size_t outbytes = sizeof (scratchbuf) - outsize;
381 size_t bytes = bufcount - insize;
384 /* We expect that one character has been produced. */
389 /* Push back the unused bytes. */
391 phase1_ungetc (buf[--insize]);
392 /* Convert the character from UTF-8 to UCS-4. */
393 if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
395 /* scratchbuf contains an out-of-range Unicode character
397 multiline_error (xstrdup (""),
399 %s:%d: Invalid multibyte sequence.\n\
400 Please specify the source encoding through --from-code or through a comment\n\
401 as specified in http://www.python.org/peps/pep-0263.html.\n"),
402 real_file_name, line_number));
409 /* If we don't have iconv(), the only supported values for
410 xgettext_global_source_encoding and thus also for
411 xgettext_current_source_encoding are ASCII and UTF-8. */
417 /* Read an UTF-8 encoded character. */
418 unsigned char buf[6];
439 && ((buf[1] ^ 0x80) < 0x40))
449 && ((buf[1] ^ 0x80) < 0x40)
450 && ((buf[2] ^ 0x80) < 0x40))
460 && ((buf[1] ^ 0x80) < 0x40)
461 && ((buf[2] ^ 0x80) < 0x40)
462 && ((buf[3] ^ 0x80) < 0x40))
472 && ((buf[1] ^ 0x80) < 0x40)
473 && ((buf[2] ^ 0x80) < 0x40)
474 && ((buf[3] ^ 0x80) < 0x40)
475 && ((buf[4] ^ 0x80) < 0x40))
484 u8_mbtouc (&uc, buf, count);
489 /* Supports max (9, UNINAME_MAX + 3) pushback characters. */
491 phase2_ungetc (int c)
495 if (phase2_pushback_length == SIZEOF (phase2_pushback))
497 phase2_pushback[phase2_pushback_length++] = c;
502 /* ========================= Accumulating strings. ======================== */
504 /* A string buffer type that allows appending Unicode characters.
505 Returns the entire string in UTF-8 encoding. */
507 struct unicode_string_buffer
509 /* The part of the string that has already been converted to UTF-8. */
512 size_t utf8_allocated;
515 /* Initialize a 'struct unicode_string_buffer' to empty. */
517 init_unicode_string_buffer (struct unicode_string_buffer *bp)
519 bp->utf8_buffer = NULL;
521 bp->utf8_allocated = 0;
524 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
526 unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
529 if (bp->utf8_buflen + count > bp->utf8_allocated)
531 size_t new_allocated = 2 * bp->utf8_allocated + 10;
532 if (new_allocated < bp->utf8_buflen + count)
533 new_allocated = bp->utf8_buflen + count;
534 bp->utf8_allocated = new_allocated;
535 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
539 /* Auxiliary function: Append a Unicode character to bp->utf8.
540 uc must be < 0x110000. */
542 unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
545 unsigned char utf8buf[6];
546 int count = u8_uctomb (utf8buf, uc, 6);
549 /* The caller should have ensured that uc is not out-of-range. */
552 unicode_string_buffer_append_unicode_grow (bp, count);
553 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
554 bp->utf8_buflen += count;
557 /* Return the string buffer's contents. */
559 unicode_string_buffer_result (struct unicode_string_buffer *bp)
561 /* NUL-terminate it. */
562 unicode_string_buffer_append_unicode_grow (bp, 1);
563 bp->utf8_buffer[bp->utf8_buflen] = '\0';
565 return bp->utf8_buffer;
568 /* Free the memory pointed to by a 'struct unicode_string_buffer'. */
570 free_unicode_string_buffer (struct unicode_string_buffer *bp)
572 free (bp->utf8_buffer);
576 /* ======================== Accumulating comments. ======================== */
579 /* Accumulating a single comment line. */
581 static struct unicode_string_buffer comment_buffer;
586 lexical_context = lc_comment;
587 comment_buffer.utf8_buflen = 0;
593 return (comment_buffer.utf8_buflen == 0);
599 unicode_string_buffer_append_unicode (&comment_buffer, c);
602 static inline const char *
605 char *buffer = unicode_string_buffer_result (&comment_buffer);
606 size_t buflen = strlen (buffer);
609 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
611 buffer[buflen] = '\0';
612 savable_comment_add (buffer);
613 lexical_context = lc_outside;
618 /* These are for tracking whether comments count as immediately before
620 static int last_comment_line;
621 static int last_non_comment_line;
624 /* ======================== Recognizing comments. ======================== */
627 /* Recognizing the "coding" comment.
628 As specified in PEP 0263, it takes the form
629 "coding" [":"|"="] {alphanumeric or "-" or "_" or "."}*
631 "set" "fileencoding" "=" {alphanumeric or "-" or "_" or "."}*
632 and is located in a comment in a line that
633 - is either the first or second line,
634 - is not a continuation line,
635 - in the first form, contains no other tokens except this comment. */
637 /* Canonicalized encoding name for the current input file. */
638 static const char *xgettext_current_file_source_encoding;
641 /* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
642 ASCII or UTF-8, when this conversion is a no-op). */
643 static iconv_t xgettext_current_file_source_iconv;
647 set_current_file_source_encoding (const char *canon_encoding)
649 xgettext_current_file_source_encoding = canon_encoding;
651 if (xgettext_current_file_source_encoding != po_charset_ascii
652 && xgettext_current_file_source_encoding != po_charset_utf8)
657 /* Avoid glibc-2.1 bug with EUC-KR. */
658 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
659 && !defined _LIBICONV_VERSION
660 if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
664 cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
665 if (cd == (iconv_t)(-1))
666 error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
667 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
668 and iconv() does not support this conversion."),
669 xgettext_current_file_source_encoding, po_charset_utf8,
670 basename (program_name));
671 xgettext_current_file_source_iconv = cd;
673 error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
674 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
675 This version was built without iconv()."),
676 xgettext_global_source_encoding, po_charset_utf8,
677 basename (program_name));
681 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
683 xgettext_current_source_iconv = xgettext_current_file_source_iconv;
688 try_to_extract_coding (const char *comment)
690 const char *p = c_strstr (comment, "coding");
695 if (*p == ':' || *p == '=')
698 while (*p == ' ' || *p == '\t')
701 const char *encoding_start = p;
703 while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
706 const char *encoding_end = p;
708 if (encoding_end > encoding_start)
710 /* Extract the encoding string. */
711 size_t encoding_len = encoding_end - encoding_start;
712 char *encoding = XNMALLOC (encoding_len + 1, char);
714 memcpy (encoding, encoding_start, encoding_len);
715 encoding[encoding_len] = '\0';
718 /* Canonicalize it. */
719 const char *canon_encoding = po_charset_canonicalize (encoding);
720 if (canon_encoding == NULL)
723 logical_file_name, line_number - 1, _("\
724 Unknown encoding \"%s\". Proceeding with ASCII instead."),
726 canon_encoding = po_charset_ascii;
730 set_current_file_source_encoding (canon_encoding);
741 /* Tracking whether the current line is a continuation line or contains a
742 non-blank character. */
743 static bool continuation_or_nonblank_line = false;
746 /* Phase 3: Outside strings, replace backslash-newline with nothing and a
747 comment with nothing. */
763 /* This shouldn't happen usually, because "A backslash is
764 illegal elsewhere on a line outside a string literal." */
767 /* Eat backslash-newline. */
768 continuation_or_nonblank_line = true;
775 last_comment_line = line_number;
780 if (c == UEOF || c == '\n')
782 /* We skip all leading white space, but not EOLs. */
783 if (!(comment_at_start () && (c == ' ' || c == '\t')))
786 comment = comment_line_end ();
787 if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
788 try_to_extract_coding (comment);
789 continuation_or_nonblank_line = false;
795 continuation_or_nonblank_line = false;
796 else if (!(c == ' ' || c == '\t' || c == '\f'))
797 continuation_or_nonblank_line = true;
803 /* Supports only one pushback character. */
805 phase3_ungetc (int c)
811 /* ========================= Accumulating strings. ======================== */
813 /* Return value of phase7_getuc when EOF is reached. */
815 #define P7_STRING_END (-2)
817 /* Convert an UTF-16 or UTF-32 code point to a return value that can be
818 distinguished from a single-byte return value. */
819 #define UNICODE(code) (0x100 + (code))
821 /* Test whether a return value of phase7_getuc designates a UTF-16 or
822 UTF-32 code point. */
823 #define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
825 /* Extract the UTF-16 or UTF-32 code of a return value that satisfies
827 #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
830 /* ========================== Reading of tokens. ========================== */
836 token_type_lparen, /* ( */
837 token_type_rparen, /* ) */
838 token_type_comma, /* , */
839 token_type_lbracket, /* [ */
840 token_type_rbracket, /* ] */
841 token_type_string, /* "abc", 'abc', """abc""", '''abc''' */
842 token_type_symbol, /* symbol, number */
843 token_type_plus, /* + */
844 token_type_other /* misc. operator */
846 typedef enum token_type_ty token_type_ty;
848 typedef struct token_ty token_ty;
852 char *string; /* for token_type_string, token_type_symbol */
853 refcounted_string_list_ty *comment; /* for token_type_string */
857 /* Free the memory pointed to by a 'struct token_ty'. */
859 free_token (token_ty *tp)
861 if (tp->type == token_type_string || tp->type == token_type_symbol)
863 if (tp->type == token_type_string)
864 drop_reference (tp->comment);
868 /* There are two different input syntaxes for strings, "abc" and r"abc",
869 and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
870 Which escape sequences are understood, i.e. what is interpreted specially
872 "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
874 u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
876 The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
877 \unnnn items. The \ooo and \xnn values are in the current source encoding
878 for byte strings, and Unicode code points for Unicode strings.
882 phase7_getuc (int quote_char,
883 bool triple, bool interpret_ansic, bool interpret_unicode,
884 unsigned int *backslash_counter)
890 /* Use phase 2, because phase 3 elides comments. */
896 if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
900 int c1 = phase2_getc ();
901 if (c1 == quote_char)
903 int c2 = phase2_getc ();
904 if (c2 == quote_char)
905 return P7_STRING_END;
912 return P7_STRING_END;
919 *backslash_counter = 0;
920 return UNICODE ('\n');
922 /* In r"..." and ur"..." strings, newline is only allowed
923 immediately after an odd number of backslashes (although the
924 backslashes are not interpreted!). */
925 if (!(interpret_ansic || (*backslash_counter & 1) == 0))
927 *backslash_counter = 0;
928 return UNICODE ('\n');
931 error_with_progname = false;
932 error (0, 0, _("%s:%d: warning: unterminated string"),
933 logical_file_name, line_number);
934 error_with_progname = true;
935 return P7_STRING_END;
940 *backslash_counter = 0;
944 /* Backslash handling. */
946 if (!interpret_ansic && !interpret_unicode)
948 ++*backslash_counter;
949 return UNICODE ('\\');
952 /* Dispatch according to the character following the backslash. */
956 ++*backslash_counter;
957 return UNICODE ('\\');
966 ++*backslash_counter;
969 *backslash_counter = 0;
972 *backslash_counter = 0;
973 return UNICODE ('\a');
975 *backslash_counter = 0;
976 return UNICODE ('\b');
978 *backslash_counter = 0;
979 return UNICODE ('\f');
981 *backslash_counter = 0;
982 return UNICODE ('\n');
984 *backslash_counter = 0;
985 return UNICODE ('\r');
987 *backslash_counter = 0;
988 return UNICODE ('\t');
990 *backslash_counter = 0;
991 return UNICODE ('\v');
992 case '0': case '1': case '2': case '3': case '4':
993 case '5': case '6': case '7':
1000 if (c >= '0' && c <= '7')
1002 n = (n << 3) + (c - '0');
1006 if (c >= '0' && c <= '7')
1007 n = (n << 3) + (c - '0');
1015 *backslash_counter = 0;
1016 if (interpret_unicode)
1019 return (unsigned char) n;
1023 int c1 = phase2_getc ();
1026 if (c1 >= '0' && c1 <= '9')
1028 else if (c1 >= 'A' && c1 <= 'F')
1030 else if (c1 >= 'a' && c1 <= 'f')
1037 int c2 = phase2_getc ();
1040 if (c2 >= '0' && c2 <= '9')
1042 else if (c2 >= 'A' && c2 <= 'F')
1044 else if (c2 >= 'a' && c2 <= 'f')
1051 int n = (n1 << 4) + n2;
1052 *backslash_counter = 0;
1053 if (interpret_unicode)
1056 return (unsigned char) n;
1063 ++*backslash_counter;
1064 return UNICODE ('\\');
1068 if (interpret_unicode)
1072 unsigned char buf[4];
1076 for (i = 0; i < 4; i++)
1078 int c1 = phase2_getc ();
1080 if (c1 >= '0' && c1 <= '9')
1081 n = (n << 4) + (c1 - '0');
1082 else if (c1 >= 'A' && c1 <= 'F')
1083 n = (n << 4) + (c1 - 'A' + 10);
1084 else if (c1 >= 'a' && c1 <= 'f')
1085 n = (n << 4) + (c1 - 'a' + 10);
1090 phase2_ungetc (buf[i]);
1092 ++*backslash_counter;
1093 return UNICODE ('\\');
1098 *backslash_counter = 0;
1102 if (interpret_ansic)
1106 unsigned char buf[8];
1110 for (i = 0; i < 8; i++)
1112 int c1 = phase2_getc ();
1114 if (c1 >= '0' && c1 <= '9')
1115 n = (n << 4) + (c1 - '0');
1116 else if (c1 >= 'A' && c1 <= 'F')
1117 n = (n << 4) + (c1 - 'A' + 10);
1118 else if (c1 >= 'a' && c1 <= 'f')
1119 n = (n << 4) + (c1 - 'a' + 10);
1124 phase2_ungetc (buf[i]);
1126 ++*backslash_counter;
1127 return UNICODE ('\\');
1134 *backslash_counter = 0;
1138 error_with_progname = false;
1139 error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1140 logical_file_name, line_number);
1141 error_with_progname = true;
1144 phase2_ungetc (buf[i]);
1146 ++*backslash_counter;
1147 return UNICODE ('\\');
1152 int c1 = phase2_getc ();
1155 unsigned char buf[UNINAME_MAX + 1];
1159 for (i = 0; i < UNINAME_MAX; i++)
1161 int c2 = phase2_getc ();
1162 if (!(c2 >= ' ' && c2 <= '~'))
1166 phase2_ungetc (buf[i]);
1169 ++*backslash_counter;
1170 return UNICODE ('\\');
1178 n = unicode_name_character ((char *) buf);
1179 if (n != UNINAME_INVALID)
1181 *backslash_counter = 0;
1185 phase2_ungetc ('}');
1187 phase2_ungetc (buf[i]);
1191 ++*backslash_counter;
1192 return UNICODE ('\\');
1198 ++*backslash_counter;
1199 return UNICODE ('\\');
1204 /* Combine characters into tokens. Discard whitespace except newlines at
1205 the end of logical lines. */
1207 /* Number of pending open parentheses/braces/brackets. */
1208 static int open_pbb;
1210 static token_ty phase5_pushback[2];
1211 static int phase5_pushback_length;
1214 phase5_get (token_ty *tp)
1218 if (phase5_pushback_length)
1220 *tp = phase5_pushback[--phase5_pushback_length];
1226 tp->line_number = line_number;
1232 tp->type = token_type_eof;
1238 /* Ignore whitespace and comments. */
1242 if (last_non_comment_line > last_comment_line)
1243 savable_comment_reset ();
1244 /* Ignore newline if and only if it is used for implicit line
1248 tp->type = token_type_other;
1252 last_non_comment_line = tp->line_number;
1258 int c1 = phase3_getc ();
1260 if (!(c1 >= '0' && c1 <= '9'))
1263 tp->type = token_type_other;
1268 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1269 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1270 case 'M': case 'N': case 'O': case 'P': case 'Q':
1271 case 'S': case 'T': case 'V': case 'W': case 'X':
1274 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1275 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1276 case 'm': case 'n': case 'o': case 'p': case 'q':
1277 case 's': case 't': case 'v': case 'w': case 'x':
1279 case '0': case '1': case '2': case '3': case '4':
1280 case '5': case '6': case '7': case '8': case '9':
1282 /* Symbol, or part of a number. */
1284 static char *buffer;
1291 if (bufpos >= bufmax)
1293 bufmax = 2 * bufmax + 10;
1294 buffer = xrealloc (buffer, bufmax);
1296 buffer[bufpos++] = c;
1300 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1301 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1302 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1303 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1306 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1307 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1308 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1309 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1311 case '0': case '1': case '2': case '3': case '4':
1312 case '5': case '6': case '7': case '8': case '9':
1320 if (bufpos >= bufmax)
1322 bufmax = 2 * bufmax + 10;
1323 buffer = xrealloc (buffer, bufmax);
1325 buffer[bufpos] = '\0';
1326 tp->string = xstrdup (buffer);
1327 tp->type = token_type_symbol;
1333 struct mixed_string_buffer *bp;
1335 bool interpret_ansic;
1336 bool interpret_unicode;
1338 unsigned int backslash_counter;
1342 int c1 = phase2_getc ();
1343 if (c1 == '"' || c1 == '\'')
1346 interpret_ansic = false;
1347 interpret_unicode = false;
1356 int c1 = phase2_getc ();
1357 if (c1 == '"' || c1 == '\'')
1360 interpret_ansic = true;
1361 interpret_unicode = true;
1364 if (c1 == 'R' || c1 == 'r')
1366 int c2 = phase2_getc ();
1367 if (c2 == '"' || c2 == '\'')
1370 interpret_ansic = false;
1371 interpret_unicode = true;
1380 case '"': case '\'':
1382 interpret_ansic = true;
1383 interpret_unicode = false;
1386 lexical_context = lc_string;
1388 int c1 = phase2_getc ();
1389 if (c1 == quote_char)
1391 int c2 = phase2_getc ();
1392 if (c2 == quote_char)
1403 backslash_counter = 0;
1404 /* Start accumulating the string. */
1405 bp = mixed_string_buffer_alloc (lexical_context,
1410 int uc = phase7_getuc (quote_char, triple, interpret_ansic,
1411 interpret_unicode, &backslash_counter);
1413 /* Keep line_number in sync. */
1414 bp->line_number = line_number;
1416 if (uc == P7_EOF || uc == P7_STRING_END)
1419 if (IS_UNICODE (uc))
1421 assert (UNICODE_VALUE (uc) >= 0
1422 && UNICODE_VALUE (uc) < 0x110000);
1423 mixed_string_buffer_append_unicode (bp,
1424 UNICODE_VALUE (uc));
1427 mixed_string_buffer_append_char (bp, uc);
1429 tp->string = mixed_string_buffer_done (bp);
1430 tp->comment = add_reference (savable_comment);
1431 lexical_context = lc_outside;
1432 tp->type = token_type_string;
1438 tp->type = token_type_lparen;
1444 tp->type = token_type_rparen;
1448 tp->type = token_type_comma;
1453 tp->type = (c == '[' ? token_type_lbracket : token_type_other);
1459 tp->type = (c == ']' ? token_type_rbracket : token_type_other);
1463 tp->type = token_type_plus;
1467 /* We could carefully recognize each of the 2 and 3 character
1468 operators, but it is not necessary, as we only need to recognize
1469 gettext invocations. Don't bother. */
1470 tp->type = token_type_other;
1476 /* Supports two pushback tokens. */
1478 phase5_unget (token_ty *tp)
1480 if (tp->type != token_type_eof)
1482 if (phase5_pushback_length == SIZEOF (phase5_pushback))
1484 phase5_pushback[phase5_pushback_length++] = *tp;
1489 /* Combine adjacent strings to form a single string. Note that the end
1490 of a logical line appears as a token of its own, therefore strings that
1491 belong to different logical lines will not be concatenated. */
1494 x_python_lex (token_ty *tp)
1497 if (tp->type == token_type_string)
1499 char *sum = tp->string;
1500 size_t sum_len = strlen (sum);
1504 token_ty token2, *tp2 = NULL;
1507 phase5_get (&token2);
1508 switch (token2.type)
1510 case token_type_plus:
1512 phase5_get (&token3);
1513 if (token3.type == token_type_string)
1515 free_token (&token2);
1519 phase5_unget (&token3);
1522 case token_type_string:
1531 char *addend = tp2->string;
1532 size_t addend_len = strlen (addend);
1534 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1535 memcpy (sum + sum_len, addend, addend_len + 1);
1536 sum_len += addend_len;
1541 phase5_unget (&token2);
1549 /* ========================= Extracting strings. ========================== */
1552 /* Context lookup table. */
1553 static flag_context_list_table_ty *flag_context_list_table;
1556 /* The file is broken into tokens. Scan the token stream, looking for
1557 a keyword, followed by a left paren, followed by a string. When we
1558 see this sequence, we have something to remember. We assume we are
1559 looking at a valid Python program, and leave the complaints about
1560 the grammar to the Python interpreter.
1562 Normal handling: Look for
1563 keyword ( ... msgid ... )
1564 Plural handling: Look for
1565 keyword ( ... msgid ... msgid_plural ... )
1567 We use recursion because the arguments before msgid or between msgid
1568 and msgid_plural can contain subexpressions of the same form. */
1571 /* Extract messages until the next balanced closing parenthesis or bracket.
1572 Extracted messages are added to MLP.
1573 DELIM can be either token_type_rparen or token_type_rbracket, or
1574 token_type_eof to accept both.
1575 Return true upon eof, false upon closing parenthesis or bracket. */
1577 extract_balanced (message_list_ty *mlp,
1578 token_type_ty delim,
1579 flag_context_ty outer_context,
1580 flag_context_list_iterator_ty context_iter,
1581 struct arglist_parser *argparser)
1583 /* Current argument number. */
1585 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1587 /* Parameters of the keyword just seen. Defined only in state 1. */
1588 const struct callshapes *next_shapes = NULL;
1589 /* Context iterator that will be used if the next token is a '('. */
1590 flag_context_list_iterator_ty next_context_iter =
1591 passthrough_context_list_iterator;
1592 /* Current context. */
1593 flag_context_ty inner_context =
1594 inherited_context (outer_context,
1595 flag_context_list_iterator_advance (&context_iter));
1597 /* Start state is 0. */
1604 x_python_lex (&token);
1607 case token_type_symbol:
1609 void *keyword_value;
1611 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1615 next_shapes = (const struct callshapes *) keyword_value;
1622 flag_context_list_iterator (
1623 flag_context_list_table_lookup (
1624 flag_context_list_table,
1625 token.string, strlen (token.string)));
1626 free (token.string);
1629 case token_type_lparen:
1630 if (extract_balanced (mlp, token_type_rparen,
1631 inner_context, next_context_iter,
1632 arglist_parser_alloc (mlp,
1633 state ? next_shapes : NULL)))
1635 xgettext_current_source_encoding = po_charset_utf8;
1636 arglist_parser_done (argparser, arg);
1637 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1640 next_context_iter = null_context_list_iterator;
1644 case token_type_rparen:
1645 if (delim == token_type_rparen || delim == token_type_eof)
1647 xgettext_current_source_encoding = po_charset_utf8;
1648 arglist_parser_done (argparser, arg);
1649 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1652 next_context_iter = null_context_list_iterator;
1656 case token_type_comma:
1659 inherited_context (outer_context,
1660 flag_context_list_iterator_advance (
1662 next_context_iter = passthrough_context_list_iterator;
1666 case token_type_lbracket:
1667 if (extract_balanced (mlp, token_type_rbracket,
1668 null_context, null_context_list_iterator,
1669 arglist_parser_alloc (mlp, NULL)))
1671 xgettext_current_source_encoding = po_charset_utf8;
1672 arglist_parser_done (argparser, arg);
1673 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1676 next_context_iter = null_context_list_iterator;
1680 case token_type_rbracket:
1681 if (delim == token_type_rbracket || delim == token_type_eof)
1683 xgettext_current_source_encoding = po_charset_utf8;
1684 arglist_parser_done (argparser, arg);
1685 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1688 next_context_iter = null_context_list_iterator;
1692 case token_type_string:
1695 pos.file_name = logical_file_name;
1696 pos.line_number = token.line_number;
1698 xgettext_current_source_encoding = po_charset_utf8;
1700 remember_a_message (mlp, NULL, token.string, inner_context,
1701 &pos, NULL, token.comment);
1703 arglist_parser_remember (argparser, arg, token.string,
1705 pos.file_name, pos.line_number,
1707 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1709 drop_reference (token.comment);
1710 next_context_iter = null_context_list_iterator;
1714 case token_type_eof:
1715 xgettext_current_source_encoding = po_charset_utf8;
1716 arglist_parser_done (argparser, arg);
1717 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1720 case token_type_plus:
1721 case token_type_other:
1722 next_context_iter = null_context_list_iterator;
1734 extract_python (FILE *f,
1735 const char *real_filename, const char *logical_filename,
1736 flag_context_list_table_ty *flag_table,
1737 msgdomain_list_ty *mdlp)
1739 message_list_ty *mlp = mdlp->item[0]->messages;
1742 real_file_name = real_filename;
1743 logical_file_name = xstrdup (logical_filename);
1746 lexical_context = lc_outside;
1748 last_comment_line = -1;
1749 last_non_comment_line = -1;
1751 xgettext_current_file_source_encoding = xgettext_global_source_encoding;
1753 xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1756 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1758 xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1761 continuation_or_nonblank_line = false;
1765 flag_context_list_table = flag_table;
1769 /* Eat tokens until eof is seen. When extract_balanced returns
1770 due to an unbalanced closing parenthesis, just restart it. */
1771 while (!extract_balanced (mlp, token_type_eof,
1772 null_context, null_context_list_iterator,
1773 arglist_parser_alloc (mlp, NULL)))
1777 real_file_name = NULL;
1778 logical_file_name = NULL;