1 /* xgettext Python backend.
2 Copyright (C) 2002-2003, 2005-2013 Free Software Foundation, Inc.
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
36 #include "error-progname.h"
40 #include "xvasprintf.h"
44 #include "po-charset.h"
49 #define _(s) gettext(s)
51 #define max(a,b) ((a) > (b) ? (a) : (b))
53 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
56 /* The Python syntax is defined in the Python Reference Manual
57 /usr/share/doc/packages/python/html/ref/index.html.
58 See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
59 Python-2.0/Objects/unicodeobject.c. */
62 /* ====================== Keyword set customization. ====================== */
64 /* If true extract all strings. */
65 static bool extract_all = false;
67 static hash_table keywords;
68 static bool default_keywords = true;
72 x_python_extract_all ()
79 x_python_keyword (const char *name)
82 default_keywords = false;
86 struct callshape shape;
89 if (keywords.table == NULL)
90 hash_init (&keywords, 100);
92 split_keywordspec (name, &end, &shape);
94 /* The characters between name and end should form a valid C identifier.
95 A colon means an invalid parse in split_keywordspec(). */
96 colon = strchr (name, ':');
97 if (colon == NULL || colon >= end)
98 insert_keyword_callshape (&keywords, name, end - name, &shape);
102 /* Finish initializing the keywords hash table.
103 Called after argument processing, before each file is processed. */
107 if (default_keywords)
109 /* When adding new keywords here, also update the documentation in
111 x_python_keyword ("gettext");
112 x_python_keyword ("ugettext");
113 x_python_keyword ("dgettext:2");
114 x_python_keyword ("ngettext:1,2");
115 x_python_keyword ("ungettext:1,2");
116 x_python_keyword ("dngettext:2,3");
117 x_python_keyword ("_");
118 default_keywords = false;
123 init_flag_table_python ()
125 xgettext_record_flag ("gettext:1:pass-python-format");
126 xgettext_record_flag ("ugettext:1:pass-python-format");
127 xgettext_record_flag ("dgettext:2:pass-python-format");
128 xgettext_record_flag ("ngettext:1:pass-python-format");
129 xgettext_record_flag ("ngettext:2:pass-python-format");
130 xgettext_record_flag ("ungettext:1:pass-python-format");
131 xgettext_record_flag ("ungettext:2:pass-python-format");
132 xgettext_record_flag ("dngettext:2:pass-python-format");
133 xgettext_record_flag ("dngettext:3:pass-python-format");
134 xgettext_record_flag ("_:1:pass-python-format");
135 /* xgettext_record_flag ("%:1:python-format"); // % is an infix operator! */
137 xgettext_record_flag ("gettext:1:pass-python-brace-format");
138 xgettext_record_flag ("ugettext:1:pass-python-brace-format");
139 xgettext_record_flag ("dgettext:2:pass-python-brace-format");
140 xgettext_record_flag ("ngettext:1:pass-python-brace-format");
141 xgettext_record_flag ("ngettext:2:pass-python-brace-format");
142 xgettext_record_flag ("ungettext:1:pass-python-brace-format");
143 xgettext_record_flag ("ungettext:2:pass-python-brace-format");
144 xgettext_record_flag ("dngettext:2:pass-python-brace-format");
145 xgettext_record_flag ("dngettext:3:pass-python-brace-format");
146 xgettext_record_flag ("_:1:pass-python-brace-format");
147 /* xgettext_record_flag ("format:1:python-brace-format"); */
151 /* ======================== Reading of characters. ======================== */
153 /* Real filename, used in error messages about the input file. */
154 static const char *real_file_name;
156 /* Logical filename and line number, used to label the extracted messages. */
157 static char *logical_file_name;
158 static int line_number;
160 /* The input file stream. */
164 /* 0. Terminate line by \n, regardless of whether the external
165 representation of a line terminator is CR (Mac) or CR/LF
166 (DOS/Windows), as Python treats them equally. */
176 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
185 if (c1 != EOF && c1 != '\n')
188 /* Seen line terminator CR or CR/LF. */
195 /* Supports only one pushback character, and not '\n'. */
197 phase0_ungetc (int c)
204 /* 1. line_number handling. */
206 /* Maximum used, roughly a safer MB_LEN_MAX. */
207 #define MAX_PHASE1_PUSHBACK 16
208 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
209 static int phase1_pushback_length;
211 /* Read the next single byte from the input file. */
217 if (phase1_pushback_length)
218 c = phase1_pushback[--phase1_pushback_length];
228 /* Supports MAX_PHASE1_PUSHBACK characters of pushback. */
230 phase1_ungetc (int c)
237 if (phase1_pushback_length == SIZEOF (phase1_pushback))
239 phase1_pushback[phase1_pushback_length++] = c;
244 /* Phase 2: Conversion to Unicode.
245 This is done early because PEP 0263 specifies that conversion to Unicode
246 conceptually occurs before tokenization. A test case where it matters
247 is with encodings like BIG5: when a double-byte character ending in 0x5C
248 is followed by '\' or 'u0021', the tokenizer must not treat the second
249 half of the double-byte character as a backslash. */
251 /* End-of-file indicator for functions returning a UCS-4 character. */
254 static lexical_context_ty lexical_context;
256 static int phase2_pushback[max (9, UNINAME_MAX + 3)];
257 static int phase2_pushback_length;
259 /* Read the next Unicode UCS-4 character from the input file. */
263 if (phase2_pushback_length)
264 return phase2_pushback[--phase2_pushback_length];
266 if (xgettext_current_source_encoding == po_charset_ascii)
268 int c = phase1_getc ();
273 multiline_error (xstrdup (""),
274 xasprintf ("%s\n%s\n",
275 non_ascii_error_message (lexical_context,
279 Please specify the source encoding through --from-code or through a comment\n\
280 as specified in http://www.python.org/peps/pep-0263.html.\n")));
285 else if (xgettext_current_source_encoding != po_charset_utf8)
288 /* Use iconv on an increasing number of bytes. Read only as many bytes
289 through phase1_getc as needed. This is needed to give reasonable
290 interactive behaviour when fp is connected to an interactive tty. */
291 unsigned char buf[MAX_PHASE1_PUSHBACK];
293 int c = phase1_getc ();
296 buf[0] = (unsigned char) c;
301 unsigned char scratchbuf[6];
302 const char *inptr = (const char *) &buf[0];
303 size_t insize = bufcount;
304 char *outptr = (char *) &scratchbuf[0];
305 size_t outsize = sizeof (scratchbuf);
307 size_t res = iconv (xgettext_current_source_iconv,
308 (ICONV_CONST char **) &inptr, &insize,
310 /* We expect that a character has been produced if and only if
311 some input bytes have been consumed. */
312 if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
314 if (outsize == sizeof (scratchbuf))
316 /* No character has been produced. Must be an error. */
317 if (res != (size_t)(-1))
322 /* An invalid multibyte sequence was encountered. */
323 multiline_error (xstrdup (""),
325 %s:%d: Invalid multibyte sequence.\n\
326 Please specify the correct source encoding through --from-code or through a\n\
327 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
328 real_file_name, line_number));
331 else if (errno == EINVAL)
333 /* An incomplete multibyte character. */
336 if (bufcount == MAX_PHASE1_PUSHBACK)
338 /* An overlong incomplete multibyte sequence was
340 multiline_error (xstrdup (""),
342 %s:%d: Long incomplete multibyte sequence.\n\
343 Please specify the correct source encoding through --from-code or through a\n\
344 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
345 real_file_name, line_number));
349 /* Read one more byte and retry iconv. */
353 multiline_error (xstrdup (""),
355 %s:%d: Incomplete multibyte sequence at end of file.\n\
356 Please specify the correct source encoding through --from-code or through a\n\
357 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
358 real_file_name, line_number));
363 multiline_error (xstrdup (""),
365 %s:%d: Incomplete multibyte sequence at end of line.\n\
366 Please specify the correct source encoding through --from-code or through a\n\
367 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
368 real_file_name, line_number - 1));
371 buf[bufcount++] = (unsigned char) c;
374 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
375 real_file_name, line_number);
379 size_t outbytes = sizeof (scratchbuf) - outsize;
380 size_t bytes = bufcount - insize;
383 /* We expect that one character has been produced. */
388 /* Push back the unused bytes. */
390 phase1_ungetc (buf[--insize]);
391 /* Convert the character from UTF-8 to UCS-4. */
392 if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
394 /* scratchbuf contains an out-of-range Unicode character
396 multiline_error (xstrdup (""),
398 %s:%d: Invalid multibyte sequence.\n\
399 Please specify the source encoding through --from-code or through a comment\n\
400 as specified in http://www.python.org/peps/pep-0263.html.\n"),
401 real_file_name, line_number));
408 /* If we don't have iconv(), the only supported values for
409 xgettext_global_source_encoding and thus also for
410 xgettext_current_source_encoding are ASCII and UTF-8. */
416 /* Read a UTF-8 encoded character. */
417 unsigned char buf[6];
438 && ((buf[1] ^ 0x80) < 0x40))
448 && ((buf[1] ^ 0x80) < 0x40)
449 && ((buf[2] ^ 0x80) < 0x40))
459 && ((buf[1] ^ 0x80) < 0x40)
460 && ((buf[2] ^ 0x80) < 0x40)
461 && ((buf[3] ^ 0x80) < 0x40))
471 && ((buf[1] ^ 0x80) < 0x40)
472 && ((buf[2] ^ 0x80) < 0x40)
473 && ((buf[3] ^ 0x80) < 0x40)
474 && ((buf[4] ^ 0x80) < 0x40))
483 u8_mbtouc (&uc, buf, count);
488 /* Supports max (9, UNINAME_MAX + 3) pushback characters. */
490 phase2_ungetc (int c)
494 if (phase2_pushback_length == SIZEOF (phase2_pushback))
496 phase2_pushback[phase2_pushback_length++] = c;
501 /* ========================= Accumulating strings. ======================== */
503 /* A string buffer type that allows appending Unicode characters.
504 Returns the entire string in UTF-8 encoding. */
506 struct unicode_string_buffer
508 /* The part of the string that has already been converted to UTF-8. */
511 size_t utf8_allocated;
514 /* Initialize a 'struct unicode_string_buffer' to empty. */
516 init_unicode_string_buffer (struct unicode_string_buffer *bp)
518 bp->utf8_buffer = NULL;
520 bp->utf8_allocated = 0;
523 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
525 unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
528 if (bp->utf8_buflen + count > bp->utf8_allocated)
530 size_t new_allocated = 2 * bp->utf8_allocated + 10;
531 if (new_allocated < bp->utf8_buflen + count)
532 new_allocated = bp->utf8_buflen + count;
533 bp->utf8_allocated = new_allocated;
534 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
538 /* Auxiliary function: Append a Unicode character to bp->utf8.
539 uc must be < 0x110000. */
541 unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
544 unsigned char utf8buf[6];
545 int count = u8_uctomb (utf8buf, uc, 6);
548 /* The caller should have ensured that uc is not out-of-range. */
551 unicode_string_buffer_append_unicode_grow (bp, count);
552 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
553 bp->utf8_buflen += count;
556 /* Return the string buffer's contents. */
558 unicode_string_buffer_result (struct unicode_string_buffer *bp)
560 /* NUL-terminate it. */
561 unicode_string_buffer_append_unicode_grow (bp, 1);
562 bp->utf8_buffer[bp->utf8_buflen] = '\0';
564 return bp->utf8_buffer;
567 /* Free the memory pointed to by a 'struct unicode_string_buffer'. */
569 free_unicode_string_buffer (struct unicode_string_buffer *bp)
571 free (bp->utf8_buffer);
575 /* ======================== Accumulating comments. ======================== */
578 /* Accumulating a single comment line. */
580 static struct unicode_string_buffer comment_buffer;
585 lexical_context = lc_comment;
586 comment_buffer.utf8_buflen = 0;
592 return (comment_buffer.utf8_buflen == 0);
598 unicode_string_buffer_append_unicode (&comment_buffer, c);
601 static inline const char *
604 char *buffer = unicode_string_buffer_result (&comment_buffer);
605 size_t buflen = strlen (buffer);
608 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
610 buffer[buflen] = '\0';
611 savable_comment_add (buffer);
612 lexical_context = lc_outside;
617 /* These are for tracking whether comments count as immediately before
619 static int last_comment_line;
620 static int last_non_comment_line;
623 /* ======================== Recognizing comments. ======================== */
626 /* Recognizing the "coding" comment.
627 As specified in PEP 0263, it takes the form
628 "coding" [":"|"="] {alphanumeric or "-" or "_" or "."}*
630 "set" "fileencoding" "=" {alphanumeric or "-" or "_" or "."}*
631 and is located in a comment in a line that
632 - is either the first or second line,
633 - is not a continuation line,
634 - in the first form, contains no other tokens except this comment. */
636 /* Canonicalized encoding name for the current input file. */
637 static const char *xgettext_current_file_source_encoding;
640 /* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
641 ASCII or UTF-8, when this conversion is a no-op). */
642 static iconv_t xgettext_current_file_source_iconv;
/* Record CANON_ENCODING as the source encoding of the file being read and
   make it the active encoding.  For anything other than ASCII or UTF-8 a
   converter to UTF-8 is opened with iconv_open(); if that is not possible,
   a fatal error (EXIT_FAILURE) is reported at the previous line.  */
646 set_current_file_source_encoding (const char *canon_encoding)
648 xgettext_current_file_source_encoding = canon_encoding;
/* ASCII and UTF-8 are compared by pointer identity with the po_charset_*
   constants; only other encodings need a converter.  */
650 if (xgettext_current_file_source_encoding != po_charset_ascii
651 && xgettext_current_file_source_encoding != po_charset_utf8)
656 /* Avoid glibc-2.1 bug with EUC-KR. */
657 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
658 && !defined _LIBICONV_VERSION
659 if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
663 cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
664 if (cd == (iconv_t)(-1))
665 error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
666 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
667 and iconv() does not support this conversion."),
668 xgettext_current_file_source_encoding, po_charset_utf8,
669 basename (program_name));
670 xgettext_current_file_source_iconv = cd;
/* Name the encoding this function was actually asked to convert from, the
   same variable the diagnostic above prints, rather than
   xgettext_global_source_encoding.  */
672 error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
673 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
674 This version was built without iconv()."),
675 xgettext_current_file_source_encoding, po_charset_utf8,
676 basename (program_name));
/* Make the per-file encoding (and its converter) the active ones.  */
680 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
682 xgettext_current_source_iconv = xgettext_current_file_source_iconv;
687 try_to_extract_coding (const char *comment)
689 const char *p = c_strstr (comment, "coding");
694 if (*p == ':' || *p == '=')
697 while (*p == ' ' || *p == '\t')
700 const char *encoding_start = p;
702 while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
705 const char *encoding_end = p;
707 if (encoding_end > encoding_start)
709 /* Extract the encoding string. */
710 size_t encoding_len = encoding_end - encoding_start;
711 char *encoding = XNMALLOC (encoding_len + 1, char);
713 memcpy (encoding, encoding_start, encoding_len);
714 encoding[encoding_len] = '\0';
717 /* Canonicalize it. */
718 const char *canon_encoding = po_charset_canonicalize (encoding);
719 if (canon_encoding == NULL)
722 logical_file_name, line_number - 1, _("\
723 Unknown encoding \"%s\". Proceeding with ASCII instead."),
725 canon_encoding = po_charset_ascii;
729 set_current_file_source_encoding (canon_encoding);
740 /* Tracking whether the current line is a continuation line or contains a
741 non-blank character. */
742 static bool continuation_or_nonblank_line = false;
745 /* Phase 3: Outside strings, replace backslash-newline with nothing and a
746 comment with nothing. */
762 /* This shouldn't happen usually, because "A backslash is
763 illegal elsewhere on a line outside a string literal." */
766 /* Eat backslash-newline. */
767 continuation_or_nonblank_line = true;
774 last_comment_line = line_number;
779 if (c == UEOF || c == '\n')
781 /* We skip all leading white space, but not EOLs. */
782 if (!(comment_at_start () && (c == ' ' || c == '\t')))
785 comment = comment_line_end ();
786 if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
787 try_to_extract_coding (comment);
788 continuation_or_nonblank_line = false;
794 continuation_or_nonblank_line = false;
795 else if (!(c == ' ' || c == '\t' || c == '\f'))
796 continuation_or_nonblank_line = true;
802 /* Supports only one pushback character. */
804 phase3_ungetc (int c)
810 /* ========================= Accumulating strings. ======================== */
812 /* Return value of phase7_getuc when EOF is reached. */
814 #define P7_STRING_END (-2)
816 /* Convert a UTF-16 or UTF-32 code point to a return value that can be
817 distinguished from a single-byte return value. */
818 #define UNICODE(code) (0x100 + (code))
820 /* Test whether a return value of phase7_getuc designates a UTF-16 or
821 UTF-32 code point. */
822 #define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
824 /* Extract the UTF-16 or UTF-32 code of a return value that satisfies
826 #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
828 /* A string buffer type that allows appending bytes (in the
829 xgettext_current_source_encoding) or Unicode characters.
830 Returns the entire string in UTF-8 encoding. */
832 struct mixed_string_buffer
834 /* The part of the string that has already been converted to UTF-8. */
837 size_t utf8_allocated;
838 /* The first half of an UTF-16 surrogate character. */
839 unsigned short utf16_surr;
840 /* The part of the string that is still in the source encoding. */
843 size_t curr_allocated;
844 /* The lexical context. Used only for error message purposes. */
845 lexical_context_ty lcontext;
848 /* Initialize a 'struct mixed_string_buffer' to empty. */
850 init_mixed_string_buffer (struct mixed_string_buffer *bp, lexical_context_ty lcontext)
852 bp->utf8_buffer = NULL;
854 bp->utf8_allocated = 0;
856 bp->curr_buffer = NULL;
858 bp->curr_allocated = 0;
859 bp->lcontext = lcontext;
862 /* Auxiliary function: Append a byte to bp->curr. */
864 mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c)
866 if (bp->curr_buflen == bp->curr_allocated)
868 bp->curr_allocated = 2 * bp->curr_allocated + 10;
869 bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
871 bp->curr_buffer[bp->curr_buflen++] = c;
874 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
876 mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count)
878 if (bp->utf8_buflen + count > bp->utf8_allocated)
880 size_t new_allocated = 2 * bp->utf8_allocated + 10;
881 if (new_allocated < bp->utf8_buflen + count)
882 new_allocated = bp->utf8_buflen + count;
883 bp->utf8_allocated = new_allocated;
884 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
888 /* Auxiliary function: Append a Unicode character to bp->utf8.
889 uc must be < 0x110000. */
891 mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, ucs4_t uc)
893 unsigned char utf8buf[6];
894 int count = u8_uctomb (utf8buf, uc, 6);
897 /* The caller should have ensured that uc is not out-of-range. */
900 mixed_string_buffer_append_unicode_grow (bp, count);
901 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
902 bp->utf8_buflen += count;
905 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */
907 mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
909 if (bp->utf16_surr != 0)
911 /* A half surrogate is invalid, therefore use U+FFFD instead. */
912 mixed_string_buffer_append_unicode (bp, 0xfffd);
917 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */
919 mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno)
921 if (bp->curr_buflen > 0)
926 mixed_string_buffer_append_byte (bp, '\0');
928 /* Convert from the source encoding to UTF-8. */
929 curr = from_current_source_encoding (bp->curr_buffer, bp->lcontext,
930 logical_file_name, lineno);
932 /* Append it to bp->utf8_buffer. */
933 count = strlen (curr);
934 mixed_string_buffer_append_unicode_grow (bp, count);
935 memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
936 bp->utf8_buflen += count;
938 if (curr != bp->curr_buffer)
944 /* Append a character or Unicode character to a 'struct mixed_string_buffer'. */
946 mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
950 /* Append a Unicode character. */
952 /* Switch from multibyte character mode to Unicode character mode. */
953 mixed_string_buffer_flush_curr_buffer (bp, line_number);
955 /* Test whether this character and the previous one form a Unicode
956 surrogate character pair. */
957 if (bp->utf16_surr != 0
958 && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
960 unsigned short utf16buf[2];
963 utf16buf[0] = bp->utf16_surr;
964 utf16buf[1] = UNICODE_VALUE (c);
965 if (u16_mbtouc (&uc, utf16buf, 2) != 2)
968 mixed_string_buffer_append_unicode (bp, uc);
973 mixed_string_buffer_flush_utf16_surr (bp);
975 if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
976 bp->utf16_surr = UNICODE_VALUE (c);
977 else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
979 /* A half surrogate is invalid, therefore use U+FFFD instead. */
980 mixed_string_buffer_append_unicode (bp, 0xfffd);
983 mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c));
988 /* Append a single byte. */
990 /* Switch from Unicode character mode to multibyte character mode. */
991 mixed_string_buffer_flush_utf16_surr (bp);
993 /* When a newline is seen, convert the accumulated multibyte sequence.
994 This ensures a correct line number in the error message in case of
995 a conversion error. The "- 1" is to account for the newline. */
997 mixed_string_buffer_flush_curr_buffer (bp, line_number - 1);
999 mixed_string_buffer_append_byte (bp, (unsigned char) c);
1003 /* Return the string buffer's contents. */
1005 mixed_string_buffer_result (struct mixed_string_buffer *bp)
1007 /* Flush all into bp->utf8_buffer. */
1008 mixed_string_buffer_flush_utf16_surr (bp);
1009 mixed_string_buffer_flush_curr_buffer (bp, line_number);
1010 /* NUL-terminate it. */
1011 mixed_string_buffer_append_unicode_grow (bp, 1);
1012 bp->utf8_buffer[bp->utf8_buflen] = '\0';
1014 return bp->utf8_buffer;
1017 /* Free the memory pointed to by a 'struct mixed_string_buffer'. */
1019 free_mixed_string_buffer (struct mixed_string_buffer *bp)
1021 free (bp->utf8_buffer);
1022 free (bp->curr_buffer);
1026 /* ========================== Reading of tokens. ========================== */
1032 token_type_lparen, /* ( */
1033 token_type_rparen, /* ) */
1034 token_type_comma, /* , */
1035 token_type_lbracket, /* [ */
1036 token_type_rbracket, /* ] */
1037 token_type_string, /* "abc", 'abc', """abc""", '''abc''' */
1038 token_type_symbol, /* symbol, number */
1039 token_type_plus, /* + */
1040 token_type_other /* misc. operator */
1042 typedef enum token_type_ty token_type_ty;
1044 typedef struct token_ty token_ty;
1048 char *string; /* for token_type_string, token_type_symbol */
1049 refcounted_string_list_ty *comment; /* for token_type_string */
1053 /* Free the memory pointed to by a 'struct token_ty'. */
1055 free_token (token_ty *tp)
1057 if (tp->type == token_type_string || tp->type == token_type_symbol)
1059 if (tp->type == token_type_string)
1060 drop_reference (tp->comment);
1064 /* There are two different input syntaxes for strings, "abc" and r"abc",
1065 and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
1066 Which escape sequences are understood, i.e. what is interpreted specially
1068 "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
1070 u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
1072 The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
1073 \unnnn items. The \ooo and \xnn values are in the current source encoding
1074 for byte strings, and Unicode code points for Unicode strings.
1078 phase7_getuc (int quote_char,
1079 bool triple, bool interpret_ansic, bool interpret_unicode,
1080 unsigned int *backslash_counter)
1086 /* Use phase 2, because phase 3 elides comments. */
1092 if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
1096 int c1 = phase2_getc ();
1097 if (c1 == quote_char)
1099 int c2 = phase2_getc ();
1100 if (c2 == quote_char)
1101 return P7_STRING_END;
1108 return P7_STRING_END;
1115 *backslash_counter = 0;
1116 return UNICODE ('\n');
1118 /* In r"..." and ur"..." strings, newline is only allowed
1119 immediately after an odd number of backslashes (although the
1120 backslashes are not interpreted!). */
1121 if (!(interpret_ansic || (*backslash_counter & 1) == 0))
1123 *backslash_counter = 0;
1124 return UNICODE ('\n');
1127 error_with_progname = false;
1128 error (0, 0, _("%s:%d: warning: unterminated string"),
1129 logical_file_name, line_number);
1130 error_with_progname = true;
1131 return P7_STRING_END;
1136 *backslash_counter = 0;
1140 /* Backslash handling. */
1142 if (!interpret_ansic && !interpret_unicode)
1144 ++*backslash_counter;
1145 return UNICODE ('\\');
1148 /* Dispatch according to the character following the backslash. */
1152 ++*backslash_counter;
1153 return UNICODE ('\\');
1156 if (interpret_ansic)
1162 ++*backslash_counter;
1164 case '\'': case '"':
1165 *backslash_counter = 0;
1168 *backslash_counter = 0;
1169 return UNICODE ('\a');
1171 *backslash_counter = 0;
1172 return UNICODE ('\b');
1174 *backslash_counter = 0;
1175 return UNICODE ('\f');
1177 *backslash_counter = 0;
1178 return UNICODE ('\n');
1180 *backslash_counter = 0;
1181 return UNICODE ('\r');
1183 *backslash_counter = 0;
1184 return UNICODE ('\t');
1186 *backslash_counter = 0;
1187 return UNICODE ('\v');
1188 case '0': case '1': case '2': case '3': case '4':
1189 case '5': case '6': case '7':
1196 if (c >= '0' && c <= '7')
1198 n = (n << 3) + (c - '0');
1202 if (c >= '0' && c <= '7')
1203 n = (n << 3) + (c - '0');
1211 *backslash_counter = 0;
1212 if (interpret_unicode)
1215 return (unsigned char) n;
1219 int c1 = phase2_getc ();
1222 if (c1 >= '0' && c1 <= '9')
1224 else if (c1 >= 'A' && c1 <= 'F')
1226 else if (c1 >= 'a' && c1 <= 'f')
1233 int c2 = phase2_getc ();
1236 if (c2 >= '0' && c2 <= '9')
1238 else if (c2 >= 'A' && c2 <= 'F')
1240 else if (c2 >= 'a' && c2 <= 'f')
1247 int n = (n1 << 4) + n2;
1248 *backslash_counter = 0;
1249 if (interpret_unicode)
1252 return (unsigned char) n;
1259 ++*backslash_counter;
1260 return UNICODE ('\\');
1264 if (interpret_unicode)
1268 unsigned char buf[4];
1272 for (i = 0; i < 4; i++)
1274 int c1 = phase2_getc ();
1276 if (c1 >= '0' && c1 <= '9')
1277 n = (n << 4) + (c1 - '0');
1278 else if (c1 >= 'A' && c1 <= 'F')
1279 n = (n << 4) + (c1 - 'A' + 10);
1280 else if (c1 >= 'a' && c1 <= 'f')
1281 n = (n << 4) + (c1 - 'a' + 10);
1286 phase2_ungetc (buf[i]);
1288 ++*backslash_counter;
1289 return UNICODE ('\\');
1294 *backslash_counter = 0;
1298 if (interpret_ansic)
1302 unsigned char buf[8];
1306 for (i = 0; i < 8; i++)
1308 int c1 = phase2_getc ();
1310 if (c1 >= '0' && c1 <= '9')
1311 n = (n << 4) + (c1 - '0');
1312 else if (c1 >= 'A' && c1 <= 'F')
1313 n = (n << 4) + (c1 - 'A' + 10);
1314 else if (c1 >= 'a' && c1 <= 'f')
1315 n = (n << 4) + (c1 - 'a' + 10);
1320 phase2_ungetc (buf[i]);
1322 ++*backslash_counter;
1323 return UNICODE ('\\');
1330 *backslash_counter = 0;
1334 error_with_progname = false;
1335 error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1336 logical_file_name, line_number);
1337 error_with_progname = true;
1340 phase2_ungetc (buf[i]);
1342 ++*backslash_counter;
1343 return UNICODE ('\\');
1348 int c1 = phase2_getc ();
1351 unsigned char buf[UNINAME_MAX + 1];
1355 for (i = 0; i < UNINAME_MAX; i++)
1357 int c2 = phase2_getc ();
1358 if (!(c2 >= ' ' && c2 <= '~'))
1362 phase2_ungetc (buf[i]);
1365 ++*backslash_counter;
1366 return UNICODE ('\\');
1374 n = unicode_name_character ((char *) buf);
1375 if (n != UNINAME_INVALID)
1377 *backslash_counter = 0;
1381 phase2_ungetc ('}');
1383 phase2_ungetc (buf[i]);
1387 ++*backslash_counter;
1388 return UNICODE ('\\');
1394 ++*backslash_counter;
1395 return UNICODE ('\\');
1400 /* Combine characters into tokens. Discard whitespace except newlines at
1401 the end of logical lines. */
1403 /* Number of pending open parentheses/braces/brackets. */
1404 static int open_pbb;
1406 static token_ty phase5_pushback[2];
1407 static int phase5_pushback_length;
1410 phase5_get (token_ty *tp)
1414 if (phase5_pushback_length)
1416 *tp = phase5_pushback[--phase5_pushback_length];
1422 tp->line_number = line_number;
1428 tp->type = token_type_eof;
1434 /* Ignore whitespace and comments. */
1438 if (last_non_comment_line > last_comment_line)
1439 savable_comment_reset ();
1440 /* Ignore newline if and only if it is used for implicit line
1444 tp->type = token_type_other;
1448 last_non_comment_line = tp->line_number;
1454 int c1 = phase3_getc ();
1456 if (!(c1 >= '0' && c1 <= '9'))
1459 tp->type = token_type_other;
1464 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1465 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1466 case 'M': case 'N': case 'O': case 'P': case 'Q':
1467 case 'S': case 'T': case 'V': case 'W': case 'X':
1470 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1471 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1472 case 'm': case 'n': case 'o': case 'p': case 'q':
1473 case 's': case 't': case 'v': case 'w': case 'x':
1475 case '0': case '1': case '2': case '3': case '4':
1476 case '5': case '6': case '7': case '8': case '9':
1478 /* Symbol, or part of a number. */
1480 static char *buffer;
1487 if (bufpos >= bufmax)
1489 bufmax = 2 * bufmax + 10;
1490 buffer = xrealloc (buffer, bufmax);
1492 buffer[bufpos++] = c;
1496 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1497 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1498 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1499 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1502 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1503 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1504 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1505 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1507 case '0': case '1': case '2': case '3': case '4':
1508 case '5': case '6': case '7': case '8': case '9':
1516 if (bufpos >= bufmax)
1518 bufmax = 2 * bufmax + 10;
1519 buffer = xrealloc (buffer, bufmax);
1521 buffer[bufpos] = '\0';
1522 tp->string = xstrdup (buffer);
1523 tp->type = token_type_symbol;
1529 struct mixed_string_buffer literal;
1531 bool interpret_ansic;
1532 bool interpret_unicode;
1534 unsigned int backslash_counter;
1538 int c1 = phase2_getc ();
1539 if (c1 == '"' || c1 == '\'')
1542 interpret_ansic = false;
1543 interpret_unicode = false;
1552 int c1 = phase2_getc ();
1553 if (c1 == '"' || c1 == '\'')
1556 interpret_ansic = true;
1557 interpret_unicode = true;
1560 if (c1 == 'R' || c1 == 'r')
1562 int c2 = phase2_getc ();
1563 if (c2 == '"' || c2 == '\'')
1566 interpret_ansic = false;
1567 interpret_unicode = true;
1576 case '"': case '\'':
1578 interpret_ansic = true;
1579 interpret_unicode = false;
1582 lexical_context = lc_string;
1584 int c1 = phase2_getc ();
1585 if (c1 == quote_char)
1587 int c2 = phase2_getc ();
1588 if (c2 == quote_char)
1599 backslash_counter = 0;
1600 /* Start accumulating the string. */
1601 init_mixed_string_buffer (&literal, lc_string);
1604 int uc = phase7_getuc (quote_char, triple, interpret_ansic,
1605 interpret_unicode, &backslash_counter);
1607 if (uc == P7_EOF || uc == P7_STRING_END)
1610 if (IS_UNICODE (uc))
1611 assert (UNICODE_VALUE (uc) >= 0
1612 && UNICODE_VALUE (uc) < 0x110000);
1614 mixed_string_buffer_append (&literal, uc);
1616 tp->string = xstrdup (mixed_string_buffer_result (&literal));
1617 free_mixed_string_buffer (&literal);
1618 tp->comment = add_reference (savable_comment);
1619 lexical_context = lc_outside;
1620 tp->type = token_type_string;
1626 tp->type = token_type_lparen;
1632 tp->type = token_type_rparen;
1636 tp->type = token_type_comma;
1641 tp->type = (c == '[' ? token_type_lbracket : token_type_other);
1647 tp->type = (c == ']' ? token_type_rbracket : token_type_other);
1651 tp->type = token_type_plus;
1655 /* We could carefully recognize each of the 2 and 3 character
1656 operators, but it is not necessary, as we only need to recognize
1657 gettext invocations. Don't bother. */
1658 tp->type = token_type_other;
1664 /* Supports only one pushback token.
   Saves a copy of *TP in the next free slot of the phase5_pushback
   array and increments phase5_pushback_length.  A token whose type is
   token_type_eof is not saved. */
1666 phase5_unget (token_ty *tp)
/* Only non-EOF tokens are pushed back. */
1668 if (tp->type != token_type_eof)
/* Capacity check: the array is full when the current length equals
   SIZEOF (phase5_pushback).  NOTE(review): the statement executed in
   that case is not visible in this excerpt. */
1670 if (phase5_pushback_length == SIZEOF (phase5_pushback))
/* Copy the whole token into the slot and advance the length. */
1672 phase5_pushback[phase5_pushback_length++] = *tp;
1677 /* Combine adjacent strings to form a single string. Note that the end
1678 of a logical line appears as a token of its own, therefore strings that
1679 belong to different logical lines will not be concatenated.
   When *TP is a string token, the following tokens are inspected through
   phase5_get: a directly following string token, or a string token that
   follows a plus token, has its text appended to the accumulated string.
   Tokens that turn out not to belong to the concatenation are handed
   back through phase5_unget. */
1682 x_python_lex (token_ty *tp)
/* Concatenation is only attempted when the token is a string. */
1685 if (tp->type == token_type_string)
/* SUM is the string accumulated so far, SUM_LEN its length without the
   terminating NUL. */
1687 char *sum = tp->string;
1688 size_t sum_len = strlen (sum);
/* TP2 designates the token whose string gets appended; it stays NULL
   until such a token has been identified. */
1692 token_ty token2, *tp2 = NULL;
/* Look at the token that follows. */
1694 phase5_get (&token2);
1695 switch (token2.type)
/* A plus token: read one token further to see whether a string
   follows it. */
1697 case token_type_plus:
1701 phase5_get (&token3);
1702 if (token3.type == token_type_string)
/* The plus token itself is released with free_token. */
1704 free_token (&token2);
/* token3 is handed back to phase5_unget.  NOTE(review): the branch
   structure around this call is not visible in this excerpt;
   presumably this is the path where token3 is not a string. */
1708 phase5_unget (&token3);
/* A string token directly after the previous string. */
1711 case token_type_string:
/* Append ADDEND to SUM: grow SUM to hold both texts plus one NUL, then
   copy ADDEND including its terminating NUL (addend_len + 1 bytes). */
1720 char *addend = tp2->string;
1721 size_t addend_len = strlen (addend);
1723 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1724 memcpy (sum + sum_len, addend, addend_len + 1);
1725 sum_len += addend_len;
/* token2 is handed back to phase5_unget so that it can be read again. */
1730 phase5_unget (&token2);
1738 /* ========================= Extracting strings. ========================== */
1741 /* Context lookup table.
   Assigned from the FLAG_TABLE argument of extract_python and looked up
   by symbol name (flag_context_list_table_lookup) in extract_balanced. */
1742 static flag_context_list_table_ty *flag_context_list_table;
1745 /* The file is broken into tokens. Scan the token stream, looking for
1746 a keyword, followed by a left paren, followed by a string. When we
1747 see this sequence, we have something to remember. We assume we are
1748 looking at a valid Python program, and leave the complaints about
1749 the grammar to the Python interpreter.
1751 Normal handling: Look for
1752 keyword ( ... msgid ... )
1753 Plural handling: Look for
1754 keyword ( ... msgid ... msgid_plural ... )
1756 We use recursion because the arguments before msgid or between msgid
1757 and msgid_plural can contain subexpressions of the same form. */
1760 /* Extract messages until the next balanced closing parenthesis or bracket.
1761 Extracted messages are added to MLP.
1762 DELIM can be either token_type_rparen or token_type_rbracket, or
1763 token_type_eof to accept both.
1764 Return true upon eof, false upon closing parenthesis or bracket.
   OUTER_CONTEXT and the elements delivered by CONTEXT_ITER are combined
   with inherited_context to obtain the context of the current argument.
   ARGPARSER is given the string tokens seen at this nesting level
   (arglist_parser_remember) and is passed to arglist_parser_done when a
   closing delimiter or eof is seen, or when a nested call reports eof.
   Nested parentheses and brackets are handled by recursive calls. */
1766 extract_balanced (message_list_ty *mlp,
1767 token_type_ty delim,
1768 flag_context_ty outer_context,
1769 flag_context_list_iterator_ty context_iter,
1770 struct arglist_parser *argparser)
1772 /* Current argument number. */
1774 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1776 /* Parameters of the keyword just seen. Defined only in state 1. */
1777 const struct callshapes *next_shapes = NULL;
1778 /* Context iterator that will be used if the next token is a '('. */
1779 flag_context_list_iterator_ty next_context_iter =
1780 passthrough_context_list_iterator;
1781 /* Current context. */
1782 flag_context_ty inner_context =
1783 inherited_context (outer_context,
1784 flag_context_list_iterator_advance (&context_iter));
1786 /* Start state is 0. */
/* Fetch the next token through x_python_lex. */
1793 x_python_lex (&token);
/* A symbol: it is looked up in the keywords table, and the value found
   there is kept as NEXT_SHAPES for a following '('.  NOTE(review): the
   exact success test of hash_find_entry is on a line not visible here. */
1796 case token_type_symbol:
1798 void *keyword_value;
1800 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1804 next_shapes = (const struct callshapes *) keyword_value;
1811 flag_context_list_iterator (
1812 flag_context_list_table_lookup (
1813 flag_context_list_table,
1814 token.string, strlen (token.string)));
/* The symbol text has been used for the keywords lookup and the
   flag_context_list_table lookup above; release it. */
1815 free (token.string);
/* An opening parenthesis: recurse until the matching ')'.  The nested
   parser receives NEXT_SHAPES only when STATE is nonzero (right after a
   keyword), otherwise NULL. */
1818 case token_type_lparen:
1819 if (extract_balanced (mlp, token_type_rparen,
1820 inner_context, next_context_iter,
1821 arglist_parser_alloc (mlp,
1822 state ? next_shapes : NULL)))
/* The nested call returned true (eof): finish ARGPARSER.  The current
   source encoding is set to UTF-8 around arglist_parser_done and then
   restored to the per-file encoding. */
1824 xgettext_current_source_encoding = po_charset_utf8;
1825 arglist_parser_done (argparser, arg);
1826 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1829 next_context_iter = null_context_list_iterator;
/* A closing parenthesis ends this level when DELIM is token_type_rparen
   or token_type_eof (the latter accepts both kinds of closers). */
1833 case token_type_rparen:
1834 if (delim == token_type_rparen || delim == token_type_eof)
1836 xgettext_current_source_encoding = po_charset_utf8;
1837 arglist_parser_done (argparser, arg);
1838 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1841 next_context_iter = null_context_list_iterator;
/* A comma: the inner context is recomputed from OUTER_CONTEXT and the
   next element of the context iterator. */
1845 case token_type_comma:
1848 inherited_context (outer_context,
1849 flag_context_list_iterator_advance (
1851 next_context_iter = passthrough_context_list_iterator;
/* An opening bracket: recurse until the matching ']' with the null
   context, the null context iterator and a parser without call shapes. */
1855 case token_type_lbracket:
1856 if (extract_balanced (mlp, token_type_rbracket,
1857 null_context, null_context_list_iterator,
1858 arglist_parser_alloc (mlp, NULL)))
/* The nested call returned true (eof): finish ARGPARSER as above. */
1860 xgettext_current_source_encoding = po_charset_utf8;
1861 arglist_parser_done (argparser, arg);
1862 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1865 next_context_iter = null_context_list_iterator;
/* A closing bracket ends this level when DELIM is token_type_rbracket
   or token_type_eof. */
1869 case token_type_rbracket:
1870 if (delim == token_type_rbracket || delim == token_type_eof)
1872 xgettext_current_source_encoding = po_charset_utf8;
1873 arglist_parser_done (argparser, arg);
1874 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1877 next_context_iter = null_context_list_iterator;
/* A string token: its position is the logical file name plus the
   token's line number. */
1881 case token_type_string:
1884 pos.file_name = logical_file_name;
1885 pos.line_number = token.line_number;
1887 xgettext_current_source_encoding = po_charset_utf8;
/* The string is either stored directly with remember_a_message or handed
   to ARGPARSER with arglist_parser_remember.  NOTE(review): the condition
   selecting between the two is not visible in this excerpt; presumably
   it is the extract_all flag -- confirm. */
1889 remember_a_message (mlp, NULL, token.string, inner_context,
1890 &pos, NULL, token.comment);
1892 arglist_parser_remember (argparser, arg, token.string,
1894 pos.file_name, pos.line_number,
1896 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
/* Give up this token's reference to its comment. */
1898 drop_reference (token.comment);
1899 next_context_iter = null_context_list_iterator;
/* End of input: finish ARGPARSER with the encoding switched to UTF-8. */
1903 case token_type_eof:
1904 xgettext_current_source_encoding = po_charset_utf8;
1905 arglist_parser_done (argparser, arg);
1906 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
/* Plus and other tokens only reset the iterator that would be used for
   a following '('. */
1909 case token_type_plus:
1910 case token_type_other:
1911 next_context_iter = null_context_list_iterator;
/* Entry point of the Python extractor.
   Messages found are added to the message list of the first domain of
   MDLP (mdlp->item[0]->messages).
   REAL_FILENAME is stored in real_file_name and a copy of
   LOGICAL_FILENAME in logical_file_name for the duration of the scan.
   FLAG_TABLE becomes the flag_context_list_table used for context lookups.
   NOTE(review): the statement that makes F the current input stream is
   not visible in this excerpt. */
1923 extract_python (FILE *f,
1924 const char *real_filename, const char *logical_filename,
1925 flag_context_list_table_ty *flag_table,
1926 msgdomain_list_ty *mdlp)
1928 message_list_ty *mlp = mdlp->item[0]->messages;
/* Record the file names.  The logical name is duplicated with xstrdup;
   extract_balanced stores this pointer in pos.file_name, and the copy is
   not freed in the visible part of this function -- presumably because
   it must outlive the call; confirm. */
1931 real_file_name = real_filename;
1932 logical_file_name = xstrdup (logical_filename);
/* Reset the lexer state for a new file. */
1935 lexical_context = lc_outside;
1937 last_comment_line = -1;
1938 last_non_comment_line = -1;
/* The per-file source encoding and iconv descriptor start out as the
   global ones, and the current ones start out as the per-file ones. */
1940 xgettext_current_file_source_encoding = xgettext_global_source_encoding;
1942 xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1945 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1947 xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1950 continuation_or_nonblank_line = false;
1954 flag_context_list_table = flag_table;
1958 /* Eat tokens until eof is seen. When extract_balanced returns
1959 due to an unbalanced closing parenthesis, just restart it. */
1960 while (!extract_balanced (mlp, token_type_eof,
1961 null_context, null_context_list_iterator,
1962 arglist_parser_alloc (mlp, NULL)))
/* Clear the file name globals now that scanning is finished. */
1966 real_file_name = NULL;
1967 logical_file_name = NULL;