1 /* xgettext JavaScript backend.
2 Copyright (C) 2002-2003, 2005-2009, 2013 Free Software Foundation, Inc.
4 This file was written by Andreas Stricker <andy@knitter.ch>, 2010
5 It's based on x-python from Bruno Haible.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
25 #include "x-javascript.h"
37 #include "error-progname.h"
41 #include "xvasprintf.h"
45 #include "po-charset.h"
50 #define _(s) gettext(s)
/* Larger of two values.  Each argument is evaluated twice, so pass only
   expressions without side effects.  */
#define max(a,b) ((a) > (b) ? (a) : (b))

/* Number of elements of the array A.  A must be a true array, not a pointer.
   The argument is parenthesized before it is subscripted, so that an
   argument which is itself an expression (a cast, a dereference) is
   measured as a whole instead of having the subscript bind to its last
   operand.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
56 /* The JavaScript aka ECMA-Script syntax is defined in ECMA-262
58 http://www.ecma-international.org/publications/standards/Ecma-262.htm */
60 /* ====================== Keyword set customization. ====================== */
62 /* If true extract all strings. */
63 static bool extract_all = false;
65 static hash_table keywords;
66 static bool default_keywords = true;
70 x_javascript_extract_all ()
77 x_javascript_keyword (const char *name)
80 default_keywords = false;
84 struct callshape shape;
87 if (keywords.table == NULL)
88 hash_init (&keywords, 100);
90 split_keywordspec (name, &end, &shape);
92 /* The characters between name and end should form a valid C identifier.
93 A colon means an invalid parse in split_keywordspec(). */
94 colon = strchr (name, ':');
95 if (colon == NULL || colon >= end)
96 insert_keyword_callshape (&keywords, name, end - name, &shape);
100 /* Finish initializing the keywords hash table.
101 Called after argument processing, before each file is processed. */
105 if (default_keywords)
107 /* When adding new keywords here, also update the documentation in
109 x_javascript_keyword ("gettext");
110 x_javascript_keyword ("dgettext:2");
111 x_javascript_keyword ("dcgettext:2");
112 x_javascript_keyword ("ngettext:1,2");
113 x_javascript_keyword ("dngettext:2,3");
114 x_javascript_keyword ("pgettext:1c,2");
115 x_javascript_keyword ("dpgettext:2c,3");
116 x_javascript_keyword ("_");
117 default_keywords = false;
122 init_flag_table_javascript ()
124 xgettext_record_flag ("gettext:1:pass-javascript-format");
125 xgettext_record_flag ("dgettext:2:pass-javascript-format");
126 xgettext_record_flag ("dcgettext:2:pass-javascript-format");
127 xgettext_record_flag ("ngettext:1:pass-javascript-format");
128 xgettext_record_flag ("ngettext:2:pass-javascript-format");
129 xgettext_record_flag ("dngettext:2:pass-javascript-format");
130 xgettext_record_flag ("dngettext:3:pass-javascript-format");
131 xgettext_record_flag ("pgettext:2:pass-javascript-format");
132 xgettext_record_flag ("dpgettext:3:pass-javascript-format");
133 xgettext_record_flag ("_:1:pass-javascript-format");
137 /* ======================== Reading of characters. ======================== */
139 /* Real filename, used in error messages about the input file. */
140 static const char *real_file_name;
142 /* Logical filename and line number, used to label the extracted messages. */
143 static char *logical_file_name;
144 static int line_number;
146 /* The input file stream. */
150 /* 1. line_number handling. */
152 /* Maximum used, roughly a safer MB_LEN_MAX. */
153 #define MAX_PHASE1_PUSHBACK 16
154 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
155 static int phase1_pushback_length;
157 /* Read the next single byte from the input file. */
163 if (phase1_pushback_length)
164 c = phase1_pushback[--phase1_pushback_length];
172 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
184 /* Supports MAX_PHASE1_PUSHBACK characters of pushback. */
186 phase1_ungetc (int c)
193 if (phase1_pushback_length == SIZEOF (phase1_pushback))
195 phase1_pushback[phase1_pushback_length++] = c;
200 /* Phase 2: Conversion to Unicode.
201 For now, we expect JavaScript files to be encoded as UTF-8. */
203 /* End-of-file indicator for functions returning a UCS-4 character. */
206 static lexical_context_ty lexical_context;
208 static int phase2_pushback[max (9, UNINAME_MAX + 3)];
209 static int phase2_pushback_length;
211 /* Read the next Unicode UCS-4 character from the input file. */
215 if (phase2_pushback_length)
216 return phase2_pushback[--phase2_pushback_length];
218 if (xgettext_current_source_encoding == po_charset_ascii)
220 int c = phase1_getc ();
225 multiline_error (xstrdup (""),
226 xasprintf ("%s\n%s\n",
227 non_ascii_error_message (lexical_context,
231 Please specify the source encoding through --from-code\n")));
236 else if (xgettext_current_source_encoding != po_charset_utf8)
239 /* Use iconv on an increasing number of bytes. Read only as many bytes
240 through phase1_getc as needed. This is needed to give reasonable
241 interactive behaviour when fp is connected to an interactive tty. */
242 unsigned char buf[MAX_PHASE1_PUSHBACK];
244 int c = phase1_getc ();
247 buf[0] = (unsigned char) c;
252 unsigned char scratchbuf[6];
253 const char *inptr = (const char *) &buf[0];
254 size_t insize = bufcount;
255 char *outptr = (char *) &scratchbuf[0];
256 size_t outsize = sizeof (scratchbuf);
258 size_t res = iconv (xgettext_current_source_iconv,
259 (ICONV_CONST char **) &inptr, &insize,
261 /* We expect that a character has been produced if and only if
262 some input bytes have been consumed. */
263 if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
265 if (outsize == sizeof (scratchbuf))
267 /* No character has been produced. Must be an error. */
268 if (res != (size_t)(-1))
273 /* An invalid multibyte sequence was encountered. */
274 multiline_error (xstrdup (""),
276 %s:%d: Invalid multibyte sequence.\n\
277 Please specify the correct source encoding through --from-code\n"),
278 real_file_name, line_number));
281 else if (errno == EINVAL)
283 /* An incomplete multibyte character. */
286 if (bufcount == MAX_PHASE1_PUSHBACK)
288 /* An overlong incomplete multibyte sequence was
290 multiline_error (xstrdup (""),
292 %s:%d: Long incomplete multibyte sequence.\n\
293 Please specify the correct source encoding through --from-code\n"),
294 real_file_name, line_number));
298 /* Read one more byte and retry iconv. */
302 multiline_error (xstrdup (""),
304 %s:%d: Incomplete multibyte sequence at end of file.\n\
305 Please specify the correct source encoding through --from-code\n"),
306 real_file_name, line_number));
311 multiline_error (xstrdup (""),
313 %s:%d: Incomplete multibyte sequence at end of line.\n\
314 Please specify the correct source encoding through --from-code\n"),
315 real_file_name, line_number - 1));
318 buf[bufcount++] = (unsigned char) c;
321 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
322 real_file_name, line_number);
326 size_t outbytes = sizeof (scratchbuf) - outsize;
327 size_t bytes = bufcount - insize;
330 /* We expect that one character has been produced. */
335 /* Push back the unused bytes. */
337 phase1_ungetc (buf[--insize]);
338 /* Convert the character from UTF-8 to UCS-4. */
339 if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
341 /* scratchbuf contains an out-of-range Unicode character
343 multiline_error (xstrdup (""),
345 %s:%d: Invalid multibyte sequence.\n\
346 Please specify the source encoding through --from-code\n"),
347 real_file_name, line_number));
354 /* If we don't have iconv(), the only supported values for
355 xgettext_global_source_encoding and thus also for
356 xgettext_current_source_encoding are ASCII and UTF-8. */
362 /* Read a UTF-8 encoded character. */
363 unsigned char buf[6];
384 && ((buf[1] ^ 0x80) < 0x40))
394 && ((buf[1] ^ 0x80) < 0x40)
395 && ((buf[2] ^ 0x80) < 0x40))
405 && ((buf[1] ^ 0x80) < 0x40)
406 && ((buf[2] ^ 0x80) < 0x40)
407 && ((buf[3] ^ 0x80) < 0x40))
417 && ((buf[1] ^ 0x80) < 0x40)
418 && ((buf[2] ^ 0x80) < 0x40)
419 && ((buf[3] ^ 0x80) < 0x40)
420 && ((buf[4] ^ 0x80) < 0x40))
429 u8_mbtouc (&uc, buf, count);
434 /* Supports max (9, UNINAME_MAX + 3) pushback characters. */
436 phase2_ungetc (int c)
440 if (phase2_pushback_length == SIZEOF (phase2_pushback))
442 phase2_pushback[phase2_pushback_length++] = c;
447 /* ========================= Accumulating strings. ======================== */
449 /* A string buffer type that allows appending Unicode characters.
450 Returns the entire string in UTF-8 encoding. */
452 struct unicode_string_buffer
454 /* The part of the string that has already been converted to UTF-8. */
457 size_t utf8_allocated;
460 /* Initialize a 'struct unicode_string_buffer' to empty. */
462 init_unicode_string_buffer (struct unicode_string_buffer *bp)
464 bp->utf8_buffer = NULL;
466 bp->utf8_allocated = 0;
469 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
471 unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
474 if (bp->utf8_buflen + count > bp->utf8_allocated)
476 size_t new_allocated = 2 * bp->utf8_allocated + 10;
477 if (new_allocated < bp->utf8_buflen + count)
478 new_allocated = bp->utf8_buflen + count;
479 bp->utf8_allocated = new_allocated;
480 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
484 /* Auxiliary function: Append a Unicode character to bp->utf8.
485 uc must be < 0x110000. */
487 unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
490 unsigned char utf8buf[6];
491 int count = u8_uctomb (utf8buf, uc, 6);
494 /* The caller should have ensured that uc is not out-of-range. */
497 unicode_string_buffer_append_unicode_grow (bp, count);
498 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
499 bp->utf8_buflen += count;
502 /* Return the string buffer's contents. */
504 unicode_string_buffer_result (struct unicode_string_buffer *bp)
506 /* NUL-terminate it. */
507 unicode_string_buffer_append_unicode_grow (bp, 1);
508 bp->utf8_buffer[bp->utf8_buflen] = '\0';
510 return bp->utf8_buffer;
513 /* Free the memory pointed to by a 'struct unicode_string_buffer'. */
515 free_unicode_string_buffer (struct unicode_string_buffer *bp)
517 free (bp->utf8_buffer);
521 /* ======================== Accumulating comments. ======================== */
524 /* Accumulating a single comment line. */
526 static struct unicode_string_buffer comment_buffer;
531 lexical_context = lc_comment;
532 comment_buffer.utf8_buflen = 0;
538 return (comment_buffer.utf8_buflen == 0);
544 unicode_string_buffer_append_unicode (&comment_buffer, c);
547 static inline const char *
550 char *buffer = unicode_string_buffer_result (&comment_buffer);
551 size_t buflen = strlen (buffer);
554 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
556 buffer[buflen] = '\0';
557 savable_comment_add (buffer);
558 lexical_context = lc_outside;
563 /* These are for tracking whether comments count as immediately before
565 static int last_comment_line;
566 static int last_non_comment_line;
569 /* ======================== Recognizing comments. ======================== */
572 /* Canonicalized encoding name for the current input file. */
573 static const char *xgettext_current_file_source_encoding;
576 /* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
577 ASCII or UTF-8, when this conversion is a no-op). */
578 static iconv_t xgettext_current_file_source_iconv;
581 /* Tracking whether the current line is a continuation line or contains a
582 non-blank character. */
583 static bool continuation_or_nonblank_line = false;
586 /* Phase 3: Outside strings, replace backslash-newline with nothing and a
587 comment with nothing. */
603 /* This shouldn't happen usually, because "A backslash is
604 illegal elsewhere on a line outside a string literal." */
607 /* Eat backslash-newline. */
608 continuation_or_nonblank_line = true;
615 /* C++ style comment. */
616 last_comment_line = line_number;
621 if (c == UEOF || c == '\n')
623 /* We skip all leading white space, but not EOLs. */
624 if (!(comment_at_start () && (c == ' ' || c == '\t')))
627 continuation_or_nonblank_line = false;
632 /* C style comment. */
633 bool last_was_star = false;
634 last_comment_line = line_number;
641 /* We skip all leading white space, but not EOLs. */
642 if (!(comment_at_start () && (c == ' ' || c == '\t')))
647 comment_line_end (1);
649 last_was_star = false;
653 last_was_star = true;
658 comment_line_end (2);
664 last_was_star = false;
669 continuation_or_nonblank_line = false;
680 continuation_or_nonblank_line = false;
681 else if (!(c == ' ' || c == '\t' || c == '\f'))
682 continuation_or_nonblank_line = true;
688 /* Supports only one pushback character. */
690 phase3_ungetc (int c)
696 /* ========================= Accumulating strings. ======================== */
698 /* Return value of phase7_getuc when EOF is reached. */
700 #define P7_STRING_END (-2)
702 /* Convert a UTF-16 or UTF-32 code point to a return value that can be
703 distinguished from a single-byte return value. */
704 #define UNICODE(code) (0x100 + (code))
706 /* Test whether a return value of phase7_getuc designates a UTF-16 or
707 UTF-32 code point. */
708 #define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
710 /* Extract the UTF-16 or UTF-32 code of a return value that satisfies
712 #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
714 /* A string buffer type that allows appending bytes (in the
715 xgettext_current_source_encoding) or Unicode characters.
716 Returns the entire string in UTF-8 encoding. */
718 struct mixed_string_buffer
720 /* The part of the string that has already been converted to UTF-8. */
723 size_t utf8_allocated;
724 /* The first half (high surrogate) of a UTF-16 surrogate pair. */
725 unsigned short utf16_surr;
726 /* The part of the string that is still in the source encoding. */
729 size_t curr_allocated;
730 /* The lexical context. Used only for error message purposes. */
731 lexical_context_ty lcontext;
734 /* Initialize a 'struct mixed_string_buffer' to empty. */
736 init_mixed_string_buffer (struct mixed_string_buffer *bp, lexical_context_ty lcontext)
738 bp->utf8_buffer = NULL;
740 bp->utf8_allocated = 0;
742 bp->curr_buffer = NULL;
744 bp->curr_allocated = 0;
745 bp->lcontext = lcontext;
748 /* Auxiliary function: Append a byte to bp->curr. */
750 mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c)
752 if (bp->curr_buflen == bp->curr_allocated)
754 bp->curr_allocated = 2 * bp->curr_allocated + 10;
755 bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
757 bp->curr_buffer[bp->curr_buflen++] = c;
760 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
762 mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count)
764 if (bp->utf8_buflen + count > bp->utf8_allocated)
766 size_t new_allocated = 2 * bp->utf8_allocated + 10;
767 if (new_allocated < bp->utf8_buflen + count)
768 new_allocated = bp->utf8_buflen + count;
769 bp->utf8_allocated = new_allocated;
770 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
774 /* Auxiliary function: Append a Unicode character to bp->utf8.
775 uc must be < 0x110000. */
777 mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, ucs4_t uc)
779 unsigned char utf8buf[6];
780 int count = u8_uctomb (utf8buf, uc, 6);
783 /* The caller should have ensured that uc is not out-of-range. */
786 mixed_string_buffer_append_unicode_grow (bp, count);
787 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
788 bp->utf8_buflen += count;
791 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */
793 mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
795 if (bp->utf16_surr != 0)
797 /* A half surrogate is invalid, therefore use U+FFFD instead. */
798 mixed_string_buffer_append_unicode (bp, 0xfffd);
803 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */
805 mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno)
807 if (bp->curr_buflen > 0)
812 mixed_string_buffer_append_byte (bp, '\0');
814 /* Convert from the source encoding to UTF-8. */
815 curr = from_current_source_encoding (bp->curr_buffer, bp->lcontext,
816 logical_file_name, lineno);
818 /* Append it to bp->utf8_buffer. */
819 count = strlen (curr);
820 mixed_string_buffer_append_unicode_grow (bp, count);
821 memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
822 bp->utf8_buflen += count;
824 if (curr != bp->curr_buffer)
830 /* Append a character or Unicode character to a 'struct mixed_string_buffer'. */
832 mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
836 /* Append a Unicode character. */
838 /* Switch from multibyte character mode to Unicode character mode. */
839 mixed_string_buffer_flush_curr_buffer (bp, line_number);
841 /* Test whether this character and the previous one form a Unicode
842 surrogate character pair. */
843 if (bp->utf16_surr != 0
844 && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
846 unsigned short utf16buf[2];
849 utf16buf[0] = bp->utf16_surr;
850 utf16buf[1] = UNICODE_VALUE (c);
851 if (u16_mbtouc (&uc, utf16buf, 2) != 2)
854 mixed_string_buffer_append_unicode (bp, uc);
859 mixed_string_buffer_flush_utf16_surr (bp);
861 if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
862 bp->utf16_surr = UNICODE_VALUE (c);
863 else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
865 /* A half surrogate is invalid, therefore use U+FFFD instead. */
866 mixed_string_buffer_append_unicode (bp, 0xfffd);
869 mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c));
874 /* Append a single byte. */
876 /* Switch from Unicode character mode to multibyte character mode. */
877 mixed_string_buffer_flush_utf16_surr (bp);
879 /* When a newline is seen, convert the accumulated multibyte sequence.
880 This ensures a correct line number in the error message in case of
881 a conversion error. The "- 1" is to account for the newline. */
883 mixed_string_buffer_flush_curr_buffer (bp, line_number - 1);
885 mixed_string_buffer_append_byte (bp, (unsigned char) c);
889 /* Return the string buffer's contents. */
891 mixed_string_buffer_result (struct mixed_string_buffer *bp)
893 /* Flush all into bp->utf8_buffer. */
894 mixed_string_buffer_flush_utf16_surr (bp);
895 mixed_string_buffer_flush_curr_buffer (bp, line_number);
896 /* NUL-terminate it. */
897 mixed_string_buffer_append_unicode_grow (bp, 1);
898 bp->utf8_buffer[bp->utf8_buflen] = '\0';
900 return bp->utf8_buffer;
903 /* Free the memory pointed to by a 'struct mixed_string_buffer'. */
905 free_mixed_string_buffer (struct mixed_string_buffer *bp)
907 free (bp->utf8_buffer);
908 free (bp->curr_buffer);
912 /* ========================== Reading of tokens. ========================== */
918 token_type_lparen, /* ( */
919 token_type_rparen, /* ) */
920 token_type_comma, /* , */
921 token_type_lbracket, /* [ */
922 token_type_rbracket, /* ] */
923 token_type_plus, /* + */
924 token_type_regexp, /* /.../ */
925 token_type_operator, /* - * / % . < > = ~ ! | & ? : ^ */
926 token_type_string, /* "abc", 'abc' */
927 token_type_keyword, /* return, else */
928 token_type_symbol, /* symbol, number */
929 token_type_other /* misc. operator */
931 typedef enum token_type_ty token_type_ty;
933 typedef struct token_ty token_ty;
937 char *string; /* for token_type_string, token_type_symbol,
938 token_type_keyword */
939 refcounted_string_list_ty *comment; /* for token_type_string */
944 /* Free the memory pointed to by a 'struct token_ty'.
   TP is the token whose owned members are released; the 'comment'
   reference is dropped for string tokens only.
   NOTE(review): the type test below covers token_type_string and
   token_type_symbol, but the token struct documents 'string' as also
   used for token_type_keyword, and phase5_get stores an xstrdup'ed copy
   in tp->string before classifying an identifier as keyword or symbol.
   A keyword token therefore does not appear to be covered here --
   confirm whether its string is released elsewhere. */
946 free_token (token_ty *tp)
948 if (tp->type == token_type_string || tp->type == token_type_symbol)
950 if (tp->type == token_type_string)
951 drop_reference (tp->comment);
955 /* JavaScript provides strings with either double or single quotes:
957 Both may contain special sequences after a backslash:
958 \', \", \\, \b, \f, \n, \r, \t, \v
959 Special characters can be entered using hexadecimal escape
960 sequences or deprecated octal escape sequences:
962 Any unicode point can be entered using Unicode escape sequences:
964 If a sequence after a backslash is not a legitimate character
965 escape sequence, the character value is the sequence itself without
966 a backslash. For example, \xxx is treated as xxx. */
969 phase7_getuc (int quote_char)
975 /* Use phase 2, because phase 3 elides comments. */
982 return P7_STRING_END;
987 error_with_progname = false;
988 error (0, 0, _("%s:%d: warning: unterminated string"),
989 logical_file_name, line_number);
990 error_with_progname = true;
991 return P7_STRING_END;
997 /* Dispatch according to the character following the backslash. */
1007 return UNICODE ('\b');
1009 return UNICODE ('\f');
1011 return UNICODE ('\n');
1013 return UNICODE ('\r');
1015 return UNICODE ('\t');
1017 return UNICODE ('\v');
1018 case '0': case '1': case '2': case '3': case '4':
1019 case '5': case '6': case '7':
1026 if (c >= '0' && c <= '7')
1028 n = (n << 3) + (c - '0');
1032 if (c >= '0' && c <= '7')
1033 n = (n << 3) + (c - '0');
1045 int c1 = phase2_getc ();
1048 if (c1 >= '0' && c1 <= '9')
1050 else if (c1 >= 'A' && c1 <= 'F')
1052 else if (c1 >= 'a' && c1 <= 'f')
1059 int c2 = phase2_getc ();
1062 if (c2 >= '0' && c2 <= '9')
1064 else if (c2 >= 'A' && c2 <= 'F')
1066 else if (c2 >= 'a' && c2 <= 'f')
1073 int n = (n1 << 4) + n2;
1084 unsigned char buf[4];
1088 for (i = 0; i < 4; i++)
1090 int c1 = phase2_getc ();
1092 if (c1 >= '0' && c1 <= '9')
1093 n = (n << 4) + (c1 - '0');
1094 else if (c1 >= 'A' && c1 <= 'F')
1095 n = (n << 4) + (c1 - 'A' + 10);
1096 else if (c1 >= 'a' && c1 <= 'f')
1097 n = (n << 4) + (c1 - 'a' + 10);
1102 phase2_ungetc (buf[i]);
1117 /* Combine characters into tokens. Discard whitespace except newlines at
1118 the end of logical lines. */
1120 static token_ty phase5_pushback[2];
1121 static int phase5_pushback_length;
1123 static token_type_ty last_token_type = token_type_other;
1126 phase5_scan_regexp ()
1130 /* Scan for end of RegExp literal ('/').
   NOTE(review): this function is declared with an empty parameter list,
   while the call in phase5_get passes the token pointer -- confirm the
   intended signature. */
1133 /* Must use phase2 as there can't be comments. */
1145 error_with_progname = false;
1147 _("%s:%d: warning: RegExp literal terminated too early"),
1148 logical_file_name, line_number);
1149 error_with_progname = true;
1154 /* Scan for modifier flags (ECMA-262 5th section 15.10.4.1).
   NOTE(review): only 'g', 'i' and 'm' are accepted by the test below;
   later ECMAScript editions define further flags (e.g. 'u', 'y', 's')
   -- confirm whether those should be consumed as well. */
1156 if (!(c == 'g' || c == 'i' || c == 'm'))
1161 phase5_get (token_ty *tp)
1165 if (phase5_pushback_length)
1167 *tp = phase5_pushback[--phase5_pushback_length];
1168 last_token_type = tp->type;
1174 tp->line_number = line_number;
1180 tp->type = last_token_type = token_type_eof;
1184 if (last_non_comment_line > last_comment_line)
1185 savable_comment_reset ();
1190 /* Ignore whitespace and comments. */
1194 last_non_comment_line = tp->line_number;
1200 int c1 = phase3_getc ();
1202 if (!(c1 >= '0' && c1 <= '9'))
1205 tp->type = last_token_type = token_type_other;
1210 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1211 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1212 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1213 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1216 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1217 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1218 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1219 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1221 case '0': case '1': case '2': case '3': case '4':
1222 case '5': case '6': case '7': case '8': case '9':
1223 /* Symbol, or part of a number. */
1225 static char *buffer;
1232 if (bufpos >= bufmax)
1234 bufmax = 2 * bufmax + 10;
1235 buffer = xrealloc (buffer, bufmax);
1237 buffer[bufpos++] = c;
1241 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1242 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1243 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1244 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1247 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1248 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1249 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1250 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1252 case '0': case '1': case '2': case '3': case '4':
1253 case '5': case '6': case '7': case '8': case '9':
1261 if (bufpos >= bufmax)
1263 bufmax = 2 * bufmax + 10;
1264 buffer = xrealloc (buffer, bufmax);
1266 buffer[bufpos] = '\0';
1267 tp->string = xstrdup (buffer);
1268 if (strcmp (buffer, "return") == 0
1269 || strcmp (buffer, "else") == 0)
1270 tp->type = last_token_type = token_type_keyword;
1272 tp->type = last_token_type = token_type_symbol;
1278 struct mixed_string_buffer literal;
1281 case '"': case '\'':
1283 lexical_context = lc_string;
1284 /* Start accumulating the string. */
1285 init_mixed_string_buffer (&literal, lc_string);
1288 int uc = phase7_getuc (quote_char);
1290 if (uc == P7_EOF || uc == P7_STRING_END)
1293 if (IS_UNICODE (uc))
1294 assert (UNICODE_VALUE (uc) >= 0
1295 && UNICODE_VALUE (uc) < 0x110000);
1297 mixed_string_buffer_append (&literal, uc);
1299 tp->string = xstrdup (mixed_string_buffer_result (&literal));
1300 free_mixed_string_buffer (&literal);
1301 tp->comment = add_reference (savable_comment);
1302 lexical_context = lc_outside;
1303 tp->type = last_token_type = token_type_string;
1308 tp->type = last_token_type = token_type_plus;
1311 /* Identify operators. The multiple character ones are simply ignored
1312 * as they are recognized here and are otherwise not relevant. */
1313 case '-': case '*': /* '+' and '/' are not listed here! */
1314 case '%': case '<': case '>': case '=':
1315 case '~': case '!': case '|': case '&': case '^':
1317 tp->type = last_token_type = token_type_operator;
1321 /* Either a division operator or the start of a regular
1322 expression literal. If the '/' token is spotted after a
1323 symbol it's a division, otherwise it's a regular
1325 if (last_token_type == token_type_symbol
1326 || last_token_type == token_type_rparen
1327 || last_token_type == token_type_rbracket)
1328 tp->type = last_token_type = token_type_operator;
1331 phase5_scan_regexp (tp);
1332 tp->type = last_token_type = token_type_regexp;
1337 tp->type = last_token_type = token_type_lparen;
1341 tp->type = last_token_type = token_type_rparen;
1345 tp->type = last_token_type = token_type_comma;
1349 tp->type = last_token_type = token_type_lbracket;
1353 tp->type = last_token_type = token_type_rbracket;
1357 /* We could carefully recognize each of the 2 and 3 character
1358 operators, but it is not necessary, as we only need to recognize
1359 gettext invocations. Don't bother. */
1360 tp->type = last_token_type = token_type_other;
1366 /* Supports up to 2 tokens of pushback (the size of phase5_pushback). */
1368 phase5_unget (token_ty *tp)
1370 if (tp->type != token_type_eof)
1372 if (phase5_pushback_length == SIZEOF (phase5_pushback))
1374 phase5_pushback[phase5_pushback_length++] = *tp;
1379 /* String concatenation with '+'. */
1382 x_javascript_lex (token_ty *tp)
1385 if (tp->type == token_type_string)
1387 char *sum = tp->string;
1388 size_t sum_len = strlen (sum);
1394 phase5_get (&token2);
1395 if (token2.type == token_type_plus)
1399 phase5_get (&token3);
1400 if (token3.type == token_type_string)
1402 char *addend = token3.string;
1403 size_t addend_len = strlen (addend);
1405 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1406 memcpy (sum + sum_len, addend, addend_len + 1);
1407 sum_len += addend_len;
1409 free_token (&token3);
1410 free_token (&token2);
1413 phase5_unget (&token3);
1415 phase5_unget (&token2);
1423 /* ========================= Extracting strings. ========================== */
1426 /* Context lookup table. */
1427 static flag_context_list_table_ty *flag_context_list_table;
1430 /* The file is broken into tokens. Scan the token stream, looking for
1431 a keyword, followed by a left paren, followed by a string. When we
1432 see this sequence, we have something to remember. We assume we are
1433 looking at a valid JavaScript program, and leave the complaints about
1434 the grammar to the compiler.
1436 Normal handling: Look for
1437 keyword ( ... msgid ... )
1438 Plural handling: Look for
1439 keyword ( ... msgid ... msgid_plural ... )
1441 We use recursion because the arguments before msgid or between msgid
1442 and msgid_plural can contain subexpressions of the same form. */
1445 /* Extract messages until the next balanced closing parenthesis or bracket.
1446 Extracted messages are added to MLP.
1447 DELIM can be either token_type_rparen or token_type_rbracket, or
1448 token_type_eof to accept both.
1449 Return true upon eof, false upon closing parenthesis or bracket. */
1451 extract_balanced (message_list_ty *mlp,
1452 token_type_ty delim,
1453 flag_context_ty outer_context,
1454 flag_context_list_iterator_ty context_iter,
1455 struct arglist_parser *argparser)
1457 /* Current argument number. */
1459 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1461 /* Parameters of the keyword just seen. Defined only in state 1. */
1462 const struct callshapes *next_shapes = NULL;
1463 /* Context iterator that will be used if the next token is a '('. */
1464 flag_context_list_iterator_ty next_context_iter =
1465 passthrough_context_list_iterator;
1466 /* Current context. */
1467 flag_context_ty inner_context =
1468 inherited_context (outer_context,
1469 flag_context_list_iterator_advance (&context_iter));
1471 /* Start state is 0. */
1478 x_javascript_lex (&token);
1481 case token_type_symbol:
1483 void *keyword_value;
1485 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1489 next_shapes = (const struct callshapes *) keyword_value;
1496 flag_context_list_iterator (
1497 flag_context_list_table_lookup (
1498 flag_context_list_table,
1499 token.string, strlen (token.string)));
1500 free (token.string);
1503 case token_type_lparen:
1504 if (extract_balanced (mlp, token_type_rparen,
1505 inner_context, next_context_iter,
1506 arglist_parser_alloc (mlp,
1507 state ? next_shapes : NULL)))
1509 xgettext_current_source_encoding = po_charset_utf8;
1510 arglist_parser_done (argparser, arg);
1511 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1514 next_context_iter = null_context_list_iterator;
1518 case token_type_rparen:
1519 if (delim == token_type_rparen || delim == token_type_eof)
1521 xgettext_current_source_encoding = po_charset_utf8;
1522 arglist_parser_done (argparser, arg);
1523 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1526 next_context_iter = null_context_list_iterator;
1530 case token_type_comma:
1533 inherited_context (outer_context,
1534 flag_context_list_iterator_advance (
1536 next_context_iter = passthrough_context_list_iterator;
1540 case token_type_lbracket:
1541 if (extract_balanced (mlp, token_type_rbracket,
1542 null_context, null_context_list_iterator,
1543 arglist_parser_alloc (mlp, NULL)))
1545 xgettext_current_source_encoding = po_charset_utf8;
1546 arglist_parser_done (argparser, arg);
1547 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1550 next_context_iter = null_context_list_iterator;
1554 case token_type_rbracket:
1555 if (delim == token_type_rbracket || delim == token_type_eof)
1557 xgettext_current_source_encoding = po_charset_utf8;
1558 arglist_parser_done (argparser, arg);
1559 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1562 next_context_iter = null_context_list_iterator;
1566 case token_type_string:
1569 pos.file_name = logical_file_name;
1570 pos.line_number = token.line_number;
1572 xgettext_current_source_encoding = po_charset_utf8;
1574 remember_a_message (mlp, NULL, token.string, inner_context,
1575 &pos, NULL, token.comment);
1577 arglist_parser_remember (argparser, arg, token.string,
1579 pos.file_name, pos.line_number,
1581 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1583 drop_reference (token.comment);
1584 next_context_iter = null_context_list_iterator;
1588 case token_type_eof:
1589 xgettext_current_source_encoding = po_charset_utf8;
1590 arglist_parser_done (argparser, arg);
1591 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1594 case token_type_keyword:
1595 case token_type_plus:
1596 case token_type_regexp:
1597 case token_type_operator:
1598 case token_type_other:
1599 next_context_iter = null_context_list_iterator;
1611 extract_javascript (FILE *f,
1612 const char *real_filename, const char *logical_filename,
1613 flag_context_list_table_ty *flag_table,
1614 msgdomain_list_ty *mdlp)
1616 message_list_ty *mlp = mdlp->item[0]->messages;
1619 real_file_name = real_filename;
1620 logical_file_name = xstrdup (logical_filename);
1623 lexical_context = lc_outside;
1625 last_comment_line = -1;
1626 last_non_comment_line = -1;
1628 xgettext_current_file_source_encoding = xgettext_global_source_encoding;
1630 xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1633 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1635 xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1638 continuation_or_nonblank_line = false;
1640 flag_context_list_table = flag_table;
1644 /* Eat tokens until eof is seen. When extract_balanced returns
1645 due to an unbalanced closing parenthesis, just restart it. */
1646 while (!extract_balanced (mlp, token_type_eof,
1647 null_context, null_context_list_iterator,
1648 arglist_parser_alloc (mlp, NULL)))
1652 real_file_name = NULL;
1653 logical_file_name = NULL;