1 /* xgettext JavaScript backend.
2 Copyright (C) 2002-2003, 2005-2009, 2013, 2015 Free Software
5 This file was written by Andreas Stricker <andy@knitter.ch>, 2010
6 It's based on x-python from Bruno Haible.
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 #include "x-javascript.h"
38 #include "error-progname.h"
42 #include "xvasprintf.h"
46 #include "po-charset.h"
50 #define _(s) gettext(s)
52 #define max(a,b) ((a) > (b) ? (a) : (b))
/* Number of elements in the array A.  A must be a true array, not a pointer.
   The parameter is parenthesized so that any array-valued expression can be
   passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
56 /* The JavaScript aka ECMA-Script syntax is defined in ECMA-262
58 http://www.ecma-international.org/publications/standards/Ecma-262.htm */
60 /* ====================== Keyword set customization. ====================== */
62 /* If true extract all strings. */
63 static bool extract_all = false;
65 static hash_table keywords;
66 static bool default_keywords = true;
70 x_javascript_extract_all ()
77 x_javascript_keyword (const char *name)
80 default_keywords = false;
84 struct callshape shape;
87 if (keywords.table == NULL)
88 hash_init (&keywords, 100);
90 split_keywordspec (name, &end, &shape);
92 /* The characters between name and end should form a valid C identifier.
93 A colon means an invalid parse in split_keywordspec(). */
94 colon = strchr (name, ':');
95 if (colon == NULL || colon >= end)
96 insert_keyword_callshape (&keywords, name, end - name, &shape);
100 /* Finish initializing the keywords hash table.
101 Called after argument processing, before each file is processed. */
105 if (default_keywords)
107 /* When adding new keywords here, also update the documentation in
109 x_javascript_keyword ("gettext");
110 x_javascript_keyword ("dgettext:2");
111 x_javascript_keyword ("dcgettext:2");
112 x_javascript_keyword ("ngettext:1,2");
113 x_javascript_keyword ("dngettext:2,3");
114 x_javascript_keyword ("pgettext:1c,2");
115 x_javascript_keyword ("dpgettext:2c,3");
116 x_javascript_keyword ("_");
117 default_keywords = false;
122 init_flag_table_javascript ()
124 xgettext_record_flag ("gettext:1:pass-javascript-format");
125 xgettext_record_flag ("dgettext:2:pass-javascript-format");
126 xgettext_record_flag ("dcgettext:2:pass-javascript-format");
127 xgettext_record_flag ("ngettext:1:pass-javascript-format");
128 xgettext_record_flag ("ngettext:2:pass-javascript-format");
129 xgettext_record_flag ("dngettext:2:pass-javascript-format");
130 xgettext_record_flag ("dngettext:3:pass-javascript-format");
131 xgettext_record_flag ("pgettext:2:pass-javascript-format");
132 xgettext_record_flag ("dpgettext:3:pass-javascript-format");
133 xgettext_record_flag ("_:1:pass-javascript-format");
137 /* ======================== Reading of characters. ======================== */
139 /* Real filename, used in error messages about the input file. */
140 static const char *real_file_name;
142 /* Logical filename and line number, used to label the extracted messages. */
143 static char *logical_file_name;
144 static int line_number;
146 /* The input file stream. */
150 /* 1. line_number handling. */
152 /* Maximum used, roughly a safer MB_LEN_MAX. */
153 #define MAX_PHASE1_PUSHBACK 16
154 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
155 static int phase1_pushback_length;
157 /* Read the next single byte from the input file. */
163 if (phase1_pushback_length)
164 c = phase1_pushback[--phase1_pushback_length];
172 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
184 /* Supports MAX_PHASE1_PUSHBACK characters of pushback. */
186 phase1_ungetc (int c)
193 if (phase1_pushback_length == SIZEOF (phase1_pushback))
195 phase1_pushback[phase1_pushback_length++] = c;
200 /* Phase 2: Conversion to Unicode.
201 For now, we expect JavaScript files to be encoded as UTF-8. */
203 /* End-of-file indicator for functions returning a UCS-4 character. */
206 static lexical_context_ty lexical_context;
208 /* Maximum used, length of "<![CDATA[" tag minus one. */
209 static int phase2_pushback[8];
210 static int phase2_pushback_length;
212 /* Read the next Unicode UCS-4 character from the input file. */
216 if (phase2_pushback_length)
217 return phase2_pushback[--phase2_pushback_length];
219 if (xgettext_current_source_encoding == po_charset_ascii)
221 int c = phase1_getc ();
226 multiline_error (xstrdup (""),
227 xasprintf ("%s\n%s\n",
228 non_ascii_error_message (lexical_context,
232 Please specify the source encoding through --from-code\n")));
237 else if (xgettext_current_source_encoding != po_charset_utf8)
240 /* Use iconv on an increasing number of bytes. Read only as many bytes
241 through phase1_getc as needed. This is needed to give reasonable
242 interactive behaviour when fp is connected to an interactive tty. */
243 unsigned char buf[MAX_PHASE1_PUSHBACK];
245 int c = phase1_getc ();
248 buf[0] = (unsigned char) c;
253 unsigned char scratchbuf[6];
254 const char *inptr = (const char *) &buf[0];
255 size_t insize = bufcount;
256 char *outptr = (char *) &scratchbuf[0];
257 size_t outsize = sizeof (scratchbuf);
259 size_t res = iconv (xgettext_current_source_iconv,
260 (ICONV_CONST char **) &inptr, &insize,
262 /* We expect that a character has been produced if and only if
263 some input bytes have been consumed. */
264 if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
266 if (outsize == sizeof (scratchbuf))
268 /* No character has been produced. Must be an error. */
269 if (res != (size_t)(-1))
274 /* An invalid multibyte sequence was encountered. */
275 multiline_error (xstrdup (""),
277 %s:%d: Invalid multibyte sequence.\n\
278 Please specify the correct source encoding through --from-code\n"),
279 real_file_name, line_number));
282 else if (errno == EINVAL)
284 /* An incomplete multibyte character. */
287 if (bufcount == MAX_PHASE1_PUSHBACK)
289 /* An overlong incomplete multibyte sequence was
291 multiline_error (xstrdup (""),
293 %s:%d: Long incomplete multibyte sequence.\n\
294 Please specify the correct source encoding through --from-code\n"),
295 real_file_name, line_number));
299 /* Read one more byte and retry iconv. */
303 multiline_error (xstrdup (""),
305 %s:%d: Incomplete multibyte sequence at end of file.\n\
306 Please specify the correct source encoding through --from-code\n"),
307 real_file_name, line_number));
312 multiline_error (xstrdup (""),
314 %s:%d: Incomplete multibyte sequence at end of line.\n\
315 Please specify the correct source encoding through --from-code\n"),
316 real_file_name, line_number - 1));
319 buf[bufcount++] = (unsigned char) c;
322 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
323 real_file_name, line_number);
327 size_t outbytes = sizeof (scratchbuf) - outsize;
328 size_t bytes = bufcount - insize;
331 /* We expect that one character has been produced. */
336 /* Push back the unused bytes. */
338 phase1_ungetc (buf[--insize]);
339 /* Convert the character from UTF-8 to UCS-4. */
340 if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
342 /* scratchbuf contains an out-of-range Unicode character
344 multiline_error (xstrdup (""),
346 %s:%d: Invalid multibyte sequence.\n\
347 Please specify the source encoding through --from-code\n"),
348 real_file_name, line_number));
355 /* If we don't have iconv(), the only supported values for
356 xgettext_global_source_encoding and thus also for
357 xgettext_current_source_encoding are ASCII and UTF-8. */
363 /* Read an UTF-8 encoded character. */
364 unsigned char buf[6];
385 && ((buf[1] ^ 0x80) < 0x40))
395 && ((buf[1] ^ 0x80) < 0x40)
396 && ((buf[2] ^ 0x80) < 0x40))
406 && ((buf[1] ^ 0x80) < 0x40)
407 && ((buf[2] ^ 0x80) < 0x40)
408 && ((buf[3] ^ 0x80) < 0x40))
418 && ((buf[1] ^ 0x80) < 0x40)
419 && ((buf[2] ^ 0x80) < 0x40)
420 && ((buf[3] ^ 0x80) < 0x40)
421 && ((buf[4] ^ 0x80) < 0x40))
430 u8_mbtouc (&uc, buf, count);
435 /* Supports up to SIZEOF (phase2_pushback), i.e. 8, pushback characters. */
437 phase2_ungetc (int c)
441 if (phase2_pushback_length == SIZEOF (phase2_pushback))
443 phase2_pushback[phase2_pushback_length++] = c;
448 /* ========================= Accumulating strings. ======================== */
450 /* A string buffer type that allows appending Unicode characters.
451 Returns the entire string in UTF-8 encoding. */
453 struct unicode_string_buffer
455 /* The part of the string that has already been converted to UTF-8. */
458 size_t utf8_allocated;
461 /* Initialize a 'struct unicode_string_buffer' to empty. */
463 init_unicode_string_buffer (struct unicode_string_buffer *bp)
465 bp->utf8_buffer = NULL;
467 bp->utf8_allocated = 0;
470 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
472 unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
475 if (bp->utf8_buflen + count > bp->utf8_allocated)
477 size_t new_allocated = 2 * bp->utf8_allocated + 10;
478 if (new_allocated < bp->utf8_buflen + count)
479 new_allocated = bp->utf8_buflen + count;
480 bp->utf8_allocated = new_allocated;
481 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
485 /* Auxiliary function: Append a Unicode character to bp->utf8.
486 uc must be < 0x110000. */
488 unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
491 unsigned char utf8buf[6];
492 int count = u8_uctomb (utf8buf, uc, 6);
495 /* The caller should have ensured that uc is not out-of-range. */
498 unicode_string_buffer_append_unicode_grow (bp, count);
499 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
500 bp->utf8_buflen += count;
503 /* Return the string buffer's contents. */
505 unicode_string_buffer_result (struct unicode_string_buffer *bp)
507 /* NUL-terminate it. */
508 unicode_string_buffer_append_unicode_grow (bp, 1);
509 bp->utf8_buffer[bp->utf8_buflen] = '\0';
511 return bp->utf8_buffer;
514 /* Free the memory pointed to by a 'struct unicode_string_buffer'. */
516 free_unicode_string_buffer (struct unicode_string_buffer *bp)
518 free (bp->utf8_buffer);
522 /* ======================== Accumulating comments. ======================== */
525 /* Accumulating a single comment line. */
527 static struct unicode_string_buffer comment_buffer;
532 lexical_context = lc_comment;
533 comment_buffer.utf8_buflen = 0;
539 return (comment_buffer.utf8_buflen == 0);
545 unicode_string_buffer_append_unicode (&comment_buffer, c);
548 static inline const char *
549 comment_line_end (size_t chars_to_remove)
551 char *buffer = unicode_string_buffer_result (&comment_buffer);
552 size_t buflen = strlen (buffer) - chars_to_remove;
555 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
557 buffer[buflen] = '\0';
558 savable_comment_add (buffer);
559 lexical_context = lc_outside;
564 /* These are for tracking whether comments count as immediately before
566 static int last_comment_line;
567 static int last_non_comment_line;
570 /* ======================== Recognizing comments. ======================== */
573 /* Canonicalized encoding name for the current input file. */
574 static const char *xgettext_current_file_source_encoding;
577 /* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
578 ASCII or UTF-8, when this conversion is a no-op). */
579 static iconv_t xgettext_current_file_source_iconv;
582 /* Tracking whether the current line is a continuation line or contains a
583 non-blank character. */
584 static bool continuation_or_nonblank_line = false;
587 /* Phase 3: Outside strings, replace backslash-newline with nothing and a
588 comment with nothing. */
604 /* This shouldn't happen usually, because "A backslash is
605 illegal elsewhere on a line outside a string literal." */
608 /* Eat backslash-newline. */
609 continuation_or_nonblank_line = true;
616 /* C++ style comment. */
617 last_comment_line = line_number;
622 if (c == UEOF || c == '\n')
624 comment_line_end (0);
627 /* We skip all leading white space, but not EOLs. */
628 if (!(comment_at_start () && (c == ' ' || c == '\t')))
631 continuation_or_nonblank_line = false;
636 /* C style comment. */
637 bool last_was_star = false;
638 last_comment_line = line_number;
645 /* We skip all leading white space, but not EOLs. */
646 if (!(comment_at_start () && (c == ' ' || c == '\t')))
651 comment_line_end (1);
653 last_was_star = false;
657 last_was_star = true;
662 comment_line_end (2);
668 last_was_star = false;
673 continuation_or_nonblank_line = false;
684 continuation_or_nonblank_line = false;
685 else if (!(c == ' ' || c == '\t' || c == '\f'))
686 continuation_or_nonblank_line = true;
692 /* Supports only one pushback character. */
694 phase3_ungetc (int c)
700 /* ========================= Accumulating strings. ======================== */
702 /* Return value of phase7_getuc when EOF is reached. */
704 #define P7_STRING_END (-2)
706 /* Convert a UTF-16 or UTF-32 code point to a return value that can be
707 distinguished from a single-byte return value. */
708 #define UNICODE(code) (0x100 + (code))
710 /* Test whether a return value of phase7_getuc designates a UTF-16 or
711 UTF-32 code point. */
712 #define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
714 /* Extract the UTF-16 or UTF-32 code of a return value that satisfies
716 #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
719 /* ========================== Reading of tokens. ========================== */
725 token_type_lparen, /* ( */
726 token_type_rparen, /* ) */
727 token_type_comma, /* , */
728 token_type_lbracket, /* [ */
729 token_type_rbracket, /* ] */
730 token_type_plus, /* + */
731 token_type_regexp, /* /.../ */
732 token_type_operator, /* - * / % . < > = ~ ! | & ? : ^ */
733 token_type_equal, /* = */
734 token_type_string, /* "abc", 'abc' */
735 token_type_keyword, /* return, else */
736 token_type_symbol, /* symbol, number */
737 token_type_other /* misc. operator */
739 typedef enum token_type_ty token_type_ty;
741 typedef struct token_ty token_ty;
745 char *string; /* for token_type_string, token_type_symbol,
746 token_type_keyword */
747 refcounted_string_list_ty *comment; /* for token_type_string */
752 /* Free the memory pointed to by a 'struct token_ty'. */
754 free_token (token_ty *tp)
756 if (tp->type == token_type_string || tp->type == token_type_symbol)
758 if (tp->type == token_type_string)
759 drop_reference (tp->comment);
763 /* JavaScript provides strings with either double or single quotes:
765 Both may contain special sequences after a backslash:
766 \', \", \\, \b, \f, \n, \r, \t, \v
767 Special characters can be entered using hexadecimal escape
768 sequences or deprecated octal escape sequences:
770 Any unicode point can be entered using Unicode escape sequences:
772 If a sequence after a backslash is not a legitimate character
773 escape sequence, the character value is the sequence itself without
774 a backslash. For example, \xxx is treated as xxx. */
777 phase7_getuc (int quote_char)
783 /* Use phase 2, because phase 3 elides comments. */
790 return P7_STRING_END;
795 error_with_progname = false;
796 error (0, 0, _("%s:%d: warning: unterminated string"),
797 logical_file_name, line_number);
798 error_with_progname = true;
799 return P7_STRING_END;
805 /* Dispatch according to the character following the backslash. */
815 return UNICODE ('\b');
817 return UNICODE ('\f');
819 return UNICODE ('\n');
821 return UNICODE ('\r');
823 return UNICODE ('\t');
825 return UNICODE ('\v');
826 case '0': case '1': case '2': case '3': case '4':
827 case '5': case '6': case '7':
834 if (c >= '0' && c <= '7')
836 n = (n << 3) + (c - '0');
840 if (c >= '0' && c <= '7')
841 n = (n << 3) + (c - '0');
853 int c1 = phase2_getc ();
856 if (c1 >= '0' && c1 <= '9')
858 else if (c1 >= 'A' && c1 <= 'F')
860 else if (c1 >= 'a' && c1 <= 'f')
867 int c2 = phase2_getc ();
870 if (c2 >= '0' && c2 <= '9')
872 else if (c2 >= 'A' && c2 <= 'F')
874 else if (c2 >= 'a' && c2 <= 'f')
881 int n = (n1 << 4) + n2;
892 unsigned char buf[4];
896 for (i = 0; i < 4; i++)
898 int c1 = phase2_getc ();
900 if (c1 >= '0' && c1 <= '9')
901 n = (n << 4) + (c1 - '0');
902 else if (c1 >= 'A' && c1 <= 'F')
903 n = (n << 4) + (c1 - 'A' + 10);
904 else if (c1 >= 'a' && c1 <= 'f')
905 n = (n << 4) + (c1 - 'a' + 10);
910 phase2_ungetc (buf[i]);
925 /* Combine characters into tokens. Discard whitespace except newlines at
926 the end of logical lines. */
928 static token_ty phase5_pushback[2];
929 static int phase5_pushback_length;
931 static token_type_ty last_token_type = token_type_other;
934 phase5_scan_regexp ()
938 /* Scan for end of RegExp literal ('/'). */
941 /* Must use phase2 as there can't be comments. */
953 error_with_progname = false;
955 _("%s:%d: warning: RegExp literal terminated too early"),
956 logical_file_name, line_number);
957 error_with_progname = true;
962 /* Scan for modifier flags (ECMA-262 5th section 15.10.4.1). */
964 if (!(c == 'g' || c == 'i' || c == 'm'))
968 static int xml_element_depth = 0;
969 static bool inside_embedded_js_in_xml = false;
972 phase5_scan_xml_markup (token_ty *tp)
981 { "![CDATA[", "]]" },
986 for (i = 0; i < SIZEOF (markers); i++)
988 const char *start = markers[i].start;
989 const char *end = markers[i].end;
992 /* Look for a start marker. */
993 for (j = 0; start[j] != '\0'; j++)
997 assert (phase2_pushback_length + j < SIZEOF (phase2_pushback));
1009 phase2_ungetc (start[k]);
1014 if (start[j] != '\0')
1017 /* Skip until the end marker. */
1022 for (j = 0; end[j] != '\0'; j++)
1024 assert (phase2_pushback_length + 1 < SIZEOF (phase2_pushback));
1030 /* Don't push the first character back so the next
1031 iteration start from the second character. */
1040 phase2_ungetc (end[k]);
1054 error_with_progname = false;
1056 _("%s:%d: warning: %s is not allowed"),
1057 logical_file_name, line_number,
1059 error_with_progname = true;
1068 error_with_progname = false;
1070 _("%s:%d: warning: unterminated XML markup"),
1071 logical_file_name, line_number);
1072 error_with_progname = true;
1077 phase5_get (token_ty *tp)
1081 if (phase5_pushback_length)
1083 *tp = phase5_pushback[--phase5_pushback_length];
1084 last_token_type = tp->type;
1090 tp->line_number = line_number;
1096 tp->type = last_token_type = token_type_eof;
1100 if (last_non_comment_line > last_comment_line)
1101 savable_comment_reset ();
1106 /* Ignore whitespace and comments. */
1110 last_non_comment_line = tp->line_number;
1116 int c1 = phase3_getc ();
1118 if (!(c1 >= '0' && c1 <= '9'))
1121 tp->type = last_token_type = token_type_other;
1126 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1127 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1128 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1129 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1132 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1133 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1134 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1135 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1137 case '0': case '1': case '2': case '3': case '4':
1138 case '5': case '6': case '7': case '8': case '9':
1139 /* Symbol, or part of a number. */
1141 static char *buffer;
1148 if (bufpos >= bufmax)
1150 bufmax = 2 * bufmax + 10;
1151 buffer = xrealloc (buffer, bufmax);
1153 buffer[bufpos++] = c;
1157 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1158 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1159 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1160 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1163 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1164 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1165 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1166 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1168 case '0': case '1': case '2': case '3': case '4':
1169 case '5': case '6': case '7': case '8': case '9':
1177 if (bufpos >= bufmax)
1179 bufmax = 2 * bufmax + 10;
1180 buffer = xrealloc (buffer, bufmax);
1182 buffer[bufpos] = '\0';
1183 tp->string = xstrdup (buffer);
1184 if (strcmp (buffer, "return") == 0
1185 || strcmp (buffer, "else") == 0)
1186 tp->type = last_token_type = token_type_keyword;
1188 tp->type = last_token_type = token_type_symbol;
1194 struct mixed_string_buffer *bp;
1197 case '"': case '\'':
1199 lexical_context = lc_string;
1200 /* Start accumulating the string. */
1201 bp = mixed_string_buffer_alloc (lexical_context,
1206 int uc = phase7_getuc (quote_char);
1208 /* Keep line_number in sync. */
1209 bp->line_number = line_number;
1211 if (uc == P7_EOF || uc == P7_STRING_END)
1214 if (IS_UNICODE (uc))
1216 assert (UNICODE_VALUE (uc) >= 0
1217 && UNICODE_VALUE (uc) < 0x110000);
1218 mixed_string_buffer_append_unicode (bp,
1219 UNICODE_VALUE (uc));
1222 mixed_string_buffer_append_char (bp, uc);
1224 tp->string = mixed_string_buffer_done (bp);
1225 tp->comment = add_reference (savable_comment);
1226 lexical_context = lc_outside;
1227 tp->type = last_token_type = token_type_string;
1232 tp->type = last_token_type = token_type_plus;
1235 /* Identify operators. The multiple character ones are simply ignored
1236 * as they are recognized here and are otherwise not relevant. */
1237 case '-': case '*': /* '+' and '/' are not listed here! */
1239 case '~': case '!': case '|': case '&': case '^':
1241 tp->type = last_token_type = token_type_operator;
1245 tp->type = last_token_type = token_type_equal;
1251 - XMLMarkup and XMLElement are only allowed after '=' or '('
1252 - embedded JavaScript expressions in XML do not recurse
1254 if (xml_element_depth > 0
1255 || (!inside_embedded_js_in_xml
1256 && (last_token_type == token_type_equal
1257 || last_token_type == token_type_lparen)))
1259 /* Comments, PI, or CDATA. */
1260 if (phase5_scan_xml_markup (tp))
1266 lexical_context = lc_xml_close_tag;
1268 /* Opening element. */
1272 lexical_context = lc_xml_open_tag;
1273 xml_element_depth++;
1276 tp->type = last_token_type = token_type_other;
1279 tp->type = last_token_type = token_type_operator;
1284 if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
1286 switch (lexical_context)
1288 case lc_xml_open_tag:
1289 lexical_context = lc_xml_content;
1292 case lc_xml_close_tag:
1293 if (xml_element_depth-- > 0)
1294 lexical_context = lc_xml_content;
1296 lexical_context = lc_outside;
1302 tp->type = last_token_type = token_type_other;
1305 tp->type = last_token_type = token_type_operator;
1309 if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
1311 /* If it appears in an opening tag of an XML element, it's
1313 if (lexical_context == lc_xml_open_tag)
1317 lexical_context = lc_outside;
1321 tp->type = last_token_type = token_type_other;
1325 /* Either a division operator or the start of a regular
1326 expression literal. If the '/' token is spotted after a
1327 symbol it's a division, otherwise it's a regular
1329 if (last_token_type == token_type_symbol
1330 || last_token_type == token_type_rparen
1331 || last_token_type == token_type_rbracket)
1332 tp->type = last_token_type = token_type_operator;
1335 phase5_scan_regexp (tp);
1336 tp->type = last_token_type = token_type_regexp;
1341 if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
1342 inside_embedded_js_in_xml = true;
1343 tp->type = last_token_type = token_type_other;
1347 if (xml_element_depth > 0 && inside_embedded_js_in_xml)
1348 inside_embedded_js_in_xml = false;
1349 tp->type = last_token_type = token_type_other;
1353 tp->type = last_token_type = token_type_lparen;
1357 tp->type = last_token_type = token_type_rparen;
1361 tp->type = last_token_type = token_type_comma;
1365 tp->type = last_token_type = token_type_lbracket;
1369 tp->type = last_token_type = token_type_rbracket;
1373 /* We could carefully recognize each of the 2 and 3 character
1374 operators, but it is not necessary, as we only need to recognize
1375 gettext invocations. Don't bother. */
1376 tp->type = last_token_type = token_type_other;
1382 /* Supports up to SIZEOF (phase5_pushback), i.e. 2, pushback tokens. */
1384 phase5_unget (token_ty *tp)
1386 if (tp->type != token_type_eof)
1388 if (phase5_pushback_length == SIZEOF (phase5_pushback))
1390 phase5_pushback[phase5_pushback_length++] = *tp;
1395 /* String concatenation with '+'. */
1398 x_javascript_lex (token_ty *tp)
1401 if (tp->type == token_type_string)
1403 char *sum = tp->string;
1404 size_t sum_len = strlen (sum);
1410 phase5_get (&token2);
1411 if (token2.type == token_type_plus)
1415 phase5_get (&token3);
1416 if (token3.type == token_type_string)
1418 char *addend = token3.string;
1419 size_t addend_len = strlen (addend);
1421 sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
1422 memcpy (sum + sum_len, addend, addend_len + 1);
1423 sum_len += addend_len;
1425 free_token (&token3);
1426 free_token (&token2);
1429 phase5_unget (&token3);
1431 phase5_unget (&token2);
1439 /* ========================= Extracting strings. ========================== */
1442 /* Context lookup table. */
1443 static flag_context_list_table_ty *flag_context_list_table;
1446 /* The file is broken into tokens. Scan the token stream, looking for
1447 a keyword, followed by a left paren, followed by a string. When we
1448 see this sequence, we have something to remember. We assume we are
1449 looking at a valid JavaScript program, and leave the complaints about
1450 the grammar to the compiler.
1452 Normal handling: Look for
1453 keyword ( ... msgid ... )
1454 Plural handling: Look for
1455 keyword ( ... msgid ... msgid_plural ... )
1457 We use recursion because the arguments before msgid or between msgid
1458 and msgid_plural can contain subexpressions of the same form. */
1461 /* Extract messages until the next balanced closing parenthesis or bracket.
1462 Extracted messages are added to MLP.
1463 DELIM can be either token_type_rparen or token_type_rbracket, or
1464 token_type_eof to accept both.
1465 Return true upon eof, false upon closing parenthesis or bracket. */
1467 extract_balanced (message_list_ty *mlp,
1468 token_type_ty delim,
1469 flag_context_ty outer_context,
1470 flag_context_list_iterator_ty context_iter,
1471 struct arglist_parser *argparser)
1473 /* Current argument number. */
1475 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1477 /* Parameters of the keyword just seen. Defined only in state 1. */
1478 const struct callshapes *next_shapes = NULL;
1479 /* Context iterator that will be used if the next token is a '('. */
1480 flag_context_list_iterator_ty next_context_iter =
1481 passthrough_context_list_iterator;
1482 /* Current context. */
1483 flag_context_ty inner_context =
1484 inherited_context (outer_context,
1485 flag_context_list_iterator_advance (&context_iter));
1487 /* Start state is 0. */
1494 x_javascript_lex (&token);
1497 case token_type_symbol:
1499 void *keyword_value;
1501 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1505 next_shapes = (const struct callshapes *) keyword_value;
1512 flag_context_list_iterator (
1513 flag_context_list_table_lookup (
1514 flag_context_list_table,
1515 token.string, strlen (token.string)));
1516 free (token.string);
1519 case token_type_lparen:
1520 if (extract_balanced (mlp, token_type_rparen,
1521 inner_context, next_context_iter,
1522 arglist_parser_alloc (mlp,
1523 state ? next_shapes : NULL)))
1525 xgettext_current_source_encoding = po_charset_utf8;
1526 arglist_parser_done (argparser, arg);
1527 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1530 next_context_iter = null_context_list_iterator;
1534 case token_type_rparen:
1535 if (delim == token_type_rparen || delim == token_type_eof)
1537 xgettext_current_source_encoding = po_charset_utf8;
1538 arglist_parser_done (argparser, arg);
1539 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1542 next_context_iter = null_context_list_iterator;
1546 case token_type_comma:
1549 inherited_context (outer_context,
1550 flag_context_list_iterator_advance (
1552 next_context_iter = passthrough_context_list_iterator;
1556 case token_type_lbracket:
1557 if (extract_balanced (mlp, token_type_rbracket,
1558 null_context, null_context_list_iterator,
1559 arglist_parser_alloc (mlp, NULL)))
1561 xgettext_current_source_encoding = po_charset_utf8;
1562 arglist_parser_done (argparser, arg);
1563 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1566 next_context_iter = null_context_list_iterator;
1570 case token_type_rbracket:
1571 if (delim == token_type_rbracket || delim == token_type_eof)
1573 xgettext_current_source_encoding = po_charset_utf8;
1574 arglist_parser_done (argparser, arg);
1575 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1578 next_context_iter = null_context_list_iterator;
1582 case token_type_string:
1585 pos.file_name = logical_file_name;
1586 pos.line_number = token.line_number;
1588 xgettext_current_source_encoding = po_charset_utf8;
1590 remember_a_message (mlp, NULL, token.string, inner_context,
1591 &pos, NULL, token.comment);
1593 arglist_parser_remember (argparser, arg, token.string,
1595 pos.file_name, pos.line_number,
1597 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1599 drop_reference (token.comment);
1600 next_context_iter = null_context_list_iterator;
1604 case token_type_eof:
1605 xgettext_current_source_encoding = po_charset_utf8;
1606 arglist_parser_done (argparser, arg);
1607 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1610 case token_type_keyword:
1611 case token_type_plus:
1612 case token_type_regexp:
1613 case token_type_operator:
1614 case token_type_equal:
1615 case token_type_other:
1616 next_context_iter = null_context_list_iterator;
1628 extract_javascript (FILE *f,
1629 const char *real_filename, const char *logical_filename,
1630 flag_context_list_table_ty *flag_table,
1631 msgdomain_list_ty *mdlp)
1633 message_list_ty *mlp = mdlp->item[0]->messages;
1636 real_file_name = real_filename;
1637 logical_file_name = xstrdup (logical_filename);
1640 lexical_context = lc_outside;
1642 last_comment_line = -1;
1643 last_non_comment_line = -1;
1645 xml_element_depth = 0;
1647 xgettext_current_file_source_encoding = xgettext_global_source_encoding;
1649 xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1652 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1654 xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1657 continuation_or_nonblank_line = false;
1659 flag_context_list_table = flag_table;
1663 /* Eat tokens until eof is seen. When extract_balanced returns
1664 due to an unbalanced closing parenthesis, just restart it. */
1665 while (!extract_balanced (mlp, token_type_eof,
1666 null_context, null_context_list_iterator,
1667 arglist_parser_alloc (mlp, NULL)))
1671 real_file_name = NULL;
1672 logical_file_name = NULL;