/* xgettext Vala backend.
- Copyright (C) 2013 Free Software Foundation, Inc.
+ Copyright (C) 2013, 2015 Free Software Foundation, Inc.
This file was written by Daiki Ueno <ueno@gnu.org>, 2013.
/* Specification. */
#include "x-vala.h"
+#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include "xalloc.h"
#include "xvasprintf.h"
#include "hash.h"
+#include "po-charset.h"
#include "gettext.h"
#define _(s) gettext(s)
token_type_rparen, /* ) */
token_type_lbrace, /* { */
token_type_rbrace, /* } */
- token_type_assign, /* = */
+ token_type_assign, /* = += -= *= /= %= <<= >>= &= |= ^= */
token_type_return, /* return */
token_type_plus, /* + */
- token_type_minus, /* - */
+ token_type_arithmetic_operator, /* - * / % << >> & | ^ */
token_type_equality_test_operator, /* == < > >= <= != */
token_type_logic_operator, /* ! && || */
token_type_comma, /* , */
+ token_type_question, /* ? */
token_type_colon, /* : */
token_type_number, /* 2.7 */
token_type_string_literal, /* "abc" */
token_type_ty type;
char *string; /* for token_type_symbol, token_type_string_literal */
refcounted_string_list_ty *comment; /* for token_type_string_literal */
+ enum literalstring_escape_type escape;
int line_number;
};
}
-/* Replace escape sequences within character strings with their single
- character equivalents. */
-
-#define P7_QUOTES (1000 + '"')
-#define P7_QUOTE (1000 + '\'')
-#define P7_NEWLINE (1000 + '\n')
-
-static int
-phase7_getc ()
-{
- int c, n, j;
-
- /* Use phase 1, because phase 2 elides comments. */
- c = phase1_getc ();
-
- /* Return a magic newline indicator, so that we can distinguish
- between the user requesting a newline in the string (e.g. using
- "\n" or "\012") from the user failing to terminate the string or
- character constant. The ANSI C standard says: 3.1.3.4 Character
- Constants contain "any character except single quote, backslash or
- newline; or an escape sequence" and 3.1.4 String Literals contain
- "any character except double quote, backslash or newline; or an
- escape sequence".
-
- Most compilers give a fatal error in this case, however gcc is
- stupidly silent, even though this is a very common typo. OK, so
- "gcc --pedantic" will tell me, but that gripes about too much other
- stuff. Could I have a "gcc -Wnewline-in-string" option, or
- better yet a "gcc -fno-newline-in-string" option, please? Gcc is
- also inconsistent between string literals and character constants:
- you may not embed newlines in character constants; try it, you get
- a useful diagnostic. --PMiller */
- if (c == '\n')
- return P7_NEWLINE;
-
- if (c == '"')
- return P7_QUOTES;
- if (c == '\'')
- return P7_QUOTE;
- if (c != '\\')
- return c;
- c = phase1_getc ();
- switch (c)
- {
- default:
- /* Unknown escape sequences really should be an error, but just
- ignore them, and let the real compiler complain. */
- phase1_ungetc (c);
- return '\\';
-
- case '"':
- case '\'':
- case '?':
- case '\\':
- return c;
-
- case 'a':
- return '\a';
- case 'b':
- return '\b';
-
- /* The \e escape is preculiar to gcc, and assumes an ASCII
- character set (or superset). We don't provide support for it
- here. */
-
- case 'f':
- return '\f';
- case 'n':
- return '\n';
- case 'r':
- return '\r';
- case 't':
- return '\t';
- case 'v':
- return '\v';
-
- case 'x':
- c = phase1_getc ();
- switch (c)
- {
- default:
- phase1_ungetc (c);
- phase1_ungetc ('x');
- return '\\';
-
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
- case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
- break;
- }
- n = 0;
- for (;;)
- {
- switch (c)
- {
- default:
- phase1_ungetc (c);
- return n;
-
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- n = n * 16 + c - '0';
- break;
-
- case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
- n = n * 16 + 10 + c - 'A';
- break;
-
- case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
- n = n * 16 + 10 + c - 'a';
- break;
- }
- c = phase1_getc ();
- }
- return n;
-
- case '0': case '1': case '2': case '3':
- case '4': case '5': case '6': case '7':
- n = 0;
- for (j = 0; j < 3; ++j)
- {
- n = n * 8 + c - '0';
- c = phase1_getc ();
- switch (c)
- {
- default:
- break;
-
- case '0': case '1': case '2': case '3':
- case '4': case '5': case '6': case '7':
- continue;
- }
- break;
- }
- phase1_ungetc (c);
- return n;
- }
-}
-
-
-static void
-phase7_ungetc (int c)
-{
- phase1_ungetc (c);
-}
-
-
/* 3. Parse each resulting logical line as preprocessing tokens and
white space. Preprocessing tokens and Vala tokens don't always
match. */
static char *buffer;
static int bufmax;
int bufpos;
+ int last_was_backslash;
+
+#undef APPEND
+#define APPEND(c) \
+ do \
+ { \
+ if (bufpos >= bufmax) \
+ { \
+ bufmax = 2 * bufmax + 10; \
+ buffer = xrealloc (buffer, bufmax); \
+ } \
+ buffer[bufpos++] = c; \
+ } \
+ while (0)
if (phase3_pushback_length)
{
bufpos = 0;
for (;;)
{
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos++] = c;
+ APPEND (c);
c = phase2_getc ();
switch (c)
{
}
break;
}
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos] = 0;
+ APPEND (0);
if (strcmp (buffer, "return") == 0)
tp->type = last_token_type = token_type_return;
else
bufpos = 0;
for (;;)
{
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos++] = c;
+ APPEND (c);
c = phase2_getc ();
switch (c)
{
case 'e':
case 'E':
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos++] = c;
+ APPEND (c);
c = phase2_getc ();
if (c != '+' && c != '-')
{
}
break;
}
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos] = 0;
+ APPEND (0);
tp->type = last_token_type = token_type_number;
return;
case '\'':
+ last_was_backslash = false;
for (;;)
{
- c = phase7_getc ();
- if (c == P7_NEWLINE)
+ c = phase2_getc ();
+ if (last_was_backslash)
+ {
+ last_was_backslash = false;
+ continue;
+ }
+ switch (c)
{
+ case '\\':
+ last_was_backslash = true;
+ /* FALLTHROUGH */
+ default:
+ continue;
+ case '\n':
error_with_progname = false;
error (0, 0, _("%s:%d: warning: unterminated character constant"),
logical_file_name, line_number - 1);
error_with_progname = true;
- phase7_ungetc ('\n');
+ phase2_ungetc ('\n');
+ break;
+ case EOF: case '\'':
break;
}
- if (c == EOF || c == P7_QUOTE)
- break;
+ break;
}
tp->type = last_token_type = token_type_character_constant;
return;
case '"':
{
int c2 = phase2_getc ();
+
if (c2 == '"')
{
int c3 = phase2_getc ();
}
else
phase2_ungetc (c2);
- }
- bufpos = 0;
- for (;;)
- {
- c = phase7_getc ();
- if (c == P7_NEWLINE)
- {
- if (verbatim)
- c = '\n';
- else
- {
- error_with_progname = false;
- error (0, 0, _("%s:%d: warning: unterminated string literal"),
- logical_file_name, line_number - 1);
- error_with_progname = true;
- phase7_ungetc ('\n');
+ if (verbatim)
+ {
+ bufpos = 0;
+ for (;;)
+ {
+ /* Use phase 1, because phase 2 elides comments. */
+ c = phase1_getc ();
+ if (c == EOF)
break;
- }
- }
- if (c == P7_QUOTES)
- {
- if (verbatim)
- {
- int c2 = phase2_getc ();
- if (c2 == '"')
- {
- int c3 = phase2_getc ();
- if (c3 == '"')
- break;
- phase2_ungetc (c3);
- }
- phase2_ungetc (c2);
- c = '"';
- }
- else
+
+ if (c == '"')
+ {
+ int c2 = phase1_getc ();
+ if (c2 == '"')
+ {
+ int c3 = phase1_getc ();
+ if (c3 == '"')
+ break;
+ phase1_ungetc (c3);
+ }
+ phase1_ungetc (c2);
+ }
+ APPEND (c);
+ }
+ }
+ else
+ {
+ last_was_backslash = false;
+ bufpos = 0;
+ for (;;)
+ {
+ c = phase1_getc ();
+ if (last_was_backslash)
+ {
+ last_was_backslash = false;
+ APPEND (c);
+ continue;
+ }
+
+ switch (c)
+ {
+ case '\\':
+ last_was_backslash = true;
+ /* FALLTHROUGH */
+ default:
+ APPEND (c);
+ continue;
+
+ case '\n':
+ error_with_progname = false;
+ error (0, 0, _("\
+%s:%d: warning: unterminated string literal"),
+ logical_file_name, line_number - 1);
+ error_with_progname = true;
+ phase1_ungetc ('\n');
+ break;
+ case EOF: case '"':
+ break;
+ }
break;
- }
- if (c == EOF)
- break;
- if (c == P7_QUOTE)
- c = '\'';
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos++] = c;
- }
- if (bufpos >= bufmax)
- {
- bufmax = 2 * bufmax + 10;
- buffer = xrealloc (buffer, bufmax);
- }
- buffer[bufpos] = 0;
- tp->type = last_token_type = template ? token_type_string_template : token_type_string_literal;
- tp->string = xstrdup (buffer);
- tp->comment = add_reference (savable_comment);
- return;
+ }
+ }
+ APPEND (0);
+ tp->type = last_token_type = template
+ ? token_type_string_template : token_type_string_literal;
+ tp->string = xstrdup (buffer);
+ tp->comment = add_reference (savable_comment);
+ tp->escape = verbatim ? 0 : LET_ANSI_C | LET_UNICODE;
+ return;
+ }
case '/':
switch (last_token_type)
case token_type_assign:
case token_type_return:
case token_type_plus:
- case token_type_minus:
+ case token_type_arithmetic_operator:
case token_type_equality_test_operator:
case token_type_logic_operator:
case token_type_comma:
+ case token_type_question:
+ case token_type_colon:
phase3_scan_regex ();
tp->type = last_token_type = token_type_regex_literal;
break;
{
int c2 = phase2_getc ();
if (c2 == '=')
+ tp->type = last_token_type = token_type_assign;
+ else
{
- /* /= */
phase2_ungetc (c2);
+ tp->type = last_token_type = token_type_arithmetic_operator;
}
- tp->type = last_token_type = token_type_other;
break;
}
}
int c2 = phase2_getc ();
switch (c2)
{
- case '=': case '+':
+ case '+':
tp->type = last_token_type = token_type_other;
break;
+ case '=':
+ tp->type = last_token_type = token_type_assign;
+ break;
default:
phase2_ungetc (c2);
tp->type = last_token_type = token_type_plus;
int c2 = phase2_getc ();
switch (c2)
{
- case '=': case '-':
+ case '-':
tp->type = last_token_type = token_type_other;
break;
+ case '=':
+ tp->type = last_token_type = token_type_assign;
+ break;
default:
phase2_ungetc (c2);
- tp->type = last_token_type = token_type_minus;
+ tp->type = last_token_type = token_type_arithmetic_operator;
break;
}
return;
}
+ case '%':
+ case '^':
+ {
+ int c2 = phase2_getc ();
+ if (c2 == '=')
+ tp->type = last_token_type = token_type_assign;
+ else
+ {
+ phase2_ungetc (c2);
+ tp->type = last_token_type = token_type_logic_operator;
+ }
+ return;
+ }
+
case '=':
{
int c2 = phase2_getc ();
{
int c2 = phase2_getc ();
if (c2 == '=')
+ tp->type = last_token_type = token_type_equality_test_operator;
+ else
{
- tp->type = last_token_type = token_type_equality_test_operator;
- return;
+ phase2_ungetc (c2);
+ tp->type = last_token_type = token_type_logic_operator;
}
- phase2_ungetc (c2);
- tp->type = last_token_type = token_type_logic_operator;
return;
}
else if (c2 == c)
{
int c3 = phase2_getc ();
- if (c3 != '=')
- phase2_ungetc (c3);
- tp->type = last_token_type = token_type_other;
+ if (c3 == '=')
+ tp->type = last_token_type = token_type_assign;
+ else
+ {
+ phase2_ungetc (c2);
+ phase2_ungetc (c3);
+ tp->type = last_token_type = token_type_other;
+ }
}
else
{
phase2_ungetc (c2);
tp->type = last_token_type = token_type_equality_test_operator;
}
+ return;
}
- return;
case ',':
tp->type = last_token_type = token_type_comma;
if (c2 == c)
tp->type = last_token_type = token_type_logic_operator;
else if (c2 == '=')
- tp->type = last_token_type = token_type_other;
+ tp->type = last_token_type = token_type_assign;
else
{
phase2_ungetc (c2);
- tp->type = last_token_type = token_type_other;
+ tp->type = last_token_type = token_type_arithmetic_operator;
}
+ return;
}
- return;
case '?':
{
int c2 = phase2_getc ();
if (c2 == '?')
+ tp->type = last_token_type = token_type_logic_operator;
+ else
{
- tp->type = last_token_type = token_type_logic_operator;
- return;
+ phase2_ungetc (c2);
+ tp->type = last_token_type = token_type_question;
}
- phase2_ungetc (c2);
- tp->type = last_token_type = token_type_other;
return;
}
return;
}
}
+#undef APPEND
}
static void
/* Context lookup table. */
static flag_context_list_table_ty *flag_context_list_table;
+/* Use the same literalstring_parser provided by the C scanner. */
+extern struct literalstring_parser literalstring_c;
/* The file is broken into tokens. Scan the token stream, looking for
a keyword, followed by a left paren, followed by a string. When we
pos.line_number = token.line_number;
if (extract_all)
- remember_a_message (mlp, NULL, token.string, inner_context,
- &pos, NULL, token.comment);
+ {
+ char *string;
+ refcounted_string_list_ty *comment;
+ const char *encoding;
+
+ string = literalstring_c.parse (token.string, &pos,
+ token.escape);
+ free (token.string);
+ token.string = string;
+
+ if (token.comment != NULL)
+ {
+ comment = savable_comment_convert_encoding (token.comment,
+ &pos);
+ drop_reference (token.comment);
+ token.comment = comment;
+ }
+
+ /* token.string and token.comment are already converted
+ to UTF-8. Prevent further conversion in
+ remember_a_message. */
+ encoding = xgettext_current_source_encoding;
+ xgettext_current_source_encoding = po_charset_utf8;
+ remember_a_message (mlp, NULL, token.string, inner_context,
+ &pos, NULL, token.comment);
+ xgettext_current_source_encoding = encoding;
+ }
else
{
- /* A string immediately after a symbol means a function call. */
+ /* A string immediately after a symbol means a
+ function call. */
if (state)
{
struct arglist_parser *tmp_argparser;
tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
- arglist_parser_remember (tmp_argparser, 1, token.string,
- inner_context, pos.file_name,
- pos.line_number, token.comment);
+ arglist_parser_remember_literal (tmp_argparser, 1,
+ token.string,
+ inner_context,
+ pos.file_name,
+ pos.line_number,
+ token.comment,
+ token.escape);
arglist_parser_done (tmp_argparser, 1);
}
else
- arglist_parser_remember (argparser, arg, token.string,
- inner_context, pos.file_name,
- pos.line_number, token.comment);
+ arglist_parser_remember_literal (argparser, arg, token.string,
+ inner_context, pos.file_name,
+ pos.line_number,
+ token.comment,
+ token.escape);
}
}
drop_reference (token.comment);
case token_type_assign:
case token_type_return:
case token_type_plus:
- case token_type_minus:
+ case token_type_arithmetic_operator:
case token_type_equality_test_operator:
case token_type_logic_operator:
+ case token_type_question:
case token_type_colon:
case token_type_number:
case token_type_string_template: