Imported Upstream version 0.19.7

[platform/upstream/gettext.git] / gettext-tools / src / x-vala.c
diff --git a/gettext-tools/src/x-vala.c b/gettext-tools/src/x-vala.c

index 68c8d9c..4e53b50 100644 (file)
--- a/gettext-tools/src/x-vala.c
+++ b/gettext-tools/src/x-vala.c
@@ -1,5 +1,5 @@
  /* xgettext Vala backend.
-   Copyright (C) 2013 Free Software Foundation, Inc.
+   Copyright (C) 2013, 2015 Free Software Foundation, Inc.
  
     This file was written by Daiki Ueno <ueno@gnu.org>, 2013.
  
@@ -23,6 +23,7 @@
  /* Specification.  */
  #include "x-vala.h"
  
+#include <assert.h>
  #include <errno.h>
  #include <stdbool.h>
  #include <stdio.h>
@@ -36,6 +37,7 @@
  #include "xalloc.h"
  #include "xvasprintf.h"
  #include "hash.h"
+#include "po-charset.h"
  #include "gettext.h"
  
  #define _(s) gettext(s)
@@ -340,13 +342,14 @@ enum token_type_ty
    token_type_rparen,                    /* ) */
    token_type_lbrace,                    /* { */
    token_type_rbrace,                    /* } */
-  token_type_assign,                    /* = */
+  token_type_assign,                    /* = += -= *= /= %= <<= >>= &= |= ^= */
    token_type_return,                    /* return */
    token_type_plus,                      /* + */
-  token_type_minus,                     /* - */
+  token_type_arithmetic_operator,       /* - * / % << >> & | ^ */
    token_type_equality_test_operator,    /* == < > >= <= != */
    token_type_logic_operator,            /* ! && || */
    token_type_comma,                     /* , */
+  token_type_question,                  /* ? */
    token_type_colon,                     /* : */
    token_type_number,                    /* 2.7 */
    token_type_string_literal,            /* "abc" */
@@ -363,6 +366,7 @@ struct token_ty
    token_type_ty type;
    char *string;         /* for token_type_symbol, token_type_string_literal */
    refcounted_string_list_ty *comment;   /* for token_type_string_literal */
+  enum literalstring_escape_type escape;
    int line_number;
  };
  
@@ -377,154 +381,6 @@ free_token (token_ty *tp)
  }
  
  
-/* Replace escape sequences within character strings with their single
-   character equivalents.  */
-
-#define P7_QUOTES (1000 + '"')
-#define P7_QUOTE (1000 + '\'')
-#define P7_NEWLINE (1000 + '\n')
-
-static int
-phase7_getc ()
-{
-  int c, n, j;
-
-  /* Use phase 1, because phase 2 elides comments.  */
-  c = phase1_getc ();
-
-  /* Return a magic newline indicator, so that we can distinguish
-     between the user requesting a newline in the string (e.g. using
-     "\n" or "\012") from the user failing to terminate the string or
-     character constant.  The ANSI C standard says: 3.1.3.4 Character
-     Constants contain "any character except single quote, backslash or
-     newline; or an escape sequence" and 3.1.4 String Literals contain
-     "any character except double quote, backslash or newline; or an
-     escape sequence".
-
-     Most compilers give a fatal error in this case, however gcc is
-     stupidly silent, even though this is a very common typo.  OK, so
-     "gcc --pedantic" will tell me, but that gripes about too much other
-     stuff.  Could I have a "gcc -Wnewline-in-string" option, or
-     better yet a "gcc -fno-newline-in-string" option, please?  Gcc is
-     also inconsistent between string literals and character constants:
-     you may not embed newlines in character constants; try it, you get
-     a useful diagnostic.  --PMiller  */
-  if (c == '\n')
-    return P7_NEWLINE;
-
-  if (c == '"')
-    return P7_QUOTES;
-  if (c == '\'')
-    return P7_QUOTE;
-  if (c != '\\')
-    return c;
-  c = phase1_getc ();
-  switch (c)
-    {
-    default:
-      /* Unknown escape sequences really should be an error, but just
-         ignore them, and let the real compiler complain.  */
-      phase1_ungetc (c);
-      return '\\';
-
-    case '"':
-    case '\'':
-    case '?':
-    case '\\':
-      return c;
-
-    case 'a':
-      return '\a';
-    case 'b':
-      return '\b';
-
-      /* The \e escape is preculiar to gcc, and assumes an ASCII
-         character set (or superset).  We don't provide support for it
-         here.  */
-
-    case 'f':
-      return '\f';
-    case 'n':
-      return '\n';
-    case 'r':
-      return '\r';
-    case 't':
-      return '\t';
-    case 'v':
-      return '\v';
-
-    case 'x':
-      c = phase1_getc ();
-      switch (c)
-        {
-        default:
-          phase1_ungetc (c);
-          phase1_ungetc ('x');
-          return '\\';
-
-        case '0': case '1': case '2': case '3': case '4':
-        case '5': case '6': case '7': case '8': case '9':
-        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
-          break;
-        }
-      n = 0;
-      for (;;)
-        {
-          switch (c)
-            {
-            default:
-              phase1_ungetc (c);
-              return n;
-
-            case '0': case '1': case '2': case '3': case '4':
-            case '5': case '6': case '7': case '8': case '9':
-              n = n * 16 + c - '0';
-              break;
-
-            case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-              n = n * 16 + 10 + c - 'A';
-              break;
-
-            case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
-              n = n * 16 + 10 + c - 'a';
-              break;
-            }
-          c = phase1_getc ();
-        }
-      return n;
-
-    case '0': case '1': case '2': case '3':
-    case '4': case '5': case '6': case '7':
-      n = 0;
-      for (j = 0; j < 3; ++j)
-        {
-          n = n * 8 + c - '0';
-          c = phase1_getc ();
-          switch (c)
-            {
-            default:
-              break;
-
-            case '0': case '1': case '2': case '3':
-            case '4': case '5': case '6': case '7':
-              continue;
-            }
-          break;
-        }
-      phase1_ungetc (c);
-      return n;
-    }
-}
-
-
-static void
-phase7_ungetc (int c)
-{
-  phase1_ungetc (c);
-}
-
-
  /* 3. Parse each resulting logical line as preprocessing tokens and
     white space.  Preprocessing tokens and Vala tokens don't always
     match.  */
@@ -573,6 +429,20 @@ phase3_get (token_ty *tp)
    static char *buffer;
    static int bufmax;
    int bufpos;
+  int last_was_backslash;
+
+#undef APPEND
+#define APPEND(c)                               \
+  do                                            \
+    {                                           \
+      if (bufpos >= bufmax)                     \
+        {                                       \
+          bufmax = 2 * bufmax + 10;             \
+          buffer = xrealloc (buffer, bufmax);   \
+        }                                       \
+      buffer[bufpos++] = c;                     \
+    }                                           \
+  while (0)
  
    if (phase3_pushback_length)
      {
@@ -627,12 +497,7 @@ phase3_get (token_ty *tp)
            bufpos = 0;
            for (;;)
              {
-              if (bufpos >= bufmax)
-                {
-                  bufmax = 2 * bufmax + 10;
-                  buffer = xrealloc (buffer, bufmax);
-                }
-              buffer[bufpos++] = c;
+              APPEND (c);
                c = phase2_getc ();
                switch (c)
                  {
@@ -657,12 +522,7 @@ phase3_get (token_ty *tp)
                  }
                break;
              }
-          if (bufpos >= bufmax)
-            {
-              bufmax = 2 * bufmax + 10;
-              buffer = xrealloc (buffer, bufmax);
-            }
-          buffer[bufpos] = 0;
+          APPEND (0);
            if (strcmp (buffer, "return") == 0)
              tp->type = last_token_type = token_type_return;
            else
@@ -697,23 +557,13 @@ phase3_get (token_ty *tp)
            bufpos = 0;
            for (;;)
              {
-              if (bufpos >= bufmax)
-                {
-                  bufmax = 2 * bufmax + 10;
-                  buffer = xrealloc (buffer, bufmax);
-                }
-              buffer[bufpos++] = c;
+              APPEND (c);
                c = phase2_getc ();
                switch (c)
                  {
                  case 'e':
                  case 'E':
-                  if (bufpos >= bufmax)
-                    {
-                      bufmax = 2 * bufmax + 10;
-                      buffer = xrealloc (buffer, bufmax);
-                    }
-                  buffer[bufpos++] = c;
+                  APPEND (c);
                    c = phase2_getc ();
                    if (c != '+' && c != '-')
                      {
@@ -743,30 +593,38 @@ phase3_get (token_ty *tp)
                  }
                break;
              }
-          if (bufpos >= bufmax)
-            {
-              bufmax = 2 * bufmax + 10;
-              buffer = xrealloc (buffer, bufmax);
-            }
-          buffer[bufpos] = 0;
+          APPEND (0);
            tp->type = last_token_type = token_type_number;
            return;
  
          case '\'':
+          last_was_backslash = false;
            for (;;)
              {
-              c = phase7_getc ();
-              if (c == P7_NEWLINE)
+              c = phase2_getc ();
+              if (last_was_backslash)
+                {
+                  last_was_backslash = false;
+                  continue;
+                }
+              switch (c)
                  {
+                case '\\':
+                  last_was_backslash = true;
+                  /* FALLTHROUGH */
+                default:
+                  continue;
+                case '\n':
                    error_with_progname = false;
                    error (0, 0, _("%s:%d: warning: unterminated character constant"),
                           logical_file_name, line_number - 1);
                    error_with_progname = true;
-                  phase7_ungetc ('\n');
+                  phase2_ungetc ('\n');
+                  break;
+                case EOF: case '\'':
                    break;
                  }
-              if (c == EOF || c == P7_QUOTE)
-                break;
+              break;
              }
            tp->type = last_token_type = token_type_character_constant;
            return;
@@ -803,6 +661,7 @@ phase3_get (token_ty *tp)
          case '"':
            {
              int c2 = phase2_getc ();
+
              if (c2 == '"')
                {
                  int c3 = phase2_getc ();
@@ -816,65 +675,77 @@ phase3_get (token_ty *tp)
                }
              else
                phase2_ungetc (c2);
-          }
  
-          bufpos = 0;
-          for (;;)
-            {
-              c = phase7_getc ();
-              if (c == P7_NEWLINE)
-                {
-                  if (verbatim)
-                    c = '\n';
-                  else
-                    {
-                      error_with_progname = false;
-                      error (0, 0, _("%s:%d: warning: unterminated string literal"),
-                             logical_file_name, line_number - 1);
-                      error_with_progname = true;
-                      phase7_ungetc ('\n');
+            if (verbatim)
+              {
+                bufpos = 0;
+                for (;;)
+                  {
+                    /* Use phase 1, because phase 2 elides comments.  */
+                    c = phase1_getc ();
+                    if (c == EOF)
                        break;
-                    }
-                }
-              if (c == P7_QUOTES)
-                {
-                  if (verbatim)
-                    {
-                      int c2 = phase2_getc ();
-                      if (c2 == '"')
-                        {
-                          int c3 = phase2_getc ();
-                          if (c3 == '"')
-                            break;
-                          phase2_ungetc (c3);
-                        }
-                      phase2_ungetc (c2);
-                      c = '"';
-                    }
-                  else
+
+                    if (c == '"')
+                      {
+                        int c2 = phase1_getc ();
+                        if (c2 == '"')
+                          {
+                            int c3 = phase1_getc ();
+                            if (c3 == '"')
+                              break;
+                            phase1_ungetc (c3);
+                          }
+                        phase1_ungetc (c2);
+                      }
+                    APPEND (c);
+                  }
+              }
+            else
+              {
+                last_was_backslash = false;
+                bufpos = 0;
+                for (;;)
+                  {
+                    c = phase1_getc ();
+                    if (last_was_backslash)
+                      {
+                        last_was_backslash = false;
+                        APPEND (c);
+                        continue;
+                      }
+
+                    switch (c)
+                      {
+                      case '\\':
+                        last_was_backslash = true;
+                        /* FALLTHROUGH */
+                      default:
+                        APPEND (c);
+                        continue;
+
+                      case '\n':
+                        error_with_progname = false;
+                        error (0, 0, _("\
+%s:%d: warning: unterminated string literal"),
+                               logical_file_name, line_number - 1);
+                        error_with_progname = true;
+                        phase1_ungetc ('\n');
+                        break;
+                      case EOF: case '"':
+                        break;
+                      }
                      break;
-                }
-              if (c == EOF)
-                break;
-              if (c == P7_QUOTE)
-                c = '\'';
-              if (bufpos >= bufmax)
-                {
-                  bufmax = 2 * bufmax + 10;
-                  buffer = xrealloc (buffer, bufmax);
-                }
-              buffer[bufpos++] = c;
-            }
-          if (bufpos >= bufmax)
-            {
-              bufmax = 2 * bufmax + 10;
-              buffer = xrealloc (buffer, bufmax);
-            }
-          buffer[bufpos] = 0;
-          tp->type = last_token_type = template ? token_type_string_template : token_type_string_literal;
-          tp->string = xstrdup (buffer);
-          tp->comment = add_reference (savable_comment);
-          return;
+                  }
+              }
+            APPEND (0);
+            tp->type = last_token_type = template
+              ? token_type_string_template : token_type_string_literal;
+            tp->string = xstrdup (buffer);
+            tp->comment = add_reference (savable_comment);
+            tp->escape = verbatim ? 0 : LET_ANSI_C | LET_UNICODE;
+            return;
+          }
  
          case '/':
            switch (last_token_type)
@@ -884,10 +755,12 @@ phase3_get (token_ty *tp)
              case token_type_assign:
              case token_type_return:
              case token_type_plus:
-            case token_type_minus:
+            case token_type_arithmetic_operator:
              case token_type_equality_test_operator:
              case token_type_logic_operator:
              case token_type_comma:
+            case token_type_question:
+            case token_type_colon:
                phase3_scan_regex ();
                tp->type = last_token_type = token_type_regex_literal;
                break;
@@ -895,11 +768,12 @@ phase3_get (token_ty *tp)
                {
                  int c2 = phase2_getc ();
                  if (c2 == '=')
+                  tp->type = last_token_type = token_type_assign;
+                else
                    {
-                    /* /= */
                      phase2_ungetc (c2);
+                    tp->type = last_token_type = token_type_arithmetic_operator;
                    }
-                tp->type = last_token_type = token_type_other;
                  break;
                }
              }
@@ -926,9 +800,12 @@ phase3_get (token_ty *tp)
              int c2 = phase2_getc ();
              switch (c2)
                {
-              case '=': case '+':
+              case '+':
                  tp->type = last_token_type = token_type_other;
                  break;
+              case '=':
+                tp->type = last_token_type = token_type_assign;
+                break;
                default:
                  phase2_ungetc (c2);
                  tp->type = last_token_type = token_type_plus;
@@ -942,17 +819,34 @@ phase3_get (token_ty *tp)
              int c2 = phase2_getc ();
              switch (c2)
                {
-              case '=': case '-':
+              case '-':
                  tp->type = last_token_type = token_type_other;
                  break;
+              case '=':
+                tp->type = last_token_type = token_type_assign;
+                break;
                default:
                  phase2_ungetc (c2);
-                tp->type = last_token_type = token_type_minus;
+                tp->type = last_token_type = token_type_arithmetic_operator;
                  break;
                }
              return;
            }
  
+        case '%':
+        case '^':
+          {
+            int c2 = phase2_getc ();
+            if (c2 == '=')
+             tp->type = last_token_type = token_type_assign;
+            else
+              {
+                phase2_ungetc (c2);
+                tp->type = last_token_type = token_type_logic_operator;
+              }
+            return;
+          }
+
          case '=':
            {
              int c2 = phase2_getc ();
@@ -976,12 +870,12 @@ phase3_get (token_ty *tp)
            {
              int c2 = phase2_getc ();
              if (c2 == '=')
+              tp->type = last_token_type = token_type_equality_test_operator;
+            else
                {
-                tp->type = last_token_type = token_type_equality_test_operator;
-                return;
+                phase2_ungetc (c2);
+                tp->type = last_token_type = token_type_logic_operator;
                }
-            phase2_ungetc (c2);
-            tp->type = last_token_type = token_type_logic_operator;
              return;
            }
            
@@ -994,17 +888,22 @@ phase3_get (token_ty *tp)
              else if (c2 == c)
                {
                  int c3 = phase2_getc ();
-                if (c3 != '=')
-                  phase2_ungetc (c3);
-                tp->type = last_token_type = token_type_other;
+                if (c3 == '=')
+                  tp->type = last_token_type = token_type_assign;
+                else
+                  {
+                    phase2_ungetc (c2);
+                    phase2_ungetc (c3);
+                    tp->type = last_token_type = token_type_other;
+                  }
                }
              else
                {
                  phase2_ungetc (c2);
                  tp->type = last_token_type = token_type_equality_test_operator;
                }
+            return;
            }
-          return;
            
          case ',':
            tp->type = last_token_type = token_type_comma;
@@ -1021,25 +920,25 @@ phase3_get (token_ty *tp)
              if (c2 == c)
               tp->type = last_token_type = token_type_logic_operator;
              else if (c2 == '=')
-             tp->type = last_token_type = token_type_other;
+             tp->type = last_token_type = token_type_assign;
              else
                {
                  phase2_ungetc (c2);
-                tp->type = last_token_type = token_type_other;
+                tp->type = last_token_type = token_type_arithmetic_operator;
                }
+            return;
            }
-          return;
  
          case '?':
            {
              int c2 = phase2_getc ();
              if (c2 == '?')
+              tp->type = last_token_type = token_type_logic_operator;
+            else
                {
-                tp->type = last_token_type = token_type_logic_operator;
-                return;
+                phase2_ungetc (c2);
+                tp->type = last_token_type = token_type_question;
                }
-            phase2_ungetc (c2);
-            tp->type = last_token_type = token_type_other;
              return;
            }
  
@@ -1048,6 +947,7 @@ phase3_get (token_ty *tp)
            return;
          }
      }
+#undef APPEND
  }
  
  static void
@@ -1112,6 +1012,8 @@ x_vala_lex (token_ty *tp)
  /* Context lookup table.  */
  static flag_context_list_table_ty *flag_context_list_table;
  
+/* Use the same literalstring_parser provided by the C scanner.  */
+extern struct literalstring_parser literalstring_c;
  
  /* The file is broken into tokens.  Scan the token stream, looking for
     a keyword, followed by a left paren, followed by a string.  When we
@@ -1231,25 +1133,57 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
              pos.line_number = token.line_number;
  
              if (extract_all)
-              remember_a_message (mlp, NULL, token.string, inner_context,
-                                  &pos, NULL, token.comment);
+              {
+                char *string;
+                refcounted_string_list_ty *comment;
+                const char *encoding;
+
+                string = literalstring_c.parse (token.string, &pos,
+                                                token.escape);
+                free (token.string);
+                token.string = string;
+
+                if (token.comment != NULL)
+                  {
+                    comment = savable_comment_convert_encoding (token.comment,
+                                                                &pos);
+                    drop_reference (token.comment);
+                    token.comment = comment;
+                  }
+
+                /* token.string and token.comment are already converted
+                   to UTF-8.  Prevent further conversion in
+                   remember_a_message.  */
+                encoding = xgettext_current_source_encoding;
+                xgettext_current_source_encoding = po_charset_utf8;
+                remember_a_message (mlp, NULL, token.string, inner_context,
+                                    &pos, NULL, token.comment);
+                xgettext_current_source_encoding = encoding;
+              }
              else
                {
-                /* A string immediately after a symbol means a function call.  */
+                /* A string immediately after a symbol means a
+                   function call.  */
                  if (state)
                    {
                      struct arglist_parser *tmp_argparser;
                      tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
  
-                    arglist_parser_remember (tmp_argparser, 1, token.string,
-                                             inner_context, pos.file_name,
-                                             pos.line_number, token.comment);
+                    arglist_parser_remember_literal (tmp_argparser, 1,
+                                                     token.string,
+                                                     inner_context,
+                                                     pos.file_name,
+                                                     pos.line_number,
+                                                     token.comment,
+                                                     token.escape);
                      arglist_parser_done (tmp_argparser, 1);
                    }
                  else
-                  arglist_parser_remember (argparser, arg, token.string,
-                                           inner_context, pos.file_name,
-                                           pos.line_number, token.comment);
+                  arglist_parser_remember_literal (argparser, arg, token.string,
+                                                   inner_context, pos.file_name,
+                                                   pos.line_number,
+                                                   token.comment,
+                                                   token.escape);
                }
            }
            drop_reference (token.comment);
@@ -1263,9 +1197,10 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
          case token_type_assign:
          case token_type_return:
          case token_type_plus:
-        case token_type_minus:
+        case token_type_arithmetic_operator:
          case token_type_equality_test_operator:
          case token_type_logic_operator:
+        case token_type_question:
          case token_type_colon:
          case token_type_number:
          case token_type_string_template: