gettext-tools/src/x-vala.c

   1 /* xgettext Vala backend.
   2    Copyright (C) 2013, 2015 Free Software Foundation, Inc.
   3
   4    This file was written by Daiki Ueno <ueno@gnu.org>, 2013.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22
  23 /* Specification.  */
  24 #include "x-vala.h"
  25
  26 #include <assert.h>
  27 #include <errno.h>
  28 #include <stdbool.h>
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32
  33 #include "message.h"
  34 #include "xgettext.h"
  35 #include "error.h"
  36 #include "error-progname.h"
  37 #include "xalloc.h"
  38 #include "xvasprintf.h"
  39 #include "hash.h"
  40 #include "po-charset.h"
  41 #include "gettext.h"
  42
  43 #define _(s) gettext(s)
  44
  45 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  46
  47 /* The Vala syntax is defined in the Vala Reference Manual
  48    http://www.vala-project.org/doc/vala/.
  49    See also vala/valascanner.vala.  */
  50
  51 /* ====================== Keyword set customization.  ====================== */
  52
/* If true extract all strings.  Set by x_vala_extract_all.  */
static bool extract_all = false;

/* Table of the keyword specifications registered through add_keyword.  */
static hash_table keywords;
/* True as long as the built-in default keywords still have to be
   registered by init_keywords.  Cleared by add_keyword when it is given a
   NULL name, and by init_keywords once the defaults have been added.  */
static bool default_keywords = true;
  58
  59
  60 void
  61 x_vala_extract_all ()
  62 {
  63   extract_all = true;
  64 }
  65
  66
  67 static void
  68 add_keyword (const char *name, hash_table *keywords)
  69 {
  70   if (name == NULL)
  71     default_keywords = false;
  72   else
  73     {
  74       const char *end;
  75       struct callshape shape;
  76       const char *colon;
  77
  78       if (keywords->table == NULL)
  79         hash_init (keywords, 100);
  80
  81       split_keywordspec (name, &end, &shape);
  82
  83       /* The characters between name and end should form a valid C identifier.
  84          A colon means an invalid parse in split_keywordspec().  */
  85       colon = strchr (name, ':');
  86       if (colon == NULL || colon >= end)
  87         insert_keyword_callshape (keywords, name, end - name, &shape);
  88     }
  89 }
  90
  91 void
  92 x_vala_keyword (const char *name)
  93 {
  94   add_keyword (name, &keywords);
  95 }
  96
  97 static void
  98 init_keywords ()
  99 {
 100   if (default_keywords)
 101     {
 102       /* When adding new keywords here, also update the documentation in
 103          xgettext.texi!  */
 104       x_vala_keyword ("dgettext:2");
 105       x_vala_keyword ("dcgettext:2");
 106       x_vala_keyword ("ngettext:1,2");
 107       x_vala_keyword ("dngettext:2,3");
 108       x_vala_keyword ("dpgettext:2g");
 109       x_vala_keyword ("dpgettext2:2c,3");
 110       x_vala_keyword ("_");
 111       x_vala_keyword ("Q_");
 112       x_vala_keyword ("N_");
 113       x_vala_keyword ("NC_:1c,2");
 114
 115       default_keywords = false;
 116     }
 117 }
 118
 119 void
 120 init_flag_table_vala ()
 121 {
 122   xgettext_record_flag ("dgettext:2:pass-c-format");
 123   xgettext_record_flag ("dcgettext:2:pass-c-format");
 124   xgettext_record_flag ("ngettext:1:pass-c-format");
 125   xgettext_record_flag ("ngettext:2:pass-c-format");
 126   xgettext_record_flag ("dngettext:2:pass-c-format");
 127   xgettext_record_flag ("dngettext:3:pass-c-format");
 128   xgettext_record_flag ("dpgettext:2:pass-c-format");
 129   xgettext_record_flag ("dpgettext2:3:pass-c-format");
 130   xgettext_record_flag ("_:1:pass-c-format");
 131   xgettext_record_flag ("Q_:1:pass-c-format");
 132   xgettext_record_flag ("N_:1:pass-c-format");
 133   xgettext_record_flag ("NC_:2:pass-c-format");
 134
 135   /* Vala leaves string formatting to Glib functions and thus the
 136      format string is exactly same as C.  See also
 137      vapi/glib-2.0.vapi.  */
 138   xgettext_record_flag ("printf:1:c-format");
 139   xgettext_record_flag ("vprintf:1:c-format");
 140 }
 141
 142
 143 /* ======================== Reading of characters.  ======================== */
 144
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is incremented by phase1_getc for every newline it returns
   and decremented by phase1_ungetc for every newline pushed back.  */
static char *logical_file_name;
static int line_number;

/* The input file stream, read by phase1_getc.  */
static FILE *fp;
 154
 155
 156 /* 1. line_number handling.  */
 157
/* Pushback storage for phase 1.  phase1_ungetc appends characters and
   phase1_getc removes them again, last pushed first.  At most
   MAX_PHASE1_PUSHBACK characters can be pending at a time.  */
#define MAX_PHASE1_PUSHBACK 16
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
static int phase1_pushback_length;
 161
 162
 163 static int
 164 phase1_getc ()
 165 {
 166   int c;
 167
 168   if (phase1_pushback_length)
 169     c = phase1_pushback[--phase1_pushback_length];
 170   else
 171     {
 172       c = getc (fp);
 173       if (c == EOF)
 174         {
 175           if (ferror (fp))
 176             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 177                    real_file_name);
 178           return EOF;
 179         }
 180     }
 181
 182   if (c == '\n')
 183     ++line_number;
 184   return c;
 185 }
 186
 187
 188 /* Supports 2 characters of pushback.  */
 189 static void
 190 phase1_ungetc (int c)
 191 {
 192   if (c != EOF)
 193     {
 194       if (c == '\n')
 195         --line_number;
 196
 197       if (phase1_pushback_length == SIZEOF (phase1_pushback))
 198         abort ();
 199       phase1_pushback[phase1_pushback_length++] = c;
 200     }
 201 }
 202
 203
/* These are for tracking whether comments count as immediately before
   keyword.  last_comment_line is set by phase2_getc when a comment has
   been read; last_non_comment_line is set by phase3_get when a token
   starts.  */
static int last_comment_line;
static int last_non_comment_line;

/* Accumulating comments.  buffer holds the current comment line, buflen is
   the number of characters stored so far, and bufmax is the allocated size
   of buffer.  */

static char *buffer;
static size_t bufmax;
static size_t buflen;
 214
 215 static inline void
 216 comment_start ()
 217 {
 218   buflen = 0;
 219 }
 220
 221 static inline void
 222 comment_add (int c)
 223 {
 224   if (buflen >= bufmax)
 225     {
 226       bufmax = 2 * bufmax + 10;
 227       buffer = xrealloc (buffer, bufmax);
 228     }
 229   buffer[buflen++] = c;
 230 }
 231
 232 static inline void
 233 comment_line_end (size_t chars_to_remove)
 234 {
 235   buflen -= chars_to_remove;
 236   while (buflen >= 1
 237          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 238     --buflen;
 239   if (chars_to_remove == 0 && buflen >= bufmax)
 240     {
 241       bufmax = 2 * bufmax + 10;
 242       buffer = xrealloc (buffer, bufmax);
 243     }
 244   buffer[buflen] = '\0';
 245   savable_comment_add (buffer);
 246 }
 247
 248
 249 /* 2. Replace each comment that is not inside a character constant or
 250    string literal with a space character.  */
 251
 252 static int
 253 phase2_getc ()
 254 {
 255   int c;
 256   bool last_was_star;
 257
 258   c = phase1_getc ();
 259   if (c != '/')
 260     return c;
 261   c = phase1_getc ();
 262   switch (c)
 263     {
 264     default:
 265       phase1_ungetc (c);
 266       return '/';
 267
 268     case '*':
 269       /* C comment.  */
 270       comment_start ();
 271       last_was_star = false;
 272       for (;;)
 273         {
 274           c = phase1_getc ();
 275           if (c == EOF)
 276             break;
 277           /* We skip all leading white space, but not EOLs.  */
 278           if (!(buflen == 0 && (c == ' ' || c == '\t')))
 279             comment_add (c);
 280           switch (c)
 281             {
 282             case '\n':
 283               comment_line_end (1);
 284               comment_start ();
 285               last_was_star = false;
 286               continue;
 287
 288             case '*':
 289               last_was_star = true;
 290               continue;
 291
 292             case '/':
 293               if (last_was_star)
 294                 {
 295                   comment_line_end (2);
 296                   break;
 297                 }
 298               /* FALLTHROUGH */
 299
 300             default:
 301               last_was_star = false;
 302               continue;
 303             }
 304           break;
 305         }
 306       last_comment_line = line_number;
 307       return ' ';
 308
 309     case '/':
 310       /* C++ or ISO C 99 comment.  */
 311       comment_start ();
 312       for (;;)
 313         {
 314           c = phase1_getc ();
 315           if (c == '\n' || c == EOF)
 316             break;
 317           /* We skip all leading white space, but not EOLs.  */
 318           if (!(buflen == 0 && (c == ' ' || c == '\t')))
 319             comment_add (c);
 320         }
 321       comment_line_end (0);
 322       last_comment_line = line_number;
 323       return '\n';
 324     }
 325 }
 326
 327
 328 static void
 329 phase2_ungetc (int c)
 330 {
 331   phase1_ungetc (c);
 332 }
 333
 334
 335 /* ========================== Reading of tokens.  ========================== */
 336
/* The kinds of tokens produced by phase3_get.  Several operator kinds are
   distinguished because phase3_get uses the kind of the preceding token to
   decide whether a '/' starts a regular expression literal.  */
enum token_type_ty
{
  token_type_character_constant,        /* 'x' */
  token_type_eof,
  token_type_lparen,                    /* ( */
  token_type_rparen,                    /* ) */
  token_type_lbrace,                    /* { */
  token_type_rbrace,                    /* } */
  token_type_assign,                    /* = += -= *= /= %= <<= >>= &= |= ^= */
  token_type_return,                    /* return */
  token_type_plus,                      /* + */
  token_type_arithmetic_operator,       /* - * / % << >> & | ^ */
  token_type_equality_test_operator,    /* == < > >= <= != */
  token_type_logic_operator,            /* ! && || */
  token_type_comma,                     /* , */
  token_type_question,                  /* ? */
  token_type_colon,                     /* : */
  token_type_number,                    /* 2.7 */
  token_type_string_literal,            /* "abc" */
  token_type_string_template,           /* @"abc" */
  token_type_regex_literal,             /* /.../ */
  token_type_symbol,                    /* if else etc. */
  token_type_other
};
typedef enum token_type_ty token_type_ty;
 362
/* A token as produced by phase3_get.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  /* Allocated copy of the token text.  Set by phase3_get for
     token_type_symbol, token_type_string_literal and
     token_type_string_template.  */
  char *string;         /* for token_type_symbol, token_type_string_literal */
  /* Reference to the comments saved when the string was read.  Set by
     phase3_get for token_type_string_literal and
     token_type_string_template.  */
  refcounted_string_list_ty *comment;   /* for token_type_string_literal */
  /* Escape syntax of the string: 0 for verbatim strings, otherwise
     LET_ANSI_C | LET_UNICODE.  Set together with string for string
     tokens.  */
  enum literalstring_escape_type escape;
  /* Value of line_number when scanning of this token began.  */
  int line_number;
};
 372
 373 /* Free the memory pointed to by a 'struct token_ty'.  */
 374 static inline void
 375 free_token (token_ty *tp)
 376 {
 377   if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
 378     free (tp->string);
 379   if (tp->type == token_type_string_literal)
 380     drop_reference (tp->comment);
 381 }
 382
 383
/* 3. Parse each resulting logical line as preprocessing tokens and
   white space.  Preprocessing tokens and Vala tokens don't always
   match.  */

/* Tokens pushed back by phase3_unget; phase3_get returns them again, last
   pushed first, before reading new input.  */
static token_ty phase3_pushback[2];
static int phase3_pushback_length;


/* Type of the token most recently returned by phase3_get.  Used to decide
   whether a '/' starts a regular expression literal.  */
static token_type_ty last_token_type = token_type_other;
 393
 394 static void
 395 phase3_scan_regex ()
 396 {
 397     int c;
 398
 399     for (;;)
 400       {
 401         c = phase1_getc ();
 402         if (c == '/')
 403           break;
 404         if (c == '\\')
 405           {
 406             c = phase1_getc ();
 407             if (c != EOF)
 408               continue;
 409           }
 410         if (c == EOF)
 411           {
 412             error_with_progname = false;
 413             error (0, 0,
 414                    _("%s:%d: warning: regular expression literal terminated too early"),
 415                    logical_file_name, line_number);
 416             error_with_progname = true;
 417             return;
 418           }
 419       }
 420
 421     c = phase2_getc ();
 422     if (!(c == 'i' || c == 's' || c == 'm' || c == 'x'))
 423       phase2_ungetc (c);
 424 }
 425
 426 static void
 427 phase3_get (token_ty *tp)
 428 {
 429   static char *buffer;
 430   static int bufmax;
 431   int bufpos;
 432   int last_was_backslash;
 433
 434 #undef APPEND
 435 #define APPEND(c)                               \
 436   do                                            \
 437     {                                           \
 438       if (bufpos >= bufmax)                     \
 439         {                                       \
 440           bufmax = 2 * bufmax + 10;             \
 441           buffer = xrealloc (buffer, bufmax);   \
 442         }                                       \
 443       buffer[bufpos++] = c;                     \
 444     }                                           \
 445   while (0)
 446
 447   if (phase3_pushback_length)
 448     {
 449       *tp = phase3_pushback[--phase3_pushback_length];
 450       last_token_type = tp->type;
 451       return;
 452     }
 453
 454   for (;;)
 455     {
 456       bool template;
 457       bool verbatim;
 458       int c;
 459
 460       tp->line_number = line_number;
 461       c = phase2_getc ();
 462
 463       switch (c)
 464         {
 465         case EOF:
 466           tp->type = last_token_type = token_type_eof;
 467           return;
 468
 469         case '\n':
 470           if (last_non_comment_line > last_comment_line)
 471             savable_comment_reset ();
 472           /* FALLTHROUGH */
 473         case ' ':
 474         case '\f':
 475         case '\t':
 476           /* Ignore whitespace and comments.  */
 477           continue;
 478         default:
 479           break;
 480         }
 481
 482       last_non_comment_line = tp->line_number;
 483       template = false;
 484       verbatim = false;
 485
 486       switch (c)
 487         {
 488         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
 489         case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
 490         case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
 491         case 'V': case 'W': case 'X': case 'Y': case 'Z':
 492         case '_':
 493         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
 494         case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
 495         case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
 496         case 'v': case 'w': case 'x': case 'y': case 'z':
 497           bufpos = 0;
 498           for (;;)
 499             {
 500               APPEND (c);
 501               c = phase2_getc ();
 502               switch (c)
 503                 {
 504                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 505                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 506                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 507                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 508                 case 'Y': case 'Z':
 509                 case '_':
 510                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 511                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 512                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 513                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 514                 case 'y': case 'z':
 515                 case '0': case '1': case '2': case '3': case '4':
 516                 case '5': case '6': case '7': case '8': case '9':
 517                   continue;
 518
 519                 default:
 520                   phase2_ungetc (c);
 521                   break;
 522                 }
 523               break;
 524             }
 525           APPEND (0);
 526           if (strcmp (buffer, "return") == 0)
 527             tp->type = last_token_type = token_type_return;
 528           else
 529             {
 530               tp->string = xstrdup (buffer);
 531               tp->type = last_token_type = token_type_symbol;
 532             }
 533           return;
 534
 535         case '.':
 536           c = phase2_getc ();
 537           phase2_ungetc (c);
 538           switch (c)
 539             {
 540             default:
 541               tp->string = xstrdup (".");
 542               tp->type = last_token_type = token_type_symbol;
 543               return;
 544
 545             case '0': case '1': case '2': case '3': case '4':
 546             case '5': case '6': case '7': case '8': case '9':
 547               c = '.';
 548               break;
 549             }
 550           /* FALLTHROUGH */
 551
 552         case '0': case '1': case '2': case '3': case '4':
 553         case '5': case '6': case '7': case '8': case '9':
 554           /* The preprocessing number token is more "generous" than the C
 555              number tokens.  This is mostly due to token pasting (another
 556              thing we can ignore here).  */
 557           bufpos = 0;
 558           for (;;)
 559             {
 560               APPEND (c);
 561               c = phase2_getc ();
 562               switch (c)
 563                 {
 564                 case 'e':
 565                 case 'E':
 566                   APPEND (c);
 567                   c = phase2_getc ();
 568                   if (c != '+' && c != '-')
 569                     {
 570                       phase2_ungetc (c);
 571                       break;
 572                     }
 573                   continue;
 574
 575                 case 'A': case 'B': case 'C': case 'D':           case 'F':
 576                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 577                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 578                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 579                 case 'Y': case 'Z':
 580                 case 'a': case 'b': case 'c': case 'd':           case 'f':
 581                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 582                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 583                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 584                 case 'y': case 'z':
 585                 case '0': case '1': case '2': case '3': case '4':
 586                 case '5': case '6': case '7': case '8': case '9':
 587                 case '.':
 588                   continue;
 589
 590                 default:
 591                   phase2_ungetc (c);
 592                   break;
 593                 }
 594               break;
 595             }
 596           APPEND (0);
 597           tp->type = last_token_type = token_type_number;
 598           return;
 599
 600         case '\'':
 601           last_was_backslash = false;
 602           for (;;)
 603             {
 604               c = phase2_getc ();
 605               if (last_was_backslash)
 606                 {
 607                   last_was_backslash = false;
 608                   continue;
 609                 }
 610               switch (c)
 611                 {
 612                 case '\\':
 613                   last_was_backslash = true;
 614                   /* FALLTHROUGH */
 615                 default:
 616                   continue;
 617                 case '\n':
 618                   error_with_progname = false;
 619                   error (0, 0, _("%s:%d: warning: unterminated character constant"),
 620                          logical_file_name, line_number - 1);
 621                   error_with_progname = true;
 622                   phase2_ungetc ('\n');
 623                   break;
 624                 case EOF: case '\'':
 625                   break;
 626                 }
 627               break;
 628             }
 629           tp->type = last_token_type = token_type_character_constant;
 630           return;
 631
 632           /* Vala provides strings in three different formats.
 633
 634              Usual string literals:
 635                "..."
 636              Verbatim string literals:
 637                """...""" (where ... can include newlines and double quotes)
 638              String templates.
 639                @"...", @"""..."""
 640
 641              Note that, with the current implementation string
 642              templates are not subject to translation, because they are
 643              inspected at compile time.  For example, the following code
 644
 645                string bar = "bar";
 646                string foo = _(@"foo $bar");
 647
 648              will be translated into the C code, like:
 649
 650                _(g_strconcat ("foo ", "bar", NULL));  */
 651         case '@':
 652           c = phase2_getc ();
 653           if (c != '"')
 654             {
 655               phase2_ungetc (c);
 656               tp->type = last_token_type = token_type_other;
 657               return;
 658             }
 659           template = true;
 660           /* FALLTHROUGH */
 661         case '"':
 662           {
 663             int c2 = phase2_getc ();
 664
 665             if (c2 == '"')
 666               {
 667                 int c3 = phase2_getc ();
 668                 if (c3 == '"')
 669                   verbatim = true;
 670                 else
 671                   {
 672                     phase2_ungetc (c3);
 673                     phase2_ungetc (c2);
 674                   }
 675               }
 676             else
 677               phase2_ungetc (c2);
 678
 679             if (verbatim)
 680               {
 681                 bufpos = 0;
 682                 for (;;)
 683                   {
 684                     /* Use phase 1, because phase 2 elides comments.  */
 685                     c = phase1_getc ();
 686                     if (c == EOF)
 687                       break;
 688
 689                     if (c == '"')
 690                       {
 691                         int c2 = phase1_getc ();
 692                         if (c2 == '"')
 693                           {
 694                             int c3 = phase1_getc ();
 695                             if (c3 == '"')
 696                               break;
 697                             phase1_ungetc (c3);
 698                           }
 699                         phase1_ungetc (c2);
 700                       }
 701                     APPEND (c);
 702                   }
 703               }
 704             else
 705               {
 706                 last_was_backslash = false;
 707                 bufpos = 0;
 708                 for (;;)
 709                   {
 710                     c = phase1_getc ();
 711                     if (last_was_backslash)
 712                       {
 713                         last_was_backslash = false;
 714                         APPEND (c);
 715                         continue;
 716                       }
 717
 718                     switch (c)
 719                       {
 720                       case '\\':
 721                         last_was_backslash = true;
 722                         /* FALLTHROUGH */
 723                       default:
 724                         APPEND (c);
 725                         continue;
 726
 727                       case '\n':
 728                         error_with_progname = false;
 729                         error (0, 0, _("\
 730 %s:%d: warning: unterminated string literal"),
 731                                logical_file_name, line_number - 1);
 732                         error_with_progname = true;
 733                         phase1_ungetc ('\n');
 734                         break;
 735                       case EOF: case '"':
 736                         break;
 737                       }
 738                     break;
 739                   }
 740               }
 741             APPEND (0);
 742             tp->type = last_token_type = template
 743               ? token_type_string_template : token_type_string_literal;
 744             tp->string = xstrdup (buffer);
 745             tp->comment = add_reference (savable_comment);
 746             tp->escape = verbatim ? 0 : LET_ANSI_C | LET_UNICODE;
 747             return;
 748           }
 749
 750         case '/':
 751           switch (last_token_type)
 752             {
 753             case token_type_lparen:
 754             case token_type_lbrace:
 755             case token_type_assign:
 756             case token_type_return:
 757             case token_type_plus:
 758             case token_type_arithmetic_operator:
 759             case token_type_equality_test_operator:
 760             case token_type_logic_operator:
 761             case token_type_comma:
 762             case token_type_question:
 763             case token_type_colon:
 764               phase3_scan_regex ();
 765               tp->type = last_token_type = token_type_regex_literal;
 766               break;
 767             default:
 768               {
 769                 int c2 = phase2_getc ();
 770                 if (c2 == '=')
 771                   tp->type = last_token_type = token_type_assign;
 772                 else
 773                   {
 774                     phase2_ungetc (c2);
 775                     tp->type = last_token_type = token_type_arithmetic_operator;
 776                   }
 777                 break;
 778               }
 779             }
 780           return;
 781
 782         case '(':
 783           tp->type = last_token_type = token_type_lparen;
 784           return;
 785
 786         case ')':
 787           tp->type = last_token_type = token_type_rparen;
 788           return;
 789
 790         case '{':
 791           tp->type = last_token_type = token_type_lbrace;
 792           return;
 793
 794         case '}':
 795           tp->type = last_token_type = token_type_rbrace;
 796           return;
 797
 798         case '+':
 799           {
 800             int c2 = phase2_getc ();
 801             switch (c2)
 802               {
 803               case '+':
 804                 tp->type = last_token_type = token_type_other;
 805                 break;
 806               case '=':
 807                 tp->type = last_token_type = token_type_assign;
 808                 break;
 809               default:
 810                 phase2_ungetc (c2);
 811                 tp->type = last_token_type = token_type_plus;
 812                 break;
 813               }
 814             return;
 815           }
 816
 817         case '-':
 818           {
 819             int c2 = phase2_getc ();
 820             switch (c2)
 821               {
 822               case '-':
 823                 tp->type = last_token_type = token_type_other;
 824                 break;
 825               case '=':
 826                 tp->type = last_token_type = token_type_assign;
 827                 break;
 828               default:
 829                 phase2_ungetc (c2);
 830                 tp->type = last_token_type = token_type_arithmetic_operator;
 831                 break;
 832               }
 833             return;
 834           }
 835
 836         case '%':
 837         case '^':
 838           {
 839             int c2 = phase2_getc ();
 840             if (c2 == '=')
 841               tp->type = last_token_type = token_type_assign;
 842             else
 843               {
 844                 phase2_ungetc (c2);
 845                 tp->type = last_token_type = token_type_logic_operator;
 846               }
 847             return;
 848           }
 849
 850         case '=':
 851           {
 852             int c2 = phase2_getc ();
 853             switch (c2)
 854               {
 855               case '=':
 856                 tp->type = last_token_type = token_type_equality_test_operator;
 857                 break;
 858               case '>':
 859                 tp->type = last_token_type = token_type_other;
 860                 break;
 861               default:
 862                 phase2_ungetc (c2);
 863                 tp->type = last_token_type = token_type_assign;
 864                 break;
 865               }
 866             return;
 867           }
 868
 869         case '!':
 870           {
 871             int c2 = phase2_getc ();
 872             if (c2 == '=')
 873               tp->type = last_token_type = token_type_equality_test_operator;
 874             else
 875               {
 876                 phase2_ungetc (c2);
 877                 tp->type = last_token_type = token_type_logic_operator;
 878               }
 879             return;
 880           }
 881
 882         case '>':
 883         case '<':
 884           {
 885             int c2 = phase2_getc ();
 886             if (c2 == '=')
 887               tp->type = last_token_type = token_type_equality_test_operator;
 888             else if (c2 == c)
 889               {
 890                 int c3 = phase2_getc ();
 891                 if (c3 == '=')
 892                   tp->type = last_token_type = token_type_assign;
 893                 else
 894                   {
 895                     phase2_ungetc (c2);
 896                     phase2_ungetc (c3);
 897                     tp->type = last_token_type = token_type_other;
 898                   }
 899               }
 900             else
 901               {
 902                 phase2_ungetc (c2);
 903                 tp->type = last_token_type = token_type_equality_test_operator;
 904               }
 905             return;
 906           }
 907
 908         case ',':
 909           tp->type = last_token_type = token_type_comma;
 910           return;
 911
 912         case ':':
 913           tp->type = last_token_type = token_type_colon;
 914           return;
 915
 916         case '&':
 917         case '|':
 918           {
 919             int c2 = phase2_getc ();
 920             if (c2 == c)
 921               tp->type = last_token_type = token_type_logic_operator;
 922             else if (c2 == '=')
 923               tp->type = last_token_type = token_type_assign;
 924             else
 925               {
 926                 phase2_ungetc (c2);
 927                 tp->type = last_token_type = token_type_arithmetic_operator;
 928               }
 929             return;
 930           }
 931
 932         case '?':
 933           {
 934             int c2 = phase2_getc ();
 935             if (c2 == '?')
 936               tp->type = last_token_type = token_type_logic_operator;
 937             else
 938               {
 939                 phase2_ungetc (c2);
 940                 tp->type = last_token_type = token_type_question;
 941               }
 942             return;
 943           }
 944
 945         default:
 946           tp->type = last_token_type = token_type_other;
 947           return;
 948         }
 949     }
 950 #undef APPEND
 951 }
 952
 953 static void
 954 phase3_unget (token_ty *tp)
 955 {
 956   if (tp->type != token_type_eof)
 957     {
 958       if (phase3_pushback_length == SIZEOF (phase3_pushback))
 959         abort ();
 960       phase3_pushback[phase3_pushback_length++] = *tp;
 961     }
 962 }
 963
 964
 965 /* String concatenation with '+'.  */
 966
 967 static void
 968 x_vala_lex (token_ty *tp)
 969 {
 970   phase3_get (tp);
 971   if (tp->type == token_type_string_literal)
 972     {
 973       char *sum = tp->string;
 974       size_t sum_len = strlen (sum);
 975
 976       for (;;)
 977         {
 978           token_ty token2;
 979
 980           phase3_get (&token2);
 981           if (token2.type == token_type_plus)
 982             {
 983               token_ty token3;
 984
 985               phase3_get (&token3);
 986               if (token3.type == token_type_string_literal)
 987                 {
 988                   char *addend = token3.string;
 989                   size_t addend_len = strlen (addend);
 990
 991                   sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
 992                   memcpy (sum + sum_len, addend, addend_len + 1);
 993                   sum_len += addend_len;
 994
 995                   free_token (&token3);
 996                   free_token (&token2);
 997                   continue;
 998                 }
 999               phase3_unget (&token3);
1000             }
1001           phase3_unget (&token2);
1002           break;
1003         }
1004       tp->string = sum;
1005     }
1006 }
1007
1008
1009 /* ========================= Extracting strings.  ========================== */
1010
1011
1012 /* Context lookup table.  */
1013 static flag_context_list_table_ty *flag_context_list_table;
1014
1015 /* Use the same literalstring_parser provided by the C scanner.  */
1016 extern struct literalstring_parser literalstring_c;
1017
1018 /* The file is broken into tokens.  Scan the token stream, looking for
1019    a keyword, followed by a left paren, followed by a string.  When we
1020    see this sequence, we have something to remember.  We assume we are
1021    looking at a valid Vala program, and leave the complaints about the
1022    grammar to the compiler.
1023
1024      Normal handling: Look for
1025        keyword ( ... msgid ... )
1026        keyword msgid
1027      Plural handling: Look for
1028        keyword ( ... msgid ... msgid_plural ... )
1029
1030    We use recursion because the arguments before msgid or between msgid
1031    and msgid_plural can contain subexpressions of the same form.  */
1032
1033 /* Extract messages until the next balanced closing parenthesis or bracket.
1034    Extracted messages are added to MLP.
1035    DELIM can be either token_type_rparen or token_type_rbracket, or
1036    token_type_eof to accept both.
1037    Return true upon eof, false upon closing parenthesis or bracket.  */
1038 static bool
1039 extract_balanced (message_list_ty *mlp, token_type_ty delim,
1040                   flag_context_ty outer_context,
1041                   flag_context_list_iterator_ty context_iter,
1042                   struct arglist_parser *argparser)
1043 {
1044   /* Current argument number.  */
1045   int arg = 1;
1046   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1047   int state;
1048   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1049   const struct callshapes *next_shapes = NULL;
1050   /* Context iterator that will be used if the next token is a '('.  */
1051   flag_context_list_iterator_ty next_context_iter =
1052     passthrough_context_list_iterator;
1053   /* Current context.  */
1054   flag_context_ty inner_context =
1055     inherited_context (outer_context,
1056                        flag_context_list_iterator_advance (&context_iter));
1057
1058   /* Start state is 0.  */
1059   state = 0;
1060
1061   for (;;)
1062     {
1063       token_ty token;
1064
1065       x_vala_lex (&token);
1066
1067       switch (token.type)
1068         {
1069         case token_type_symbol:
1070           {
1071             void *keyword_value;
1072
1073             if (hash_find_entry (&keywords, token.string, strlen (token.string),
1074                                  &keyword_value)
1075                 == 0)
1076               {
1077                 next_shapes = (const struct callshapes *) keyword_value;
1078                 state = 1;
1079               }
1080             else
1081               state = 0;
1082           }
1083           next_context_iter =
1084             flag_context_list_iterator (
1085               flag_context_list_table_lookup (
1086                 flag_context_list_table,
1087                 token.string, strlen (token.string)));
1088           free (token.string);
1089           continue;
1090
1091         case token_type_lparen:
1092           if (extract_balanced (mlp, token_type_rparen,
1093                                 inner_context, next_context_iter,
1094                                 arglist_parser_alloc (mlp,
1095                                                       state ? next_shapes : NULL)))
1096             {
1097               arglist_parser_done (argparser, arg);
1098               return true;
1099             }
1100           next_context_iter = null_context_list_iterator;
1101           state = 0;
1102           break;
1103
1104         case token_type_rparen:
1105           if (delim == token_type_rparen || delim == token_type_eof)
1106             {
1107               arglist_parser_done (argparser, arg);
1108               return false;
1109             }
1110
1111           next_context_iter = null_context_list_iterator;
1112           state = 0;
1113           continue;
1114
1115         case token_type_comma:
1116           arg++;
1117           inner_context =
1118             inherited_context (outer_context,
1119                                flag_context_list_iterator_advance (
1120                                  &context_iter));
1121           next_context_iter = passthrough_context_list_iterator;
1122           state = 0;
1123           continue;
1124
1125         case token_type_eof:
1126           arglist_parser_done (argparser, arg);
1127           return true;
1128
1129         case token_type_string_literal:
1130           {
1131             lex_pos_ty pos;
1132             pos.file_name = logical_file_name;
1133             pos.line_number = token.line_number;
1134
1135             if (extract_all)
1136               {
1137                 char *string;
1138                 refcounted_string_list_ty *comment;
1139                 const char *encoding;
1140
1141                 string = literalstring_c.parse (token.string, &pos,
1142                                                 token.escape);
1143                 free (token.string);
1144                 token.string = string;
1145
1146                 if (token.comment != NULL)
1147                   {
1148                     comment = savable_comment_convert_encoding (token.comment,
1149                                                                 &pos);
1150                     drop_reference (token.comment);
1151                     token.comment = comment;
1152                   }
1153
1154                 /* token.string and token.comment are already converted
1155                    to UTF-8.  Prevent further conversion in
1156                    remember_a_message.  */
1157                 encoding = xgettext_current_source_encoding;
1158                 xgettext_current_source_encoding = po_charset_utf8;
1159                 remember_a_message (mlp, NULL, token.string, inner_context,
1160                                     &pos, NULL, token.comment);
1161                 xgettext_current_source_encoding = encoding;
1162               }
1163             else
1164               {
1165                 /* A string immediately after a symbol means a
1166                    function call.  */
1167                 if (state)
1168                   {
1169                     struct arglist_parser *tmp_argparser;
1170                     tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
1171
1172                     arglist_parser_remember_literal (tmp_argparser, 1,
1173                                                      token.string,
1174                                                      inner_context,
1175                                                      pos.file_name,
1176                                                      pos.line_number,
1177                                                      token.comment,
1178                                                      token.escape);
1179                     arglist_parser_done (tmp_argparser, 1);
1180                   }
1181                 else
1182                   arglist_parser_remember_literal (argparser, arg, token.string,
1183                                                    inner_context, pos.file_name,
1184                                                    pos.line_number,
1185                                                    token.comment,
1186                                                    token.escape);
1187               }
1188           }
1189           drop_reference (token.comment);
1190           next_context_iter = null_context_list_iterator;
1191           state = 0;
1192           continue;
1193
1194         case token_type_character_constant:
1195         case token_type_lbrace:
1196         case token_type_rbrace:
1197         case token_type_assign:
1198         case token_type_return:
1199         case token_type_plus:
1200         case token_type_arithmetic_operator:
1201         case token_type_equality_test_operator:
1202         case token_type_logic_operator:
1203         case token_type_question:
1204         case token_type_colon:
1205         case token_type_number:
1206         case token_type_string_template:
1207         case token_type_regex_literal:
1208         case token_type_other:
1209           next_context_iter = null_context_list_iterator;
1210           state = 0;
1211           continue;
1212
1213         default:
1214           abort ();
1215         }
1216     }
1217 }
1218
1219 void
1220 extract_vala (FILE *f,
1221               const char *real_filename, const char *logical_filename,
1222               flag_context_list_table_ty *flag_table,
1223               msgdomain_list_ty *mdlp)
1224 {
1225   message_list_ty *mlp = mdlp->item[0]->messages;
1226
1227   fp = f;
1228   real_file_name = real_filename;
1229   logical_file_name = xstrdup (logical_filename);
1230   line_number = 1;
1231
1232   last_comment_line = -1;
1233   last_non_comment_line = -1;
1234
1235   flag_context_list_table = flag_table;
1236
1237   init_keywords ();
1238
1239   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1240      due to an unbalanced closing parenthesis, just restart it.  */
1241   while (!extract_balanced (mlp, token_type_eof,
1242                             null_context, null_context_list_iterator,
1243                             arglist_parser_alloc (mlp, NULL)))
1244     ;
1245
1246   fp = NULL;
1247   real_file_name = NULL;
1248   logical_file_name = NULL;
1249   line_number = 0;
1250 }