gettext-tools/src/x-lua.c

   1 /* xgettext Lua backend.
   2    Copyright (C) 2012-2015 Free Software Foundation, Inc.
   3
   4    This file was written by Ľubomír Remák <lubomirr@lubomirr.eu>, 2012.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 #include "config.h"
  21 #endif
  22
  23 /* Specification.  */
  24 #include "x-lua.h"
  25
  26 #include <errno.h>
  27 #include <stdbool.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30
  31 #include "message.h"
  32 #include "xgettext.h"
  33 #include "error.h"
  34 #include "xalloc.h"
  35 #include "gettext.h"
  36 #include "po-charset.h"
  37
  38 #define _(s) gettext(s)
  39
  40 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  41
  42 /* The Lua syntax is defined in the Lua manual section 9,
  43    which can be found at
  44    http://www.lua.org/manual/5.2/manual.html#9  */
  45
  46 /* If true extract all strings.  */
  47 static bool extract_all = false;
  48
  49 /* A hash table for keywords.  */
  50 static hash_table keywords;
  51 static bool default_keywords = true;
  52
  53 /* Set extract_all flag (gettext will extract all strings).  */
  54 void
  55 x_lua_extract_all ()
  56 {
  57   extract_all = true;
  58 }
  59
  60 /* Adds a keyword.  Copied from other lexers.  */
  61 void
  62 x_lua_keyword (const char *name)
  63 {
  64   if (name == NULL)
  65     default_keywords = false;
  66   else
  67     {
  68       const char *end;
  69       struct callshape shape;
  70       const char *colon;
  71
  72       if (keywords.table == NULL)
  73         hash_init (&keywords, 100);
  74
  75       split_keywordspec (name, &end, &shape);
  76
  77       /* The characters between name and end should form a valid C identifier.
  78          A colon means an invalid parse in split_keywordspec().  */
  79       colon = strchr (name, ':');
  80       if (colon == NULL || colon >= end)
  81         insert_keyword_callshape (&keywords, name, end - name, &shape);
  82     }
  83 }
  84
  85 /* Finish initializing the keywords hash table.
  86    Called after argument processing, before each file is processed.  */
  87 static void
  88 init_keywords ()
  89 {
  90   if (default_keywords)
  91     {
  92       /* When adding new keywords here, also update the documentation in
  93          xgettext.texi!  */
  94       x_lua_keyword ("_");
  95       x_lua_keyword ("gettext.gettext");
  96       x_lua_keyword ("gettext.dgettext:2");
  97       x_lua_keyword ("gettext.dcgettext:2");
  98       x_lua_keyword ("gettext.ngettext:1,2");
  99       x_lua_keyword ("gettext.dngettext:2,3");
 100       x_lua_keyword ("gettext.dcngettext:2,3");
 101       default_keywords = false;
 102     }
 103 }
 104
 105 void
 106 init_flag_table_lua ()
 107 {
 108   xgettext_record_flag ("_:1:pass-lua-format");
 109   xgettext_record_flag ("gettext.gettext:1:pass-lua-format");
 110   xgettext_record_flag ("gettext.dgettext:2:pass-lua-format");
 111   xgettext_record_flag ("gettext.dcgettext:2:pass-lua-format");
 112   xgettext_record_flag ("gettext.ngettext:1:pass-lua-format");
 113   xgettext_record_flag ("gettext.ngettext:2:pass-lua-format");
 114   xgettext_record_flag ("gettext.dngettext:2:pass-lua-format");
 115   xgettext_record_flag ("gettext.dngettext:3:pass-lua-format");
 116   xgettext_record_flag ("gettext.dcngettext:2:pass-lua-format");
 117   xgettext_record_flag ("gettext.dcngettext:3:pass-lua-format");
 118   xgettext_record_flag ("string.format:1:lua-format");
 119 }
 120
 121 /* ======================== Reading of characters.  ======================== */
 122
 123
 124 /* Real filename, used in error messages about the input file.  */
 125 static const char *real_file_name;
 126
 127 /* Logical filename and line number, used to label the extracted messages.  */
 128 static char *logical_file_name;
 129 static int line_number;
 130
 131 /* The input file stream.  */
 132 static FILE *fp;
 133
 134
 135 /* 1. line_number handling.  */
 136
 137 static unsigned char phase1_pushback[2];
 138 static int phase1_pushback_length;
 139
 140 static int first_character = 1;
 141
 142 static int
 143 phase1_getc ()
 144 {
 145   int c;
 146
 147   if (phase1_pushback_length)
 148     c = phase1_pushback[--phase1_pushback_length];
 149   else
 150     {
 151       c = getc (fp);
 152
 153       if (first_character)
 154         {
 155           first_character = 0;
 156
 157           /* Ignore shebang line.  No pushback required in this case.  */
 158           if (c == '#')
 159             {
 160               while (c != '\n' && c != EOF)
 161                 c = getc (fp);
 162               if (c == '\n')
 163                 {
 164                   line_number++;
 165                   c = getc (fp);
 166                 }
 167             }
 168         }
 169
 170       if (c == EOF)
 171         {
 172           if (ferror (fp))
 173             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 174                    real_file_name);
 175           return EOF;
 176         }
 177     }
 178
 179   if (c == '\n')
 180     line_number++;
 181
 182   return c;
 183 }
 184
 185 /* Supports 2 characters of pushback.  */
 186
 187 static void
 188 phase1_ungetc (int c)
 189 {
 190   if (c != EOF)
 191     {
 192       if (c == '\n')
 193         --line_number;
 194
 195       if (phase1_pushback_length == SIZEOF (phase1_pushback))
 196         abort ();
 197       phase1_pushback[phase1_pushback_length++] = c;
 198     }
 199 }
 200
 201
 202 /* These are for tracking whether comments count as immediately before
 203    keyword.  */
 204 static int last_comment_line;
 205 static int last_non_comment_line;
 206
 207 /* Accumulating comments.  */
 208
 209 static char *buffer;
 210 static size_t bufmax;
 211 static size_t buflen;
 212
 213 static inline void
 214 comment_start ()
 215 {
 216   buflen = 0;
 217 }
 218
 219 static inline void
 220 comment_add (int c)
 221 {
 222   if (buflen >= bufmax)
 223     {
 224       bufmax = 2 * bufmax + 10;
 225       buffer = xrealloc (buffer, bufmax);
 226     }
 227   buffer[buflen++] = c;
 228 }
 229
 230 static inline void
 231 comment_line_end (size_t chars_to_remove)
 232 {
 233   buflen -= chars_to_remove;
 234   while (buflen >= 1
 235          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 236     --buflen;
 237   if (chars_to_remove == 0 && buflen >= bufmax)
 238     {
 239       bufmax = 2 * bufmax + 10;
 240       buffer = xrealloc (buffer, bufmax);
 241     }
 242   buffer[buflen] = '\0';
 243   savable_comment_add (buffer);
 244 }
 245
 246 /* Eats characters until '\n' and adds them to the comment.  */
 247 static void
 248 eat_comment_line ()
 249 {
 250   for (;;)
 251     {
 252       int c = phase1_getc ();
 253       if (c == '\n' || c == EOF)
 254         {
 255           comment_line_end (0);
 256           break;
 257         }
 258
 259       if (!(buflen == 0 && (c == ' ' || c == '\t')))
 260         comment_add (c);
 261     }
 262 }
 263
 264 static int
 265 phase2_getc ()
 266 {
 267   int c;
 268   int lineno;
 269
 270   c = phase1_getc ();
 271
 272   if (c == '-')
 273     {
 274       c = phase1_getc ();
 275
 276       if (c == '-')
 277         {
 278           /* It starts with '--', so it must be either a short or a long
 279              comment.  */
 280           c = phase1_getc ();
 281
 282           if (c == '[')
 283             {
 284               c = phase1_getc ();
 285
 286               int esigns = 0;
 287               while (c == '=')
 288                 {
 289                   esigns++;
 290                   c = phase1_getc ();
 291                 }
 292
 293               if (c == '[')
 294                 {
 295                   /* Long comment.  */
 296                   bool right_bracket = false;
 297                   bool end = false;
 298                   int esigns2 = 0;
 299
 300                   lineno = line_number;
 301                   comment_start ();
 302                   while (!end)
 303                     {
 304                       c = phase1_getc ();
 305
 306                       if (c == EOF)
 307                         break;
 308
 309                       /* Ignore leading spaces and tabs.  */
 310                       if (buflen == 0 && (c == ' ' || c == '\t'))
 311                         continue;
 312
 313                       comment_add (c);
 314
 315                       switch (c)
 316                         {
 317                         case ']':
 318                           if (!right_bracket)
 319                             {
 320                               right_bracket = true;
 321                               esigns2 = 0;
 322                             }
 323                           else
 324                             {
 325                               if (esigns2 == esigns)
 326                                 {
 327                                   comment_line_end (2 + esigns);
 328                                   end = true;
 329                                 }
 330                             }
 331                           break;
 332
 333                         case '=':
 334                           if (right_bracket)
 335                             esigns2++;
 336                           break;
 337
 338                         case '\n':
 339                           comment_line_end (1);
 340                           comment_start ();
 341                           lineno = line_number;
 342                           /* Intentionally not breaking.  */
 343
 344                         default:
 345                           right_bracket = false;
 346                         }
 347                     }
 348                   last_comment_line = lineno;
 349                   return ' ';
 350                 }
 351               else
 352                 {
 353                   /* One line (short) comment, starting with '--[=...='.  */
 354                   lineno = last_comment_line;
 355                   comment_start ();
 356                   comment_add ('[');
 357                   while (esigns--)
 358                     comment_add ('=');
 359                   phase1_ungetc (c);
 360                   eat_comment_line ();
 361                   last_comment_line = lineno;
 362                   return '\n';
 363                 }
 364             }
 365           else
 366             {
 367               /* One line (short) comment.  */
 368               lineno = line_number;
 369               comment_start ();
 370               phase1_ungetc (c);
 371               eat_comment_line ();
 372               last_comment_line = lineno;
 373               return '\n';
 374             }
 375         }
 376       else
 377         {
 378           /* Minus sign.  */
 379           phase1_ungetc (c);
 380           return '-';
 381         }
 382     }
 383   else
 384     return c;
 385 }
 386
 387 /* ========================== Reading of tokens.  ========================== */
 388
/* Kinds of tokens produced by the tokenizer (phase3_get and the phases
   built on top of it).  */
enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_lbracket,          /* [ */
  token_type_rbracket,          /* ] */
  token_type_comma,             /* , */
  token_type_dot,               /* . */
  token_type_doubledot,         /* .. */
  token_type_operator1,         /* + - * / % not # ^ */
  token_type_operator2,         /* < > <= >= ~= == and or; the lexer also
                                   reports a plain = with this type */
  token_type_string,            /* quoted or long-bracket string */
  token_type_number,
  token_type_symbol,            /* identifier, possibly dotted */
  token_type_other              /* e.g. ; and ... */
};

typedef enum token_type_ty token_type_ty;
 408
/* A token as passed between the tokenizer phases.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string; /* for token_type_string, token_type_symbol; heap
                   allocated, released by free_token() */
  refcounted_string_list_ty *comment;  /* for token_type_string */
  int line_number;      /* value of line_number when the token started */
};
 417
 418 /* Free the memory pointed to by a 'struct token_ty'.  */
 419 static inline void
 420 free_token (token_ty *tp)
 421 {
 422   if (tp->type == token_type_string || tp->type == token_type_symbol)
 423     free (tp->string);
 424   if (tp->type == token_type_string)
 425     drop_reference (tp->comment);
 426 }
 427
 428 /* Our current string.  */
 429 static int string_buf_length;
 430 static int string_buf_alloc;
 431 static char *string_buf;
 432
 433 static void
 434 string_start ()
 435 {
 436   string_buf_length = 0;
 437 }
 438
 439 static void
 440 string_add (int c)
 441 {
 442   if (string_buf_length >= string_buf_alloc)
 443     {
 444       string_buf_alloc = 2 * string_buf_alloc + 10;
 445       string_buf = xrealloc (string_buf, string_buf_alloc);
 446     }
 447
 448   string_buf[string_buf_length++] = c;
 449 }
 450
 451 static void
 452 string_end ()
 453 {
 454   string_buf[string_buf_length] = '\0';
 455 }
 456
 457
 458 /* We need 3 pushback tokens for string optimization.  */
 459 static int phase3_pushback_length;
 460 static token_ty phase3_pushback[3];
 461
 462
 463 static void
 464 phase3_unget (token_ty *tp)
 465 {
 466   if (tp->type != token_type_eof)
 467     {
 468       if (phase3_pushback_length == SIZEOF (phase3_pushback))
 469         abort ();
 470       phase3_pushback[phase3_pushback_length++] = *tp;
 471     }
 472 }
 473
 474 static void
 475 phase3_get (token_ty *tp)
 476 {
 477   int c;
 478   int c2;
 479   int c_start;
 480
 481   if (phase3_pushback_length)
 482     {
 483       *tp = phase3_pushback[--phase3_pushback_length];
 484       return;
 485     }
 486
 487   tp->string = NULL;
 488
 489   for (;;)
 490     {
 491       tp->line_number = line_number;
 492       c = phase2_getc ();
 493
 494       switch (c)
 495         {
 496         case EOF:
 497           tp->type = token_type_eof;
 498           return;
 499
 500         case '\n':
 501           if (last_non_comment_line > last_comment_line)
 502             savable_comment_reset ();
 503           /* Intentionally not breaking.  */
 504         case ' ':
 505         case '\t':
 506         case '\f':
 507           continue;
 508
 509         case '+':
 510         case '-':
 511         case '*':
 512         case '/':
 513         case '^':
 514         case '%':
 515         case '#':
 516           tp->type = token_type_operator1;
 517           return;
 518         case '<':
 519         case '>':
 520         case '=':
 521           c2 = phase1_getc ();
 522           if (c2 != '=')
 523             phase1_ungetc (c2);
 524           tp->type = token_type_operator2;
 525           return;
 526         case '~':
 527           c2 = phase1_getc ();
 528           if (c2 == '=')
 529             {
 530               tp->type = token_type_operator2;
 531               return;
 532             }
 533           else
 534             phase1_ungetc (c2);
 535           continue;
 536         case '(':
 537           tp->type = token_type_lparen;
 538           return;
 539         case ')':
 540           tp->type = token_type_rparen;
 541           return;
 542         case ',':
 543           tp->type = token_type_comma;
 544           return;
 545
 546         case ';':
 547           tp->type = token_type_other;
 548           return;
 549
 550           /* There are three operators beginning with a dot.  '.',
 551              '..' and '...'.  The most useful for us is the string
 552              concatenation operator ('..').  */
 553         case '.':
 554           c = phase1_getc ();
 555           if (c == '.')
 556             {
 557               c = phase1_getc ();
 558               if (c == '.')
 559                 {
 560                   tp->type = token_type_other;
 561                   return;
 562                 }
 563               else
 564                 {
 565                   phase1_ungetc (c);
 566                   tp->type = token_type_doubledot;
 567                   return;
 568                 }
 569             }
 570           else if (c >= '0' && c <= '9')
 571             {
 572               /* It's a number.  We aren't interested in the actual
 573                  numeric value, so ignore the dot and let next
 574                  iteration eat the number.  */
 575               phase1_ungetc (c);
 576               continue;
 577             }
 578           else
 579             {
 580               phase1_ungetc (c);
 581               tp->type = token_type_dot;
 582               return;
 583             }
 584
 585         case '"':
 586         case '\'':
 587           c_start = c;
 588           string_start ();
 589
 590           for (;;)
 591             {
 592               /* We need unprocessed characters from phase 1.  */
 593               c = phase1_getc ();
 594
 595               /* We got '\', this is probably an escape sequence.  */
 596               if (c == '\\')
 597                 {
 598                   c = phase1_getc ();
 599                   switch (c)
 600                     {
 601                     case 'a':
 602                       string_add ('\a');
 603                       break;
 604                     case 'b':
 605                       string_add ('\b');
 606                       break;
 607                     case 'f':
 608                       string_add ('\f');
 609                       break;
 610                     case 'n':
 611                       string_add ('\n');
 612                       break;
 613                     case 'r':
 614                       string_add ('\r');
 615                       break;
 616                     case 't':
 617                       string_add ('\t');
 618                       break;
 619                     case 'v':
 620                       string_add ('\v');
 621                       break;
 622                     case 'x':
 623                       {
 624                         int num = 0;
 625                         int i = 0;
 626
 627                         for (i = 0; i < 2; i++)
 628                           {
 629                             c = phase1_getc ();
 630                             if (c >= '0' && c <= '9')
 631                               num += c - '0';
 632                             else if (c >= 'a' && c <= 'f')
 633                               num += c - 'a' + 10;
 634                             else if (c >= 'A' && c <= 'F')
 635                               num += c - 'A' + 10;
 636                             else
 637                               {
 638                                 phase1_ungetc (c);
 639                                 break;
 640                               }
 641
 642                             if (i == 0)
 643                               num *= 16;
 644                           }
 645
 646                         if (i == 2)
 647                           string_add (num);
 648                       }
 649
 650                       break;
 651                     case 'z':
 652                       /* Ignore the following whitespace.  */
 653                       do
 654                         {
 655                           c = phase1_getc ();
 656                         }
 657                       while (c == ' ' || c == '\n' || c == '\t' || c == '\r'
 658                              || c == '\f' || c == '\v');
 659
 660                       phase1_ungetc (c);
 661
 662                       break;
 663                     default:
 664                       /* Check if it's a '\ddd' sequence.  */
 665                       if (c >= '0' && c <= '9')
 666                         {
 667                           int num = 0;
 668                           int i = 0;
 669
 670                           while (c >= '0' && c <= '9' && i < 3)
 671                             {
 672                               num *= 10;
 673                               num += (c - '0');
 674                               c = phase1_getc ();
 675                               i++;
 676                             }
 677
 678                           /* The last read character is either a
 679                              non-number or another number after our
 680                              '\ddd' sequence.  We need to ungetc it.  */
 681                           phase1_ungetc (c);
 682
 683                           /* The sequence number is too big, this
 684                              causes a lexical error.  Ignore it.  */
 685                           if (num < 256)
 686                             string_add (num);
 687                         }
 688                       else
 689                         string_add (c);
 690                     }
 691                 }
 692               else if (c == c_start || c == EOF || c == '\n')
 693                 {
 694                   /* End of string.  */
 695                   string_end ();
 696                   tp->string = xstrdup (string_buf);
 697                   tp->comment = add_reference (savable_comment);
 698                   tp->type = token_type_string;
 699                   return;
 700                 }
 701               else
 702                 string_add (c);
 703             }
 704           break;
 705
 706         case '[':
 707           c = phase1_getc ();
 708
 709           /* Count the number of equal signs.  */
 710           int esigns = 0;
 711           while (c == '=')
 712             {
 713               esigns++;
 714               c = phase1_getc ();
 715             }
 716
 717           if (c != '[')
 718             {
 719               /* We did not find what we were looking for, ungetc it.  */
 720               phase1_ungetc (c);
 721               if (esigns == 0)
 722                 {
 723                   /* Our current character isn't '[' and we got 0 equal
 724                      signs, so the first '[' must have been a left
 725                      bracket.  */
 726                   tp->type = token_type_lbracket;
 727                   return;
 728                 }
 729               else
 730                 /* Lexical error, ignore it.  */
 731                 continue;
 732             }
 733
 734           string_start ();
 735
 736           for (;;)
 737             {
 738               c = phase1_getc ();
 739
 740               if (c == ']')
 741                 {
 742                   c = phase1_getc ();
 743
 744                   /* Count the number of equal signs.  */
 745                   int esigns2 = 0;
 746                   while (c == '=')
 747                     {
 748                       esigns2++;
 749                       c = phase1_getc ();
 750                     }
 751
 752                   if (c == ']' && esigns == esigns2)
 753                     {
 754                       /* We got ']==...==]', where the number of equal
 755                          signs matches the number of equal signs in
 756                          the opening bracket.  */
 757                       string_end ();
 758                       tp->string = xstrdup (string_buf);
 759                       tp->comment = add_reference (savable_comment);
 760                       tp->type = token_type_string;
 761                       return;
 762                     }
 763                   else
 764                     {
 765                       /* Otherwise we got either ']==' garbage or
 766                          ']==...==]' with a different number of equal
 767                          signs.
 768
 769                          Add ']' and equal signs to the string, and
 770                          ungetc the current character, because the
 771                          second ']' might be a part of another closing
 772                          long bracket, e.g. '==]===]'.  */
 773                       phase1_ungetc (c);
 774
 775                       string_add (']');
 776                       while (esigns2--)
 777                         string_add ('=');
 778                     }
 779                 }
 780               else
 781                 {
 782                   if (c == EOF)
 783                     {
 784                       string_end ();
 785                       tp->string = xstrdup (string_buf);
 786                       tp->comment = add_reference (savable_comment);
 787                       tp->type = token_type_string;
 788                       return;
 789                     }
 790                   else
 791                     string_add (c);
 792                 }
 793             }
 794           break;
 795
 796         case ']':
 797           tp->type = token_type_rbracket;
 798           return;
 799
 800         default:
 801           if (c >= '0' && c <= '9')
 802             {
 803               while (c >= '0' && c <= '9')
 804                 c = phase1_getc ();
 805
 806               if (c == '.')
 807                 {
 808                   c = phase1_getc ();
 809                   while (c >= '0' && c <= '9')
 810                     c = phase1_getc ();
 811                 }
 812
 813               if (c == 'e' || c == 'E')
 814                 {
 815                   if (c == '+' || c == '-')
 816                     c = phase1_getc ();
 817                   while (c >= '0' && c <= '9')
 818                     c = phase1_getc ();
 819                 }
 820
 821               phase1_ungetc (c);
 822
 823               tp->type = token_type_number;
 824               return;
 825             }
 826           else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
 827                    || c == '_')
 828             {
 829               string_start ();
 830               while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
 831                      || c == '_' || (c >= '0' && c <= '9'))
 832                 {
 833                   string_add (c);
 834                   c = phase1_getc ();
 835                 }
 836               string_end ();
 837               phase1_ungetc (c);
 838
 839               if (strcmp (string_buf, "not") == 0)
 840                 tp->type = token_type_operator1;
 841               else if (strcmp (string_buf, "and") == 0)
 842                 tp->type = token_type_operator2;
 843               else if (strcmp (string_buf, "or") == 0)
 844                 tp->type = token_type_operator2;
 845               else
 846                 {
 847                   tp->string = xstrdup (string_buf);
 848                   tp->type = token_type_symbol;
 849                 }
 850               return;
 851             }
 852           else
 853             tp->type = token_type_other;
 854         }
 855     }
 856 }
 857
 858 /* String and symbol concatenation.  */
 859
 860 static token_type_ty phase4_last;
 861
 862 /* We need 3 pushback tokens for string and symbol concatenation.  */
 863 static int phase4_pushback_length;
 864 static token_ty phase4_pushback[3];
 865
 866 static void
 867 phase4_unget (token_ty *tp)
 868 {
 869   if (tp->type != token_type_eof)
 870     {
 871       if (phase4_pushback_length == SIZEOF (phase4_pushback))
 872         abort ();
 873       phase4_pushback[phase4_pushback_length++] = *tp;
 874     }
 875 }
 876
 877 static void
 878 phase4_get (token_ty *tp)
 879 {
 880   if (phase4_pushback_length)
 881     {
 882       *tp = phase4_pushback[--phase4_pushback_length];
 883       phase4_last = tp->type;
 884       return;
 885     }
 886
 887   phase3_get (tp);
 888   if (tp->type == token_type_string
 889       && !(phase4_last == token_type_operator1
 890            || phase4_last == token_type_dot
 891            || phase4_last == token_type_symbol
 892            || phase4_last == token_type_doubledot
 893            || phase4_last == token_type_rparen))
 894     {
 895       char *sum = tp->string;
 896       size_t sum_len = strlen (sum);
 897
 898       for (;;)
 899         {
 900           token_ty token2;
 901
 902           phase3_get (&token2);
 903           if (token2.type == token_type_doubledot)
 904             {
 905               token_ty token3;
 906
 907               phase3_get (&token3);
 908               if (token3.type == token_type_string)
 909                 {
 910                   token_ty token_after;
 911
 912                   phase3_get (&token_after);
 913                   if (token_after.type != token_type_operator1)
 914                     {
 915                       char *addend = token3.string;
 916                       size_t addend_len = strlen (addend);
 917
 918                       sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
 919                       memcpy (sum + sum_len, addend, addend_len + 1);
 920                       sum_len += addend_len;
 921
 922                       phase3_unget (&token_after);
 923                       free_token (&token3);
 924                       free_token (&token2);
 925                       continue;
 926                     }
 927                   phase3_unget (&token_after);
 928                 }
 929               phase3_unget (&token3);
 930             }
 931           phase3_unget (&token2);
 932           break;
 933         }
 934       tp->string = sum;
 935     }
 936   phase4_last = tp->type;
 937 }
 938
 939 static void
 940 phase5_get (token_ty *tp)
 941 {
 942   phase4_get (tp);
 943
 944   /* Combine symbol1 . ... . symbolN to a single strings, so that
 945      we can recognize function calls like
 946      gettext.gettext.  The information present for
 947      symbolI.....symbolN has precedence over the information for
 948      symbolJ.....symbolN with J > I.  */
 949   if (tp->type == token_type_symbol)
 950     {
 951       char *sum = tp->string;
 952       size_t sum_len = strlen (sum);
 953
 954       for (;;)
 955         {
 956           token_ty token2;
 957
 958           phase4_get (&token2);
 959           if (token2.type == token_type_dot)
 960             {
 961               token_ty token3;
 962
 963               phase4_get (&token3);
 964               if (token3.type == token_type_symbol)
 965                 {
 966                   char *addend = token3.string;
 967                   size_t addend_len = strlen (addend);
 968
 969                   sum = (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
 970                   sum[sum_len] = '.';
 971                   memcpy (sum + sum_len + 1, addend, addend_len + 1);
 972                   sum_len += 1 + addend_len;
 973
 974                   free_token (&token2);
 975                   free_token (&token3);
 976                   continue;
 977                 }
 978               phase4_unget (&token3);
 979             }
 980           phase4_unget (&token2);
 981           break;
 982         }
 983       tp->string = sum;
 984     }
 985 }
 986
 987 static void
 988 x_lua_lex (token_ty *tok)
 989 {
 990   phase5_get (tok);
 991 }
 992
 993
 994 /* ========================= Extracting strings.  ========================== */
 995
 996
 997 /* Context lookup table.  */
 998 static flag_context_list_table_ty *flag_context_list_table;
 999
1000
1001 /* The file is broken into tokens.  Scan the token stream, looking for
1002    a keyword, followed by a left paren, followed by a string.  When we
1003    see this sequence, we have something to remember.  We assume we are
1004    looking at a valid Lua program, and leave the complaints about the
1005    grammar to the compiler.
1006
1007      Normal handling: Look for
1008        keyword ( ... msgid ... )
1009        keyword msgid
1010      Plural handling: Look for
1011        keyword ( ... msgid ... msgid_plural ... )
1012
1013    We use recursion because the arguments before msgid or between msgid
1014    and msgid_plural can contain subexpressions of the same form.  */
1015
1016 /* Extract messages until the next balanced closing parenthesis or bracket.
1017    Extracted messages are added to MLP.
1018    DELIM can be either token_type_rparen or token_type_rbracket, or
1019    token_type_eof to accept both.
1020    Return true upon eof, false upon closing parenthesis or bracket.  */
1021 static bool
1022 extract_balanced (message_list_ty *mlp, token_type_ty delim,
1023                   flag_context_ty outer_context,
1024                   flag_context_list_iterator_ty context_iter,
1025                   struct arglist_parser *argparser)
1026 {
1027   /* Current argument number.  */
1028   int arg = 1;
1029   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1030   int state;
1031   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1032   const struct callshapes *next_shapes = NULL;
1033   /* Context iterator that will be used if the next token is a '('.  */
1034   flag_context_list_iterator_ty next_context_iter =
1035     passthrough_context_list_iterator;
1036   /* Current context.  */
1037   flag_context_ty inner_context =
1038     inherited_context (outer_context,
1039                        flag_context_list_iterator_advance (&context_iter));
1040
1041   /* Start state is 0.  */
1042   state = 0;
1043
1044   for (;;)
1045     {
1046       token_ty token;
1047
1048       x_lua_lex (&token);
1049
1050       switch (token.type)
1051         {
1052         case token_type_symbol:
1053           {
1054             void *keyword_value;
1055
1056             if (hash_find_entry (&keywords, token.string, strlen (token.string),
1057                                  &keyword_value)
1058                 == 0)
1059               {
1060                 next_shapes = (const struct callshapes *) keyword_value;
1061                 state = 1;
1062               }
1063             else
1064               state = 0;
1065           }
1066           next_context_iter =
1067             flag_context_list_iterator (
1068               flag_context_list_table_lookup (
1069                 flag_context_list_table,
1070                 token.string, strlen (token.string)));
1071           free (token.string);
1072           continue;
1073
1074         case token_type_lparen:
1075           if (extract_balanced (mlp, token_type_rparen,
1076                                 inner_context, next_context_iter,
1077                                 arglist_parser_alloc (mlp,
1078                                                       state ? next_shapes : NULL)))
1079             {
1080               arglist_parser_done (argparser, arg);
1081               return true;
1082             }
1083           next_context_iter = null_context_list_iterator;
1084           state = 0;
1085           break;
1086
1087         case token_type_rparen:
1088           if (delim == token_type_rparen || delim == token_type_eof)
1089             {
1090               arglist_parser_done (argparser, arg);
1091               return false;
1092             }
1093
1094           next_context_iter = null_context_list_iterator;
1095           state = 0;
1096           continue;
1097
1098         case token_type_lbracket:
1099           if (extract_balanced (mlp, token_type_rbracket,
1100                                 null_context, null_context_list_iterator,
1101                                 arglist_parser_alloc (mlp, NULL)))
1102             {
1103               arglist_parser_done (argparser, arg);
1104               return true;
1105             }
1106           next_context_iter = null_context_list_iterator;
1107           state = 0;
1108           break;
1109
1110         case token_type_rbracket:
1111           if (delim == token_type_rbracket || delim == token_type_eof)
1112             {
1113               arglist_parser_done (argparser, arg);
1114               return false;
1115             }
1116
1117           next_context_iter = null_context_list_iterator;
1118           state = 0;
1119           continue;
1120
1121         case token_type_comma:
1122           arg++;
1123           inner_context =
1124             inherited_context (outer_context,
1125                                flag_context_list_iterator_advance (
1126                                  &context_iter));
1127           next_context_iter = passthrough_context_list_iterator;
1128           state = 0;
1129           continue;
1130
1131         case token_type_eof:
1132           arglist_parser_done (argparser, arg);
1133           return true;
1134
1135         case token_type_string:
1136           {
1137             lex_pos_ty pos;
1138             pos.file_name = logical_file_name;
1139             pos.line_number = token.line_number;
1140
1141             if (extract_all)
1142               remember_a_message (mlp, NULL, token.string, inner_context,
1143                                   &pos, NULL, token.comment);
1144             else
1145               {
1146                 /* A string immediately after a symbol means a function call.  */
1147                 if (state)
1148                   {
1149                     struct arglist_parser *tmp_argparser;
1150                     tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
1151
1152                     arglist_parser_remember (tmp_argparser, 1, token.string,
1153                                              inner_context, pos.file_name,
1154                                              pos.line_number, token.comment);
1155                     arglist_parser_done (tmp_argparser, 1);
1156                   }
1157                 else
1158                   arglist_parser_remember (argparser, arg, token.string,
1159                                            inner_context, pos.file_name,
1160                                            pos.line_number, token.comment);
1161               }
1162           }
1163           drop_reference (token.comment);
1164           next_context_iter = null_context_list_iterator;
1165           state = 0;
1166           continue;
1167
1168         case token_type_dot:
1169         case token_type_doubledot:
1170         case token_type_operator1:
1171         case token_type_operator2:
1172         case token_type_number:
1173         case token_type_other:
1174           next_context_iter = null_context_list_iterator;
1175           state = 0;
1176           continue;
1177
1178         default:
1179           abort ();
1180         }
1181     }
1182 }
1183
1184 void
1185 extract_lua (FILE *f,
1186              const char *real_filename, const char *logical_filename,
1187              flag_context_list_table_ty *flag_table,
1188              msgdomain_list_ty *mdlp)
1189 {
1190   message_list_ty *mlp = mdlp->item[0]->messages;
1191
1192   fp = f;
1193   real_file_name = real_filename;
1194   logical_file_name = xstrdup (logical_filename);
1195   line_number = 1;
1196
1197   last_comment_line = -1;
1198   last_non_comment_line = -1;
1199
1200   flag_context_list_table = flag_table;
1201
1202   init_keywords ();
1203
1204   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1205      due to an unbalanced closing parenthesis, just restart it.  */
1206   while (!extract_balanced (mlp, token_type_eof,
1207                             null_context, null_context_list_iterator,
1208                             arglist_parser_alloc (mlp, NULL)))
1209     ;
1210
1211   fp = NULL;
1212   real_file_name = NULL;
1213   logical_file_name = NULL;
1214   line_number = 0;
1215 }