gettext-tools/src/x-smalltalk.c

   1 /* xgettext Smalltalk backend.
   2    Copyright (C) 2002-2003, 2005-2009, 2011, 2015 Free Software
   3    Foundation, Inc.
   4
   5    This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
   6
   7    This program is free software: you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 /* Specification.  */
  25 #include "x-smalltalk.h"
  26
  27 #include <errno.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30
  31 #include "message.h"
  32 #include "xgettext.h"
  33 #include "error.h"
  34 #include "xalloc.h"
  35 #include "gettext.h"
  36
  37 #define _(s) gettext(s)
  38
  39 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  40
  41
  42 /* The relevant parts of the Smalltalk syntax are:
  43
  44      stringliteral ::= string | stringconst | symconst
  45      stringconst ::= "#"string
  46      string      ::= "'"[char]*"'"
  47      symconst    ::= "#"symbol
  48      symbol      ::= id | binsel | keysel[keysel]*
  49      keysel      ::= id":"
  50      id          ::= letter[letter|digit]*
  51      letter      ::= "A".."Z" | "a".."z"
  52      digit       ::= "0".."9"
  53      binsel      ::= selchar[selchar]
  54      selchar     ::= "+" | "-" | "*" | "/" | "~" | "|" | "," | "<" | ">"
  55                      | "=" | "&" | "@" | "?" | "%" | "\"
  56
  57    Strings can contain any characters; to include the string delimiter itself,
  58    it must be duplicated.
  59
  60    Character constants are written  "$"char
  61
  62    Comments are enclosed within double quotes.
  63
  64    In well-formed expressions, {} and [] and () are balanced.
  65  */
  66
  67
  68 /* ======================== Reading of characters.  ======================== */
  69
  70
/* Real filename, used in error messages about the input file.
   Set by extract_smalltalk on entry and reset to NULL when it finishes.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number starts at 1 for each file and is kept up to date by
   phase1_getc and phase1_ungetc.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  Set by extract_smalltalk on entry and reset to
   NULL when it finishes.  */
static FILE *fp;
  80
  81
  82 /* 1. line_number handling.  */
  83
  84 static int
  85 phase1_getc ()
  86 {
  87   int c = getc (fp);
  88
  89   if (c == EOF)
  90     {
  91       if (ferror (fp))
  92         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
  93                real_file_name);
  94       return EOF;
  95     }
  96
  97   if (c == '\n')
  98     line_number++;
  99
 100   return c;
 101 }
 102
 103 /* Supports only one pushback character.  */
 104 static void
 105 phase1_ungetc (int c)
 106 {
 107   if (c != EOF)
 108     {
 109       if (c == '\n')
 110         --line_number;
 111
 112       ungetc (c, fp);
 113     }
 114 }
 115
 116
 117 /* Accumulating comments.  */
 118
/* Growable buffer holding the comment line currently being accumulated.
   'buffer' has room for 'bufmax' bytes, of which the first 'buflen' are in
   use.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;
 122
/* Begin accumulating a new comment line by marking the comment buffer as
   empty.  The allocated storage is kept for reuse.  */
static inline void
comment_start ()
{
  buflen = 0;
}
 128
 129 static inline void
 130 comment_add (int c)
 131 {
 132   if (buflen >= bufmax)
 133     {
 134       bufmax = 2 * bufmax + 10;
 135       buffer = xrealloc (buffer, bufmax);
 136     }
 137   buffer[buflen++] = c;
 138 }
 139
 140 static inline void
 141 comment_line_end ()
 142 {
 143   while (buflen >= 1
 144          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 145     --buflen;
 146   if (buflen >= bufmax)
 147     {
 148       bufmax = 2 * bufmax + 10;
 149       buffer = xrealloc (buffer, bufmax);
 150     }
 151   buffer[buflen] = '\0';
 152   savable_comment_add (buffer);
 153 }
 154
 155
/* These are for tracking whether comments count as immediately before
   keyword.
   last_comment_line is the line on which the most recent comment started;
   last_non_comment_line is the line of the most recent character that is
   neither part of a comment nor whitespace.  extract_smalltalk initializes
   both to -1.  */
static int last_comment_line;
static int last_non_comment_line;
 160
 161
 162 /* ========================== Reading of tokens.  ========================== */
 163
 164
/* The kinds of tokens produced by the lexer phases.  */
enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_uniq,              /* # */
  token_type_symbol,            /* symbol */
  token_type_string_literal,    /* string, stringconst, symconst */
  token_type_other              /* misc. operator */
};
typedef enum token_type_ty token_type_ty;
 174
/* A single token as delivered by phase2_get and x_smalltalk_lex.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;         /* for token_type_string_literal, token_type_symbol;
                           heap-allocated, to be released with free () */
  int line_number;      /* value of line_number when the token began */
};
 182
 183
/* 2. Combine characters into tokens.  Discard comments and whitespace.  */

/* Tokens stored by phase2_unget; phase2_get hands them out again before
   reading any further input.  phase2_pushback_length is the number of
   tokens currently stored.  */
static token_ty phase2_pushback[1];
static int phase2_pushback_length;
 188
 189 static void
 190 phase2_get (token_ty *tp)
 191 {
 192   static char *buffer;
 193   static int bufmax;
 194   int bufpos;
 195   int c;
 196
 197   if (phase2_pushback_length)
 198     {
 199       *tp = phase2_pushback[--phase2_pushback_length];
 200       return;
 201     }
 202
 203   tp->string = NULL;
 204
 205   for (;;)
 206     {
 207       tp->line_number = line_number;
 208       c = phase1_getc ();
 209       switch (c)
 210         {
 211         case EOF:
 212           tp->type = token_type_eof;
 213           return;
 214
 215         case '"':
 216           {
 217             /* Comment.  */
 218             int lineno;
 219
 220             comment_start ();
 221             lineno = line_number;
 222             for (;;)
 223               {
 224                 c = phase1_getc ();
 225                 if (c == '"' || c == EOF)
 226                   break;
 227                 if (c == '\n')
 228                   {
 229                     comment_line_end ();
 230                     comment_start ();
 231                   }
 232                 else
 233                   {
 234                     /* We skip all leading white space, but not EOLs.  */
 235                     if (!(buflen == 0 && (c == ' ' || c == '\t')))
 236                       comment_add (c);
 237                   }
 238               }
 239             comment_line_end ();
 240             last_comment_line = lineno;
 241             continue;
 242           }
 243
 244         case '\n':
 245           if (last_non_comment_line > last_comment_line)
 246             savable_comment_reset ();
 247           /* FALLTHROUGH */
 248         case ' ':
 249         case '\t':
 250         case '\r':
 251           /* Ignore whitespace.  */
 252           continue;
 253         }
 254
 255       last_non_comment_line = tp->line_number;
 256
 257       switch (c)
 258         {
 259         case '\'':
 260           /* String literal.  */
 261           bufpos = 0;
 262           for (;;)
 263             {
 264               c = phase1_getc ();
 265               if (c == EOF)
 266                 break;
 267               if (c == '\'')
 268                 {
 269                   c = phase1_getc ();
 270                   if (c != '\'')
 271                     {
 272                       phase1_ungetc (c);
 273                       break;
 274                     }
 275                 }
 276               if (bufpos >= bufmax)
 277                 {
 278                   bufmax = 2 * bufmax + 10;
 279                   buffer = xrealloc (buffer, bufmax);
 280                 }
 281               buffer[bufpos++] = c;
 282             }
 283           if (bufpos >= bufmax)
 284             {
 285               bufmax = 2 * bufmax + 10;
 286               buffer = xrealloc (buffer, bufmax);
 287             }
 288           buffer[bufpos] = 0;
 289           tp->type = token_type_string_literal;
 290           tp->string = xstrdup (buffer);
 291           return;
 292
 293         case '+':
 294         case '-':
 295         case '*':
 296         case '/':
 297         case '~':
 298         case '|':
 299         case ',':
 300         case '<':
 301         case '>':
 302         case '=':
 303         case '&':
 304         case '@':
 305         case '?':
 306         case '%':
 307         case '\\':
 308           {
 309             char *name;
 310             int c2 = phase1_getc ();
 311             switch (c2)
 312               {
 313               case '+':
 314               case '-':
 315               case '*':
 316               case '/':
 317               case '~':
 318               case '|':
 319               case ',':
 320               case '<':
 321               case '>':
 322               case '=':
 323               case '&':
 324               case '@':
 325               case '?':
 326               case '%':
 327                 name = XNMALLOC (3, char);
 328                 name[0] = c;
 329                 name[1] = c2;
 330                 name[2] = '\0';
 331                 tp->type = token_type_symbol;
 332                 tp->string = name;
 333                 return;
 334               default:
 335                 phase1_ungetc (c2);
 336                 break;
 337               }
 338             name = XNMALLOC (2, char);
 339             name[0] = c;
 340             name[1] = '\0';
 341             tp->type = token_type_symbol;
 342             tp->string = name;
 343             return;
 344           }
 345
 346         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 347         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 348         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 349         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 350         case 'Y': case 'Z':
 351         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 352         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 353         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 354         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 355         case 'y': case 'z':
 356           /* Recognize id or id":"[id":"]* or id":"[id":"]*id.  */
 357           bufpos = 0;
 358           for (;;)
 359             {
 360               if (bufpos >= bufmax)
 361                 {
 362                   bufmax = 2 * bufmax + 10;
 363                   buffer = xrealloc (buffer, bufmax);
 364                 }
 365               buffer[bufpos++] = c;
 366               c = phase1_getc ();
 367               switch (c)
 368                 {
 369                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 370                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 371                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 372                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 373                 case 'Y': case 'Z':
 374                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 375                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 376                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 377                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 378                 case 'y': case 'z':
 379                 case '0': case '1': case '2': case '3': case '4':
 380                 case '5': case '6': case '7': case '8': case '9':
 381                   continue;
 382                 case ':':
 383                   if (bufpos >= bufmax)
 384                     {
 385                       bufmax = 2 * bufmax + 10;
 386                       buffer = xrealloc (buffer, bufmax);
 387                     }
 388                   buffer[bufpos++] = c;
 389                   c = phase1_getc ();
 390                   switch (c)
 391                     {
 392                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 393                     case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 394                     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 395                     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 396                     case 'Y': case 'Z':
 397                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 398                     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 399                     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 400                     case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 401                     case 'y': case 'z':
 402                       continue;
 403                     default:
 404                       phase1_ungetc (c);
 405                       break;
 406                     }
 407                   break;
 408                 default:
 409                   phase1_ungetc (c);
 410                   break;
 411                 }
 412               break;
 413             }
 414           if (bufpos >= bufmax)
 415             {
 416               bufmax = 2 * bufmax + 10;
 417               buffer = xrealloc (buffer, bufmax);
 418             }
 419           buffer[bufpos] = '\0';
 420           tp->string = xstrdup (buffer);
 421           tp->type = token_type_symbol;
 422           return;
 423
 424         case '#':
 425           /* Uniquification operator.  */
 426           tp->type = token_type_uniq;
 427           return;
 428
 429         case '$':
 430           c = phase1_getc ();
 431           tp->type = token_type_other;
 432           return;
 433
 434         default:
 435           tp->type = token_type_other;
 436           return;
 437         }
 438     }
 439 }
 440
 441 /* Supports only one pushback token.  */
 442 static void
 443 phase2_unget (token_ty *tp)
 444 {
 445   if (tp->type != token_type_eof)
 446     {
 447       if (phase2_pushback_length == SIZEOF (phase2_pushback))
 448         abort ();
 449       phase2_pushback[phase2_pushback_length++] = *tp;
 450     }
 451 }
 452
 453
 454 /* 3. Combine "# string_literal" and "# symbol" to a single token.  */
 455
 456 static void
 457 x_smalltalk_lex (token_ty *tp)
 458 {
 459   phase2_get (tp);
 460   if (tp->type == token_type_uniq)
 461     {
 462       token_ty token2;
 463
 464       phase2_get (&token2);
 465       if (token2.type == token_type_symbol
 466           || token2.type == token_type_string_literal)
 467         {
 468           tp->type = token_type_string_literal;
 469           tp->string = token2.string;
 470         }
 471       else
 472         phase2_unget (&token2);
 473     }
 474 }
 475
 476
 477 /* ========================= Extracting strings.  ========================== */
 478
 479 /* The file is broken into tokens.  Scan the token stream, looking for the
 480    following patterns
 481       NLS ? <string>
 482       NLS at: <string>
 483       NLS at: <string> plural: <string>
 484    where <string> is one of
 485       string_literal
 486       # string_literal
 487       # symbol
 488  */
 489
 490 void
 491 extract_smalltalk (FILE *f,
 492                    const char *real_filename, const char *logical_filename,
 493                    flag_context_list_table_ty *flag_table,
 494                    msgdomain_list_ty *mdlp)
 495 {
 496   message_list_ty *mlp = mdlp->item[0]->messages;
 497
 498   fp = f;
 499   real_file_name = real_filename;
 500   logical_file_name = xstrdup (logical_filename);
 501   line_number = 1;
 502
 503   last_comment_line = -1;
 504   last_non_comment_line = -1;
 505
 506   /* Eat tokens until eof is seen.  */
 507   {
 508     /* 0 when no "NLS" has been seen.
 509        1 after "NLS".
 510        2 after "NLS ?".
 511        3 after "NLS at:".
 512        4 after "NLS at: <string>".
 513        5 after "NLS at: <string> plural:".  */
 514     int state;
 515     /* Remember the message containing the msgid, for msgid_plural.
 516        Non-NULL in states 4, 5.  */
 517     message_ty *plural_mp = NULL;
 518
 519     /* Start state is 0.  */
 520     state = 0;
 521
 522     for (;;)
 523       {
 524         token_ty token;
 525
 526         x_smalltalk_lex (&token);
 527
 528         switch (token.type)
 529           {
 530           case token_type_symbol:
 531             state = (strcmp (token.string, "NLS") == 0 ? 1 :
 532                      strcmp (token.string, "?") == 0 && state == 1 ? 2 :
 533                      strcmp (token.string, "at:") == 0 && state == 1 ? 3 :
 534                      strcmp (token.string, "plural:") == 0 && state == 4 ? 5 :
 535                      0);
 536             free (token.string);
 537             break;
 538
 539           case token_type_string_literal:
 540             if (state == 2)
 541               {
 542                 lex_pos_ty pos;
 543                 pos.file_name = logical_file_name;
 544                 pos.line_number = token.line_number;
 545                 remember_a_message (mlp, NULL, token.string, null_context,
 546                                     &pos, NULL, savable_comment);
 547                 state = 0;
 548                 break;
 549               }
 550             if (state == 3)
 551               {
 552                 lex_pos_ty pos;
 553                 pos.file_name = logical_file_name;
 554                 pos.line_number = token.line_number;
 555                 plural_mp = remember_a_message (mlp, NULL, token.string,
 556                                                 null_context, &pos,
 557                                                 NULL, savable_comment);
 558                 state = 4;
 559                 break;
 560               }
 561             if (state == 5)
 562               {
 563                 lex_pos_ty pos;
 564                 pos.file_name = logical_file_name;
 565                 pos.line_number = token.line_number;
 566                 if (plural_mp != NULL)
 567                   remember_a_message_plural (plural_mp, token.string,
 568                                              null_context, &pos,
 569                                              savable_comment);
 570                 state = 0;
 571                 break;
 572               }
 573             state = 0;
 574             free (token.string);
 575             break;
 576
 577           case token_type_uniq:
 578           case token_type_other:
 579             state = 0;
 580             break;
 581
 582           case token_type_eof:
 583             break;
 584
 585           default:
 586             abort ();
 587           }
 588
 589         if (token.type == token_type_eof)
 590           break;
 591       }
 592   }
 593
 594   /* Close scanner.  */
 595   fp = NULL;
 596   real_file_name = NULL;
 597   logical_file_name = NULL;
 598   line_number = 0;
 599 }