gettext-tools/src/x-smalltalk.c

   1 /* xgettext Smalltalk backend.
   2    Copyright (C) 2002-2003, 2005-2009, 2011 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22
  23 /* Specification.  */
  24 #include "x-smalltalk.h"
  25
  26 #include <errno.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29
  30 #include "message.h"
  31 #include "xgettext.h"
  32 #include "error.h"
  33 #include "xalloc.h"
  34 #include "gettext.h"
  35
  36 #define _(s) gettext(s)
  37
  38 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  39
  40
  41 /* The relevant parts of the Smalltalk syntax are:
  42
  43      stringliteral ::= string | stringconst | symconst
  44      stringconst ::= "#"string
  45      string      ::= "'"[char]*"'"
  46      symconst    ::= "#"symbol
  47      symbol      ::= id | binsel | keysel[keysel]*
  48      keysel      ::= id":"
  49      id          ::= letter[letter|digit]*
  50      letter      ::= "A".."Z" | "a".."z"
  51      digit       ::= "0".."9"
  52      binsel      ::= selchar[selchar]
  53      selchar     ::= "+" | "-" | "*" | "/" | "~" | "|" | "," | "<" | ">"
  54                      | "=" | "&" | "@" | "?" | "%" | "\"
  55
  56    Strings can contain any characters; to include the string delimiter itself,
  57    it must be duplicated.
  58
  59    Character constants are written  "$"char
  60
  61    Comments are enclosed within double quotes.
  62
  63    In well-formed expressions, {} and [] and () are balanced.
  64  */
  65
  66
  67 /* ======================== Reading of characters.  ======================== */
  68
  69
/* Real filename, used in error messages about the input file.
   Set by extract_smalltalk for the duration of a scan.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is kept up to date by phase1_getc and phase1_ungetc.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  All character input goes through phase1_getc.  */
static FILE *fp;
  79
  80
  81 /* 1. line_number handling.  */
  82
  83 static int
  84 phase1_getc ()
  85 {
  86   int c = getc (fp);
  87
  88   if (c == EOF)
  89     {
  90       if (ferror (fp))
  91         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
  92                real_file_name);
  93       return EOF;
  94     }
  95
  96   if (c == '\n')
  97     line_number++;
  98
  99   return c;
 100 }
 101
 102 /* Supports only one pushback character.  */
 103 static void
 104 phase1_ungetc (int c)
 105 {
 106   if (c != EOF)
 107     {
 108       if (c == '\n')
 109         --line_number;
 110
 111       ungetc (c, fp);
 112     }
 113 }
 114
 115
 116 /* Accumulating comments.  */
 117
 118 static char *buffer;
 119 static size_t bufmax;
 120 static size_t buflen;
 121
 122 static inline void
 123 comment_start ()
 124 {
 125   buflen = 0;
 126 }
 127
 128 static inline void
 129 comment_add (int c)
 130 {
 131   if (buflen >= bufmax)
 132     {
 133       bufmax = 2 * bufmax + 10;
 134       buffer = xrealloc (buffer, bufmax);
 135     }
 136   buffer[buflen++] = c;
 137 }
 138
 139 static inline void
 140 comment_line_end ()
 141 {
 142   while (buflen >= 1
 143          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 144     --buflen;
 145   if (buflen >= bufmax)
 146     {
 147       bufmax = 2 * bufmax + 10;
 148       buffer = xrealloc (buffer, bufmax);
 149     }
 150   buffer[buflen] = '\0';
 151   savable_comment_add (buffer);
 152 }
 153
 154
 155 /* These are for tracking whether comments count as immediately before
 156    keyword.  */
 157 static int last_comment_line;
 158 static int last_non_comment_line;
 159
 160
 161 /* ========================== Reading of tokens.  ========================== */
 162
 163
/* Kinds of tokens produced by phase2_get and x_smalltalk_lex.  */
enum token_type_ty
{
  token_type_eof,
  token_type_uniq,              /* # */
  token_type_symbol,            /* symbol */
  token_type_string_literal,    /* string, stringconst, symconst */
  token_type_other              /* misc. operator */
};
typedef enum token_type_ty token_type_ty;

/* A lexical token.  STRING is freshly allocated by the lexer for symbol and
   string literal tokens and is NULL for the other token types.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;         /* for token_type_string_literal, token_type_symbol */
  int line_number;      /* value of line_number when the token was started */
};
 181
 182
 183 /* 2. Combine characters into tokens.  Discard comments and whitespace.  */
 184
 185 static token_ty phase2_pushback[1];
 186 static int phase2_pushback_length;
 187
 188 static void
 189 phase2_get (token_ty *tp)
 190 {
 191   static char *buffer;
 192   static int bufmax;
 193   int bufpos;
 194   int c;
 195
 196   if (phase2_pushback_length)
 197     {
 198       *tp = phase2_pushback[--phase2_pushback_length];
 199       return;
 200     }
 201
 202   tp->string = NULL;
 203
 204   for (;;)
 205     {
 206       tp->line_number = line_number;
 207       c = phase1_getc ();
 208       switch (c)
 209         {
 210         case EOF:
 211           tp->type = token_type_eof;
 212           return;
 213
 214         case '"':
 215           {
 216             /* Comment.  */
 217             int lineno;
 218
 219             comment_start ();
 220             lineno = line_number;
 221             for (;;)
 222               {
 223                 c = phase1_getc ();
 224                 if (c == '"' || c == EOF)
 225                   break;
 226                 if (c == '\n')
 227                   {
 228                     comment_line_end ();
 229                     comment_start ();
 230                   }
 231                 else
 232                   {
 233                     /* We skip all leading white space, but not EOLs.  */
 234                     if (!(buflen == 0 && (c == ' ' || c == '\t')))
 235                       comment_add (c);
 236                   }
 237               }
 238             comment_line_end ();
 239             last_comment_line = lineno;
 240             continue;
 241           }
 242
 243         case '\n':
 244           if (last_non_comment_line > last_comment_line)
 245             savable_comment_reset ();
 246           /* FALLTHROUGH */
 247         case ' ':
 248         case '\t':
 249         case '\r':
 250           /* Ignore whitespace.  */
 251           continue;
 252         }
 253
 254       last_non_comment_line = tp->line_number;
 255
 256       switch (c)
 257         {
 258         case '\'':
 259           /* String literal.  */
 260           bufpos = 0;
 261           for (;;)
 262             {
 263               c = phase1_getc ();
 264               if (c == EOF)
 265                 break;
 266               if (c == '\'')
 267                 {
 268                   c = phase1_getc ();
 269                   if (c != '\'')
 270                     {
 271                       phase1_ungetc (c);
 272                       break;
 273                     }
 274                 }
 275               if (bufpos >= bufmax)
 276                 {
 277                   bufmax = 2 * bufmax + 10;
 278                   buffer = xrealloc (buffer, bufmax);
 279                 }
 280               buffer[bufpos++] = c;
 281             }
 282           if (bufpos >= bufmax)
 283             {
 284               bufmax = 2 * bufmax + 10;
 285               buffer = xrealloc (buffer, bufmax);
 286             }
 287           buffer[bufpos] = 0;
 288           tp->type = token_type_string_literal;
 289           tp->string = xstrdup (buffer);
 290           return;
 291
 292         case '+':
 293         case '-':
 294         case '*':
 295         case '/':
 296         case '~':
 297         case '|':
 298         case ',':
 299         case '<':
 300         case '>':
 301         case '=':
 302         case '&':
 303         case '@':
 304         case '?':
 305         case '%':
 306         case '\\':
 307           {
 308             char *name;
 309             int c2 = phase1_getc ();
 310             switch (c2)
 311               {
 312               case '+':
 313               case '-':
 314               case '*':
 315               case '/':
 316               case '~':
 317               case '|':
 318               case ',':
 319               case '<':
 320               case '>':
 321               case '=':
 322               case '&':
 323               case '@':
 324               case '?':
 325               case '%':
 326                 name = XNMALLOC (3, char);
 327                 name[0] = c;
 328                 name[1] = c2;
 329                 name[2] = '\0';
 330                 tp->type = token_type_symbol;
 331                 tp->string = name;
 332                 return;
 333               default:
 334                 phase1_ungetc (c2);
 335                 break;
 336               }
 337             name = XNMALLOC (2, char);
 338             name[0] = c;
 339             name[1] = '\0';
 340             tp->type = token_type_symbol;
 341             tp->string = name;
 342             return;
 343           }
 344
 345         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 346         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 347         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 348         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 349         case 'Y': case 'Z':
 350         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 351         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 352         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 353         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 354         case 'y': case 'z':
 355           /* Recognize id or id":"[id":"]* or id":"[id":"]*id.  */
 356           bufpos = 0;
 357           for (;;)
 358             {
 359               if (bufpos >= bufmax)
 360                 {
 361                   bufmax = 2 * bufmax + 10;
 362                   buffer = xrealloc (buffer, bufmax);
 363                 }
 364               buffer[bufpos++] = c;
 365               c = phase1_getc ();
 366               switch (c)
 367                 {
 368                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 369                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 370                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 371                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 372                 case 'Y': case 'Z':
 373                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 374                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 375                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 376                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 377                 case 'y': case 'z':
 378                 case '0': case '1': case '2': case '3': case '4':
 379                 case '5': case '6': case '7': case '8': case '9':
 380                   continue;
 381                 case ':':
 382                   if (bufpos >= bufmax)
 383                     {
 384                       bufmax = 2 * bufmax + 10;
 385                       buffer = xrealloc (buffer, bufmax);
 386                     }
 387                   buffer[bufpos++] = c;
 388                   c = phase1_getc ();
 389                   switch (c)
 390                     {
 391                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 392                     case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 393                     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 394                     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 395                     case 'Y': case 'Z':
 396                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 397                     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 398                     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 399                     case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 400                     case 'y': case 'z':
 401                       continue;
 402                     default:
 403                       phase1_ungetc (c);
 404                       break;
 405                     }
 406                   break;
 407                 default:
 408                   phase1_ungetc (c);
 409                   break;
 410                 }
 411               break;
 412             }
 413           if (bufpos >= bufmax)
 414             {
 415               bufmax = 2 * bufmax + 10;
 416               buffer = xrealloc (buffer, bufmax);
 417             }
 418           buffer[bufpos] = '\0';
 419           tp->string = xstrdup (buffer);
 420           tp->type = token_type_symbol;
 421           return;
 422
 423         case '#':
 424           /* Uniquification operator.  */
 425           tp->type = token_type_uniq;
 426           return;
 427
 428         case '$':
 429           c = phase1_getc ();
 430           tp->type = token_type_other;
 431           return;
 432
 433         default:
 434           tp->type = token_type_other;
 435           return;
 436         }
 437     }
 438 }
 439
 440 /* Supports only one pushback token.  */
 441 static void
 442 phase2_unget (token_ty *tp)
 443 {
 444   if (tp->type != token_type_eof)
 445     {
 446       if (phase2_pushback_length == SIZEOF (phase2_pushback))
 447         abort ();
 448       phase2_pushback[phase2_pushback_length++] = *tp;
 449     }
 450 }
 451
 452
 453 /* 3. Combine "# string_literal" and "# symbol" to a single token.  */
 454
 455 static void
 456 x_smalltalk_lex (token_ty *tp)
 457 {
 458   phase2_get (tp);
 459   if (tp->type == token_type_uniq)
 460     {
 461       token_ty token2;
 462
 463       phase2_get (&token2);
 464       if (token2.type == token_type_symbol
 465           || token2.type == token_type_string_literal)
 466         {
 467           tp->type = token_type_string_literal;
 468           tp->string = token2.string;
 469         }
 470       else
 471         phase2_unget (&token2);
 472     }
 473 }
 474
 475
 476 /* ========================= Extracting strings.  ========================== */
 477
 478 /* The file is broken into tokens.  Scan the token stream, looking for the
 479    following patterns
 480       NLS ? <string>
 481       NLS at: <string>
 482       NLS at: <string> plural: <string>
 483    where <string> is one of
 484       string_literal
 485       # string_literal
 486       # symbol
 487  */
 488
 489 void
 490 extract_smalltalk (FILE *f,
 491                    const char *real_filename, const char *logical_filename,
 492                    flag_context_list_table_ty *flag_table,
 493                    msgdomain_list_ty *mdlp)
 494 {
 495   message_list_ty *mlp = mdlp->item[0]->messages;
 496
 497   fp = f;
 498   real_file_name = real_filename;
 499   logical_file_name = xstrdup (logical_filename);
 500   line_number = 1;
 501
 502   last_comment_line = -1;
 503   last_non_comment_line = -1;
 504
 505   /* Eat tokens until eof is seen.  */
 506   {
 507     /* 0 when no "NLS" has been seen.
 508        1 after "NLS".
 509        2 after "NLS ?".
 510        3 after "NLS at:".
 511        4 after "NLS at: <string>".
 512        5 after "NLS at: <string> plural:".  */
 513     int state;
 514     /* Remember the message containing the msgid, for msgid_plural.
 515        Non-NULL in states 4, 5.  */
 516     message_ty *plural_mp = NULL;
 517
 518     /* Start state is 0.  */
 519     state = 0;
 520
 521     for (;;)
 522       {
 523         token_ty token;
 524
 525         x_smalltalk_lex (&token);
 526
 527         switch (token.type)
 528           {
 529           case token_type_symbol:
 530             state = (strcmp (token.string, "NLS") == 0 ? 1 :
 531                      strcmp (token.string, "?") == 0 && state == 1 ? 2 :
 532                      strcmp (token.string, "at:") == 0 && state == 1 ? 3 :
 533                      strcmp (token.string, "plural:") == 0 && state == 4 ? 5 :
 534                      0);
 535             free (token.string);
 536             break;
 537
 538           case token_type_string_literal:
 539             if (state == 2)
 540               {
 541                 lex_pos_ty pos;
 542                 pos.file_name = logical_file_name;
 543                 pos.line_number = token.line_number;
 544                 remember_a_message (mlp, NULL, token.string, null_context,
 545                                     &pos, NULL, savable_comment);
 546                 state = 0;
 547                 break;
 548               }
 549             if (state == 3)
 550               {
 551                 lex_pos_ty pos;
 552                 pos.file_name = logical_file_name;
 553                 pos.line_number = token.line_number;
 554                 plural_mp = remember_a_message (mlp, NULL, token.string,
 555                                                 null_context, &pos,
 556                                                 NULL, savable_comment);
 557                 state = 4;
 558                 break;
 559               }
 560             if (state == 5)
 561               {
 562                 lex_pos_ty pos;
 563                 pos.file_name = logical_file_name;
 564                 pos.line_number = token.line_number;
 565                 if (plural_mp != NULL)
 566                   remember_a_message_plural (plural_mp, token.string,
 567                                              null_context, &pos,
 568                                              savable_comment);
 569                 state = 0;
 570                 break;
 571               }
 572             state = 0;
 573             free (token.string);
 574             break;
 575
 576           case token_type_uniq:
 577           case token_type_other:
 578             state = 0;
 579             break;
 580
 581           case token_type_eof:
 582             break;
 583
 584           default:
 585             abort ();
 586           }
 587
 588         if (token.type == token_type_eof)
 589           break;
 590       }
 591   }
 592
 593   /* Close scanner.  */
 594   fp = NULL;
 595   real_file_name = NULL;
 596   logical_file_name = NULL;
 597   line_number = 0;
 598 }