gettext-tools/src/x-lisp.c

   1 /* xgettext Lisp backend.
   2    Copyright (C) 2001-2003, 2005-2009 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22
  23 /* Specification.  */
  24 #include "x-lisp.h"
  25
  26 #include <errno.h>
  27 #include <stdbool.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31
  32 #include "message.h"
  33 #include "xgettext.h"
  34 #include "error.h"
  35 #include "xalloc.h"
  36 #include "hash.h"
  37 #include "gettext.h"
  38
  39 #define _(s) gettext(s)
  40
  41
  42 /* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
  43    Since we are interested only in strings and in forms similar to
  44         (gettext msgid ...)
  45    or   (ngettext msgid msgid_plural ...)
  46    we make the following simplifications:
  47
  48    - Assume the keywords and strings are in an ASCII compatible encoding.
  49      This means we can read the input file one byte at a time, instead of
  50      one character at a time.  No need to worry about multibyte characters:
  51      If they occur as part of identifiers, they most probably act as
  52      constituent characters, and the byte based approach will do the same.
  53
  54    - Assume the read table is the standard Common Lisp read table.
  55      Non-standard read tables are mostly used to read data, not programs.
  56
  57    - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
  58
  59    - Don't interpret #n= and #n#, they usually don't appear in programs.
  60
  61    - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
  62
  63    The remaining syntax rules are:
  64
  65    - The syntax code assigned to each character, and how tokens are built
  66      up from characters (single escape, multiple escape etc.).
  67
  68    - Comment syntax: ';' and '#| ... |#'.
  69
  70    - String syntax: "..." with single escapes.
  71
  72    - Read macros and dispatch macro character '#'.  Needed to be able to
  73      tell which is the n-th argument of a function call.
  74
  75  */
  76
  77
  78 /* ========================= Lexer customization.  ========================= */
  79
  80 /* 'readtable_case' is the case conversion that is applied to non-escaped
  81     parts of symbol tokens.  In Common Lisp: (readtable-case *readtable*).  */
  82
  83 enum rtcase
  84 {
  85   case_upcase,
  86   case_downcase,
  87   case_preserve,
  88   case_invert
  89 };
  90
  91 static enum rtcase readtable_case = case_upcase;
  92
  93 /* 'read_base' is the assumed radix of integers and rational numbers.
  94    In Common Lisp: *read-base*.  */
  95 static int read_base = 10;
  96
  97 /* 'read_preserve_whitespace' specifies whether a whitespace character
  98    that terminates a token must be pushed back on the input stream.
  99    We set it to true, because the special newline side effect in read_object()
 100    requires that read_object() sees every newline not inside a token.  */
 101 static bool read_preserve_whitespace = true;
 102
 103
 104 /* ====================== Keyword set customization.  ====================== */
 105
 106 /* If true extract all strings.  */
 107 static bool extract_all = false;
 108
 109 static hash_table keywords;
 110 static bool default_keywords = true;
 111
 112
 113 void
 114 x_lisp_extract_all ()
 115 {
 116   extract_all = true;
 117 }
 118
 119
 120 void
 121 x_lisp_keyword (const char *name)
 122 {
 123   if (name == NULL)
 124     default_keywords = false;
 125   else
 126     {
 127       const char *end;
 128       struct callshape shape;
 129       const char *colon;
 130       size_t len;
 131       char *symname;
 132       size_t i;
 133
 134       if (keywords.table == NULL)
 135         hash_init (&keywords, 100);
 136
 137       split_keywordspec (name, &end, &shape);
 138
 139       /* The characters between name and end should form a valid Lisp symbol.
 140          Extract the symbol name part.  */
 141       colon = strchr (name, ':');
 142       if (colon != NULL && colon < end)
 143         {
 144           name = colon + 1;
 145           if (name < end && *name == ':')
 146             name++;
 147           colon = strchr (name, ':');
 148           if (colon != NULL && colon < end)
 149             return;
 150         }
 151
 152       /* Uppercase it.  */
 153       len = end - name;
 154       symname = XNMALLOC (len, char);
 155       for (i = 0; i < len; i++)
 156         symname[i] =
 157           (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
 158
 159       insert_keyword_callshape (&keywords, symname, len, &shape);
 160     }
 161 }
 162
 163 /* Finish initializing the keywords hash table.
 164    Called after argument processing, before each file is processed.  */
 165 static void
 166 init_keywords ()
 167 {
 168   if (default_keywords)
 169     {
 170       /* When adding new keywords here, also update the documentation in
 171          xgettext.texi!  */
 172       x_lisp_keyword ("gettext");       /* I18N:GETTEXT */
 173       x_lisp_keyword ("ngettext:1,2");  /* I18N:NGETTEXT */
 174       x_lisp_keyword ("gettext-noop");
 175       default_keywords = false;
 176     }
 177 }
 178
 179 void
 180 init_flag_table_lisp ()
 181 {
 182   xgettext_record_flag ("gettext:1:pass-lisp-format");
 183   xgettext_record_flag ("ngettext:1:pass-lisp-format");
 184   xgettext_record_flag ("ngettext:2:pass-lisp-format");
 185   xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
 186   xgettext_record_flag ("format:2:lisp-format");
 187 }
 188
 189
 190 /* ======================== Reading of characters.  ======================== */
 191
 192 /* Real filename, used in error messages about the input file.  */
 193 static const char *real_file_name;
 194
 195 /* Logical filename and line number, used to label the extracted messages.  */
 196 static char *logical_file_name;
 197 static int line_number;
 198
 199 /* The input file stream.  */
 200 static FILE *fp;
 201
 202
 203 /* Fetch the next character from the input file.  */
 204 static int
 205 do_getc ()
 206 {
 207   int c = getc (fp);
 208
 209   if (c == EOF)
 210     {
 211       if (ferror (fp))
 212         error (EXIT_FAILURE, errno, _("\
 213 error while reading \"%s\""), real_file_name);
 214     }
 215   else if (c == '\n')
 216    line_number++;
 217
 218   return c;
 219 }
 220
 221 /* Put back the last fetched character, not EOF.  */
 222 static void
 223 do_ungetc (int c)
 224 {
 225   if (c == '\n')
 226     line_number--;
 227   ungetc (c, fp);
 228 }
 229
 230
 231 /* ========= Reading of tokens.  See CLHS 2.2 "Reader Algorithm".  ========= */
 232
 233
 234 /* Syntax code.  See CLHS 2.1.4 "Character Syntax Types".  */
 235
 236 enum syntax_code
 237 {
 238   syntax_illegal,       /* non-printable, except whitespace     */
 239   syntax_single_esc,    /* '\' (single escape)                  */
 240   syntax_multi_esc,     /* '|' (multiple escape)                */
 241   syntax_constituent,   /* everything else (constituent)        */
 242   syntax_whitespace,    /* TAB,LF,FF,CR,' ' (whitespace)        */
 243   syntax_eof,           /* EOF                                  */
 244   syntax_t_macro,       /* '()'"' (terminating macro)           */
 245   syntax_nt_macro       /* '#' (non-terminating macro)          */
 246 };
 247
 248 /* Returns the syntax code of a character.  */
 249 static enum syntax_code
 250 syntax_code_of (unsigned char c)
 251 {
 252   switch (c)
 253     {
 254     case '\\':
 255       return syntax_single_esc;
 256     case '|':
 257       return syntax_multi_esc;
 258     case '\t': case '\n': case '\f': case '\r': case ' ':
 259       return syntax_whitespace;
 260     case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
 261       return syntax_t_macro;
 262     case '#':
 263       return syntax_nt_macro;
 264     default:
 265       if (c < ' ' && c != '\b')
 266         return syntax_illegal;
 267       else
 268         return syntax_constituent;
 269     }
 270 }
 271
 272 struct char_syntax
 273 {
 274   int ch;                       /* character */
 275   enum syntax_code scode;       /* syntax code */
 276 };
 277
 278 /* Returns the next character and its syntax code.  */
 279 static void
 280 read_char_syntax (struct char_syntax *p)
 281 {
 282   int c = do_getc ();
 283
 284   p->ch = c;
 285   p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
 286 }
 287
 288 /* Every character in a token has an attribute assigned.  The attributes
 289    help during interpretation of the token.  See
 290    CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
 291    and CLHS 2.1.4.2 "Constituent Traits".  */
 292
 293 enum attribute
 294 {
 295   a_illg,       /* invalid constituent */
 296   a_pack_m,     /* ':' package marker */
 297   a_alpha,      /* normal alphabetic */
 298   a_escaped,    /* alphabetic but not subject to case conversion */
 299   a_ratio,      /* '/' */
 300   a_dot,        /* '.' */
 301   a_sign,       /* '+-' */
 302   a_extens,     /* '_^' extension characters */
 303   a_digit,      /* '0123456789' */
 304   a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
 305   a_expodigit,  /* 'esfdlESFDL' below base */
 306   a_letter,     /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
 307   a_expo        /* 'esfdlESFDL' */
 308 };
 309
 310 #define is_letter_attribute(a) ((a) >= a_letter)
 311 #define is_number_attribute(a) ((a) >= a_ratio)
 312
 313 /* Returns the attribute of a character, assuming base 10.  */
 314 static enum attribute
 315 attribute_of (unsigned char c)
 316 {
 317   switch (c)
 318     {
 319     case ':':
 320       return a_pack_m;
 321     case '/':
 322       return a_ratio;
 323     case '.':
 324       return a_dot;
 325     case '+': case '-':
 326       return a_sign;
 327     case '_': case '^':
 328       return a_extens;
 329     case '0': case '1': case '2': case '3': case '4':
 330     case '5': case '6': case '7': case '8': case '9':
 331       return a_digit;
 332     case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
 333     case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 334     case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
 335     case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
 336     case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 337     case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
 338       return a_letter;
 339     case 'e': case 's': case 'd': case 'f': case 'l':
 340     case 'E': case 'S': case 'D': case 'F': case 'L':
 341       return a_expo;
 342     default:
 343       /* Treat everything as valid.  Never return a_illg.  */
 344       return a_alpha;
 345     }
 346 }
 347
 348 struct token_char
 349 {
 350   unsigned char ch;             /* character */
 351   unsigned char attribute;      /* attribute */
 352 };
 353
 354 /* A token consists of a sequence of characters with associated attribute.  */
 355 struct token
 356 {
 357   int allocated;                /* number of allocated 'token_char's */
 358   int charcount;                /* number of used 'token_char's */
 359   struct token_char *chars;     /* the token's constituents */
 360   bool with_escape;             /* whether single-escape or multiple escape occurs */
 361 };
 362
 363 /* Initialize a 'struct token'.  */
 364 static inline void
 365 init_token (struct token *tp)
 366 {
 367   tp->allocated = 10;
 368   tp->chars = XNMALLOC (tp->allocated, struct token_char);
 369   tp->charcount = 0;
 370 }
 371
 372 /* Free the memory pointed to by a 'struct token'.  */
 373 static inline void
 374 free_token (struct token *tp)
 375 {
 376   free (tp->chars);
 377 }
 378
 379 /* Ensure there is enough room in the token for one more character.  */
 380 static inline void
 381 grow_token (struct token *tp)
 382 {
 383   if (tp->charcount == tp->allocated)
 384     {
 385       tp->allocated *= 2;
 386       tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
 387     }
 388 }
 389
 390 /* Read the next token.  If 'first' is given, it points to the first
 391    character, which has already been read.
 392    The algorithm follows CLHS 2.2 "Reader Algorithm".  */
 393 static void
 394 read_token (struct token *tp, const struct char_syntax *first)
 395 {
 396   bool multiple_escape_flag;
 397   struct char_syntax curr;
 398
 399   init_token (tp);
 400   tp->with_escape = false;
 401
 402   multiple_escape_flag = false;
 403   if (first)
 404     curr = *first;
 405   else
 406     read_char_syntax (&curr);
 407
 408   for (;; read_char_syntax (&curr))
 409     {
 410       switch (curr.scode)
 411         {
 412         case syntax_illegal:
 413           /* Invalid input.  Be tolerant, no error message.  */
 414           do_ungetc (curr.ch);
 415           return;
 416
 417         case syntax_single_esc:
 418           tp->with_escape = true;
 419           read_char_syntax (&curr);
 420           if (curr.scode == syntax_eof)
 421             /* Invalid input.  Be tolerant, no error message.  */
 422             return;
 423           grow_token (tp);
 424           tp->chars[tp->charcount].ch = curr.ch;
 425           tp->chars[tp->charcount].attribute = a_escaped;
 426           tp->charcount++;
 427           break;
 428
 429         case syntax_multi_esc:
 430           multiple_escape_flag = !multiple_escape_flag;
 431           tp->with_escape = true;
 432           break;
 433
 434         case syntax_constituent:
 435         case syntax_nt_macro:
 436           grow_token (tp);
 437           if (multiple_escape_flag)
 438             {
 439               tp->chars[tp->charcount].ch = curr.ch;
 440               tp->chars[tp->charcount].attribute = a_escaped;
 441               tp->charcount++;
 442             }
 443           else
 444             {
 445               tp->chars[tp->charcount].ch = curr.ch;
 446               tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
 447               tp->charcount++;
 448             }
 449           break;
 450
 451         case syntax_whitespace:
 452         case syntax_t_macro:
 453           if (multiple_escape_flag)
 454             {
 455               grow_token (tp);
 456               tp->chars[tp->charcount].ch = curr.ch;
 457               tp->chars[tp->charcount].attribute = a_escaped;
 458               tp->charcount++;
 459             }
 460           else
 461             {
 462               if (curr.scode != syntax_whitespace || read_preserve_whitespace)
 463                 do_ungetc (curr.ch);
 464               return;
 465             }
 466           break;
 467
 468         case syntax_eof:
 469           if (multiple_escape_flag)
 470             /* Invalid input.  Be tolerant, no error message.  */
 471             ;
 472           return;
 473         }
 474     }
 475 }
 476
 477 /* A potential number is a token which
 478    1. consists only of digits, '+','-','/','^','_','.' and number markers.
 479       The base for digits is context dependent, but always 10 if a dot '.'
 480       occurs. A number marker is a non-digit letter which is not adjacent
 481       to a non-digit letter.
 482    2. has at least one digit.
 483    3. starts with a digit, '+','-','.','^' or '_'.
 484    4. does not end with '+' or '-'.
 485    See CLHS 2.3.1.1 "Potential Numbers as Tokens".
 486  */
 487
 488 static inline bool
 489 has_a_dot (const struct token *tp)
 490 {
 491   int n = tp->charcount;
 492   int i;
 493
 494   for (i = 0; i < n; i++)
 495     if (tp->chars[i].attribute == a_dot)
 496       return true;
 497   return false;
 498 }
 499
 500 static inline bool
 501 all_a_number (const struct token *tp)
 502 {
 503   int n = tp->charcount;
 504   int i;
 505
 506   for (i = 0; i < n; i++)
 507     if (!is_number_attribute (tp->chars[i].attribute))
 508       return false;
 509   return true;
 510 }
 511
 512 static inline void
 513 a_letter_to_digit (const struct token *tp, int base)
 514 {
 515   int n = tp->charcount;
 516   int i;
 517
 518   for (i = 0; i < n; i++)
 519     if (is_letter_attribute (tp->chars[i].attribute))
 520       {
 521         int c = tp->chars[i].ch;
 522
 523         if (c >= 'a')
 524           c -= 'a' - 'A';
 525         if (c - 'A' + 10 < base)
 526           tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
 527                                           a_expo -> a_expodigit */
 528       }
 529 }
 530
 531 static inline bool
 532 has_a_digit (const struct token *tp)
 533 {
 534   int n = tp->charcount;
 535   int i;
 536
 537   for (i = 0; i < n; i++)
 538     if (tp->chars[i].attribute == a_digit
 539         || tp->chars[i].attribute == a_letterdigit
 540         || tp->chars[i].attribute == a_expodigit)
 541       return true;
 542   return false;
 543 }
 544
 545 static inline bool
 546 has_adjacent_letters (const struct token *tp)
 547 {
 548   int n = tp->charcount;
 549   int i;
 550
 551   for (i = 1; i < n; i++)
 552     if (is_letter_attribute (tp->chars[i-1].attribute)
 553         && is_letter_attribute (tp->chars[i].attribute))
 554       return true;
 555   return false;
 556 }
 557
 558 static bool
 559 is_potential_number (const struct token *tp, int *basep)
 560 {
 561   /* CLHS 2.3.1.1.1:
 562      "A potential number cannot contain any escape characters."  */
 563   if (tp->with_escape)
 564     return false;
 565
 566   if (has_a_dot (tp))
 567     *basep = 10;
 568
 569   if (!all_a_number (tp))
 570     return false;
 571
 572   a_letter_to_digit (tp, *basep);
 573
 574   if (!has_a_digit (tp))
 575     return false;
 576
 577   if (has_adjacent_letters (tp))
 578     return false;
 579
 580   if (!(tp->chars[0].attribute >= a_dot
 581         && tp->chars[0].attribute <= a_expodigit))
 582     return false;
 583
 584   if (tp->chars[tp->charcount - 1].attribute == a_sign)
 585     return false;
 586
 587   return true;
 588 }
 589
 590 /* A number is one of integer, ratio, float.  Each has a particular syntax.
 591    See CLHS 2.3.1 "Numbers as Tokens".
 592    But note a mistake: The exponent rule should read:
 593        exponent ::= exponent-marker [sign] {decimal-digit}+
 594    (see 22.1.3.1.3 "Printing Floats").  */
 595
 596 enum number_type
 597 {
 598   n_none,
 599   n_integer,
 600   n_ratio,
 601   n_float
 602 };
 603
 604 static enum number_type
 605 is_number (const struct token *tp, int *basep)
 606 {
 607   struct token_char *ptr_limit;
 608   struct token_char *ptr1;
 609
 610   if (!is_potential_number (tp, basep))
 611     return n_none;
 612
 613   /* is_potential_number guarantees
 614      - all attributes are >= a_ratio,
 615      - there is at least one a_digit or a_letterdigit or a_expodigit, and
 616      - if there is an a_dot, then *basep = 10.  */
 617
 618   ptr1 = &tp->chars[0];
 619   ptr_limit = &tp->chars[tp->charcount];
 620
 621   if (ptr1->attribute == a_sign)
 622     ptr1++;
 623
 624   /* Test for syntax
 625    * { a_sign | }
 626    * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
 627    */
 628   {
 629     bool seen_a_ratio = false;
 630     bool seen_a_digit = false;  /* seen a digit in last digit block? */
 631     struct token_char *ptr;
 632
 633     for (ptr = ptr1;; ptr++)
 634       {
 635         if (ptr >= ptr_limit)
 636           {
 637             if (!seen_a_digit)
 638               break;
 639             if (seen_a_ratio)
 640               return n_ratio;
 641             else
 642               return n_integer;
 643           }
 644         if (ptr->attribute == a_digit
 645             || ptr->attribute == a_letterdigit
 646             || ptr->attribute == a_expodigit)
 647           {
 648             int c = ptr->ch;
 649
 650             c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
 651             if (c >= *basep)
 652               break;
 653             seen_a_digit = true;
 654           }
 655         else if (ptr->attribute == a_ratio)
 656           {
 657             if (seen_a_ratio || !seen_a_digit)
 658               break;
 659             seen_a_ratio = true;
 660             seen_a_digit = false;
 661           }
 662         else
 663           break;
 664       }
 665   }
 666
 667   /* Test for syntax
 668    * { a_sign | }
 669    * { a_digit }* { a_dot { a_digit }* | }
 670    * { a_expo { a_sign | } { a_digit }+ | }
 671    *
 672    * If there is an exponent part, there must be digits before the dot or
 673    * after the dot. The result is a float.
 674    * If there is no exponen:
 675    *   If there is no dot, it would an integer in base 10, but is has already
 676    *   been verified to not be an integer in the current base.
 677    *   If there is a dot:
 678    *     If there are digits after the dot, it's a float.
 679    *     Otherwise, if there are digits before the dot, it's an integer.
 680    */
 681   *basep = 10;
 682   {
 683     bool seen_a_dot = false;
 684     bool seen_a_dot_with_leading_digits = false;
 685     bool seen_a_digit = false;  /* seen a digit in last digit block? */
 686     struct token_char *ptr;
 687
 688     for (ptr = ptr1;; ptr++)
 689       {
 690         if (ptr >= ptr_limit)
 691           {
 692             /* no exponent */
 693             if (!seen_a_dot)
 694               return n_none;
 695             if (seen_a_digit)
 696               return n_float;
 697             if (seen_a_dot_with_leading_digits)
 698               return n_integer;
 699             else
 700               return n_none;
 701           }
 702         if (ptr->attribute == a_digit)
 703           {
 704             seen_a_digit = true;
 705           }
 706         else if (ptr->attribute == a_dot)
 707           {
 708             if (seen_a_dot)
 709               return n_none;
 710             seen_a_dot = true;
 711             if (seen_a_digit)
 712               seen_a_dot_with_leading_digits = true;
 713             seen_a_digit = false;
 714           }
 715         else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
 716           break;
 717         else
 718           return n_none;
 719       }
 720     ptr++;
 721     if (!seen_a_dot_with_leading_digits || !seen_a_digit)
 722       return n_none;
 723     if (ptr >= ptr_limit)
 724       return n_none;
 725     if (ptr->attribute == a_sign)
 726       ptr++;
 727     seen_a_digit = false;
 728     for (;; ptr++)
 729       {
 730         if (ptr >= ptr_limit)
 731           break;
 732         if (ptr->attribute != a_digit)
 733           return n_none;
 734         seen_a_digit = true;
 735       }
 736     if (!seen_a_digit)
 737       return n_none;
 738     return n_float;
 739   }
 740 }
 741
 742 /* A token representing a symbol must be case converted.
 743    For portability, we convert only ASCII characters here.  */
 744
 745 static void
 746 upcase_token (struct token *tp)
 747 {
 748   int n = tp->charcount;
 749   int i;
 750
 751   for (i = 0; i < n; i++)
 752     if (tp->chars[i].attribute != a_escaped)
 753       {
 754         unsigned char c = tp->chars[i].ch;
 755         if (c >= 'a' && c <= 'z')
 756           tp->chars[i].ch = c - 'a' + 'A';
 757       }
 758 }
 759
 760 static void
 761 downcase_token (struct token *tp)
 762 {
 763   int n = tp->charcount;
 764   int i;
 765
 766   for (i = 0; i < n; i++)
 767     if (tp->chars[i].attribute != a_escaped)
 768       {
 769         unsigned char c = tp->chars[i].ch;
 770         if (c >= 'A' && c <= 'Z')
 771           tp->chars[i].ch = c - 'A' + 'a';
 772       }
 773 }
 774
 775 static void
 776 case_convert_token (struct token *tp)
 777 {
 778   int n = tp->charcount;
 779   int i;
 780
 781   switch (readtable_case)
 782     {
 783     case case_upcase:
 784       upcase_token (tp);
 785       break;
 786
 787     case case_downcase:
 788       downcase_token (tp);
 789       break;
 790
 791     case case_preserve:
 792       break;
 793
 794     case case_invert:
 795       {
 796         bool seen_uppercase = false;
 797         bool seen_lowercase = false;
 798         for (i = 0; i < n; i++)
 799           if (tp->chars[i].attribute != a_escaped)
 800             {
 801               unsigned char c = tp->chars[i].ch;
 802               if (c >= 'a' && c <= 'z')
 803                 seen_lowercase = true;
 804               if (c >= 'A' && c <= 'Z')
 805                 seen_uppercase = true;
 806             }
 807         if (seen_uppercase)
 808           {
 809             if (!seen_lowercase)
 810               downcase_token (tp);
 811           }
 812         else
 813           {
 814             if (seen_lowercase)
 815               upcase_token (tp);
 816           }
 817       }
 818       break;
 819     }
 820 }
 821
 822
 823 /* ========================= Accumulating comments ========================= */
 824
 825
 826 static char *buffer;
 827 static size_t bufmax;
 828 static size_t buflen;
 829
 830 static inline void
 831 comment_start ()
 832 {
 833   buflen = 0;
 834 }
 835
 836 static inline void
 837 comment_add (int c)
 838 {
 839   if (buflen >= bufmax)
 840     {
 841       bufmax = 2 * bufmax + 10;
 842       buffer = xrealloc (buffer, bufmax);
 843     }
 844   buffer[buflen++] = c;
 845 }
 846
 847 static inline void
 848 comment_line_end (size_t chars_to_remove)
 849 {
 850   buflen -= chars_to_remove;
 851   while (buflen >= 1
 852          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 853     --buflen;
 854   if (chars_to_remove == 0 && buflen >= bufmax)
 855     {
 856       bufmax = 2 * bufmax + 10;
 857       buffer = xrealloc (buffer, bufmax);
 858     }
 859   buffer[buflen] = '\0';
 860   savable_comment_add (buffer);
 861 }
 862
 863
 864 /* These are for tracking whether comments count as immediately before
 865    keyword.  */
 866 static int last_comment_line;
 867 static int last_non_comment_line;
 868
 869
 870 /* ========================= Accumulating messages ========================= */
 871
 872
 873 static message_list_ty *mlp;
 874
 875
 876 /* ============== Reading of objects.  See CLHS 2 "Syntax".  ============== */
 877
 878
 879 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
 880    Other objects need not to be represented precisely.  */
 881 enum object_type
 882 {
 883   t_symbol,     /* symbol */
 884   t_string,     /* string */
 885   t_other,      /* other kind of real object */
 886   t_dot,        /* '.' pseudo object */
 887   t_close,      /* ')' pseudo object */
 888   t_eof         /* EOF marker */
 889 };
 890
 891 struct object
 892 {
 893   enum object_type type;
 894   struct token *token;          /* for t_symbol and t_string */
 895   int line_number_at_start;     /* for t_string */
 896 };
 897
 898 /* Free the memory pointed to by a 'struct object'.  */
 899 static inline void
 900 free_object (struct object *op)
 901 {
 902   if (op->type == t_symbol || op->type == t_string)
 903     {
 904       free_token (op->token);
 905       free (op->token);
 906     }
 907 }
 908
 909 /* Convert a t_symbol/t_string token to a char*.  */
 910 static char *
 911 string_of_object (const struct object *op)
 912 {
 913   char *str;
 914   const struct token_char *p;
 915   char *q;
 916   int n;
 917
 918   if (!(op->type == t_symbol || op->type == t_string))
 919     abort ();
 920   n = op->token->charcount;
 921   str = XNMALLOC (n + 1, char);
 922   q = str;
 923   for (p = op->token->chars; n > 0; p++, n--)
 924     *q++ = p->ch;
 925   *q = '\0';
 926   return str;
 927 }
 928
 929 /* Context lookup table.  */
 930 static flag_context_list_table_ty *flag_context_list_table;
 931
 932 /* Read the next object.  */
 933 static void
 934 read_object (struct object *op, flag_context_ty outer_context)
 935 {
 936   for (;;)
 937     {
 938       struct char_syntax curr;
 939
 940       read_char_syntax (&curr);
 941
 942       switch (curr.scode)
 943         {
 944         case syntax_eof:
 945           op->type = t_eof;
 946           return;
 947
 948         case syntax_whitespace:
 949           if (curr.ch == '\n')
 950             /* Comments assumed to be grouped with a message must immediately
 951                precede it, with no non-whitespace token on a line between
 952                both.  */
 953             if (last_non_comment_line > last_comment_line)
 954               savable_comment_reset ();
 955           continue;
 956
 957         case syntax_illegal:
 958           op->type = t_other;
 959           return;
 960
 961         case syntax_single_esc:
 962         case syntax_multi_esc:
 963         case syntax_constituent:
 964           /* Start reading a token.  */
 965           op->token = XMALLOC (struct token);
 966           read_token (op->token, &curr);
 967           last_non_comment_line = line_number;
 968
 969           /* Interpret the token.  */
 970
 971           /* Dots.  */
 972           if (!op->token->with_escape
 973               && op->token->charcount == 1
 974               && op->token->chars[0].attribute == a_dot)
 975             {
 976               free_token (op->token);
 977               free (op->token);
 978               op->type = t_dot;
 979               return;
 980             }
 981           /* Tokens consisting entirely of dots are illegal, but be tolerant
 982              here.  */
 983
 984           /* Number.  */
 985           {
 986             int base = read_base;
 987
 988             if (is_number (op->token, &base) != n_none)
 989               {
 990                 free_token (op->token);
 991                 free (op->token);
 992                 op->type = t_other;
 993                 return;
 994               }
 995           }
 996
 997           /* We interpret all other tokens as symbols (including 'reserved
 998              tokens', i.e. potential numbers which are not numbers).  */
 999           case_convert_token (op->token);
1000           op->type = t_symbol;
1001           return;
1002
1003         case syntax_t_macro:
1004         case syntax_nt_macro:
1005           /* Read a macro.  */
1006           switch (curr.ch)
1007             {
1008             case '(':
1009               {
1010                 int arg = 0;            /* Current argument number.  */
1011                 flag_context_list_iterator_ty context_iter;
1012                 const struct callshapes *shapes = NULL;
1013                 struct arglist_parser *argparser = NULL;
1014
1015                 for (;; arg++)
1016                   {
1017                     struct object inner;
1018                     flag_context_ty inner_context;
1019
1020                     if (arg == 0)
1021                       inner_context = null_context;
1022                     else
1023                       inner_context =
1024                         inherited_context (outer_context,
1025                                            flag_context_list_iterator_advance (
1026                                              &context_iter));
1027
1028                     read_object (&inner, inner_context);
1029
1030                     /* Recognize end of list.  */
1031                     if (inner.type == t_close)
1032                       {
1033                         op->type = t_other;
1034                         /* Don't bother converting "()" to "NIL".  */
1035                         last_non_comment_line = line_number;
1036                         if (argparser != NULL)
1037                           arglist_parser_done (argparser, arg);
1038                         return;
1039                       }
1040
1041                     /* Dots are not allowed in every position.
1042                        But be tolerant.  */
1043
1044                     /* EOF inside list is illegal.
1045                        But be tolerant.  */
1046                     if (inner.type == t_eof)
1047                       break;
1048
1049                     if (arg == 0)
1050                       {
1051                         /* This is the function position.  */
1052                         if (inner.type == t_symbol)
1053                           {
1054                             char *symbol_name = string_of_object (&inner);
1055                             int i;
1056                             int prefix_len;
1057                             void *keyword_value;
1058
1059                             /* Omit any package name.  */
1060                             i = inner.token->charcount;
1061                             while (i > 0
1062                                    && inner.token->chars[i-1].attribute != a_pack_m)
1063                               i--;
1064                             prefix_len = i;
1065
1066                             if (hash_find_entry (&keywords,
1067                                                  symbol_name + prefix_len,
1068                                                  strlen (symbol_name + prefix_len),
1069                                                  &keyword_value)
1070                                 == 0)
1071                               shapes = (const struct callshapes *) keyword_value;
1072
1073                             argparser = arglist_parser_alloc (mlp, shapes);
1074
1075                             context_iter =
1076                               flag_context_list_iterator (
1077                                 flag_context_list_table_lookup (
1078                                   flag_context_list_table,
1079                                   symbol_name, strlen (symbol_name)));
1080
1081                             free (symbol_name);
1082                           }
1083                         else
1084                           context_iter = null_context_list_iterator;
1085                       }
1086                     else
1087                       {
1088                         /* These are the argument positions.  */
1089                         if (argparser != NULL && inner.type == t_string)
1090                           arglist_parser_remember (argparser, arg,
1091                                                    string_of_object (&inner),
1092                                                    inner_context,
1093                                                    logical_file_name,
1094                                                    inner.line_number_at_start,
1095                                                    savable_comment);
1096                       }
1097
1098                     free_object (&inner);
1099                   }
1100
1101                 if (argparser != NULL)
1102                   arglist_parser_done (argparser, arg);
1103               }
1104               op->type = t_other;
1105               last_non_comment_line = line_number;
1106               return;
1107
1108             case ')':
1109               /* Tell the caller about the end of list.
1110                  Unmatched closing parenthesis is illegal.
1111                  But be tolerant.  */
1112               op->type = t_close;
1113               last_non_comment_line = line_number;
1114               return;
1115
1116             case ',':
1117               {
1118                 int c = do_getc ();
1119                 /* The ,@ handling inside lists is wrong anyway, because
1120                    ,@form expands to an unknown number of elements.  */
1121                 if (c != EOF && c != '@' && c != '.')
1122                   do_ungetc (c);
1123               }
1124               /*FALLTHROUGH*/
1125             case '\'':
1126             case '`':
1127               {
1128                 struct object inner;
1129
1130                 read_object (&inner, null_context);
1131
1132                 /* Dots and EOF are not allowed here.  But be tolerant.  */
1133
1134                 free_object (&inner);
1135
1136                 op->type = t_other;
1137                 last_non_comment_line = line_number;
1138                 return;
1139               }
1140
1141             case ';':
1142               {
1143                 bool all_semicolons = true;
1144
1145                 last_comment_line = line_number;
1146                 comment_start ();
1147                 for (;;)
1148                   {
1149                     int c = do_getc ();
1150                     if (c == EOF || c == '\n')
1151                       break;
1152                     if (c != ';')
1153                       all_semicolons = false;
1154                     if (!all_semicolons)
1155                       {
1156                         /* We skip all leading white space, but not EOLs.  */
1157                         if (!(buflen == 0 && (c == ' ' || c == '\t')))
1158                           comment_add (c);
1159                       }
1160                   }
1161                 comment_line_end (0);
1162                 continue;
1163               }
1164
1165             case '"':
1166               {
1167                 op->token = XMALLOC (struct token);
1168                 init_token (op->token);
1169                 op->line_number_at_start = line_number;
1170                 for (;;)
1171                   {
1172                     int c = do_getc ();
1173                     if (c == EOF)
1174                       /* Invalid input.  Be tolerant, no error message.  */
1175                       break;
1176                     if (c == '"')
1177                       break;
1178                     if (c == '\\') /* syntax_single_esc */
1179                       {
1180                         c = do_getc ();
1181                         if (c == EOF)
1182                           /* Invalid input.  Be tolerant, no error message.  */
1183                           break;
1184                       }
1185                     grow_token (op->token);
1186                     op->token->chars[op->token->charcount++].ch = c;
1187                   }
1188                 op->type = t_string;
1189
1190                 if (extract_all)
1191                   {
1192                     lex_pos_ty pos;
1193
1194                     pos.file_name = logical_file_name;
1195                     pos.line_number = op->line_number_at_start;
1196                     remember_a_message (mlp, NULL, string_of_object (op),
1197                                         null_context, &pos,
1198                                         NULL, savable_comment);
1199                   }
1200                 last_non_comment_line = line_number;
1201                 return;
1202               }
1203
1204             case '#':
1205               /* Dispatch macro handling.  */
1206               {
1207                 int c;
1208
1209                 for (;;)
1210                   {
1211                     c = do_getc ();
1212                     if (c == EOF)
1213                       /* Invalid input.  Be tolerant, no error message.  */
1214                       {
1215                         op->type = t_other;
1216                         return;
1217                       }
1218                     if (!(c >= '0' && c <= '9'))
1219                       break;
1220                   }
1221
1222                 switch (c)
1223                   {
1224                   case '(':
1225                   case '"':
1226                     do_ungetc (c);
1227                     /*FALLTHROUGH*/
1228                   case '\'':
1229                   case ':':
1230                   case '.':
1231                   case ',':
1232                   case 'A': case 'a':
1233                   case 'C': case 'c':
1234                   case 'P': case 'p':
1235                   case 'S': case 's':
1236                     {
1237                       struct object inner;
1238                       read_object (&inner, null_context);
1239                       /* Dots and EOF are not allowed here.
1240                          But be tolerant.  */
1241                       free_object (&inner);
1242                       op->type = t_other;
1243                       last_non_comment_line = line_number;
1244                       return;
1245                     }
1246
1247                   case '|':
1248                     {
1249                       int depth = 0;
1250                       int c;
1251
1252                       comment_start ();
1253                       c = do_getc ();
1254                       for (;;)
1255                         {
1256                           if (c == EOF)
1257                             break;
1258                           if (c == '|')
1259                             {
1260                               c = do_getc ();
1261                               if (c == EOF)
1262                                 break;
1263                               if (c == '#')
1264                                 {
1265                                   if (depth == 0)
1266                                     {
1267                                       comment_line_end (0);
1268                                       break;
1269                                     }
1270                                   depth--;
1271                                   comment_add ('|');
1272                                   comment_add ('#');
1273                                   c = do_getc ();
1274                                 }
1275                               else
1276                                 comment_add ('|');
1277                             }
1278                           else if (c == '#')
1279                             {
1280                               c = do_getc ();
1281                               if (c == EOF)
1282                                 break;
1283                               comment_add ('#');
1284                               if (c == '|')
1285                                 {
1286                                   depth++;
1287                                   comment_add ('|');
1288                                   c = do_getc ();
1289                                 }
1290                             }
1291                           else
1292                             {
1293                               /* We skip all leading white space.  */
1294                               if (!(buflen == 0 && (c == ' ' || c == '\t')))
1295                                 comment_add (c);
1296                               if (c == '\n')
1297                                 {
1298                                   comment_line_end (1);
1299                                   comment_start ();
1300                                 }
1301                               c = do_getc ();
1302                             }
1303                         }
1304                       if (c == EOF)
1305                         {
1306                           /* EOF not allowed here.  But be tolerant.  */
1307                           op->type = t_eof;
1308                           return;
1309                         }
1310                       last_comment_line = line_number;
1311                       continue;
1312                     }
1313
1314                   case '\\':
1315                     {
1316                       struct token token;
1317                       struct char_syntax first;
1318                       first.ch = '\\';
1319                       first.scode = syntax_single_esc;
1320                       read_token (&token, &first);
1321                       free_token (&token);
1322                       op->type = t_other;
1323                       last_non_comment_line = line_number;
1324                       return;
1325                     }
1326
1327                   case 'B': case 'b':
1328                   case 'O': case 'o':
1329                   case 'X': case 'x':
1330                   case 'R': case 'r':
1331                   case '*':
1332                     {
1333                       struct token token;
1334                       read_token (&token, NULL);
1335                       free_token (&token);
1336                       op->type = t_other;
1337                       last_non_comment_line = line_number;
1338                       return;
1339                     }
1340
1341                   case '=':
1342                     /* Ignore read labels.  */
1343                     continue;
1344
1345                   case '#':
1346                     /* Don't bother looking up the corresponding object.  */
1347                     op->type = t_other;
1348                     last_non_comment_line = line_number;
1349                     return;
1350
1351                   case '+':
1352                   case '-':
1353                     /* Simply assume every feature expression is true.  */
1354                     {
1355                       struct object inner;
1356                       read_object (&inner, null_context);
1357                       /* Dots and EOF are not allowed here.
1358                          But be tolerant.  */
1359                       free_object (&inner);
1360                       continue;
1361                     }
1362
1363                   default:
1364                     op->type = t_other;
1365                     last_non_comment_line = line_number;
1366                     return;
1367                   }
1368                 /*NOTREACHED*/
1369                 abort ();
1370               }
1371
1372             default:
1373               /*NOTREACHED*/
1374               abort ();
1375             }
1376
1377         default:
1378           /*NOTREACHED*/
1379           abort ();
1380         }
1381     }
1382 }
1383
1384
1385 void
1386 extract_lisp (FILE *f,
1387               const char *real_filename, const char *logical_filename,
1388               flag_context_list_table_ty *flag_table,
1389               msgdomain_list_ty *mdlp)
1390 {
1391   mlp = mdlp->item[0]->messages;
1392
1393   fp = f;
1394   real_file_name = real_filename;
1395   logical_file_name = xstrdup (logical_filename);
1396   line_number = 1;
1397
1398   last_comment_line = -1;
1399   last_non_comment_line = -1;
1400
1401   flag_context_list_table = flag_table;
1402
1403   init_keywords ();
1404
1405   /* Eat tokens until eof is seen.  When read_object returns
1406      due to an unbalanced closing parenthesis, just restart it.  */
1407   do
1408     {
1409       struct object toplevel_object;
1410
1411       read_object (&toplevel_object, null_context);
1412
1413       if (toplevel_object.type == t_eof)
1414         break;
1415
1416       free_object (&toplevel_object);
1417     }
1418   while (!feof (fp));
1419
1420   /* Close scanner.  */
1421   fp = NULL;
1422   real_file_name = NULL;
1423   logical_file_name = NULL;
1424   line_number = 0;
1425 }