gettext-tools/src/x-lisp.c

   1 /* xgettext Lisp backend.
   2    Copyright (C) 2001-2003, 2005-2009, 2015 Free Software Foundation,
   3    Inc.
   4
   5    This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
   6
   7    This program is free software: you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 /* Specification.  */
  25 #include "x-lisp.h"
  26
  27 #include <errno.h>
  28 #include <stdbool.h>
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32
  33 #include "message.h"
  34 #include "xgettext.h"
  35 #include "error.h"
  36 #include "xalloc.h"
  37 #include "hash.h"
  38 #include "gettext.h"
  39
  40 #define _(s) gettext(s)
  41
  42
  43 /* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
  44    Since we are interested only in strings and in forms similar to
  45         (gettext msgid ...)
  46    or   (ngettext msgid msgid_plural ...)
  47    we make the following simplifications:
  48
  49    - Assume the keywords and strings are in an ASCII compatible encoding.
  50      This means we can read the input file one byte at a time, instead of
  51      one character at a time.  No need to worry about multibyte characters:
  52      If they occur as part of identifiers, they most probably act as
  53      constituent characters, and the byte based approach will do the same.
  54
  55    - Assume the read table is the standard Common Lisp read table.
  56      Non-standard read tables are mostly used to read data, not programs.
  57
  58    - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
  59
  60    - Don't interpret #n= and #n#, they usually don't appear in programs.
  61
  62    - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
  63
  64    The remaining syntax rules are:
  65
  66    - The syntax code assigned to each character, and how tokens are built
  67      up from characters (single escape, multiple escape etc.).
  68
  69    - Comment syntax: ';' and '#| ... |#'.
  70
  71    - String syntax: "..." with single escapes.
  72
  73    - Read macros and dispatch macro character '#'.  Needed to be able to
  74      tell which is the n-th argument of a function call.
  75
  76  */
  77
  78
  79 /* ========================= Lexer customization.  ========================= */
  80
  81 /* 'readtable_case' is the case conversion that is applied to non-escaped
  82     parts of symbol tokens.  In Common Lisp: (readtable-case *readtable*).  */
  83
  84 enum rtcase
  85 {
  86   case_upcase,
  87   case_downcase,
  88   case_preserve,
  89   case_invert
  90 };
  91
  92 static enum rtcase readtable_case = case_upcase;
  93
  94 /* 'read_base' is the assumed radix of integers and rational numbers.
  95    In Common Lisp: *read-base*.  */
  96 static int read_base = 10;
  97
  98 /* 'read_preserve_whitespace' specifies whether a whitespace character
  99    that terminates a token must be pushed back on the input stream.
 100    We set it to true, because the special newline side effect in read_object()
 101    requires that read_object() sees every newline not inside a token.  */
 102 static bool read_preserve_whitespace = true;
 103
 104
 105 /* ====================== Keyword set customization.  ====================== */
 106
 107 /* If true extract all strings.  */
 108 static bool extract_all = false;
 109
 110 static hash_table keywords;
 111 static bool default_keywords = true;
 112
 113
 114 void
 115 x_lisp_extract_all ()
 116 {
 117   extract_all = true;
 118 }
 119
 120
 121 void
 122 x_lisp_keyword (const char *name)
 123 {
 124   if (name == NULL)
 125     default_keywords = false;
 126   else
 127     {
 128       const char *end;
 129       struct callshape shape;
 130       const char *colon;
 131       size_t len;
 132       char *symname;
 133       size_t i;
 134
 135       if (keywords.table == NULL)
 136         hash_init (&keywords, 100);
 137
 138       split_keywordspec (name, &end, &shape);
 139
 140       /* The characters between name and end should form a valid Lisp symbol.
 141          Extract the symbol name part.  */
 142       colon = strchr (name, ':');
 143       if (colon != NULL && colon < end)
 144         {
 145           name = colon + 1;
 146           if (name < end && *name == ':')
 147             name++;
 148           colon = strchr (name, ':');
 149           if (colon != NULL && colon < end)
 150             return;
 151         }
 152
 153       /* Uppercase it.  */
 154       len = end - name;
 155       symname = XNMALLOC (len, char);
 156       for (i = 0; i < len; i++)
 157         symname[i] =
 158           (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
 159
 160       insert_keyword_callshape (&keywords, symname, len, &shape);
 161     }
 162 }
 163
 164 /* Finish initializing the keywords hash table.
 165    Called after argument processing, before each file is processed.  */
 166 static void
 167 init_keywords ()
 168 {
 169   if (default_keywords)
 170     {
 171       /* When adding new keywords here, also update the documentation in
 172          xgettext.texi!  */
 173       x_lisp_keyword ("gettext");       /* I18N:GETTEXT */
 174       x_lisp_keyword ("ngettext:1,2");  /* I18N:NGETTEXT */
 175       x_lisp_keyword ("gettext-noop");
 176       default_keywords = false;
 177     }
 178 }
 179
 180 void
 181 init_flag_table_lisp ()
 182 {
 183   xgettext_record_flag ("gettext:1:pass-lisp-format");
 184   xgettext_record_flag ("ngettext:1:pass-lisp-format");
 185   xgettext_record_flag ("ngettext:2:pass-lisp-format");
 186   xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
 187   xgettext_record_flag ("format:2:lisp-format");
 188 }
 189
 190
 191 /* ======================== Reading of characters.  ======================== */
 192
 193 /* Real filename, used in error messages about the input file.  */
 194 static const char *real_file_name;
 195
 196 /* Logical filename and line number, used to label the extracted messages.  */
 197 static char *logical_file_name;
 198 static int line_number;
 199
 200 /* The input file stream.  */
 201 static FILE *fp;
 202
 203
 204 /* Fetch the next character from the input file.  */
 205 static int
 206 do_getc ()
 207 {
 208   int c = getc (fp);
 209
 210   if (c == EOF)
 211     {
 212       if (ferror (fp))
 213         error (EXIT_FAILURE, errno, _("\
 214 error while reading \"%s\""), real_file_name);
 215     }
 216   else if (c == '\n')
 217    line_number++;
 218
 219   return c;
 220 }
 221
 222 /* Put back the last fetched character, not EOF.  */
 223 static void
 224 do_ungetc (int c)
 225 {
 226   if (c == '\n')
 227     line_number--;
 228   ungetc (c, fp);
 229 }
 230
 231
 232 /* ========= Reading of tokens.  See CLHS 2.2 "Reader Algorithm".  ========= */
 233
 234
 235 /* Syntax code.  See CLHS 2.1.4 "Character Syntax Types".  */
 236
 237 enum syntax_code
 238 {
 239   syntax_illegal,       /* non-printable, except whitespace     */
 240   syntax_single_esc,    /* '\' (single escape)                  */
 241   syntax_multi_esc,     /* '|' (multiple escape)                */
 242   syntax_constituent,   /* everything else (constituent)        */
 243   syntax_whitespace,    /* TAB,LF,FF,CR,' ' (whitespace)        */
 244   syntax_eof,           /* EOF                                  */
 245   syntax_t_macro,       /* '()'"' (terminating macro)           */
 246   syntax_nt_macro       /* '#' (non-terminating macro)          */
 247 };
 248
 249 /* Returns the syntax code of a character.  */
 250 static enum syntax_code
 251 syntax_code_of (unsigned char c)
 252 {
 253   switch (c)
 254     {
 255     case '\\':
 256       return syntax_single_esc;
 257     case '|':
 258       return syntax_multi_esc;
 259     case '\t': case '\n': case '\f': case '\r': case ' ':
 260       return syntax_whitespace;
 261     case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
 262       return syntax_t_macro;
 263     case '#':
 264       return syntax_nt_macro;
 265     default:
 266       if (c < ' ' && c != '\b')
 267         return syntax_illegal;
 268       else
 269         return syntax_constituent;
 270     }
 271 }
 272
 273 struct char_syntax
 274 {
 275   int ch;                       /* character */
 276   enum syntax_code scode;       /* syntax code */
 277 };
 278
 279 /* Returns the next character and its syntax code.  */
 280 static void
 281 read_char_syntax (struct char_syntax *p)
 282 {
 283   int c = do_getc ();
 284
 285   p->ch = c;
 286   p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
 287 }
 288
 289 /* Every character in a token has an attribute assigned.  The attributes
 290    help during interpretation of the token.  See
 291    CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
 292    and CLHS 2.1.4.2 "Constituent Traits".  */
 293
 294 enum attribute
 295 {
 296   a_illg,       /* invalid constituent */
 297   a_pack_m,     /* ':' package marker */
 298   a_alpha,      /* normal alphabetic */
 299   a_escaped,    /* alphabetic but not subject to case conversion */
 300   a_ratio,      /* '/' */
 301   a_dot,        /* '.' */
 302   a_sign,       /* '+-' */
 303   a_extens,     /* '_^' extension characters */
 304   a_digit,      /* '0123456789' */
 305   a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
 306   a_expodigit,  /* 'esfdlESFDL' below base */
 307   a_letter,     /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
 308   a_expo        /* 'esfdlESFDL' */
 309 };
 310
 311 #define is_letter_attribute(a) ((a) >= a_letter)
 312 #define is_number_attribute(a) ((a) >= a_ratio)
 313
 314 /* Returns the attribute of a character, assuming base 10.  */
 315 static enum attribute
 316 attribute_of (unsigned char c)
 317 {
 318   switch (c)
 319     {
 320     case ':':
 321       return a_pack_m;
 322     case '/':
 323       return a_ratio;
 324     case '.':
 325       return a_dot;
 326     case '+': case '-':
 327       return a_sign;
 328     case '_': case '^':
 329       return a_extens;
 330     case '0': case '1': case '2': case '3': case '4':
 331     case '5': case '6': case '7': case '8': case '9':
 332       return a_digit;
 333     case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
 334     case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 335     case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
 336     case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
 337     case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 338     case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
 339       return a_letter;
 340     case 'e': case 's': case 'd': case 'f': case 'l':
 341     case 'E': case 'S': case 'D': case 'F': case 'L':
 342       return a_expo;
 343     default:
 344       /* Treat everything as valid.  Never return a_illg.  */
 345       return a_alpha;
 346     }
 347 }
 348
 349 struct token_char
 350 {
 351   unsigned char ch;             /* character */
 352   unsigned char attribute;      /* attribute */
 353 };
 354
 355 /* A token consists of a sequence of characters with associated attribute.  */
 356 struct token
 357 {
 358   int allocated;                /* number of allocated 'token_char's */
 359   int charcount;                /* number of used 'token_char's */
 360   struct token_char *chars;     /* the token's constituents */
 361   bool with_escape;             /* whether single-escape or multiple escape occurs */
 362 };
 363
 364 /* Initialize a 'struct token'.  */
 365 static inline void
 366 init_token (struct token *tp)
 367 {
 368   tp->allocated = 10;
 369   tp->chars = XNMALLOC (tp->allocated, struct token_char);
 370   tp->charcount = 0;
 371 }
 372
 373 /* Free the memory pointed to by a 'struct token'.  */
 374 static inline void
 375 free_token (struct token *tp)
 376 {
 377   free (tp->chars);
 378 }
 379
 380 /* Ensure there is enough room in the token for one more character.  */
 381 static inline void
 382 grow_token (struct token *tp)
 383 {
 384   if (tp->charcount == tp->allocated)
 385     {
 386       tp->allocated *= 2;
 387       tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
 388     }
 389 }
 390
 391 /* Read the next token.  If 'first' is given, it points to the first
 392    character, which has already been read.
 393    The algorithm follows CLHS 2.2 "Reader Algorithm".  */
 394 static void
 395 read_token (struct token *tp, const struct char_syntax *first)
 396 {
 397   bool multiple_escape_flag;
 398   struct char_syntax curr;
 399
 400   init_token (tp);
 401   tp->with_escape = false;
 402
 403   multiple_escape_flag = false;
 404   if (first)
 405     curr = *first;
 406   else
 407     read_char_syntax (&curr);
 408
 409   for (;; read_char_syntax (&curr))
 410     {
 411       switch (curr.scode)
 412         {
 413         case syntax_illegal:
 414           /* Invalid input.  Be tolerant, no error message.  */
 415           do_ungetc (curr.ch);
 416           return;
 417
 418         case syntax_single_esc:
 419           tp->with_escape = true;
 420           read_char_syntax (&curr);
 421           if (curr.scode == syntax_eof)
 422             /* Invalid input.  Be tolerant, no error message.  */
 423             return;
 424           grow_token (tp);
 425           tp->chars[tp->charcount].ch = curr.ch;
 426           tp->chars[tp->charcount].attribute = a_escaped;
 427           tp->charcount++;
 428           break;
 429
 430         case syntax_multi_esc:
 431           multiple_escape_flag = !multiple_escape_flag;
 432           tp->with_escape = true;
 433           break;
 434
 435         case syntax_constituent:
 436         case syntax_nt_macro:
 437           grow_token (tp);
 438           if (multiple_escape_flag)
 439             {
 440               tp->chars[tp->charcount].ch = curr.ch;
 441               tp->chars[tp->charcount].attribute = a_escaped;
 442               tp->charcount++;
 443             }
 444           else
 445             {
 446               tp->chars[tp->charcount].ch = curr.ch;
 447               tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
 448               tp->charcount++;
 449             }
 450           break;
 451
 452         case syntax_whitespace:
 453         case syntax_t_macro:
 454           if (multiple_escape_flag)
 455             {
 456               grow_token (tp);
 457               tp->chars[tp->charcount].ch = curr.ch;
 458               tp->chars[tp->charcount].attribute = a_escaped;
 459               tp->charcount++;
 460             }
 461           else
 462             {
 463               if (curr.scode != syntax_whitespace || read_preserve_whitespace)
 464                 do_ungetc (curr.ch);
 465               return;
 466             }
 467           break;
 468
 469         case syntax_eof:
 470           if (multiple_escape_flag)
 471             /* Invalid input.  Be tolerant, no error message.  */
 472             ;
 473           return;
 474         }
 475     }
 476 }
 477
 478 /* A potential number is a token which
 479    1. consists only of digits, '+','-','/','^','_','.' and number markers.
 480       The base for digits is context dependent, but always 10 if a dot '.'
 481       occurs. A number marker is a non-digit letter which is not adjacent
 482       to a non-digit letter.
 483    2. has at least one digit.
 484    3. starts with a digit, '+','-','.','^' or '_'.
 485    4. does not end with '+' or '-'.
 486    See CLHS 2.3.1.1 "Potential Numbers as Tokens".
 487  */
 488
 489 static inline bool
 490 has_a_dot (const struct token *tp)
 491 {
 492   int n = tp->charcount;
 493   int i;
 494
 495   for (i = 0; i < n; i++)
 496     if (tp->chars[i].attribute == a_dot)
 497       return true;
 498   return false;
 499 }
 500
 501 static inline bool
 502 all_a_number (const struct token *tp)
 503 {
 504   int n = tp->charcount;
 505   int i;
 506
 507   for (i = 0; i < n; i++)
 508     if (!is_number_attribute (tp->chars[i].attribute))
 509       return false;
 510   return true;
 511 }
 512
 513 static inline void
 514 a_letter_to_digit (const struct token *tp, int base)
 515 {
 516   int n = tp->charcount;
 517   int i;
 518
 519   for (i = 0; i < n; i++)
 520     if (is_letter_attribute (tp->chars[i].attribute))
 521       {
 522         int c = tp->chars[i].ch;
 523
 524         if (c >= 'a')
 525           c -= 'a' - 'A';
 526         if (c - 'A' + 10 < base)
 527           tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
 528                                           a_expo -> a_expodigit */
 529       }
 530 }
 531
 532 static inline bool
 533 has_a_digit (const struct token *tp)
 534 {
 535   int n = tp->charcount;
 536   int i;
 537
 538   for (i = 0; i < n; i++)
 539     if (tp->chars[i].attribute == a_digit
 540         || tp->chars[i].attribute == a_letterdigit
 541         || tp->chars[i].attribute == a_expodigit)
 542       return true;
 543   return false;
 544 }
 545
 546 static inline bool
 547 has_adjacent_letters (const struct token *tp)
 548 {
 549   int n = tp->charcount;
 550   int i;
 551
 552   for (i = 1; i < n; i++)
 553     if (is_letter_attribute (tp->chars[i-1].attribute)
 554         && is_letter_attribute (tp->chars[i].attribute))
 555       return true;
 556   return false;
 557 }
 558
 559 static bool
 560 is_potential_number (const struct token *tp, int *basep)
 561 {
 562   /* CLHS 2.3.1.1.1:
 563      "A potential number cannot contain any escape characters."  */
 564   if (tp->with_escape)
 565     return false;
 566
 567   if (has_a_dot (tp))
 568     *basep = 10;
 569
 570   if (!all_a_number (tp))
 571     return false;
 572
 573   a_letter_to_digit (tp, *basep);
 574
 575   if (!has_a_digit (tp))
 576     return false;
 577
 578   if (has_adjacent_letters (tp))
 579     return false;
 580
 581   if (!(tp->chars[0].attribute >= a_dot
 582         && tp->chars[0].attribute <= a_expodigit))
 583     return false;
 584
 585   if (tp->chars[tp->charcount - 1].attribute == a_sign)
 586     return false;
 587
 588   return true;
 589 }
 590
 591 /* A number is one of integer, ratio, float.  Each has a particular syntax.
 592    See CLHS 2.3.1 "Numbers as Tokens".
 593    But note a mistake: The exponent rule should read:
 594        exponent ::= exponent-marker [sign] {decimal-digit}+
 595    (see 22.1.3.1.3 "Printing Floats").  */
 596
 597 enum number_type
 598 {
 599   n_none,
 600   n_integer,
 601   n_ratio,
 602   n_float
 603 };
 604
 605 static enum number_type
 606 is_number (const struct token *tp, int *basep)
 607 {
 608   struct token_char *ptr_limit;
 609   struct token_char *ptr1;
 610
 611   if (!is_potential_number (tp, basep))
 612     return n_none;
 613
 614   /* is_potential_number guarantees
 615      - all attributes are >= a_ratio,
 616      - there is at least one a_digit or a_letterdigit or a_expodigit, and
 617      - if there is an a_dot, then *basep = 10.  */
 618
 619   ptr1 = &tp->chars[0];
 620   ptr_limit = &tp->chars[tp->charcount];
 621
 622   if (ptr1->attribute == a_sign)
 623     ptr1++;
 624
 625   /* Test for syntax
 626    * { a_sign | }
 627    * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
 628    */
 629   {
 630     bool seen_a_ratio = false;
 631     bool seen_a_digit = false;  /* seen a digit in last digit block? */
 632     struct token_char *ptr;
 633
 634     for (ptr = ptr1;; ptr++)
 635       {
 636         if (ptr >= ptr_limit)
 637           {
 638             if (!seen_a_digit)
 639               break;
 640             if (seen_a_ratio)
 641               return n_ratio;
 642             else
 643               return n_integer;
 644           }
 645         if (ptr->attribute == a_digit
 646             || ptr->attribute == a_letterdigit
 647             || ptr->attribute == a_expodigit)
 648           {
 649             int c = ptr->ch;
 650
 651             c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
 652             if (c >= *basep)
 653               break;
 654             seen_a_digit = true;
 655           }
 656         else if (ptr->attribute == a_ratio)
 657           {
 658             if (seen_a_ratio || !seen_a_digit)
 659               break;
 660             seen_a_ratio = true;
 661             seen_a_digit = false;
 662           }
 663         else
 664           break;
 665       }
 666   }
 667
 668   /* Test for syntax
 669    * { a_sign | }
 670    * { a_digit }* { a_dot { a_digit }* | }
 671    * { a_expo { a_sign | } { a_digit }+ | }
 672    *
 673    * If there is an exponent part, there must be digits before the dot or
 674    * after the dot. The result is a float.
 675    * If there is no exponen:
 676    *   If there is no dot, it would an integer in base 10, but is has already
 677    *   been verified to not be an integer in the current base.
 678    *   If there is a dot:
 679    *     If there are digits after the dot, it's a float.
 680    *     Otherwise, if there are digits before the dot, it's an integer.
 681    */
 682   *basep = 10;
 683   {
 684     bool seen_a_dot = false;
 685     bool seen_a_dot_with_leading_digits = false;
 686     bool seen_a_digit = false;  /* seen a digit in last digit block? */
 687     struct token_char *ptr;
 688
 689     for (ptr = ptr1;; ptr++)
 690       {
 691         if (ptr >= ptr_limit)
 692           {
 693             /* no exponent */
 694             if (!seen_a_dot)
 695               return n_none;
 696             if (seen_a_digit)
 697               return n_float;
 698             if (seen_a_dot_with_leading_digits)
 699               return n_integer;
 700             else
 701               return n_none;
 702           }
 703         if (ptr->attribute == a_digit)
 704           {
 705             seen_a_digit = true;
 706           }
 707         else if (ptr->attribute == a_dot)
 708           {
 709             if (seen_a_dot)
 710               return n_none;
 711             seen_a_dot = true;
 712             if (seen_a_digit)
 713               seen_a_dot_with_leading_digits = true;
 714             seen_a_digit = false;
 715           }
 716         else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
 717           break;
 718         else
 719           return n_none;
 720       }
 721     ptr++;
 722     if (!seen_a_dot_with_leading_digits || !seen_a_digit)
 723       return n_none;
 724     if (ptr >= ptr_limit)
 725       return n_none;
 726     if (ptr->attribute == a_sign)
 727       ptr++;
 728     seen_a_digit = false;
 729     for (;; ptr++)
 730       {
 731         if (ptr >= ptr_limit)
 732           break;
 733         if (ptr->attribute != a_digit)
 734           return n_none;
 735         seen_a_digit = true;
 736       }
 737     if (!seen_a_digit)
 738       return n_none;
 739     return n_float;
 740   }
 741 }
 742
 743 /* A token representing a symbol must be case converted.
 744    For portability, we convert only ASCII characters here.  */
 745
 746 static void
 747 upcase_token (struct token *tp)
 748 {
 749   int n = tp->charcount;
 750   int i;
 751
 752   for (i = 0; i < n; i++)
 753     if (tp->chars[i].attribute != a_escaped)
 754       {
 755         unsigned char c = tp->chars[i].ch;
 756         if (c >= 'a' && c <= 'z')
 757           tp->chars[i].ch = c - 'a' + 'A';
 758       }
 759 }
 760
 761 static void
 762 downcase_token (struct token *tp)
 763 {
 764   int n = tp->charcount;
 765   int i;
 766
 767   for (i = 0; i < n; i++)
 768     if (tp->chars[i].attribute != a_escaped)
 769       {
 770         unsigned char c = tp->chars[i].ch;
 771         if (c >= 'A' && c <= 'Z')
 772           tp->chars[i].ch = c - 'A' + 'a';
 773       }
 774 }
 775
 776 static void
 777 case_convert_token (struct token *tp)
 778 {
 779   int n = tp->charcount;
 780   int i;
 781
 782   switch (readtable_case)
 783     {
 784     case case_upcase:
 785       upcase_token (tp);
 786       break;
 787
 788     case case_downcase:
 789       downcase_token (tp);
 790       break;
 791
 792     case case_preserve:
 793       break;
 794
 795     case case_invert:
 796       {
 797         bool seen_uppercase = false;
 798         bool seen_lowercase = false;
 799         for (i = 0; i < n; i++)
 800           if (tp->chars[i].attribute != a_escaped)
 801             {
 802               unsigned char c = tp->chars[i].ch;
 803               if (c >= 'a' && c <= 'z')
 804                 seen_lowercase = true;
 805               if (c >= 'A' && c <= 'Z')
 806                 seen_uppercase = true;
 807             }
 808         if (seen_uppercase)
 809           {
 810             if (!seen_lowercase)
 811               downcase_token (tp);
 812           }
 813         else
 814           {
 815             if (seen_lowercase)
 816               upcase_token (tp);
 817           }
 818       }
 819       break;
 820     }
 821 }
 822
 823
 824 /* ========================= Accumulating comments ========================= */
 825
 826
 827 static char *buffer;
 828 static size_t bufmax;
 829 static size_t buflen;
 830
 831 static inline void
 832 comment_start ()
 833 {
 834   buflen = 0;
 835 }
 836
 837 static inline void
 838 comment_add (int c)
 839 {
 840   if (buflen >= bufmax)
 841     {
 842       bufmax = 2 * bufmax + 10;
 843       buffer = xrealloc (buffer, bufmax);
 844     }
 845   buffer[buflen++] = c;
 846 }
 847
 848 static inline void
 849 comment_line_end (size_t chars_to_remove)
 850 {
 851   buflen -= chars_to_remove;
 852   while (buflen >= 1
 853          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 854     --buflen;
 855   if (chars_to_remove == 0 && buflen >= bufmax)
 856     {
 857       bufmax = 2 * bufmax + 10;
 858       buffer = xrealloc (buffer, bufmax);
 859     }
 860   buffer[buflen] = '\0';
 861   savable_comment_add (buffer);
 862 }
 863
 864
 865 /* These are for tracking whether comments count as immediately before
 866    keyword.  */
 867 static int last_comment_line;
 868 static int last_non_comment_line;
 869
 870
 871 /* ========================= Accumulating messages ========================= */
 872
 873
 874 static message_list_ty *mlp;
 875
 876
 877 /* ============== Reading of objects.  See CLHS 2 "Syntax".  ============== */
 878
 879
 880 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
 881    Other objects need not to be represented precisely.  */
 882 enum object_type
 883 {
 884   t_symbol,     /* symbol */
 885   t_string,     /* string */
 886   t_other,      /* other kind of real object */
 887   t_dot,        /* '.' pseudo object */
 888   t_close,      /* ')' pseudo object */
 889   t_eof         /* EOF marker */
 890 };
 891
 892 struct object
 893 {
 894   enum object_type type;
 895   struct token *token;          /* for t_symbol and t_string */
 896   int line_number_at_start;     /* for t_string */
 897 };
 898
 899 /* Free the memory pointed to by a 'struct object'.  */
 900 static inline void
 901 free_object (struct object *op)
 902 {
 903   if (op->type == t_symbol || op->type == t_string)
 904     {
 905       free_token (op->token);
 906       free (op->token);
 907     }
 908 }
 909
 910 /* Convert a t_symbol/t_string token to a char*.  */
 911 static char *
 912 string_of_object (const struct object *op)
 913 {
 914   char *str;
 915   const struct token_char *p;
 916   char *q;
 917   int n;
 918
 919   if (!(op->type == t_symbol || op->type == t_string))
 920     abort ();
 921   n = op->token->charcount;
 922   str = XNMALLOC (n + 1, char);
 923   q = str;
 924   for (p = op->token->chars; n > 0; p++, n--)
 925     *q++ = p->ch;
 926   *q = '\0';
 927   return str;
 928 }
 929
 930 /* Context lookup table.  */
 931 static flag_context_list_table_ty *flag_context_list_table;
 932
 933 /* Read the next object.  */
 934 static void
 935 read_object (struct object *op, flag_context_ty outer_context)
 936 {
 937   for (;;)
 938     {
 939       struct char_syntax curr;
 940
 941       read_char_syntax (&curr);
 942
 943       switch (curr.scode)
 944         {
 945         case syntax_eof:
 946           op->type = t_eof;
 947           return;
 948
 949         case syntax_whitespace:
 950           if (curr.ch == '\n')
 951             /* Comments assumed to be grouped with a message must immediately
 952                precede it, with no non-whitespace token on a line between
 953                both.  */
 954             if (last_non_comment_line > last_comment_line)
 955               savable_comment_reset ();
 956           continue;
 957
 958         case syntax_illegal:
 959           op->type = t_other;
 960           return;
 961
 962         case syntax_single_esc:
 963         case syntax_multi_esc:
 964         case syntax_constituent:
 965           /* Start reading a token.  */
 966           op->token = XMALLOC (struct token);
 967           read_token (op->token, &curr);
 968           last_non_comment_line = line_number;
 969
 970           /* Interpret the token.  */
 971
 972           /* Dots.  */
 973           if (!op->token->with_escape
 974               && op->token->charcount == 1
 975               && op->token->chars[0].attribute == a_dot)
 976             {
 977               free_token (op->token);
 978               free (op->token);
 979               op->type = t_dot;
 980               return;
 981             }
 982           /* Tokens consisting entirely of dots are illegal, but be tolerant
 983              here.  */
 984
 985           /* Number.  */
 986           {
 987             int base = read_base;
 988
 989             if (is_number (op->token, &base) != n_none)
 990               {
 991                 free_token (op->token);
 992                 free (op->token);
 993                 op->type = t_other;
 994                 return;
 995               }
 996           }
 997
 998           /* We interpret all other tokens as symbols (including 'reserved
 999              tokens', i.e. potential numbers which are not numbers).  */
1000           case_convert_token (op->token);
1001           op->type = t_symbol;
1002           return;
1003
1004         case syntax_t_macro:
1005         case syntax_nt_macro:
1006           /* Read a macro.  */
1007           switch (curr.ch)
1008             {
1009             case '(':
1010               {
1011                 int arg = 0;            /* Current argument number.  */
1012                 flag_context_list_iterator_ty context_iter;
1013                 const struct callshapes *shapes = NULL;
1014                 struct arglist_parser *argparser = NULL;
1015
1016                 for (;; arg++)
1017                   {
1018                     struct object inner;
1019                     flag_context_ty inner_context;
1020
1021                     if (arg == 0)
1022                       inner_context = null_context;
1023                     else
1024                       inner_context =
1025                         inherited_context (outer_context,
1026                                            flag_context_list_iterator_advance (
1027                                              &context_iter));
1028
1029                     read_object (&inner, inner_context);
1030
1031                     /* Recognize end of list.  */
1032                     if (inner.type == t_close)
1033                       {
1034                         op->type = t_other;
1035                         /* Don't bother converting "()" to "NIL".  */
1036                         last_non_comment_line = line_number;
1037                         if (argparser != NULL)
1038                           arglist_parser_done (argparser, arg);
1039                         return;
1040                       }
1041
1042                     /* Dots are not allowed in every position.
1043                        But be tolerant.  */
1044
1045                     /* EOF inside list is illegal.
1046                        But be tolerant.  */
1047                     if (inner.type == t_eof)
1048                       break;
1049
1050                     if (arg == 0)
1051                       {
1052                         /* This is the function position.  */
1053                         if (inner.type == t_symbol)
1054                           {
1055                             char *symbol_name = string_of_object (&inner);
1056                             int i;
1057                             int prefix_len;
1058                             void *keyword_value;
1059
1060                             /* Omit any package name.  */
1061                             i = inner.token->charcount;
1062                             while (i > 0
1063                                    && inner.token->chars[i-1].attribute != a_pack_m)
1064                               i--;
1065                             prefix_len = i;
1066
1067                             if (hash_find_entry (&keywords,
1068                                                  symbol_name + prefix_len,
1069                                                  strlen (symbol_name + prefix_len),
1070                                                  &keyword_value)
1071                                 == 0)
1072                               shapes = (const struct callshapes *) keyword_value;
1073
1074                             argparser = arglist_parser_alloc (mlp, shapes);
1075
1076                             context_iter =
1077                               flag_context_list_iterator (
1078                                 flag_context_list_table_lookup (
1079                                   flag_context_list_table,
1080                                   symbol_name, strlen (symbol_name)));
1081
1082                             free (symbol_name);
1083                           }
1084                         else
1085                           context_iter = null_context_list_iterator;
1086                       }
1087                     else
1088                       {
1089                         /* These are the argument positions.  */
1090                         if (argparser != NULL && inner.type == t_string)
1091                           arglist_parser_remember (argparser, arg,
1092                                                    string_of_object (&inner),
1093                                                    inner_context,
1094                                                    logical_file_name,
1095                                                    inner.line_number_at_start,
1096                                                    savable_comment);
1097                       }
1098
1099                     free_object (&inner);
1100                   }
1101
1102                 if (argparser != NULL)
1103                   arglist_parser_done (argparser, arg);
1104               }
1105               op->type = t_other;
1106               last_non_comment_line = line_number;
1107               return;
1108
1109             case ')':
1110               /* Tell the caller about the end of list.
1111                  Unmatched closing parenthesis is illegal.
1112                  But be tolerant.  */
1113               op->type = t_close;
1114               last_non_comment_line = line_number;
1115               return;
1116
1117             case ',':
1118               {
1119                 int c = do_getc ();
1120                 /* The ,@ handling inside lists is wrong anyway, because
1121                    ,@form expands to an unknown number of elements.  */
1122                 if (c != EOF && c != '@' && c != '.')
1123                   do_ungetc (c);
1124               }
1125               /*FALLTHROUGH*/
1126             case '\'':
1127             case '`':
1128               {
1129                 struct object inner;
1130
1131                 read_object (&inner, null_context);
1132
1133                 /* Dots and EOF are not allowed here.  But be tolerant.  */
1134
1135                 free_object (&inner);
1136
1137                 op->type = t_other;
1138                 last_non_comment_line = line_number;
1139                 return;
1140               }
1141
1142             case ';':
1143               {
1144                 bool all_semicolons = true;
1145
1146                 last_comment_line = line_number;
1147                 comment_start ();
1148                 for (;;)
1149                   {
1150                     int c = do_getc ();
1151                     if (c == EOF || c == '\n')
1152                       break;
1153                     if (c != ';')
1154                       all_semicolons = false;
1155                     if (!all_semicolons)
1156                       {
1157                         /* We skip all leading white space, but not EOLs.  */
1158                         if (!(buflen == 0 && (c == ' ' || c == '\t')))
1159                           comment_add (c);
1160                       }
1161                   }
1162                 comment_line_end (0);
1163                 continue;
1164               }
1165
1166             case '"':
1167               {
1168                 op->token = XMALLOC (struct token);
1169                 init_token (op->token);
1170                 op->line_number_at_start = line_number;
1171                 for (;;)
1172                   {
1173                     int c = do_getc ();
1174                     if (c == EOF)
1175                       /* Invalid input.  Be tolerant, no error message.  */
1176                       break;
1177                     if (c == '"')
1178                       break;
1179                     if (c == '\\') /* syntax_single_esc */
1180                       {
1181                         c = do_getc ();
1182                         if (c == EOF)
1183                           /* Invalid input.  Be tolerant, no error message.  */
1184                           break;
1185                       }
1186                     grow_token (op->token);
1187                     op->token->chars[op->token->charcount++].ch = c;
1188                   }
1189                 op->type = t_string;
1190
1191                 if (extract_all)
1192                   {
1193                     lex_pos_ty pos;
1194
1195                     pos.file_name = logical_file_name;
1196                     pos.line_number = op->line_number_at_start;
1197                     remember_a_message (mlp, NULL, string_of_object (op),
1198                                         null_context, &pos,
1199                                         NULL, savable_comment);
1200                   }
1201                 last_non_comment_line = line_number;
1202                 return;
1203               }
1204
1205             case '#':
1206               /* Dispatch macro handling.  */
1207               {
1208                 int c;
1209
1210                 for (;;)
1211                   {
1212                     c = do_getc ();
1213                     if (c == EOF)
1214                       /* Invalid input.  Be tolerant, no error message.  */
1215                       {
1216                         op->type = t_other;
1217                         return;
1218                       }
1219                     if (!(c >= '0' && c <= '9'))
1220                       break;
1221                   }
1222
1223                 switch (c)
1224                   {
1225                   case '(':
1226                   case '"':
1227                     do_ungetc (c);
1228                     /*FALLTHROUGH*/
1229                   case '\'':
1230                   case ':':
1231                   case '.':
1232                   case ',':
1233                   case 'A': case 'a':
1234                   case 'C': case 'c':
1235                   case 'P': case 'p':
1236                   case 'S': case 's':
1237                     {
1238                       struct object inner;
1239                       read_object (&inner, null_context);
1240                       /* Dots and EOF are not allowed here.
1241                          But be tolerant.  */
1242                       free_object (&inner);
1243                       op->type = t_other;
1244                       last_non_comment_line = line_number;
1245                       return;
1246                     }
1247
1248                   case '|':
1249                     {
1250                       int depth = 0;
1251                       int c;
1252
1253                       comment_start ();
1254                       c = do_getc ();
1255                       for (;;)
1256                         {
1257                           if (c == EOF)
1258                             break;
1259                           if (c == '|')
1260                             {
1261                               c = do_getc ();
1262                               if (c == EOF)
1263                                 break;
1264                               if (c == '#')
1265                                 {
1266                                   if (depth == 0)
1267                                     {
1268                                       comment_line_end (0);
1269                                       break;
1270                                     }
1271                                   depth--;
1272                                   comment_add ('|');
1273                                   comment_add ('#');
1274                                   c = do_getc ();
1275                                 }
1276                               else
1277                                 comment_add ('|');
1278                             }
1279                           else if (c == '#')
1280                             {
1281                               c = do_getc ();
1282                               if (c == EOF)
1283                                 break;
1284                               comment_add ('#');
1285                               if (c == '|')
1286                                 {
1287                                   depth++;
1288                                   comment_add ('|');
1289                                   c = do_getc ();
1290                                 }
1291                             }
1292                           else
1293                             {
1294                               /* We skip all leading white space.  */
1295                               if (!(buflen == 0 && (c == ' ' || c == '\t')))
1296                                 comment_add (c);
1297                               if (c == '\n')
1298                                 {
1299                                   comment_line_end (1);
1300                                   comment_start ();
1301                                 }
1302                               c = do_getc ();
1303                             }
1304                         }
1305                       if (c == EOF)
1306                         {
1307                           /* EOF not allowed here.  But be tolerant.  */
1308                           op->type = t_eof;
1309                           return;
1310                         }
1311                       last_comment_line = line_number;
1312                       continue;
1313                     }
1314
1315                   case '\\':
1316                     {
1317                       struct token token;
1318                       struct char_syntax first;
1319                       first.ch = '\\';
1320                       first.scode = syntax_single_esc;
1321                       read_token (&token, &first);
1322                       free_token (&token);
1323                       op->type = t_other;
1324                       last_non_comment_line = line_number;
1325                       return;
1326                     }
1327
1328                   case 'B': case 'b':
1329                   case 'O': case 'o':
1330                   case 'X': case 'x':
1331                   case 'R': case 'r':
1332                   case '*':
1333                     {
1334                       struct token token;
1335                       read_token (&token, NULL);
1336                       free_token (&token);
1337                       op->type = t_other;
1338                       last_non_comment_line = line_number;
1339                       return;
1340                     }
1341
1342                   case '=':
1343                     /* Ignore read labels.  */
1344                     continue;
1345
1346                   case '#':
1347                     /* Don't bother looking up the corresponding object.  */
1348                     op->type = t_other;
1349                     last_non_comment_line = line_number;
1350                     return;
1351
1352                   case '+':
1353                   case '-':
1354                     /* Simply assume every feature expression is true.  */
1355                     {
1356                       struct object inner;
1357                       read_object (&inner, null_context);
1358                       /* Dots and EOF are not allowed here.
1359                          But be tolerant.  */
1360                       free_object (&inner);
1361                       continue;
1362                     }
1363
1364                   default:
1365                     op->type = t_other;
1366                     last_non_comment_line = line_number;
1367                     return;
1368                   }
1369                 /*NOTREACHED*/
1370                 abort ();
1371               }
1372
1373             default:
1374               /*NOTREACHED*/
1375               abort ();
1376             }
1377
1378         default:
1379           /*NOTREACHED*/
1380           abort ();
1381         }
1382     }
1383 }
1384
1385
1386 void
1387 extract_lisp (FILE *f,
1388               const char *real_filename, const char *logical_filename,
1389               flag_context_list_table_ty *flag_table,
1390               msgdomain_list_ty *mdlp)
1391 {
1392   mlp = mdlp->item[0]->messages;
1393
1394   fp = f;
1395   real_file_name = real_filename;
1396   logical_file_name = xstrdup (logical_filename);
1397   line_number = 1;
1398
1399   last_comment_line = -1;
1400   last_non_comment_line = -1;
1401
1402   flag_context_list_table = flag_table;
1403
1404   init_keywords ();
1405
1406   /* Eat tokens until eof is seen.  When read_object returns
1407      due to an unbalanced closing parenthesis, just restart it.  */
1408   do
1409     {
1410       struct object toplevel_object;
1411
1412       read_object (&toplevel_object, null_context);
1413
1414       if (toplevel_object.type == t_eof)
1415         break;
1416
1417       free_object (&toplevel_object);
1418     }
1419   while (!feof (fp));
1420
1421   /* Close scanner.  */
1422   fp = NULL;
1423   real_file_name = NULL;
1424   logical_file_name = NULL;
1425   line_number = 0;
1426 }