gettext-tools/src/x-scheme.c

   1 /* xgettext Scheme backend.
   2    Copyright (C) 2004-2009 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <bruno@clisp.org>, 2004-2005.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22
  23 /* Specification.  */
  24 #include "x-scheme.h"
  25
  26 #include <errno.h>
  27 #include <stdbool.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31
  32 #include "message.h"
  33 #include "xgettext.h"
  34 #include "error.h"
  35 #include "xalloc.h"
  36 #include "hash.h"
  37 #include "gettext.h"
  38
  39 #define _(s) gettext(s)
  40
  41
  42 /* The Scheme syntax is described in R5RS.  It is implemented in
  43    guile-1.6.4/libguile/read.c.
  44    Since we are interested only in strings and in forms similar to
  45         (gettext msgid ...)
  46    or   (ngettext msgid msgid_plural ...)
  47    we make the following simplifications:
  48
  49    - Assume the keywords and strings are in an ASCII compatible encoding.
  50      This means we can read the input file one byte at a time, instead of
  51      one character at a time.  No need to worry about multibyte characters:
  52      If they occur as part of identifiers, they most probably act as
  53      constituent characters, and the byte based approach will do the same.
  54
  55    - Assume the read-hash-procedures is in the default state.
  56      Non-standard reader extensions are mostly used to read data, not programs.
  57
  58    The remaining syntax rules are:
  59
  60    - The syntax code assigned to each character, and how tokens are built
  61      up from characters (single escape, multiple escape etc.).
  62
  63    - Comment syntax: ';' and '#! ... \n!#\n'.
  64
  65    - String syntax: "..." with single escapes.
  66
  67    - Read macros and dispatch macro character '#'.  Needed to be able to
  68      tell which is the n-th argument of a function call.
  69
  70  */
  71
  72
  73 /* ====================== Keyword set customization.  ====================== */
  74
  75 /* If true extract all strings.  */
  76 static bool extract_all = false;
  77
  78 static hash_table keywords;
  79 static bool default_keywords = true;
  80
  81
  82 void
  83 x_scheme_extract_all ()
  84 {
  85   extract_all = true;
  86 }
  87
  88
  89 void
  90 x_scheme_keyword (const char *name)
  91 {
  92   if (name == NULL)
  93     default_keywords = false;
  94   else
  95     {
  96       const char *end;
  97       struct callshape shape;
  98       const char *colon;
  99
 100       if (keywords.table == NULL)
 101         hash_init (&keywords, 100);
 102
 103       split_keywordspec (name, &end, &shape);
 104
 105       /* The characters between name and end should form a valid Lisp symbol.
 106          Extract the symbol name part.  */
 107       colon = strchr (name, ':');
 108       if (colon != NULL && colon < end)
 109         {
 110           name = colon + 1;
 111           if (name < end && *name == ':')
 112             name++;
 113           colon = strchr (name, ':');
 114           if (colon != NULL && colon < end)
 115             return;
 116         }
 117
 118       insert_keyword_callshape (&keywords, name, end - name, &shape);
 119     }
 120 }
 121
 122 /* Finish initializing the keywords hash table.
 123    Called after argument processing, before each file is processed.  */
 124 static void
 125 init_keywords ()
 126 {
 127   if (default_keywords)
 128     {
 129       /* When adding new keywords here, also update the documentation in
 130          xgettext.texi!  */
 131       x_scheme_keyword ("gettext");             /* libguile/i18n.c */
 132       x_scheme_keyword ("ngettext:1,2");        /* libguile/i18n.c */
 133       x_scheme_keyword ("gettext-noop");
 134       default_keywords = false;
 135     }
 136 }
 137
 138 void
 139 init_flag_table_scheme ()
 140 {
 141   xgettext_record_flag ("gettext:1:pass-scheme-format");
 142   xgettext_record_flag ("ngettext:1:pass-scheme-format");
 143   xgettext_record_flag ("ngettext:2:pass-scheme-format");
 144   xgettext_record_flag ("gettext-noop:1:pass-scheme-format");
 145   xgettext_record_flag ("format:2:scheme-format");
 146 }
 147
 148
 149 /* ======================== Reading of characters.  ======================== */
 150
 151 /* Real filename, used in error messages about the input file.  */
 152 static const char *real_file_name;
 153
 154 /* Logical filename and line number, used to label the extracted messages.  */
 155 static char *logical_file_name;
 156 static int line_number;
 157
 158 /* The input file stream.  */
 159 static FILE *fp;
 160
 161
 162 /* Fetch the next character from the input file.  */
 163 static int
 164 do_getc ()
 165 {
 166   int c = getc (fp);
 167
 168   if (c == EOF)
 169     {
 170       if (ferror (fp))
 171         error (EXIT_FAILURE, errno, _("\
 172 error while reading \"%s\""), real_file_name);
 173     }
 174   else if (c == '\n')
 175    line_number++;
 176
 177   return c;
 178 }
 179
 180 /* Put back the last fetched character, not EOF.  */
 181 static void
 182 do_ungetc (int c)
 183 {
 184   if (c == '\n')
 185     line_number--;
 186   ungetc (c, fp);
 187 }
 188
 189
 190 /* ========================== Reading of tokens.  ========================== */
 191
 192
 193 /* A token consists of a sequence of characters.  */
 194 struct token
 195 {
 196   int allocated;                /* number of allocated 'token_char's */
 197   int charcount;                /* number of used 'token_char's */
 198   char *chars;                  /* the token's constituents */
 199 };
 200
 201 /* Initialize a 'struct token'.  */
 202 static inline void
 203 init_token (struct token *tp)
 204 {
 205   tp->allocated = 10;
 206   tp->chars = XNMALLOC (tp->allocated, char);
 207   tp->charcount = 0;
 208 }
 209
 210 /* Free the memory pointed to by a 'struct token'.  */
 211 static inline void
 212 free_token (struct token *tp)
 213 {
 214   free (tp->chars);
 215 }
 216
 217 /* Ensure there is enough room in the token for one more character.  */
 218 static inline void
 219 grow_token (struct token *tp)
 220 {
 221   if (tp->charcount == tp->allocated)
 222     {
 223       tp->allocated *= 2;
 224       tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
 225     }
 226 }
 227
 228 /* Read the next token.  'first' is the first character, which has already
 229    been read.  */
 230 static void
 231 read_token (struct token *tp, int first)
 232 {
 233   init_token (tp);
 234
 235   grow_token (tp);
 236   tp->chars[tp->charcount++] = first;
 237
 238   for (;;)
 239     {
 240       int c = do_getc ();
 241
 242       if (c == EOF)
 243         break;
 244       if (c == ' ' || c == '\r' || c == '\f' || c == '\t' || c == '\n'
 245           || c == '"' || c == '(' || c == ')' || c == ';')
 246         {
 247           do_ungetc (c);
 248           break;
 249         }
 250       grow_token (tp);
 251       tp->chars[tp->charcount++] = c;
 252     }
 253 }
 254
 255 /* Tests if a token represents an integer.
 256    Taken from guile-1.6.4/libguile/numbers.c:scm_istr2int().  */
 257 static inline bool
 258 is_integer_syntax (const char *str, int len, int radix)
 259 {
 260   const char *p = str;
 261   const char *p_end = str + len;
 262
 263   /* The accepted syntax is
 264        ['+'|'-'] DIGIT+
 265      where DIGIT is a hexadecimal digit whose value is below radix.  */
 266
 267   if (p == p_end)
 268     return false;
 269   if (*p == '+' || *p == '-')
 270     {
 271       p++;
 272       if (p == p_end)
 273         return false;
 274     }
 275   do
 276     {
 277       int c = *p++;
 278
 279       if (c >= '0' && c <= '9')
 280         c = c - '0';
 281       else if (c >= 'A' && c <= 'F')
 282         c = c - 'A' + 10;
 283       else if (c >= 'a' && c <= 'f')
 284         c = c - 'a' + 10;
 285       else
 286         return false;
 287       if (c >= radix)
 288         return false;
 289     }
 290   while (p < p_end);
 291   return true;
 292 }
 293
 294 /* Tests if a token represents a rational, floating-point or complex number.
 295    If unconstrained is false, only real numbers are accepted; otherwise,
 296    complex numbers are accepted as well.
 297    Taken from guile-1.6.4/libguile/numbers.c:scm_istr2flo().  */
 298 static inline bool
 299 is_other_number_syntax (const char *str, int len, int radix, bool unconstrained)
 300 {
 301   const char *p = str;
 302   const char *p_end = str + len;
 303   bool seen_sign;
 304   bool seen_digits;
 305
 306   /* The accepted syntaxes are:
 307      for a floating-point number:
 308        ['+'|'-'] DIGIT+ [EXPONENT]
 309        ['+'|'-'] DIGIT* '.' DIGIT+ [EXPONENT]
 310        where EXPONENT ::= ['d'|'e'|'f'|'l'|'s'] DIGIT+
 311        (Dot and exponent are allowed only if radix is 10.)
 312      for a rational number:
 313        ['+'|'-'] DIGIT+ '/' DIGIT+
 314      for a complex number:
 315        REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
 316        REAL-NUMBER {'+'|'-'} 'i'
 317        {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
 318        {'+'|'-'} 'i'
 319        REAL-NUMBER '@' REAL-NUMBER
 320    */
 321   if (p == p_end)
 322     return false;
 323   /* Parse leading sign.  */
 324   seen_sign = false;
 325   if (*p == '+' || *p == '-')
 326     {
 327       p++;
 328       if (p == p_end)
 329         return false;
 330       seen_sign = true;
 331       /* Recognize complex number syntax: {'+'|'-'} 'i'  */
 332       if (unconstrained && (*p == 'I' || *p == 'i') && p + 1 == p_end)
 333         return true;
 334     }
 335   /* Parse digits before dot or exponent or slash.  */
 336   seen_digits = false;
 337   do
 338     {
 339       int c = *p;
 340
 341       if (c >= '0' && c <= '9')
 342         c = c - '0';
 343       else if (c >= 'A' && c <= 'F')
 344         {
 345           if (c >= 'D' && radix == 10) /* exponent? */
 346             break;
 347           c = c - 'A' + 10;
 348         }
 349       else if (c >= 'a' && c <= 'f')
 350         {
 351           if (c >= 'd' && radix == 10) /* exponent? */
 352             break;
 353           c = c - 'a' + 10;
 354         }
 355       else
 356         break;
 357       if (c >= radix)
 358         return false;
 359       seen_digits = true;
 360       p++;
 361     }
 362   while (p < p_end);
 363   /* If p == p_end, we know that seen_digits = true, and the number is an
 364      integer without exponent.  */
 365   if (p < p_end)
 366     {
 367       /* If we have no digits so far, we need a decimal point later.  */
 368       if (!seen_digits && !(*p == '.' && radix == 10))
 369         return false;
 370       /* Trailing '#' signs are equivalent to zeroes.  */
 371       while (p < p_end && *p == '#')
 372         p++;
 373       if (p < p_end)
 374         {
 375           if (*p == '/')
 376             {
 377               /* Parse digits after the slash.  */
 378               bool all_zeroes = true;
 379               p++;
 380               for (; p < p_end; p++)
 381                 {
 382                   int c = *p;
 383
 384                   if (c >= '0' && c <= '9')
 385                     c = c - '0';
 386                   else if (c >= 'A' && c <= 'F')
 387                     c = c - 'A' + 10;
 388                   else if (c >= 'a' && c <= 'f')
 389                     c = c - 'a' + 10;
 390                   else
 391                     break;
 392                   if (c >= radix)
 393                     return false;
 394                   if (c != 0)
 395                     all_zeroes = false;
 396                 }
 397               /* A zero denominator is not allowed.  */
 398               if (all_zeroes)
 399                 return false;
 400               /* Trailing '#' signs are equivalent to zeroes.  */
 401               while (p < p_end && *p == '#')
 402                 p++;
 403             }
 404           else
 405             {
 406               if (*p == '.')
 407                 {
 408                   /* Decimal point notation.  */
 409                   if (radix != 10)
 410                     return false;
 411                   /* Parse digits after the decimal point.  */
 412                   p++;
 413                   for (; p < p_end; p++)
 414                     {
 415                       int c = *p;
 416
 417                       if (c >= '0' && c <= '9')
 418                         seen_digits = true;
 419                       else
 420                         break;
 421                     }
 422                   /* Digits are required before or after the decimal point.  */
 423                   if (!seen_digits)
 424                     return false;
 425                   /* Trailing '#' signs are equivalent to zeroes.  */
 426                   while (p < p_end && *p == '#')
 427                     p++;
 428                 }
 429               if (p < p_end)
 430                 {
 431                   /* Parse exponent.  */
 432                   switch (*p)
 433                     {
 434                     case 'D': case 'd':
 435                     case 'E': case 'e':
 436                     case 'F': case 'f':
 437                     case 'L': case 'l':
 438                     case 'S': case 's':
 439                       if (radix != 10)
 440                         return false;
 441                       p++;
 442                       if (p == p_end)
 443                         return false;
 444                       if (*p == '+' || *p == '-')
 445                         {
 446                           p++;
 447                           if (p == p_end)
 448                             return false;
 449                         }
 450                       if (!(*p >= '0' && *p <= '9'))
 451                         return false;
 452                       for (;;)
 453                         {
 454                           p++;
 455                           if (p == p_end)
 456                             break;
 457                           if (!(*p >= '0' && *p <= '9'))
 458                             break;
 459                         }
 460                       break;
 461                     default:
 462                       break;
 463                     }
 464                 }
 465             }
 466         }
 467     }
 468   if (p == p_end)
 469     return true;
 470   /* Recognize complex number syntax.  */
 471   if (unconstrained)
 472     {
 473       /* Recognize the syntax  {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'  */
 474       if (seen_sign && (*p == 'I' || *p == 'i') && p + 1 == p_end)
 475         return true;
 476       /* Recognize the syntaxes
 477            REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
 478            REAL-NUMBER {'+'|'-'} 'i'
 479        */
 480       if (*p == '+' || *p == '-')
 481         return (p_end[-1] == 'I' || p_end[-1] == 'i')
 482                 && (p + 1 == p_end - 1
 483                     || is_other_number_syntax (p, p_end - 1 - p, radix, false));
 484       /* Recognize the syntax  REAL-NUMBER '@' REAL-NUMBER  */
 485       if (*p == '@')
 486         {
 487           p++;
 488           return is_other_number_syntax (p, p_end - p, radix, false);
 489         }
 490     }
 491   return false;
 492 }
 493
 494 /* Tests if a token represents a number.
 495    Taken from guile-1.6.4/libguile/numbers.c:scm_istring2number().  */
 496 static bool
 497 is_number (const struct token *tp)
 498 {
 499   const char *str = tp->chars;
 500   int len = tp->charcount;
 501   int radix = 10;
 502   enum { unknown, exact, inexact } exactness = unknown;
 503   bool seen_radix_prefix = false;
 504   bool seen_exactness_prefix = false;
 505
 506   if (len == 1)
 507     if (*str == '+' || *str == '-')
 508       return false;
 509   while (len >= 2 && *str == '#')
 510     {
 511       switch (str[1])
 512         {
 513         case 'B': case 'b':
 514           if (seen_radix_prefix)
 515             return false;
 516           radix = 2;
 517           seen_radix_prefix = true;
 518           break;
 519         case 'O': case 'o':
 520           if (seen_radix_prefix)
 521             return false;
 522           radix = 8;
 523           seen_radix_prefix = true;
 524           break;
 525         case 'D': case 'd':
 526           if (seen_radix_prefix)
 527             return false;
 528           radix = 10;
 529           seen_radix_prefix = true;
 530           break;
 531         case 'X': case 'x':
 532           if (seen_radix_prefix)
 533             return false;
 534           radix = 16;
 535           seen_radix_prefix = true;
 536           break;
 537         case 'E': case 'e':
 538           if (seen_exactness_prefix)
 539             return false;
 540           exactness = exact;
 541           seen_exactness_prefix = true;
 542           break;
 543         case 'I': case 'i':
 544           if (seen_exactness_prefix)
 545             return false;
 546           exactness = inexact;
 547           seen_exactness_prefix = true;
 548           break;
 549         default:
 550           return false;
 551         }
 552       str += 2;
 553       len -= 2;
 554     }
 555   if (exactness != inexact)
 556     {
 557       /* Try to parse an integer.  */
 558       if (is_integer_syntax (str, len, 10))
 559         return true;
 560       /* FIXME: Other Scheme implementations support exact rational numbers
 561          or exact complex numbers.  */
 562     }
 563   if (exactness != exact)
 564     {
 565       /* Try to parse a rational, floating-point or complex number.  */
 566       if (is_other_number_syntax (str, len, 10, true))
 567         return true;
 568     }
 569   return false;
 570 }
 571
 572
 573 /* ========================= Accumulating comments ========================= */
 574
 575
 576 static char *buffer;
 577 static size_t bufmax;
 578 static size_t buflen;
 579
 580 static inline void
 581 comment_start ()
 582 {
 583   buflen = 0;
 584 }
 585
 586 static inline void
 587 comment_add (int c)
 588 {
 589   if (buflen >= bufmax)
 590     {
 591       bufmax = 2 * bufmax + 10;
 592       buffer = xrealloc (buffer, bufmax);
 593     }
 594   buffer[buflen++] = c;
 595 }
 596
 597 static inline void
 598 comment_line_end (size_t chars_to_remove)
 599 {
 600   buflen -= chars_to_remove;
 601   while (buflen >= 1
 602          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 603     --buflen;
 604   if (chars_to_remove == 0 && buflen >= bufmax)
 605     {
 606       bufmax = 2 * bufmax + 10;
 607       buffer = xrealloc (buffer, bufmax);
 608     }
 609   buffer[buflen] = '\0';
 610   savable_comment_add (buffer);
 611 }
 612
 613
 614 /* These are for tracking whether comments count as immediately before
 615    keyword.  */
 616 static int last_comment_line;
 617 static int last_non_comment_line;
 618
 619
 620 /* ========================= Accumulating messages ========================= */
 621
 622
 623 static message_list_ty *mlp;
 624
 625
 626 /* ========================== Reading of objects.  ========================= */
 627
 628
 629 /* We are only interested in symbols (e.g. gettext or ngettext) and strings.
 630    Other objects need not to be represented precisely.  */
 631 enum object_type
 632 {
 633   t_symbol,     /* symbol */
 634   t_string,     /* string */
 635   t_other,      /* other kind of real object */
 636   t_dot,        /* '.' pseudo object */
 637   t_close,      /* ')' pseudo object */
 638   t_eof         /* EOF marker */
 639 };
 640
 641 struct object
 642 {
 643   enum object_type type;
 644   struct token *token;          /* for t_symbol and t_string */
 645   int line_number_at_start;     /* for t_string */
 646 };
 647
 648 /* Free the memory pointed to by a 'struct object'.  */
 649 static inline void
 650 free_object (struct object *op)
 651 {
 652   if (op->type == t_symbol || op->type == t_string)
 653     {
 654       free_token (op->token);
 655       free (op->token);
 656     }
 657 }
 658
 659 /* Convert a t_symbol/t_string token to a char*.  */
 660 static char *
 661 string_of_object (const struct object *op)
 662 {
 663   char *str;
 664   int n;
 665
 666   if (!(op->type == t_symbol || op->type == t_string))
 667     abort ();
 668   n = op->token->charcount;
 669   str = XNMALLOC (n + 1, char);
 670   memcpy (str, op->token->chars, n);
 671   str[n] = '\0';
 672   return str;
 673 }
 674
 675 /* Context lookup table.  */
 676 static flag_context_list_table_ty *flag_context_list_table;
 677
 678 /* Read the next object.  */
 679 static void
 680 read_object (struct object *op, flag_context_ty outer_context)
 681 {
 682   for (;;)
 683     {
 684       int c = do_getc ();
 685
 686       switch (c)
 687         {
 688         case EOF:
 689           op->type = t_eof;
 690           return;
 691
 692         case ' ': case '\r': case '\f': case '\t':
 693           continue;
 694
 695         case '\n':
 696           /* Comments assumed to be grouped with a message must immediately
 697              precede it, with no non-whitespace token on a line between
 698              both.  */
 699           if (last_non_comment_line > last_comment_line)
 700             savable_comment_reset ();
 701           continue;
 702
 703         case ';':
 704           {
 705             bool all_semicolons = true;
 706
 707             last_comment_line = line_number;
 708             comment_start ();
 709             for (;;)
 710               {
 711                 c = do_getc ();
 712                 if (c == EOF || c == '\n')
 713                   break;
 714                 if (c != ';')
 715                   all_semicolons = false;
 716                 if (!all_semicolons)
 717                   {
 718                     /* We skip all leading white space, but not EOLs.  */
 719                     if (!(buflen == 0 && (c == ' ' || c == '\t')))
 720                       comment_add (c);
 721                   }
 722               }
 723             comment_line_end (0);
 724             continue;
 725           }
 726
 727         case '(':
 728           {
 729              int arg = 0;               /* Current argument number.  */
 730              flag_context_list_iterator_ty context_iter;
 731             const struct callshapes *shapes = NULL;
 732             struct arglist_parser *argparser = NULL;
 733
 734              for (;; arg++)
 735                {
 736                 struct object inner;
 737                 flag_context_ty inner_context;
 738
 739                 if (arg == 0)
 740                   inner_context = null_context;
 741                 else
 742                   inner_context =
 743                     inherited_context (outer_context,
 744                                        flag_context_list_iterator_advance (
 745                                          &context_iter));
 746
 747                 read_object (&inner, inner_context);
 748
 749                 /* Recognize end of list.  */
 750                 if (inner.type == t_close)
 751                   {
 752                     op->type = t_other;
 753                     last_non_comment_line = line_number;
 754                     if (argparser != NULL)
 755                       arglist_parser_done (argparser, arg);
 756                     return;
 757                   }
 758
 759                 /* Dots are not allowed in every position.
 760                    But be tolerant.  */
 761
 762                 /* EOF inside list is illegal.
 763                    But be tolerant.  */
 764                 if (inner.type == t_eof)
 765                   break;
 766
 767                 if (arg == 0)
 768                   {
 769                     /* This is the function position.  */
 770                     if (inner.type == t_symbol)
 771                       {
 772                         char *symbol_name = string_of_object (&inner);
 773                         void *keyword_value;
 774
 775                         if (hash_find_entry (&keywords,
 776                                              symbol_name, strlen (symbol_name),
 777                                              &keyword_value)
 778                             == 0)
 779                           shapes = (const struct callshapes *) keyword_value;
 780
 781                         argparser = arglist_parser_alloc (mlp, shapes);
 782
 783                         context_iter =
 784                           flag_context_list_iterator (
 785                             flag_context_list_table_lookup (
 786                               flag_context_list_table,
 787                               symbol_name, strlen (symbol_name)));
 788
 789                         free (symbol_name);
 790                       }
 791                     else
 792                       context_iter = null_context_list_iterator;
 793                   }
 794                 else
 795                   {
 796                     /* These are the argument positions.  */
 797                     if (argparser != NULL && inner.type == t_string)
 798                       arglist_parser_remember (argparser, arg,
 799                                                string_of_object (&inner),
 800                                                inner_context,
 801                                                logical_file_name,
 802                                                inner.line_number_at_start,
 803                                                savable_comment);
 804                   }
 805
 806                 free_object (&inner);
 807               }
 808             if (argparser != NULL)
 809               arglist_parser_done (argparser, arg);
 810           }
 811           op->type = t_other;
 812           last_non_comment_line = line_number;
 813           return;
 814
 815         case ')':
 816           /* Tell the caller about the end of list.
 817              Unmatched closing parenthesis is illegal.
 818              But be tolerant.  */
 819           op->type = t_close;
 820           last_non_comment_line = line_number;
 821           return;
 822
 823         case ',':
 824           {
 825             int c = do_getc ();
 826             /* The ,@ handling inside lists is wrong anyway, because
 827                ,@form expands to an unknown number of elements.  */
 828             if (c != EOF && c != '@')
 829               do_ungetc (c);
 830           }
 831           /*FALLTHROUGH*/
 832         case '\'':
 833         case '`':
 834           {
 835             struct object inner;
 836
 837             read_object (&inner, null_context);
 838
 839             /* Dots and EOF are not allowed here.  But be tolerant.  */
 840
 841             free_object (&inner);
 842
 843             op->type = t_other;
 844             last_non_comment_line = line_number;
 845             return;
 846           }
 847
 848         case '#':
 849           /* Dispatch macro handling.  */
 850           {
 851             c = do_getc ();
 852             if (c == EOF)
 853               /* Invalid input.  Be tolerant, no error message.  */
 854               {
 855                 op->type = t_other;
 856                 return;
 857               }
 858
 859             switch (c)
 860               {
 861               case '(': /* Vector */
 862                 do_ungetc (c);
 863                 {
 864                   struct object inner;
 865                   read_object (&inner, null_context);
 866                   /* Dots and EOF are not allowed here.
 867                      But be tolerant.  */
 868                   free_object (&inner);
 869                   op->type = t_other;
 870                   last_non_comment_line = line_number;
 871                   return;
 872                 }
 873
 874               case 'T': case 't': /* Boolean true */
 875               case 'F': case 'f': /* Boolean false */
 876                 op->type = t_other;
 877                 last_non_comment_line = line_number;
 878                 return;
 879
 880               case 'B': case 'b':
 881               case 'O': case 'o':
 882               case 'D': case 'd':
 883               case 'X': case 'x':
 884               case 'E': case 'e':
 885               case 'I': case 'i':
 886                 {
 887                   struct token token;
 888                   do_ungetc (c);
 889                   read_token (&token, '#');
 890                   if (is_number (&token))
 891                     {
 892                       /* A number.  */
 893                       free_token (&token);
 894                       op->type = t_other;
 895                       last_non_comment_line = line_number;
 896                       return;
 897                     }
 898                   else
 899                     {
 900                       if (token.charcount == 2
 901                           && (token.chars[1] == 'e' || token.chars[1] == 'i'))
 902                         {
 903                           c = do_getc ();
 904                           if (c != EOF)
 905                             do_ungetc (c);
 906                           if (c == '(')
 907                             /* Homogenous vector syntax, see arrays.scm.  */
 908                             case 'a':   /* Vectors of char */
 909                             case 'c':   /* Vectors of complex */
 910                           /*case 'e':*/ /* Vectors of long */
 911                             case 'h':   /* Vectors of short */
 912                           /*case 'i':*/ /* Vectors of double-float */
 913                             case 'l':   /* Vectors of long long */
 914                             case 's':   /* Vectors of single-float */
 915                             case 'u':   /* Vectors of unsigned long */
 916                             case 'y':   /* Vectors of byte */
 917                               {
 918                                 struct object inner;
 919                                 read_object (&inner, null_context);
 920                                 /* Dots and EOF are not allowed here.
 921                                    But be tolerant.  */
 922                                 free_token (&token);
 923                                 free_object (&inner);
 924                                 op->type = t_other;
 925                                 last_non_comment_line = line_number;
 926                                 return;
 927                               }
 928                         }
 929                       /* Unknown # object.  But be tolerant.  */
 930                       free_token (&token);
 931                       op->type = t_other;
 932                       last_non_comment_line = line_number;
 933                       return;
 934                     }
 935                 }
 936
 937               case '!':
 938                 /* Block comment '#! ... \n!#\n'.  We don't extract it
 939                    because it's only used to introduce scripts on Unix.  */
 940                 {
 941                   int last1 = 0;
 942                   int last2 = 0;
 943                   int last3 = 0;
 944
 945                   for (;;)
 946                     {
 947                       c = do_getc ();
 948                       if (c == EOF)
 949                         /* EOF is not allowed here.  But be tolerant.  */
 950                         break;
 951                       if (last3 == '\n' && last2 == '!' && last1 == '#'
 952                           && c == '\n')
 953                         break;
 954                       last3 = last2;
 955                       last2 = last1;
 956                       last1 = c;
 957                     }
 958                   continue;
 959                 }
 960
 961               case '*':
 962                 /* Bit vector.  */
 963                 {
 964                   struct token token;
 965                   read_token (&token, c);
 966                   /* The token should consists only of '0' and '1', except
 967                      for the initial '*'.  But be tolerant.  */
 968                   free_token (&token);
 969                   op->type = t_other;
 970                   last_non_comment_line = line_number;
 971                   return;
 972                 }
 973
 974               case '{':
 975                 /* Symbol with multiple escapes: #{...}#  */
 976                 {
 977                   op->token = XMALLOC (struct token);
 978
 979                   init_token (op->token);
 980
 981                   for (;;)
 982                     {
 983                       c = do_getc ();
 984
 985                       if (c == EOF)
 986                         break;
 987                       if (c == '\\')
 988                         {
 989                           c = do_getc ();
 990                           if (c == EOF)
 991                             break;
 992                         }
 993                       else if (c == '}')
 994                         {
 995                           c = do_getc ();
 996                           if (c == '#')
 997                             break;
 998                           if (c != EOF)
 999                             do_ungetc (c);
1000                           c = '}';
1001                         }
1002                       grow_token (op->token);
1003                       op->token->chars[op->token->charcount++] = c;
1004                     }
1005
1006                   op->type = t_symbol;
1007                   last_non_comment_line = line_number;
1008                   return;
1009                 }
1010
1011               case '\\':
1012                 /* Character.  */
1013                 {
1014                   struct token token;
1015                   c = do_getc ();
1016                   if (c != EOF)
1017                     {
1018                       read_token (&token, c);
1019                       free_token (&token);
1020                     }
1021                   op->type = t_other;
1022                   last_non_comment_line = line_number;
1023                   return;
1024                 }
1025
1026               case ':': /* Keyword.  */
1027               case '&': /* Deprecated keyword, installed in optargs.scm.  */
1028                 {
1029                   struct token token;
1030                   read_token (&token, '-');
1031                   free_token (&token);
1032                   op->type = t_other;
1033                   last_non_comment_line = line_number;
1034                   return;
1035                 }
1036
1037               /* The following are installed through read-hash-extend.  */
1038
1039               /* arrays.scm */
1040               case '0': case '1': case '2': case '3': case '4':
1041               case '5': case '6': case '7': case '8': case '9':
1042                 /* Multidimensional array syntax: #nx(...) where
1043                      n ::= DIGIT+
1044                      x ::= {'a'|'b'|'c'|'e'|'i'|'s'|'u'}
1045                  */
1046                 do
1047                   c = do_getc ();
1048                 while (c >= '0' && c <= '9');
1049                 /* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}.
1050                    But be tolerant.  */
1051                 /*FALLTHROUGH*/
1052               case '\'': /* boot-9.scm */
1053               case '.': /* boot-9.scm */
1054               case ',': /* srfi-10.scm */
1055                 {
1056                   struct object inner;
1057                   read_object (&inner, null_context);
1058                   /* Dots and EOF are not allowed here.
1059                      But be tolerant.  */
1060                   free_object (&inner);
1061                   op->type = t_other;
1062                   last_non_comment_line = line_number;
1063                   return;
1064                 }
1065
1066               default:
1067                 /* Unknown.  */
1068                 op->type = t_other;
1069                 last_non_comment_line = line_number;
1070                 return;
1071               }
1072             /*NOTREACHED*/
1073             abort ();
1074           }
1075
1076         case '"':
1077           {
1078             op->token = XMALLOC (struct token);
1079             init_token (op->token);
1080             op->line_number_at_start = line_number;
1081             for (;;)
1082               {
1083                 int c = do_getc ();
1084                 if (c == EOF)
1085                   /* Invalid input.  Be tolerant, no error message.  */
1086                   break;
1087                 if (c == '"')
1088                   break;
1089                 if (c == '\\')
1090                   {
1091                     c = do_getc ();
1092                     if (c == EOF)
1093                       /* Invalid input.  Be tolerant, no error message.  */
1094                       break;
1095                     switch (c)
1096                       {
1097                       case '\n':
1098                         continue;
1099                       case '0':
1100                         c = '\0';
1101                         break;
1102                       case 'a':
1103                         c = '\a';
1104                         break;
1105                       case 'f':
1106                         c = '\f';
1107                         break;
1108                       case 'n':
1109                         c = '\n';
1110                         break;
1111                       case 'r':
1112                         c = '\r';
1113                         break;
1114                       case 't':
1115                         c = '\t';
1116                         break;
1117                       case 'v':
1118                         c = '\v';
1119                         break;
1120                       default:
1121                         break;
1122                       }
1123                   }
1124                 grow_token (op->token);
1125                 op->token->chars[op->token->charcount++] = c;
1126               }
1127             op->type = t_string;
1128
1129             if (extract_all)
1130               {
1131                 lex_pos_ty pos;
1132
1133                 pos.file_name = logical_file_name;
1134                 pos.line_number = op->line_number_at_start;
1135                 remember_a_message (mlp, NULL, string_of_object (op),
1136                                     null_context, &pos, NULL, savable_comment);
1137               }
1138             last_non_comment_line = line_number;
1139             return;
1140           }
1141
1142         case '0': case '1': case '2': case '3': case '4':
1143         case '5': case '6': case '7': case '8': case '9':
1144         case '+': case '-': case '.':
1145           /* Read a number or symbol token.  */
1146           op->token = XMALLOC (struct token);
1147           read_token (op->token, c);
1148           if (op->token->charcount == 1 && op->token->chars[0] == '.')
1149             {
1150               free_token (op->token);
1151               free (op->token);
1152               op->type = t_dot;
1153             }
1154           else if (is_number (op->token))
1155             {
1156               /* A number.  */
1157               free_token (op->token);
1158               free (op->token);
1159               op->type = t_other;
1160             }
1161           else
1162             {
1163               /* A symbol.  */
1164               op->type = t_symbol;
1165             }
1166           last_non_comment_line = line_number;
1167           return;
1168
1169         case ':':
1170         default:
1171           /* Read a symbol token.  */
1172           op->token = XMALLOC (struct token);
1173           read_token (op->token, c);
1174           op->type = t_symbol;
1175           last_non_comment_line = line_number;
1176           return;
1177         }
1178     }
1179 }
1180
1181
1182 void
1183 extract_scheme (FILE *f,
1184                 const char *real_filename, const char *logical_filename,
1185                 flag_context_list_table_ty *flag_table,
1186                 msgdomain_list_ty *mdlp)
1187 {
1188   mlp = mdlp->item[0]->messages;
1189
1190   fp = f;
1191   real_file_name = real_filename;
1192   logical_file_name = xstrdup (logical_filename);
1193   line_number = 1;
1194
1195   last_comment_line = -1;
1196   last_non_comment_line = -1;
1197
1198   flag_context_list_table = flag_table;
1199
1200   init_keywords ();
1201
1202   /* Eat tokens until eof is seen.  When read_object returns
1203      due to an unbalanced closing parenthesis, just restart it.  */
1204   do
1205     {
1206       struct object toplevel_object;
1207
1208       read_object (&toplevel_object, null_context);
1209
1210       if (toplevel_object.type == t_eof)
1211         break;
1212
1213       free_object (&toplevel_object);
1214     }
1215   while (!feof (fp));
1216
1217   /* Close scanner.  */
1218   fp = NULL;
1219   real_file_name = NULL;
1220   logical_file_name = NULL;
1221   line_number = 0;
1222 }