gettext-tools/src/x-scheme.c

   1 /* xgettext Scheme backend.
   2    Copyright (C) 2004-2009, 2011, 2015 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <bruno@clisp.org>, 2004-2005.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22
  23 /* Specification.  */
  24 #include "x-scheme.h"
  25
  26 #include <errno.h>
  27 #include <stdbool.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31
  32 #include "message.h"
  33 #include "xgettext.h"
  34 #include "error.h"
  35 #include "xalloc.h"
  36 #include "hash.h"
  37 #include "gettext.h"
  38
  39 #define _(s) gettext(s)
  40
  41
  42 /* The Scheme syntax is described in R5RS.  It is implemented in
  43    guile-2.0.0/libguile/read.c.
  44    Since we are interested only in strings and in forms similar to
  45         (gettext msgid ...)
  46    or   (ngettext msgid msgid_plural ...)
  47    we make the following simplifications:
  48
  49    - Assume the keywords and strings are in an ASCII compatible encoding.
  50      This means we can read the input file one byte at a time, instead of
  51      one character at a time.  No need to worry about multibyte characters:
  52      If they occur as part of identifiers, they most probably act as
  53      constituent characters, and the byte based approach will do the same.
  54
  55    - Assume the read-hash-procedures is in the default state.
  56      Non-standard reader extensions are mostly used to read data, not programs.
  57
  58    The remaining syntax rules are:
  59
  60    - The syntax code assigned to each character, and how tokens are built
  61      up from characters (single escape, multiple escape etc.).
  62
  63    - Comment syntax: ';' and '#! ... !#' and '#| ... |#' (may be nested).
  64
  65    - String syntax: "..." with single escapes.
  66
  67    - Read macros and dispatch macro character '#'.  Needed to be able to
  68      tell which is the n-th argument of a function call.
  69
  70  */
  71
  72
  73 /* ====================== Keyword set customization.  ====================== */
  74
  75 /* If true extract all strings.  */
  76 static bool extract_all = false;
  77
  78 static hash_table keywords;
  79 static bool default_keywords = true;
  80
  81
  82 void
  83 x_scheme_extract_all ()
  84 {
  85   extract_all = true;
  86 }
  87
  88
  89 void
  90 x_scheme_keyword (const char *name)
  91 {
  92   if (name == NULL)
  93     default_keywords = false;
  94   else
  95     {
  96       const char *end;
  97       struct callshape shape;
  98       const char *colon;
  99
 100       if (keywords.table == NULL)
 101         hash_init (&keywords, 100);
 102
 103       split_keywordspec (name, &end, &shape);
 104
 105       /* The characters between name and end should form a valid Lisp symbol.
 106          Extract the symbol name part.  */
 107       colon = strchr (name, ':');
 108       if (colon != NULL && colon < end)
 109         {
 110           name = colon + 1;
 111           if (name < end && *name == ':')
 112             name++;
 113           colon = strchr (name, ':');
 114           if (colon != NULL && colon < end)
 115             return;
 116         }
 117
 118       insert_keyword_callshape (&keywords, name, end - name, &shape);
 119     }
 120 }
 121
 122 /* Finish initializing the keywords hash table.
 123    Called after argument processing, before each file is processed.  */
 124 static void
 125 init_keywords ()
 126 {
 127   if (default_keywords)
 128     {
 129       /* When adding new keywords here, also update the documentation in
 130          xgettext.texi!  */
 131       x_scheme_keyword ("gettext");             /* libguile/i18n.c */
 132       x_scheme_keyword ("ngettext:1,2");        /* libguile/i18n.c */
 133       x_scheme_keyword ("gettext-noop");
 134       default_keywords = false;
 135     }
 136 }
 137
 138 void
 139 init_flag_table_scheme ()
 140 {
 141   xgettext_record_flag ("gettext:1:pass-scheme-format");
 142   xgettext_record_flag ("ngettext:1:pass-scheme-format");
 143   xgettext_record_flag ("ngettext:2:pass-scheme-format");
 144   xgettext_record_flag ("gettext-noop:1:pass-scheme-format");
 145   xgettext_record_flag ("format:2:scheme-format");
 146 }
 147
 148
 149 /* ======================== Reading of characters.  ======================== */
 150
 151 /* Real filename, used in error messages about the input file.  */
 152 static const char *real_file_name;
 153
 154 /* Logical filename and line number, used to label the extracted messages.  */
 155 static char *logical_file_name;
 156 static int line_number;
 157
 158 /* The input file stream.  */
 159 static FILE *fp;
 160
 161
 162 /* Fetch the next character from the input file.  */
 163 static int
 164 do_getc ()
 165 {
 166   int c = getc (fp);
 167
 168   if (c == EOF)
 169     {
 170       if (ferror (fp))
 171         error (EXIT_FAILURE, errno, _("\
 172 error while reading \"%s\""), real_file_name);
 173     }
 174   else if (c == '\n')
 175    line_number++;
 176
 177   return c;
 178 }
 179
 180 /* Put back the last fetched character, not EOF.  */
 181 static void
 182 do_ungetc (int c)
 183 {
 184   if (c == '\n')
 185     line_number--;
 186   ungetc (c, fp);
 187 }
 188
 189
 190 /* ========================== Reading of tokens.  ========================== */
 191
 192
 193 /* A token consists of a sequence of characters.  */
 194 struct token
 195 {
 196   int allocated;                /* number of allocated 'token_char's */
 197   int charcount;                /* number of used 'token_char's */
 198   char *chars;                  /* the token's constituents */
 199 };
 200
 201 /* Initialize a 'struct token'.  */
 202 static inline void
 203 init_token (struct token *tp)
 204 {
 205   tp->allocated = 10;
 206   tp->chars = XNMALLOC (tp->allocated, char);
 207   tp->charcount = 0;
 208 }
 209
 210 /* Free the memory pointed to by a 'struct token'.  */
 211 static inline void
 212 free_token (struct token *tp)
 213 {
 214   free (tp->chars);
 215 }
 216
 217 /* Ensure there is enough room in the token for one more character.  */
 218 static inline void
 219 grow_token (struct token *tp)
 220 {
 221   if (tp->charcount == tp->allocated)
 222     {
 223       tp->allocated *= 2;
 224       tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
 225     }
 226 }
 227
 228 /* Read the next token.  'first' is the first character, which has already
 229    been read.  */
 230 static void
 231 read_token (struct token *tp, int first)
 232 {
 233   init_token (tp);
 234
 235   grow_token (tp);
 236   tp->chars[tp->charcount++] = first;
 237
 238   for (;;)
 239     {
 240       int c = do_getc ();
 241
 242       if (c == EOF)
 243         break;
 244       if (c == ' ' || c == '\r' || c == '\f' || c == '\t' || c == '\n'
 245           || c == '"' || c == '(' || c == ')' || c == ';')
 246         {
 247           do_ungetc (c);
 248           break;
 249         }
 250       grow_token (tp);
 251       tp->chars[tp->charcount++] = c;
 252     }
 253 }
 254
 255 /* Tests if a token represents an integer.
 256    Taken from guile-1.6.4/libguile/numbers.c:scm_istr2int().  */
 257 static inline bool
 258 is_integer_syntax (const char *str, int len, int radix)
 259 {
 260   const char *p = str;
 261   const char *p_end = str + len;
 262
 263   /* The accepted syntax is
 264        ['+'|'-'] DIGIT+
 265      where DIGIT is a hexadecimal digit whose value is below radix.  */
 266
 267   if (p == p_end)
 268     return false;
 269   if (*p == '+' || *p == '-')
 270     {
 271       p++;
 272       if (p == p_end)
 273         return false;
 274     }
 275   do
 276     {
 277       int c = *p++;
 278
 279       if (c >= '0' && c <= '9')
 280         c = c - '0';
 281       else if (c >= 'A' && c <= 'F')
 282         c = c - 'A' + 10;
 283       else if (c >= 'a' && c <= 'f')
 284         c = c - 'a' + 10;
 285       else
 286         return false;
 287       if (c >= radix)
 288         return false;
 289     }
 290   while (p < p_end);
 291   return true;
 292 }
 293
 294 /* Tests if a token represents a rational, floating-point or complex number.
 295    If unconstrained is false, only real numbers are accepted; otherwise,
 296    complex numbers are accepted as well.
 297    Taken from guile-1.6.4/libguile/numbers.c:scm_istr2flo().  */
 298 static inline bool
 299 is_other_number_syntax (const char *str, int len, int radix, bool unconstrained)
 300 {
 301   const char *p = str;
 302   const char *p_end = str + len;
 303   bool seen_sign;
 304   bool seen_digits;
 305
 306   /* The accepted syntaxes are:
 307      for a floating-point number:
 308        ['+'|'-'] DIGIT+ [EXPONENT]
 309        ['+'|'-'] DIGIT* '.' DIGIT+ [EXPONENT]
 310        where EXPONENT ::= ['d'|'e'|'f'|'l'|'s'] DIGIT+
 311        (Dot and exponent are allowed only if radix is 10.)
 312      for a rational number:
 313        ['+'|'-'] DIGIT+ '/' DIGIT+
 314      for a complex number:
 315        REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
 316        REAL-NUMBER {'+'|'-'} 'i'
 317        {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
 318        {'+'|'-'} 'i'
 319        REAL-NUMBER '@' REAL-NUMBER
 320    */
 321   if (p == p_end)
 322     return false;
 323   /* Parse leading sign.  */
 324   seen_sign = false;
 325   if (*p == '+' || *p == '-')
 326     {
 327       p++;
 328       if (p == p_end)
 329         return false;
 330       seen_sign = true;
 331       /* Recognize complex number syntax: {'+'|'-'} 'i'  */
 332       if (unconstrained && (*p == 'I' || *p == 'i') && p + 1 == p_end)
 333         return true;
 334     }
 335   /* Parse digits before dot or exponent or slash.  */
 336   seen_digits = false;
 337   do
 338     {
 339       int c = *p;
 340
 341       if (c >= '0' && c <= '9')
 342         c = c - '0';
 343       else if (c >= 'A' && c <= 'F')
 344         {
 345           if (c >= 'D' && radix == 10) /* exponent? */
 346             break;
 347           c = c - 'A' + 10;
 348         }
 349       else if (c >= 'a' && c <= 'f')
 350         {
 351           if (c >= 'd' && radix == 10) /* exponent? */
 352             break;
 353           c = c - 'a' + 10;
 354         }
 355       else
 356         break;
 357       if (c >= radix)
 358         return false;
 359       seen_digits = true;
 360       p++;
 361     }
 362   while (p < p_end);
 363   /* If p == p_end, we know that seen_digits = true, and the number is an
 364      integer without exponent.  */
 365   if (p < p_end)
 366     {
 367       /* If we have no digits so far, we need a decimal point later.  */
 368       if (!seen_digits && !(*p == '.' && radix == 10))
 369         return false;
 370       /* Trailing '#' signs are equivalent to zeroes.  */
 371       while (p < p_end && *p == '#')
 372         p++;
 373       if (p < p_end)
 374         {
 375           if (*p == '/')
 376             {
 377               /* Parse digits after the slash.  */
 378               bool all_zeroes = true;
 379               p++;
 380               for (; p < p_end; p++)
 381                 {
 382                   int c = *p;
 383
 384                   if (c >= '0' && c <= '9')
 385                     c = c - '0';
 386                   else if (c >= 'A' && c <= 'F')
 387                     c = c - 'A' + 10;
 388                   else if (c >= 'a' && c <= 'f')
 389                     c = c - 'a' + 10;
 390                   else
 391                     break;
 392                   if (c >= radix)
 393                     return false;
 394                   if (c != 0)
 395                     all_zeroes = false;
 396                 }
 397               /* A zero denominator is not allowed.  */
 398               if (all_zeroes)
 399                 return false;
 400               /* Trailing '#' signs are equivalent to zeroes.  */
 401               while (p < p_end && *p == '#')
 402                 p++;
 403             }
 404           else
 405             {
 406               if (*p == '.')
 407                 {
 408                   /* Decimal point notation.  */
 409                   if (radix != 10)
 410                     return false;
 411                   /* Parse digits after the decimal point.  */
 412                   p++;
 413                   for (; p < p_end; p++)
 414                     {
 415                       int c = *p;
 416
 417                       if (c >= '0' && c <= '9')
 418                         seen_digits = true;
 419                       else
 420                         break;
 421                     }
 422                   /* Digits are required before or after the decimal point.  */
 423                   if (!seen_digits)
 424                     return false;
 425                   /* Trailing '#' signs are equivalent to zeroes.  */
 426                   while (p < p_end && *p == '#')
 427                     p++;
 428                 }
 429               if (p < p_end)
 430                 {
 431                   /* Parse exponent.  */
 432                   switch (*p)
 433                     {
 434                     case 'D': case 'd':
 435                     case 'E': case 'e':
 436                     case 'F': case 'f':
 437                     case 'L': case 'l':
 438                     case 'S': case 's':
 439                       if (radix != 10)
 440                         return false;
 441                       p++;
 442                       if (p == p_end)
 443                         return false;
 444                       if (*p == '+' || *p == '-')
 445                         {
 446                           p++;
 447                           if (p == p_end)
 448                             return false;
 449                         }
 450                       if (!(*p >= '0' && *p <= '9'))
 451                         return false;
 452                       for (;;)
 453                         {
 454                           p++;
 455                           if (p == p_end)
 456                             break;
 457                           if (!(*p >= '0' && *p <= '9'))
 458                             break;
 459                         }
 460                       break;
 461                     default:
 462                       break;
 463                     }
 464                 }
 465             }
 466         }
 467     }
 468   if (p == p_end)
 469     return true;
 470   /* Recognize complex number syntax.  */
 471   if (unconstrained)
 472     {
 473       /* Recognize the syntax  {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'  */
 474       if (seen_sign && (*p == 'I' || *p == 'i') && p + 1 == p_end)
 475         return true;
 476       /* Recognize the syntaxes
 477            REAL-NUMBER {'+'|'-'} REAL-NUMBER-WITHOUT-SIGN 'i'
 478            REAL-NUMBER {'+'|'-'} 'i'
 479        */
 480       if (*p == '+' || *p == '-')
 481         return (p_end[-1] == 'I' || p_end[-1] == 'i')
 482                 && (p + 1 == p_end - 1
 483                     || is_other_number_syntax (p, p_end - 1 - p, radix, false));
 484       /* Recognize the syntax  REAL-NUMBER '@' REAL-NUMBER  */
 485       if (*p == '@')
 486         {
 487           p++;
 488           return is_other_number_syntax (p, p_end - p, radix, false);
 489         }
 490     }
 491   return false;
 492 }
 493
 494 /* Tests if a token represents a number.
 495    Taken from guile-1.6.4/libguile/numbers.c:scm_istring2number().  */
 496 static bool
 497 is_number (const struct token *tp)
 498 {
 499   const char *str = tp->chars;
 500   int len = tp->charcount;
 501   enum { unknown, exact, inexact } exactness = unknown;
 502   bool seen_radix_prefix = false;
 503   bool seen_exactness_prefix = false;
 504
 505   if (len == 1)
 506     if (*str == '+' || *str == '-')
 507       return false;
 508   while (len >= 2 && *str == '#')
 509     {
 510       switch (str[1])
 511         {
 512         case 'B': case 'b':
 513           if (seen_radix_prefix)
 514             return false;
 515           seen_radix_prefix = true;
 516           break;
 517         case 'O': case 'o':
 518           if (seen_radix_prefix)
 519             return false;
 520           seen_radix_prefix = true;
 521           break;
 522         case 'D': case 'd':
 523           if (seen_radix_prefix)
 524             return false;
 525           seen_radix_prefix = true;
 526           break;
 527         case 'X': case 'x':
 528           if (seen_radix_prefix)
 529             return false;
 530           seen_radix_prefix = true;
 531           break;
 532         case 'E': case 'e':
 533           if (seen_exactness_prefix)
 534             return false;
 535           exactness = exact;
 536           seen_exactness_prefix = true;
 537           break;
 538         case 'I': case 'i':
 539           if (seen_exactness_prefix)
 540             return false;
 541           exactness = inexact;
 542           seen_exactness_prefix = true;
 543           break;
 544         default:
 545           return false;
 546         }
 547       str += 2;
 548       len -= 2;
 549     }
 550   if (exactness != inexact)
 551     {
 552       /* Try to parse an integer.  */
 553       if (is_integer_syntax (str, len, 10))
 554         return true;
 555       /* FIXME: Other Scheme implementations support exact rational numbers
 556          or exact complex numbers.  */
 557     }
 558   if (exactness != exact)
 559     {
 560       /* Try to parse a rational, floating-point or complex number.  */
 561       if (is_other_number_syntax (str, len, 10, true))
 562         return true;
 563     }
 564   return false;
 565 }
 566
 567
 568 /* ========================= Accumulating comments ========================= */
 569
 570
 571 static char *buffer;
 572 static size_t bufmax;
 573 static size_t buflen;
 574
 575 static inline void
 576 comment_start ()
 577 {
 578   buflen = 0;
 579 }
 580
 581 static inline void
 582 comment_add (int c)
 583 {
 584   if (buflen >= bufmax)
 585     {
 586       bufmax = 2 * bufmax + 10;
 587       buffer = xrealloc (buffer, bufmax);
 588     }
 589   buffer[buflen++] = c;
 590 }
 591
 592 static inline void
 593 comment_line_end (size_t chars_to_remove)
 594 {
 595   buflen -= chars_to_remove;
 596   while (buflen >= 1
 597          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
 598     --buflen;
 599   if (chars_to_remove == 0 && buflen >= bufmax)
 600     {
 601       bufmax = 2 * bufmax + 10;
 602       buffer = xrealloc (buffer, bufmax);
 603     }
 604   buffer[buflen] = '\0';
 605   savable_comment_add (buffer);
 606 }
 607
 608
 609 /* These are for tracking whether comments count as immediately before
 610    keyword.  */
 611 static int last_comment_line;
 612 static int last_non_comment_line;
 613
 614
 615 /* ========================= Accumulating messages ========================= */
 616
 617
 618 static message_list_ty *mlp;
 619
 620
 621 /* ========================== Reading of objects.  ========================= */
 622
 623
 624 /* We are only interested in symbols (e.g. gettext or ngettext) and strings.
 625    Other objects need not to be represented precisely.  */
 626 enum object_type
 627 {
 628   t_symbol,     /* symbol */
 629   t_string,     /* string */
 630   t_other,      /* other kind of real object */
 631   t_dot,        /* '.' pseudo object */
 632   t_close,      /* ')' pseudo object */
 633   t_eof         /* EOF marker */
 634 };
 635
 636 struct object
 637 {
 638   enum object_type type;
 639   struct token *token;          /* for t_symbol and t_string */
 640   int line_number_at_start;     /* for t_string */
 641 };
 642
 643 /* Free the memory pointed to by a 'struct object'.  */
 644 static inline void
 645 free_object (struct object *op)
 646 {
 647   if (op->type == t_symbol || op->type == t_string)
 648     {
 649       free_token (op->token);
 650       free (op->token);
 651     }
 652 }
 653
 654 /* Convert a t_symbol/t_string token to a char*.  */
 655 static char *
 656 string_of_object (const struct object *op)
 657 {
 658   char *str;
 659   int n;
 660
 661   if (!(op->type == t_symbol || op->type == t_string))
 662     abort ();
 663   n = op->token->charcount;
 664   str = XNMALLOC (n + 1, char);
 665   memcpy (str, op->token->chars, n);
 666   str[n] = '\0';
 667   return str;
 668 }
 669
 670 /* Context lookup table.  */
 671 static flag_context_list_table_ty *flag_context_list_table;
 672
 673 /* Read the next object.  */
 674 static void
 675 read_object (struct object *op, flag_context_ty outer_context)
 676 {
 677   for (;;)
 678     {
 679       int c = do_getc ();
 680       bool seen_underscore_prefix = false;
 681
 682       switch (c)
 683         {
 684         case EOF:
 685           op->type = t_eof;
 686           return;
 687
 688         case ' ': case '\r': case '\f': case '\t':
 689           continue;
 690
 691         case '\n':
 692           /* Comments assumed to be grouped with a message must immediately
 693              precede it, with no non-whitespace token on a line between
 694              both.  */
 695           if (last_non_comment_line > last_comment_line)
 696             savable_comment_reset ();
 697           continue;
 698
 699         case ';':
 700           {
 701             bool all_semicolons = true;
 702
 703             last_comment_line = line_number;
 704             comment_start ();
 705             for (;;)
 706               {
 707                 c = do_getc ();
 708                 if (c == EOF || c == '\n')
 709                   break;
 710                 if (c != ';')
 711                   all_semicolons = false;
 712                 if (!all_semicolons)
 713                   {
 714                     /* We skip all leading white space, but not EOLs.  */
 715                     if (!(buflen == 0 && (c == ' ' || c == '\t')))
 716                       comment_add (c);
 717                   }
 718               }
 719             comment_line_end (0);
 720             continue;
 721           }
 722
 723         case '(':
 724           {
 725              int arg = 0;               /* Current argument number.  */
 726              flag_context_list_iterator_ty context_iter;
 727             const struct callshapes *shapes = NULL;
 728             struct arglist_parser *argparser = NULL;
 729
 730              for (;; arg++)
 731                {
 732                 struct object inner;
 733                 flag_context_ty inner_context;
 734
 735                 if (arg == 0)
 736                   inner_context = null_context;
 737                 else
 738                   inner_context =
 739                     inherited_context (outer_context,
 740                                        flag_context_list_iterator_advance (
 741                                          &context_iter));
 742
 743                 read_object (&inner, inner_context);
 744
 745                 /* Recognize end of list.  */
 746                 if (inner.type == t_close)
 747                   {
 748                     op->type = t_other;
 749                     last_non_comment_line = line_number;
 750                     if (argparser != NULL)
 751                       arglist_parser_done (argparser, arg);
 752                     return;
 753                   }
 754
 755                 /* Dots are not allowed in every position.
 756                    But be tolerant.  */
 757
 758                 /* EOF inside list is illegal.
 759                    But be tolerant.  */
 760                 if (inner.type == t_eof)
 761                   break;
 762
 763                 if (arg == 0)
 764                   {
 765                     /* This is the function position.  */
 766                     if (inner.type == t_symbol)
 767                       {
 768                         char *symbol_name = string_of_object (&inner);
 769                         void *keyword_value;
 770
 771                         if (hash_find_entry (&keywords,
 772                                              symbol_name, strlen (symbol_name),
 773                                              &keyword_value)
 774                             == 0)
 775                           shapes = (const struct callshapes *) keyword_value;
 776
 777                         argparser = arglist_parser_alloc (mlp, shapes);
 778
 779                         context_iter =
 780                           flag_context_list_iterator (
 781                             flag_context_list_table_lookup (
 782                               flag_context_list_table,
 783                               symbol_name, strlen (symbol_name)));
 784
 785                         free (symbol_name);
 786                       }
 787                     else
 788                       context_iter = null_context_list_iterator;
 789                   }
 790                 else
 791                   {
 792                     /* These are the argument positions.  */
 793                     if (argparser != NULL && inner.type == t_string)
 794                       arglist_parser_remember (argparser, arg,
 795                                                string_of_object (&inner),
 796                                                inner_context,
 797                                                logical_file_name,
 798                                                inner.line_number_at_start,
 799                                                savable_comment);
 800                   }
 801
 802                 free_object (&inner);
 803               }
 804             if (argparser != NULL)
 805               arglist_parser_done (argparser, arg);
 806           }
 807           op->type = t_other;
 808           last_non_comment_line = line_number;
 809           return;
 810
 811         case ')':
 812           /* Tell the caller about the end of list.
 813              Unmatched closing parenthesis is illegal.
 814              But be tolerant.  */
 815           op->type = t_close;
 816           last_non_comment_line = line_number;
 817           return;
 818
 819         case ',':
 820           {
 821             int c = do_getc ();
 822             /* The ,@ handling inside lists is wrong anyway, because
 823                ,@form expands to an unknown number of elements.  */
 824             if (c != EOF && c != '@')
 825               do_ungetc (c);
 826           }
 827           /*FALLTHROUGH*/
 828         case '\'':
 829         case '`':
 830           {
 831             struct object inner;
 832
 833             read_object (&inner, null_context);
 834
 835             /* Dots and EOF are not allowed here.  But be tolerant.  */
 836
 837             free_object (&inner);
 838
 839             op->type = t_other;
 840             last_non_comment_line = line_number;
 841             return;
 842           }
 843
 844         case '#':
 845           /* Dispatch macro handling.  */
 846           {
 847             c = do_getc ();
 848             if (c == EOF)
 849               /* Invalid input.  Be tolerant, no error message.  */
 850               {
 851                 op->type = t_other;
 852                 return;
 853               }
 854
 855             switch (c)
 856               {
 857               case '(': /* Vector */
 858                 do_ungetc (c);
 859                 {
 860                   struct object inner;
 861                   read_object (&inner, null_context);
 862                   /* Dots and EOF are not allowed here.
 863                      But be tolerant.  */
 864                   free_object (&inner);
 865                   op->type = t_other;
 866                   last_non_comment_line = line_number;
 867                   return;
 868                 }
 869
 870               case 'T': case 't': /* Boolean true */
 871               case 'F': case 'f': /* Boolean false */
 872                 op->type = t_other;
 873                 last_non_comment_line = line_number;
 874                 return;
 875
 876               case 'B': case 'b':
 877               case 'O': case 'o':
 878               case 'D': case 'd':
 879               case 'X': case 'x':
 880               case 'E': case 'e':
 881               case 'I': case 'i':
 882                 {
 883                   struct token token;
 884                   do_ungetc (c);
 885                   read_token (&token, '#');
 886                   if (is_number (&token))
 887                     {
 888                       /* A number.  */
 889                       free_token (&token);
 890                       op->type = t_other;
 891                       last_non_comment_line = line_number;
 892                       return;
 893                     }
 894                   else
 895                     {
 896                       if (token.charcount == 2
 897                           && (token.chars[1] == 'e' || token.chars[1] == 'i'))
 898                         {
 899                           c = do_getc ();
 900                           if (c != EOF)
 901                             do_ungetc (c);
 902                           if (c == '(')
 903                             /* Homogenous vector syntax, see arrays.scm.  */
 904                             case 'a':   /* Vectors of char */
 905                             case 'c':   /* Vectors of complex */
 906                           /*case 'e':*/ /* Vectors of long */
 907                             case 'h':   /* Vectors of short */
 908                           /*case 'i':*/ /* Vectors of double-float */
 909                             case 'l':   /* Vectors of long long */
 910                             case 's':   /* Vectors of single-float */
 911                             case 'u':   /* Vectors of unsigned long */
 912                             case 'y':   /* Vectors of byte */
 913                               {
 914                                 struct object inner;
 915                                 read_object (&inner, null_context);
 916                                 /* Dots and EOF are not allowed here.
 917                                    But be tolerant.  */
 918                                 free_token (&token);
 919                                 free_object (&inner);
 920                                 op->type = t_other;
 921                                 last_non_comment_line = line_number;
 922                                 return;
 923                               }
 924                         }
 925                       /* Unknown # object.  But be tolerant.  */
 926                       free_token (&token);
 927                       op->type = t_other;
 928                       last_non_comment_line = line_number;
 929                       return;
 930                     }
 931                 }
 932
 933               case '!':
 934                 /* Block comment '#! ... !#'.  See
 935                    <http://www.gnu.org/software/guile/manual/html_node/Block-Comments.html>.  */
 936                 {
 937                   int c;
 938
 939                   comment_start ();
 940                   c = do_getc ();
 941                   for (;;)
 942                     {
 943                       if (c == EOF)
 944                         break;
 945                       if (c == '!')
 946                         {
 947                           c = do_getc ();
 948                           if (c == EOF)
 949                             break;
 950                           if (c == '#')
 951                             {
 952                               comment_line_end (0);
 953                               break;
 954                             }
 955                           else
 956                             comment_add ('!');
 957                         }
 958                       else
 959                         {
 960                           /* We skip all leading white space.  */
 961                           if (!(buflen == 0 && (c == ' ' || c == '\t')))
 962                             comment_add (c);
 963                           if (c == '\n')
 964                             {
 965                               comment_line_end (1);
 966                               comment_start ();
 967                             }
 968                           c = do_getc ();
 969                         }
 970                     }
 971                   if (c == EOF)
 972                     {
 973                       /* EOF not allowed here.  But be tolerant.  */
 974                       op->type = t_eof;
 975                       return;
 976                     }
 977                   last_comment_line = line_number;
 978                   continue;
 979                 }
 980
 981               case '|':
 982                 /* Block comment '#| ... |#'.  See
 983                    <http://www.gnu.org/software/guile/manual/html_node/Block-Comments.html>
 984                    and <http://srfi.schemers.org/srfi-30/srfi-30.html>.  */
 985                 {
 986                   int depth = 0;
 987                   int c;
 988
 989                   comment_start ();
 990                   c = do_getc ();
 991                   for (;;)
 992                     {
 993                       if (c == EOF)
 994                         break;
 995                       if (c == '|')
 996                         {
 997                           c = do_getc ();
 998                           if (c == EOF)
 999                             break;
1000                           if (c == '#')
1001                             {
1002                               if (depth == 0)
1003                                 {
1004                                   comment_line_end (0);
1005                                   break;
1006                                 }
1007                               depth--;
1008                               comment_add ('|');
1009                               comment_add ('#');
1010                               c = do_getc ();
1011                             }
1012                           else
1013                             comment_add ('|');
1014                         }
1015                       else if (c == '#')
1016                         {
1017                           c = do_getc ();
1018                           if (c == EOF)
1019                             break;
1020                           comment_add ('#');
1021                           if (c == '|')
1022                             {
1023                               depth++;
1024                               comment_add ('|');
1025                               c = do_getc ();
1026                             }
1027                         }
1028                       else
1029                         {
1030                           /* We skip all leading white space.  */
1031                           if (!(buflen == 0 && (c == ' ' || c == '\t')))
1032                             comment_add (c);
1033                           if (c == '\n')
1034                             {
1035                               comment_line_end (1);
1036                               comment_start ();
1037                             }
1038                           c = do_getc ();
1039                         }
1040                     }
1041                   if (c == EOF)
1042                     {
1043                       /* EOF not allowed here.  But be tolerant.  */
1044                       op->type = t_eof;
1045                       return;
1046                     }
1047                   last_comment_line = line_number;
1048                   continue;
1049                 }
1050
1051               case '*':
1052                 /* Bit vector.  */
1053                 {
1054                   struct token token;
1055                   read_token (&token, c);
1056                   /* The token should consists only of '0' and '1', except
1057                      for the initial '*'.  But be tolerant.  */
1058                   free_token (&token);
1059                   op->type = t_other;
1060                   last_non_comment_line = line_number;
1061                   return;
1062                 }
1063
1064               case '{':
1065                 /* Symbol with multiple escapes: #{...}#  */
1066                 {
1067                   op->token = XMALLOC (struct token);
1068
1069                   init_token (op->token);
1070
1071                   for (;;)
1072                     {
1073                       c = do_getc ();
1074
1075                       if (c == EOF)
1076                         break;
1077                       if (c == '\\')
1078                         {
1079                           c = do_getc ();
1080                           if (c == EOF)
1081                             break;
1082                         }
1083                       else if (c == '}')
1084                         {
1085                           c = do_getc ();
1086                           if (c == '#')
1087                             break;
1088                           if (c != EOF)
1089                             do_ungetc (c);
1090                           c = '}';
1091                         }
1092                       grow_token (op->token);
1093                       op->token->chars[op->token->charcount++] = c;
1094                     }
1095
1096                   op->type = t_symbol;
1097                   last_non_comment_line = line_number;
1098                   return;
1099                 }
1100
1101               case '\\':
1102                 /* Character.  */
1103                 {
1104                   struct token token;
1105                   c = do_getc ();
1106                   if (c != EOF)
1107                     {
1108                       read_token (&token, c);
1109                       free_token (&token);
1110                     }
1111                   op->type = t_other;
1112                   last_non_comment_line = line_number;
1113                   return;
1114                 }
1115
1116               case ':': /* Keyword.  */
1117               case '&': /* Deprecated keyword, installed in optargs.scm.  */
1118                 {
1119                   struct token token;
1120                   read_token (&token, '-');
1121                   free_token (&token);
1122                   op->type = t_other;
1123                   last_non_comment_line = line_number;
1124                   return;
1125                 }
1126
1127               /* The following are installed through read-hash-extend.  */
1128
1129               /* arrays.scm */
1130               case '0': case '1': case '2': case '3': case '4':
1131               case '5': case '6': case '7': case '8': case '9':
1132                 /* Multidimensional array syntax: #nx(...) where
1133                      n ::= DIGIT+
1134                      x ::= {'a'|'b'|'c'|'e'|'i'|'s'|'u'}
1135                  */
1136                 do
1137                   c = do_getc ();
1138                 while (c >= '0' && c <= '9');
1139                 /* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}.
1140                    But be tolerant.  */
1141                 /*FALLTHROUGH*/
1142               case '\'': /* boot-9.scm */
1143               case '.': /* boot-9.scm */
1144               case ',': /* srfi-10.scm */
1145                 {
1146                   struct object inner;
1147                   read_object (&inner, null_context);
1148                   /* Dots and EOF are not allowed here.
1149                      But be tolerant.  */
1150                   free_object (&inner);
1151                   op->type = t_other;
1152                   last_non_comment_line = line_number;
1153                   return;
1154                 }
1155
1156               default:
1157                 /* Unknown.  */
1158                 op->type = t_other;
1159                 last_non_comment_line = line_number;
1160                 return;
1161               }
1162             /*NOTREACHED*/
1163             abort ();
1164           }
1165
1166         case '_':
1167           /* GIMP script-fu extension: '_' before a string literal is
1168              considered a gettext call on the string.  */
1169           {
1170             int c = do_getc ();
1171             if (c == EOF)
1172               /* Invalid input.  Be tolerant, no error message.  */
1173               {
1174                 op->type = t_other;
1175                 return;
1176               }
1177             if (c != '"')
1178               {
1179                 do_ungetc (c);
1180
1181                 /* If '_' is not followed by a string literal,
1182                    consider it a part of symbol.  */
1183                 op->token = XMALLOC (struct token);
1184                 read_token (op->token, '_');
1185                 op->type = t_symbol;
1186                 last_non_comment_line = line_number;
1187                 return;
1188               }
1189             seen_underscore_prefix = true;
1190           }
1191           /*FALLTHROUGH*/
1192
1193         case '"':
1194           {
1195             op->token = XMALLOC (struct token);
1196             init_token (op->token);
1197             op->line_number_at_start = line_number;
1198             for (;;)
1199               {
1200                 int c = do_getc ();
1201                 if (c == EOF)
1202                   /* Invalid input.  Be tolerant, no error message.  */
1203                   break;
1204                 if (c == '"')
1205                   break;
1206                 if (c == '\\')
1207                   {
1208                     c = do_getc ();
1209                     if (c == EOF)
1210                       /* Invalid input.  Be tolerant, no error message.  */
1211                       break;
1212                     switch (c)
1213                       {
1214                       case '\n':
1215                         continue;
1216                       case '0':
1217                         c = '\0';
1218                         break;
1219                       case 'a':
1220                         c = '\a';
1221                         break;
1222                       case 'f':
1223                         c = '\f';
1224                         break;
1225                       case 'n':
1226                         c = '\n';
1227                         break;
1228                       case 'r':
1229                         c = '\r';
1230                         break;
1231                       case 't':
1232                         c = '\t';
1233                         break;
1234                       case 'v':
1235                         c = '\v';
1236                         break;
1237                       default:
1238                         break;
1239                       }
1240                   }
1241                 grow_token (op->token);
1242                 op->token->chars[op->token->charcount++] = c;
1243               }
1244             op->type = t_string;
1245
1246             if (seen_underscore_prefix || extract_all)
1247               {
1248                 lex_pos_ty pos;
1249
1250                 pos.file_name = logical_file_name;
1251                 pos.line_number = op->line_number_at_start;
1252                 remember_a_message (mlp, NULL, string_of_object (op),
1253                                     null_context, &pos, NULL, savable_comment);
1254               }
1255             last_non_comment_line = line_number;
1256             return;
1257           }
1258
1259         case '0': case '1': case '2': case '3': case '4':
1260         case '5': case '6': case '7': case '8': case '9':
1261         case '+': case '-': case '.':
1262           /* Read a number or symbol token.  */
1263           op->token = XMALLOC (struct token);
1264           read_token (op->token, c);
1265           if (op->token->charcount == 1 && op->token->chars[0] == '.')
1266             {
1267               free_token (op->token);
1268               free (op->token);
1269               op->type = t_dot;
1270             }
1271           else if (is_number (op->token))
1272             {
1273               /* A number.  */
1274               free_token (op->token);
1275               free (op->token);
1276               op->type = t_other;
1277             }
1278           else
1279             {
1280               /* A symbol.  */
1281               op->type = t_symbol;
1282             }
1283           last_non_comment_line = line_number;
1284           return;
1285
1286         case ':':
1287         default:
1288           /* Read a symbol token.  */
1289           op->token = XMALLOC (struct token);
1290           read_token (op->token, c);
1291           op->type = t_symbol;
1292           last_non_comment_line = line_number;
1293           return;
1294         }
1295     }
1296 }
1297
1298
1299 void
1300 extract_scheme (FILE *f,
1301                 const char *real_filename, const char *logical_filename,
1302                 flag_context_list_table_ty *flag_table,
1303                 msgdomain_list_ty *mdlp)
1304 {
1305   mlp = mdlp->item[0]->messages;
1306
1307   fp = f;
1308   real_file_name = real_filename;
1309   logical_file_name = xstrdup (logical_filename);
1310   line_number = 1;
1311
1312   last_comment_line = -1;
1313   last_non_comment_line = -1;
1314
1315   flag_context_list_table = flag_table;
1316
1317   init_keywords ();
1318
1319   /* Eat tokens until eof is seen.  When read_object returns
1320      due to an unbalanced closing parenthesis, just restart it.  */
1321   do
1322     {
1323       struct object toplevel_object;
1324
1325       read_object (&toplevel_object, null_context);
1326
1327       if (toplevel_object.type == t_eof)
1328         break;
1329
1330       free_object (&toplevel_object);
1331     }
1332   while (!feof (fp));
1333
1334   /* Close scanner.  */
1335   fp = NULL;
1336   real_file_name = NULL;
1337   logical_file_name = NULL;
1338   line_number = 0;
1339 }