locale/programs/linereader.c

   1 /* Copyright (C) 1996, 1997, 1998 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <libintl.h>
  27 #include <stdarg.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30
  31 #include "error.h"
  32 #include "linereader.h"
  33 #include "charset.h"
  34 #include "stringtrans.h"
  35
  36
  37 void *xmalloc (size_t __n);
  38 void *xrealloc (void *__p, size_t __n);
  39 char *xstrdup (const char *__str);
  40
  41
  42 static struct token *get_toplvl_escape (struct linereader *lr);
  43 static struct token *get_symname (struct linereader *lr);
  44 static struct token *get_ident (struct linereader *lr);
  45 static struct token *get_string (struct linereader *lr,
  46                                  const struct charset_t *charset);
  47
  48
  49 struct linereader *
  50 lr_open (const char *fname, kw_hash_fct_t hf)
  51 {
  52   FILE *fp;
  53   struct linereader *result;
  54   int n;
  55
  56   if (fname == NULL || strcmp (fname, "-") == 0
  57       || strcmp (fname, "/dev/stdin") == 0)
  58     fp = stdin;
  59   else
  60     {
  61       fp = fopen (fname, "r");
  62       if (fp == NULL)
  63         return NULL;
  64     }
  65
  66   result = (struct linereader *) xmalloc (sizeof (*result));
  67
  68   result->fp = fp;
  69   result->fname = xstrdup (fname ? : "<stdin>");
  70   result->buf = NULL;
  71   result->bufsize = 0;
  72   result->lineno = 1;
  73   result->idx = 0;
  74   result->comment_char = '#';
  75   result->escape_char = '\\';
  76   result->translate_strings = 1;
  77
  78   n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
  79   if (n < 0)
  80     {
  81       int save = errno;
  82       fclose (result->fp);
  83       free ((char *) result->fname);
  84       free (result);
  85       errno = save;
  86       return NULL;
  87     }
  88
  89   if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
  90     n -= 2;
  91
  92   result->buf[n] = '\0';
  93   result->bufact = n;
  94   result->hash_fct = hf;
  95
  96   return result;
  97 }
  98
  99
 100 int
 101 lr_eof (struct linereader *lr)
 102 {
 103   return lr->bufact = 0;
 104 }
 105
 106
 107 void
 108 lr_close (struct linereader *lr)
 109 {
 110   fclose (lr->fp);
 111   free (lr->buf);
 112   free (lr);
 113 }
 114
 115
 116 int
 117 lr_next (struct linereader *lr)
 118 {
 119   int n;
 120
 121   n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
 122   if (n < 0)
 123     return -1;
 124
 125   ++lr->lineno;
 126
 127   if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
 128     {
 129       /* An escaped newline character is substituted with a single <SP>.  */
 130       --n;
 131       lr->buf[n - 1] = ' ';
 132     }
 133
 134   lr->buf[n] = '\0';
 135   lr->bufact = n;
 136   lr->idx = 0;
 137
 138   return 0;
 139 }
 140
 141
 142 /* Defined in error.c.  */
 143 /* This variable is incremented each time `error' is called.  */
 144 extern unsigned int error_message_count;
 145
 146 /* The calling program should define program_name and set it to the
 147    name of the executing program.  */
 148 extern char *program_name;
 149
 150
 151 struct token *
 152 lr_token (struct linereader *lr, const struct charset_t *charset)
 153 {
 154   int ch;
 155
 156   while (1)
 157     {
 158       do
 159         {
 160           ch = lr_getc (lr);
 161
 162           if (ch == EOF)
 163             {
 164               lr->token.tok = tok_eof;
 165               return &lr->token;
 166             };
 167
 168           if (ch == '\n')
 169             {
 170               lr->token.tok = tok_eol;
 171               return &lr->token;
 172             }
 173         }
 174       while (isspace (ch));
 175
 176       if (ch == EOF)
 177         {
 178           lr->token.tok = tok_eof;
 179           return &lr->token;
 180         };
 181
 182       if (ch != lr->comment_char)
 183         break;
 184
 185       /* Ignore rest of line.  */
 186       lr_ignore_rest (lr, 0);
 187       lr->token.tok = tok_eol;
 188       return &lr->token;
 189     }
 190
 191   /* Match escape sequences.  */
 192   if (ch == lr->escape_char)
 193     return get_toplvl_escape (lr);
 194
 195   /* Match ellipsis.  */
 196   if (ch == '.' && strncmp (&lr->buf[lr->idx], "..", 2) == 0)
 197     {
 198       lr_getc (lr);
 199       lr_getc (lr);
 200       lr->token.tok = tok_ellipsis;
 201       return &lr->token;
 202     }
 203
 204   switch (ch)
 205     {
 206     case '<':
 207       return get_symname (lr);
 208
 209     case '0' ... '9':
 210       lr->token.tok = tok_number;
 211       lr->token.val.num = ch - '0';
 212
 213       while (isdigit (ch = lr_getc (lr)))
 214         {
 215           lr->token.val.num *= 10;
 216           lr->token.val.num += ch - '0';
 217         }
 218       if (isalpha (ch))
 219         lr_error (lr, _("garbage at end of number"));
 220       lr_ungetn (lr, 1);
 221
 222       return &lr->token;
 223
 224     case ';':
 225       lr->token.tok = tok_semicolon;
 226       return &lr->token;
 227
 228     case ',':
 229       lr->token.tok = tok_comma;
 230       return &lr->token;
 231
 232     case '(':
 233       lr->token.tok = tok_open_brace;
 234       return &lr->token;
 235
 236     case ')':
 237       lr->token.tok = tok_close_brace;
 238       return &lr->token;
 239
 240     case '"':
 241       return get_string (lr, charset);
 242
 243     case '-':
 244       ch = lr_getc (lr);
 245       if (ch == '1')
 246         {
 247           lr->token.tok = tok_minus1;
 248           return &lr->token;
 249         }
 250       lr_ungetn (lr, 2);
 251       break;
 252     }
 253
 254   return get_ident (lr);
 255 }
 256
 257
 258 static struct token *
 259 get_toplvl_escape (struct linereader *lr)
 260 {
 261   /* This is supposed to be a numeric value.  We return the
 262      numerical value and the number of bytes.  */
 263   size_t start_idx = lr->idx - 1;
 264   unsigned int value = 0;
 265   int nbytes = 0;
 266   int ch;
 267
 268   do
 269     {
 270       unsigned int byte = 0;
 271       unsigned int base = 8;
 272
 273       ch = lr_getc (lr);
 274
 275       if (ch == 'd')
 276         {
 277           base = 10;
 278           ch = lr_getc (lr);
 279         }
 280       else if (ch == 'x')
 281         {
 282           base = 16;
 283           ch = lr_getc (lr);
 284         }
 285
 286       if ((base == 16 && !isxdigit (ch))
 287           || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
 288         {
 289         esc_error:
 290           lr->token.val.str.start = &lr->buf[start_idx];
 291
 292           while (ch != EOF && !isspace (ch))
 293             ch = lr_getc (lr);
 294           lr->token.val.str.len = lr->idx - start_idx;
 295
 296           lr->token.tok = tok_error;
 297           return &lr->token;
 298         }
 299
 300       if (isdigit (ch))
 301         byte = ch - '0';
 302       else
 303         byte = tolower (ch) - 'a' + 10;
 304
 305       ch = lr_getc (lr);
 306       if ((base == 16 && !isxdigit (ch))
 307           || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
 308         goto esc_error;
 309
 310       byte *= base;
 311       if (isdigit (ch))
 312         byte += ch - '0';
 313       else
 314         byte += tolower (ch) - 'a' + 10;
 315
 316       ch = lr_getc (lr);
 317       if (base != 16 && isdigit (ch))
 318         {
 319           byte *= base;
 320           byte += ch - '0';
 321
 322           ch = lr_getc (lr);
 323         }
 324
 325       value *= 256;
 326       value += byte;
 327
 328       ++nbytes;
 329     }
 330   while (ch == lr->escape_char && nbytes < 4);
 331
 332   if (!isspace (ch))
 333     lr_error (lr, _("garbage at end of character code specification"));
 334
 335   lr_ungetn (lr, 1);
 336
 337   lr->token.tok = tok_charcode;
 338   lr->token.val.charcode.val = value;
 339   lr->token.val.charcode.nbytes = nbytes;
 340
 341   return &lr->token;
 342 }
 343
 344
/* Append the character CH to a growing buffer.

   The macro relies on three variables in the scope where it is used:
   `buf' (the buffer), `bufact' (number of bytes in use) and `bufmax'
   (allocated size).  When the buffer is full its size is doubled with
   xrealloc before CH is stored.  CH is evaluated exactly once.

   Note that room is only made for CH itself: after the macro
   `bufact' may be equal to `bufmax', so storing anything at
   buf[bufact] requires growing the buffer first.  */
#define ADDC(ch)                                                            \
  do                                                                        \
    {                                                                       \
      if (bufact == bufmax)                                                 \
        {                                                                   \
          bufmax *= 2;                                                      \
          buf = xrealloc (buf, bufmax);                                     \
        }                                                                   \
      buf[bufact++] = (ch);                                                 \
    }                                                                       \
  while (0)
 356
 357
 358 static struct token *
 359 get_symname (struct linereader *lr)
 360 {
 361   /* Symbol in brackets.  We must distinguish three kinds:
 362      1. reserved words
 363      2. ISO 10646 position values
 364      3. all other.  */
 365   char *buf;
 366   size_t bufact = 0;
 367   size_t bufmax = 56;
 368   const struct keyword_t *kw;
 369   int ch;
 370
 371   buf = (char *) xmalloc (bufmax);
 372
 373   do
 374     {
 375       ch = lr_getc (lr);
 376       if (ch == lr->escape_char)
 377         {
 378           int c2 = lr_getc (lr);
 379           ADDC (c2);
 380
 381           if (c2 == '\n')
 382             ch = '\n';
 383         }
 384       else
 385         ADDC (ch);
 386     }
 387   while (ch != '>' && ch != '\n');
 388
 389   if (ch == '\n')
 390     lr_error (lr, _("unterminated symbolic name"));
 391
 392   /* Test for ISO 10646 position value.  */
 393   if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
 394     {
 395       char *cp = buf + 1;
 396       while (cp < &buf[bufact - 1] && isxdigit (*cp))
 397         ++cp;
 398
 399       if (cp == &buf[bufact - 1])
 400         {
 401           /* Yes, it is.  */
 402           lr->token.tok = bufact == 6 ? tok_ucs2 : tok_ucs4;
 403           lr->token.val.charcode.val = strtoul (buf, NULL, 16);
 404           lr->token.val.charcode.nbytes = lr->token.tok == tok_ucs2 ? 2 : 4;
 405
 406           return &lr->token;
 407         }
 408     }
 409
 410   /* It is a symbolic name.  Test for reserved words.  */
 411   kw = lr->hash_fct (buf, bufact - 1);
 412
 413   if (kw != NULL && kw->symname_or_ident == 1)
 414     {
 415       lr->token.tok = kw->token;
 416       free (buf);
 417     }
 418   else
 419     {
 420       lr->token.tok = tok_bsymbol;
 421
 422       buf[bufact] = '\0';
 423       buf = xrealloc (buf, bufact + 1);
 424
 425       lr->token.val.str.start = buf;
 426       lr->token.val.str.len = bufact - 1;
 427     }
 428
 429   return &lr->token;
 430 }
 431
 432
 433 static struct token *
 434 get_ident (struct linereader *lr)
 435 {
 436   char *buf;
 437   size_t bufact;
 438   size_t bufmax = 56;
 439   const struct keyword_t *kw;
 440   int ch;
 441
 442   buf = xmalloc (bufmax);
 443   bufact = 0;
 444
 445   ADDC (lr->buf[lr->idx - 1]);
 446
 447   while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
 448          && ch != '<' && ch != ',')
 449     /* XXX Handle escape sequences?  */
 450     ADDC (ch);
 451
 452   lr_ungetn (lr, 1);
 453
 454   kw = lr->hash_fct (buf, bufact);
 455
 456   if (kw != NULL && kw->symname_or_ident == 0)
 457     {
 458       lr->token.tok = kw->token;
 459       free (buf);
 460     }
 461   else
 462     {
 463       lr->token.tok = tok_ident;
 464
 465       buf[bufact] = '\0';
 466       buf = xrealloc (buf, bufact + 1);
 467
 468       lr->token.val.str.start = buf;
 469       lr->token.val.str.len = bufact;
 470     }
 471
 472   return &lr->token;
 473 }
 474
 475
 476 static struct token *
 477 get_string (struct linereader *lr, const struct charset_t *charset)
 478 {
 479   int illegal_string = 0;
 480   char *buf, *cp;
 481   size_t bufact;
 482   size_t bufmax = 56;
 483   int ch;
 484
 485   buf = xmalloc (bufmax);
 486   bufact = 0;
 487
 488   while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
 489     if (ch != '<' || charset == NULL)
 490       {
 491         if (ch == lr->escape_char)
 492           {
 493             ch = lr_getc (lr);
 494             if (ch == '\n' || ch == EOF)
 495               break;
 496           }
 497         ADDC (ch);
 498       }
 499     else
 500       {
 501         /* We have to get the value of the symbol.  */
 502         unsigned int value;
 503         size_t startidx = bufact;
 504
 505         if (!lr->translate_strings)
 506           ADDC ('<');
 507
 508         while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
 509           {
 510             if (ch == lr->escape_char)
 511               {
 512                 ch = lr_getc (lr);
 513                 if (ch == '\n' || ch == EOF)
 514                   break;
 515               }
 516             ADDC (ch);
 517           }
 518
 519         if (ch == '\n' || ch == EOF)
 520           lr_error (lr, _("unterminated string"));
 521         else
 522           if (!lr->translate_strings)
 523             ADDC ('>');
 524
 525         if (lr->translate_strings)
 526           {
 527             value = charset_find_value (&charset->char_table, &buf[startidx],
 528                                         bufact - startidx);
 529             if ((wchar_t) value == ILLEGAL_CHAR_VALUE)
 530               illegal_string = 1;
 531             bufact = startidx;
 532
 533             if (bufmax - bufact < 8)
 534               {
 535                 bufmax *= 2;
 536                 buf = (char *) xrealloc (buf, bufmax);
 537               }
 538
 539             cp = &buf[bufact];
 540             if (encode_char (value, &cp))
 541               illegal_string = 1;
 542
 543             bufact = cp - buf;
 544           }
 545       }
 546
 547   /* Catch errors with trailing escape character.  */
 548   if (bufact > 0 && buf[bufact - 1] == lr->escape_char
 549       && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
 550     {
 551       lr_error (lr, _("illegal escape sequence at end of string"));
 552       --bufact;
 553     }
 554   else if (ch == '\n' || ch == EOF)
 555     lr_error (lr, _("unterminated string"));
 556
 557   /* Terminate string if necessary.  */
 558   if (lr->translate_strings)
 559     {
 560       cp = &buf[bufact];
 561       if (encode_char (0, &cp))
 562         illegal_string = 1;
 563
 564       bufact = cp - buf;
 565     }
 566   else
 567     ADDC ('\0');
 568
 569   lr->token.tok = tok_string;
 570
 571   if (illegal_string)
 572     {
 573       free (buf);
 574       lr->token.val.str.start = NULL;
 575       lr->token.val.str.len = 0;
 576     }
 577   else
 578     {
 579       buf = xrealloc (buf, bufact + 1);
 580
 581       lr->token.val.str.start = buf;
 582       lr->token.val.str.len = bufact;
 583     }
 584
 585   return &lr->token;
 586 }