src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering, meyering@cs.utexas.edu.  */
  19
  20 #include <config.h>
  21
  22 /* Get isblank from GNU libc.  */
  23 #define _GNU_SOURCE
  24
  25 #include <stdio.h>
  26 #define NDEBUG 1
  27 #include <assert.h>
  28 #include <errno.h>
  29 #include <sys/types.h>
  30 #include <getopt.h>
  31
  32 #if HAVE_LIMITS_H
  33 # include <limits.h>
  34 #endif
  35
  36 #include "system.h"
  37 #include "error.h"
  38
  39 #ifndef ULONG_MAX
  40 #define ULONG_MAX ((unsigned long) ~(unsigned long) 0)
  41 #endif
  42
  43 #ifndef LONG_MAX
  44 #define LONG_MAX ((long int) (ULONG_MAX >> 1))
  45 #endif
  46
  47 #ifndef UINT_MAX
  48 # define UINT_MAX ((unsigned int) ~(unsigned int) 0)
  49 #endif
  50
  51 #ifndef INT_MAX
  52 # define INT_MAX ((int) (UINT_MAX >> 1))
  53 #endif
  54
  55 #ifndef UCHAR_MAX
  56 #define UCHAR_MAX 0xFF
  57 #endif
  58
  59 #define N_CHARS (UCHAR_MAX + 1)
  60
  61 /* A pointer to a function that returns an int.  */
  62 typedef int (*PFI) ();
  63
  64 /* Convert from character C to its index in the collating
  65    sequence array.  Just cast to an unsigned int to avoid
  66    problems with sign-extension.  */
  67 #define ORD(c) (unsigned int)(c)
  68
  69 /* The inverse of ORD.  */
  70 #define CHR(i) (unsigned char)(i)
  71
  72 /* The value for Spec_list->state that indicates to
  73    get_next that it should initialize the tail pointer.
  74    Its value should be as large as possible to avoid conflict
  75    a valid value for the state field -- and that may be as
  76    large as any valid repeat_count.  */
  77 #define BEGIN_STATE (INT_MAX - 1)
  78
  79 /* The value for Spec_list->state that indicates to
  80    get_next that the element pointed to by Spec_list->tail is
  81    being considered for the first time on this pass through the
  82    list -- it indicates that get_next should make any necessary
  83    initializations.  */
  84 #define NEW_ELEMENT (BEGIN_STATE + 1)
  85
  86 /* A value distinct from any character that may have been stored in a
  87    buffer as the result of a block-read in the function squeeze_filter.  */
  88 #define NOT_A_CHAR (unsigned int)(-1)
  89
  90 /* The following (but not CC_NO_CLASS) are indices into the array of
  91    valid character class strings.  */
  92 enum Char_class
  93   {
  94     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  95     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  96     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  97     CC_NO_CLASS = 9999
  98   };
  99
 100 /* Character class to which a character (returned by get_next) belonged;
 101    but it is set only if the construct from which the character was obtained
 102    was one of the character classes [:upper:] or [:lower:].  The value
 103    is used only when translating and then, only to make sure that upper
 104    and lower class constructs have the same relative positions in string1
 105    and string2.  */
 106 enum Upper_Lower_class
 107   {
 108     UL_LOWER = 0,
 109     UL_UPPER = 1,
 110     UL_NONE = 2
 111   };
 112
 113 /* A shortcut to ensure that when constructing the translation array,
 114    one of the values returned by paired calls to get_next (from s1 and s2)
 115    is from [:upper:] and the other is from [:lower:], or neither is from
 116    upper or lower.  In fact, no other character classes are allowed when
 117    translating, but that condition is tested elsewhere.  This array is
 118    indexed by values of type enum Upper_Lower_class.  */
 119 static int const class_ok[3][3] =
 120 {
 121   {0, 1, 0},
 122   {1, 0, 0},
 123   {0, 0, 1}
 124 };
 125
 126 /* The type of a List_element.  See build_spec_list for more details.  */
 127 enum Range_element_type
 128   {
 129     RE_NO_TYPE = 0,
 130     RE_NORMAL_CHAR,
 131     RE_RANGE,
 132     RE_CHAR_CLASS,
 133     RE_EQUIV_CLASS,
 134     RE_REPEATED_CHAR
 135   };
 136
 137 /* One construct in one of tr's argument strings.
 138    For example, consider the POSIX version of the classic tr command:
 139        tr -cs 'a-zA-Z_' '[\n*]'
 140    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 141    and a single normal character, `_'.  String2 has one construct.  */
 142 struct List_element
 143   {
 144     enum Range_element_type type;
 145     struct List_element *next;
 146     union
 147       {
 148         int normal_char;
 149         struct                  /* unnamed */
 150           {
 151             unsigned int first_char;
 152             unsigned int last_char;
 153           }
 154         range;
 155         enum Char_class char_class;
 156         int equiv_code;
 157         struct                  /* unnamed */
 158           {
 159             unsigned int the_repeated_char;
 160             size_t repeat_count;
 161           }
 162         repeated_char;
 163       }
 164     u;
 165   };
 166
 167 /* Each of tr's argument strings is parsed into a form that is easier
 168    to work with: a linked list of constructs (struct List_element).
 169    Each Spec_list structure also encapsulates various attributes of
 170    the corresponding argument string.  The attributes are used mainly
 171    to verify that the strings are valid in the context of any options
 172    specified (like -s, -d, or -c).  The main exception is the member
 173    `tail', which is first used to construct the list.  After construction,
 174    it is used by get_next to save its state when traversing the list.
 175    The member `state' serves a similar function.  */
 176 struct Spec_list
 177   {
 178     /* Points to the head of the list of range elements.
 179        The first struct is a dummy; its members are never used.  */
 180     struct List_element *head;
 181
 182     /* When appending, points to the last element.  When traversing via
 183        get_next(), points to the element to process next.  Setting
 184        Spec_list.state to the value BEGIN_STATE before calling get_next
 185        signals get_next to initialize tail to point to head->next.  */
 186     struct List_element *tail;
 187
 188     /* Used to save state between calls to get_next().  */
 189     unsigned int state;
 190
 191     /* Length, in the sense that length ('a-z[:digit:]123abc')
 192        is 42 ( = 26 + 10 + 6).  */
 193     size_t length;
 194
 195     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 196     int n_indefinite_repeats;
 197
 198     /* If n_indefinite_repeats is nonzero, this points to the List_element
 199        corresponding to the last [c*] or [c*0] construct encountered in
 200        this spec.  Otherwise it is undefined.  */
 201     struct List_element *indefinite_repeat_element;
 202
 203     /* Non-zero if this spec contains at least one equivalence
 204        class construct e.g. [=c=].  */
 205     int has_equiv_class;
 206
 207     /* Non-zero if this spec contains at least one of [:upper:] or
 208        [:lower:] class constructs.  */
 209     int has_upper_or_lower;
 210
 211     /* Non-zero if this spec contains at least one of the character class
 212        constructs (all but upper and lower) that aren't allowed in s2.  */
 213     int has_restricted_char_class;
 214   };
 215
 216 /* A representation for escaped string1 or string2.  As a string is parsed,
 217    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 218    etc.) are marked as such in this structure by setting the corresponding
 219    entry in the ESCAPED vector.  */
 220 struct E_string
 221 {
 222   unsigned char *s;
 223   int *escaped;
 224   size_t len;
 225 };
 226
 227 /* Return nonzero if the Ith character of escaped string ES matches C
 228    and is not escaped itself.  */
 229 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 230
 231
 232 char *xmalloc ();
 233 char *stpcpy ();
 234 int safe_read ();
 235
 236 /* The name by which this program was run.  */
 237 char *program_name;
 238
 239 /* When nonzero, each sequence in the input of a repeated character
 240    (call it c) is replaced (in the output) by a single occurrence of c
 241    for every c in the squeeze set.  */
 242 static int squeeze_repeats = 0;
 243
 244 /* When nonzero, removes characters in the delete set from input.  */
 245 static int delete = 0;
 246
 247 /* Use the complement of set1 in place of set1.  */
 248 static int complement = 0;
 249
 250 /* When nonzero, this flag causes GNU tr to provide strict
 251    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 252    says that when -d is used without -s, string2 (if present)
 253    must be ignored.  Silently ignoring arguments is a bad idea.
 254    The default GNU behavior is to give a usage message and exit.
 255    Additionally, when this flag is nonzero, tr prints warnings
 256    on stderr if it is being used in a manner that is not portable.
 257    Applicable warnings are given by default, but are suppressed
 258    if the environment variable `POSIXLY_CORRECT' is set, since
 259    being POSIX conformant means we can't issue such messages.
 260    Warnings on the following topics are suppressed when this
 261    variable is nonzero:
 262    1. Ambiguous octal escapes.  */
 263 static int posix_pedantic;
 264
 265 /* When tr is performing translation and string1 is longer than string2,
 266    POSIX says that the result is undefined.  That gives the implementor
 267    of a POSIX conforming version of tr two reasonable choices for the
 268    semantics of this case.
 269
 270    * The BSD tr pads string2 to the length of string1 by
 271    repeating the last character in string2.
 272
 273    * System V tr ignores characters in string1 that have no
 274    corresponding character in string2.  That is, string1 is effectively
 275    truncated to the length of string2.
 276
 277    When nonzero, this flag causes GNU tr to imitate the behavior
 278    of System V tr when translating with string1 longer than string2.
 279    The default is to emulate BSD tr.  This flag is ignored in modes where
 280    no translation is performed.  Emulating the System V tr
 281    in this exceptional case causes the relatively common BSD idiom:
 282
 283        tr -cs A-Za-z0-9 '\012'
 284
 285    to break (it would convert only zero bytes, rather than all
 286    non-alphanumerics, to newlines).
 287
 288    WARNING: This switch does not provide general BSD or System V
 289    compatibility.  For example, it doesn't disable the interpretation
 290    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 291    some unfortunate coincidence you use such constructs in scripts
 292    expecting to use some other version of tr, the scripts will break.  */
 293 static int truncate_set1 = 0;
 294
 295 /* An alias for (!delete && non_option_args == 2).
 296    It is set in main and used there and in validate().  */
 297 static int translating;
 298
 299 #ifndef BUFSIZ
 300 #define BUFSIZ 8192
 301 #endif
 302
 303 #define IO_BUF_SIZE BUFSIZ
 304 static unsigned char io_buf[IO_BUF_SIZE];
 305
 306 static char const *const char_class_name[] =
 307 {
 308   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 309   "lower", "print", "punct", "space", "upper", "xdigit"
 310 };
 311 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 312
 313 typedef char SET_TYPE;
 314
 315 /* Array of boolean values.  A character `c' is a member of the
 316    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 317    set is defined by the last (possibly, the only) string argument
 318    on the command line when the squeeze option is given.  */
 319 static SET_TYPE in_squeeze_set[N_CHARS];
 320
 321 /* Array of boolean values.  A character `c' is a member of the
 322    delete set if and only if in_delete_set[c] is true.  The delete
 323    set is defined by the first (or only) string argument on the
 324    command line when the delete option is given.  */
 325 static SET_TYPE in_delete_set[N_CHARS];
 326
 327 /* Array of character values defining the translation (if any) that
 328    tr is to perform.  Translation is performed only when there are
 329    two specification strings and the delete switch is not given.  */
 330 static char xlate[N_CHARS];
 331
 332 /* If nonzero, display usage information and exit.  */
 333 static int show_help;
 334
 335 /* If nonzero, print the version on standard output then exit.  */
 336 static int show_version;
 337
 338 static struct option const long_options[] =
 339 {
 340   {"complement", no_argument, NULL, 'c'},
 341   {"delete", no_argument, NULL, 'd'},
 342   {"squeeze-repeats", no_argument, NULL, 's'},
 343   {"truncate-set1", no_argument, NULL, 't'},
 344   {"help", no_argument, &show_help, 1},
 345   {"version", no_argument, &show_version, 1},
 346   {NULL, 0, NULL, 0}
 347 };
 348 \f
 349 static void
 350 usage (int status)
 351 {
 352   if (status != 0)
 353     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 354              program_name);
 355   else
 356     {
 357       printf (_("\
 358 Usage: %s [OPTION]... SET1 [SET2]\n\
 359 "),
 360               program_name);
 361       printf (_("\
 362 Translate, squeeze, and/or delete characters from standard input,\n\
 363 writing to standard output.\n\
 364 \n\
 365   -c, --complement        first complement SET1\n\
 366   -d, --delete            delete characters in SET1, do not translate\n\
 367   -s, --squeeze-repeats   replace sequence of characters with one\n\
 368   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 369       --help              display this help and exit\n\
 370       --version           output version information and exit\n\
 371 "));
 372       printf (_("\
 373 \n\
 374 SETs are specified as strings of characters.  Most represent themselves.\n\
 375 Interpreted sequences are:\n\
 376 \n\
 377   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 378   \\\\              backslash\n\
 379   \\a              audible BEL\n\
 380   \\b              backspace\n\
 381   \\f              form feed\n\
 382   \\n              new line\n\
 383   \\r              return\n\
 384   \\t              horizontal tab\n\
 385   \\v              vertical tab\n\
 386   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 387   [CHAR1-CHAR2]   same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
 388   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 389   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 390   [:alnum:]       all letters and digits\n\
 391   [:alpha:]       all letters\n\
 392   [:blank:]       all horizontal whitespace\n\
 393   [:cntrl:]       all control characters\n\
 394   [:digit:]       all digits\n\
 395   [:graph:]       all printable characters, not including space\n\
 396   [:lower:]       all lower case letters\n\
 397   [:print:]       all printable characters, including space\n\
 398   [:punct:]       all punctuation characters\n\
 399   [:space:]       all horizontal or vertical whitespace\n\
 400   [:upper:]       all upper case letters\n\
 401   [:xdigit:]      all hexadecimal digits\n\
 402   [=CHAR=]        all characters which are equivalent to CHAR\n\
 403 "));
 404       printf (_("\
 405 \n\
 406 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 407 -t may be used only when translating.  SET2 is extended to length of\n\
 408 SET1 by repeating its last character as necessary.  Excess characters\n\
 409 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 410 expand in ascending order; used in SET2 while translating, they may\n\
 411 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 412 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 413 translation or deletion.\n\
 414 "));
 415     }
 416   exit (status);
 417 }
 418
 419 /* Return nonzero if the character C is a member of the
 420    equivalence class containing the character EQUIV_CLASS.  */
 421
 422 static int
 423 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
 424 {
 425   return (equiv_class == c);
 426 }
 427
 428 /* Return nonzero if the character C is a member of the
 429    character class CHAR_CLASS.  */
 430
 431 static int
 432 is_char_class_member (enum Char_class char_class, unsigned int c)
 433 {
 434   int result;
 435
 436   switch (char_class)
 437     {
 438     case CC_ALNUM:
 439       result = ISALNUM (c);
 440       break;
 441     case CC_ALPHA:
 442       result = ISALPHA (c);
 443       break;
 444     case CC_BLANK:
 445       result = ISBLANK (c);
 446       break;
 447     case CC_CNTRL:
 448       result = ISCNTRL (c);
 449       break;
 450     case CC_DIGIT:
 451       result = ISDIGIT (c);
 452       break;
 453     case CC_GRAPH:
 454       result = ISGRAPH (c);
 455       break;
 456     case CC_LOWER:
 457       result = ISLOWER (c);
 458       break;
 459     case CC_PRINT:
 460       result = ISPRINT (c);
 461       break;
 462     case CC_PUNCT:
 463       result = ISPUNCT (c);
 464       break;
 465     case CC_SPACE:
 466       result = ISSPACE (c);
 467       break;
 468     case CC_UPPER:
 469       result = ISUPPER (c);
 470       break;
 471     case CC_XDIGIT:
 472       result = ISXDIGIT (c);
 473       break;
 474     default:
 475       abort ();
 476       break;
 477     }
 478   return result;
 479 }
 480
 481 /* Perform the first pass over each range-spec argument S, converting all
 482    \c and \ddd escapes to their one-byte representations.  The conversion
 483    is done in-place, so S must point to writable storage.  If an invalid
 484    quote sequence is found print an error message and return nonzero.
 485    Otherwise set *LEN to the length of the resulting string and return
 486    zero.  The resulting array of characters may contain zero-bytes;
 487    however, on input, S is assumed to be null-terminated, and hence
 488    cannot contain actual (non-escaped) zero bytes.  */
 489
 490 static int
 491 unquote (const unsigned char *s, struct E_string *es)
 492 {
 493   size_t i, j;
 494   size_t len;
 495
 496   len = strlen ((char *) s);
 497
 498   es->s = (unsigned char *) xmalloc (len);
 499   es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
 500   for (i = 0; i < len; i++)
 501     es->escaped[i] = 0;
 502
 503   j = 0;
 504   for (i = 0; s[i]; i++)
 505     {
 506       switch (s[i])
 507         {
 508           int c;
 509         case '\\':
 510           switch (s[i + 1])
 511             {
 512               int oct_digit;
 513             case '\\':
 514               c = '\\';
 515               break;
 516             case 'a':
 517               c = '\007';
 518               break;
 519             case 'b':
 520               c = '\b';
 521               break;
 522             case 'f':
 523               c = '\f';
 524               break;
 525             case 'n':
 526               c = '\n';
 527               break;
 528             case 'r':
 529               c = '\r';
 530               break;
 531             case 't':
 532               c = '\t';
 533               break;
 534             case 'v':
 535               c = '\v';
 536               break;
 537             case '0':
 538             case '1':
 539             case '2':
 540             case '3':
 541             case '4':
 542             case '5':
 543             case '6':
 544             case '7':
 545               c = s[i + 1] - '0';
 546               oct_digit = s[i + 2] - '0';
 547               if (0 <= oct_digit && oct_digit <= 7)
 548                 {
 549                   c = 8 * c + oct_digit;
 550                   ++i;
 551                   oct_digit = s[i + 2] - '0';
 552                   if (0 <= oct_digit && oct_digit <= 7)
 553                     {
 554                       if (8 * c + oct_digit < N_CHARS)
 555                         {
 556                           c = 8 * c + oct_digit;
 557                           ++i;
 558                         }
 559                       else if (!posix_pedantic)
 560                         {
 561                           /* Any octal number larger than 0377 won't
 562                              fit in 8 bits.  So we stop when adding the
 563                              next digit would put us over the limit and
 564                              give a warning about the ambiguity.  POSIX
 565                              isn't clear on this, but one person has said
 566                              that in his interpretation, POSIX says tr
 567                              can't even give a warning.  */
 568                           error (0, 0, _("warning: the ambiguous octal escape \
 569 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 570                                  s[i], s[i + 1], s[i + 2],
 571                                  s[i], s[i + 1], s[i + 2]);
 572                         }
 573                     }
 574                 }
 575               break;
 576             case '\0':
 577               error (0, 0, _("invalid backslash escape at end of string"));
 578               return 1;
 579
 580             default:
 581               if (posix_pedantic)
 582                 {
 583                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 584                   return 1;
 585                 }
 586               else
 587                 {
 588                   c = s[i + 1];
 589                   es->escaped[j] = 1;
 590                 }
 591             }
 592           ++i;
 593           es->s[j++] = c;
 594           break;
 595         default:
 596           es->s[j++] = s[i];
 597           break;
 598         }
 599     }
 600   es->len = j;
 601   return 0;
 602 }
 603
 604 /* If CLASS_STR is a valid character class string, return its index
 605    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 606
 607 static enum Char_class
 608 look_up_char_class (const unsigned char *class_str, size_t len)
 609 {
 610   unsigned int i;
 611
 612   for (i = 0; i < N_CHAR_CLASSES; i++)
 613     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 614         && strlen (char_class_name[i]) == len)
 615       return (enum Char_class) i;
 616   return CC_NO_CLASS;
 617 }
 618
 619 /* Return a newly allocated string with a printable version of C.
 620    This function is used solely for formatting error messages.  */
 621
 622 static char *
 623 make_printable_char (unsigned int c)
 624 {
 625   char *buf = xmalloc (5);
 626
 627   assert (c < N_CHARS);
 628   if (ISPRINT (c))
 629     {
 630       buf[0] = c;
 631       buf[1] = '\0';
 632     }
 633   else
 634     {
 635       sprintf (buf, "\\%03o", c);
 636     }
 637   return buf;
 638 }
 639
 640 /* Return a newly allocated copy of S which is suitable for printing.
 641    LEN is the number of characters in S.  Most non-printing
 642    (isprint) characters are represented by a backslash followed by
 643    3 octal digits.  However, the characters represented by \c escapes
 644    where c is one of [abfnrtv] are represented by their 2-character \c
 645    sequences.  This function is used solely for printing error messages.  */
 646
 647 static char *
 648 make_printable_str (const unsigned char *s, size_t len)
 649 {
 650   /* Worst case is that every character expands to a backslash
 651      followed by a 3-character octal escape sequence.  */
 652   char *printable_buf = xmalloc (4 * len + 1);
 653   char *p = printable_buf;
 654   size_t i;
 655
 656   for (i = 0; i < len; i++)
 657     {
 658       char buf[5];
 659       char *tmp = NULL;
 660
 661       switch (s[i])
 662         {
 663         case '\\':
 664           tmp = "\\";
 665           break;
 666         case '\007':
 667           tmp = "\\a";
 668           break;
 669         case '\b':
 670           tmp = "\\b";
 671           break;
 672         case '\f':
 673           tmp = "\\f";
 674           break;
 675         case '\n':
 676           tmp = "\\n";
 677           break;
 678         case '\r':
 679           tmp = "\\r";
 680           break;
 681         case '\t':
 682           tmp = "\\t";
 683           break;
 684         case '\v':
 685           tmp = "\\v";
 686           break;
 687         default:
 688           if (ISPRINT (s[i]))
 689             {
 690               buf[0] = s[i];
 691               buf[1] = '\0';
 692             }
 693           else
 694             sprintf (buf, "\\%03o", s[i]);
 695           tmp = buf;
 696           break;
 697         }
 698       p = stpcpy (p, tmp);
 699     }
 700   return printable_buf;
 701 }
 702
 703 /* Append a newly allocated structure representing a
 704    character C to the specification list LIST.  */
 705
 706 static void
 707 append_normal_char (struct Spec_list *list, unsigned int c)
 708 {
 709   struct List_element *new;
 710
 711   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 712   new->next = NULL;
 713   new->type = RE_NORMAL_CHAR;
 714   new->u.normal_char = c;
 715   assert (list->tail);
 716   list->tail->next = new;
 717   list->tail = new;
 718 }
 719
 720 /* Append a newly allocated structure representing the range
 721    of characters from FIRST to LAST to the specification list LIST.
 722    Return nonzero if LAST precedes FIRST in the collating sequence,
 723    zero otherwise.  This means that '[c-c]' is acceptable.  */
 724
 725 static int
 726 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 727 {
 728   struct List_element *new;
 729
 730   if (ORD (first) > ORD (last))
 731     {
 732       char *tmp1 = make_printable_char (first);
 733       char *tmp2 = make_printable_char (last);
 734
 735       error (0, 0,
 736        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 737              tmp1, tmp2);
 738       free (tmp1);
 739       free (tmp2);
 740       return 1;
 741     }
 742   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 743   new->next = NULL;
 744   new->type = RE_RANGE;
 745   new->u.range.first_char = first;
 746   new->u.range.last_char = last;
 747   assert (list->tail);
 748   list->tail->next = new;
 749   list->tail = new;
 750   return 0;
 751 }
 752
 753 /* If CHAR_CLASS_STR is a valid character class string, append a
 754    newly allocated structure representing that character class to the end
 755    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 756    a valid string return nonzero.  */
 757
 758 static int
 759 append_char_class (struct Spec_list *list,
 760                    const unsigned char *char_class_str, size_t len)
 761 {
 762   enum Char_class char_class;
 763   struct List_element *new;
 764
 765   char_class = look_up_char_class (char_class_str, len);
 766   if (char_class == CC_NO_CLASS)
 767     return 1;
 768   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 769   new->next = NULL;
 770   new->type = RE_CHAR_CLASS;
 771   new->u.char_class = char_class;
 772   assert (list->tail);
 773   list->tail->next = new;
 774   list->tail = new;
 775   return 0;
 776 }
 777
 778 /* Append a newly allocated structure representing a [c*n]
 779    repeated character construct to the specification list LIST.
 780    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 781    is a non-negative repeat count.  */
 782
 783 static void
 784 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 785                       size_t repeat_count)
 786 {
 787   struct List_element *new;
 788
 789   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 790   new->next = NULL;
 791   new->type = RE_REPEATED_CHAR;
 792   new->u.repeated_char.the_repeated_char = the_char;
 793   new->u.repeated_char.repeat_count = repeat_count;
 794   assert (list->tail);
 795   list->tail->next = new;
 796   list->tail = new;
 797 }
 798
 799 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 800    the length of that string, LEN, if LEN is exactly one, append
 801    a newly allocated structure representing the specified
 802    equivalence class to the specification list, LIST and return zero.
 803    If LEN is not 1, return nonzero.  */
 804
 805 static int
 806 append_equiv_class (struct Spec_list *list,
 807                     const unsigned char *equiv_class_str, size_t len)
 808 {
 809   struct List_element *new;
 810
 811   if (len != 1)
 812     return 1;
 813   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 814   new->next = NULL;
 815   new->type = RE_EQUIV_CLASS;
 816   new->u.equiv_code = *equiv_class_str;
 817   assert (list->tail);
 818   list->tail->next = new;
 819   list->tail = new;
 820   return 0;
 821 }
 822
 823 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 824    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 825    NUL bytes, and is *not* NUL-terminated.  */
 826
 827 static unsigned char *
 828 substr (const unsigned char *p, size_t first_idx, size_t last_idx)
 829 {
 830   size_t len;
 831   unsigned char *tmp;
 832
 833   assert (first_idx <= last_idx);
 834   len = last_idx - first_idx + 1;
 835   tmp = (unsigned char *) xmalloc (len);
 836
 837   assert (first_idx <= last_idx);
 838   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 839   memcpy (tmp, p + first_idx, len);
 840   return tmp;
 841 }
 842
 843 /* Search forward starting at START_IDX for the 2-char sequence
 844    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 845    a sequence is found, set *RESULT_IDX to the index of the first
 846    character and return nonzero. Otherwise return zero.  P may contain
 847    zero bytes.  */
 848
 849 static int
 850 find_closing_delim (const struct E_string *es, size_t start_idx,
 851                     unsigned int pre_bracket_char, size_t *result_idx)
 852 {
 853   size_t i;
 854
 855   for (i = start_idx; i < es->len - 1; i++)
 856     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 857         && !es->escaped[i] && !es->escaped[i + 1])
 858       {
 859         *result_idx = i;
 860         return 1;
 861       }
 862   return 0;
 863 }
 864
 865 /* Convert a string S with explicit length LEN, possibly
 866    containing embedded zero bytes, to a long integer value.
 867    If the string represents a negative value, a value larger
 868    than LONG_MAX, or if all LEN characters do not represent a
 869    valid integer, return nonzero and do not modify *VAL.
 870    Otherwise, return zero and set *VAL to the converted value.  */
 871
 872 static int
 873 non_neg_strtol (const unsigned char *s, size_t len, size_t *val)
 874 {
 875   size_t i;
 876   unsigned long sum = 0;
 877   unsigned int base;
 878
 879   if (len <= 0)
 880     return 1;
 881   if (s[0] == '0')
 882     base = 8;
 883   else if (ISDIGIT (s[0]))
 884     base = 10;
 885   else
 886     return 1;
 887
 888   for (i = 0; i < len; i++)
 889     {
 890       unsigned int c;
 891
 892       if (s[i] < '0')
 893         return 1;
 894
 895       c = s[i] - '0';
 896       if (c >= base)
 897         return 1;
 898
 899       if (sum > (LONG_MAX - c) / base)
 900         return 1;
 901       sum = sum * base + c;
 902     }
 903   *val = sum;
 904   return 0;
 905 }
 906
 907 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 908    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 909    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 910    and return zero. If the second character following
 911    the opening bracket is not `*' or if no closing bracket can be
 912    found, return -1.  If a closing bracket is found and the
 913    second char is `*', but the string between the `*' and `]' isn't
 914    empty, an octal number, or a decimal number, print an error message
 915    and return -2.  */
 916
 917 static int
 918 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 919                        unsigned int *char_to_repeat, size_t *repeat_count,
 920                        size_t *closing_bracket_idx)
 921 {
 922   size_t i;
 923
 924   assert (start_idx + 1 < es->len);
 925   if (!ES_MATCH (es, start_idx + 1, '*'))
 926     return -1;
 927
 928   for (i = start_idx + 2; i < es->len; i++)
 929     {
 930       if (ES_MATCH (es, i, ']'))
 931         {
 932           const unsigned char *digit_str;
 933           size_t digit_str_len = i - start_idx - 2;
 934
 935           *char_to_repeat = es->s[start_idx];
 936           if (digit_str_len == 0)
 937             {
 938               /* We've matched [c*] -- no explicit repeat count.  */
 939               *repeat_count = 0;
 940               *closing_bracket_idx = i;
 941               return 0;
 942             }
 943
 944           /* Here, we have found [c*s] where s should be a string
 945              of octal or decimal digits.  */
 946           digit_str = &es->s[start_idx + 2];
 947           if (non_neg_strtol (digit_str, digit_str_len, repeat_count)
 948               || *repeat_count > BEGIN_STATE)
 949             {
 950               char *tmp = make_printable_str (digit_str, digit_str_len);
 951               error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 952                      tmp);
 953               free (tmp);
 954               return -2;
 955             }
 956           *closing_bracket_idx = i;
 957           return 0;
 958         }
 959     }
 960   return -1;                    /* No bracket found.  */
 961 }
 962
 963 /* Return nonzero if the string at ES->s[IDX] matches the regular
 964    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 965    the `]' must not be escaped.  */
 966
 967 static int
 968 star_digits_closebracket (const struct E_string *es, size_t idx)
 969 {
 970   size_t i;
 971
 972   if (!ES_MATCH (es, idx, '*'))
 973     return 0;
 974
 975   for (i = idx + 1; i < es->len; i++)
 976     {
 977       if (!ISDIGIT (es->s[i]))
 978         {
 979           if (ES_MATCH (es, i, ']'))
 980             return 1;
 981           return 0;
 982         }
 983     }
 984   return 0;
 985 }
 986
 987 /* Convert string UNESACPED_STRING (which has been preprocessed to
 988    convert backslash-escape sequences) of length LEN characters into
 989    a linked list of the following 5 types of constructs:
 990       - [:str:] Character class where `str' is one of the 12 valid strings.
 991       - [=c=] Equivalence class where `c' is any single character.
 992       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 993           However, if `n' is present, it must be a non-negative octal or
 994           decimal integer.
 995       - r-s Range of characters from `r' to `s'.  The second endpoint must
 996           not precede the first in the current collating sequence.
 997       - c Any other character is interpreted as itself.  */
 998
 999 static int
1000 build_spec_list (const struct E_string *es, struct Spec_list *result)
1001 {
1002   const unsigned char *p;
1003   size_t i;
1004
1005   p = es->s;
1006
1007   /* The main for-loop below recognizes the 4 multi-character constructs.
1008      A character that matches (in its context) none of the multi-character
1009      constructs is classified as `normal'.  Since all multi-character
1010      constructs have at least 3 characters, any strings of length 2 or
1011      less are composed solely of normal characters.  Hence, the index of
1012      the outer for-loop runs only as far as LEN-2.  */
1013
1014   for (i = 0; i + 2 < es->len; /* empty */)
1015     {
1016       if (ES_MATCH (es, i, '['))
1017         {
1018           int matched_multi_char_construct;
1019           size_t closing_bracket_idx;
1020           unsigned int char_to_repeat;
1021           size_t repeat_count;
1022           int err;
1023
1024           matched_multi_char_construct = 1;
1025           if (ES_MATCH (es, i + 1, ':')
1026               || ES_MATCH (es, i + 1, '='))
1027             {
1028               size_t closing_delim_idx;
1029               int found;
1030
1031               found = find_closing_delim (es, i + 2, p[i + 1],
1032                                           &closing_delim_idx);
1033               if (found)
1034                 {
1035                   int parse_failed;
1036                   unsigned char *opnd_str = substr (p, i + 2,
1037                                                     closing_delim_idx - 1);
1038                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
1039
1040                   if (p[i + 1] == ':')
1041                     {
1042                       parse_failed = append_char_class (result, opnd_str,
1043                                                         opnd_str_len);
1044
1045                       /* FIXME: big comment.  */
1046                       if (parse_failed)
1047                         {
1048                           if (star_digits_closebracket (es, i + 2))
1049                             {
1050                               free (opnd_str);
1051                               goto try_bracketed_repeat;
1052                             }
1053                           else
1054                             {
1055                               char *tmp = make_printable_str (opnd_str,
1056                                                               opnd_str_len);
1057                               error (0, 0, _("invalid character class `%s'"),
1058                                      tmp);
1059                               free (tmp);
1060                               return 1;
1061                             }
1062                         }
1063                     }
1064                   else
1065                     {
1066                       parse_failed = append_equiv_class (result, opnd_str,
1067                                                          opnd_str_len);
1068
1069                       /* FIXME: big comment.  */
1070                       if (parse_failed)
1071                         {
1072                           if (star_digits_closebracket (es, i + 2))
1073                             {
1074                               free (opnd_str);
1075                               goto try_bracketed_repeat;
1076                             }
1077                           else
1078                             {
1079                               char *tmp = make_printable_str (opnd_str,
1080                                                               opnd_str_len);
1081                               error (0, 0,
1082                _("%s: equivalence class operand must be a single character"),
1083                                      tmp);
1084                               free (tmp);
1085                               return 1;
1086                             }
1087                         }
1088                     }
1089                   free (opnd_str);
1090
1091                   /* Return nonzero if append_*_class reports a problem.  */
1092                   if (parse_failed)
1093                     return 1;
1094                   else
1095                     i = closing_delim_idx + 2;
1096                   continue;
1097                 }
1098               /* Else fall through.  This could be [:*] or [=*].  */
1099             }
1100
1101         try_bracketed_repeat:
1102
1103           /* Determine whether this is a bracketed repeat range
1104              matching the RE \[.\*(dec_or_oct_number)?\].  */
1105           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1106                                        &repeat_count,
1107                                        &closing_bracket_idx);
1108           if (err == 0)
1109             {
1110               append_repeated_char (result, char_to_repeat, repeat_count);
1111               i = closing_bracket_idx + 1;
1112             }
1113           else if (err == -1)
1114             {
1115               matched_multi_char_construct = 0;
1116             }
1117           else
1118             {
1119               /* Found a string that looked like [c*n] but the
1120                  numeric part was invalid.  */
1121               return 1;
1122             }
1123
1124           if (matched_multi_char_construct)
1125             continue;
1126
1127           /* We reach this point if P does not match [:str:], [=c=],
1128              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1129              (from `[' to `c').  */
1130         }
1131
1132       /* Look ahead one char for ranges like a-z.  */
1133       if (ES_MATCH (es, i + 1, '-'))
1134         {
1135           if (append_range (result, p[i], p[i + 2]))
1136             return 1;
1137           i += 3;
1138         }
1139       else
1140         {
1141           append_normal_char (result, p[i]);
1142           ++i;
1143         }
1144     }
1145
1146   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1147   for (; i < es->len; i++)
1148     append_normal_char (result, p[i]);
1149
1150   return 0;
1151 }
1152
1153 /* Given a Spec_list S (with its saved state implicit in the values
1154    of its members `tail' and `state'), return the next single character
1155    in the expansion of S's constructs.  If the last character of S was
1156    returned on the previous call or if S was empty, this function
1157    returns -1.  For example, successive calls to get_next where S
1158    represents the spec-string 'a-d[y*3]' will return the sequence
1159    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1160    which the returned character comes is [:upper:] or [:lower:], the
1161    parameter CLASS is given a value to indicate which it was.  Otherwise
1162    CLASS is set to UL_NONE.  This value is used only when constructing
1163    the translation table to verify that any occurrences of upper and
1164    lower class constructs in the spec-strings appear in the same relative
1165    positions.  */
1166
1167 static int
1168 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1169 {
1170   struct List_element *p;
1171   int return_val;
1172   int i;
1173
1174   if (class)
1175     *class = UL_NONE;
1176
1177   if (s->state == BEGIN_STATE)
1178     {
1179       s->tail = s->head->next;
1180       s->state = NEW_ELEMENT;
1181     }
1182
1183   p = s->tail;
1184   if (p == NULL)
1185     return -1;
1186
1187   switch (p->type)
1188     {
1189     case RE_NORMAL_CHAR:
1190       return_val = p->u.normal_char;
1191       s->state = NEW_ELEMENT;
1192       s->tail = p->next;
1193       break;
1194
1195     case RE_RANGE:
1196       if (s->state == NEW_ELEMENT)
1197         s->state = ORD (p->u.range.first_char);
1198       else
1199         ++(s->state);
1200       return_val = CHR (s->state);
1201       if (s->state == ORD (p->u.range.last_char))
1202         {
1203           s->tail = p->next;
1204           s->state = NEW_ELEMENT;
1205         }
1206       break;
1207
1208     case RE_CHAR_CLASS:
1209       if (s->state == NEW_ELEMENT)
1210         {
1211           for (i = 0; i < N_CHARS; i++)
1212             if (is_char_class_member (p->u.char_class, i))
1213               break;
1214           assert (i < N_CHARS);
1215           s->state = i;
1216         }
1217       assert (is_char_class_member (p->u.char_class, s->state));
1218       return_val = CHR (s->state);
1219       for (i = s->state + 1; i < N_CHARS; i++)
1220         if (is_char_class_member (p->u.char_class, i))
1221           break;
1222       if (i < N_CHARS)
1223         s->state = i;
1224       else
1225         {
1226           s->tail = p->next;
1227           s->state = NEW_ELEMENT;
1228         }
1229       if (class)
1230         {
1231           switch (p->u.char_class)
1232             {
1233             case CC_LOWER:
1234               *class = UL_LOWER;
1235               break;
1236             case CC_UPPER:
1237               *class = UL_UPPER;
1238               break;
1239             default:
1240               /* empty */
1241               break;
1242             }
1243         }
1244       break;
1245
1246     case RE_EQUIV_CLASS:
1247       /* FIXME: this assumes that each character is alone in its own
1248          equivalence class (which appears to be correct for my
1249          LC_COLLATE.  But I don't know of any function that allows
1250          one to determine a character's equivalence class.  */
1251
1252       return_val = p->u.equiv_code;
1253       s->state = NEW_ELEMENT;
1254       s->tail = p->next;
1255       break;
1256
1257     case RE_REPEATED_CHAR:
1258       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1259       if (p->u.repeated_char.repeat_count == 0)
1260         {
1261           s->tail = p->next;
1262           s->state = NEW_ELEMENT;
1263           return_val = get_next (s, class);
1264         }
1265       else
1266         {
1267           if (s->state == NEW_ELEMENT)
1268             {
1269               s->state = 0;
1270             }
1271           ++(s->state);
1272           return_val = p->u.repeated_char.the_repeated_char;
1273           if (p->u.repeated_char.repeat_count > 0
1274               && s->state == p->u.repeated_char.repeat_count)
1275             {
1276               s->tail = p->next;
1277               s->state = NEW_ELEMENT;
1278             }
1279         }
1280       break;
1281
1282     case RE_NO_TYPE:
1283       abort ();
1284       break;
1285
1286     default:
1287       abort ();
1288       break;
1289     }
1290
1291   return return_val;
1292 }
1293
1294 /* This is a minor kludge.  This function is called from
1295    get_spec_stats to determine the cardinality of a set derived
1296    from a complemented string.  It's a kludge in that some of the
1297    same operations are (duplicated) performed in set_initialize.  */
1298
1299 static int
1300 card_of_complement (struct Spec_list *s)
1301 {
1302   int c;
1303   int cardinality = N_CHARS;
1304   SET_TYPE in_set[N_CHARS];
1305
1306   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1307   s->state = BEGIN_STATE;
1308   while ((c = get_next (s, NULL)) != -1)
1309     if (!in_set[c]++)
1310       --cardinality;
1311   return cardinality;
1312 }
1313
1314 /* Gather statistics about the spec-list S in preparation for the tests
1315    in validate that determine the consistency of the specs.  This function
1316    is called at most twice; once for string1, and again for any string2.
1317    LEN_S1 < 0 indicates that this is the first call and that S represents
1318    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1319    constructs in string1, and we can use its value to resolve any
1320    indefinite repeat construct in S (which represents string2).  Hence,
1321    this function has the side-effect that it converts a valid [c*]
1322    construct in string2 to [c*n] where n is large enough (or 0) to give
1323    string2 the same length as string1.  For example, with the command
1324    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1325    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1326
1327 static void
1328 get_spec_stats (struct Spec_list *s)
1329 {
1330   struct List_element *p;
1331   int len = 0;
1332
1333   s->n_indefinite_repeats = 0;
1334   s->has_equiv_class = 0;
1335   s->has_restricted_char_class = 0;
1336   s->has_upper_or_lower = 0;
1337   for (p = s->head->next; p; p = p->next)
1338     {
1339       switch (p->type)
1340         {
1341           int i;
1342         case RE_NORMAL_CHAR:
1343           ++len;
1344           break;
1345
1346         case RE_RANGE:
1347           assert (p->u.range.last_char >= p->u.range.first_char);
1348           len += p->u.range.last_char - p->u.range.first_char + 1;
1349           break;
1350
1351         case RE_CHAR_CLASS:
1352           for (i = 0; i < N_CHARS; i++)
1353             if (is_char_class_member (p->u.char_class, i))
1354               ++len;
1355           switch (p->u.char_class)
1356             {
1357             case CC_UPPER:
1358             case CC_LOWER:
1359               s->has_upper_or_lower = 1;
1360               break;
1361             default:
1362               s->has_restricted_char_class = 1;
1363               break;
1364             }
1365           break;
1366
1367         case RE_EQUIV_CLASS:
1368           for (i = 0; i < N_CHARS; i++)
1369             if (is_equiv_class_member (p->u.equiv_code, i))
1370               ++len;
1371           s->has_equiv_class = 1;
1372           break;
1373
1374         case RE_REPEATED_CHAR:
1375           if (p->u.repeated_char.repeat_count > 0)
1376             len += p->u.repeated_char.repeat_count;
1377           else if (p->u.repeated_char.repeat_count == 0)
1378             {
1379               s->indefinite_repeat_element = p;
1380               ++(s->n_indefinite_repeats);
1381             }
1382           break;
1383
1384         case RE_NO_TYPE:
1385           assert (0);
1386           break;
1387         }
1388     }
1389
1390   s->length = len;
1391 }
1392
1393 static void
1394 get_s1_spec_stats (struct Spec_list *s1)
1395 {
1396   get_spec_stats (s1);
1397   if (complement)
1398     s1->length = card_of_complement (s1);
1399 }
1400
1401 static void
1402 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1403 {
1404   get_spec_stats (s2);
1405   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1406     {
1407       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1408         len_s1 - s2->length;
1409       s2->length = len_s1;
1410     }
1411 }
1412
1413 static void
1414 spec_init (struct Spec_list *spec_list)
1415 {
1416   spec_list->head = spec_list->tail =
1417     (struct List_element *) xmalloc (sizeof (struct List_element));
1418   spec_list->head->next = NULL;
1419 }
1420
1421 /* This function makes two passes over the argument string S.  The first
1422    one converts all \c and \ddd escapes to their one-byte representations.
1423    The second constructs a linked specification list, SPEC_LIST, of the
1424    characters and constructs that comprise the argument string.  If either
1425    of these passes detects an error, this function returns nonzero.  */
1426
1427 static int
1428 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1429 {
1430   struct E_string es;
1431
1432   if (unquote (s, &es))
1433     return 1;
1434   if (build_spec_list (&es, spec_list))
1435     return 1;
1436   return 0;
1437 }
1438
1439 /* Given two specification lists, S1 and S2, and assuming that
1440    S1->length > S2->length, append a single [c*n] element to S2 where c
1441    is the last character in the expansion of S2 and n is the difference
1442    between the two lengths.
1443    Upon successful completion, S2->length is set to S1->length.  The only
1444    way this function can fail to make S2 as long as S1 is when S2 has
1445    zero-length, since in that case, there is no last character to repeat.
1446    So S2->length is required to be at least 1.
1447
1448    Providing this functionality allows the user to do some pretty
1449    non-BSD (and non-portable) things:  For example, the command
1450        tr -cs '[:upper:]0-9' '[:lower:]'
1451    is almost guaranteed to give results that depend on your collating
1452    sequence.  */
1453
1454 static void
1455 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1456 {
1457   struct List_element *p;
1458   int char_to_repeat;
1459   int i;
1460
1461   assert (translating);
1462   assert (s1->length > s2->length);
1463   assert (s2->length > 0);
1464
1465   p = s2->tail;
1466   switch (p->type)
1467     {
1468     case RE_NORMAL_CHAR:
1469       char_to_repeat = p->u.normal_char;
1470       break;
1471     case RE_RANGE:
1472       char_to_repeat = p->u.range.last_char;
1473       break;
1474     case RE_CHAR_CLASS:
1475       for (i = N_CHARS; i >= 0; i--)
1476         if (is_char_class_member (p->u.char_class, i))
1477           break;
1478       assert (i >= 0);
1479       char_to_repeat = CHR (i);
1480       break;
1481
1482     case RE_REPEATED_CHAR:
1483       char_to_repeat = p->u.repeated_char.the_repeated_char;
1484       break;
1485
1486     case RE_EQUIV_CLASS:
1487       /* This shouldn't happen, because validate exits with an error
1488          if it finds an equiv class in string2 when translating.  */
1489       abort ();
1490       break;
1491
1492     case RE_NO_TYPE:
1493       abort ();
1494       break;
1495
1496     default:
1497       abort ();
1498       break;
1499     }
1500
1501   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1502   s2->length = s1->length;
1503 }
1504
1505 /* Die with an error message if S1 and S2 describe strings that
1506    are not valid with the given command line switches.
1507    A side effect of this function is that if a valid [c*] or
1508    [c*0] construct appears in string2, it is converted to [c*n]
1509    with a value for n that makes s2->length == s1->length.  By
1510    the same token, if the --truncate-set1 option is not
1511    given, S2 may be extended.  */
1512
1513 static void
1514 validate (struct Spec_list *s1, struct Spec_list *s2)
1515 {
1516   get_s1_spec_stats (s1);
1517   if (s1->n_indefinite_repeats > 0)
1518     {
1519       error (1, 0, _("the [c*] repeat construct may not appear in string1"));
1520     }
1521
1522   /* FIXME: it isn't clear from the POSIX spec that this is invalid,
1523      but in the spirit of the other restrictions put on translation
1524      with character classes, this seems a logical interpretation.  */
1525   if (complement && s1->has_upper_or_lower)
1526     {
1527       error (1, 0,
1528              _("character classes may not be used when translating \
1529 and complementing"));
1530     }
1531
1532   if (s2)
1533     {
1534       get_s2_spec_stats (s2, s1->length);
1535       if (s2->has_restricted_char_class)
1536         {
1537           error (1, 0,
1538                  _("when translating, the only character classes that may \
1539 appear in\n\tstring2 are `upper' and `lower'"));
1540         }
1541
1542       if (s2->n_indefinite_repeats > 1)
1543         {
1544           error (1, 0,
1545                  _("only one [c*] repeat construct may appear in string2"));
1546         }
1547
1548       if (translating)
1549         {
1550           if (s2->has_equiv_class)
1551             {
1552               error (1, 0,
1553                      _("[=c=] expressions may not appear in string2 \
1554 when translating"));
1555             }
1556
1557           if (s1->length > s2->length)
1558             {
1559               if (!truncate_set1)
1560                 {
1561                   /* string2 must be non-empty unless --truncate-set1 is
1562                      given or string1 is empty.  */
1563
1564                   if (s2->length == 0)
1565                     error (1, 0,
1566                      _("when not truncating set1, string2 must be non-empty"));
1567                   string2_extend (s1, s2);
1568                 }
1569             }
1570
1571           if (complement && s2->has_upper_or_lower)
1572             error (1, 0,
1573                    _("character classes may not be used when translating \
1574 and complementing"));
1575         }
1576       else
1577         /* Not translating.  */
1578         {
1579           if (s2->n_indefinite_repeats > 0)
1580             error (1, 0,
1581                    _("the [c*] construct may appear in string2 only \
1582 when translating"));
1583         }
1584     }
1585 }
1586
1587 /* Read buffers of SIZE bytes via the function READER (if READER is
1588    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1589    read_and_delete or read_and_xlate.  After each buffer is read, it is
1590    processed and written to stdout.  The buffers are processed so that
1591    multiple consecutive occurrences of the same character in the input
1592    stream are replaced by a single occurrence of that character if the
1593    character is in the squeeze set.  */
1594
1595 static void
1596 squeeze_filter (unsigned char *buf, long int size, PFI reader)
1597 {
1598   unsigned int char_to_squeeze = NOT_A_CHAR;
1599   int i = 0;
1600   int nr = 0;
1601
1602   for (;;)
1603     {
1604       int begin;
1605
1606       if (i >= nr)
1607         {
1608           if (reader == NULL)
1609             nr = safe_read (0, (char *) buf, size);
1610           else
1611             nr = (*reader) (buf, size, NULL);
1612
1613           if (nr < 0)
1614             error (1, errno, _("read error"));
1615           if (nr == 0)
1616             break;
1617           i = 0;
1618         }
1619
1620       begin = i;
1621
1622       if (char_to_squeeze == NOT_A_CHAR)
1623         {
1624           int out_len;
1625           /* Here, by being a little tricky, we can get a significant
1626              performance increase in most cases when the input is
1627              reasonably large.  Since tr will modify the input only
1628              if two consecutive (and identical) input characters are
1629              in the squeeze set, we can step by two through the data
1630              when searching for a character in the squeeze set.  This
1631              means there may be a little more work in a few cases and
1632              perhaps twice as much work in the worst cases where most
1633              of the input is removed by squeezing repeats.  But most
1634              uses of this functionality seem to remove less than 20-30%
1635              of the input.  */
1636           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1637             ;                   /* empty */
1638
1639           /* There is a special case when i == nr and we've just
1640              skipped a character (the last one in buf) that is in
1641              the squeeze set.  */
1642           if (i == nr && in_squeeze_set[buf[i - 1]])
1643             --i;
1644
1645           if (i >= nr)
1646             out_len = nr - begin;
1647           else
1648             {
1649               char_to_squeeze = buf[i];
1650               /* We're about to output buf[begin..i].  */
1651               out_len = i - begin + 1;
1652
1653               /* But since we stepped by 2 in the loop above,
1654                  out_len may be one too large.  */
1655               if (i > 0 && buf[i - 1] == char_to_squeeze)
1656                 --out_len;
1657
1658               /* Advance i to the index of first character to be
1659                  considered when looking for a char different from
1660                  char_to_squeeze.  */
1661               ++i;
1662             }
1663           if (out_len > 0
1664               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1665             error (1, errno, _("write error"));
1666         }
1667
1668       if (char_to_squeeze != NOT_A_CHAR)
1669         {
1670           /* Advance i to index of first char != char_to_squeeze
1671              (or to nr if all the rest of the characters in this
1672              buffer are the same as char_to_squeeze).  */
1673           for (; i < nr && buf[i] == char_to_squeeze; i++)
1674             ;                   /* empty */
1675           if (i < nr)
1676             char_to_squeeze = NOT_A_CHAR;
1677           /* If (i >= nr) we've squeezed the last character in this buffer.
1678              So now we have to read a new buffer and continue comparing
1679              characters against char_to_squeeze.  */
1680         }
1681     }
1682 }
1683
1684 /* Read buffers of SIZE bytes from stdin until one is found that
1685    contains at least one character not in the delete set.  Store
1686    in the array BUF, all characters from that buffer that are not
1687    in the delete set, and return the number of characters saved
1688    or 0 upon EOF.  */
1689
1690 static long
1691 read_and_delete (unsigned char *buf, long int size, PFI not_used)
1692 {
1693   long n_saved;
1694   static int hit_eof = 0;
1695
1696   assert (not_used == NULL);
1697   assert (size > 0);
1698
1699   if (hit_eof)
1700     return 0;
1701
1702   /* This enclosing do-while loop is to make sure that
1703      we don't return zero (indicating EOF) when we've
1704      just deleted all the characters in a buffer.  */
1705   do
1706     {
1707       int i;
1708       int nr = safe_read (0, (char *) buf, size);
1709
1710       if (nr < 0)
1711         error (1, errno, _("read error"));
1712       if (nr == 0)
1713         {
1714           hit_eof = 1;
1715           return 0;
1716         }
1717
1718       /* This first loop may be a waste of code, but gives much
1719          better performance when no characters are deleted in
1720          the beginning of a buffer.  It just avoids the copying
1721          of buf[i] into buf[n_saved] when it would be a NOP.  */
1722
1723       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1724         /* empty */ ;
1725       n_saved = i;
1726
1727       for (++i; i < nr; i++)
1728         if (!in_delete_set[buf[i]])
1729           buf[n_saved++] = buf[i];
1730     }
1731   while (n_saved == 0);
1732
1733   return n_saved;
1734 }
1735
1736 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1737    perform the in-place and one-to-one mapping specified by the global
1738    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1739
1740 static long
1741 read_and_xlate (unsigned char *buf, long int size, PFI not_used)
1742 {
1743   long chars_read = 0;
1744   static int hit_eof = 0;
1745   int i;
1746
1747   assert (not_used == NULL);
1748   assert (size > 0);
1749
1750   if (hit_eof)
1751     return 0;
1752
1753   chars_read = safe_read (0, (char *) buf, size);
1754   if (chars_read < 0)
1755     error (1, errno, _("read error"));
1756   if (chars_read == 0)
1757     {
1758       hit_eof = 1;
1759       return 0;
1760     }
1761
1762   for (i = 0; i < chars_read; i++)
1763     buf[i] = xlate[buf[i]];
1764
1765   return chars_read;
1766 }
1767
1768 /* Initialize a boolean membership set IN_SET with the character
1769    values obtained by traversing the linked list of constructs S
1770    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1771    nonzero the resulting set is complemented.  */
1772
1773 static void
1774 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1775 {
1776   int c;
1777   int i;
1778
1779   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1780   s->state = BEGIN_STATE;
1781   while ((c = get_next (s, NULL)) != -1)
1782     in_set[c] = 1;
1783   if (complement_this_set)
1784     for (i = 0; i < N_CHARS; i++)
1785       in_set[i] = (!in_set[i]);
1786 }
1787
1788 void
1789 main (int argc, char **argv)
1790 {
1791   int c;
1792   int non_option_args;
1793   struct Spec_list buf1, buf2;
1794   struct Spec_list *s1 = &buf1;
1795   struct Spec_list *s2 = &buf2;
1796
1797   program_name = argv[0];
1798   setlocale (LC_ALL, "");
1799   bindtextdomain (PACKAGE, LOCALEDIR);
1800   textdomain (PACKAGE);
1801
1802   while ((c = getopt_long (argc, argv, "cdst", long_options,
1803                            (int *) 0)) != EOF)
1804     {
1805       switch (c)
1806         {
1807         case 0:
1808           break;
1809
1810         case 'c':
1811           complement = 1;
1812           break;
1813
1814         case 'd':
1815           delete = 1;
1816           break;
1817
1818         case 's':
1819           squeeze_repeats = 1;
1820           break;
1821
1822         case 't':
1823           truncate_set1 = 1;
1824           break;
1825
1826         default:
1827           usage (2);
1828           break;
1829         }
1830     }
1831
1832   if (show_version)
1833     {
1834       printf ("tr - %s\n", PACKAGE_VERSION);
1835       exit (0);
1836     }
1837
1838   if (show_help)
1839     usage (0);
1840
1841   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1842
1843   non_option_args = argc - optind;
1844   translating = (non_option_args == 2 && !delete);
1845
1846   /* Change this test if it is valid to give tr no options and
1847      no args at all.  POSIX doesn't specifically say anything
1848      either way, but it looks like they implied it's invalid
1849      by omission.  If you want to make tr do a slow imitation
1850      of `cat' use `tr a a'.  */
1851   if (non_option_args > 2)
1852     {
1853       error (0, 0, _("too many arguments"));
1854       usage (2);
1855     }
1856
1857   if (!delete && !squeeze_repeats && non_option_args != 2)
1858     error (1, 0, _("two strings must be given when translating"));
1859
1860   if (delete && squeeze_repeats && non_option_args != 2)
1861     error (1, 0, _("two strings must be given when both \
1862 deleting and squeezing repeats"));
1863
1864   /* If --delete is given without --squeeze-repeats, then
1865      only one string argument may be specified.  But POSIX
1866      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1867      is set, pretend we never saw string2.  But I think
1868      this deserves a fatal error, so that's the default.  */
1869   if ((delete && !squeeze_repeats) && non_option_args != 1)
1870     {
1871       if (posix_pedantic && non_option_args == 2)
1872         --non_option_args;
1873       else
1874         error (1, 0,
1875                _("only one string may be given when deleting \
1876 without squeezing repeats"));
1877     }
1878
1879   if (squeeze_repeats && non_option_args == 0)
1880     error (1, 0,
1881            _("at least one string must be given when squeezing repeats"));
1882
1883   spec_init (s1);
1884   if (parse_str ((unsigned char *) argv[optind], s1))
1885     exit (1);
1886
1887   if (non_option_args == 2)
1888     {
1889       spec_init (s2);
1890       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1891         exit (1);
1892     }
1893   else
1894     s2 = NULL;
1895
1896   validate (s1, s2);
1897
1898   if (squeeze_repeats && non_option_args == 1)
1899     {
1900       set_initialize (s1, complement, in_squeeze_set);
1901       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1902     }
1903   else if (delete && non_option_args == 1)
1904     {
1905       long nr;
1906
1907       set_initialize (s1, complement, in_delete_set);
1908       do
1909         {
1910           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1911           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1912             error (1, errno, _("write error"));
1913         }
1914       while (nr > 0);
1915     }
1916   else if (squeeze_repeats && delete && non_option_args == 2)
1917     {
1918       set_initialize (s1, complement, in_delete_set);
1919       set_initialize (s2, 0, in_squeeze_set);
1920       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1921     }
1922   else if (translating)
1923     {
1924       if (complement)
1925         {
1926           int i;
1927           SET_TYPE *in_s1 = in_delete_set;
1928
1929           set_initialize (s1, 0, in_s1);
1930           s2->state = BEGIN_STATE;
1931           for (i = 0; i < N_CHARS; i++)
1932             xlate[i] = i;
1933           for (i = 0; i < N_CHARS; i++)
1934             {
1935               if (!in_s1[i])
1936                 {
1937                   int ch = get_next (s2, NULL);
1938                   assert (ch != -1 || truncate_set1);
1939                   if (ch == -1)
1940                     {
1941                       /* This will happen when tr is invoked like e.g.
1942                          tr -cs A-Za-z0-9 '\012'.  */
1943                       break;
1944                     }
1945                   xlate[i] = ch;
1946                 }
1947             }
1948           assert (get_next (s2, NULL) == -1 || truncate_set1);
1949         }
1950       else
1951         {
1952           int c1, c2;
1953           int i;
1954           enum Upper_Lower_class class_s1;
1955           enum Upper_Lower_class class_s2;
1956
1957           for (i = 0; i < N_CHARS; i++)
1958             xlate[i] = i;
1959           s1->state = BEGIN_STATE;
1960           s2->state = BEGIN_STATE;
1961           for (;;)
1962             {
1963               c1 = get_next (s1, &class_s1);
1964               c2 = get_next (s2, &class_s2);
1965               if (!class_ok[(int) class_s1][(int) class_s2])
1966                 error (1, 0,
1967                      _("misaligned or mismatched upper and/or lower classes"));
1968               /* The following should have been checked by validate...  */
1969               if (c2 == -1)
1970                 break;
1971               xlate[c1] = c2;
1972             }
1973           assert (c1 == -1 || truncate_set1);
1974         }
1975       if (squeeze_repeats)
1976         {
1977           set_initialize (s2, 0, in_squeeze_set);
1978           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
1979         }
1980       else
1981         {
1982           long chars_read;
1983
1984           do
1985             {
1986               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
1987               if (chars_read > 0
1988                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
1989                 error (1, errno, _("write error"));
1990             }
1991           while (chars_read > 0);
1992         }
1993     }
1994
1995   if (fclose (stdout) == EOF)
1996     error (2, errno, _("write error"));
1997
1998   if (close (0) != 0)
1999     error (2, errno, _("standard input"));
2000
2001   exit (0);
2002 }