src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 95, 96, 1997 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering */
  19
  20 #include <config.h>
  21
  22 /* Get isblank from GNU libc.  */
  23 #define _GNU_SOURCE
  24
  25 #include <stdio.h>
  26 #define NDEBUG 1
  27 #include <assert.h>
  28 #include <errno.h>
  29 #include <sys/types.h>
  30 #include <getopt.h>
  31
  32 #if HAVE_LIMITS_H
  33 # include <limits.h>
  34 #endif
  35
  36 #include "system.h"
  37 #include "error.h"
  38
  39 #ifndef ULONG_MAX
  40 # define ULONG_MAX ((unsigned long) ~(unsigned long) 0)
  41 #endif
  42
  43 #ifndef LONG_MAX
  44 # define LONG_MAX ((long int) (ULONG_MAX >> 1))
  45 #endif
  46
  47 #ifndef UINT_MAX
  48 # define UINT_MAX ((unsigned int) ~(unsigned int) 0)
  49 #endif
  50
  51 #ifndef INT_MAX
  52 # define INT_MAX ((int) (UINT_MAX >> 1))
  53 #endif
  54
  55 #ifndef UCHAR_MAX
  56 # define UCHAR_MAX 0xFF
  57 #endif
  58
  59 #define N_CHARS (UCHAR_MAX + 1)
  60
  61 /* A pointer to a function that returns an int.  */
  62 typedef int (*PFI) ();
  63
  64 /* Convert from character C to its index in the collating
  65    sequence array.  Just cast to an unsigned int to avoid
  66    problems with sign-extension.  */
  67 #define ORD(c) (unsigned int)(c)
  68
  69 /* The inverse of ORD.  */
  70 #define CHR(i) (unsigned char)(i)
  71
  72 /* The value for Spec_list->state that indicates to
  73    get_next that it should initialize the tail pointer.
  74    Its value should be as large as possible to avoid conflict
  75    a valid value for the state field -- and that may be as
  76    large as any valid repeat_count.  */
  77 #define BEGIN_STATE (INT_MAX - 1)
  78
  79 /* The value for Spec_list->state that indicates to
  80    get_next that the element pointed to by Spec_list->tail is
  81    being considered for the first time on this pass through the
  82    list -- it indicates that get_next should make any necessary
  83    initializations.  */
  84 #define NEW_ELEMENT (BEGIN_STATE + 1)
  85
  86 /* A value distinct from any character that may have been stored in a
  87    buffer as the result of a block-read in the function squeeze_filter.  */
  88 #define NOT_A_CHAR (unsigned int)(-1)
  89
  90 /* The following (but not CC_NO_CLASS) are indices into the array of
  91    valid character class strings.  */
  92 enum Char_class
  93   {
  94     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  95     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  96     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  97     CC_NO_CLASS = 9999
  98   };
  99
 100 /* Character class to which a character (returned by get_next) belonged;
 101    but it is set only if the construct from which the character was obtained
 102    was one of the character classes [:upper:] or [:lower:].  The value
 103    is used only when translating and then, only to make sure that upper
 104    and lower class constructs have the same relative positions in string1
 105    and string2.  */
 106 enum Upper_Lower_class
 107   {
 108     UL_LOWER = 0,
 109     UL_UPPER = 1,
 110     UL_NONE = 2
 111   };
 112
 113 /* A shortcut to ensure that when constructing the translation array,
 114    one of the values returned by paired calls to get_next (from s1 and s2)
 115    is from [:upper:] and the other is from [:lower:], or neither is from
 116    upper or lower.  By default, GNU tr permits the identity mappings: from
 117    [:upper:] to [:upper:] and [:lower:] to [:lower:].  But when
 118    POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
 119    by values of type enum Upper_Lower_class.  */
 120 static int const class_ok[3][3] =
 121 {
 122   {1, 1, 0},
 123   {1, 1, 0},
 124   {0, 0, 1}
 125 };
 126
 127 /* The type of a List_element.  See build_spec_list for more details.  */
 128 enum Range_element_type
 129   {
 130     RE_NO_TYPE = 0,
 131     RE_NORMAL_CHAR,
 132     RE_RANGE,
 133     RE_CHAR_CLASS,
 134     RE_EQUIV_CLASS,
 135     RE_REPEATED_CHAR
 136   };
 137
 138 /* One construct in one of tr's argument strings.
 139    For example, consider the POSIX version of the classic tr command:
 140        tr -cs 'a-zA-Z_' '[\n*]'
 141    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 142    and a single normal character, `_'.  String2 has one construct.  */
 143 struct List_element
 144   {
 145     enum Range_element_type type;
 146     struct List_element *next;
 147     union
 148       {
 149         int normal_char;
 150         struct                  /* unnamed */
 151           {
 152             unsigned int first_char;
 153             unsigned int last_char;
 154           }
 155         range;
 156         enum Char_class char_class;
 157         int equiv_code;
 158         struct                  /* unnamed */
 159           {
 160             unsigned int the_repeated_char;
 161             size_t repeat_count;
 162           }
 163         repeated_char;
 164       }
 165     u;
 166   };
 167
 168 /* Each of tr's argument strings is parsed into a form that is easier
 169    to work with: a linked list of constructs (struct List_element).
 170    Each Spec_list structure also encapsulates various attributes of
 171    the corresponding argument string.  The attributes are used mainly
 172    to verify that the strings are valid in the context of any options
 173    specified (like -s, -d, or -c).  The main exception is the member
 174    `tail', which is first used to construct the list.  After construction,
 175    it is used by get_next to save its state when traversing the list.
 176    The member `state' serves a similar function.  */
 177 struct Spec_list
 178   {
 179     /* Points to the head of the list of range elements.
 180        The first struct is a dummy; its members are never used.  */
 181     struct List_element *head;
 182
 183     /* When appending, points to the last element.  When traversing via
 184        get_next(), points to the element to process next.  Setting
 185        Spec_list.state to the value BEGIN_STATE before calling get_next
 186        signals get_next to initialize tail to point to head->next.  */
 187     struct List_element *tail;
 188
 189     /* Used to save state between calls to get_next().  */
 190     unsigned int state;
 191
 192     /* Length, in the sense that length ('a-z[:digit:]123abc')
 193        is 42 ( = 26 + 10 + 6).  */
 194     size_t length;
 195
 196     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 197     int n_indefinite_repeats;
 198
 199     /* If n_indefinite_repeats is nonzero, this points to the List_element
 200        corresponding to the last [c*] or [c*0] construct encountered in
 201        this spec.  Otherwise it is undefined.  */
 202     struct List_element *indefinite_repeat_element;
 203
 204     /* Non-zero if this spec contains at least one equivalence
 205        class construct e.g. [=c=].  */
 206     int has_equiv_class;
 207
 208     /* Non-zero if this spec contains at least one character class
 209        construct.  E.g. [:digit:].  */
 210     int has_char_class;
 211
 212     /* Non-zero if this spec contains at least one of the character class
 213        constructs (all but upper and lower) that aren't allowed in s2.  */
 214     int has_restricted_char_class;
 215   };
 216
 217 /* A representation for escaped string1 or string2.  As a string is parsed,
 218    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 219    etc.) are marked as such in this structure by setting the corresponding
 220    entry in the ESCAPED vector.  */
 221 struct E_string
 222 {
 223   unsigned char *s;
 224   int *escaped;
 225   size_t len;
 226 };
 227
 228 /* Return nonzero if the Ith character of escaped string ES matches C
 229    and is not escaped itself.  */
 230 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 231
 232 char *xmalloc ();
 233 int safe_read ();
 234
 235 /* The name by which this program was run.  */
 236 char *program_name;
 237
 238 /* When nonzero, each sequence in the input of a repeated character
 239    (call it c) is replaced (in the output) by a single occurrence of c
 240    for every c in the squeeze set.  */
 241 static int squeeze_repeats = 0;
 242
 243 /* When nonzero, removes characters in the delete set from input.  */
 244 static int delete = 0;
 245
 246 /* Use the complement of set1 in place of set1.  */
 247 static int complement = 0;
 248
 249 /* When nonzero, this flag causes GNU tr to provide strict
 250    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 251    says that when -d is used without -s, string2 (if present)
 252    must be ignored.  Silently ignoring arguments is a bad idea.
 253    The default GNU behavior is to give a usage message and exit.
 254    Additionally, when this flag is nonzero, tr prints warnings
 255    on stderr if it is being used in a manner that is not portable.
 256    Applicable warnings are given by default, but are suppressed
 257    if the environment variable `POSIXLY_CORRECT' is set, since
 258    being POSIX conformant means we can't issue such messages.
 259    Warnings on the following topics are suppressed when this
 260    variable is nonzero:
 261    1. Ambiguous octal escapes.  */
 262 static int posix_pedantic;
 263
 264 /* When tr is performing translation and string1 is longer than string2,
 265    POSIX says that the result is undefined.  That gives the implementor
 266    of a POSIX conforming version of tr two reasonable choices for the
 267    semantics of this case.
 268
 269    * The BSD tr pads string2 to the length of string1 by
 270    repeating the last character in string2.
 271
 272    * System V tr ignores characters in string1 that have no
 273    corresponding character in string2.  That is, string1 is effectively
 274    truncated to the length of string2.
 275
 276    When nonzero, this flag causes GNU tr to imitate the behavior
 277    of System V tr when translating with string1 longer than string2.
 278    The default is to emulate BSD tr.  This flag is ignored in modes where
 279    no translation is performed.  Emulating the System V tr
 280    in this exceptional case causes the relatively common BSD idiom:
 281
 282        tr -cs A-Za-z0-9 '\012'
 283
 284    to break (it would convert only zero bytes, rather than all
 285    non-alphanumerics, to newlines).
 286
 287    WARNING: This switch does not provide general BSD or System V
 288    compatibility.  For example, it doesn't disable the interpretation
 289    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 290    some unfortunate coincidence you use such constructs in scripts
 291    expecting to use some other version of tr, the scripts will break.  */
 292 static int truncate_set1 = 0;
 293
 294 /* An alias for (!delete && non_option_args == 2).
 295    It is set in main and used there and in validate().  */
 296 static int translating;
 297
 298 #ifndef BUFSIZ
 299 # define BUFSIZ 8192
 300 #endif
 301
 302 #define IO_BUF_SIZE BUFSIZ
 303 static unsigned char io_buf[IO_BUF_SIZE];
 304
 305 static char const *const char_class_name[] =
 306 {
 307   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 308   "lower", "print", "punct", "space", "upper", "xdigit"
 309 };
 310 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 311
 312 typedef char SET_TYPE;
 313
 314 /* Array of boolean values.  A character `c' is a member of the
 315    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 316    set is defined by the last (possibly, the only) string argument
 317    on the command line when the squeeze option is given.  */
 318 static SET_TYPE in_squeeze_set[N_CHARS];
 319
 320 /* Array of boolean values.  A character `c' is a member of the
 321    delete set if and only if in_delete_set[c] is true.  The delete
 322    set is defined by the first (or only) string argument on the
 323    command line when the delete option is given.  */
 324 static SET_TYPE in_delete_set[N_CHARS];
 325
 326 /* Array of character values defining the translation (if any) that
 327    tr is to perform.  Translation is performed only when there are
 328    two specification strings and the delete switch is not given.  */
 329 static char xlate[N_CHARS];
 330
 331 /* If nonzero, display usage information and exit.  */
 332 static int show_help;
 333
 334 /* If nonzero, print the version on standard output then exit.  */
 335 static int show_version;
 336
 337 static struct option const long_options[] =
 338 {
 339   {"complement", no_argument, NULL, 'c'},
 340   {"delete", no_argument, NULL, 'd'},
 341   {"squeeze-repeats", no_argument, NULL, 's'},
 342   {"truncate-set1", no_argument, NULL, 't'},
 343   {"help", no_argument, &show_help, 1},
 344   {"version", no_argument, &show_version, 1},
 345   {NULL, 0, NULL, 0}
 346 };
 347 \f
 348 static void
 349 usage (int status)
 350 {
 351   if (status != 0)
 352     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 353              program_name);
 354   else
 355     {
 356       printf (_("\
 357 Usage: %s [OPTION]... SET1 [SET2]\n\
 358 "),
 359               program_name);
 360       printf (_("\
 361 Translate, squeeze, and/or delete characters from standard input,\n\
 362 writing to standard output.\n\
 363 \n\
 364   -c, --complement        first complement SET1\n\
 365   -d, --delete            delete characters in SET1, do not translate\n\
 366   -s, --squeeze-repeats   replace sequence of characters with one\n\
 367   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 368       --help              display this help and exit\n\
 369       --version           output version information and exit\n\
 370 "));
 371       printf (_("\
 372 \n\
 373 SETs are specified as strings of characters.  Most represent themselves.\n\
 374 Interpreted sequences are:\n\
 375 \n\
 376   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 377   \\\\              backslash\n\
 378   \\a              audible BEL\n\
 379   \\b              backspace\n\
 380   \\f              form feed\n\
 381   \\n              new line\n\
 382   \\r              return\n\
 383   \\t              horizontal tab\n\
 384   \\v              vertical tab\n\
 385   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 386   [CHAR1-CHAR2]   same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
 387   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 388   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 389   [:alnum:]       all letters and digits\n\
 390   [:alpha:]       all letters\n\
 391   [:blank:]       all horizontal whitespace\n\
 392   [:cntrl:]       all control characters\n\
 393   [:digit:]       all digits\n\
 394   [:graph:]       all printable characters, not including space\n\
 395   [:lower:]       all lower case letters\n\
 396   [:print:]       all printable characters, including space\n\
 397   [:punct:]       all punctuation characters\n\
 398   [:space:]       all horizontal or vertical whitespace\n\
 399   [:upper:]       all upper case letters\n\
 400   [:xdigit:]      all hexadecimal digits\n\
 401   [=CHAR=]        all characters which are equivalent to CHAR\n\
 402 "));
 403       printf (_("\
 404 \n\
 405 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 406 -t may be used only when translating.  SET2 is extended to length of\n\
 407 SET1 by repeating its last character as necessary.  Excess characters\n\
 408 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 409 expand in ascending order; used in SET2 while translating, they may\n\
 410 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 411 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 412 translation or deletion.\n\
 413 "));
 414       puts (_("\nReport bugs to <textutils-bugs@gnu.org>."));
 415     }
 416   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 417 }
 418
 419 /* Return nonzero if the character C is a member of the
 420    equivalence class containing the character EQUIV_CLASS.  */
 421
 422 static int
 423 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
 424 {
 425   return (equiv_class == c);
 426 }
 427
 428 /* Return nonzero if the character C is a member of the
 429    character class CHAR_CLASS.  */
 430
 431 static int
 432 is_char_class_member (enum Char_class char_class, unsigned int c)
 433 {
 434   int result;
 435
 436   switch (char_class)
 437     {
 438     case CC_ALNUM:
 439       result = ISALNUM (c);
 440       break;
 441     case CC_ALPHA:
 442       result = ISALPHA (c);
 443       break;
 444     case CC_BLANK:
 445       result = ISBLANK (c);
 446       break;
 447     case CC_CNTRL:
 448       result = ISCNTRL (c);
 449       break;
 450     case CC_DIGIT:
 451       result = ISDIGIT_LOCALE (c);
 452       break;
 453     case CC_GRAPH:
 454       result = ISGRAPH (c);
 455       break;
 456     case CC_LOWER:
 457       result = ISLOWER (c);
 458       break;
 459     case CC_PRINT:
 460       result = ISPRINT (c);
 461       break;
 462     case CC_PUNCT:
 463       result = ISPUNCT (c);
 464       break;
 465     case CC_SPACE:
 466       result = ISSPACE (c);
 467       break;
 468     case CC_UPPER:
 469       result = ISUPPER (c);
 470       break;
 471     case CC_XDIGIT:
 472       result = ISXDIGIT (c);
 473       break;
 474     default:
 475       abort ();
 476       break;
 477     }
 478   return result;
 479 }
 480
 481 static void
 482 es_free (struct E_string *es)
 483 {
 484   free (es->s);
 485   free (es->escaped);
 486 }
 487
 488 /* Perform the first pass over each range-spec argument S, converting all
 489    \c and \ddd escapes to their one-byte representations.  The conversion
 490    is done in-place, so S must point to writable storage.  If an invalid
 491    quote sequence is found print an error message and return nonzero.
 492    Otherwise set *LEN to the length of the resulting string and return
 493    zero.  The resulting array of characters may contain zero-bytes;
 494    however, on input, S is assumed to be null-terminated, and hence
 495    cannot contain actual (non-escaped) zero bytes.  */
 496
 497 static int
 498 unquote (const unsigned char *s, struct E_string *es)
 499 {
 500   size_t i, j;
 501   size_t len;
 502
 503   len = strlen ((char *) s);
 504
 505   es->s = (unsigned char *) xmalloc (len);
 506   es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
 507   for (i = 0; i < len; i++)
 508     es->escaped[i] = 0;
 509
 510   j = 0;
 511   for (i = 0; s[i]; i++)
 512     {
 513       switch (s[i])
 514         {
 515           int c;
 516         case '\\':
 517           switch (s[i + 1])
 518             {
 519               int oct_digit;
 520             case '\\':
 521               c = '\\';
 522               break;
 523             case 'a':
 524               c = '\007';
 525               break;
 526             case 'b':
 527               c = '\b';
 528               break;
 529             case 'f':
 530               c = '\f';
 531               break;
 532             case 'n':
 533               c = '\n';
 534               break;
 535             case 'r':
 536               c = '\r';
 537               break;
 538             case 't':
 539               c = '\t';
 540               break;
 541             case 'v':
 542               c = '\v';
 543               break;
 544             case '0':
 545             case '1':
 546             case '2':
 547             case '3':
 548             case '4':
 549             case '5':
 550             case '6':
 551             case '7':
 552               c = s[i + 1] - '0';
 553               oct_digit = s[i + 2] - '0';
 554               if (0 <= oct_digit && oct_digit <= 7)
 555                 {
 556                   c = 8 * c + oct_digit;
 557                   ++i;
 558                   oct_digit = s[i + 2] - '0';
 559                   if (0 <= oct_digit && oct_digit <= 7)
 560                     {
 561                       if (8 * c + oct_digit < N_CHARS)
 562                         {
 563                           c = 8 * c + oct_digit;
 564                           ++i;
 565                         }
 566                       else if (!posix_pedantic)
 567                         {
 568                           /* Any octal number larger than 0377 won't
 569                              fit in 8 bits.  So we stop when adding the
 570                              next digit would put us over the limit and
 571                              give a warning about the ambiguity.  POSIX
 572                              isn't clear on this, but one person has said
 573                              that in his interpretation, POSIX says tr
 574                              can't even give a warning.  */
 575                           error (0, 0, _("warning: the ambiguous octal escape \
 576 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 577                                  s[i], s[i + 1], s[i + 2],
 578                                  s[i], s[i + 1], s[i + 2]);
 579                         }
 580                     }
 581                 }
 582               break;
 583             case '\0':
 584               error (0, 0, _("invalid backslash escape at end of string"));
 585               return 1;
 586
 587             default:
 588               if (posix_pedantic)
 589                 {
 590                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 591                   return 1;
 592                 }
 593               else
 594                 {
 595                   c = s[i + 1];
 596                   es->escaped[j] = 1;
 597                 }
 598             }
 599           ++i;
 600           es->s[j++] = c;
 601           break;
 602         default:
 603           es->s[j++] = s[i];
 604           break;
 605         }
 606     }
 607   es->len = j;
 608   return 0;
 609 }
 610
 611 /* If CLASS_STR is a valid character class string, return its index
 612    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 613
 614 static enum Char_class
 615 look_up_char_class (const unsigned char *class_str, size_t len)
 616 {
 617   unsigned int i;
 618
 619   for (i = 0; i < N_CHAR_CLASSES; i++)
 620     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 621         && strlen (char_class_name[i]) == len)
 622       return (enum Char_class) i;
 623   return CC_NO_CLASS;
 624 }
 625
 626 /* Return a newly allocated string with a printable version of C.
 627    This function is used solely for formatting error messages.  */
 628
 629 static char *
 630 make_printable_char (unsigned int c)
 631 {
 632   char *buf = xmalloc (5);
 633
 634   assert (c < N_CHARS);
 635   if (ISPRINT (c))
 636     {
 637       buf[0] = c;
 638       buf[1] = '\0';
 639     }
 640   else
 641     {
 642       sprintf (buf, "\\%03o", c);
 643     }
 644   return buf;
 645 }
 646
 647 /* Return a newly allocated copy of S which is suitable for printing.
 648    LEN is the number of characters in S.  Most non-printing
 649    (isprint) characters are represented by a backslash followed by
 650    3 octal digits.  However, the characters represented by \c escapes
 651    where c is one of [abfnrtv] are represented by their 2-character \c
 652    sequences.  This function is used solely for printing error messages.  */
 653
 654 static char *
 655 make_printable_str (const unsigned char *s, size_t len)
 656 {
 657   /* Worst case is that every character expands to a backslash
 658      followed by a 3-character octal escape sequence.  */
 659   char *printable_buf = xmalloc (4 * len + 1);
 660   char *p = printable_buf;
 661   size_t i;
 662
 663   for (i = 0; i < len; i++)
 664     {
 665       char buf[5];
 666       char *tmp = NULL;
 667
 668       switch (s[i])
 669         {
 670         case '\\':
 671           tmp = "\\";
 672           break;
 673         case '\007':
 674           tmp = "\\a";
 675           break;
 676         case '\b':
 677           tmp = "\\b";
 678           break;
 679         case '\f':
 680           tmp = "\\f";
 681           break;
 682         case '\n':
 683           tmp = "\\n";
 684           break;
 685         case '\r':
 686           tmp = "\\r";
 687           break;
 688         case '\t':
 689           tmp = "\\t";
 690           break;
 691         case '\v':
 692           tmp = "\\v";
 693           break;
 694         default:
 695           if (ISPRINT (s[i]))
 696             {
 697               buf[0] = s[i];
 698               buf[1] = '\0';
 699             }
 700           else
 701             sprintf (buf, "\\%03o", s[i]);
 702           tmp = buf;
 703           break;
 704         }
 705       p = stpcpy (p, tmp);
 706     }
 707   return printable_buf;
 708 }
 709
 710 /* Append a newly allocated structure representing a
 711    character C to the specification list LIST.  */
 712
 713 static void
 714 append_normal_char (struct Spec_list *list, unsigned int c)
 715 {
 716   struct List_element *new;
 717
 718   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 719   new->next = NULL;
 720   new->type = RE_NORMAL_CHAR;
 721   new->u.normal_char = c;
 722   assert (list->tail);
 723   list->tail->next = new;
 724   list->tail = new;
 725 }
 726
 727 /* Append a newly allocated structure representing the range
 728    of characters from FIRST to LAST to the specification list LIST.
 729    Return nonzero if LAST precedes FIRST in the collating sequence,
 730    zero otherwise.  This means that '[c-c]' is acceptable.  */
 731
 732 static int
 733 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 734 {
 735   struct List_element *new;
 736
 737   if (ORD (first) > ORD (last))
 738     {
 739       char *tmp1 = make_printable_char (first);
 740       char *tmp2 = make_printable_char (last);
 741
 742       error (0, 0,
 743        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 744              tmp1, tmp2);
 745       free (tmp1);
 746       free (tmp2);
 747       return 1;
 748     }
 749   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 750   new->next = NULL;
 751   new->type = RE_RANGE;
 752   new->u.range.first_char = first;
 753   new->u.range.last_char = last;
 754   assert (list->tail);
 755   list->tail->next = new;
 756   list->tail = new;
 757   return 0;
 758 }
 759
 760 /* If CHAR_CLASS_STR is a valid character class string, append a
 761    newly allocated structure representing that character class to the end
 762    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 763    a valid string return nonzero.  */
 764
 765 static int
 766 append_char_class (struct Spec_list *list,
 767                    const unsigned char *char_class_str, size_t len)
 768 {
 769   enum Char_class char_class;
 770   struct List_element *new;
 771
 772   char_class = look_up_char_class (char_class_str, len);
 773   if (char_class == CC_NO_CLASS)
 774     return 1;
 775   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 776   new->next = NULL;
 777   new->type = RE_CHAR_CLASS;
 778   new->u.char_class = char_class;
 779   assert (list->tail);
 780   list->tail->next = new;
 781   list->tail = new;
 782   return 0;
 783 }
 784
 785 /* Append a newly allocated structure representing a [c*n]
 786    repeated character construct to the specification list LIST.
 787    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 788    is a non-negative repeat count.  */
 789
 790 static void
 791 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 792                       size_t repeat_count)
 793 {
 794   struct List_element *new;
 795
 796   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 797   new->next = NULL;
 798   new->type = RE_REPEATED_CHAR;
 799   new->u.repeated_char.the_repeated_char = the_char;
 800   new->u.repeated_char.repeat_count = repeat_count;
 801   assert (list->tail);
 802   list->tail->next = new;
 803   list->tail = new;
 804 }
 805
 806 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 807    the length of that string, LEN, if LEN is exactly one, append
 808    a newly allocated structure representing the specified
 809    equivalence class to the specification list, LIST and return zero.
 810    If LEN is not 1, return nonzero.  */
 811
 812 static int
 813 append_equiv_class (struct Spec_list *list,
 814                     const unsigned char *equiv_class_str, size_t len)
 815 {
 816   struct List_element *new;
 817
 818   if (len != 1)
 819     return 1;
 820   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 821   new->next = NULL;
 822   new->type = RE_EQUIV_CLASS;
 823   new->u.equiv_code = *equiv_class_str;
 824   assert (list->tail);
 825   list->tail->next = new;
 826   list->tail = new;
 827   return 0;
 828 }
 829
 830 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 831    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 832    NUL bytes, and is *not* NUL-terminated.  */
 833
 834 static unsigned char *
 835 substr (const unsigned char *p, size_t first_idx, size_t last_idx)
 836 {
 837   size_t len;
 838   unsigned char *tmp;
 839
 840   assert (first_idx <= last_idx);
 841   len = last_idx - first_idx + 1;
 842   tmp = (unsigned char *) xmalloc (len);
 843
 844   assert (first_idx <= last_idx);
 845   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 846   memcpy (tmp, p + first_idx, len);
 847   return tmp;
 848 }
 849
 850 /* Search forward starting at START_IDX for the 2-char sequence
 851    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 852    a sequence is found, set *RESULT_IDX to the index of the first
 853    character and return nonzero. Otherwise return zero.  P may contain
 854    zero bytes.  */
 855
 856 static int
 857 find_closing_delim (const struct E_string *es, size_t start_idx,
 858                     unsigned int pre_bracket_char, size_t *result_idx)
 859 {
 860   size_t i;
 861
 862   for (i = start_idx; i < es->len - 1; i++)
 863     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 864         && !es->escaped[i] && !es->escaped[i + 1])
 865       {
 866         *result_idx = i;
 867         return 1;
 868       }
 869   return 0;
 870 }
 871
 872 /* Convert a string S with explicit length LEN, possibly
 873    containing embedded zero bytes, to a long integer value.
 874    If the string represents a negative value, a value larger
 875    than LONG_MAX, or if all LEN characters do not represent a
 876    valid integer, return nonzero and do not modify *VAL.
 877    Otherwise, return zero and set *VAL to the converted value.  */
 878
 879 static int
 880 non_neg_strtol (const unsigned char *s, size_t len, size_t *val)
 881 {
 882   size_t i;
 883   unsigned long sum = 0;
 884   unsigned int base;
 885
 886   if (len <= 0)
 887     return 1;
 888   if (s[0] == '0')
 889     base = 8;
 890   else if (ISDIGIT (s[0]))
 891     base = 10;
 892   else
 893     return 1;
 894
 895   for (i = 0; i < len; i++)
 896     {
 897       unsigned int c;
 898
 899       if (s[i] < '0')
 900         return 1;
 901
 902       c = s[i] - '0';
 903       if (c >= base)
 904         return 1;
 905
 906       if (sum > (LONG_MAX - c) / base)
 907         return 1;
 908       sum = sum * base + c;
 909     }
 910   *val = sum;
 911   return 0;
 912 }
 913
 914 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 915    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 916    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 917    and return zero. If the second character following
 918    the opening bracket is not `*' or if no closing bracket can be
 919    found, return -1.  If a closing bracket is found and the
 920    second char is `*', but the string between the `*' and `]' isn't
 921    empty, an octal number, or a decimal number, print an error message
 922    and return -2.  */
 923
 924 static int
 925 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 926                        unsigned int *char_to_repeat, size_t *repeat_count,
 927                        size_t *closing_bracket_idx)
 928 {
 929   size_t i;
 930
 931   assert (start_idx + 1 < es->len);
 932   if (!ES_MATCH (es, start_idx + 1, '*'))
 933     return -1;
 934
 935   for (i = start_idx + 2; i < es->len; i++)
 936     {
 937       if (ES_MATCH (es, i, ']'))
 938         {
 939           const unsigned char *digit_str;
 940           size_t digit_str_len = i - start_idx - 2;
 941
 942           *char_to_repeat = es->s[start_idx];
 943           if (digit_str_len == 0)
 944             {
 945               /* We've matched [c*] -- no explicit repeat count.  */
 946               *repeat_count = 0;
 947               *closing_bracket_idx = i;
 948               return 0;
 949             }
 950
 951           /* Here, we have found [c*s] where s should be a string
 952              of octal or decimal digits.  */
 953           digit_str = &es->s[start_idx + 2];
 954           if (non_neg_strtol (digit_str, digit_str_len, repeat_count)
 955               || *repeat_count > BEGIN_STATE)
 956             {
 957               char *tmp = make_printable_str (digit_str, digit_str_len);
 958               error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 959                      tmp);
 960               free (tmp);
 961               return -2;
 962             }
 963           *closing_bracket_idx = i;
 964           return 0;
 965         }
 966     }
 967   return -1;                    /* No bracket found.  */
 968 }
 969
 970 /* Return nonzero if the string at ES->s[IDX] matches the regular
 971    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 972    the `]' must not be escaped.  */
 973
 974 static int
 975 star_digits_closebracket (const struct E_string *es, size_t idx)
 976 {
 977   size_t i;
 978
 979   if (!ES_MATCH (es, idx, '*'))
 980     return 0;
 981
 982   for (i = idx + 1; i < es->len; i++)
 983     {
 984       if (!ISDIGIT (es->s[i]))
 985         {
 986           if (ES_MATCH (es, i, ']'))
 987             return 1;
 988           return 0;
 989         }
 990     }
 991   return 0;
 992 }
 993
 994 /* Convert string UNESACPED_STRING (which has been preprocessed to
 995    convert backslash-escape sequences) of length LEN characters into
 996    a linked list of the following 5 types of constructs:
 997       - [:str:] Character class where `str' is one of the 12 valid strings.
 998       - [=c=] Equivalence class where `c' is any single character.
 999       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
1000           However, if `n' is present, it must be a non-negative octal or
1001           decimal integer.
1002       - r-s Range of characters from `r' to `s'.  The second endpoint must
1003           not precede the first in the current collating sequence.
1004       - c Any other character is interpreted as itself.  */
1005
1006 static int
1007 build_spec_list (const struct E_string *es, struct Spec_list *result)
1008 {
1009   const unsigned char *p;
1010   size_t i;
1011
1012   p = es->s;
1013
1014   /* The main for-loop below recognizes the 4 multi-character constructs.
1015      A character that matches (in its context) none of the multi-character
1016      constructs is classified as `normal'.  Since all multi-character
1017      constructs have at least 3 characters, any strings of length 2 or
1018      less are composed solely of normal characters.  Hence, the index of
1019      the outer for-loop runs only as far as LEN-2.  */
1020
1021   for (i = 0; i + 2 < es->len; /* empty */)
1022     {
1023       if (ES_MATCH (es, i, '['))
1024         {
1025           int matched_multi_char_construct;
1026           size_t closing_bracket_idx;
1027           unsigned int char_to_repeat;
1028           size_t repeat_count;
1029           int err;
1030
1031           matched_multi_char_construct = 1;
1032           if (ES_MATCH (es, i + 1, ':')
1033               || ES_MATCH (es, i + 1, '='))
1034             {
1035               size_t closing_delim_idx;
1036               int found;
1037
1038               found = find_closing_delim (es, i + 2, p[i + 1],
1039                                           &closing_delim_idx);
1040               if (found)
1041                 {
1042                   int parse_failed;
1043                   unsigned char *opnd_str = substr (p, i + 2,
1044                                                     closing_delim_idx - 1);
1045                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
1046
1047                   if (p[i + 1] == ':')
1048                     {
1049                       parse_failed = append_char_class (result, opnd_str,
1050                                                         opnd_str_len);
1051
1052                       /* FIXME: big comment.  */
1053                       if (parse_failed)
1054                         {
1055                           if (star_digits_closebracket (es, i + 2))
1056                             {
1057                               free (opnd_str);
1058                               goto try_bracketed_repeat;
1059                             }
1060                           else
1061                             {
1062                               char *tmp = make_printable_str (opnd_str,
1063                                                               opnd_str_len);
1064                               error (0, 0, _("invalid character class `%s'"),
1065                                      tmp);
1066                               free (tmp);
1067                               return 1;
1068                             }
1069                         }
1070                     }
1071                   else
1072                     {
1073                       parse_failed = append_equiv_class (result, opnd_str,
1074                                                          opnd_str_len);
1075
1076                       /* FIXME: big comment.  */
1077                       if (parse_failed)
1078                         {
1079                           if (star_digits_closebracket (es, i + 2))
1080                             {
1081                               free (opnd_str);
1082                               goto try_bracketed_repeat;
1083                             }
1084                           else
1085                             {
1086                               char *tmp = make_printable_str (opnd_str,
1087                                                               opnd_str_len);
1088                               error (0, 0,
1089                _("%s: equivalence class operand must be a single character"),
1090                                      tmp);
1091                               free (tmp);
1092                               return 1;
1093                             }
1094                         }
1095                     }
1096                   free (opnd_str);
1097
1098                   /* Return nonzero if append_*_class reports a problem.  */
1099                   if (parse_failed)
1100                     return 1;
1101                   else
1102                     i = closing_delim_idx + 2;
1103                   continue;
1104                 }
1105               /* Else fall through.  This could be [:*] or [=*].  */
1106             }
1107
1108         try_bracketed_repeat:
1109
1110           /* Determine whether this is a bracketed repeat range
1111              matching the RE \[.\*(dec_or_oct_number)?\].  */
1112           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1113                                        &repeat_count,
1114                                        &closing_bracket_idx);
1115           if (err == 0)
1116             {
1117               append_repeated_char (result, char_to_repeat, repeat_count);
1118               i = closing_bracket_idx + 1;
1119             }
1120           else if (err == -1)
1121             {
1122               matched_multi_char_construct = 0;
1123             }
1124           else
1125             {
1126               /* Found a string that looked like [c*n] but the
1127                  numeric part was invalid.  */
1128               return 1;
1129             }
1130
1131           if (matched_multi_char_construct)
1132             continue;
1133
1134           /* We reach this point if P does not match [:str:], [=c=],
1135              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1136              (from `[' to `c').  */
1137         }
1138
1139       /* Look ahead one char for ranges like a-z.  */
1140       if (ES_MATCH (es, i + 1, '-'))
1141         {
1142           if (append_range (result, p[i], p[i + 2]))
1143             return 1;
1144           i += 3;
1145         }
1146       else
1147         {
1148           append_normal_char (result, p[i]);
1149           ++i;
1150         }
1151     }
1152
1153   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1154   for (; i < es->len; i++)
1155     append_normal_char (result, p[i]);
1156
1157   return 0;
1158 }
1159
1160 /* Given a Spec_list S (with its saved state implicit in the values
1161    of its members `tail' and `state'), return the next single character
1162    in the expansion of S's constructs.  If the last character of S was
1163    returned on the previous call or if S was empty, this function
1164    returns -1.  For example, successive calls to get_next where S
1165    represents the spec-string 'a-d[y*3]' will return the sequence
1166    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1167    which the returned character comes is [:upper:] or [:lower:], the
1168    parameter CLASS is given a value to indicate which it was.  Otherwise
1169    CLASS is set to UL_NONE.  This value is used only when constructing
1170    the translation table to verify that any occurrences of upper and
1171    lower class constructs in the spec-strings appear in the same relative
1172    positions.  */
1173
1174 static int
1175 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1176 {
1177   struct List_element *p;
1178   int return_val;
1179   int i;
1180
1181   if (class)
1182     *class = UL_NONE;
1183
1184   if (s->state == BEGIN_STATE)
1185     {
1186       s->tail = s->head->next;
1187       s->state = NEW_ELEMENT;
1188     }
1189
1190   p = s->tail;
1191   if (p == NULL)
1192     return -1;
1193
1194   switch (p->type)
1195     {
1196     case RE_NORMAL_CHAR:
1197       return_val = p->u.normal_char;
1198       s->state = NEW_ELEMENT;
1199       s->tail = p->next;
1200       break;
1201
1202     case RE_RANGE:
1203       if (s->state == NEW_ELEMENT)
1204         s->state = ORD (p->u.range.first_char);
1205       else
1206         ++(s->state);
1207       return_val = CHR (s->state);
1208       if (s->state == ORD (p->u.range.last_char))
1209         {
1210           s->tail = p->next;
1211           s->state = NEW_ELEMENT;
1212         }
1213       break;
1214
1215     case RE_CHAR_CLASS:
1216       if (class)
1217         {
1218           int upper_or_lower;
1219           switch (p->u.char_class)
1220             {
1221             case CC_LOWER:
1222               *class = UL_LOWER;
1223               upper_or_lower = 1;
1224               break;
1225             case CC_UPPER:
1226               *class = UL_UPPER;
1227               upper_or_lower = 1;
1228               break;
1229             default:
1230               upper_or_lower = 0;
1231               break;
1232             }
1233
1234           if (upper_or_lower)
1235             {
1236               s->tail = p->next;
1237               s->state = NEW_ELEMENT;
1238               return_val = 0;
1239               break;
1240             }
1241         }
1242
1243       if (s->state == NEW_ELEMENT)
1244         {
1245           for (i = 0; i < N_CHARS; i++)
1246             if (is_char_class_member (p->u.char_class, i))
1247               break;
1248           assert (i < N_CHARS);
1249           s->state = i;
1250         }
1251       assert (is_char_class_member (p->u.char_class, s->state));
1252       return_val = CHR (s->state);
1253       for (i = s->state + 1; i < N_CHARS; i++)
1254         if (is_char_class_member (p->u.char_class, i))
1255           break;
1256       if (i < N_CHARS)
1257         s->state = i;
1258       else
1259         {
1260           s->tail = p->next;
1261           s->state = NEW_ELEMENT;
1262         }
1263       break;
1264
1265     case RE_EQUIV_CLASS:
1266       /* FIXME: this assumes that each character is alone in its own
1267          equivalence class (which appears to be correct for my
1268          LC_COLLATE.  But I don't know of any function that allows
1269          one to determine a character's equivalence class.  */
1270
1271       return_val = p->u.equiv_code;
1272       s->state = NEW_ELEMENT;
1273       s->tail = p->next;
1274       break;
1275
1276     case RE_REPEATED_CHAR:
1277       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1278       if (p->u.repeated_char.repeat_count == 0)
1279         {
1280           s->tail = p->next;
1281           s->state = NEW_ELEMENT;
1282           return_val = get_next (s, class);
1283         }
1284       else
1285         {
1286           if (s->state == NEW_ELEMENT)
1287             {
1288               s->state = 0;
1289             }
1290           ++(s->state);
1291           return_val = p->u.repeated_char.the_repeated_char;
1292           if (p->u.repeated_char.repeat_count > 0
1293               && s->state == p->u.repeated_char.repeat_count)
1294             {
1295               s->tail = p->next;
1296               s->state = NEW_ELEMENT;
1297             }
1298         }
1299       break;
1300
1301     case RE_NO_TYPE:
1302       abort ();
1303       break;
1304
1305     default:
1306       abort ();
1307       break;
1308     }
1309
1310   return return_val;
1311 }
1312
1313 /* This is a minor kludge.  This function is called from
1314    get_spec_stats to determine the cardinality of a set derived
1315    from a complemented string.  It's a kludge in that some of the
1316    same operations are (duplicated) performed in set_initialize.  */
1317
1318 static int
1319 card_of_complement (struct Spec_list *s)
1320 {
1321   int c;
1322   int cardinality = N_CHARS;
1323   SET_TYPE in_set[N_CHARS];
1324
1325   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1326   s->state = BEGIN_STATE;
1327   while ((c = get_next (s, NULL)) != -1)
1328     if (!in_set[c]++)
1329       --cardinality;
1330   return cardinality;
1331 }
1332
1333 /* Gather statistics about the spec-list S in preparation for the tests
1334    in validate that determine the consistency of the specs.  This function
1335    is called at most twice; once for string1, and again for any string2.
1336    LEN_S1 < 0 indicates that this is the first call and that S represents
1337    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1338    constructs in string1, and we can use its value to resolve any
1339    indefinite repeat construct in S (which represents string2).  Hence,
1340    this function has the side-effect that it converts a valid [c*]
1341    construct in string2 to [c*n] where n is large enough (or 0) to give
1342    string2 the same length as string1.  For example, with the command
1343    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1344    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1345
1346 static void
1347 get_spec_stats (struct Spec_list *s)
1348 {
1349   struct List_element *p;
1350   int len = 0;
1351
1352   s->n_indefinite_repeats = 0;
1353   s->has_equiv_class = 0;
1354   s->has_restricted_char_class = 0;
1355   s->has_char_class = 0;
1356   for (p = s->head->next; p; p = p->next)
1357     {
1358       switch (p->type)
1359         {
1360           int i;
1361         case RE_NORMAL_CHAR:
1362           ++len;
1363           break;
1364
1365         case RE_RANGE:
1366           assert (p->u.range.last_char >= p->u.range.first_char);
1367           len += p->u.range.last_char - p->u.range.first_char + 1;
1368           break;
1369
1370         case RE_CHAR_CLASS:
1371           s->has_char_class = 1;
1372           for (i = 0; i < N_CHARS; i++)
1373             if (is_char_class_member (p->u.char_class, i))
1374               ++len;
1375           switch (p->u.char_class)
1376             {
1377             case CC_UPPER:
1378             case CC_LOWER:
1379               break;
1380             default:
1381               s->has_restricted_char_class = 1;
1382               break;
1383             }
1384           break;
1385
1386         case RE_EQUIV_CLASS:
1387           for (i = 0; i < N_CHARS; i++)
1388             if (is_equiv_class_member (p->u.equiv_code, i))
1389               ++len;
1390           s->has_equiv_class = 1;
1391           break;
1392
1393         case RE_REPEATED_CHAR:
1394           if (p->u.repeated_char.repeat_count > 0)
1395             len += p->u.repeated_char.repeat_count;
1396           else if (p->u.repeated_char.repeat_count == 0)
1397             {
1398               s->indefinite_repeat_element = p;
1399               ++(s->n_indefinite_repeats);
1400             }
1401           break;
1402
1403         case RE_NO_TYPE:
1404           assert (0);
1405           break;
1406         }
1407     }
1408
1409   s->length = len;
1410 }
1411
1412 static void
1413 get_s1_spec_stats (struct Spec_list *s1)
1414 {
1415   get_spec_stats (s1);
1416   if (complement)
1417     s1->length = card_of_complement (s1);
1418 }
1419
1420 static void
1421 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1422 {
1423   get_spec_stats (s2);
1424   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1425     {
1426       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1427         len_s1 - s2->length;
1428       s2->length = len_s1;
1429     }
1430 }
1431
1432 static void
1433 spec_init (struct Spec_list *spec_list)
1434 {
1435   spec_list->head = spec_list->tail =
1436     (struct List_element *) xmalloc (sizeof (struct List_element));
1437   spec_list->head->next = NULL;
1438 }
1439
1440 /* This function makes two passes over the argument string S.  The first
1441    one converts all \c and \ddd escapes to their one-byte representations.
1442    The second constructs a linked specification list, SPEC_LIST, of the
1443    characters and constructs that comprise the argument string.  If either
1444    of these passes detects an error, this function returns nonzero.  */
1445
1446 static int
1447 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1448 {
1449   struct E_string es;
1450   int fail;
1451
1452   fail = unquote (s, &es);
1453   if (!fail)
1454     fail = build_spec_list (&es, spec_list);
1455   es_free (&es);
1456   return fail;
1457 }
1458
1459 /* Given two specification lists, S1 and S2, and assuming that
1460    S1->length > S2->length, append a single [c*n] element to S2 where c
1461    is the last character in the expansion of S2 and n is the difference
1462    between the two lengths.
1463    Upon successful completion, S2->length is set to S1->length.  The only
1464    way this function can fail to make S2 as long as S1 is when S2 has
1465    zero-length, since in that case, there is no last character to repeat.
1466    So S2->length is required to be at least 1.
1467
1468    Providing this functionality allows the user to do some pretty
1469    non-BSD (and non-portable) things:  For example, the command
1470        tr -cs '[:upper:]0-9' '[:lower:]'
1471    is almost guaranteed to give results that depend on your collating
1472    sequence.  */
1473
1474 static void
1475 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1476 {
1477   struct List_element *p;
1478   int char_to_repeat;
1479   int i;
1480
1481   assert (translating);
1482   assert (s1->length > s2->length);
1483   assert (s2->length > 0);
1484
1485   p = s2->tail;
1486   switch (p->type)
1487     {
1488     case RE_NORMAL_CHAR:
1489       char_to_repeat = p->u.normal_char;
1490       break;
1491     case RE_RANGE:
1492       char_to_repeat = p->u.range.last_char;
1493       break;
1494     case RE_CHAR_CLASS:
1495       for (i = N_CHARS; i >= 0; i--)
1496         if (is_char_class_member (p->u.char_class, i))
1497           break;
1498       assert (i >= 0);
1499       char_to_repeat = CHR (i);
1500       break;
1501
1502     case RE_REPEATED_CHAR:
1503       char_to_repeat = p->u.repeated_char.the_repeated_char;
1504       break;
1505
1506     case RE_EQUIV_CLASS:
1507       /* This shouldn't happen, because validate exits with an error
1508          if it finds an equiv class in string2 when translating.  */
1509       abort ();
1510       break;
1511
1512     case RE_NO_TYPE:
1513       abort ();
1514       break;
1515
1516     default:
1517       abort ();
1518       break;
1519     }
1520
1521   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1522   s2->length = s1->length;
1523 }
1524
1525 /* Return non-zero if S is a non-empty list in which exactly one
1526    character (but potentially, many instances of it) appears.
1527    E.g.  [X*] or xxxxxxxx.  */
1528
1529 static int
1530 homogeneous_spec_list (struct Spec_list *s)
1531 {
1532   int b, c;
1533
1534   s->state = BEGIN_STATE;
1535
1536   if ((b = get_next (s, NULL)) == -1)
1537     return 0;
1538
1539   while ((c = get_next (s, NULL)) != -1)
1540     if (c != b)
1541       return 0;
1542
1543   return 1;
1544 }
1545
1546 /* Die with an error message if S1 and S2 describe strings that
1547    are not valid with the given command line switches.
1548    A side effect of this function is that if a valid [c*] or
1549    [c*0] construct appears in string2, it is converted to [c*n]
1550    with a value for n that makes s2->length == s1->length.  By
1551    the same token, if the --truncate-set1 option is not
1552    given, S2 may be extended.  */
1553
1554 static void
1555 validate (struct Spec_list *s1, struct Spec_list *s2)
1556 {
1557   get_s1_spec_stats (s1);
1558   if (s1->n_indefinite_repeats > 0)
1559     {
1560       error (EXIT_FAILURE, 0,
1561              _("the [c*] repeat construct may not appear in string1"));
1562     }
1563
1564   if (s2)
1565     {
1566       get_s2_spec_stats (s2, s1->length);
1567
1568       if (s2->n_indefinite_repeats > 1)
1569         {
1570           error (EXIT_FAILURE, 0,
1571                  _("only one [c*] repeat construct may appear in string2"));
1572         }
1573
1574       if (translating)
1575         {
1576           if (s2->has_equiv_class)
1577             {
1578               error (EXIT_FAILURE, 0,
1579                      _("[=c=] expressions may not appear in string2 \
1580 when translating"));
1581             }
1582
1583           if (s1->length > s2->length)
1584             {
1585               if (!truncate_set1)
1586                 {
1587                   /* string2 must be non-empty unless --truncate-set1 is
1588                      given or string1 is empty.  */
1589
1590                   if (s2->length == 0)
1591                     error (EXIT_FAILURE, 0,
1592                      _("when not truncating set1, string2 must be non-empty"));
1593                   string2_extend (s1, s2);
1594                 }
1595             }
1596
1597           if (complement && s1->has_char_class
1598               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1599             {
1600               error (EXIT_FAILURE, 0,
1601                      _("when translating with complemented character classes,\
1602 \nstring2 must map all characters in the domain to one"));
1603             }
1604
1605           if (s2->has_restricted_char_class)
1606             {
1607               error (EXIT_FAILURE, 0,
1608                      _("when translating, the only character classes that may \
1609 appear in\nstring2 are `upper' and `lower'"));
1610             }
1611         }
1612       else
1613         /* Not translating.  */
1614         {
1615           if (s2->n_indefinite_repeats > 0)
1616             error (EXIT_FAILURE, 0,
1617                    _("the [c*] construct may appear in string2 only \
1618 when translating"));
1619         }
1620     }
1621 }
1622
1623 /* Read buffers of SIZE bytes via the function READER (if READER is
1624    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1625    read_and_delete or read_and_xlate.  After each buffer is read, it is
1626    processed and written to stdout.  The buffers are processed so that
1627    multiple consecutive occurrences of the same character in the input
1628    stream are replaced by a single occurrence of that character if the
1629    character is in the squeeze set.  */
1630
1631 static void
1632 squeeze_filter (unsigned char *buf, long int size, PFI reader)
1633 {
1634   unsigned int char_to_squeeze = NOT_A_CHAR;
1635   int i = 0;
1636   int nr = 0;
1637
1638   for (;;)
1639     {
1640       int begin;
1641
1642       if (i >= nr)
1643         {
1644           if (reader == NULL)
1645             nr = safe_read (0, (char *) buf, size);
1646           else
1647             nr = (*reader) (buf, size, NULL);
1648
1649           if (nr < 0)
1650             error (EXIT_FAILURE, errno, _("read error"));
1651           if (nr == 0)
1652             break;
1653           i = 0;
1654         }
1655
1656       begin = i;
1657
1658       if (char_to_squeeze == NOT_A_CHAR)
1659         {
1660           int out_len;
1661           /* Here, by being a little tricky, we can get a significant
1662              performance increase in most cases when the input is
1663              reasonably large.  Since tr will modify the input only
1664              if two consecutive (and identical) input characters are
1665              in the squeeze set, we can step by two through the data
1666              when searching for a character in the squeeze set.  This
1667              means there may be a little more work in a few cases and
1668              perhaps twice as much work in the worst cases where most
1669              of the input is removed by squeezing repeats.  But most
1670              uses of this functionality seem to remove less than 20-30%
1671              of the input.  */
1672           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1673             ;                   /* empty */
1674
1675           /* There is a special case when i == nr and we've just
1676              skipped a character (the last one in buf) that is in
1677              the squeeze set.  */
1678           if (i == nr && in_squeeze_set[buf[i - 1]])
1679             --i;
1680
1681           if (i >= nr)
1682             out_len = nr - begin;
1683           else
1684             {
1685               char_to_squeeze = buf[i];
1686               /* We're about to output buf[begin..i].  */
1687               out_len = i - begin + 1;
1688
1689               /* But since we stepped by 2 in the loop above,
1690                  out_len may be one too large.  */
1691               if (i > 0 && buf[i - 1] == char_to_squeeze)
1692                 --out_len;
1693
1694               /* Advance i to the index of first character to be
1695                  considered when looking for a char different from
1696                  char_to_squeeze.  */
1697               ++i;
1698             }
1699           if (out_len > 0
1700               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1701             error (EXIT_FAILURE, errno, _("write error"));
1702         }
1703
1704       if (char_to_squeeze != NOT_A_CHAR)
1705         {
1706           /* Advance i to index of first char != char_to_squeeze
1707              (or to nr if all the rest of the characters in this
1708              buffer are the same as char_to_squeeze).  */
1709           for (; i < nr && buf[i] == char_to_squeeze; i++)
1710             ;                   /* empty */
1711           if (i < nr)
1712             char_to_squeeze = NOT_A_CHAR;
1713           /* If (i >= nr) we've squeezed the last character in this buffer.
1714              So now we have to read a new buffer and continue comparing
1715              characters against char_to_squeeze.  */
1716         }
1717     }
1718 }
1719
1720 /* Read buffers of SIZE bytes from stdin until one is found that
1721    contains at least one character not in the delete set.  Store
1722    in the array BUF, all characters from that buffer that are not
1723    in the delete set, and return the number of characters saved
1724    or 0 upon EOF.  */
1725
1726 static long
1727 read_and_delete (unsigned char *buf, long int size, PFI not_used)
1728 {
1729   long n_saved;
1730   static int hit_eof = 0;
1731
1732   assert (not_used == NULL);
1733   assert (size > 0);
1734
1735   if (hit_eof)
1736     return 0;
1737
1738   /* This enclosing do-while loop is to make sure that
1739      we don't return zero (indicating EOF) when we've
1740      just deleted all the characters in a buffer.  */
1741   do
1742     {
1743       int i;
1744       int nr = safe_read (0, (char *) buf, size);
1745
1746       if (nr < 0)
1747         error (EXIT_FAILURE, errno, _("read error"));
1748       if (nr == 0)
1749         {
1750           hit_eof = 1;
1751           return 0;
1752         }
1753
1754       /* This first loop may be a waste of code, but gives much
1755          better performance when no characters are deleted in
1756          the beginning of a buffer.  It just avoids the copying
1757          of buf[i] into buf[n_saved] when it would be a NOP.  */
1758
1759       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1760         /* empty */ ;
1761       n_saved = i;
1762
1763       for (++i; i < nr; i++)
1764         if (!in_delete_set[buf[i]])
1765           buf[n_saved++] = buf[i];
1766     }
1767   while (n_saved == 0);
1768
1769   return n_saved;
1770 }
1771
1772 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1773    perform the in-place and one-to-one mapping specified by the global
1774    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1775
1776 static long
1777 read_and_xlate (unsigned char *buf, long int size, PFI not_used)
1778 {
1779   long chars_read = 0;
1780   static int hit_eof = 0;
1781   int i;
1782
1783   assert (not_used == NULL);
1784   assert (size > 0);
1785
1786   if (hit_eof)
1787     return 0;
1788
1789   chars_read = safe_read (0, (char *) buf, size);
1790   if (chars_read < 0)
1791     error (EXIT_FAILURE, errno, _("read error"));
1792   if (chars_read == 0)
1793     {
1794       hit_eof = 1;
1795       return 0;
1796     }
1797
1798   for (i = 0; i < chars_read; i++)
1799     buf[i] = xlate[buf[i]];
1800
1801   return chars_read;
1802 }
1803
1804 /* Initialize a boolean membership set IN_SET with the character
1805    values obtained by traversing the linked list of constructs S
1806    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1807    nonzero the resulting set is complemented.  */
1808
1809 static void
1810 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1811 {
1812   int c;
1813   int i;
1814
1815   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1816   s->state = BEGIN_STATE;
1817   while ((c = get_next (s, NULL)) != -1)
1818     in_set[c] = 1;
1819   if (complement_this_set)
1820     for (i = 0; i < N_CHARS; i++)
1821       in_set[i] = (!in_set[i]);
1822 }
1823
1824 int
1825 main (int argc, char **argv)
1826 {
1827   int c;
1828   int non_option_args;
1829   struct Spec_list buf1, buf2;
1830   struct Spec_list *s1 = &buf1;
1831   struct Spec_list *s2 = &buf2;
1832
1833   program_name = argv[0];
1834   setlocale (LC_ALL, "");
1835   bindtextdomain (PACKAGE, LOCALEDIR);
1836   textdomain (PACKAGE);
1837
1838   while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1839     {
1840       switch (c)
1841         {
1842         case 0:
1843           break;
1844
1845         case 'c':
1846           complement = 1;
1847           break;
1848
1849         case 'd':
1850           delete = 1;
1851           break;
1852
1853         case 's':
1854           squeeze_repeats = 1;
1855           break;
1856
1857         case 't':
1858           truncate_set1 = 1;
1859           break;
1860
1861         default:
1862           usage (2);
1863           break;
1864         }
1865     }
1866
1867   if (show_version)
1868     {
1869       printf ("tr (%s) %s\n", GNU_PACKAGE, VERSION);
1870       exit (EXIT_SUCCESS);
1871     }
1872
1873   if (show_help)
1874     usage (0);
1875
1876   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1877
1878   non_option_args = argc - optind;
1879   translating = (non_option_args == 2 && !delete);
1880
1881   /* Change this test if it is valid to give tr no options and
1882      no args at all.  POSIX doesn't specifically say anything
1883      either way, but it looks like they implied it's invalid
1884      by omission.  If you want to make tr do a slow imitation
1885      of `cat' use `tr a a'.  */
1886   if (non_option_args > 2)
1887     {
1888       error (0, 0, _("too many arguments"));
1889       usage (2);
1890     }
1891
1892   if (!delete && !squeeze_repeats && non_option_args != 2)
1893     error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1894
1895   if (delete && squeeze_repeats && non_option_args != 2)
1896     error (EXIT_FAILURE, 0, _("two strings must be given when both \
1897 deleting and squeezing repeats"));
1898
1899   /* If --delete is given without --squeeze-repeats, then
1900      only one string argument may be specified.  But POSIX
1901      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1902      is set, pretend we never saw string2.  But I think
1903      this deserves a fatal error, so that's the default.  */
1904   if ((delete && !squeeze_repeats) && non_option_args != 1)
1905     {
1906       if (posix_pedantic && non_option_args == 2)
1907         --non_option_args;
1908       else
1909         error (EXIT_FAILURE, 0,
1910                _("only one string may be given when deleting \
1911 without squeezing repeats"));
1912     }
1913
1914   if (squeeze_repeats && non_option_args == 0)
1915     error (EXIT_FAILURE, 0,
1916            _("at least one string must be given when squeezing repeats"));
1917
1918   spec_init (s1);
1919   if (parse_str ((unsigned char *) argv[optind], s1))
1920     exit (EXIT_FAILURE);
1921
1922   if (non_option_args == 2)
1923     {
1924       spec_init (s2);
1925       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1926         exit (EXIT_FAILURE);
1927     }
1928   else
1929     s2 = NULL;
1930
1931   validate (s1, s2);
1932
1933   if (squeeze_repeats && non_option_args == 1)
1934     {
1935       set_initialize (s1, complement, in_squeeze_set);
1936       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1937     }
1938   else if (delete && non_option_args == 1)
1939     {
1940       long nr;
1941
1942       set_initialize (s1, complement, in_delete_set);
1943       do
1944         {
1945           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1946           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1947             error (EXIT_FAILURE, errno, _("write error"));
1948         }
1949       while (nr > 0);
1950     }
1951   else if (squeeze_repeats && delete && non_option_args == 2)
1952     {
1953       set_initialize (s1, complement, in_delete_set);
1954       set_initialize (s2, 0, in_squeeze_set);
1955       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1956     }
1957   else if (translating)
1958     {
1959       if (complement)
1960         {
1961           int i;
1962           SET_TYPE *in_s1 = in_delete_set;
1963
1964           set_initialize (s1, 0, in_s1);
1965           s2->state = BEGIN_STATE;
1966           for (i = 0; i < N_CHARS; i++)
1967             xlate[i] = i;
1968           for (i = 0; i < N_CHARS; i++)
1969             {
1970               if (!in_s1[i])
1971                 {
1972                   int ch = get_next (s2, NULL);
1973                   assert (ch != -1 || truncate_set1);
1974                   if (ch == -1)
1975                     {
1976                       /* This will happen when tr is invoked like e.g.
1977                          tr -cs A-Za-z0-9 '\012'.  */
1978                       break;
1979                     }
1980                   xlate[i] = ch;
1981                 }
1982             }
1983           assert (get_next (s2, NULL) == -1 || truncate_set1);
1984         }
1985       else
1986         {
1987           int c1, c2;
1988           int i;
1989           enum Upper_Lower_class class_s1;
1990           enum Upper_Lower_class class_s2;
1991
1992           for (i = 0; i < N_CHARS; i++)
1993             xlate[i] = i;
1994           s1->state = BEGIN_STATE;
1995           s2->state = BEGIN_STATE;
1996           for (;;)
1997             {
1998               c1 = get_next (s1, &class_s1);
1999               c2 = get_next (s2, &class_s2);
2000               if (!class_ok[(int) class_s1][(int) class_s2])
2001                 error (EXIT_FAILURE, 0,
2002                        _("misaligned [:upper:] and/or [:lower:] construct"));
2003
2004               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
2005                 {
2006                   for (i = 0; i < N_CHARS; i++)
2007                     if (ISLOWER (i))
2008                       xlate[i] = toupper (i);
2009                 }
2010               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
2011                 {
2012                   for (i = 0; i < N_CHARS; i++)
2013                     if (ISUPPER (i))
2014                       xlate[i] = tolower (i);
2015                 }
2016               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
2017                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
2018                 {
2019                   /* By default, GNU tr permits the identity mappings: from
2020                      [:upper:] to [:upper:] and [:lower:] to [:lower:].  But
2021                      when POSIXLY_CORRECT is set, those evoke diagnostics.  */
2022                   if (posix_pedantic)
2023                     {
2024                       error (EXIT_FAILURE, 0,
2025                              _("\
2026 invalid identity mapping;  when translating, any [:lower:] or [:upper:]\n\
2027 construct in string1 must be aligned with a corresponding construct\n\
2028 ([:upper:] or [:lower:], respectively) in string2"));
2029                     }
2030                 }
2031               else
2032                 {
2033                   /* The following should have been checked by validate...  */
2034                   if (c1 == -1 || c2 == -1)
2035                     break;
2036                   xlate[c1] = c2;
2037                 }
2038             }
2039           assert (c1 == -1 || truncate_set1);
2040         }
2041       if (squeeze_repeats)
2042         {
2043           set_initialize (s2, 0, in_squeeze_set);
2044           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
2045         }
2046       else
2047         {
2048           long chars_read;
2049
2050           do
2051             {
2052               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2053               if (chars_read > 0
2054                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2055                 error (EXIT_FAILURE, errno, _("write error"));
2056             }
2057           while (chars_read > 0);
2058         }
2059     }
2060
2061   if (fclose (stdout) == EOF)
2062     error (EXIT_FAILURE, errno, _("write error"));
2063
2064   if (close (0) != 0)
2065     error (EXIT_FAILURE, errno, _("standard input"));
2066
2067   exit (EXIT_SUCCESS);
2068 }