src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  17
  18 /* Written by Jim Meyering, meyering@cs.utexas.edu.  */
  19
  20 #include <config.h>
  21
  22 /* Get isblank from GNU libc.  */
  23 #define _GNU_SOURCE
  24
  25 #include <stdio.h>
  26 #include <assert.h>
  27 #include <errno.h>
  28 #include <sys/types.h>
  29 #include <getopt.h>
  30
  31 #include "system.h"
  32 #include "version.h"
  33 #include "error.h"
  34
  35 #ifndef ULONG_MAX
  36 #define ULONG_MAX ((unsigned long) ~(unsigned long) 0)
  37 #endif
  38
  39 #ifndef LONG_MAX
  40 #define LONG_MAX ((long int) (ULONG_MAX >> 1))
  41 #endif
  42
  43 #ifndef UCHAR_MAX
  44 #define UCHAR_MAX 0xFF
  45 #endif
  46
  47 #define N_CHARS (UCHAR_MAX + 1)
  48
  49 /* A pointer to a function that returns an int.  */
  50 typedef int (*PFI) ();
  51
  52 /* Convert from character C to its index in the collating
  53    sequence array.  Just cast to an unsigned int to avoid
  54    problems with sign-extension.  */
  55 #define ORD(c) (unsigned int)(c)
  56
  57 /* The inverse of ORD.  */
  58 #define CHR(i) (unsigned char)(i)
  59
  60 /* The value for Spec_list->state that indicates to
  61    get_next that it should initialize the tail pointer.
  62    Its value doesn't matter as long as it can't be
  63    confused with a valid character code.  */
  64 #define BEGIN_STATE (2 * N_CHARS)
  65
  66 /* The value for Spec_list->state that indicates to
  67    get_next that the element pointed to by Spec_list->tail is
  68    being considered for the first time on this pass through the
  69    list -- it indicates that get_next should make any necessary
  70    initializations.  */
  71 #define NEW_ELEMENT (BEGIN_STATE + 1)
  72
  73 /* A value distinct from any character that may have been stored in a
  74    buffer as the result of a block-read in the function squeeze_filter.  */
  75 #define NOT_A_CHAR (unsigned int)(-1)
  76
  77 /* The following (but not CC_NO_CLASS) are indices into the array of
  78    valid character class strings.  */
  79 enum Char_class
  80   {
  81     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  82     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  83     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  84     CC_NO_CLASS = 9999
  85   };
  86
  87 /* Character class to which a character (returned by get_next) belonged;
  88    but it is set only if the construct from which the character was obtained
  89    was one of the character classes [:upper:] or [:lower:].  The value
  90    is used only when translating and then, only to make sure that upper
  91    and lower class constructs have the same relative positions in string1
  92    and string2.  */
  93 enum Upper_Lower_class
  94   {
  95     UL_LOWER = 0,
  96     UL_UPPER = 1,
  97     UL_NONE = 2
  98   };
  99
 100 /* A shortcut to ensure that when constructing the translation array,
 101    one of the values returned by paired calls to get_next (from s1 and s2)
 102    is from [:upper:] and the other is from [:lower:], or neither is from
 103    upper or lower.  In fact, no other character classes are allowed when
 104    translating, but that condition is tested elsewhere.  This array is
 105    indexed by values of type enum Upper_Lower_class.  */
 106 static int const class_ok[3][3] =
 107 {
 108   {0, 1, 0},
 109   {1, 0, 0},
 110   {0, 0, 1}
 111 };
 112
 113 /* The type of a List_element.  See build_spec_list for more details.  */
 114 enum Range_element_type
 115   {
 116     RE_NO_TYPE = 0,
 117     RE_NORMAL_CHAR,
 118     RE_RANGE,
 119     RE_CHAR_CLASS,
 120     RE_EQUIV_CLASS,
 121     RE_REPEATED_CHAR
 122   };
 123
 124 /* One construct in one of tr's argument strings.
 125    For example, consider the POSIX version of the classic tr command:
 126        tr -cs 'a-zA-Z_' '[\n*]'
 127    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 128    and a single normal character, `_'.  String2 has one construct.  */
 129 struct List_element
 130   {
 131     enum Range_element_type type;
 132     struct List_element *next;
 133     union
 134       {
 135         int normal_char;
 136         struct                  /* unnamed */
 137           {
 138             unsigned int first_char;
 139             unsigned int last_char;
 140           }
 141         range;
 142         enum Char_class char_class;
 143         int equiv_code;
 144         struct                  /* unnamed */
 145           {
 146             unsigned int the_repeated_char;
 147             long repeat_count;
 148           }
 149         repeated_char;
 150       }
 151     u;
 152   };
 153
 154 /* Each of tr's argument strings is parsed into a form that is easier
 155    to work with: a linked list of constructs (struct List_element).
 156    Each Spec_list structure also encapsulates various attributes of
 157    the corresponding argument string.  The attributes are used mainly
 158    to verify that the strings are valid in the context of any options
 159    specified (like -s, -d, or -c).  The main exception is the member
 160    `tail', which is first used to construct the list.  After construction,
 161    it is used by get_next to save its state when traversing the list.
 162    The member `state' serves a similar function.  */
 163 struct Spec_list
 164   {
 165     /* Points to the head of the list of range elements.
 166        The first struct is a dummy; its members are never used.  */
 167     struct List_element *head;
 168
 169     /* When appending, points to the last element.  When traversing via
 170        get_next(), points to the element to process next.  Setting
 171        Spec_list.state to the value BEGIN_STATE before calling get_next
 172        signals get_next to initialize tail to point to head->next.  */
 173     struct List_element *tail;
 174
 175     /* Used to save state between calls to get_next().  */
 176     unsigned int state;
 177
 178     /* Length, in the sense that length('a-z[:digit:]123abc')
 179        is 42 ( = 26 + 10 + 6).  */
 180     int length;
 181
 182     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 183     int n_indefinite_repeats;
 184
 185     /* Non-zero if this spec contains at least one equivalence
 186        class construct e.g. [=c=].  */
 187     int has_equiv_class;
 188
 189     /* Non-zero if this spec contains at least one of [:upper:] or
 190        [:lower:] class constructs.  */
 191     int has_upper_or_lower;
 192
 193     /* Non-zero if this spec contains at least one of the character class
 194        constructs (all but upper and lower) that aren't allowed in s2.  */
 195     int has_restricted_char_class;
 196   };
 197
 198 char *xmalloc ();
 199 char *stpcpy ();
 200 int safe_read ();
 201
 202 /* The name by which this program was run.  */
 203 char *program_name;
 204
 205 /* When non-zero, each sequence in the input of a repeated character
 206    (call it c) is replaced (in the output) by a single occurrence of c
 207    for every c in the squeeze set.  */
 208 static int squeeze_repeats = 0;
 209
 210 /* When non-zero, removes characters in the delete set from input.  */
 211 static int delete = 0;
 212
 213 /* Use the complement of set1 in place of set1.  */
 214 static int complement = 0;
 215
 216 /* When non-zero, this flag causes GNU tr to provide strict
 217    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 218    says that when -d is used without -s, string2 (if present)
 219    must be ignored.  Silently ignoring arguments is a bad idea.
 220    The default GNU behavior is to give a usage message and exit.
 221    Additionally, when this flag is non-zero, tr prints warnings
 222    on stderr if it is being used in a manner that is not portable.
 223    Applicable warnings are given by default, but are suppressed
 224    if the environment variable `POSIXLY_CORRECT' is set, since
 225    being POSIX conformant means we can't issue such messages.
 226    Warnings on the following topics are suppressed when this
 227    variable is non-zero:
 228    1. Ambiguous octal escapes.  */
 229 static int posix_pedantic;
 230
 231 /* When tr is performing translation and string1 is longer than string2,
 232    POSIX says that the result is undefined.  That gives the implementor
 233    of a POSIX conforming version of tr two reasonable choices for the
 234    semantics of this case.
 235
 236    * The BSD tr pads string2 to the length of string1 by
 237    repeating the last character in string2.
 238
 239    * System V tr ignores characters in string1 that have no
 240    corresponding character in string2.  That is, string1 is effectively
 241    truncated to the length of string2.
 242
 243    When non-zero, this flag causes GNU tr to imitate the behavior
 244    of System V tr when translating with string1 longer than string2.
 245    The default is to emulate BSD tr.  This flag is ignored in modes where
 246    no translation is performed.  Emulating the System V tr
 247    in this exceptional case causes the relatively common BSD idiom:
 248
 249        tr -cs A-Za-z0-9 '\012'
 250
 251    to break (it would convert only zero bytes, rather than all
 252    non-alphanumerics, to newlines).
 253
 254    WARNING: This switch does not provide general BSD or System V
 255    compatibility.  For example, it doesn't disable the interpretation
 256    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 257    some unfortunate coincidence you use such constructs in scripts
 258    expecting to use some other version of tr, the scripts will break.  */
 259 static int truncate_set1 = 0;
 260
 261 /* An alias for (!delete && non_option_args == 2).
 262    It is set in main and used there and in validate().  */
 263 static int translating;
 264
 265 #ifndef BUFSIZ
 266 #define BUFSIZ 8192
 267 #endif
 268
 269 #define IO_BUF_SIZE BUFSIZ
 270 static unsigned char io_buf[IO_BUF_SIZE];
 271
 272 static char const *const char_class_name[] =
 273 {
 274   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 275   "lower", "print", "punct", "space", "upper", "xdigit"
 276 };
 277 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 278
 279 typedef char SET_TYPE;
 280
 281 /* Array of boolean values.  A character `c' is a member of the
 282    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 283    set is defined by the last (possibly, the only) string argument
 284    on the command line when the squeeze option is given.  */
 285 static SET_TYPE in_squeeze_set[N_CHARS];
 286
 287 /* Array of boolean values.  A character `c' is a member of the
 288    delete set if and only if in_delete_set[c] is true.  The delete
 289    set is defined by the first (or only) string argument on the
 290    command line when the delete option is given.  */
 291 static SET_TYPE in_delete_set[N_CHARS];
 292
 293 /* Array of character values defining the translation (if any) that
 294    tr is to perform.  Translation is performed only when there are
 295    two specification strings and the delete switch is not given.  */
 296 static char xlate[N_CHARS];
 297
 298 /* If non-zero, display usage information and exit.  */
 299 static int show_help;
 300
 301 /* If non-zero, print the version on standard output then exit.  */
 302 static int show_version;
 303
 304 static struct option const long_options[] =
 305 {
 306   {"complement", no_argument, NULL, 'c'},
 307   {"delete", no_argument, NULL, 'd'},
 308   {"squeeze-repeats", no_argument, NULL, 's'},
 309   {"truncate-set1", no_argument, NULL, 't'},
 310   {"help", no_argument, &show_help, 1},
 311   {"version", no_argument, &show_version, 1},
 312   {NULL, 0, NULL, 0}
 313 };
 314 \f
 315 static void
 316 usage (status)
 317      int status;
 318 {
 319   if (status != 0)
 320     fprintf (stderr, "Try `%s --help' for more information.\n",
 321              program_name);
 322   else
 323     {
 324       printf ("\
 325 Usage: %s [OPTION]... SET1 [SET2]\n\
 326 ",
 327               program_name);
 328       printf ("\
 329 \n\
 330   -c, --complement        first complement SET1\n\
 331   -d, --delete            delete characters in SET1, do not translate\n\
 332   -s, --squeeze-repeats   replace sequence of characters with one\n\
 333   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 334       --help              display this help and exit\n\
 335       --version           output version information and exit\n\
 336 ");
 337       printf ("\
 338 \n\
 339 SETs are specified as strings of characters.  Most represent\n\
 340 themselves.  Here are the special writings:\n\
 341 \n\
 342   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 343   \\\\              backslash\n\
 344   \\a              audible BEL\n\
 345   \\b              backspace\n\
 346   \\f              form feed\n\
 347   \\n              new line\n\
 348   \\r              return\n\
 349   \\t              horizontal tab\n\
 350   \\v              vertical tab\n\
 351   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 352   [CHAR1-CHAR2]   same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
 353   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 354   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 355   [:alnum:]       all letters and digits\n\
 356   [:alpha:]       all letters\n\
 357   [:blank:]       all horizontal whitespace\n\
 358   [:cntrl:]       all control characters\n\
 359   [:digit:]       all digits\n\
 360   [:graph:]       all printable characters, not including space\n\
 361   [:lower:]       all lower case letters\n\
 362   [:print:]       all printable characters, including space\n\
 363   [:punct:]       all punctuation characters\n\
 364   [:space:]       all horizontal or vertical whitespace\n\
 365   [:upper:]       all upper case letters\n\
 366   [:xdigit:]      all hexadecimal digits\n\
 367   [=CHAR=]        all characters which are equivalent to CHAR\n\
 368 ");
 369       printf ("\
 370 \n\
 371 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 372 -t may be used only when translating.  SET2 is extended to length of\n\
 373 SET1 by repeating its last character as necessary.  Excess characters\n\
 374 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 375 expand in ascending order; used in SET2 while translating, they may\n\
 376 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 377 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 378 translation or deletion.\n\
 379 ");
 380     }
 381   exit (status);
 382 }
 383
 384 /* Return non-zero if the character C is a member of the
 385    equivalence class containing the character EQUIV_CLASS.  */
 386
 387 static int
 388 is_equiv_class_member (equiv_class, c)
 389      unsigned int equiv_class;
 390      unsigned int c;
 391 {
 392   return (equiv_class == c);
 393 }
 394
 395 /* Return non-zero if the character C is a member of the
 396    character class CHAR_CLASS.  */
 397
 398 static int
 399 is_char_class_member (char_class, c)
 400      enum Char_class char_class;
 401      unsigned int c;
 402 {
 403   switch (char_class)
 404     {
 405     case CC_ALNUM:
 406       return ISALNUM (c);
 407       break;
 408     case CC_ALPHA:
 409       return ISALPHA (c);
 410       break;
 411     case CC_BLANK:
 412       return ISBLANK (c);
 413       break;
 414     case CC_CNTRL:
 415       return ISCNTRL (c);
 416       break;
 417     case CC_DIGIT:
 418       return ISDIGIT (c);
 419       break;
 420     case CC_GRAPH:
 421       return ISGRAPH (c);
 422       break;
 423     case CC_LOWER:
 424       return ISLOWER (c);
 425       break;
 426     case CC_PRINT:
 427       return ISPRINT (c);
 428       break;
 429     case CC_PUNCT:
 430       return ISPUNCT (c);
 431       break;
 432     case CC_SPACE:
 433       return ISSPACE (c);
 434       break;
 435     case CC_UPPER:
 436       return ISUPPER (c);
 437       break;
 438     case CC_XDIGIT:
 439       return ISXDIGIT (c);
 440       break;
 441     default:
 442       abort ();
 443       break;
 444     }
 445 }
 446
 447 /* Perform the first pass over each range-spec argument S, converting all
 448    \c and \ddd escapes to their one-byte representations.  The conversion
 449    is done in-place, so S must point to writable storage.  If an invalid
 450    quote sequence is found print an error message and return non-zero.
 451    Otherwise set *LEN to the length of the resulting string and return
 452    zero.  The resulting array of characters may contain zero-bytes;
 453    however, on input, S is assumed to be null-terminated, and hence
 454    cannot contain actual (non-escaped) zero bytes.  */
 455
 456 static int
 457 unquote (s, len)
 458      unsigned char *s;
 459      int *len;
 460 {
 461   int i, j;
 462
 463   j = 0;
 464   for (i = 0; s[i]; i++)
 465     {
 466       switch (s[i])
 467         {
 468           int c;
 469         case '\\':
 470           switch (s[i + 1])
 471             {
 472               int oct_digit;
 473             case '\\':
 474               c = '\\';
 475               break;
 476             case 'a':
 477               c = '\007';
 478               break;
 479             case 'b':
 480               c = '\b';
 481               break;
 482             case 'f':
 483               c = '\f';
 484               break;
 485             case 'n':
 486               c = '\n';
 487               break;
 488             case 'r':
 489               c = '\r';
 490               break;
 491             case 't':
 492               c = '\t';
 493               break;
 494             case 'v':
 495               c = '\v';
 496               break;
 497             case '0':
 498             case '1':
 499             case '2':
 500             case '3':
 501             case '4':
 502             case '5':
 503             case '6':
 504             case '7':
 505               c = s[i + 1] - '0';
 506               oct_digit = s[i + 2] - '0';
 507               if (0 <= oct_digit && oct_digit <= 7)
 508                 {
 509                   c = 8 * c + oct_digit;
 510                   ++i;
 511                   oct_digit = s[i + 2] - '0';
 512                   if (0 <= oct_digit && oct_digit <= 7)
 513                     {
 514                       if (8 * c + oct_digit < N_CHARS)
 515                         {
 516                           c = 8 * c + oct_digit;
 517                           ++i;
 518                         }
 519                       else if (!posix_pedantic)
 520                         {
 521                           /* Any octal number larger than 0377 won't
 522                              fit in 8 bits.  So we stop when adding the
 523                              next digit would put us over the limit and
 524                              give a warning about the ambiguity.  POSIX
 525                              isn't clear on this, but one person has said
 526                              that in his interpretation, POSIX says tr
 527                              can't even give a warning.  */
 528                           error (0, 0, "warning: the ambiguous octal escape \
 529 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'",
 530                                  s[i], s[i + 1], s[i + 2],
 531                                  s[i], s[i + 1], s[i + 2]);
 532                         }
 533                     }
 534                 }
 535               break;
 536             case '\0':
 537               error (0, 0, "invalid backslash escape at end of string");
 538               return 1;
 539               break;
 540             default:
 541               error (0, 0, "invalid backslash escape `\\%c'", s[i + 1]);
 542               return 1;
 543               break;
 544             }
 545           ++i;
 546           s[j++] = c;
 547           break;
 548         default:
 549           s[j++] = s[i];
 550           break;
 551         }
 552     }
 553   *len = j;
 554   return 0;
 555 }
 556
 557 /* If CLASS_STR is a valid character class string, return its index
 558    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 559
 560 static enum Char_class
 561 look_up_char_class (class_str)
 562      unsigned char *class_str;
 563 {
 564   unsigned int i;
 565
 566   for (i = 0; i < N_CHAR_CLASSES; i++)
 567     if (strcmp ((const char *) class_str, char_class_name[i]) == 0)
 568       return (enum Char_class) i;
 569   return CC_NO_CLASS;
 570 }
 571
 572 /* Return a newly allocated string with a printable version of C.
 573    This function is used solely for formatting error messages.  */
 574
 575 static char *
 576 make_printable_char (c)
 577      unsigned int c;
 578 {
 579   char *buf = xmalloc (5);
 580
 581   assert (c < N_CHARS);
 582   if (ISPRINT (c))
 583     {
 584       buf[0] = c;
 585       buf[1] = '\0';
 586     }
 587   else
 588     {
 589       sprintf (buf, "\\%03o", c);
 590     }
 591   return buf;
 592 }
 593
 594 /* Return a newly allocated copy of S which is suitable for printing.
 595    LEN is the number of characters in S.  Most non-printing
 596    (isprint) characters are represented by a backslash followed by
 597    3 octal digits.  However, the characters represented by \c escapes
 598    where c is one of [abfnrtv] are represented by their 2-character \c
 599    sequences.  This function is used solely for printing error messages.  */
 600
 601 static char *
 602 make_printable_str (s, len)
 603      const unsigned char *s;
 604      int len;
 605 {
 606   /* Worst case is that every character expands to a backslash
 607      followed by a 3-character octal escape sequence.  */
 608   char *printable_buf = xmalloc (4 * len + 1);
 609   char *p = printable_buf;
 610   int i;
 611
 612   for (i = 0; i < len; i++)
 613     {
 614       char buf[5];
 615       char *tmp = NULL;
 616
 617       switch (s[i])
 618         {
 619         case '\\':
 620           tmp = "\\";
 621           break;
 622         case '\007':
 623           tmp = "\\a";
 624           break;
 625         case '\b':
 626           tmp = "\\b";
 627           break;
 628         case '\f':
 629           tmp = "\\f";
 630           break;
 631         case '\n':
 632           tmp = "\\n";
 633           break;
 634         case '\r':
 635           tmp = "\\r";
 636           break;
 637         case '\t':
 638           tmp = "\\t";
 639           break;
 640         case '\v':
 641           tmp = "\\v";
 642           break;
 643         default:
 644           if (ISPRINT (s[i]))
 645             {
 646               buf[0] = s[i];
 647               buf[1] = '\0';
 648             }
 649           else
 650             sprintf (buf, "\\%03o", s[i]);
 651           tmp = buf;
 652           break;
 653         }
 654       p = stpcpy (p, tmp);
 655     }
 656   return printable_buf;
 657 }
 658
 659 /* Append a newly allocated structure representing a
 660    character C to the specification list LIST.  */
 661
 662 static void
 663 append_normal_char (list, c)
 664      struct Spec_list *list;
 665      unsigned int c;
 666 {
 667   struct List_element *new;
 668
 669   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 670   new->next = NULL;
 671   new->type = RE_NORMAL_CHAR;
 672   new->u.normal_char = c;
 673   assert (list->tail);
 674   list->tail->next = new;
 675   list->tail = new;
 676 }
 677
 678 /* Append a newly allocated structure representing the range
 679    of characters from FIRST to LAST to the specification list LIST.
 680    Return non-zero if LAST precedes FIRST in the collating sequence,
 681    zero otherwise.  This means that '[c-c]' is acceptable.  */
 682
 683 static int
 684 append_range (list, first, last)
 685      struct Spec_list *list;
 686      unsigned int first;
 687      unsigned int last;
 688 {
 689   struct List_element *new;
 690
 691   if (ORD (first) > ORD (last))
 692     {
 693       char *tmp1 = make_printable_char (first);
 694       char *tmp2 = make_printable_char (last);
 695
 696       error (0, 0,
 697        "range-endpoints of `%s-%s' are in reverse collating sequence order",
 698              tmp1, tmp2);
 699       free (tmp1);
 700       free (tmp2);
 701       return 1;
 702     }
 703   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 704   new->next = NULL;
 705   new->type = RE_RANGE;
 706   new->u.range.first_char = first;
 707   new->u.range.last_char = last;
 708   assert (list->tail);
 709   list->tail->next = new;
 710   list->tail = new;
 711   return 0;
 712 }
 713
 714 /* If CHAR_CLASS_STR is a valid character class string, append a
 715    newly allocated structure representing that character class to the end
 716    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 717    a valid string, print an error message and return non-zero.  */
 718
 719 static int
 720 append_char_class (list, char_class_str, len)
 721      struct Spec_list *list;
 722      unsigned char *char_class_str;
 723      int len;
 724 {
 725   enum Char_class char_class;
 726   struct List_element *new;
 727
 728   char_class = look_up_char_class (char_class_str);
 729   if (char_class == CC_NO_CLASS)
 730     {
 731       char *tmp = make_printable_str (char_class_str, len);
 732
 733       error (0, 0, "invalid character class `%s'", tmp);
 734       free (tmp);
 735       return 1;
 736     }
 737   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 738   new->next = NULL;
 739   new->type = RE_CHAR_CLASS;
 740   new->u.char_class = char_class;
 741   assert (list->tail);
 742   list->tail->next = new;
 743   list->tail = new;
 744   return 0;
 745 }
 746
 747 /* Append a newly allocated structure representing a [c*n]
 748    repeated character construct to the specification list LIST.
 749    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 750    is a non-negative repeat count.  */
 751
 752 static void
 753 append_repeated_char (list, the_char, repeat_count)
 754      struct Spec_list *list;
 755      unsigned int the_char;
 756      long int repeat_count;
 757 {
 758   struct List_element *new;
 759
 760   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 761   new->next = NULL;
 762   new->type = RE_REPEATED_CHAR;
 763   new->u.repeated_char.the_repeated_char = the_char;
 764   new->u.repeated_char.repeat_count = repeat_count;
 765   assert (list->tail);
 766   list->tail->next = new;
 767   list->tail = new;
 768 }
 769
 770 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 771    the length of that string, LEN, if LEN is exactly one, append
 772    a newly allocated structure representing the specified
 773    equivalence class to the specification list, LIST and return zero.
 774    If LEN is not 1, issue an error message and return non-zero.  */
 775
 776 static int
 777 append_equiv_class (list, equiv_class_str, len)
 778      struct Spec_list *list;
 779      unsigned char *equiv_class_str;
 780      int len;
 781 {
 782   struct List_element *new;
 783
 784   if (len != 1)
 785     {
 786       char *tmp = make_printable_str (equiv_class_str, len);
 787
 788       error (0, 0, "%s: equivalence class operand must be a single character",
 789              tmp);
 790       free (tmp);
 791       return 1;
 792     }
 793   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 794   new->next = NULL;
 795   new->type = RE_EQUIV_CLASS;
 796   new->u.equiv_code = *equiv_class_str;
 797   assert (list->tail);
 798   list->tail->next = new;
 799   list->tail = new;
 800   return 0;
 801 }
 802
 803 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 804    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 805    NUL bytes, and is not NUL-terminated.  */
 806
 807 static unsigned char *
 808 substr (p, first_idx, last_idx)
 809      const unsigned char *p;
 810      int first_idx;
 811      int last_idx;
 812 {
 813   int len = last_idx - first_idx + 1;
 814   unsigned char *tmp = (unsigned char *) xmalloc (len);
 815
 816   assert (first_idx <= last_idx);
 817   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 818   memcpy (tmp, p + first_idx, len);
 819   return tmp;
 820 }
 821
 822 /* Search forward starting at START_IDX for the 2-char sequence
 823    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 824    a sequence is found, return the index of the first character,
 825    otherwise return -1.  P may contain zero bytes.  */
 826
 827 static int
 828 find_closing_delim (p, start_idx, p_len, pre_bracket_char)
 829      const unsigned char *p;
 830      int start_idx;
 831      int p_len;
 832      unsigned int pre_bracket_char;
 833 {
 834   int i;
 835
 836   for (i = start_idx; i < p_len - 1; i++)
 837     if (p[i] == pre_bracket_char && p[i + 1] == ']')
 838       return i;
 839   return -1;
 840 }
 841
 842 /* Convert a string S with explicit length LEN, possibly
 843    containing embedded zero bytes, to a long integer value.
 844    If the string represents a negative value, a value larger
 845    than LONG_MAX, or if all LEN characters do not represent a
 846    valid integer, return non-zero and do not modify *VAL.
 847    Otherwise, return zero and set *VAL to the converted value.  */
 848
 849 static int
 850 non_neg_strtol (s, len, val)
 851      const unsigned char *s;
 852      int len;
 853      long int *val;
 854 {
 855   int i;
 856   long sum = 0;
 857   unsigned int base;
 858
 859   if (len <= 0)
 860     return 1;
 861   if (s[0] == '0')
 862     base = 8;
 863   else if (ISDIGIT (s[0]))
 864     base = 10;
 865   else
 866     return 1;
 867
 868   for (i = 0; i < len; i++)
 869     {
 870       int c = s[i] - '0';
 871
 872       if (c >= base || c < 0)
 873         return 1;
 874       if (i > 8 && sum > (LONG_MAX - c) / base)
 875         return 1;
 876       sum = sum * base + c;
 877     }
 878   *val = sum;
 879   return 0;
 880 }
 881
 882 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 883    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 884    return the character and the repeat count through the arg pointers,
 885    CHAR_TO_REPEAT and N, and then return the index of the closing
 886    bracket as the function value.  If the second character following
 887    the opening bracket is not `*' or if no closing bracket can be
 888    found, return -1.  If a closing bracket is found and the
 889    second char is `*', but the string between the `*' and `]' isn't
 890    empty, an octal number, or a decimal number, print an error message
 891    and return -2.  */
 892
 893 static int
 894 find_bracketed_repeat (p, start_idx, p_len, char_to_repeat, n)
 895      const unsigned char *p;
 896      int start_idx;
 897      int p_len;
 898      unsigned int *char_to_repeat;
 899      long int *n;
 900 {
 901   int i;
 902
 903   assert (start_idx + 1 < p_len);
 904   if (p[start_idx + 1] != '*')
 905     return -1;
 906
 907   for (i = start_idx + 2; i < p_len; i++)
 908     {
 909       if (p[i] == ']')
 910         {
 911           const unsigned char *digit_str;
 912           int digit_str_len = i - start_idx - 2;
 913
 914           *char_to_repeat = p[start_idx];
 915           if (digit_str_len == 0)
 916             {
 917               /* We've matched [c*] -- no explicit repeat count.  */
 918               *n = 0;
 919               return i;
 920             }
 921
 922           /* Here, we have found [c*s] where s should be a string
 923              of octal or decimal digits.  */
 924           digit_str = &p[start_idx + 2];
 925           if (non_neg_strtol (digit_str, digit_str_len, n))
 926             {
 927               char *tmp = make_printable_str (digit_str, digit_str_len);
 928               error (0, 0, "invalid repeat count `%s' in [c*n] construct", tmp);
 929               free (tmp);
 930               return -2;
 931             }
 932           return i;
 933         }
 934     }
 935   return -1;                    /* No bracket found.  */
 936 }
 937
 938 /* Convert string UNESACPED_STRING (which has been preprocessed to
 939    convert backslash-escape sequences) of length LEN characters into
 940    a linked list of the following 5 types of constructs:
 941       - [:str:] Character class where `str' is one of the 12 valid strings.
 942       - [=c=] Equivalence class where `c' is any single character.
 943       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 944           However, if `n' is present, it must be a non-negative octal or
 945           decimal integer.
 946       - r-s Range of characters from `r' to `s'.  The second endpoint must
 947           not precede the first in the current collating sequence.
 948       - c Any other character is interpreted as itself.  */
 949
 950 static int
 951 build_spec_list (unescaped_string, len, result)
 952      const unsigned char *unescaped_string;
 953      int len;
 954      struct Spec_list *result;
 955 {
 956   const unsigned char *p;
 957   int i;
 958
 959   p = unescaped_string;
 960
 961   /* The main for-loop below recognizes the 4 multi-character constructs.
 962      A character that matches (in its context) none of the multi-character
 963      constructs is classified as `normal'.  Since all multi-character
 964      constructs have at least 3 characters, any strings of length 2 or
 965      less are composed solely of normal characters.  Hence, the index of
 966      the outer for-loop runs only as far as LEN-2.  */
 967
 968   for (i = 0; i < len - 2;)
 969     {
 970       switch (p[i])
 971         {
 972           int fall_through;
 973         case '[':
 974           fall_through = 0;
 975           switch (p[i + 1])
 976             {
 977               int closing_delim_idx;
 978               int closing_bracket_idx;
 979               unsigned int char_to_repeat;
 980               long repeat_count;
 981             case ':':
 982             case '=':
 983               closing_delim_idx = find_closing_delim (p, i + 2, len, p[i + 1]);
 984               if (closing_delim_idx >= 0)
 985                 {
 986                   int parse_failed;
 987                   unsigned char *opnd_str = substr (p, i + 2,
 988                                                     closing_delim_idx - 1);
 989                   if (p[i + 1] == ':')
 990                     parse_failed = append_char_class (result, opnd_str,
 991                                      (closing_delim_idx - 1) - (i + 2) + 1);
 992                   else
 993                     parse_failed = append_equiv_class (result, opnd_str,
 994                                      (closing_delim_idx - 1) - (i + 2) + 1);
 995                   free (opnd_str);
 996
 997                   /* Return non-zero if append_*_class reports a problem.  */
 998                   if (parse_failed)
 999                     return 1;
1000                   else
1001                     i = closing_delim_idx + 2;
1002                   break;
1003                 }
1004               /* Else fall through.  This could be [:*] or [=*].  */
1005             default:
1006               /* Determine whether this is a bracketed repeat range
1007                  matching the RE \[.\*(dec_or_oct_number)?\].  */
1008               closing_bracket_idx = find_bracketed_repeat (p, i + 1,
1009                                        len, &char_to_repeat, &repeat_count);
1010               if (closing_bracket_idx >= 0)
1011                 {
1012                   append_repeated_char (result, char_to_repeat, repeat_count);
1013                   i = closing_bracket_idx + 1;
1014                   break;
1015                 }
1016               else if (closing_bracket_idx == -1)
1017                 {
1018                   fall_through = 1;
1019                 }
1020               else
1021                 /* Found a string that looked like [c*n] but the
1022                    numeric part was invalid.  */
1023                 return 1;
1024               break;
1025             }
1026           if (!fall_through)
1027             break;
1028
1029           /* Here if we've tried to match [c*n], [:str:], and [=c=]
1030              and none of them fit.  So we still have to consider the
1031              range `[-c' (from `[' to `c').  */
1032         default:
1033           /* Look ahead one char for ranges like a-z.  */
1034           if (p[i + 1] == '-')
1035             {
1036               if (append_range (result, p[i], p[i + 2]))
1037                 return 1;
1038               i += 3;
1039             }
1040           else
1041             {
1042               append_normal_char (result, p[i]);
1043               ++i;
1044             }
1045           break;
1046         }
1047     }
1048
1049   /* Now handle the (2 or fewer) remaining characters p[i]..p[len - 1].  */
1050   for (; i < len; i++)
1051     append_normal_char (result, p[i]);
1052
1053   return 0;
1054 }
1055
1056 /* Given a Spec_list S (with its saved state implicit in the values
1057    of its members `tail' and `state'), return the next single character
1058    in the expansion of S's constructs.  If the last character of S was
1059    returned on the previous call or if S was empty, this function
1060    returns -1.  For example, successive calls to get_next where S
1061    represents the spec-string 'a-d[y*3]' will return the sequence
1062    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1063    which the returned character comes is [:upper:] or [:lower:], the
1064    parameter CLASS is given a value to indicate which it was.  Otherwise
1065    CLASS is set to UL_NONE.  This value is used only when constructing
1066    the translation table to verify that any occurrences of upper and
1067    lower class constructs in the spec-strings appear in the same relative
1068    positions.  */
1069
1070 static int
1071 get_next (s, class)
1072      struct Spec_list *s;
1073      enum Upper_Lower_class *class;
1074 {
1075   struct List_element *p;
1076   int return_val;
1077   int i;
1078
1079   if (class)
1080     *class = UL_NONE;
1081
1082   if (s->state == BEGIN_STATE)
1083     {
1084       s->tail = s->head->next;
1085       s->state = NEW_ELEMENT;
1086     }
1087
1088   p = s->tail;
1089   if (p == NULL)
1090     return -1;
1091
1092   switch (p->type)
1093     {
1094     case RE_NORMAL_CHAR:
1095       return_val = p->u.normal_char;
1096       s->state = NEW_ELEMENT;
1097       s->tail = p->next;
1098       break;
1099
1100     case RE_RANGE:
1101       if (s->state == NEW_ELEMENT)
1102         s->state = ORD (p->u.range.first_char);
1103       else
1104         ++(s->state);
1105       return_val = CHR (s->state);
1106       if (s->state == ORD (p->u.range.last_char))
1107         {
1108           s->tail = p->next;
1109           s->state = NEW_ELEMENT;
1110         }
1111       break;
1112
1113     case RE_CHAR_CLASS:
1114       if (s->state == NEW_ELEMENT)
1115         {
1116           for (i = 0; i < N_CHARS; i++)
1117             if (is_char_class_member (p->u.char_class, i))
1118               break;
1119           assert (i < N_CHARS);
1120           s->state = i;
1121         }
1122       assert (is_char_class_member (p->u.char_class, s->state));
1123       return_val = CHR (s->state);
1124       for (i = s->state + 1; i < N_CHARS; i++)
1125         if (is_char_class_member (p->u.char_class, i))
1126           break;
1127       if (i < N_CHARS)
1128         s->state = i;
1129       else
1130         {
1131           s->tail = p->next;
1132           s->state = NEW_ELEMENT;
1133         }
1134       if (class)
1135         {
1136           switch (p->u.char_class)
1137             {
1138             case CC_LOWER:
1139               *class = UL_LOWER;
1140               break;
1141             case CC_UPPER:
1142               *class = UL_UPPER;
1143               break;
1144             default:
1145               /* empty */
1146               break;
1147             }
1148         }
1149       break;
1150
1151     case RE_EQUIV_CLASS:
1152       /* FIXME: this assumes that each character is alone in its own
1153          equivalence class (which appears to be correct for my
1154          LC_COLLATE.  But I don't know of any function that allows
1155          one to determine a character's equivalence class.  */
1156
1157       return_val = p->u.equiv_code;
1158       s->state = NEW_ELEMENT;
1159       s->tail = p->next;
1160       break;
1161
1162     case RE_REPEATED_CHAR:
1163       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1164       assert (p->u.repeated_char.repeat_count >= 0);
1165       if (p->u.repeated_char.repeat_count == 0)
1166         {
1167           s->tail = p->next;
1168           s->state = NEW_ELEMENT;
1169           return_val = get_next (s, class);
1170         }
1171       else
1172         {
1173           if (s->state == NEW_ELEMENT)
1174             {
1175               s->state = 0;
1176             }
1177           ++(s->state);
1178           return_val = p->u.repeated_char.the_repeated_char;
1179           if (p->u.repeated_char.repeat_count > 0
1180               && s->state == p->u.repeated_char.repeat_count)
1181             {
1182               s->tail = p->next;
1183               s->state = NEW_ELEMENT;
1184             }
1185         }
1186       break;
1187
1188     case RE_NO_TYPE:
1189       abort ();
1190       break;
1191
1192     default:
1193       abort ();
1194       break;
1195     }
1196
1197   return return_val;
1198 }
1199
1200 /* This is a minor kludge.  This function is called from
1201    get_spec_stats to determine the cardinality of a set derived
1202    from a complemented string.  It's a kludge in that some of the
1203    same operations are (duplicated) performed in set_initialize.  */
1204
1205 static int
1206 card_of_complement (s)
1207      struct Spec_list *s;
1208 {
1209   int c;
1210   int cardinality = N_CHARS;
1211   SET_TYPE in_set[N_CHARS];
1212
1213   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1214   s->state = BEGIN_STATE;
1215   while ((c = get_next (s, NULL)) != -1)
1216     if (!in_set[c]++)
1217       --cardinality;
1218   return cardinality;
1219 }
1220
1221 /* Gather statistics about the spec-list S in preparation for the tests
1222    in validate that determine the consistency of the specs.  This function
1223    is called at most twice; once for string1, and again for any string2.
1224    LEN_S1 < 0 indicates that this is the first call and that S represents
1225    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1226    constructs in string1, and we can use its value to resolve any
1227    indefinite repeat construct in S (which represents string2).  Hence,
1228    this function has the side-effect that it converts a valid [c*]
1229    construct in string2 to [c*n] where n is large enough (or 0) to give
1230    string2 the same length as string1.  For example, with the command
1231    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1232    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1233
1234 static void
1235 get_spec_stats (s, len_s1)
1236      struct Spec_list *s;
1237      int len_s1;
1238 {
1239   struct List_element *p;
1240   struct List_element *indefinite_repeat_element = NULL;
1241   int len = 0;
1242
1243   s->n_indefinite_repeats = 0;
1244   s->has_equiv_class = 0;
1245   s->has_restricted_char_class = 0;
1246   s->has_upper_or_lower = 0;
1247   for (p = s->head->next; p; p = p->next)
1248     {
1249       switch (p->type)
1250         {
1251           int i;
1252         case RE_NORMAL_CHAR:
1253           ++len;
1254           break;
1255
1256         case RE_RANGE:
1257           assert (p->u.range.last_char >= p->u.range.first_char);
1258           len += p->u.range.last_char - p->u.range.first_char + 1;
1259           break;
1260
1261         case RE_CHAR_CLASS:
1262           for (i = 0; i < N_CHARS; i++)
1263             if (is_char_class_member (p->u.char_class, i))
1264               ++len;
1265           switch (p->u.char_class)
1266             {
1267             case CC_UPPER:
1268             case CC_LOWER:
1269               s->has_upper_or_lower = 1;
1270               break;
1271             default:
1272               s->has_restricted_char_class = 1;
1273               break;
1274             }
1275           break;
1276
1277         case RE_EQUIV_CLASS:
1278           for (i = 0; i < N_CHARS; i++)
1279             if (is_equiv_class_member (p->u.equiv_code, i))
1280               ++len;
1281           s->has_equiv_class = 1;
1282           break;
1283
1284         case RE_REPEATED_CHAR:
1285           if (p->u.repeated_char.repeat_count > 0)
1286             len += p->u.repeated_char.repeat_count;
1287           else if (p->u.repeated_char.repeat_count == 0)
1288             {
1289               indefinite_repeat_element = p;
1290               ++(s->n_indefinite_repeats);
1291             }
1292           break;
1293
1294         case RE_NO_TYPE:
1295           assert (0);
1296           break;
1297         }
1298     }
1299
1300   if (len_s1 >= len && s->n_indefinite_repeats == 1)
1301     {
1302       indefinite_repeat_element->u.repeated_char.repeat_count = len_s1 - len;
1303       len = len_s1;
1304     }
1305   if (complement && len_s1 < 0)
1306     s->length = card_of_complement (s);
1307   else
1308     s->length = len;
1309   return;
1310 }
1311
1312 static void
1313 spec_init (spec_list)
1314      struct Spec_list *spec_list;
1315 {
1316   spec_list->head = spec_list->tail =
1317     (struct List_element *) xmalloc (sizeof (struct List_element));
1318   spec_list->head->next = NULL;
1319 }
1320
1321 /* This function makes two passes over the argument string S.  The first
1322    one converts all \c and \ddd escapes to their one-byte representations.
1323    The second constructs a linked specification list, SPEC_LIST, of the
1324    characters and constructs that comprise the argument string.  If either
1325    of these passes detects an error, this function returns non-zero.  */
1326
1327 static int
1328 parse_str (s, spec_list)
1329      unsigned char *s;
1330      struct Spec_list *spec_list;
1331 {
1332   int len;
1333
1334   if (unquote (s, &len))
1335     return 1;
1336   if (build_spec_list (s, len, spec_list))
1337     return 1;
1338   return 0;
1339 }
1340
1341 /* Given two specification lists, S1 and S2, and assuming that
1342    S1->length > S2->length, append a single [c*n] element to S2 where c
1343    is the last character in the expansion of S2 and n is the difference
1344    between the two lengths.
1345    Upon successful completion, S2->length is set to S1->length.  The only
1346    way this function can fail to make S2 as long as S1 is when S2 has
1347    zero-length, since in that case, there is no last character to repeat.
1348    So S2->length is required to be at least 1.
1349
1350    Providing this functionality allows the user to do some pretty
1351    non-BSD (and non-portable) things:  For example, the command
1352        tr -cs '[:upper:]0-9' '[:lower:]'
1353    is almost guaranteed to give results that depend on your collating
1354    sequence.  */
1355
1356 static void
1357 string2_extend (s1, s2)
1358      const struct Spec_list *s1;
1359      struct Spec_list *s2;
1360 {
1361   struct List_element *p;
1362   int char_to_repeat;
1363   int i;
1364
1365   assert (translating);
1366   assert (s1->length > s2->length);
1367   assert (s2->length > 0);
1368
1369   p = s2->tail;
1370   switch (p->type)
1371     {
1372     case RE_NORMAL_CHAR:
1373       char_to_repeat = p->u.normal_char;
1374       break;
1375     case RE_RANGE:
1376       char_to_repeat = p->u.range.last_char;
1377       break;
1378     case RE_CHAR_CLASS:
1379       for (i = N_CHARS; i >= 0; i--)
1380         if (is_char_class_member (p->u.char_class, i))
1381           break;
1382       assert (i >= 0);
1383       char_to_repeat = CHR (i);
1384       break;
1385
1386     case RE_REPEATED_CHAR:
1387       char_to_repeat = p->u.repeated_char.the_repeated_char;
1388       break;
1389
1390     case RE_EQUIV_CLASS:
1391       /* This shouldn't happen, because validate exits with an error
1392          if it finds an equiv class in string2 when translating.  */
1393       abort ();
1394       break;
1395
1396     case RE_NO_TYPE:
1397       abort ();
1398       break;
1399
1400     default:
1401       abort ();
1402       break;
1403     }
1404
1405   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1406   s2->length = s1->length;
1407   return;
1408 }
1409
1410 /* Die with an error message if S1 and S2 describe strings that
1411    are not valid with the given command line switches.
1412    A side effect of this function is that if a valid [c*] or
1413    [c*0] construct appears in string2, it is converted to [c*n]
1414    with a value for n that makes s2->length == s1->length.  By
1415    the same token, if the --truncate-set1 option is not
1416    given, S2 may be extended.  */
1417
1418 static void
1419 validate (s1, s2)
1420      const struct Spec_list *s1;
1421      struct Spec_list *s2;
1422 {
1423   get_spec_stats (s1, -1);
1424   if (s1->n_indefinite_repeats > 0)
1425     {
1426       error (1, 0, "the [c*] repeat construct may not appear in string1");
1427     }
1428
1429   /* FIXME: it isn't clear from the POSIX spec that this is invalid,
1430      but in the spirit of the other restrictions put on translation
1431      with character classes, this seems a logical interpretation.  */
1432   if (complement && s1->has_upper_or_lower)
1433     {
1434       error (1, 0,
1435              "character classes may not be used when translating \
1436 and complementing");
1437     }
1438
1439   if (s2)
1440     {
1441       get_spec_stats (s2, s1->length);
1442       if (s2->has_restricted_char_class)
1443         {
1444           error (1, 0,
1445                  "when translating, the only character classes that may \
1446 appear in\n\tstring2 are `upper' and `lower'");
1447         }
1448
1449       if (s2->n_indefinite_repeats > 1)
1450         {
1451           error (1, 0, "only one [c*] repeat construct may appear in string2");
1452         }
1453
1454       if (translating)
1455         {
1456           if (s2->has_equiv_class)
1457             {
1458               error (1, 0,
1459                      "[=c=] expressions may not appear in string2 \
1460 when translating");
1461             }
1462
1463           if (s1->length > s2->length)
1464             {
1465               if (!truncate_set1)
1466                 {
1467                   /* string2 must be non-empty unless --truncate-set1 is
1468                      given or string1 is empty.  */
1469
1470                   if (s2->length == 0)
1471                     error (1, 0,
1472                      "when not truncating set1, string2 must be non-empty");
1473                   string2_extend (s1, s2);
1474                 }
1475             }
1476
1477           if (complement && s2->has_upper_or_lower)
1478             error (1, 0,
1479                    "character classes may not be used when translating \
1480 and complementing");
1481         }
1482       else
1483         /* Not translating.  */
1484         {
1485           if (s2->n_indefinite_repeats > 0)
1486             error (1, 0,
1487                    "the [c*] construct may appear in string2 only \
1488 when translating");
1489         }
1490     }
1491 }
1492
1493 /* Read buffers of SIZE bytes via the function READER (if READER is
1494    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1495    read_and_delete or read_and_xlate.  After each buffer is read, it is
1496    processed and written to stdout.  The buffers are processed so that
1497    multiple consecutive occurrences of the same character in the input
1498    stream are replaced by a single occurrence of that character if the
1499    character is in the squeeze set.  */
1500
1501 static void
1502 squeeze_filter (buf, size, reader)
1503      unsigned char *buf;
1504      long int size;
1505      PFI reader;
1506 {
1507   unsigned int char_to_squeeze = NOT_A_CHAR;
1508   int i = 0;
1509   int nr = 0;
1510
1511   for (;;)
1512     {
1513       int begin;
1514
1515       if (i >= nr)
1516         {
1517           if (reader == NULL)
1518             nr = safe_read (0, (char *) buf, size);
1519           else
1520             nr = (*reader) (buf, size, NULL);
1521
1522           if (nr < 0)
1523             error (1, errno, "read error");
1524           if (nr == 0)
1525             break;
1526           i = 0;
1527         }
1528
1529       begin = i;
1530
1531       if (char_to_squeeze == NOT_A_CHAR)
1532         {
1533           int out_len;
1534           /* Here, by being a little tricky, we can get a significant
1535              performance increase in most cases when the input is
1536              reasonably large.  Since tr will modify the input only
1537              if two consecutive (and identical) input characters are
1538              in the squeeze set, we can step by two through the data
1539              when searching for a character in the squeeze set.  This
1540              means there may be a little more work in a few cases and
1541              perhaps twice as much work in the worst cases where most
1542              of the input is removed by squeezing repeats.  But most
1543              uses of this functionality seem to remove less than 20-30%
1544              of the input.  */
1545           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1546             ;                   /* empty */
1547
1548           /* There is a special case when i == nr and we've just
1549              skipped a character (the last one in buf) that is in
1550              the squeeze set.  */
1551           if (i == nr && in_squeeze_set[buf[i - 1]])
1552             --i;
1553
1554           if (i >= nr)
1555             out_len = nr - begin;
1556           else
1557             {
1558               char_to_squeeze = buf[i];
1559               /* We're about to output buf[begin..i].  */
1560               out_len = i - begin + 1;
1561
1562               /* But since we stepped by 2 in the loop above,
1563                  out_len may be one too large.  */
1564               if (i > 0 && buf[i - 1] == char_to_squeeze)
1565                 --out_len;
1566
1567               /* Advance i to the index of first character to be
1568                  considered when looking for a char different from
1569                  char_to_squeeze.  */
1570               ++i;
1571             }
1572           if (out_len > 0
1573               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1574             error (1, errno, "write error");
1575         }
1576
1577       if (char_to_squeeze != NOT_A_CHAR)
1578         {
1579           /* Advance i to index of first char != char_to_squeeze
1580              (or to nr if all the rest of the characters in this
1581              buffer are the same as char_to_squeeze).  */
1582           for (; i < nr && buf[i] == char_to_squeeze; i++)
1583             ;                   /* empty */
1584           if (i < nr)
1585             char_to_squeeze = NOT_A_CHAR;
1586           /* If (i >= nr) we've squeezed the last character in this buffer.
1587              So now we have to read a new buffer and continue comparing
1588              characters against char_to_squeeze.  */
1589         }
1590     }
1591 }
1592
1593 /* Read buffers of SIZE bytes from stdin until one is found that
1594    contains at least one character not in the delete set.  Store
1595    in the array BUF, all characters from that buffer that are not
1596    in the delete set, and return the number of characters saved
1597    or 0 upon EOF.  */
1598
1599 static long
1600 read_and_delete (buf, size, not_used)
1601      unsigned char *buf;
1602      long int size;
1603      PFI not_used;
1604 {
1605   long n_saved;
1606   static int hit_eof = 0;
1607
1608   assert (not_used == NULL);
1609   assert (size > 0);
1610
1611   if (hit_eof)
1612     return 0;
1613
1614   /* This enclosing do-while loop is to make sure that
1615      we don't return zero (indicating EOF) when we've
1616      just deleted all the characters in a buffer.  */
1617   do
1618     {
1619       int i;
1620       int nr = safe_read (0, (char *) buf, size);
1621
1622       if (nr < 0)
1623         error (1, errno, "read error");
1624       if (nr == 0)
1625         {
1626           hit_eof = 1;
1627           return 0;
1628         }
1629
1630       /* This first loop may be a waste of code, but gives much
1631          better performance when no characters are deleted in
1632          the beginning of a buffer.  It just avoids the copying
1633          of buf[i] into buf[n_saved] when it would be a NOP.  */
1634
1635       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1636         /* empty */ ;
1637       n_saved = i;
1638
1639       for (++i; i < nr; i++)
1640         if (!in_delete_set[buf[i]])
1641           buf[n_saved++] = buf[i];
1642     }
1643   while (n_saved == 0);
1644
1645   return n_saved;
1646 }
1647
1648 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1649    perform the in-place and one-to-one mapping specified by the global
1650    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1651
1652 static long
1653 read_and_xlate (buf, size, not_used)
1654      unsigned char *buf;
1655      long int size;
1656      PFI not_used;
1657 {
1658   long chars_read = 0;
1659   static int hit_eof = 0;
1660   int i;
1661
1662   assert (not_used == NULL);
1663   assert (size > 0);
1664
1665   if (hit_eof)
1666     return 0;
1667
1668   chars_read = safe_read (0, (char *) buf, size);
1669   if (chars_read < 0)
1670     error (1, errno, "read error");
1671   if (chars_read == 0)
1672     {
1673       hit_eof = 1;
1674       return 0;
1675     }
1676
1677   for (i = 0; i < chars_read; i++)
1678     buf[i] = xlate[buf[i]];
1679
1680   return chars_read;
1681 }
1682
1683 /* Initialize a boolean membership set IN_SET with the character
1684    values obtained by traversing the linked list of constructs S
1685    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1686    non-zero the resulting set is complemented.  */
1687
1688 static void
1689 set_initialize (s, complement_this_set, in_set)
1690      struct Spec_list *s;
1691      int complement_this_set;
1692      SET_TYPE *in_set;
1693 {
1694   int c;
1695   int i;
1696
1697   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1698   s->state = BEGIN_STATE;
1699   while ((c = get_next (s, NULL)) != -1)
1700     in_set[c] = 1;
1701   if (complement_this_set)
1702     for (i = 0; i < N_CHARS; i++)
1703       in_set[i] = (!in_set[i]);
1704 }
1705
1706 void
1707 main (argc, argv)
1708      int argc;
1709      char **argv;
1710 {
1711   int c;
1712   int non_option_args;
1713   struct Spec_list buf1, buf2;
1714   struct Spec_list *s1 = &buf1;
1715   struct Spec_list *s2 = &buf2;
1716
1717   program_name = argv[0];
1718
1719   while ((c = getopt_long (argc, argv, "cdst", long_options,
1720                            (int *) 0)) != EOF)
1721     {
1722       switch (c)
1723         {
1724         case 0:
1725           break;
1726
1727         case 'c':
1728           complement = 1;
1729           break;
1730
1731         case 'd':
1732           delete = 1;
1733           break;
1734
1735         case 's':
1736           squeeze_repeats = 1;
1737           break;
1738
1739         case 't':
1740           truncate_set1 = 1;
1741           break;
1742
1743         default:
1744           usage (2);
1745           break;
1746         }
1747     }
1748
1749   if (show_version)
1750     {
1751       printf ("tr - %s\n", version_string);
1752       exit (0);
1753     }
1754
1755   if (show_help)
1756     usage (0);
1757
1758   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1759
1760   non_option_args = argc - optind;
1761   translating = (non_option_args == 2 && !delete);
1762
1763   /* Change this test if it is valid to give tr no options and
1764      no args at all.  POSIX doesn't specifically say anything
1765      either way, but it looks like they implied it's invalid
1766      by omission.  If you want to make tr do a slow imitation
1767      of `cat' use `tr a a'.  */
1768   if (non_option_args > 2)
1769     {
1770       error (0, 0, "too many arguments");
1771       usage (2);
1772     }
1773
1774   if (!delete && !squeeze_repeats && non_option_args != 2)
1775     error (1, 0, "two strings must be given when translating");
1776
1777   if (delete && squeeze_repeats && non_option_args != 2)
1778     error (1, 0, "two strings must be given when both \
1779 deleting and squeezing repeats");
1780
1781   /* If --delete is given without --squeeze-repeats, then
1782      only one string argument may be specified.  But POSIX
1783      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1784      is set, pretend we never saw string2.  But I think
1785      this deserves a fatal error, so that's the default.  */
1786   if ((delete && !squeeze_repeats) && non_option_args != 1)
1787     {
1788       if (posix_pedantic && non_option_args == 2)
1789         --non_option_args;
1790       else
1791         error (1, 0,
1792                "only one string may be given when deleting \
1793 without squeezing repeats");
1794     }
1795
1796   if (squeeze_repeats && non_option_args == 0)
1797     error (1, 0, "at least one string must be given when squeezing repeats");
1798
1799   spec_init (s1);
1800   if (parse_str ((unsigned char *) argv[optind], s1))
1801     exit (1);
1802
1803   if (non_option_args == 2)
1804     {
1805       spec_init (s2);
1806       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1807         exit (1);
1808     }
1809   else
1810     s2 = NULL;
1811
1812   validate (s1, s2);
1813
1814   if (squeeze_repeats && non_option_args == 1)
1815     {
1816       set_initialize (s1, complement, in_squeeze_set);
1817       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1818     }
1819   else if (delete && non_option_args == 1)
1820     {
1821       int nr;
1822
1823       set_initialize (s1, complement, in_delete_set);
1824       do
1825         {
1826           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1827           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1828             error (1, errno, "write error");
1829         }
1830       while (nr > 0);
1831     }
1832   else if (squeeze_repeats && delete && non_option_args == 2)
1833     {
1834       set_initialize (s1, complement, in_delete_set);
1835       set_initialize (s2, 0, in_squeeze_set);
1836       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1837     }
1838   else if (translating)
1839     {
1840       if (complement)
1841         {
1842           int i;
1843           SET_TYPE *in_s1 = in_delete_set;
1844
1845           set_initialize (s1, 0, in_s1);
1846           s2->state = BEGIN_STATE;
1847           for (i = 0; i < N_CHARS; i++)
1848             xlate[i] = i;
1849           for (i = 0; i < N_CHARS; i++)
1850             {
1851               if (!in_s1[i])
1852                 {
1853                   int ch = get_next (s2, NULL);
1854                   assert (ch != -1 || truncate_set1);
1855                   if (ch == -1)
1856                     {
1857                       /* This will happen when tr is invoked like e.g.
1858                          tr -cs A-Za-z0-9 '\012'.  */
1859                       break;
1860                     }
1861                   xlate[i] = ch;
1862                 }
1863             }
1864           assert (get_next (s2, NULL) == -1 || truncate_set1);
1865         }
1866       else
1867         {
1868           int c1, c2;
1869           int i;
1870           enum Upper_Lower_class class_s1;
1871           enum Upper_Lower_class class_s2;
1872
1873           for (i = 0; i < N_CHARS; i++)
1874             xlate[i] = i;
1875           s1->state = BEGIN_STATE;
1876           s2->state = BEGIN_STATE;
1877           for (;;)
1878             {
1879               c1 = get_next (s1, &class_s1);
1880               c2 = get_next (s2, &class_s2);
1881               if (!class_ok[(int) class_s1][(int) class_s2])
1882                 error (1, 0,
1883                      "misaligned or mismatched upper and/or lower classes");
1884               /* The following should have been checked by validate...  */
1885               if (c2 == -1)
1886                 break;
1887               xlate[c1] = c2;
1888             }
1889           assert (c1 == -1 || truncate_set1);
1890         }
1891       if (squeeze_repeats)
1892         {
1893           set_initialize (s2, 0, in_squeeze_set);
1894           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
1895         }
1896       else
1897         {
1898           int chars_read;
1899
1900           do
1901             {
1902               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
1903               if (chars_read > 0
1904                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
1905                 error (1, errno, "write error");
1906             }
1907           while (chars_read > 0);
1908         }
1909     }
1910
1911   if (fclose (stdout) == EOF)
1912     error (2, errno, "write error");
1913
1914   if (close (0) != 0)
1915     error (2, errno, "standard input");
1916
1917   exit (0);
1918 }