src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 1991 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  17
  18 /* Written by Jim Meyering, meyering@cs.utexas.edu. */
  19
  20 /* Get isblank from GNU libc.  */
  21 #define _GNU_SOURCE
  22
  23 #include <stdio.h>
  24 #include <assert.h>
  25 #include <errno.h>
  26 #include <sys/types.h>
  27 #include "getopt.h"
  28 #include "system.h"
  29 #include "version.h"
  30
  31 #ifndef LONG_MAX
  32 #define LONG_MAX 0x7FFFFFFF
  33 #endif
  34
  35 #ifndef UCHAR_MAX
  36 #define UCHAR_MAX 0xFF
  37 #endif
  38
  39 #define N_CHARS (UCHAR_MAX + 1)
  40
  41 /* A pointer to a function that returns an int. */
  42 typedef int (*PFI) ();
  43
  44 /* Convert from character C to its index in the collating
  45    sequence array.  Just cast to an unsigned int to avoid
  46    problems with sign-extension. */
  47 #define ORD(c) (unsigned int)(c)
  48
  49 /* The inverse of ORD. */
  50 #define CHR(i) (unsigned char)(i)
  51
  52 /* The value for Spec_list->state that indicates to
  53    get_next that it should initialize the tail pointer.
  54    Its value doesn't matter as long as it can't be
  55    confused with a valid character code. */
  56 #define BEGIN_STATE (2 * N_CHARS)
  57
  58 /* The value for Spec_list->state that indicates to
  59    get_next that the element pointed to by Spec_list->tail is
  60    being considered for the first time on this pass through the
  61    list -- it indicates that get_next should make any necessary
  62    initializations. */
  63 #define NEW_ELEMENT (BEGIN_STATE + 1)
  64
  65 /* A value distinct from any character that may have been stored in a
  66    buffer as the result of a block-read in the function squeeze_filter. */
  67 #define NOT_A_CHAR (unsigned int)(-1)
  68
  69 /* The following (but not CC_NO_CLASS) are indices into the array of
  70    valid character class strings. */
  71 enum Char_class
  72   {
  73     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  74     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  75     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  76     CC_NO_CLASS = 9999
  77   };
  78
  79 /* Character class to which a character (returned by get_next) belonged;
  80    but it is set only if the construct from which the character was obtained
  81    was one of the character classes [:upper:] or [:lower:].  The value
  82    is used only when translating and then, only to make sure that upper
  83    and lower class constructs have the same relative positions in string1
  84    and string2. */
  85 enum Upper_Lower_class
  86   {
  87     UL_LOWER = 0,
  88     UL_UPPER = 1,
  89     UL_NONE = 2
  90   };
  91
  92 /* A shortcut to ensure that when constructing the translation array,
  93    one of the values returned by paired calls to get_next (from s1 and s2)
  94    is from [:upper:] and the other is from [:lower:], or neither is from
  95    upper or lower.  In fact, no other character classes are allowed when
  96    translating, but that condition is tested elsewhere.  This array is
  97    indexed by values of type enum Upper_Lower_class. */
  98 static int const class_ok[3][3] =
  99 {
 100   {0, 1, 0},
 101   {1, 0, 0},
 102   {0, 0, 1}
 103 };
 104
 105 /* The type of a List_element.  See build_spec_list for more details. */
 106 enum Range_element_type
 107   {
 108     RE_NO_TYPE = 0,
 109     RE_NORMAL_CHAR,
 110     RE_RANGE,
 111     RE_CHAR_CLASS,
 112     RE_EQUIV_CLASS,
 113     RE_REPEATED_CHAR
 114   };
 115
 116 /* One construct in one of tr's argument strings.
 117    For example, consider the POSIX version of the classic tr command:
 118        tr -cs 'a-zA-Z_' '[\n*]'
 119    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 120    and a single normal character, `_'.  String2 has one construct. */
 121 struct List_element
 122   {
 123     enum Range_element_type type;
 124     struct List_element *next;
 125     union
 126       {
 127         int normal_char;
 128         struct                  /* unnamed */
 129           {
 130             unsigned int first_char;
 131             unsigned int last_char;
 132           }
 133         range;
 134         enum Char_class char_class;
 135         int equiv_code;
 136         struct                  /* unnamed */
 137           {
 138             unsigned int the_repeated_char;
 139             long repeat_count;
 140           }
 141         repeated_char;
 142       }
 143     u;
 144   };
 145
 146 /* Each of tr's argument strings is parsed into a form that is easier
 147    to work with: a linked list of constructs (struct List_element).
 148    Each Spec_list structure also encapsulates various attributes of
 149    the corresponding argument string.  The attributes are used mainly
 150    to verify that the strings are legal in the context of any options
 151    specified (like -s, -d, or -c).  The main exception is the member
 152    `tail', which is first used to construct the list.  After construction,
 153    it is used by get_next to save its state when traversing the list.
 154    The member `state' serves a similar function. */
 155 struct Spec_list
 156   {
 157     /* Points to the head of the list of range elements.
 158        The first struct is a dummy; its members are never used. */
 159     struct List_element *head;
 160
 161     /* When appending, points to the last element.  When traversing via
 162        get_next(), points to the element to process next.  Setting
 163        Spec_list.state to the value BEGIN_STATE before calling get_next
 164        signals get_next to initialize tail to point to head->next. */
 165     struct List_element *tail;
 166
 167     /* Used to save state between calls to get_next(). */
 168     unsigned int state;
 169
 170     /* Length, in the sense that length('a-z[:digit:]123abc')
 171        is 42 ( = 26 + 10 + 6). */
 172     int length;
 173
 174     /* The number of [c*] and [c*0] constructs that appear in this spec. */
 175     int n_indefinite_repeats;
 176
 177     /* Non-zero if this spec contains at least one equivalence
 178        class construct e.g. [=c=]. */
 179     int has_equiv_class;
 180
 181     /* Non-zero if this spec contains at least one of [:upper:] or
 182        [:lower:] class constructs. */
 183     int has_upper_or_lower;
 184
 185     /* Non-zero if this spec contains at least one of the character class
 186        constructs (all but upper and lower) that aren't allowed in s2. */
 187     int has_restricted_char_class;
 188   };
 189
 190 char *xmalloc ();
 191 char *stpcpy ();
 192 void error ();
 193
 194 /* The name by which this program was run. */
 195 char *program_name;
 196
 197 /* When non-zero, each sequence in the input of a repeated character
 198    (call it c) is replaced (in the output) by a single occurrence of c
 199    for every c in the squeeze set. */
 200 static int squeeze_repeats = 0;
 201
 202 /* When non-zero, removes characters in the delete set from input. */
 203 static int delete = 0;
 204
 205 /* Use the complement of set1 in place of set1. */
 206 static int complement = 0;
 207
 208 /* When non-zero, this flag causes GNU tr to provide strict
 209    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 210    says that when -d is used without -s, string2 (if present)
 211    must be ignored.  Silently ignoring arguments is a bad idea.
 212    The default GNU behavior is to give a usage message and exit.
 213    Additionally, when this flag is non-zero, tr prints warnings
 214    on stderr if it is being used in a manner that is not portable.
 215    Applicable warnings are given by default, but are suppressed
 216    if the environment variable `POSIXLY_CORRECT' is set, since
 217    being POSIX conformant means we can't issue such messages.
 218    Warnings on the following topics are suppressed when this
 219    variable is non-zero:
 220    1. Ambiguous octal escapes. */
 221 static int posix_pedantic;
 222
 223 /* When tr is performing translation and string1 is longer than string2,
 224    POSIX says that the result is undefined.  That gives the implementor
 225    of a POSIX conforming version of tr two reasonable choices for the
 226    semantics of this case.
 227
 228    * The BSD tr pads string2 to the length of string1 by
 229    repeating the last character in string2.
 230
 231    * System V tr ignores characters in string1 that have no
 232    corresponding character in string2.  That is, string1 is effectively
 233    truncated to the length of string2.
 234
 235    When non-zero, this flag causes GNU tr to imitate the behavior
 236    of System V tr when translating with string1 longer than string2.
 237    The default is to emulate BSD tr.  This flag is ignored in modes where
 238    no translation is performed.  Emulating the System V tr
 239    in this exceptional case causes the relatively common BSD idiom:
 240
 241        tr -cs A-Za-z0-9 '\012'
 242
 243    to break (it would convert only zero bytes, rather than all
 244    non-alphanumerics, to newlines).
 245
 246    WARNING: This switch does not provide general BSD or System V
 247    compatibility.  For example, it doesn't disable the interpretation
 248    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 249    some unfortunate coincidence you use such constructs in scripts
 250    expecting to use some other version of tr, the scripts will break. */
 251 static int truncate_set1 = 0;
 252
 253 /* An alias for (!delete && non_option_args == 2).
 254    It is set in main and used there and in validate(). */
 255 static int translating;
 256
 257 #ifndef BUFSIZ
 258 #define BUFSIZ 8192
 259 #endif
 260
 261 #define IO_BUF_SIZE BUFSIZ
 262 static unsigned char io_buf[IO_BUF_SIZE];
 263
 264 static char const *const char_class_name[] =
 265 {
 266   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 267   "lower", "print", "punct", "space", "upper", "xdigit"
 268 };
 269 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 270
 271 typedef char SET_TYPE;
 272
 273 /* Array of boolean values.  A character `c' is a member of the
 274    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 275    set is defined by the last (possibly, the only) string argument
 276    on the command line when the squeeze option is given.  */
 277 static SET_TYPE in_squeeze_set[N_CHARS];
 278
 279 /* Array of boolean values.  A character `c' is a member of the
 280    delete set if and only if in_delete_set[c] is true.  The delete
 281    set is defined by the first (or only) string argument on the
 282    command line when the delete option is given.  */
 283 static SET_TYPE in_delete_set[N_CHARS];
 284
 285 /* Array of character values defining the translation (if any) that
 286    tr is to perform.  Translation is performed only when there are
 287    two specification strings and the delete switch is not given. */
 288 static char xlate[N_CHARS];
 289
 290 /* If non-zero, display usage information and exit.  */
 291 static int flag_help;
 292
 293 /* If non-zero, print the version on standard error.  */
 294 static int flag_version;
 295
 296 static struct option const long_options[] =
 297 {
 298   {"complement", no_argument, NULL, 'c'},
 299   {"delete", no_argument, NULL, 'd'},
 300   {"squeeze-repeats", no_argument, NULL, 's'},
 301   {"truncate-set1", no_argument, NULL, 't'},
 302   {"help", no_argument, &flag_help, 1},
 303   {"version", no_argument, &flag_version, 1},
 304   {NULL, 0, NULL, 0}
 305 };
 306 \f
 307 static void
 308 usage ()
 309 {
 310   fprintf (stderr, "\
 311 Usage: %s [-cdst] [--complement] [--delete] [--squeeze-repeats]\n\
 312        [--truncate-set1] [--help] [--version] string1 [string2]\n",
 313            program_name);
 314   exit (2);
 315 }
 316
 317 /* Return non-zero if the character C is a member of the
 318    equivalence class containing the character EQUIV_CLASS. */
 319
 320 static int
 321 is_equiv_class_member (equiv_class, c)
 322      unsigned int equiv_class;
 323      unsigned int c;
 324 {
 325   return (equiv_class == c);
 326 }
 327
 328 /* Return non-zero if the character C is a member of the
 329    character class CHAR_CLASS. */
 330
 331 static int
 332 is_char_class_member (char_class, c)
 333      enum Char_class char_class;
 334      unsigned int c;
 335 {
 336   switch (char_class)
 337     {
 338     case CC_ALNUM:
 339       return ISALNUM (c);
 340       break;
 341     case CC_ALPHA:
 342       return ISALPHA (c);
 343       break;
 344     case CC_BLANK:
 345       return ISBLANK (c);
 346       break;
 347     case CC_CNTRL:
 348       return ISCNTRL (c);
 349       break;
 350     case CC_DIGIT:
 351       return ISDIGIT (c);
 352       break;
 353     case CC_GRAPH:
 354       return ISGRAPH (c);
 355       break;
 356     case CC_LOWER:
 357       return ISLOWER (c);
 358       break;
 359     case CC_PRINT:
 360       return ISPRINT (c);
 361       break;
 362     case CC_PUNCT:
 363       return ISPUNCT (c);
 364       break;
 365     case CC_SPACE:
 366       return ISSPACE (c);
 367       break;
 368     case CC_UPPER:
 369       return ISUPPER (c);
 370       break;
 371     case CC_XDIGIT:
 372       return ISXDIGIT (c);
 373       break;
 374     default:
 375       abort ();
 376       break;
 377     }
 378 }
 379
 380 /* Perform the first pass over each range-spec argument S,
 381    converting all \c and \ddd escapes to their one-byte representations.
 382    The conversion is done in-place, so S must point to writable
 383    storage.  If an illegal quote sequence is found, an error message is
 384    printed and the function returns non-zero.  Otherwise the length of
 385    the resulting string is returned through LEN and the function returns 0.
 386    The resulting array of characters may contain zero-bytes; however,
 387    on input, S is assumed to be null-terminated, and hence
 388    cannot contain actual (non-escaped) zero bytes. */
 389
 390 static int
 391 unquote (s, len)
 392      unsigned char *s;
 393      int *len;
 394 {
 395   int i, j;
 396
 397   j = 0;
 398   for (i = 0; s[i]; i++)
 399     {
 400       switch (s[i])
 401         {
 402           int c;
 403         case '\\':
 404           switch (s[i + 1])
 405             {
 406               int oct_digit;
 407             case '\\':
 408               c = '\\';
 409               break;
 410             case 'a':
 411               c = '\007';
 412               break;
 413             case 'b':
 414               c = '\b';
 415               break;
 416             case 'f':
 417               c = '\f';
 418               break;
 419             case 'n':
 420               c = '\n';
 421               break;
 422             case 'r':
 423               c = '\r';
 424               break;
 425             case 't':
 426               c = '\t';
 427               break;
 428             case 'v':
 429               c = '\v';
 430               break;
 431             case '0':
 432             case '1':
 433             case '2':
 434             case '3':
 435             case '4':
 436             case '5':
 437             case '6':
 438             case '7':
 439               c = s[i + 1] - '0';
 440               oct_digit = s[i + 2] - '0';
 441               if (0 <= oct_digit && oct_digit <= 7)
 442                 {
 443                   c = 8 * c + oct_digit;
 444                   ++i;
 445                   oct_digit = s[i + 2] - '0';
 446                   if (0 <= oct_digit && oct_digit <= 7)
 447                     {
 448                       if (8 * c + oct_digit < N_CHARS)
 449                         {
 450                           c = 8 * c + oct_digit;
 451                           ++i;
 452                         }
 453                       else if (!posix_pedantic)
 454                         {
 455                           /* Any octal number larger than 0377 won't
 456                              fit in 8 bits.  So we stop when adding the
 457                              next digit would put us over the limit and
 458                              give a warning about the ambiguity.  POSIX
 459                              isn't clear on this, but one person has said
 460                              that in his interpretation, POSIX says tr
 461                              can't even give a warning. */
 462                           error (0, 0, "warning: the ambiguous octal escape \
 463 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'",
 464                                  s[i], s[i + 1], s[i + 2],
 465                                  s[i], s[i + 1], s[i + 2]);
 466                         }
 467                     }
 468                 }
 469               break;
 470             case '\0':
 471               error (0, 0, "invalid backslash escape at end of string");
 472               return 1;
 473               break;
 474             default:
 475               error (0, 0, "invalid backslash escape `\\%c'", s[i + 1]);
 476               return 1;
 477               break;
 478             }
 479           ++i;
 480           s[j++] = c;
 481           break;
 482         default:
 483           s[j++] = s[i];
 484           break;
 485         }
 486     }
 487   *len = j;
 488   return 0;
 489 }
 490
 491 /* If CLASS_STR is a valid character class string, return its index
 492    in the global char_class_name array.  Otherwise, return CC_NO_CLASS. */
 493
 494 static enum Char_class
 495 look_up_char_class (class_str)
 496      unsigned char *class_str;
 497 {
 498   unsigned int i;
 499
 500   for (i = 0; i < N_CHAR_CLASSES; i++)
 501     if (strcmp ((const char *) class_str, char_class_name[i]) == 0)
 502       return (enum Char_class) i;
 503   return CC_NO_CLASS;
 504 }
 505
 506 /* Return a newly allocated string with a printable version of C.
 507    This function is used solely for formatting error messages. */
 508
 509 static char *
 510 make_printable_char (c)
 511      unsigned int c;
 512 {
 513   char *buf = xmalloc (5);
 514
 515   assert (c < N_CHARS);
 516   if (ISPRINT (c))
 517     {
 518       buf[0] = c;
 519       buf[1] = '\0';
 520     }
 521   else
 522     {
 523       sprintf (buf, "\\%03o", c);
 524     }
 525   return buf;
 526 }
 527
 528 /* Return a newly allocated copy of S which is suitable for printing.
 529    LEN is the number of characters in S.  Most non-printing
 530    (isprint) characters are represented by a backslash followed by
 531    3 octal digits.  However, the characters represented by \c escapes
 532    where c is one of [abfnrtv] are represented by their 2-character \c
 533    sequences.  This function is used solely for printing error messages. */
 534
 535 static char *
 536 make_printable_str (s, len)
 537      unsigned char *s;
 538      int len;
 539 {
 540   /* Worst case is that every character expands to a backslash
 541      followed by a 3-character octal escape sequence. */
 542   char *printable_buf = xmalloc (4 * len + 1);
 543   char *p = printable_buf;
 544   int i;
 545
 546   for (i = 0; i < len; i++)
 547     {
 548       char buf[5];
 549       char *tmp = NULL;
 550
 551       switch (s[i])
 552         {
 553         case '\\':
 554           tmp = "\\";
 555           break;
 556         case '\007':
 557           tmp = "\\a";
 558           break;
 559         case '\b':
 560           tmp = "\\b";
 561           break;
 562         case '\f':
 563           tmp = "\\f";
 564           break;
 565         case '\n':
 566           tmp = "\\n";
 567           break;
 568         case '\r':
 569           tmp = "\\r";
 570           break;
 571         case '\t':
 572           tmp = "\\t";
 573           break;
 574         case '\v':
 575           tmp = "\\v";
 576           break;
 577         default:
 578           if (ISPRINT (s[i]))
 579             {
 580               buf[0] = s[i];
 581               buf[1] = '\0';
 582             }
 583           else
 584             sprintf (buf, "\\%03o", s[i]);
 585           tmp = buf;
 586           break;
 587         }
 588       p = stpcpy (p, tmp);
 589     }
 590   return printable_buf;
 591 }
 592
 593 /* Append a newly allocated structure representing a
 594    character C to the specification list LIST. */
 595
 596 static void
 597 append_normal_char (list, c)
 598      struct Spec_list *list;
 599      unsigned int c;
 600 {
 601   struct List_element *new;
 602
 603   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 604   new->next = NULL;
 605   new->type = RE_NORMAL_CHAR;
 606   new->u.normal_char = c;
 607   assert (list->tail);
 608   list->tail->next = new;
 609   list->tail = new;
 610 }
 611
 612 /* Append a newly allocated structure representing the range
 613    of characters from FIRST to LAST to the specification list LIST.
 614    Return non-zero if LAST precedes FIRST in the collating sequence,
 615    zero otherwise.  This means that '[c-c]' is acceptable.  */
 616
 617 static int
 618 append_range (list, first, last)
 619      struct Spec_list *list;
 620      unsigned int first;
 621      unsigned int last;
 622 {
 623   struct List_element *new;
 624
 625   if (ORD (first) > ORD (last))
 626     {
 627       char *tmp1 = make_printable_char (first);
 628       char *tmp2 = make_printable_char (last);
 629
 630       error (0, 0,
 631        "range-endpoints of `%s-%s' are in reverse collating sequence order",
 632              tmp1, tmp2);
 633       free (tmp1);
 634       free (tmp2);
 635       return 1;
 636     }
 637   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 638   new->next = NULL;
 639   new->type = RE_RANGE;
 640   new->u.range.first_char = first;
 641   new->u.range.last_char = last;
 642   assert (list->tail);
 643   list->tail->next = new;
 644   list->tail = new;
 645   return 0;
 646 }
 647
 648 /* If CHAR_CLASS_STR is a valid character class string, append a
 649    newly allocated structure representing that character class to the end
 650    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 651    a valid string, give an error message and return non-zero. */
 652
 653 static int
 654 append_char_class (list, char_class_str, len)
 655      struct Spec_list *list;
 656      unsigned char *char_class_str;
 657      int len;
 658 {
 659   enum Char_class char_class;
 660   struct List_element *new;
 661
 662   char_class = look_up_char_class (char_class_str);
 663   if (char_class == CC_NO_CLASS)
 664     {
 665       char *tmp = make_printable_str (char_class_str, len);
 666
 667       error (0, 0, "invalid character class `%s'", tmp);
 668       free (tmp);
 669       return 1;
 670     }
 671   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 672   new->next = NULL;
 673   new->type = RE_CHAR_CLASS;
 674   new->u.char_class = char_class;
 675   assert (list->tail);
 676   list->tail->next = new;
 677   list->tail = new;
 678   return 0;
 679 }
 680
 681 /* Append a newly allocated structure representing a [c*n]
 682    repeated character construct, to the specification list LIST.
 683    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 684    is non-negative repeat count. */
 685
 686 static void
 687 append_repeated_char (list, the_char, repeat_count)
 688      struct Spec_list *list;
 689      unsigned int the_char;
 690      long int repeat_count;
 691 {
 692   struct List_element *new;
 693
 694   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 695   new->next = NULL;
 696   new->type = RE_REPEATED_CHAR;
 697   new->u.repeated_char.the_repeated_char = the_char;
 698   new->u.repeated_char.repeat_count = repeat_count;
 699   assert (list->tail);
 700   list->tail->next = new;
 701   list->tail = new;
 702 }
 703
 704 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 705    the length of that string, LEN, if LEN is exactly one, append
 706    a newly allocated structure representing the specified
 707    equivalence class to the specification list, LIST and return zero.
 708    If LEN is not 1, issue an error message and return non-zero. */
 709
 710 static int
 711 append_equiv_class (list, equiv_class_str, len)
 712      struct Spec_list *list;
 713      unsigned char *equiv_class_str;
 714      int len;
 715 {
 716   struct List_element *new;
 717
 718   if (len != 1)
 719     {
 720       char *tmp = make_printable_str (equiv_class_str, len);
 721
 722       error (0, 0, "%s: equivalence class operand must be a single character",
 723              tmp);
 724       free (tmp);
 725       return 1;
 726     }
 727   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 728   new->next = NULL;
 729   new->type = RE_EQUIV_CLASS;
 730   new->u.equiv_code = *equiv_class_str;
 731   assert (list->tail);
 732   list->tail->next = new;
 733   list->tail = new;
 734   return 0;
 735 }
 736
 737 /* Return a newly allocated copy of P[FIRST_IDX..LAST_IDX]. */
 738
 739 static unsigned char *
 740 substr (p, first_idx, last_idx)
 741      unsigned char *p;
 742      int first_idx;
 743      int last_idx;
 744 {
 745   int len = last_idx - first_idx + 1;
 746   unsigned char *tmp = (unsigned char *) xmalloc (len);
 747
 748   assert (first_idx <= last_idx);
 749   /* We must use bcopy or memcopy rather than strncpy
 750      because `p' may contain zero-bytes. */
 751   bcopy (p + first_idx, tmp, len);
 752   tmp[len] = '\0';
 753   return tmp;
 754 }
 755
 756 /* Search forward starting at START_IDX for the 2-char sequence
 757    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 758    a sequence is found, return the index of the first character,
 759    otherwise return -1.  P may contain zero bytes. */
 760
 761 static int
 762 find_closing_delim (p, start_idx, p_len, pre_bracket_char)
 763      unsigned char *p;
 764      int start_idx;
 765      int p_len;
 766      unsigned int pre_bracket_char;
 767 {
 768   int i;
 769
 770   for (i = start_idx; i < p_len - 1; i++)
 771     if (p[i] == pre_bracket_char && p[i + 1] == ']')
 772       return i;
 773   return -1;
 774 }
 775
 776 /* Convert a string S with explicit length LEN, possibly
 777    containing embedded zero bytes, to a long integer value.
 778    If the string represents a negative value, a value larger
 779    than LONG_MAX, or if all LEN characters do not represent a
 780    valid integer, return non-zero and do not modify *VAL.
 781    Otherwise, return zero and set *VAL to the converted value. */
 782
 783 static int
 784 non_neg_strtol (s, len, val)
 785      unsigned char *s;
 786      int len;
 787      long int *val;
 788 {
 789   int i;
 790   long sum = 0;
 791   unsigned int base;
 792
 793   if (len <= 0)
 794     return 1;
 795   if (s[0] == '0')
 796     base = 8;
 797   else if (ISDIGIT (s[0]))
 798     base = 10;
 799   else
 800     return 1;
 801
 802   for (i = 0; i < len; i++)
 803     {
 804       int c = s[i] - '0';
 805
 806       if (c >= base || c < 0)
 807         return 1;
 808       if (i > 8 && sum > (LONG_MAX - c) / base)
 809         return 1;
 810       sum = sum * base + c;
 811     }
 812   *val = sum;
 813   return 0;
 814 }
 815
 816 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 817    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 818    return the character and the repeat count through the arg pointers,
 819    CHAR_TO_REPEAT and N, and then return the index of the closing
 820    bracket as the function value.  If the second character following
 821    the opening bracket is not `*' or if no closing bracket can be
 822    found, return -1.  If a closing bracket is found and the
 823    second char is `*', but the string between the `*' and `]' isn't
 824    empty, an octal number, or a decimal number, print an error message
 825    and return -2. */
 826
 827 static int
 828 find_bracketed_repeat (p, start_idx, p_len, char_to_repeat, n)
 829      unsigned char *p;
 830      int start_idx;
 831      int p_len;
 832      unsigned int *char_to_repeat;
 833      long int *n;
 834 {
 835   int i;
 836
 837   assert (start_idx + 1 < p_len);
 838   if (p[start_idx + 1] != '*')
 839     return -1;
 840
 841   for (i = start_idx + 2; i < p_len; i++)
 842     {
 843       if (p[i] == ']')
 844         {
 845           unsigned char *digit_str;
 846           int digit_str_len = i - start_idx - 2;
 847
 848           *char_to_repeat = p[start_idx];
 849           if (digit_str_len == 0)
 850             {
 851               /* We've matched [c*] -- no explicit repeat count. */
 852               *n = 0;
 853               return i;
 854             }
 855
 856           /* Here, we have found [c*s] where s should be a string
 857              of octal or decimal digits. */
 858           digit_str = &p[start_idx + 2];
 859           if (non_neg_strtol (digit_str, digit_str_len, n))
 860             {
 861               char *tmp = make_printable_str (digit_str, digit_str_len);
 862               error (0, 0, "invalid repeat count `%s' in [c*n] construct", tmp);
 863               free (tmp);
 864               return -2;
 865             }
 866           return i;
 867         }
 868     }
 869   return -1;                    /* No bracket found. */
 870 }
 871
 872 /* Convert string UNESACPED_STRING (which has been preprocessed to
 873    convert backslash-escape sequences) of length LEN characters into
 874    a linked list of the following 5 types of constructs:
 875       - [:str:] Character class where `str' is one of the 12 valid strings.
 876       - [=c=] Equivalence class where `c' is any single character.
 877       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 878           However, if `n' is present, it must be a non-negative octal or
 879           decimal integer.
 880       - r-s Range of characters from `r' to `s'.  The second endpoint must
 881           not precede the first in the current collating sequence.
 882       - c Any other character is interpreted as itself. */
 883
 884 static int
 885 build_spec_list (unescaped_string, len, result)
 886      unsigned char *unescaped_string;
 887      int len;
 888      struct Spec_list *result;
 889 {
 890   unsigned char *p;
 891   int i;
 892
 893   p = unescaped_string;
 894
 895   /* The main for-loop below recognizes the 4 multi-character constructs.
 896      A character that matches (in its context) none of the multi-character
 897      constructs is classified as `normal'.  Since all multi-character
 898      constructs have at least 3 characters, any strings of length 2 or
 899      less are composed solely of normal characters.  Hence, the index of
 900      the outer for-loop runs only as far as LEN-2. */
 901
 902   for (i = 0; i < len - 2;)
 903     {
 904       switch (p[i])
 905         {
 906           int fall_through;
 907         case '[':
 908           fall_through = 0;
 909           switch (p[i + 1])
 910             {
 911               int closing_delim_idx;
 912               int closing_bracket_idx;
 913               unsigned int char_to_repeat;
 914               long repeat_count;
 915             case ':':
 916             case '=':
 917               closing_delim_idx = find_closing_delim (p, i + 2, len, p[i + 1]);
 918               if (closing_delim_idx >= 0)
 919                 {
 920                   int parse_failed;
 921                   unsigned char *opnd_str = substr (p, i + 2, closing_delim_idx - 1);
 922                   if (p[i + 1] == ':')
 923                     parse_failed = append_char_class (result, opnd_str,
 924                                      (closing_delim_idx - 1) - (i + 2) + 1);
 925                   else
 926                     parse_failed = append_equiv_class (result, opnd_str,
 927                                      (closing_delim_idx - 1) - (i + 2) + 1);
 928                   free (opnd_str);
 929
 930                   /* Return non-zero if append_*_class reports a problem. */
 931                   if (parse_failed)
 932                     return 1;
 933                   else
 934                     i = closing_delim_idx + 2;
 935                   break;
 936                 }
 937               /* Else fall through.  This could be [:*] or [=*]. */
 938             default:
 939               /* Determine whether this is a bracketed repeat range
 940                  matching the RE \[.\*(dec_or_oct_number)?\]. */
 941               closing_bracket_idx = find_bracketed_repeat (p, i + 1,
 942                                        len, &char_to_repeat, &repeat_count);
 943               if (closing_bracket_idx >= 0)
 944                 {
 945                   append_repeated_char (result, char_to_repeat, repeat_count);
 946                   i = closing_bracket_idx + 1;
 947                   break;
 948                 }
 949               else if (closing_bracket_idx == -1)
 950                 {
 951                   fall_through = 1;
 952                 }
 953               else
 954                 /* Found a string that looked like [c*n] but the
 955                    numeric part was invalid. */
 956                 return 1;
 957               break;
 958             }
 959           if (!fall_through)
 960             break;
 961
 962           /* Here if we've tried to match [c*n], [:str:], and [=c=]
 963              and none of them fit.  So we still have to consider the
 964              range `[-c' (from `[' to `c'). */
 965         default:
 966           /* Look ahead one char for ranges like a-z. */
 967           if (p[i + 1] == '-')
 968             {
 969               if (append_range (result, p[i], p[i + 2]))
 970                 return 1;
 971               i += 3;
 972             }
 973           else
 974             {
 975               append_normal_char (result, p[i]);
 976               ++i;
 977             }
 978           break;
 979         }
 980     }
 981
 982   /* Now handle the (2 or fewer) remaining characters p[i]..p[len - 1]. */
 983   for (; i < len; i++)
 984     append_normal_char (result, p[i]);
 985
 986   return 0;
 987 }
 988
 989 /* Given a Spec_list S (with its saved state implicit in the values
 990    of its members `tail' and `state'), return the next single character
 991    in the expansion of S's constructs.  If the last character of S was
 992    returned on the previous call or if S was empty, this function
 993    returns -1.  For example, successive calls to get_next where S
 994    represents the spec-string 'a-d[y*3]' will return the sequence
 995    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
 996    which the returned character comes is [:upper:] or [:lower:], the
 997    parameter CLASS is given a value to indicate which it was.  Otherwise
 998    CLASS is set to UL_NONE.  This value is used only when constructing
 999    the translation table to verify that any occurrences of upper and
1000    lower class constructs in the spec-strings appear in the same relative
1001    positions. */
1002
1003 static int
1004 get_next (s, class)
1005      struct Spec_list *s;
1006      enum Upper_Lower_class *class;
1007 {
1008   struct List_element *p;
1009   int return_val;
1010   int i;
1011
1012   if (class)
1013     *class = UL_NONE;
1014
1015   if (s->state == BEGIN_STATE)
1016     {
1017       s->tail = s->head->next;
1018       s->state = NEW_ELEMENT;
1019     }
1020
1021   p = s->tail;
1022   if (p == NULL)
1023     return -1;
1024
1025   switch (p->type)
1026     {
1027     case RE_NORMAL_CHAR:
1028       return_val = p->u.normal_char;
1029       s->state = NEW_ELEMENT;
1030       s->tail = p->next;
1031       break;
1032
1033     case RE_RANGE:
1034       if (s->state == NEW_ELEMENT)
1035         s->state = ORD (p->u.range.first_char);
1036       else
1037         ++(s->state);
1038       return_val = CHR (s->state);
1039       if (s->state == ORD (p->u.range.last_char))
1040         {
1041           s->tail = p->next;
1042           s->state = NEW_ELEMENT;
1043         }
1044       break;
1045
1046     case RE_CHAR_CLASS:
1047       if (s->state == NEW_ELEMENT)
1048         {
1049           for (i = 0; i < N_CHARS; i++)
1050             if (is_char_class_member (p->u.char_class, i))
1051               break;
1052           assert (i < N_CHARS);
1053           s->state = i;
1054         }
1055       assert (is_char_class_member (p->u.char_class, s->state));
1056       return_val = CHR (s->state);
1057       for (i = s->state + 1; i < N_CHARS; i++)
1058         if (is_char_class_member (p->u.char_class, i))
1059           break;
1060       if (i < N_CHARS)
1061         s->state = i;
1062       else
1063         {
1064           s->tail = p->next;
1065           s->state = NEW_ELEMENT;
1066         }
1067       if (class)
1068         {
1069           switch (p->u.char_class)
1070             {
1071             case CC_LOWER:
1072               *class = UL_LOWER;
1073               break;
1074             case CC_UPPER:
1075               *class = UL_UPPER;
1076               break;
1077             default:
1078               /* empty */
1079               break;
1080             }
1081         }
1082       break;
1083
1084     case RE_EQUIV_CLASS:
1085       /* FIXME: this assumes that each character is alone in its own
1086          equivalence class (which appears to be correct for my
1087          LC_COLLATE.  But I don't know of any function that allows
1088          one to determine a character's equivalence class. */
1089
1090       return_val = p->u.equiv_code;
1091       s->state = NEW_ELEMENT;
1092       s->tail = p->next;
1093       break;
1094
1095     case RE_REPEATED_CHAR:
1096       /* Here, a repeat count of n == 0 means don't repeat at all. */
1097       assert (p->u.repeated_char.repeat_count >= 0);
1098       if (p->u.repeated_char.repeat_count == 0)
1099         {
1100           s->tail = p->next;
1101           s->state = NEW_ELEMENT;
1102           return_val = get_next (s, class);
1103         }
1104       else
1105         {
1106           if (s->state == NEW_ELEMENT)
1107             {
1108               s->state = 0;
1109             }
1110           ++(s->state);
1111           return_val = p->u.repeated_char.the_repeated_char;
1112           if (p->u.repeated_char.repeat_count > 0
1113               && s->state == p->u.repeated_char.repeat_count)
1114             {
1115               s->tail = p->next;
1116               s->state = NEW_ELEMENT;
1117             }
1118         }
1119       break;
1120
1121     case RE_NO_TYPE:
1122       abort ();
1123       break;
1124     }
1125   return return_val;
1126 }
1127
1128 /* This is a minor kludge.  This function is called from
1129    get_spec_stats to determine the cardinality of a set derived
1130    from a complemented string.  It's a kludge in that some of
1131    the same operations are (duplicated) performed in set_initialize. */
1132
1133 static int
1134 card_of_complement (s)
1135      struct Spec_list *s;
1136 {
1137   int c;
1138   int cardinality = N_CHARS;
1139   SET_TYPE in_set[N_CHARS];
1140
1141   bzero (in_set, N_CHARS * sizeof (in_set[0]));
1142   s->state = BEGIN_STATE;
1143   while ((c = get_next (s, NULL)) != -1)
1144     if (!in_set[c]++)
1145       --cardinality;
1146   return cardinality;
1147 }
1148
1149 /* Gather statistics about the spec-list S in preparation for the tests
1150    in validate that determine the legality of the specs.  This function
1151    is called at most twice; once for string1, and again for any string2.
1152    LEN_S1 < 0 indicates that this is the first call and that S represents
1153    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1154    constructs in string1, and we can use its value to resolve any
1155    indefinite repeat construct in S (which represents string2).  Hence,
1156    this function has the side-effect that it converts a valid [c*]
1157    construct in string2 to [c*n] where n is large enough (or 0) to give
1158    string2 the same length as string1.  For example, with the command
1159    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1160    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */
1161
1162 static void
1163 get_spec_stats (s, len_s1)
1164      struct Spec_list *s;
1165      int len_s1;
1166 {
1167   struct List_element *p;
1168   struct List_element *indefinite_repeat_element = NULL;
1169   int len = 0;
1170
1171   s->n_indefinite_repeats = 0;
1172   s->has_equiv_class = 0;
1173   s->has_restricted_char_class = 0;
1174   s->has_upper_or_lower = 0;
1175   for (p = s->head->next; p; p = p->next)
1176     {
1177       switch (p->type)
1178         {
1179           int i;
1180         case RE_NORMAL_CHAR:
1181           ++len;
1182           break;
1183
1184         case RE_RANGE:
1185           assert (p->u.range.last_char >= p->u.range.first_char);
1186           len += p->u.range.last_char - p->u.range.first_char + 1;
1187           break;
1188
1189         case RE_CHAR_CLASS:
1190           for (i = 0; i < N_CHARS; i++)
1191             if (is_char_class_member (p->u.char_class, i))
1192               ++len;
1193           switch (p->u.char_class)
1194             {
1195             case CC_UPPER:
1196             case CC_LOWER:
1197               s->has_upper_or_lower = 1;
1198               break;
1199             default:
1200               s->has_restricted_char_class = 1;
1201               break;
1202             }
1203           break;
1204
1205         case RE_EQUIV_CLASS:
1206           for (i = 0; i < N_CHARS; i++)
1207             if (is_equiv_class_member (p->u.equiv_code, i))
1208               ++len;
1209           s->has_equiv_class = 1;
1210           break;
1211
1212         case RE_REPEATED_CHAR:
1213           if (p->u.repeated_char.repeat_count > 0)
1214             len += p->u.repeated_char.repeat_count;
1215           else if (p->u.repeated_char.repeat_count == 0)
1216             {
1217               indefinite_repeat_element = p;
1218               ++(s->n_indefinite_repeats);
1219             }
1220           break;
1221
1222         case RE_NO_TYPE:
1223           assert (0);
1224           break;
1225         }
1226     }
1227
1228   if (len_s1 >= len && s->n_indefinite_repeats == 1)
1229     {
1230       indefinite_repeat_element->u.repeated_char.repeat_count = len_s1 - len;
1231       len = len_s1;
1232     }
1233   if (complement && len_s1 < 0)
1234     s->length = card_of_complement (s);
1235   else
1236     s->length = len;
1237   return;
1238 }
1239
1240 static void
1241 spec_init (spec_list)
1242      struct Spec_list *spec_list;
1243 {
1244   spec_list->head = spec_list->tail =
1245     (struct List_element *) xmalloc (sizeof (struct List_element));
1246   spec_list->head->next = NULL;
1247 }
1248
1249 /* This function makes two passes over the argument string S.  The first
1250    one converts all \c and \ddd escapes to their one-byte representations.
1251    The second constructs a linked specification list, SPEC_LIST, of the
1252    characters and constructs that comprise the argument string.  If either
1253    of these passes detects an error, this function returns non-zero. */
1254
1255 static int
1256 parse_str (s, spec_list)
1257      unsigned char *s;
1258      struct Spec_list *spec_list;
1259 {
1260   int len;
1261
1262   if (unquote (s, &len))
1263     return 1;
1264   if (build_spec_list (s, len, spec_list))
1265     return 1;
1266   return 0;
1267 }
1268
1269 /* Given two specification lists, S1 and S2, and assuming that
1270    S1->length > S2->length, append a single [c*n] element to S2 where c
1271    is the last character in the expansion of S2 and n is the difference
1272    between the two lengths.
1273    Upon successful completion, S2->length is set to S1->length.  The only
1274    way this function can fail to make S2 as long as S1 is when S2 has
1275    zero-length, since in that case, there is no last character to repeat.
1276    So S2->length is required to be at least 1.
1277
1278    Providing this functionality allows the user to do some pretty
1279    non-BSD (and non-portable) things:  For example, the command
1280        tr -cs '[:upper:]0-9' '[:lower:]'
1281    is almost guaranteed to give results that depend on your collating
1282    sequence.  */
1283
1284 static void
1285 string2_extend (s1, s2)
1286      struct Spec_list *s1;
1287      struct Spec_list *s2;
1288 {
1289   struct List_element *p;
1290   int char_to_repeat;
1291   int i;
1292
1293   assert (translating);
1294   assert (s1->length > s2->length);
1295   assert (s2->length > 0);
1296
1297   p = s2->tail;
1298   switch (p->type)
1299     {
1300     case RE_NORMAL_CHAR:
1301       char_to_repeat = p->u.normal_char;
1302       break;
1303     case RE_RANGE:
1304       char_to_repeat = p->u.range.last_char;
1305       break;
1306     case RE_CHAR_CLASS:
1307       for (i = N_CHARS; i >= 0; i--)
1308         if (is_char_class_member (p->u.char_class, i))
1309           break;
1310       assert (i >= 0);
1311       char_to_repeat = CHR (i);
1312       break;
1313
1314     case RE_REPEATED_CHAR:
1315       char_to_repeat = p->u.repeated_char.the_repeated_char;
1316       break;
1317
1318     case RE_EQUIV_CLASS:
1319       /* This shouldn't happen, because validate exits with an error
1320          if it finds an equiv class in string2 when translating. */
1321       abort ();
1322       break;
1323
1324     case RE_NO_TYPE:
1325       abort ();
1326       break;
1327     }
1328   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1329   s2->length = s1->length;
1330   return;
1331 }
1332
1333 /* Die with an error message if S1 and S2 describe strings that
1334    are not valid with the given command line switches.
1335    A side effect of this function is that if a legal [c*] or
1336    [c*0] construct appears in string2, it is converted to [c*n]
1337    with a value for n that makes s2->length == s1->length.  By
1338    the same token, if the --truncate-set1 option is not
1339    given, S2 may be extended. */
1340
1341 static void
1342 validate (s1, s2)
1343      struct Spec_list *s1;
1344      struct Spec_list *s2;
1345 {
1346   get_spec_stats (s1, -1);
1347   if (s1->n_indefinite_repeats > 0)
1348     {
1349       error (1, 0, "the [c*] repeat construct may not appear in string1");
1350     }
1351
1352   /* FIXME: it isn't clear from the POSIX spec that this is illegal,
1353      but in the spirit of the other restrictions put on translation
1354      with character classes, this seems a logical interpretation. */
1355   if (complement && s1->has_upper_or_lower)
1356     {
1357       error (1, 0,
1358              "character classes may not be used when translating and complementing");
1359     }
1360
1361   if (s2)
1362     {
1363       get_spec_stats (s2, s1->length);
1364       if (s2->has_restricted_char_class)
1365         {
1366           error (1, 0,
1367                  "when translating, the only character classes that may appear in\n\
1368 \tstring2 are `upper' and `lower'");
1369         }
1370
1371       if (s2->n_indefinite_repeats > 1)
1372         {
1373           error (1, 0, "only one [c*] repeat construct may appear in string2");
1374         }
1375
1376       if (translating)
1377         {
1378           if (s2->has_equiv_class)
1379             {
1380               error (1, 0,
1381                      "[=c=] expressions may not appear in string2 when translating");
1382             }
1383
1384           if (s1->length > s2->length)
1385             {
1386               if (!truncate_set1)
1387                 {
1388                   /* string2 must be non-empty unless --truncate-set1 is
1389                      given or string1 is empty. */
1390
1391                   if (s2->length == 0)
1392                     error (1, 0,
1393                      "when not truncating set1, string2 must be non-empty");
1394                   string2_extend (s1, s2);
1395                 }
1396             }
1397
1398           if (complement && s2->has_upper_or_lower)
1399             error (1, 0,
1400                    "character classes may not be used when translating and complementing");
1401         }
1402       else
1403         /* Not translating. */
1404         {
1405           if (s2->n_indefinite_repeats > 0)
1406             error (1, 0,
1407                    "the [c*] construct may appear in string2 only when translating");
1408         }
1409     }
1410 }
1411
1412 /* Read buffers of SIZE bytes via the function READER (if READER is
1413    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1414    read_and_delete or read_and_xlate.  After each buffer is read, it is
1415    processed and written to stdout.  The buffers are processed so that
1416    multiple consecutive occurrences of the same character in the input
1417    stream are replaced by a single occurrence of that character if the
1418    character is in the squeeze set. */
1419
1420 static void
1421 squeeze_filter (buf, size, reader)
1422      unsigned char *buf;
1423      long int size;
1424      PFI reader;
1425 {
1426   unsigned int char_to_squeeze = NOT_A_CHAR;
1427   int i = 0;
1428   int nr = 0;
1429
1430   for (;;)
1431     {
1432       int begin;
1433
1434       if (i >= nr)
1435         {
1436           if (reader == NULL)
1437             nr = read (0, (char *) buf, size);
1438           else
1439             nr = (*reader) (buf, size, NULL);
1440
1441           if (nr < 0)
1442             error (1, errno, "read error");
1443           if (nr == 0)
1444             break;
1445           i = 0;
1446         }
1447
1448       begin = i;
1449
1450       if (char_to_squeeze == NOT_A_CHAR)
1451         {
1452           int out_len;
1453           /* Here, by being a little tricky, we can get a significant
1454              performance increase in most cases when the input is
1455              reasonably large.  Since tr will modify the input only
1456              if two consecutive (and identical) input characters are
1457              in the squeeze set, we can step by two through the data
1458              when searching for a character in the squeeze set.  This
1459              means there may be a little more work in a few cases and
1460              perhaps twice as much work in the worst cases where most
1461              of the input is removed by squeezing repeats.  But most
1462              uses of this functionality seem to remove less than 20-30%
1463              of the input. */
1464           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1465             ;                   /* empty */
1466
1467           /* There is a special case when i == nr and we've just
1468              skipped a character (the last one in buf) that is in
1469              the squeeze set. */
1470           if (i == nr && in_squeeze_set[buf[i - 1]])
1471             --i;
1472
1473           if (i >= nr)
1474             out_len = nr - begin;
1475           else
1476             {
1477               char_to_squeeze = buf[i];
1478               /* We're about to output buf[begin..i]. */
1479               out_len = i - begin + 1;
1480
1481               /* But since we stepped by 2 in the loop above,
1482                  out_len may be one too large. */
1483               if (i > 0 && buf[i - 1] == char_to_squeeze)
1484                 --out_len;
1485
1486               /* Advance i to the index of first character to be
1487                  considered when looking for a char different from
1488                  char_to_squeeze. */
1489               ++i;
1490             }
1491           if (out_len > 0
1492               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1493             error (1, errno, "write error");
1494         }
1495
1496       if (char_to_squeeze != NOT_A_CHAR)
1497         {
1498           /* Advance i to index of first char != char_to_squeeze
1499              (or to nr if all the rest of the characters in this
1500              buffer are the same as char_to_squeeze). */
1501           for (; i < nr && buf[i] == char_to_squeeze; i++)
1502             ;                   /* empty */
1503           if (i < nr)
1504             char_to_squeeze = NOT_A_CHAR;
1505           /* If (i >= nr) we've squeezed the last character in this buffer.
1506              So now we have to read a new buffer and continue comparing
1507              characters against char_to_squeeze. */
1508         }
1509     }
1510 }
1511
1512 /* Read buffers of SIZE bytes from stdin until one is found that
1513    contains at least one character not in the delete set.  Store
1514    in the array BUF, all characters from that buffer that are not
1515    in the delete set, and return the number of characters saved
1516    or 0 upon EOF. */
1517
1518 static long
1519 read_and_delete (buf, size, not_used)
1520      unsigned char *buf;
1521      long int size;
1522      PFI not_used;
1523 {
1524   long n_saved;
1525   static int hit_eof = 0;
1526
1527   assert (not_used == NULL);
1528   assert (size > 0);
1529
1530   if (hit_eof)
1531     return 0;
1532
1533   /* This enclosing do-while loop is to make sure that
1534      we don't return zero (indicating EOF) when we've
1535      just deleted all the characters in a buffer. */
1536   do
1537     {
1538       int i;
1539       int nr = read (0, (char *) buf, size);
1540
1541       if (nr < 0)
1542         error (1, errno, "read error");
1543       if (nr == 0)
1544         {
1545           hit_eof = 1;
1546           return 0;
1547         }
1548
1549       /* This first loop may be a waste of code, but gives much
1550          better performance when no characters are deleted in
1551          the beginning of a buffer.  It just avoids the copying
1552          of buf[i] into buf[n_saved] when it would be a NOP. */
1553
1554       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1555         /* empty */ ;
1556       n_saved = i;
1557
1558       for (++i; i < nr; i++)
1559         if (!in_delete_set[buf[i]])
1560           buf[n_saved++] = buf[i];
1561     }
1562   while (n_saved == 0);
1563
1564   return n_saved;
1565 }
1566
1567 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1568    perform the in-place and one-to-one mapping specified by the global
1569    array `xlate'.  Return the number of characters read, or 0 upon EOF. */
1570
1571 static long
1572 read_and_xlate (buf, size, not_used)
1573      unsigned char *buf;
1574      long int size;
1575      PFI not_used;
1576 {
1577   long chars_read = 0;
1578   static int hit_eof = 0;
1579   int i;
1580
1581   assert (not_used == NULL);
1582   assert (size > 0);
1583
1584   if (hit_eof)
1585     return 0;
1586
1587   chars_read = read (0, (char *) buf, size);
1588   if (chars_read < 0)
1589     error (1, errno, "read error");
1590   if (chars_read == 0)
1591     {
1592       hit_eof = 1;
1593       return 0;
1594     }
1595
1596   for (i = 0; i < chars_read; i++)
1597     buf[i] = xlate[buf[i]];
1598
1599   return chars_read;
1600 }
1601
1602 /* Initialize a boolean membership set IN_SET with the character
1603    values obtained by traversing the linked list of constructs S
1604    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1605    non-zero the resulting set is complemented. */
1606
1607 static void
1608 set_initialize (s, complement_this_set, in_set)
1609      struct Spec_list *s;
1610      int complement_this_set;
1611      SET_TYPE *in_set;
1612 {
1613   int c;
1614   int i;
1615
1616   bzero (in_set, N_CHARS * sizeof (in_set[0]));
1617   s->state = BEGIN_STATE;
1618   while ((c = get_next (s, NULL)) != -1)
1619     in_set[c] = 1;
1620   if (complement_this_set)
1621     for (i = 0; i < N_CHARS; i++)
1622       in_set[i] = (!in_set[i]);
1623 }
1624
1625 void
1626 main (argc, argv)
1627      int argc;
1628      char **argv;
1629 {
1630   int c;
1631   int non_option_args;
1632   struct Spec_list buf1, buf2;
1633   struct Spec_list *s1 = &buf1;
1634   struct Spec_list *s2 = &buf2;
1635
1636   program_name = argv[0];
1637
1638   while ((c = getopt_long (argc, argv, "cdst", long_options,
1639                            (int *) 0)) != EOF)
1640     {
1641       switch (c)
1642         {
1643         case 0:
1644           break;
1645
1646         case 'c':
1647           complement = 1;
1648           break;
1649
1650         case 'd':
1651           delete = 1;
1652           break;
1653
1654         case 's':
1655           squeeze_repeats = 1;
1656           break;
1657
1658         case 't':
1659           truncate_set1 = 1;
1660           break;
1661
1662         default:
1663           usage ();
1664           break;
1665         }
1666     }
1667
1668   if (flag_version)
1669     fprintf (stderr, "%s\n", version_string);
1670
1671   if (flag_help)
1672     usage ();
1673
1674   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1675
1676   non_option_args = argc - optind;
1677   translating = (non_option_args == 2 && !delete);
1678
1679   /* Change this test if it is legal to give tr no options and
1680      no args at all.  POSIX doesn't specifically say anything
1681      either way, but it looks like they implied it's illegal
1682      by omission.  If you want to make tr do a slow imitation
1683      of `cat' use `tr a a'. */
1684   if (non_option_args > 2)
1685     usage ();
1686
1687   if (!delete && !squeeze_repeats && non_option_args != 2)
1688     error (1, 0, "two strings must be given when translating");
1689
1690   if (delete && squeeze_repeats && non_option_args != 2)
1691     error (1, 0, "two strings must be given when both \
1692 deleting and squeezing repeats");
1693
1694   /* If --delete is given without --squeeze-repeats, then
1695      only one string argument may be specified.  But POSIX
1696      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1697      is set, pretend we never saw string2.  But I think
1698      this deserves a fatal error, so that's the default. */
1699   if ((delete && !squeeze_repeats) && non_option_args != 1)
1700     {
1701       if (posix_pedantic && non_option_args == 2)
1702         --non_option_args;
1703       else
1704         error (1, 0,
1705                "only one string may be given when deleting without squeezing repeats");
1706     }
1707
1708   spec_init (s1);
1709   if (parse_str ((unsigned char *) argv[optind], s1))
1710     exit (1);
1711
1712   if (non_option_args == 2)
1713     {
1714       spec_init (s2);
1715       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1716         exit (1);
1717     }
1718   else
1719     s2 = NULL;
1720
1721   validate (s1, s2);
1722
1723   if (squeeze_repeats && non_option_args == 1)
1724     {
1725       set_initialize (s1, complement, in_squeeze_set);
1726       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1727     }
1728   else if (delete && non_option_args == 1)
1729     {
1730       int nr;
1731
1732       set_initialize (s1, complement, in_delete_set);
1733       do
1734         {
1735           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1736           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1737             error (1, errno, "write error");
1738         }
1739       while (nr > 0);
1740     }
1741   else if (squeeze_repeats && delete && non_option_args == 2)
1742     {
1743       set_initialize (s1, complement, in_delete_set);
1744       set_initialize (s2, 0, in_squeeze_set);
1745       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1746     }
1747   else if (translating)
1748     {
1749       if (complement)
1750         {
1751           int i;
1752           SET_TYPE *in_s1 = in_delete_set;
1753
1754           set_initialize (s1, 0, in_s1);
1755           s2->state = BEGIN_STATE;
1756           for (i = 0; i < N_CHARS; i++)
1757             xlate[i] = i;
1758           for (i = 0; i < N_CHARS; i++)
1759             {
1760               if (!in_s1[i])
1761                 {
1762                   int ch = get_next (s2, NULL);
1763                   assert (ch != -1 || truncate_set1);
1764                   if (ch == -1)
1765                     {
1766                       /* This will happen when tr is invoked like e.g.
1767                          tr -cs A-Za-z0-9 '\012'.  */
1768                       break;
1769                     }
1770                   xlate[i] = ch;
1771                 }
1772             }
1773           assert (get_next (s2, NULL) == -1 || truncate_set1);
1774         }
1775       else
1776         {
1777           int c1, c2;
1778           int i;
1779           enum Upper_Lower_class class_s1;
1780           enum Upper_Lower_class class_s2;
1781
1782           for (i = 0; i < N_CHARS; i++)
1783             xlate[i] = i;
1784           s1->state = BEGIN_STATE;
1785           s2->state = BEGIN_STATE;
1786           for (;;)
1787             {
1788               c1 = get_next (s1, &class_s1);
1789               c2 = get_next (s2, &class_s2);
1790               if (!class_ok[(int) class_s1][(int) class_s2])
1791                 error (1, 0,
1792                      "misaligned or mismatched upper and/or lower classes");
1793               /* The following should have been checked by validate... */
1794               if (c2 == -1)
1795                 break;
1796               xlate[c1] = c2;
1797             }
1798           assert (c1 == -1 || truncate_set1);
1799         }
1800       if (squeeze_repeats)
1801         {
1802           set_initialize (s2, 0, in_squeeze_set);
1803           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
1804         }
1805       else
1806         {
1807           int chars_read;
1808
1809           do
1810             {
1811               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
1812               if (chars_read > 0
1813                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
1814                 error (1, errno, "write error");
1815             }
1816           while (chars_read > 0);
1817         }
1818     }
1819
1820   if (fclose (stdout) == EOF)
1821     error (2, errno, "write error");
1822
1823   if (close (0) != 0)
1824     error (2, errno, "standard input");
1825
1826   exit (0);
1827 }