src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  17
  18 /* Written by Jim Meyering, meyering@cs.utexas.edu.  */
  19
  20 #include <config.h>
  21
  22 /* Get isblank from GNU libc.  */
  23 #define _GNU_SOURCE
  24
  25 #include <stdio.h>
  26 #include <assert.h>
  27 #include <errno.h>
  28 #include <sys/types.h>
  29 #include <getopt.h>
  30
  31 #include "system.h"
  32 #include "version.h"
  33 #include "error.h"
  34
  35 #ifndef ULONG_MAX
  36 #define ULONG_MAX ((unsigned long) ~(unsigned long) 0)
  37 #endif
  38
  39 #ifndef LONG_MAX
  40 #define LONG_MAX ((long int) (ULONG_MAX >> 1))
  41 #endif
  42
  43 #ifndef UCHAR_MAX
  44 #define UCHAR_MAX 0xFF
  45 #endif
  46
  47 #define N_CHARS (UCHAR_MAX + 1)
  48
  49 /* A pointer to a function that returns an int.  */
  50 typedef int (*PFI) ();
  51
  52 /* Convert from character C to its index in the collating
  53    sequence array.  Just cast to an unsigned int to avoid
  54    problems with sign-extension.  */
  55 #define ORD(c) (unsigned int)(c)
  56
  57 /* The inverse of ORD.  */
  58 #define CHR(i) (unsigned char)(i)
  59
  60 /* The value for Spec_list->state that indicates to
  61    get_next that it should initialize the tail pointer.
  62    Its value doesn't matter as long as it can't be
  63    confused with a valid character code.  */
  64 #define BEGIN_STATE (2 * N_CHARS)
  65
  66 /* The value for Spec_list->state that indicates to
  67    get_next that the element pointed to by Spec_list->tail is
  68    being considered for the first time on this pass through the
  69    list -- it indicates that get_next should make any necessary
  70    initializations.  */
  71 #define NEW_ELEMENT (BEGIN_STATE + 1)
  72
  73 /* A value distinct from any character that may have been stored in a
  74    buffer as the result of a block-read in the function squeeze_filter.  */
  75 #define NOT_A_CHAR (unsigned int)(-1)
  76
  77 /* The following (but not CC_NO_CLASS) are indices into the array of
  78    valid character class strings.  */
  79 enum Char_class
  80   {
  81     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  82     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  83     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  84     CC_NO_CLASS = 9999
  85   };
  86
  87 /* Character class to which a character (returned by get_next) belonged;
  88    but it is set only if the construct from which the character was obtained
  89    was one of the character classes [:upper:] or [:lower:].  The value
  90    is used only when translating and then, only to make sure that upper
  91    and lower class constructs have the same relative positions in string1
  92    and string2.  */
  93 enum Upper_Lower_class
  94   {
  95     UL_LOWER = 0,
  96     UL_UPPER = 1,
  97     UL_NONE = 2
  98   };
  99
 100 /* A shortcut to ensure that when constructing the translation array,
 101    one of the values returned by paired calls to get_next (from s1 and s2)
 102    is from [:upper:] and the other is from [:lower:], or neither is from
 103    upper or lower.  In fact, no other character classes are allowed when
 104    translating, but that condition is tested elsewhere.  This array is
 105    indexed by values of type enum Upper_Lower_class.  */
 106 static int const class_ok[3][3] =
 107 {
 108   {0, 1, 0},
 109   {1, 0, 0},
 110   {0, 0, 1}
 111 };
 112
 113 /* The type of a List_element.  See build_spec_list for more details.  */
 114 enum Range_element_type
 115   {
 116     RE_NO_TYPE = 0,
 117     RE_NORMAL_CHAR,
 118     RE_RANGE,
 119     RE_CHAR_CLASS,
 120     RE_EQUIV_CLASS,
 121     RE_REPEATED_CHAR
 122   };
 123
 124 /* One construct in one of tr's argument strings.
 125    For example, consider the POSIX version of the classic tr command:
 126        tr -cs 'a-zA-Z_' '[\n*]'
 127    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 128    and a single normal character, `_'.  String2 has one construct.  */
 129 struct List_element
 130   {
 131     enum Range_element_type type;
 132     struct List_element *next;
 133     union
 134       {
 135         int normal_char;
 136         struct                  /* unnamed */
 137           {
 138             unsigned int first_char;
 139             unsigned int last_char;
 140           }
 141         range;
 142         enum Char_class char_class;
 143         int equiv_code;
 144         struct                  /* unnamed */
 145           {
 146             unsigned int the_repeated_char;
 147             long repeat_count;
 148           }
 149         repeated_char;
 150       }
 151     u;
 152   };
 153
 154 /* Each of tr's argument strings is parsed into a form that is easier
 155    to work with: a linked list of constructs (struct List_element).
 156    Each Spec_list structure also encapsulates various attributes of
 157    the corresponding argument string.  The attributes are used mainly
 158    to verify that the strings are valid in the context of any options
 159    specified (like -s, -d, or -c).  The main exception is the member
 160    `tail', which is first used to construct the list.  After construction,
 161    it is used by get_next to save its state when traversing the list.
 162    The member `state' serves a similar function.  */
 163 struct Spec_list
 164   {
 165     /* Points to the head of the list of range elements.
 166        The first struct is a dummy; its members are never used.  */
 167     struct List_element *head;
 168
 169     /* When appending, points to the last element.  When traversing via
 170        get_next(), points to the element to process next.  Setting
 171        Spec_list.state to the value BEGIN_STATE before calling get_next
 172        signals get_next to initialize tail to point to head->next.  */
 173     struct List_element *tail;
 174
 175     /* Used to save state between calls to get_next().  */
 176     unsigned int state;
 177
 178     /* Length, in the sense that length('a-z[:digit:]123abc')
 179        is 42 ( = 26 + 10 + 6).  */
 180     int length;
 181
 182     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 183     int n_indefinite_repeats;
 184
 185     /* Non-zero if this spec contains at least one equivalence
 186        class construct e.g. [=c=].  */
 187     int has_equiv_class;
 188
 189     /* Non-zero if this spec contains at least one of [:upper:] or
 190        [:lower:] class constructs.  */
 191     int has_upper_or_lower;
 192
 193     /* Non-zero if this spec contains at least one of the character class
 194        constructs (all but upper and lower) that aren't allowed in s2.  */
 195     int has_restricted_char_class;
 196   };
 197
 198 char *xmalloc ();
 199 char *stpcpy ();
 200 int safe_read ();
 201
 202 /* The name by which this program was run.  */
 203 char *program_name;
 204
 205 /* When non-zero, each sequence in the input of a repeated character
 206    (call it c) is replaced (in the output) by a single occurrence of c
 207    for every c in the squeeze set.  */
 208 static int squeeze_repeats = 0;
 209
 210 /* When non-zero, removes characters in the delete set from input.  */
 211 static int delete = 0;
 212
 213 /* Use the complement of set1 in place of set1.  */
 214 static int complement = 0;
 215
 216 /* When non-zero, this flag causes GNU tr to provide strict
 217    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 218    says that when -d is used without -s, string2 (if present)
 219    must be ignored.  Silently ignoring arguments is a bad idea.
 220    The default GNU behavior is to give a usage message and exit.
 221    Additionally, when this flag is non-zero, tr prints warnings
 222    on stderr if it is being used in a manner that is not portable.
 223    Applicable warnings are given by default, but are suppressed
 224    if the environment variable `POSIXLY_CORRECT' is set, since
 225    being POSIX conformant means we can't issue such messages.
 226    Warnings on the following topics are suppressed when this
 227    variable is non-zero:
 228    1. Ambiguous octal escapes.  */
 229 static int posix_pedantic;
 230
 231 /* When tr is performing translation and string1 is longer than string2,
 232    POSIX says that the result is undefined.  That gives the implementor
 233    of a POSIX conforming version of tr two reasonable choices for the
 234    semantics of this case.
 235
 236    * The BSD tr pads string2 to the length of string1 by
 237    repeating the last character in string2.
 238
 239    * System V tr ignores characters in string1 that have no
 240    corresponding character in string2.  That is, string1 is effectively
 241    truncated to the length of string2.
 242
 243    When non-zero, this flag causes GNU tr to imitate the behavior
 244    of System V tr when translating with string1 longer than string2.
 245    The default is to emulate BSD tr.  This flag is ignored in modes where
 246    no translation is performed.  Emulating the System V tr
 247    in this exceptional case causes the relatively common BSD idiom:
 248
 249        tr -cs A-Za-z0-9 '\012'
 250
 251    to break (it would convert only zero bytes, rather than all
 252    non-alphanumerics, to newlines).
 253
 254    WARNING: This switch does not provide general BSD or System V
 255    compatibility.  For example, it doesn't disable the interpretation
 256    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 257    some unfortunate coincidence you use such constructs in scripts
 258    expecting to use some other version of tr, the scripts will break.  */
 259 static int truncate_set1 = 0;
 260
 261 /* An alias for (!delete && non_option_args == 2).
 262    It is set in main and used there and in validate().  */
 263 static int translating;
 264
 265 #ifndef BUFSIZ
 266 #define BUFSIZ 8192
 267 #endif
 268
 269 #define IO_BUF_SIZE BUFSIZ
 270 static unsigned char io_buf[IO_BUF_SIZE];
 271
 272 static char const *const char_class_name[] =
 273 {
 274   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 275   "lower", "print", "punct", "space", "upper", "xdigit"
 276 };
 277 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 278
 279 typedef char SET_TYPE;
 280
 281 /* Array of boolean values.  A character `c' is a member of the
 282    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 283    set is defined by the last (possibly, the only) string argument
 284    on the command line when the squeeze option is given.  */
 285 static SET_TYPE in_squeeze_set[N_CHARS];
 286
 287 /* Array of boolean values.  A character `c' is a member of the
 288    delete set if and only if in_delete_set[c] is true.  The delete
 289    set is defined by the first (or only) string argument on the
 290    command line when the delete option is given.  */
 291 static SET_TYPE in_delete_set[N_CHARS];
 292
 293 /* Array of character values defining the translation (if any) that
 294    tr is to perform.  Translation is performed only when there are
 295    two specification strings and the delete switch is not given.  */
 296 static char xlate[N_CHARS];
 297
 298 /* If non-zero, display usage information and exit.  */
 299 static int show_help;
 300
 301 /* If non-zero, print the version on standard output then exit.  */
 302 static int show_version;
 303
 304 static struct option const long_options[] =
 305 {
 306   {"complement", no_argument, NULL, 'c'},
 307   {"delete", no_argument, NULL, 'd'},
 308   {"squeeze-repeats", no_argument, NULL, 's'},
 309   {"truncate-set1", no_argument, NULL, 't'},
 310   {"help", no_argument, &show_help, 1},
 311   {"version", no_argument, &show_version, 1},
 312   {NULL, 0, NULL, 0}
 313 };
 314 \f
 315 static void
 316 usage (status)
 317      int status;
 318 {
 319   if (status != 0)
 320     fprintf (stderr, "Try `%s --help' for more information.\n",
 321              program_name);
 322   else
 323     {
 324       printf ("\
 325 Usage: %s [OPTION]... SET1 [SET2]\n\
 326 ",
 327               program_name);
 328       printf ("\
 329 Translate, squeeze, and/or delete characters from standard input,\n\
 330 writing to standard output.\n\
 331 \n\
 332   -c, --complement        first complement SET1\n\
 333   -d, --delete            delete characters in SET1, do not translate\n\
 334   -s, --squeeze-repeats   replace sequence of characters with one\n\
 335   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 336       --help              display this help and exit\n\
 337       --version           output version information and exit\n\
 338 ");
 339       printf ("\
 340 \n\
 341 SETs are specified as strings of characters.  Most represent themselves.\n\
 342 Interpreted sequences are:\n\
 343 \n\
 344   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 345   \\\\              backslash\n\
 346   \\a              audible BEL\n\
 347   \\b              backspace\n\
 348   \\f              form feed\n\
 349   \\n              new line\n\
 350   \\r              return\n\
 351   \\t              horizontal tab\n\
 352   \\v              vertical tab\n\
 353   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 354   [CHAR1-CHAR2]   same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
 355   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 356   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 357   [:alnum:]       all letters and digits\n\
 358   [:alpha:]       all letters\n\
 359   [:blank:]       all horizontal whitespace\n\
 360   [:cntrl:]       all control characters\n\
 361   [:digit:]       all digits\n\
 362   [:graph:]       all printable characters, not including space\n\
 363   [:lower:]       all lower case letters\n\
 364   [:print:]       all printable characters, including space\n\
 365   [:punct:]       all punctuation characters\n\
 366   [:space:]       all horizontal or vertical whitespace\n\
 367   [:upper:]       all upper case letters\n\
 368   [:xdigit:]      all hexadecimal digits\n\
 369   [=CHAR=]        all characters which are equivalent to CHAR\n\
 370 ");
 371       printf ("\
 372 \n\
 373 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 374 -t may be used only when translating.  SET2 is extended to length of\n\
 375 SET1 by repeating its last character as necessary.  Excess characters\n\
 376 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 377 expand in ascending order; used in SET2 while translating, they may\n\
 378 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 379 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 380 translation or deletion.\n\
 381 ");
 382     }
 383   exit (status);
 384 }
 385
 386 /* Return non-zero if the character C is a member of the
 387    equivalence class containing the character EQUIV_CLASS.  */
 388
 389 static int
 390 is_equiv_class_member (equiv_class, c)
 391      unsigned int equiv_class;
 392      unsigned int c;
 393 {
 394   return (equiv_class == c);
 395 }
 396
 397 /* Return non-zero if the character C is a member of the
 398    character class CHAR_CLASS.  */
 399
 400 static int
 401 is_char_class_member (char_class, c)
 402      enum Char_class char_class;
 403      unsigned int c;
 404 {
 405   switch (char_class)
 406     {
 407     case CC_ALNUM:
 408       return ISALNUM (c);
 409       break;
 410     case CC_ALPHA:
 411       return ISALPHA (c);
 412       break;
 413     case CC_BLANK:
 414       return ISBLANK (c);
 415       break;
 416     case CC_CNTRL:
 417       return ISCNTRL (c);
 418       break;
 419     case CC_DIGIT:
 420       return ISDIGIT (c);
 421       break;
 422     case CC_GRAPH:
 423       return ISGRAPH (c);
 424       break;
 425     case CC_LOWER:
 426       return ISLOWER (c);
 427       break;
 428     case CC_PRINT:
 429       return ISPRINT (c);
 430       break;
 431     case CC_PUNCT:
 432       return ISPUNCT (c);
 433       break;
 434     case CC_SPACE:
 435       return ISSPACE (c);
 436       break;
 437     case CC_UPPER:
 438       return ISUPPER (c);
 439       break;
 440     case CC_XDIGIT:
 441       return ISXDIGIT (c);
 442       break;
 443     default:
 444       abort ();
 445       break;
 446     }
 447 }
 448
 449 /* Perform the first pass over each range-spec argument S, converting all
 450    \c and \ddd escapes to their one-byte representations.  The conversion
 451    is done in-place, so S must point to writable storage.  If an invalid
 452    quote sequence is found print an error message and return non-zero.
 453    Otherwise set *LEN to the length of the resulting string and return
 454    zero.  The resulting array of characters may contain zero-bytes;
 455    however, on input, S is assumed to be null-terminated, and hence
 456    cannot contain actual (non-escaped) zero bytes.  */
 457
 458 static int
 459 unquote (s, len)
 460      unsigned char *s;
 461      int *len;
 462 {
 463   int i, j;
 464
 465   j = 0;
 466   for (i = 0; s[i]; i++)
 467     {
 468       switch (s[i])
 469         {
 470           int c;
 471         case '\\':
 472           switch (s[i + 1])
 473             {
 474               int oct_digit;
 475             case '\\':
 476               c = '\\';
 477               break;
 478             case 'a':
 479               c = '\007';
 480               break;
 481             case 'b':
 482               c = '\b';
 483               break;
 484             case 'f':
 485               c = '\f';
 486               break;
 487             case 'n':
 488               c = '\n';
 489               break;
 490             case 'r':
 491               c = '\r';
 492               break;
 493             case 't':
 494               c = '\t';
 495               break;
 496             case 'v':
 497               c = '\v';
 498               break;
 499             case '0':
 500             case '1':
 501             case '2':
 502             case '3':
 503             case '4':
 504             case '5':
 505             case '6':
 506             case '7':
 507               c = s[i + 1] - '0';
 508               oct_digit = s[i + 2] - '0';
 509               if (0 <= oct_digit && oct_digit <= 7)
 510                 {
 511                   c = 8 * c + oct_digit;
 512                   ++i;
 513                   oct_digit = s[i + 2] - '0';
 514                   if (0 <= oct_digit && oct_digit <= 7)
 515                     {
 516                       if (8 * c + oct_digit < N_CHARS)
 517                         {
 518                           c = 8 * c + oct_digit;
 519                           ++i;
 520                         }
 521                       else if (!posix_pedantic)
 522                         {
 523                           /* Any octal number larger than 0377 won't
 524                              fit in 8 bits.  So we stop when adding the
 525                              next digit would put us over the limit and
 526                              give a warning about the ambiguity.  POSIX
 527                              isn't clear on this, but one person has said
 528                              that in his interpretation, POSIX says tr
 529                              can't even give a warning.  */
 530                           error (0, 0, "warning: the ambiguous octal escape \
 531 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'",
 532                                  s[i], s[i + 1], s[i + 2],
 533                                  s[i], s[i + 1], s[i + 2]);
 534                         }
 535                     }
 536                 }
 537               break;
 538             case '\0':
 539               error (0, 0, "invalid backslash escape at end of string");
 540               return 1;
 541               break;
 542             default:
 543               error (0, 0, "invalid backslash escape `\\%c'", s[i + 1]);
 544               return 1;
 545               break;
 546             }
 547           ++i;
 548           s[j++] = c;
 549           break;
 550         default:
 551           s[j++] = s[i];
 552           break;
 553         }
 554     }
 555   *len = j;
 556   return 0;
 557 }
 558
 559 /* If CLASS_STR is a valid character class string, return its index
 560    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 561
 562 static enum Char_class
 563 look_up_char_class (class_str)
 564      unsigned char *class_str;
 565 {
 566   unsigned int i;
 567
 568   for (i = 0; i < N_CHAR_CLASSES; i++)
 569     if (strcmp ((const char *) class_str, char_class_name[i]) == 0)
 570       return (enum Char_class) i;
 571   return CC_NO_CLASS;
 572 }
 573
 574 /* Return a newly allocated string with a printable version of C.
 575    This function is used solely for formatting error messages.  */
 576
 577 static char *
 578 make_printable_char (c)
 579      unsigned int c;
 580 {
 581   char *buf = xmalloc (5);
 582
 583   assert (c < N_CHARS);
 584   if (ISPRINT (c))
 585     {
 586       buf[0] = c;
 587       buf[1] = '\0';
 588     }
 589   else
 590     {
 591       sprintf (buf, "\\%03o", c);
 592     }
 593   return buf;
 594 }
 595
 596 /* Return a newly allocated copy of S which is suitable for printing.
 597    LEN is the number of characters in S.  Most non-printing
 598    (isprint) characters are represented by a backslash followed by
 599    3 octal digits.  However, the characters represented by \c escapes
 600    where c is one of [abfnrtv] are represented by their 2-character \c
 601    sequences.  This function is used solely for printing error messages.  */
 602
 603 static char *
 604 make_printable_str (s, len)
 605      const unsigned char *s;
 606      int len;
 607 {
 608   /* Worst case is that every character expands to a backslash
 609      followed by a 3-character octal escape sequence.  */
 610   char *printable_buf = xmalloc (4 * len + 1);
 611   char *p = printable_buf;
 612   int i;
 613
 614   for (i = 0; i < len; i++)
 615     {
 616       char buf[5];
 617       char *tmp = NULL;
 618
 619       switch (s[i])
 620         {
 621         case '\\':
 622           tmp = "\\";
 623           break;
 624         case '\007':
 625           tmp = "\\a";
 626           break;
 627         case '\b':
 628           tmp = "\\b";
 629           break;
 630         case '\f':
 631           tmp = "\\f";
 632           break;
 633         case '\n':
 634           tmp = "\\n";
 635           break;
 636         case '\r':
 637           tmp = "\\r";
 638           break;
 639         case '\t':
 640           tmp = "\\t";
 641           break;
 642         case '\v':
 643           tmp = "\\v";
 644           break;
 645         default:
 646           if (ISPRINT (s[i]))
 647             {
 648               buf[0] = s[i];
 649               buf[1] = '\0';
 650             }
 651           else
 652             sprintf (buf, "\\%03o", s[i]);
 653           tmp = buf;
 654           break;
 655         }
 656       p = stpcpy (p, tmp);
 657     }
 658   return printable_buf;
 659 }
 660
 661 /* Append a newly allocated structure representing a
 662    character C to the specification list LIST.  */
 663
 664 static void
 665 append_normal_char (list, c)
 666      struct Spec_list *list;
 667      unsigned int c;
 668 {
 669   struct List_element *new;
 670
 671   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 672   new->next = NULL;
 673   new->type = RE_NORMAL_CHAR;
 674   new->u.normal_char = c;
 675   assert (list->tail);
 676   list->tail->next = new;
 677   list->tail = new;
 678 }
 679
 680 /* Append a newly allocated structure representing the range
 681    of characters from FIRST to LAST to the specification list LIST.
 682    Return non-zero if LAST precedes FIRST in the collating sequence,
 683    zero otherwise.  This means that '[c-c]' is acceptable.  */
 684
 685 static int
 686 append_range (list, first, last)
 687      struct Spec_list *list;
 688      unsigned int first;
 689      unsigned int last;
 690 {
 691   struct List_element *new;
 692
 693   if (ORD (first) > ORD (last))
 694     {
 695       char *tmp1 = make_printable_char (first);
 696       char *tmp2 = make_printable_char (last);
 697
 698       error (0, 0,
 699        "range-endpoints of `%s-%s' are in reverse collating sequence order",
 700              tmp1, tmp2);
 701       free (tmp1);
 702       free (tmp2);
 703       return 1;
 704     }
 705   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 706   new->next = NULL;
 707   new->type = RE_RANGE;
 708   new->u.range.first_char = first;
 709   new->u.range.last_char = last;
 710   assert (list->tail);
 711   list->tail->next = new;
 712   list->tail = new;
 713   return 0;
 714 }
 715
 716 /* If CHAR_CLASS_STR is a valid character class string, append a
 717    newly allocated structure representing that character class to the end
 718    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 719    a valid string, print an error message and return non-zero.  */
 720
 721 static int
 722 append_char_class (list, char_class_str, len)
 723      struct Spec_list *list;
 724      unsigned char *char_class_str;
 725      int len;
 726 {
 727   enum Char_class char_class;
 728   struct List_element *new;
 729
 730   char_class = look_up_char_class (char_class_str);
 731   if (char_class == CC_NO_CLASS)
 732     {
 733       char *tmp = make_printable_str (char_class_str, len);
 734
 735       error (0, 0, "invalid character class `%s'", tmp);
 736       free (tmp);
 737       return 1;
 738     }
 739   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 740   new->next = NULL;
 741   new->type = RE_CHAR_CLASS;
 742   new->u.char_class = char_class;
 743   assert (list->tail);
 744   list->tail->next = new;
 745   list->tail = new;
 746   return 0;
 747 }
 748
 749 /* Append a newly allocated structure representing a [c*n]
 750    repeated character construct to the specification list LIST.
 751    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 752    is a non-negative repeat count.  */
 753
 754 static void
 755 append_repeated_char (list, the_char, repeat_count)
 756      struct Spec_list *list;
 757      unsigned int the_char;
 758      long int repeat_count;
 759 {
 760   struct List_element *new;
 761
 762   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 763   new->next = NULL;
 764   new->type = RE_REPEATED_CHAR;
 765   new->u.repeated_char.the_repeated_char = the_char;
 766   new->u.repeated_char.repeat_count = repeat_count;
 767   assert (list->tail);
 768   list->tail->next = new;
 769   list->tail = new;
 770 }
 771
 772 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 773    the length of that string, LEN, if LEN is exactly one, append
 774    a newly allocated structure representing the specified
 775    equivalence class to the specification list, LIST and return zero.
 776    If LEN is not 1, issue an error message and return non-zero.  */
 777
 778 static int
 779 append_equiv_class (list, equiv_class_str, len)
 780      struct Spec_list *list;
 781      unsigned char *equiv_class_str;
 782      int len;
 783 {
 784   struct List_element *new;
 785
 786   if (len != 1)
 787     {
 788       char *tmp = make_printable_str (equiv_class_str, len);
 789
 790       error (0, 0, "%s: equivalence class operand must be a single character",
 791              tmp);
 792       free (tmp);
 793       return 1;
 794     }
 795   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 796   new->next = NULL;
 797   new->type = RE_EQUIV_CLASS;
 798   new->u.equiv_code = *equiv_class_str;
 799   assert (list->tail);
 800   list->tail->next = new;
 801   list->tail = new;
 802   return 0;
 803 }
 804
 805 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 806    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 807    NUL bytes, and is not NUL-terminated.  */
 808
 809 static unsigned char *
 810 substr (p, first_idx, last_idx)
 811      const unsigned char *p;
 812      int first_idx;
 813      int last_idx;
 814 {
 815   int len = last_idx - first_idx + 1;
 816   unsigned char *tmp = (unsigned char *) xmalloc (len);
 817
 818   assert (first_idx <= last_idx);
 819   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 820   memcpy (tmp, p + first_idx, len);
 821   return tmp;
 822 }
 823
 824 /* Search forward starting at START_IDX for the 2-char sequence
 825    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 826    a sequence is found, return the index of the first character,
 827    otherwise return -1.  P may contain zero bytes.  */
 828
 829 static int
 830 find_closing_delim (p, start_idx, p_len, pre_bracket_char)
 831      const unsigned char *p;
 832      int start_idx;
 833      int p_len;
 834      unsigned int pre_bracket_char;
 835 {
 836   int i;
 837
 838   for (i = start_idx; i < p_len - 1; i++)
 839     if (p[i] == pre_bracket_char && p[i + 1] == ']')
 840       return i;
 841   return -1;
 842 }
 843
 844 /* Convert a string S with explicit length LEN, possibly
 845    containing embedded zero bytes, to a long integer value.
 846    If the string represents a negative value, a value larger
 847    than LONG_MAX, or if all LEN characters do not represent a
 848    valid integer, return non-zero and do not modify *VAL.
 849    Otherwise, return zero and set *VAL to the converted value.  */
 850
 851 static int
 852 non_neg_strtol (s, len, val)
 853      const unsigned char *s;
 854      int len;
 855      long int *val;
 856 {
 857   int i;
 858   long sum = 0;
 859   unsigned int base;
 860
 861   if (len <= 0)
 862     return 1;
 863   if (s[0] == '0')
 864     base = 8;
 865   else if (ISDIGIT (s[0]))
 866     base = 10;
 867   else
 868     return 1;
 869
 870   for (i = 0; i < len; i++)
 871     {
 872       int c = s[i] - '0';
 873
 874       if (c >= base || c < 0)
 875         return 1;
 876       if (i > 8 && sum > (LONG_MAX - c) / base)
 877         return 1;
 878       sum = sum * base + c;
 879     }
 880   *val = sum;
 881   return 0;
 882 }
 883
 884 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 885    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 886    return the character and the repeat count through the arg pointers,
 887    CHAR_TO_REPEAT and N, and then return the index of the closing
 888    bracket as the function value.  If the second character following
 889    the opening bracket is not `*' or if no closing bracket can be
 890    found, return -1.  If a closing bracket is found and the
 891    second char is `*', but the string between the `*' and `]' isn't
 892    empty, an octal number, or a decimal number, print an error message
 893    and return -2.  */
 894
 895 static int
 896 find_bracketed_repeat (p, start_idx, p_len, char_to_repeat, n)
 897      const unsigned char *p;
 898      int start_idx;
 899      int p_len;
 900      unsigned int *char_to_repeat;
 901      long int *n;
 902 {
 903   int i;
 904
 905   assert (start_idx + 1 < p_len);
 906   if (p[start_idx + 1] != '*')
 907     return -1;
 908
 909   for (i = start_idx + 2; i < p_len; i++)
 910     {
 911       if (p[i] == ']')
 912         {
 913           const unsigned char *digit_str;
 914           int digit_str_len = i - start_idx - 2;
 915
 916           *char_to_repeat = p[start_idx];
 917           if (digit_str_len == 0)
 918             {
 919               /* We've matched [c*] -- no explicit repeat count.  */
 920               *n = 0;
 921               return i;
 922             }
 923
 924           /* Here, we have found [c*s] where s should be a string
 925              of octal or decimal digits.  */
 926           digit_str = &p[start_idx + 2];
 927           if (non_neg_strtol (digit_str, digit_str_len, n))
 928             {
 929               char *tmp = make_printable_str (digit_str, digit_str_len);
 930               error (0, 0, "invalid repeat count `%s' in [c*n] construct", tmp);
 931               free (tmp);
 932               return -2;
 933             }
 934           return i;
 935         }
 936     }
 937   return -1;                    /* No bracket found.  */
 938 }
 939
 940 /* Convert string UNESACPED_STRING (which has been preprocessed to
 941    convert backslash-escape sequences) of length LEN characters into
 942    a linked list of the following 5 types of constructs:
 943       - [:str:] Character class where `str' is one of the 12 valid strings.
 944       - [=c=] Equivalence class where `c' is any single character.
 945       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 946           However, if `n' is present, it must be a non-negative octal or
 947           decimal integer.
 948       - r-s Range of characters from `r' to `s'.  The second endpoint must
 949           not precede the first in the current collating sequence.
 950       - c Any other character is interpreted as itself.  */
 951
 952 static int
 953 build_spec_list (unescaped_string, len, result)
 954      const unsigned char *unescaped_string;
 955      int len;
 956      struct Spec_list *result;
 957 {
 958   const unsigned char *p;
 959   int i;
 960
 961   p = unescaped_string;
 962
 963   /* The main for-loop below recognizes the 4 multi-character constructs.
 964      A character that matches (in its context) none of the multi-character
 965      constructs is classified as `normal'.  Since all multi-character
 966      constructs have at least 3 characters, any strings of length 2 or
 967      less are composed solely of normal characters.  Hence, the index of
 968      the outer for-loop runs only as far as LEN-2.  */
 969
 970   for (i = 0; i < len - 2;)
 971     {
 972       switch (p[i])
 973         {
 974           int fall_through;
 975         case '[':
 976           fall_through = 0;
 977           switch (p[i + 1])
 978             {
 979               int closing_delim_idx;
 980               int closing_bracket_idx;
 981               unsigned int char_to_repeat;
 982               long repeat_count;
 983             case ':':
 984             case '=':
 985               closing_delim_idx = find_closing_delim (p, i + 2, len, p[i + 1]);
 986               if (closing_delim_idx >= 0)
 987                 {
 988                   int parse_failed;
 989                   unsigned char *opnd_str = substr (p, i + 2,
 990                                                     closing_delim_idx - 1);
 991                   if (p[i + 1] == ':')
 992                     parse_failed = append_char_class (result, opnd_str,
 993                                      (closing_delim_idx - 1) - (i + 2) + 1);
 994                   else
 995                     parse_failed = append_equiv_class (result, opnd_str,
 996                                      (closing_delim_idx - 1) - (i + 2) + 1);
 997                   free (opnd_str);
 998
 999                   /* Return non-zero if append_*_class reports a problem.  */
1000                   if (parse_failed)
1001                     return 1;
1002                   else
1003                     i = closing_delim_idx + 2;
1004                   break;
1005                 }
1006               /* Else fall through.  This could be [:*] or [=*].  */
1007             default:
1008               /* Determine whether this is a bracketed repeat range
1009                  matching the RE \[.\*(dec_or_oct_number)?\].  */
1010               closing_bracket_idx = find_bracketed_repeat (p, i + 1,
1011                                        len, &char_to_repeat, &repeat_count);
1012               if (closing_bracket_idx >= 0)
1013                 {
1014                   append_repeated_char (result, char_to_repeat, repeat_count);
1015                   i = closing_bracket_idx + 1;
1016                   break;
1017                 }
1018               else if (closing_bracket_idx == -1)
1019                 {
1020                   fall_through = 1;
1021                 }
1022               else
1023                 /* Found a string that looked like [c*n] but the
1024                    numeric part was invalid.  */
1025                 return 1;
1026               break;
1027             }
1028           if (!fall_through)
1029             break;
1030
1031           /* Here if we've tried to match [c*n], [:str:], and [=c=]
1032              and none of them fit.  So we still have to consider the
1033              range `[-c' (from `[' to `c').  */
1034         default:
1035           /* Look ahead one char for ranges like a-z.  */
1036           if (p[i + 1] == '-')
1037             {
1038               if (append_range (result, p[i], p[i + 2]))
1039                 return 1;
1040               i += 3;
1041             }
1042           else
1043             {
1044               append_normal_char (result, p[i]);
1045               ++i;
1046             }
1047           break;
1048         }
1049     }
1050
1051   /* Now handle the (2 or fewer) remaining characters p[i]..p[len - 1].  */
1052   for (; i < len; i++)
1053     append_normal_char (result, p[i]);
1054
1055   return 0;
1056 }
1057
1058 /* Given a Spec_list S (with its saved state implicit in the values
1059    of its members `tail' and `state'), return the next single character
1060    in the expansion of S's constructs.  If the last character of S was
1061    returned on the previous call or if S was empty, this function
1062    returns -1.  For example, successive calls to get_next where S
1063    represents the spec-string 'a-d[y*3]' will return the sequence
1064    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1065    which the returned character comes is [:upper:] or [:lower:], the
1066    parameter CLASS is given a value to indicate which it was.  Otherwise
1067    CLASS is set to UL_NONE.  This value is used only when constructing
1068    the translation table to verify that any occurrences of upper and
1069    lower class constructs in the spec-strings appear in the same relative
1070    positions.  */
1071
1072 static int
1073 get_next (s, class)
1074      struct Spec_list *s;
1075      enum Upper_Lower_class *class;
1076 {
1077   struct List_element *p;
1078   int return_val;
1079   int i;
1080
1081   if (class)
1082     *class = UL_NONE;
1083
1084   if (s->state == BEGIN_STATE)
1085     {
1086       s->tail = s->head->next;
1087       s->state = NEW_ELEMENT;
1088     }
1089
1090   p = s->tail;
1091   if (p == NULL)
1092     return -1;
1093
1094   switch (p->type)
1095     {
1096     case RE_NORMAL_CHAR:
1097       return_val = p->u.normal_char;
1098       s->state = NEW_ELEMENT;
1099       s->tail = p->next;
1100       break;
1101
1102     case RE_RANGE:
1103       if (s->state == NEW_ELEMENT)
1104         s->state = ORD (p->u.range.first_char);
1105       else
1106         ++(s->state);
1107       return_val = CHR (s->state);
1108       if (s->state == ORD (p->u.range.last_char))
1109         {
1110           s->tail = p->next;
1111           s->state = NEW_ELEMENT;
1112         }
1113       break;
1114
1115     case RE_CHAR_CLASS:
1116       if (s->state == NEW_ELEMENT)
1117         {
1118           for (i = 0; i < N_CHARS; i++)
1119             if (is_char_class_member (p->u.char_class, i))
1120               break;
1121           assert (i < N_CHARS);
1122           s->state = i;
1123         }
1124       assert (is_char_class_member (p->u.char_class, s->state));
1125       return_val = CHR (s->state);
1126       for (i = s->state + 1; i < N_CHARS; i++)
1127         if (is_char_class_member (p->u.char_class, i))
1128           break;
1129       if (i < N_CHARS)
1130         s->state = i;
1131       else
1132         {
1133           s->tail = p->next;
1134           s->state = NEW_ELEMENT;
1135         }
1136       if (class)
1137         {
1138           switch (p->u.char_class)
1139             {
1140             case CC_LOWER:
1141               *class = UL_LOWER;
1142               break;
1143             case CC_UPPER:
1144               *class = UL_UPPER;
1145               break;
1146             default:
1147               /* empty */
1148               break;
1149             }
1150         }
1151       break;
1152
1153     case RE_EQUIV_CLASS:
1154       /* FIXME: this assumes that each character is alone in its own
1155          equivalence class (which appears to be correct for my
1156          LC_COLLATE.  But I don't know of any function that allows
1157          one to determine a character's equivalence class.  */
1158
1159       return_val = p->u.equiv_code;
1160       s->state = NEW_ELEMENT;
1161       s->tail = p->next;
1162       break;
1163
1164     case RE_REPEATED_CHAR:
1165       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1166       assert (p->u.repeated_char.repeat_count >= 0);
1167       if (p->u.repeated_char.repeat_count == 0)
1168         {
1169           s->tail = p->next;
1170           s->state = NEW_ELEMENT;
1171           return_val = get_next (s, class);
1172         }
1173       else
1174         {
1175           if (s->state == NEW_ELEMENT)
1176             {
1177               s->state = 0;
1178             }
1179           ++(s->state);
1180           return_val = p->u.repeated_char.the_repeated_char;
1181           if (p->u.repeated_char.repeat_count > 0
1182               && s->state == p->u.repeated_char.repeat_count)
1183             {
1184               s->tail = p->next;
1185               s->state = NEW_ELEMENT;
1186             }
1187         }
1188       break;
1189
1190     case RE_NO_TYPE:
1191       abort ();
1192       break;
1193
1194     default:
1195       abort ();
1196       break;
1197     }
1198
1199   return return_val;
1200 }
1201
1202 /* This is a minor kludge.  This function is called from
1203    get_spec_stats to determine the cardinality of a set derived
1204    from a complemented string.  It's a kludge in that some of the
1205    same operations are (duplicated) performed in set_initialize.  */
1206
1207 static int
1208 card_of_complement (s)
1209      struct Spec_list *s;
1210 {
1211   int c;
1212   int cardinality = N_CHARS;
1213   SET_TYPE in_set[N_CHARS];
1214
1215   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1216   s->state = BEGIN_STATE;
1217   while ((c = get_next (s, NULL)) != -1)
1218     if (!in_set[c]++)
1219       --cardinality;
1220   return cardinality;
1221 }
1222
1223 /* Gather statistics about the spec-list S in preparation for the tests
1224    in validate that determine the consistency of the specs.  This function
1225    is called at most twice; once for string1, and again for any string2.
1226    LEN_S1 < 0 indicates that this is the first call and that S represents
1227    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1228    constructs in string1, and we can use its value to resolve any
1229    indefinite repeat construct in S (which represents string2).  Hence,
1230    this function has the side-effect that it converts a valid [c*]
1231    construct in string2 to [c*n] where n is large enough (or 0) to give
1232    string2 the same length as string1.  For example, with the command
1233    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1234    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1235
1236 static void
1237 get_spec_stats (s, len_s1)
1238      struct Spec_list *s;
1239      int len_s1;
1240 {
1241   struct List_element *p;
1242   struct List_element *indefinite_repeat_element = NULL;
1243   int len = 0;
1244
1245   s->n_indefinite_repeats = 0;
1246   s->has_equiv_class = 0;
1247   s->has_restricted_char_class = 0;
1248   s->has_upper_or_lower = 0;
1249   for (p = s->head->next; p; p = p->next)
1250     {
1251       switch (p->type)
1252         {
1253           int i;
1254         case RE_NORMAL_CHAR:
1255           ++len;
1256           break;
1257
1258         case RE_RANGE:
1259           assert (p->u.range.last_char >= p->u.range.first_char);
1260           len += p->u.range.last_char - p->u.range.first_char + 1;
1261           break;
1262
1263         case RE_CHAR_CLASS:
1264           for (i = 0; i < N_CHARS; i++)
1265             if (is_char_class_member (p->u.char_class, i))
1266               ++len;
1267           switch (p->u.char_class)
1268             {
1269             case CC_UPPER:
1270             case CC_LOWER:
1271               s->has_upper_or_lower = 1;
1272               break;
1273             default:
1274               s->has_restricted_char_class = 1;
1275               break;
1276             }
1277           break;
1278
1279         case RE_EQUIV_CLASS:
1280           for (i = 0; i < N_CHARS; i++)
1281             if (is_equiv_class_member (p->u.equiv_code, i))
1282               ++len;
1283           s->has_equiv_class = 1;
1284           break;
1285
1286         case RE_REPEATED_CHAR:
1287           if (p->u.repeated_char.repeat_count > 0)
1288             len += p->u.repeated_char.repeat_count;
1289           else if (p->u.repeated_char.repeat_count == 0)
1290             {
1291               indefinite_repeat_element = p;
1292               ++(s->n_indefinite_repeats);
1293             }
1294           break;
1295
1296         case RE_NO_TYPE:
1297           assert (0);
1298           break;
1299         }
1300     }
1301
1302   if (len_s1 >= len && s->n_indefinite_repeats == 1)
1303     {
1304       indefinite_repeat_element->u.repeated_char.repeat_count = len_s1 - len;
1305       len = len_s1;
1306     }
1307   if (complement && len_s1 < 0)
1308     s->length = card_of_complement (s);
1309   else
1310     s->length = len;
1311   return;
1312 }
1313
1314 static void
1315 spec_init (spec_list)
1316      struct Spec_list *spec_list;
1317 {
1318   spec_list->head = spec_list->tail =
1319     (struct List_element *) xmalloc (sizeof (struct List_element));
1320   spec_list->head->next = NULL;
1321 }
1322
1323 /* This function makes two passes over the argument string S.  The first
1324    one converts all \c and \ddd escapes to their one-byte representations.
1325    The second constructs a linked specification list, SPEC_LIST, of the
1326    characters and constructs that comprise the argument string.  If either
1327    of these passes detects an error, this function returns non-zero.  */
1328
1329 static int
1330 parse_str (s, spec_list)
1331      unsigned char *s;
1332      struct Spec_list *spec_list;
1333 {
1334   int len;
1335
1336   if (unquote (s, &len))
1337     return 1;
1338   if (build_spec_list (s, len, spec_list))
1339     return 1;
1340   return 0;
1341 }
1342
1343 /* Given two specification lists, S1 and S2, and assuming that
1344    S1->length > S2->length, append a single [c*n] element to S2 where c
1345    is the last character in the expansion of S2 and n is the difference
1346    between the two lengths.
1347    Upon successful completion, S2->length is set to S1->length.  The only
1348    way this function can fail to make S2 as long as S1 is when S2 has
1349    zero-length, since in that case, there is no last character to repeat.
1350    So S2->length is required to be at least 1.
1351
1352    Providing this functionality allows the user to do some pretty
1353    non-BSD (and non-portable) things:  For example, the command
1354        tr -cs '[:upper:]0-9' '[:lower:]'
1355    is almost guaranteed to give results that depend on your collating
1356    sequence.  */
1357
1358 static void
1359 string2_extend (s1, s2)
1360      const struct Spec_list *s1;
1361      struct Spec_list *s2;
1362 {
1363   struct List_element *p;
1364   int char_to_repeat;
1365   int i;
1366
1367   assert (translating);
1368   assert (s1->length > s2->length);
1369   assert (s2->length > 0);
1370
1371   p = s2->tail;
1372   switch (p->type)
1373     {
1374     case RE_NORMAL_CHAR:
1375       char_to_repeat = p->u.normal_char;
1376       break;
1377     case RE_RANGE:
1378       char_to_repeat = p->u.range.last_char;
1379       break;
1380     case RE_CHAR_CLASS:
1381       for (i = N_CHARS; i >= 0; i--)
1382         if (is_char_class_member (p->u.char_class, i))
1383           break;
1384       assert (i >= 0);
1385       char_to_repeat = CHR (i);
1386       break;
1387
1388     case RE_REPEATED_CHAR:
1389       char_to_repeat = p->u.repeated_char.the_repeated_char;
1390       break;
1391
1392     case RE_EQUIV_CLASS:
1393       /* This shouldn't happen, because validate exits with an error
1394          if it finds an equiv class in string2 when translating.  */
1395       abort ();
1396       break;
1397
1398     case RE_NO_TYPE:
1399       abort ();
1400       break;
1401
1402     default:
1403       abort ();
1404       break;
1405     }
1406
1407   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1408   s2->length = s1->length;
1409   return;
1410 }
1411
1412 /* Die with an error message if S1 and S2 describe strings that
1413    are not valid with the given command line switches.
1414    A side effect of this function is that if a valid [c*] or
1415    [c*0] construct appears in string2, it is converted to [c*n]
1416    with a value for n that makes s2->length == s1->length.  By
1417    the same token, if the --truncate-set1 option is not
1418    given, S2 may be extended.  */
1419
1420 static void
1421 validate (s1, s2)
1422      const struct Spec_list *s1;
1423      struct Spec_list *s2;
1424 {
1425   get_spec_stats (s1, -1);
1426   if (s1->n_indefinite_repeats > 0)
1427     {
1428       error (1, 0, "the [c*] repeat construct may not appear in string1");
1429     }
1430
1431   /* FIXME: it isn't clear from the POSIX spec that this is invalid,
1432      but in the spirit of the other restrictions put on translation
1433      with character classes, this seems a logical interpretation.  */
1434   if (complement && s1->has_upper_or_lower)
1435     {
1436       error (1, 0,
1437              "character classes may not be used when translating \
1438 and complementing");
1439     }
1440
1441   if (s2)
1442     {
1443       get_spec_stats (s2, s1->length);
1444       if (s2->has_restricted_char_class)
1445         {
1446           error (1, 0,
1447                  "when translating, the only character classes that may \
1448 appear in\n\tstring2 are `upper' and `lower'");
1449         }
1450
1451       if (s2->n_indefinite_repeats > 1)
1452         {
1453           error (1, 0, "only one [c*] repeat construct may appear in string2");
1454         }
1455
1456       if (translating)
1457         {
1458           if (s2->has_equiv_class)
1459             {
1460               error (1, 0,
1461                      "[=c=] expressions may not appear in string2 \
1462 when translating");
1463             }
1464
1465           if (s1->length > s2->length)
1466             {
1467               if (!truncate_set1)
1468                 {
1469                   /* string2 must be non-empty unless --truncate-set1 is
1470                      given or string1 is empty.  */
1471
1472                   if (s2->length == 0)
1473                     error (1, 0,
1474                      "when not truncating set1, string2 must be non-empty");
1475                   string2_extend (s1, s2);
1476                 }
1477             }
1478
1479           if (complement && s2->has_upper_or_lower)
1480             error (1, 0,
1481                    "character classes may not be used when translating \
1482 and complementing");
1483         }
1484       else
1485         /* Not translating.  */
1486         {
1487           if (s2->n_indefinite_repeats > 0)
1488             error (1, 0,
1489                    "the [c*] construct may appear in string2 only \
1490 when translating");
1491         }
1492     }
1493 }
1494
1495 /* Read buffers of SIZE bytes via the function READER (if READER is
1496    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1497    read_and_delete or read_and_xlate.  After each buffer is read, it is
1498    processed and written to stdout.  The buffers are processed so that
1499    multiple consecutive occurrences of the same character in the input
1500    stream are replaced by a single occurrence of that character if the
1501    character is in the squeeze set.  */
1502
1503 static void
1504 squeeze_filter (buf, size, reader)
1505      unsigned char *buf;
1506      long int size;
1507      PFI reader;
1508 {
1509   unsigned int char_to_squeeze = NOT_A_CHAR;
1510   int i = 0;
1511   int nr = 0;
1512
1513   for (;;)
1514     {
1515       int begin;
1516
1517       if (i >= nr)
1518         {
1519           if (reader == NULL)
1520             nr = safe_read (0, (char *) buf, size);
1521           else
1522             nr = (*reader) (buf, size, NULL);
1523
1524           if (nr < 0)
1525             error (1, errno, "read error");
1526           if (nr == 0)
1527             break;
1528           i = 0;
1529         }
1530
1531       begin = i;
1532
1533       if (char_to_squeeze == NOT_A_CHAR)
1534         {
1535           int out_len;
1536           /* Here, by being a little tricky, we can get a significant
1537              performance increase in most cases when the input is
1538              reasonably large.  Since tr will modify the input only
1539              if two consecutive (and identical) input characters are
1540              in the squeeze set, we can step by two through the data
1541              when searching for a character in the squeeze set.  This
1542              means there may be a little more work in a few cases and
1543              perhaps twice as much work in the worst cases where most
1544              of the input is removed by squeezing repeats.  But most
1545              uses of this functionality seem to remove less than 20-30%
1546              of the input.  */
1547           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1548             ;                   /* empty */
1549
1550           /* There is a special case when i == nr and we've just
1551              skipped a character (the last one in buf) that is in
1552              the squeeze set.  */
1553           if (i == nr && in_squeeze_set[buf[i - 1]])
1554             --i;
1555
1556           if (i >= nr)
1557             out_len = nr - begin;
1558           else
1559             {
1560               char_to_squeeze = buf[i];
1561               /* We're about to output buf[begin..i].  */
1562               out_len = i - begin + 1;
1563
1564               /* But since we stepped by 2 in the loop above,
1565                  out_len may be one too large.  */
1566               if (i > 0 && buf[i - 1] == char_to_squeeze)
1567                 --out_len;
1568
1569               /* Advance i to the index of first character to be
1570                  considered when looking for a char different from
1571                  char_to_squeeze.  */
1572               ++i;
1573             }
1574           if (out_len > 0
1575               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1576             error (1, errno, "write error");
1577         }
1578
1579       if (char_to_squeeze != NOT_A_CHAR)
1580         {
1581           /* Advance i to index of first char != char_to_squeeze
1582              (or to nr if all the rest of the characters in this
1583              buffer are the same as char_to_squeeze).  */
1584           for (; i < nr && buf[i] == char_to_squeeze; i++)
1585             ;                   /* empty */
1586           if (i < nr)
1587             char_to_squeeze = NOT_A_CHAR;
1588           /* If (i >= nr) we've squeezed the last character in this buffer.
1589              So now we have to read a new buffer and continue comparing
1590              characters against char_to_squeeze.  */
1591         }
1592     }
1593 }
1594
1595 /* Read buffers of SIZE bytes from stdin until one is found that
1596    contains at least one character not in the delete set.  Store
1597    in the array BUF, all characters from that buffer that are not
1598    in the delete set, and return the number of characters saved
1599    or 0 upon EOF.  */
1600
1601 static long
1602 read_and_delete (buf, size, not_used)
1603      unsigned char *buf;
1604      long int size;
1605      PFI not_used;
1606 {
1607   long n_saved;
1608   static int hit_eof = 0;
1609
1610   assert (not_used == NULL);
1611   assert (size > 0);
1612
1613   if (hit_eof)
1614     return 0;
1615
1616   /* This enclosing do-while loop is to make sure that
1617      we don't return zero (indicating EOF) when we've
1618      just deleted all the characters in a buffer.  */
1619   do
1620     {
1621       int i;
1622       int nr = safe_read (0, (char *) buf, size);
1623
1624       if (nr < 0)
1625         error (1, errno, "read error");
1626       if (nr == 0)
1627         {
1628           hit_eof = 1;
1629           return 0;
1630         }
1631
1632       /* This first loop may be a waste of code, but gives much
1633          better performance when no characters are deleted in
1634          the beginning of a buffer.  It just avoids the copying
1635          of buf[i] into buf[n_saved] when it would be a NOP.  */
1636
1637       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1638         /* empty */ ;
1639       n_saved = i;
1640
1641       for (++i; i < nr; i++)
1642         if (!in_delete_set[buf[i]])
1643           buf[n_saved++] = buf[i];
1644     }
1645   while (n_saved == 0);
1646
1647   return n_saved;
1648 }
1649
1650 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1651    perform the in-place and one-to-one mapping specified by the global
1652    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1653
1654 static long
1655 read_and_xlate (buf, size, not_used)
1656      unsigned char *buf;
1657      long int size;
1658      PFI not_used;
1659 {
1660   long chars_read = 0;
1661   static int hit_eof = 0;
1662   int i;
1663
1664   assert (not_used == NULL);
1665   assert (size > 0);
1666
1667   if (hit_eof)
1668     return 0;
1669
1670   chars_read = safe_read (0, (char *) buf, size);
1671   if (chars_read < 0)
1672     error (1, errno, "read error");
1673   if (chars_read == 0)
1674     {
1675       hit_eof = 1;
1676       return 0;
1677     }
1678
1679   for (i = 0; i < chars_read; i++)
1680     buf[i] = xlate[buf[i]];
1681
1682   return chars_read;
1683 }
1684
1685 /* Initialize a boolean membership set IN_SET with the character
1686    values obtained by traversing the linked list of constructs S
1687    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1688    non-zero the resulting set is complemented.  */
1689
1690 static void
1691 set_initialize (s, complement_this_set, in_set)
1692      struct Spec_list *s;
1693      int complement_this_set;
1694      SET_TYPE *in_set;
1695 {
1696   int c;
1697   int i;
1698
1699   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1700   s->state = BEGIN_STATE;
1701   while ((c = get_next (s, NULL)) != -1)
1702     in_set[c] = 1;
1703   if (complement_this_set)
1704     for (i = 0; i < N_CHARS; i++)
1705       in_set[i] = (!in_set[i]);
1706 }
1707
1708 void
1709 main (argc, argv)
1710      int argc;
1711      char **argv;
1712 {
1713   int c;
1714   int non_option_args;
1715   struct Spec_list buf1, buf2;
1716   struct Spec_list *s1 = &buf1;
1717   struct Spec_list *s2 = &buf2;
1718
1719   program_name = argv[0];
1720
1721   while ((c = getopt_long (argc, argv, "cdst", long_options,
1722                            (int *) 0)) != EOF)
1723     {
1724       switch (c)
1725         {
1726         case 0:
1727           break;
1728
1729         case 'c':
1730           complement = 1;
1731           break;
1732
1733         case 'd':
1734           delete = 1;
1735           break;
1736
1737         case 's':
1738           squeeze_repeats = 1;
1739           break;
1740
1741         case 't':
1742           truncate_set1 = 1;
1743           break;
1744
1745         default:
1746           usage (2);
1747           break;
1748         }
1749     }
1750
1751   if (show_version)
1752     {
1753       printf ("tr - %s\n", version_string);
1754       exit (0);
1755     }
1756
1757   if (show_help)
1758     usage (0);
1759
1760   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1761
1762   non_option_args = argc - optind;
1763   translating = (non_option_args == 2 && !delete);
1764
1765   /* Change this test if it is valid to give tr no options and
1766      no args at all.  POSIX doesn't specifically say anything
1767      either way, but it looks like they implied it's invalid
1768      by omission.  If you want to make tr do a slow imitation
1769      of `cat' use `tr a a'.  */
1770   if (non_option_args > 2)
1771     {
1772       error (0, 0, "too many arguments");
1773       usage (2);
1774     }
1775
1776   if (!delete && !squeeze_repeats && non_option_args != 2)
1777     error (1, 0, "two strings must be given when translating");
1778
1779   if (delete && squeeze_repeats && non_option_args != 2)
1780     error (1, 0, "two strings must be given when both \
1781 deleting and squeezing repeats");
1782
1783   /* If --delete is given without --squeeze-repeats, then
1784      only one string argument may be specified.  But POSIX
1785      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1786      is set, pretend we never saw string2.  But I think
1787      this deserves a fatal error, so that's the default.  */
1788   if ((delete && !squeeze_repeats) && non_option_args != 1)
1789     {
1790       if (posix_pedantic && non_option_args == 2)
1791         --non_option_args;
1792       else
1793         error (1, 0,
1794                "only one string may be given when deleting \
1795 without squeezing repeats");
1796     }
1797
1798   if (squeeze_repeats && non_option_args == 0)
1799     error (1, 0, "at least one string must be given when squeezing repeats");
1800
1801   spec_init (s1);
1802   if (parse_str ((unsigned char *) argv[optind], s1))
1803     exit (1);
1804
1805   if (non_option_args == 2)
1806     {
1807       spec_init (s2);
1808       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1809         exit (1);
1810     }
1811   else
1812     s2 = NULL;
1813
1814   validate (s1, s2);
1815
1816   if (squeeze_repeats && non_option_args == 1)
1817     {
1818       set_initialize (s1, complement, in_squeeze_set);
1819       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1820     }
1821   else if (delete && non_option_args == 1)
1822     {
1823       int nr;
1824
1825       set_initialize (s1, complement, in_delete_set);
1826       do
1827         {
1828           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1829           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1830             error (1, errno, "write error");
1831         }
1832       while (nr > 0);
1833     }
1834   else if (squeeze_repeats && delete && non_option_args == 2)
1835     {
1836       set_initialize (s1, complement, in_delete_set);
1837       set_initialize (s2, 0, in_squeeze_set);
1838       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1839     }
1840   else if (translating)
1841     {
1842       if (complement)
1843         {
1844           int i;
1845           SET_TYPE *in_s1 = in_delete_set;
1846
1847           set_initialize (s1, 0, in_s1);
1848           s2->state = BEGIN_STATE;
1849           for (i = 0; i < N_CHARS; i++)
1850             xlate[i] = i;
1851           for (i = 0; i < N_CHARS; i++)
1852             {
1853               if (!in_s1[i])
1854                 {
1855                   int ch = get_next (s2, NULL);
1856                   assert (ch != -1 || truncate_set1);
1857                   if (ch == -1)
1858                     {
1859                       /* This will happen when tr is invoked like e.g.
1860                          tr -cs A-Za-z0-9 '\012'.  */
1861                       break;
1862                     }
1863                   xlate[i] = ch;
1864                 }
1865             }
1866           assert (get_next (s2, NULL) == -1 || truncate_set1);
1867         }
1868       else
1869         {
1870           int c1, c2;
1871           int i;
1872           enum Upper_Lower_class class_s1;
1873           enum Upper_Lower_class class_s2;
1874
1875           for (i = 0; i < N_CHARS; i++)
1876             xlate[i] = i;
1877           s1->state = BEGIN_STATE;
1878           s2->state = BEGIN_STATE;
1879           for (;;)
1880             {
1881               c1 = get_next (s1, &class_s1);
1882               c2 = get_next (s2, &class_s2);
1883               if (!class_ok[(int) class_s1][(int) class_s2])
1884                 error (1, 0,
1885                      "misaligned or mismatched upper and/or lower classes");
1886               /* The following should have been checked by validate...  */
1887               if (c2 == -1)
1888                 break;
1889               xlate[c1] = c2;
1890             }
1891           assert (c1 == -1 || truncate_set1);
1892         }
1893       if (squeeze_repeats)
1894         {
1895           set_initialize (s2, 0, in_squeeze_set);
1896           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
1897         }
1898       else
1899         {
1900           int chars_read;
1901
1902           do
1903             {
1904               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
1905               if (chars_read > 0
1906                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
1907                 error (1, errno, "write error");
1908             }
1909           while (chars_read > 0);
1910         }
1911     }
1912
1913   if (fclose (stdout) == EOF)
1914     error (2, errno, "write error");
1915
1916   if (close (0) != 0)
1917     error (2, errno, "standard input");
1918
1919   exit (0);
1920 }