src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 1995-2008 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Jim Meyering */
  18
  19 #include <config.h>
  20
  21 #include <stdio.h>
  22 #include <assert.h>
  23 #include <sys/types.h>
  24 #include <getopt.h>
  25
  26 #include "system.h"
  27 #include "error.h"
  28 #include "quote.h"
  29 #include "safe-read.h"
  30 #include "xstrtol.h"
  31
  32 /* The official name of this program (e.g., no `g' prefix).  */
  33 #define PROGRAM_NAME "tr"
  34
  35 #define AUTHORS proper_name ("Jim Meyering")
  36
  37 enum { N_CHARS = UCHAR_MAX + 1 };
  38
  39 /* An unsigned integer type big enough to hold a repeat count or an
  40    unsigned character.  POSIX requires support for repeat counts as
  41    high as 2**31 - 1.  Since repeat counts might need to expand to
  42    match the length of an argument string, we need at least size_t to
  43    avoid arbitrary internal limits.  It doesn't cost much to use
  44    uintmax_t, though.  */
  45 typedef uintmax_t count;
  46
  47 /* The value for Spec_list->state that indicates to
  48    get_next that it should initialize the tail pointer.
  49    Its value should be as large as possible to avoid conflict
  50    a valid value for the state field -- and that may be as
  51    large as any valid repeat_count.  */
  52 #define BEGIN_STATE (UINTMAX_MAX - 1)
  53
  54 /* The value for Spec_list->state that indicates to
  55    get_next that the element pointed to by Spec_list->tail is
  56    being considered for the first time on this pass through the
  57    list -- it indicates that get_next should make any necessary
  58    initializations.  */
  59 #define NEW_ELEMENT (BEGIN_STATE + 1)
  60
  61 /* The maximum possible repeat count.  Due to how the states are
  62    implemented, it can be as much as BEGIN_STATE.  */
  63 #define REPEAT_COUNT_MAXIMUM BEGIN_STATE
  64
  65 /* The following (but not CC_NO_CLASS) are indices into the array of
  66    valid character class strings.  */
  67 enum Char_class
  68   {
  69     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  70     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  71     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  72     CC_NO_CLASS = 9999
  73   };
  74
  75 /* Character class to which a character (returned by get_next) belonged;
  76    but it is set only if the construct from which the character was obtained
  77    was one of the character classes [:upper:] or [:lower:].  The value
  78    is used only when translating and then, only to make sure that upper
  79    and lower class constructs have the same relative positions in string1
  80    and string2.  */
  81 enum Upper_Lower_class
  82   {
  83     UL_LOWER,
  84     UL_UPPER,
  85     UL_NONE
  86   };
  87
  88 /* The type of a List_element.  See build_spec_list for more details.  */
  89 enum Range_element_type
  90   {
  91     RE_NORMAL_CHAR,
  92     RE_RANGE,
  93     RE_CHAR_CLASS,
  94     RE_EQUIV_CLASS,
  95     RE_REPEATED_CHAR
  96   };
  97
  98 /* One construct in one of tr's argument strings.
  99    For example, consider the POSIX version of the classic tr command:
 100        tr -cs 'a-zA-Z_' '[\n*]'
 101    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 102    and a single normal character, `_'.  String2 has one construct.  */
 103 struct List_element
 104   {
 105     enum Range_element_type type;
 106     struct List_element *next;
 107     union
 108       {
 109         unsigned char normal_char;
 110         struct                  /* unnamed */
 111           {
 112             unsigned char first_char;
 113             unsigned char last_char;
 114           }
 115         range;
 116         enum Char_class char_class;
 117         unsigned char equiv_code;
 118         struct                  /* unnamed */
 119           {
 120             unsigned char the_repeated_char;
 121             count repeat_count;
 122           }
 123         repeated_char;
 124       }
 125     u;
 126   };
 127
 128 /* Each of tr's argument strings is parsed into a form that is easier
 129    to work with: a linked list of constructs (struct List_element).
 130    Each Spec_list structure also encapsulates various attributes of
 131    the corresponding argument string.  The attributes are used mainly
 132    to verify that the strings are valid in the context of any options
 133    specified (like -s, -d, or -c).  The main exception is the member
 134    `tail', which is first used to construct the list.  After construction,
 135    it is used by get_next to save its state when traversing the list.
 136    The member `state' serves a similar function.  */
 137 struct Spec_list
 138   {
 139     /* Points to the head of the list of range elements.
 140        The first struct is a dummy; its members are never used.  */
 141     struct List_element *head;
 142
 143     /* When appending, points to the last element.  When traversing via
 144        get_next(), points to the element to process next.  Setting
 145        Spec_list.state to the value BEGIN_STATE before calling get_next
 146        signals get_next to initialize tail to point to head->next.  */
 147     struct List_element *tail;
 148
 149     /* Used to save state between calls to get_next.  */
 150     count state;
 151
 152     /* Length, in the sense that length ('a-z[:digit:]123abc')
 153        is 42 ( = 26 + 10 + 6).  */
 154     count length;
 155
 156     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 157     size_t n_indefinite_repeats;
 158
 159     /* If n_indefinite_repeats is nonzero, this points to the List_element
 160        corresponding to the last [c*] or [c*0] construct encountered in
 161        this spec.  Otherwise it is undefined.  */
 162     struct List_element *indefinite_repeat_element;
 163
 164     /* True if this spec contains at least one equivalence
 165        class construct e.g. [=c=].  */
 166     bool has_equiv_class;
 167
 168     /* True if this spec contains at least one character class
 169        construct.  E.g. [:digit:].  */
 170     bool has_char_class;
 171
 172     /* True if this spec contains at least one of the character class
 173        constructs (all but upper and lower) that aren't allowed in s2.  */
 174     bool has_restricted_char_class;
 175   };
 176
 177 /* A representation for escaped string1 or string2.  As a string is parsed,
 178    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 179    etc.) are marked as such in this structure by setting the corresponding
 180    entry in the ESCAPED vector.  */
 181 struct E_string
 182 {
 183   char *s;
 184   bool *escaped;
 185   size_t len;
 186 };
 187
 188 /* Return nonzero if the Ith character of escaped string ES matches C
 189    and is not escaped itself.  */
 190 static inline bool
 191 es_match (struct E_string const *es, size_t i, char c)
 192 {
 193   return es->s[i] == c && !es->escaped[i];
 194 }
 195
 196 /* When true, each sequence in the input of a repeated character
 197    (call it c) is replaced (in the output) by a single occurrence of c
 198    for every c in the squeeze set.  */
 199 static bool squeeze_repeats = false;
 200
 201 /* When true, removes characters in the delete set from input.  */
 202 static bool delete = false;
 203
 204 /* Use the complement of set1 in place of set1.  */
 205 static bool complement = false;
 206
 207 /* When tr is performing translation and string1 is longer than string2,
 208    POSIX says that the result is unspecified.  That gives the implementor
 209    of a POSIX conforming version of tr two reasonable choices for the
 210    semantics of this case.
 211
 212    * The BSD tr pads string2 to the length of string1 by
 213    repeating the last character in string2.
 214
 215    * System V tr ignores characters in string1 that have no
 216    corresponding character in string2.  That is, string1 is effectively
 217    truncated to the length of string2.
 218
 219    When nonzero, this flag causes GNU tr to imitate the behavior
 220    of System V tr when translating with string1 longer than string2.
 221    The default is to emulate BSD tr.  This flag is ignored in modes where
 222    no translation is performed.  Emulating the System V tr
 223    in this exceptional case causes the relatively common BSD idiom:
 224
 225        tr -cs A-Za-z0-9 '\012'
 226
 227    to break (it would convert only zero bytes, rather than all
 228    non-alphanumerics, to newlines).
 229
 230    WARNING: This switch does not provide general BSD or System V
 231    compatibility.  For example, it doesn't disable the interpretation
 232    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 233    some unfortunate coincidence you use such constructs in scripts
 234    expecting to use some other version of tr, the scripts will break.  */
 235 static bool truncate_set1 = false;
 236
 237 /* An alias for (!delete && non_option_args == 2).
 238    It is set in main and used there and in validate().  */
 239 static bool translating;
 240
 241 static char io_buf[BUFSIZ];
 242
 243 static char const *const char_class_name[] =
 244 {
 245   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 246   "lower", "print", "punct", "space", "upper", "xdigit"
 247 };
 248 enum { N_CHAR_CLASSES = sizeof char_class_name / sizeof char_class_name[0] };
 249
 250 /* Array of boolean values.  A character `c' is a member of the
 251    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 252    set is defined by the last (possibly, the only) string argument
 253    on the command line when the squeeze option is given.  */
 254 static bool in_squeeze_set[N_CHARS];
 255
 256 /* Array of boolean values.  A character `c' is a member of the
 257    delete set if and only if in_delete_set[c] is true.  The delete
 258    set is defined by the first (or only) string argument on the
 259    command line when the delete option is given.  */
 260 static bool in_delete_set[N_CHARS];
 261
 262 /* Array of character values defining the translation (if any) that
 263    tr is to perform.  Translation is performed only when there are
 264    two specification strings and the delete switch is not given.  */
 265 static char xlate[N_CHARS];
 266
 267 static struct option const long_options[] =
 268 {
 269   {"complement", no_argument, NULL, 'c'},
 270   {"delete", no_argument, NULL, 'd'},
 271   {"squeeze-repeats", no_argument, NULL, 's'},
 272   {"truncate-set1", no_argument, NULL, 't'},
 273   {GETOPT_HELP_OPTION_DECL},
 274   {GETOPT_VERSION_OPTION_DECL},
 275   {NULL, 0, NULL, 0}
 276 };
 277 \f
 278 void
 279 usage (int status)
 280 {
 281   if (status != EXIT_SUCCESS)
 282     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 283              program_name);
 284   else
 285     {
 286       printf (_("\
 287 Usage: %s [OPTION]... SET1 [SET2]\n\
 288 "),
 289               program_name);
 290       fputs (_("\
 291 Translate, squeeze, and/or delete characters from standard input,\n\
 292 writing to standard output.\n\
 293 \n\
 294   -c, -C, --complement    first complement SET1\n\
 295   -d, --delete            delete characters in SET1, do not translate\n\
 296   -s, --squeeze-repeats   replace each input sequence of a repeated character\n\
 297                             that is listed in SET1 with a single occurrence\n\
 298                             of that character\n\
 299   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 300 "), stdout);
 301       fputs (HELP_OPTION_DESCRIPTION, stdout);
 302       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 303       fputs (_("\
 304 \n\
 305 SETs are specified as strings of characters.  Most represent themselves.\n\
 306 Interpreted sequences are:\n\
 307 \n\
 308   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 309   \\\\              backslash\n\
 310   \\a              audible BEL\n\
 311   \\b              backspace\n\
 312   \\f              form feed\n\
 313   \\n              new line\n\
 314   \\r              return\n\
 315   \\t              horizontal tab\n\
 316 "), stdout);
 317      fputs (_("\
 318   \\v              vertical tab\n\
 319   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 320   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 321   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 322   [:alnum:]       all letters and digits\n\
 323   [:alpha:]       all letters\n\
 324   [:blank:]       all horizontal whitespace\n\
 325   [:cntrl:]       all control characters\n\
 326   [:digit:]       all digits\n\
 327 "), stdout);
 328      fputs (_("\
 329   [:graph:]       all printable characters, not including space\n\
 330   [:lower:]       all lower case letters\n\
 331   [:print:]       all printable characters, including space\n\
 332   [:punct:]       all punctuation characters\n\
 333   [:space:]       all horizontal or vertical whitespace\n\
 334   [:upper:]       all upper case letters\n\
 335   [:xdigit:]      all hexadecimal digits\n\
 336   [=CHAR=]        all characters which are equivalent to CHAR\n\
 337 "), stdout);
 338      fputs (_("\
 339 \n\
 340 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 341 -t may be used only when translating.  SET2 is extended to length of\n\
 342 SET1 by repeating its last character as necessary.  \
 343 "), stdout);
 344      fputs (_("\
 345 Excess characters\n\
 346 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 347 expand in ascending order; used in SET2 while translating, they may\n\
 348 only be used in pairs to specify case conversion.  \
 349 "), stdout);
 350      fputs (_("\
 351 -s uses SET1 if not\n\
 352 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 353 translation or deletion.\n\
 354 "), stdout);
 355       emit_bug_reporting_address ();
 356     }
 357   exit (status);
 358 }
 359
 360 /* Return nonzero if the character C is a member of the
 361    equivalence class containing the character EQUIV_CLASS.  */
 362
 363 static inline bool
 364 is_equiv_class_member (unsigned char equiv_class, unsigned char c)
 365 {
 366   return (equiv_class == c);
 367 }
 368
 369 /* Return true if the character C is a member of the
 370    character class CHAR_CLASS.  */
 371
 372 static bool
 373 is_char_class_member (enum Char_class char_class, unsigned char c)
 374 {
 375   int result;
 376
 377   switch (char_class)
 378     {
 379     case CC_ALNUM:
 380       result = isalnum (c);
 381       break;
 382     case CC_ALPHA:
 383       result = isalpha (c);
 384       break;
 385     case CC_BLANK:
 386       result = isblank (c);
 387       break;
 388     case CC_CNTRL:
 389       result = iscntrl (c);
 390       break;
 391     case CC_DIGIT:
 392       result = isdigit (c);
 393       break;
 394     case CC_GRAPH:
 395       result = isgraph (c);
 396       break;
 397     case CC_LOWER:
 398       result = islower (c);
 399       break;
 400     case CC_PRINT:
 401       result = isprint (c);
 402       break;
 403     case CC_PUNCT:
 404       result = ispunct (c);
 405       break;
 406     case CC_SPACE:
 407       result = isspace (c);
 408       break;
 409     case CC_UPPER:
 410       result = isupper (c);
 411       break;
 412     case CC_XDIGIT:
 413       result = isxdigit (c);
 414       break;
 415     default:
 416       abort ();
 417       break;
 418     }
 419
 420   return !! result;
 421 }
 422
 423 static void
 424 es_free (struct E_string *es)
 425 {
 426   free (es->s);
 427   free (es->escaped);
 428 }
 429
 430 /* Perform the first pass over each range-spec argument S, converting all
 431    \c and \ddd escapes to their one-byte representations.  If an invalid
 432    quote sequence is found print an error message and return false;
 433    Otherwise set *ES to the resulting string and return true.
 434    The resulting array of characters may contain zero-bytes;
 435    however, on input, S is assumed to be null-terminated, and hence
 436    cannot contain actual (non-escaped) zero bytes.  */
 437
 438 static bool
 439 unquote (char const *s, struct E_string *es)
 440 {
 441   size_t i, j;
 442   size_t len = strlen (s);
 443
 444   es->s = xmalloc (len);
 445   es->escaped = xcalloc (len, sizeof es->escaped[0]);
 446
 447   j = 0;
 448   for (i = 0; s[i]; i++)
 449     {
 450       unsigned char c;
 451       int oct_digit;
 452
 453       switch (s[i])
 454         {
 455         case '\\':
 456           es->escaped[j] = true;
 457           switch (s[i + 1])
 458             {
 459             case '\\':
 460               c = '\\';
 461               break;
 462             case 'a':
 463               c = '\a';
 464               break;
 465             case 'b':
 466               c = '\b';
 467               break;
 468             case 'f':
 469               c = '\f';
 470               break;
 471             case 'n':
 472               c = '\n';
 473               break;
 474             case 'r':
 475               c = '\r';
 476               break;
 477             case 't':
 478               c = '\t';
 479               break;
 480             case 'v':
 481               c = '\v';
 482               break;
 483             case '0':
 484             case '1':
 485             case '2':
 486             case '3':
 487             case '4':
 488             case '5':
 489             case '6':
 490             case '7':
 491               c = s[i + 1] - '0';
 492               oct_digit = s[i + 2] - '0';
 493               if (0 <= oct_digit && oct_digit <= 7)
 494                 {
 495                   c = 8 * c + oct_digit;
 496                   ++i;
 497                   oct_digit = s[i + 2] - '0';
 498                   if (0 <= oct_digit && oct_digit <= 7)
 499                     {
 500                       if (8 * c + oct_digit < N_CHARS)
 501                         {
 502                           c = 8 * c + oct_digit;
 503                           ++i;
 504                         }
 505                       else
 506                         {
 507                           /* A 3-digit octal number larger than \377 won't
 508                              fit in 8 bits.  So we stop when adding the
 509                              next digit would put us over the limit and
 510                              give a warning about the ambiguity.  POSIX
 511                              isn't clear on this, and we interpret this
 512                              lack of clarity as meaning the resulting behavior
 513                              is undefined, which means we're allowed to issue
 514                              a warning.  */
 515                           error (0, 0, _("warning: the ambiguous octal escape \
 516 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, %c"),
 517                                  s[i], s[i + 1], s[i + 2],
 518                                  s[i], s[i + 1], s[i + 2]);
 519                         }
 520                     }
 521                 }
 522               break;
 523             case '\0':
 524               error (0, 0, _("warning: an unescaped backslash "
 525                              "at end of string is not portable"));
 526               /* POSIX is not clear about this.  */
 527               es->escaped[j] = false;
 528               i--;
 529               c = '\\';
 530               break;
 531             default:
 532               c = s[i + 1];
 533               break;
 534             }
 535           ++i;
 536           es->s[j++] = c;
 537           break;
 538         default:
 539           es->s[j++] = s[i];
 540           break;
 541         }
 542     }
 543   es->len = j;
 544   return true;
 545 }
 546
 547 /* If CLASS_STR is a valid character class string, return its index
 548    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 549
 550 static enum Char_class
 551 look_up_char_class (char const *class_str, size_t len)
 552 {
 553   enum Char_class i;
 554
 555   for (i = 0; i < N_CHAR_CLASSES; i++)
 556     if (strncmp (class_str, char_class_name[i], len) == 0
 557         && strlen (char_class_name[i]) == len)
 558       return i;
 559   return CC_NO_CLASS;
 560 }
 561
 562 /* Return a newly allocated string with a printable version of C.
 563    This function is used solely for formatting error messages.  */
 564
 565 static char *
 566 make_printable_char (unsigned char c)
 567 {
 568   char *buf = xmalloc (5);
 569
 570   if (isprint (c))
 571     {
 572       buf[0] = c;
 573       buf[1] = '\0';
 574     }
 575   else
 576     {
 577       sprintf (buf, "\\%03o", c);
 578     }
 579   return buf;
 580 }
 581
 582 /* Return a newly allocated copy of S which is suitable for printing.
 583    LEN is the number of characters in S.  Most non-printing
 584    (isprint) characters are represented by a backslash followed by
 585    3 octal digits.  However, the characters represented by \c escapes
 586    where c is one of [abfnrtv] are represented by their 2-character \c
 587    sequences.  This function is used solely for printing error messages.  */
 588
 589 static char *
 590 make_printable_str (char const *s, size_t len)
 591 {
 592   /* Worst case is that every character expands to a backslash
 593      followed by a 3-character octal escape sequence.  */
 594   char *printable_buf = xnmalloc (len + 1, 4);
 595   char *p = printable_buf;
 596   size_t i;
 597
 598   for (i = 0; i < len; i++)
 599     {
 600       char buf[5];
 601       char const *tmp = NULL;
 602       unsigned char c = s[i];
 603
 604       switch (c)
 605         {
 606         case '\\':
 607           tmp = "\\";
 608           break;
 609         case '\a':
 610           tmp = "\\a";
 611           break;
 612         case '\b':
 613           tmp = "\\b";
 614           break;
 615         case '\f':
 616           tmp = "\\f";
 617           break;
 618         case '\n':
 619           tmp = "\\n";
 620           break;
 621         case '\r':
 622           tmp = "\\r";
 623           break;
 624         case '\t':
 625           tmp = "\\t";
 626           break;
 627         case '\v':
 628           tmp = "\\v";
 629           break;
 630         default:
 631           if (isprint (c))
 632             {
 633               buf[0] = c;
 634               buf[1] = '\0';
 635             }
 636           else
 637             sprintf (buf, "\\%03o", c);
 638           tmp = buf;
 639           break;
 640         }
 641       p = stpcpy (p, tmp);
 642     }
 643   return printable_buf;
 644 }
 645
 646 /* Append a newly allocated structure representing a
 647    character C to the specification list LIST.  */
 648
 649 static void
 650 append_normal_char (struct Spec_list *list, unsigned char c)
 651 {
 652   struct List_element *new;
 653
 654   new = xmalloc (sizeof *new);
 655   new->next = NULL;
 656   new->type = RE_NORMAL_CHAR;
 657   new->u.normal_char = c;
 658   assert (list->tail);
 659   list->tail->next = new;
 660   list->tail = new;
 661 }
 662
 663 /* Append a newly allocated structure representing the range
 664    of characters from FIRST to LAST to the specification list LIST.
 665    Return false if LAST precedes FIRST in the collating sequence,
 666    true otherwise.  This means that '[c-c]' is acceptable.  */
 667
 668 static bool
 669 append_range (struct Spec_list *list, unsigned char first, unsigned char last)
 670 {
 671   struct List_element *new;
 672
 673   if (last < first)
 674     {
 675       char *tmp1 = make_printable_char (first);
 676       char *tmp2 = make_printable_char (last);
 677
 678       error (0, 0,
 679        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 680              tmp1, tmp2);
 681       free (tmp1);
 682       free (tmp2);
 683       return false;
 684     }
 685   new = xmalloc (sizeof *new);
 686   new->next = NULL;
 687   new->type = RE_RANGE;
 688   new->u.range.first_char = first;
 689   new->u.range.last_char = last;
 690   assert (list->tail);
 691   list->tail->next = new;
 692   list->tail = new;
 693   return true;
 694 }
 695
 696 /* If CHAR_CLASS_STR is a valid character class string, append a
 697    newly allocated structure representing that character class to the end
 698    of the specification list LIST and return true.  If CHAR_CLASS_STR is not
 699    a valid string return false.  */
 700
 701 static bool
 702 append_char_class (struct Spec_list *list,
 703                    char const *char_class_str, size_t len)
 704 {
 705   enum Char_class char_class;
 706   struct List_element *new;
 707
 708   char_class = look_up_char_class (char_class_str, len);
 709   if (char_class == CC_NO_CLASS)
 710     return false;
 711   new = xmalloc (sizeof *new);
 712   new->next = NULL;
 713   new->type = RE_CHAR_CLASS;
 714   new->u.char_class = char_class;
 715   assert (list->tail);
 716   list->tail->next = new;
 717   list->tail = new;
 718   return true;
 719 }
 720
 721 /* Append a newly allocated structure representing a [c*n]
 722    repeated character construct to the specification list LIST.
 723    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 724    is a non-negative repeat count.  */
 725
 726 static void
 727 append_repeated_char (struct Spec_list *list, unsigned char the_char,
 728                       count repeat_count)
 729 {
 730   struct List_element *new;
 731
 732   new = xmalloc (sizeof *new);
 733   new->next = NULL;
 734   new->type = RE_REPEATED_CHAR;
 735   new->u.repeated_char.the_repeated_char = the_char;
 736   new->u.repeated_char.repeat_count = repeat_count;
 737   assert (list->tail);
 738   list->tail->next = new;
 739   list->tail = new;
 740 }
 741
 742 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 743    the length of that string, LEN, if LEN is exactly one, append
 744    a newly allocated structure representing the specified
 745    equivalence class to the specification list, LIST and return true.
 746    If LEN is not 1, return false.  */
 747
 748 static bool
 749 append_equiv_class (struct Spec_list *list,
 750                     char const *equiv_class_str, size_t len)
 751 {
 752   struct List_element *new;
 753
 754   if (len != 1)
 755     return false;
 756   new = xmalloc (sizeof *new);
 757   new->next = NULL;
 758   new->type = RE_EQUIV_CLASS;
 759   new->u.equiv_code = *equiv_class_str;
 760   assert (list->tail);
 761   list->tail->next = new;
 762   list->tail = new;
 763   return true;
 764 }
 765
 766 /* Search forward starting at START_IDX for the 2-char sequence
 767    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 768    a sequence is found, set *RESULT_IDX to the index of the first
 769    character and return true.  Otherwise return false.  P may contain
 770    zero bytes.  */
 771
 772 static bool
 773 find_closing_delim (const struct E_string *es, size_t start_idx,
 774                     char pre_bracket_char, size_t *result_idx)
 775 {
 776   size_t i;
 777
 778   for (i = start_idx; i < es->len - 1; i++)
 779     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 780         && !es->escaped[i] && !es->escaped[i + 1])
 781       {
 782         *result_idx = i;
 783         return true;
 784       }
 785   return false;
 786 }
 787
 788 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 789    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 790    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 791    and return zero. If the second character following
 792    the opening bracket is not `*' or if no closing bracket can be
 793    found, return -1.  If a closing bracket is found and the
 794    second char is `*', but the string between the `*' and `]' isn't
 795    empty, an octal number, or a decimal number, print an error message
 796    and return -2.  */
 797
 798 static int
 799 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 800                        unsigned char *char_to_repeat, count *repeat_count,
 801                        size_t *closing_bracket_idx)
 802 {
 803   size_t i;
 804
 805   assert (start_idx + 1 < es->len);
 806   if (!es_match (es, start_idx + 1, '*'))
 807     return -1;
 808
 809   for (i = start_idx + 2; i < es->len && !es->escaped[i]; i++)
 810     {
 811       if (es->s[i] == ']')
 812         {
 813           size_t digit_str_len = i - start_idx - 2;
 814
 815           *char_to_repeat = es->s[start_idx];
 816           if (digit_str_len == 0)
 817             {
 818               /* We've matched [c*] -- no explicit repeat count.  */
 819               *repeat_count = 0;
 820             }
 821           else
 822             {
 823               /* Here, we have found [c*s] where s should be a string
 824                  of octal (if it starts with `0') or decimal digits.  */
 825               char const *digit_str = &es->s[start_idx + 2];
 826               char *d_end;
 827               if ((xstrtoumax (digit_str, &d_end, *digit_str == '0' ? 8 : 10,
 828                                repeat_count, NULL)
 829                    != LONGINT_OK)
 830                   || REPEAT_COUNT_MAXIMUM < *repeat_count
 831                   || digit_str + digit_str_len != d_end)
 832                 {
 833                   char *tmp = make_printable_str (digit_str, digit_str_len);
 834                   error (0, 0,
 835                          _("invalid repeat count %s in [c*n] construct"),
 836                          quote (tmp));
 837                   free (tmp);
 838                   return -2;
 839                 }
 840             }
 841           *closing_bracket_idx = i;
 842           return 0;
 843         }
 844     }
 845   return -1;                    /* No bracket found.  */
 846 }
 847
 848 /* Return true if the string at ES->s[IDX] matches the regular
 849    expression `\*[0-9]*\]', false otherwise.  The string does not
 850    match if any of its characters are escaped.  */
 851
 852 static bool
 853 star_digits_closebracket (const struct E_string *es, size_t idx)
 854 {
 855   size_t i;
 856
 857   if (!es_match (es, idx, '*'))
 858     return false;
 859
 860   for (i = idx + 1; i < es->len; i++)
 861     if (!ISDIGIT (to_uchar (es->s[i])) || es->escaped[i])
 862       return es_match (es, i, ']');
 863   return false;
 864 }
 865
 866 /* Convert string UNESCAPED_STRING (which has been preprocessed to
 867    convert backslash-escape sequences) of length LEN characters into
 868    a linked list of the following 5 types of constructs:
 869       - [:str:] Character class where `str' is one of the 12 valid strings.
 870       - [=c=] Equivalence class where `c' is any single character.
 871       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 872           However, if `n' is present, it must be a non-negative octal or
 873           decimal integer.
 874       - r-s Range of characters from `r' to `s'.  The second endpoint must
 875           not precede the first in the current collating sequence.
 876       - c Any other character is interpreted as itself.  */
 877
 878 static bool
 879 build_spec_list (const struct E_string *es, struct Spec_list *result)
 880 {
 881   char const *p;
 882   size_t i;
 883
 884   p = es->s;
 885
 886   /* The main for-loop below recognizes the 4 multi-character constructs.
 887      A character that matches (in its context) none of the multi-character
 888      constructs is classified as `normal'.  Since all multi-character
 889      constructs have at least 3 characters, any strings of length 2 or
 890      less are composed solely of normal characters.  Hence, the index of
 891      the outer for-loop runs only as far as LEN-2.  */
 892
 893   for (i = 0; i + 2 < es->len; /* empty */)
 894     {
 895       if (es_match (es, i, '['))
 896         {
 897           bool matched_multi_char_construct;
 898           size_t closing_bracket_idx;
 899           unsigned char char_to_repeat;
 900           count repeat_count;
 901           int err;
 902
 903           matched_multi_char_construct = true;
 904           if (es_match (es, i + 1, ':') || es_match (es, i + 1, '='))
 905             {
 906               size_t closing_delim_idx;
 907
 908               if (find_closing_delim (es, i + 2, p[i + 1], &closing_delim_idx))
 909                 {
 910                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
 911                   char const *opnd_str = p + i + 2;
 912
 913                   if (opnd_str_len == 0)
 914                     {
 915                       if (p[i + 1] == ':')
 916                         error (0, 0, _("missing character class name `[::]'"));
 917                       else
 918                         error (0, 0,
 919                                _("missing equivalence class character `[==]'"));
 920                       return false;
 921                     }
 922
 923                   if (p[i + 1] == ':')
 924                     {
 925                       /* FIXME: big comment.  */
 926                       if (!append_char_class (result, opnd_str, opnd_str_len))
 927                         {
 928                           if (star_digits_closebracket (es, i + 2))
 929                             goto try_bracketed_repeat;
 930                           else
 931                             {
 932                               char *tmp = make_printable_str (opnd_str,
 933                                                               opnd_str_len);
 934                               error (0, 0, _("invalid character class %s"),
 935                                      quote (tmp));
 936                               free (tmp);
 937                               return false;
 938                             }
 939                         }
 940                     }
 941                   else
 942                     {
 943                       /* FIXME: big comment.  */
 944                       if (!append_equiv_class (result, opnd_str, opnd_str_len))
 945                         {
 946                           if (star_digits_closebracket (es, i + 2))
 947                             goto try_bracketed_repeat;
 948                           else
 949                             {
 950                               char *tmp = make_printable_str (opnd_str,
 951                                                               opnd_str_len);
 952                               error (0, 0,
 953                _("%s: equivalence class operand must be a single character"),
 954                                      tmp);
 955                               free (tmp);
 956                               return false;
 957                             }
 958                         }
 959                     }
 960
 961                   i = closing_delim_idx + 2;
 962                   continue;
 963                 }
 964               /* Else fall through.  This could be [:*] or [=*].  */
 965             }
 966
 967         try_bracketed_repeat:
 968
 969           /* Determine whether this is a bracketed repeat range
 970              matching the RE \[.\*(dec_or_oct_number)?\].  */
 971           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
 972                                        &repeat_count,
 973                                        &closing_bracket_idx);
 974           if (err == 0)
 975             {
 976               append_repeated_char (result, char_to_repeat, repeat_count);
 977               i = closing_bracket_idx + 1;
 978             }
 979           else if (err == -1)
 980             {
 981               matched_multi_char_construct = false;
 982             }
 983           else
 984             {
 985               /* Found a string that looked like [c*n] but the
 986                  numeric part was invalid.  */
 987               return false;
 988             }
 989
 990           if (matched_multi_char_construct)
 991             continue;
 992
 993           /* We reach this point if P does not match [:str:], [=c=],
 994              [c*n], or [c*].  Now, see if P looks like a range `[-c'
 995              (from `[' to `c').  */
 996         }
 997
 998       /* Look ahead one char for ranges like a-z.  */
 999       if (es_match (es, i + 1, '-'))
1000         {
1001           if (!append_range (result, p[i], p[i + 2]))
1002             return false;
1003           i += 3;
1004         }
1005       else
1006         {
1007           append_normal_char (result, p[i]);
1008           ++i;
1009         }
1010     }
1011
1012   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1013   for (; i < es->len; i++)
1014     append_normal_char (result, p[i]);
1015
1016   return true;
1017 }
1018
1019 /* Advance past the current construct.
1020    S->tail must be non-NULL.  */
1021 static void
1022 skip_construct (struct Spec_list *s)
1023 {
1024   s->tail = s->tail->next;
1025   s->state = NEW_ELEMENT;
1026 }
1027
1028 /* Given a Spec_list S (with its saved state implicit in the values
1029    of its members `tail' and `state'), return the next single character
1030    in the expansion of S's constructs.  If the last character of S was
1031    returned on the previous call or if S was empty, this function
1032    returns -1.  For example, successive calls to get_next where S
1033    represents the spec-string 'a-d[y*3]' will return the sequence
1034    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1035    which the returned character comes is [:upper:] or [:lower:], the
1036    parameter CLASS is given a value to indicate which it was.  Otherwise
1037    CLASS is set to UL_NONE.  This value is used only when constructing
1038    the translation table to verify that any occurrences of upper and
1039    lower class constructs in the spec-strings appear in the same relative
1040    positions.  */
1041
1042 static int
1043 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1044 {
1045   struct List_element *p;
1046   int return_val;
1047   int i;
1048
1049   if (class)
1050     *class = UL_NONE;
1051
1052   if (s->state == BEGIN_STATE)
1053     {
1054       s->tail = s->head->next;
1055       s->state = NEW_ELEMENT;
1056     }
1057
1058   p = s->tail;
1059   if (p == NULL)
1060     return -1;
1061
1062   switch (p->type)
1063     {
1064     case RE_NORMAL_CHAR:
1065       return_val = p->u.normal_char;
1066       s->state = NEW_ELEMENT;
1067       s->tail = p->next;
1068       break;
1069
1070     case RE_RANGE:
1071       if (s->state == NEW_ELEMENT)
1072         s->state = p->u.range.first_char;
1073       else
1074         ++(s->state);
1075       return_val = s->state;
1076       if (s->state == p->u.range.last_char)
1077         {
1078           s->tail = p->next;
1079           s->state = NEW_ELEMENT;
1080         }
1081       break;
1082
1083     case RE_CHAR_CLASS:
1084       if (class)
1085         {
1086           switch (p->u.char_class)
1087             {
1088             case CC_LOWER:
1089               *class = UL_LOWER;
1090               break;
1091             case CC_UPPER:
1092               *class = UL_UPPER;
1093               break;
1094             default:
1095               break;
1096             }
1097         }
1098
1099       if (s->state == NEW_ELEMENT)
1100         {
1101           for (i = 0; i < N_CHARS; i++)
1102             if (is_char_class_member (p->u.char_class, i))
1103               break;
1104           assert (i < N_CHARS);
1105           s->state = i;
1106         }
1107       assert (is_char_class_member (p->u.char_class, s->state));
1108       return_val = s->state;
1109       for (i = s->state + 1; i < N_CHARS; i++)
1110         if (is_char_class_member (p->u.char_class, i))
1111           break;
1112       if (i < N_CHARS)
1113         s->state = i;
1114       else
1115         {
1116           s->tail = p->next;
1117           s->state = NEW_ELEMENT;
1118         }
1119       break;
1120
1121     case RE_EQUIV_CLASS:
1122       /* FIXME: this assumes that each character is alone in its own
1123          equivalence class (which appears to be correct for my
1124          LC_COLLATE.  But I don't know of any function that allows
1125          one to determine a character's equivalence class.  */
1126
1127       return_val = p->u.equiv_code;
1128       s->state = NEW_ELEMENT;
1129       s->tail = p->next;
1130       break;
1131
1132     case RE_REPEATED_CHAR:
1133       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1134       if (p->u.repeated_char.repeat_count == 0)
1135         {
1136           s->tail = p->next;
1137           s->state = NEW_ELEMENT;
1138           return_val = get_next (s, class);
1139         }
1140       else
1141         {
1142           if (s->state == NEW_ELEMENT)
1143             {
1144               s->state = 0;
1145             }
1146           ++(s->state);
1147           return_val = p->u.repeated_char.the_repeated_char;
1148           if (s->state == p->u.repeated_char.repeat_count)
1149             {
1150               s->tail = p->next;
1151               s->state = NEW_ELEMENT;
1152             }
1153         }
1154       break;
1155
1156     default:
1157       abort ();
1158       break;
1159     }
1160
1161   return return_val;
1162 }
1163
1164 /* This is a minor kludge.  This function is called from
1165    get_spec_stats to determine the cardinality of a set derived
1166    from a complemented string.  It's a kludge in that some of the
1167    same operations are (duplicated) performed in set_initialize.  */
1168
1169 static int
1170 card_of_complement (struct Spec_list *s)
1171 {
1172   int c;
1173   int cardinality = N_CHARS;
1174   bool in_set[N_CHARS] = { 0, };
1175
1176   s->state = BEGIN_STATE;
1177   while ((c = get_next (s, NULL)) != -1)
1178     {
1179       cardinality -= (!in_set[c]);
1180       in_set[c] = true;
1181     }
1182   return cardinality;
1183 }
1184
1185 /* Gather statistics about the spec-list S in preparation for the tests
1186    in validate that determine the consistency of the specs.  This function
1187    is called at most twice; once for string1, and again for any string2.
1188    LEN_S1 < 0 indicates that this is the first call and that S represents
1189    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1190    constructs in string1, and we can use its value to resolve any
1191    indefinite repeat construct in S (which represents string2).  Hence,
1192    this function has the side-effect that it converts a valid [c*]
1193    construct in string2 to [c*n] where n is large enough (or 0) to give
1194    string2 the same length as string1.  For example, with the command
1195    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1196    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1197
1198 static void
1199 get_spec_stats (struct Spec_list *s)
1200 {
1201   struct List_element *p;
1202   count length = 0;
1203
1204   s->n_indefinite_repeats = 0;
1205   s->has_equiv_class = false;
1206   s->has_restricted_char_class = false;
1207   s->has_char_class = false;
1208   for (p = s->head->next; p; p = p->next)
1209     {
1210       int i;
1211       count len = 0;
1212       count new_length;
1213
1214       switch (p->type)
1215         {
1216         case RE_NORMAL_CHAR:
1217           len = 1;
1218           break;
1219
1220         case RE_RANGE:
1221           assert (p->u.range.last_char >= p->u.range.first_char);
1222           len = p->u.range.last_char - p->u.range.first_char + 1;
1223           break;
1224
1225         case RE_CHAR_CLASS:
1226           s->has_char_class = true;
1227           for (i = 0; i < N_CHARS; i++)
1228             if (is_char_class_member (p->u.char_class, i))
1229               ++len;
1230           switch (p->u.char_class)
1231             {
1232             case CC_UPPER:
1233             case CC_LOWER:
1234               break;
1235             default:
1236               s->has_restricted_char_class = true;
1237               break;
1238             }
1239           break;
1240
1241         case RE_EQUIV_CLASS:
1242           for (i = 0; i < N_CHARS; i++)
1243             if (is_equiv_class_member (p->u.equiv_code, i))
1244               ++len;
1245           s->has_equiv_class = true;
1246           break;
1247
1248         case RE_REPEATED_CHAR:
1249           if (p->u.repeated_char.repeat_count > 0)
1250             len = p->u.repeated_char.repeat_count;
1251           else
1252             {
1253               s->indefinite_repeat_element = p;
1254               ++(s->n_indefinite_repeats);
1255             }
1256           break;
1257
1258         default:
1259           abort ();
1260           break;
1261         }
1262
1263       /* Check for arithmetic overflow in computing length.  Also, reject
1264          any length greater than the maximum repeat count, in case the
1265          length is later used to compute the repeat count for an
1266          indefinite element.  */
1267       new_length = length + len;
1268       if (! (length <= new_length && new_length <= REPEAT_COUNT_MAXIMUM))
1269         error (EXIT_FAILURE, 0, _("too many characters in set"));
1270       length = new_length;
1271     }
1272
1273   s->length = length;
1274 }
1275
1276 static void
1277 get_s1_spec_stats (struct Spec_list *s1)
1278 {
1279   get_spec_stats (s1);
1280   if (complement)
1281     s1->length = card_of_complement (s1);
1282 }
1283
1284 static void
1285 get_s2_spec_stats (struct Spec_list *s2, count len_s1)
1286 {
1287   get_spec_stats (s2);
1288   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1289     {
1290       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1291         len_s1 - s2->length;
1292       s2->length = len_s1;
1293     }
1294 }
1295
1296 static void
1297 spec_init (struct Spec_list *spec_list)
1298 {
1299   struct List_element *new = xmalloc (sizeof *new);
1300   spec_list->head = spec_list->tail = new;
1301   spec_list->head->next = NULL;
1302 }
1303
1304 /* This function makes two passes over the argument string S.  The first
1305    one converts all \c and \ddd escapes to their one-byte representations.
1306    The second constructs a linked specification list, SPEC_LIST, of the
1307    characters and constructs that comprise the argument string.  If either
1308    of these passes detects an error, this function returns false.  */
1309
1310 static bool
1311 parse_str (char const *s, struct Spec_list *spec_list)
1312 {
1313   struct E_string es;
1314   bool ok = unquote (s, &es) && build_spec_list (&es, spec_list);
1315   es_free (&es);
1316   return ok;
1317 }
1318
1319 /* Given two specification lists, S1 and S2, and assuming that
1320    S1->length > S2->length, append a single [c*n] element to S2 where c
1321    is the last character in the expansion of S2 and n is the difference
1322    between the two lengths.
1323    Upon successful completion, S2->length is set to S1->length.  The only
1324    way this function can fail to make S2 as long as S1 is when S2 has
1325    zero-length, since in that case, there is no last character to repeat.
1326    So S2->length is required to be at least 1.
1327
1328    Providing this functionality allows the user to do some pretty
1329    non-BSD (and non-portable) things:  For example, the command
1330        tr -cs '[:upper:]0-9' '[:lower:]'
1331    is almost guaranteed to give results that depend on your collating
1332    sequence.  */
1333
1334 static void
1335 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1336 {
1337   struct List_element *p;
1338   unsigned char char_to_repeat;
1339   int i;
1340
1341   assert (translating);
1342   assert (s1->length > s2->length);
1343   assert (s2->length > 0);
1344
1345   p = s2->tail;
1346   switch (p->type)
1347     {
1348     case RE_NORMAL_CHAR:
1349       char_to_repeat = p->u.normal_char;
1350       break;
1351     case RE_RANGE:
1352       char_to_repeat = p->u.range.last_char;
1353       break;
1354     case RE_CHAR_CLASS:
1355       for (i = N_CHARS - 1; i >= 0; i--)
1356         if (is_char_class_member (p->u.char_class, i))
1357           break;
1358       assert (i >= 0);
1359       char_to_repeat = i;
1360       break;
1361
1362     case RE_REPEATED_CHAR:
1363       char_to_repeat = p->u.repeated_char.the_repeated_char;
1364       break;
1365
1366     case RE_EQUIV_CLASS:
1367       /* This shouldn't happen, because validate exits with an error
1368          if it finds an equiv class in string2 when translating.  */
1369       abort ();
1370       break;
1371
1372     default:
1373       abort ();
1374       break;
1375     }
1376
1377   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1378   s2->length = s1->length;
1379 }
1380
1381 /* Return true if S is a non-empty list in which exactly one
1382    character (but potentially, many instances of it) appears.
1383    E.g., [X*] or xxxxxxxx.  */
1384
1385 static bool
1386 homogeneous_spec_list (struct Spec_list *s)
1387 {
1388   int b, c;
1389
1390   s->state = BEGIN_STATE;
1391
1392   if ((b = get_next (s, NULL)) == -1)
1393     return false;
1394
1395   while ((c = get_next (s, NULL)) != -1)
1396     if (c != b)
1397       return false;
1398
1399   return true;
1400 }
1401
1402 /* Die with an error message if S1 and S2 describe strings that
1403    are not valid with the given command line switches.
1404    A side effect of this function is that if a valid [c*] or
1405    [c*0] construct appears in string2, it is converted to [c*n]
1406    with a value for n that makes s2->length == s1->length.  By
1407    the same token, if the --truncate-set1 option is not
1408    given, S2 may be extended.  */
1409
1410 static void
1411 validate (struct Spec_list *s1, struct Spec_list *s2)
1412 {
1413   get_s1_spec_stats (s1);
1414   if (s1->n_indefinite_repeats > 0)
1415     {
1416       error (EXIT_FAILURE, 0,
1417              _("the [c*] repeat construct may not appear in string1"));
1418     }
1419
1420   if (s2)
1421     {
1422       get_s2_spec_stats (s2, s1->length);
1423
1424       if (s2->n_indefinite_repeats > 1)
1425         {
1426           error (EXIT_FAILURE, 0,
1427                  _("only one [c*] repeat construct may appear in string2"));
1428         }
1429
1430       if (translating)
1431         {
1432           if (s2->has_equiv_class)
1433             {
1434               error (EXIT_FAILURE, 0,
1435                      _("[=c=] expressions may not appear in string2 \
1436 when translating"));
1437             }
1438
1439           if (s1->length > s2->length)
1440             {
1441               if (!truncate_set1)
1442                 {
1443                   /* string2 must be non-empty unless --truncate-set1 is
1444                      given or string1 is empty.  */
1445
1446                   if (s2->length == 0)
1447                     error (EXIT_FAILURE, 0,
1448                      _("when not truncating set1, string2 must be non-empty"));
1449                   string2_extend (s1, s2);
1450                 }
1451             }
1452
1453           if (complement && s1->has_char_class
1454               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1455             {
1456               error (EXIT_FAILURE, 0,
1457                      _("when translating with complemented character classes,\
1458 \nstring2 must map all characters in the domain to one"));
1459             }
1460
1461           if (s2->has_restricted_char_class)
1462             {
1463               error (EXIT_FAILURE, 0,
1464                      _("when translating, the only character classes that may \
1465 appear in\nstring2 are `upper' and `lower'"));
1466             }
1467         }
1468       else
1469         /* Not translating.  */
1470         {
1471           if (s2->n_indefinite_repeats > 0)
1472             error (EXIT_FAILURE, 0,
1473                    _("the [c*] construct may appear in string2 only \
1474 when translating"));
1475         }
1476     }
1477 }
1478
1479 /* Read buffers of SIZE bytes via the function READER (if READER is
1480    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1481    read_and_delete or read_and_xlate.  After each buffer is read, it is
1482    processed and written to stdout.  The buffers are processed so that
1483    multiple consecutive occurrences of the same character in the input
1484    stream are replaced by a single occurrence of that character if the
1485    character is in the squeeze set.  */
1486
1487 static void
1488 squeeze_filter (char *buf, size_t size, size_t (*reader) (char *, size_t))
1489 {
1490   /* A value distinct from any character that may have been stored in a
1491      buffer as the result of a block-read in the function squeeze_filter.  */
1492   enum { NOT_A_CHAR = CHAR_MAX + 1 };
1493
1494   int char_to_squeeze = NOT_A_CHAR;
1495   size_t i = 0;
1496   size_t nr = 0;
1497
1498   for (;;)
1499     {
1500       size_t begin;
1501
1502       if (i >= nr)
1503         {
1504           nr = reader (buf, size);
1505           if (nr == 0)
1506             break;
1507           i = 0;
1508         }
1509
1510       begin = i;
1511
1512       if (char_to_squeeze == NOT_A_CHAR)
1513         {
1514           size_t out_len;
1515           /* Here, by being a little tricky, we can get a significant
1516              performance increase in most cases when the input is
1517              reasonably large.  Since tr will modify the input only
1518              if two consecutive (and identical) input characters are
1519              in the squeeze set, we can step by two through the data
1520              when searching for a character in the squeeze set.  This
1521              means there may be a little more work in a few cases and
1522              perhaps twice as much work in the worst cases where most
1523              of the input is removed by squeezing repeats.  But most
1524              uses of this functionality seem to remove less than 20-30%
1525              of the input.  */
1526           for (; i < nr && !in_squeeze_set[to_uchar (buf[i])]; i += 2)
1527             continue;
1528
1529           /* There is a special case when i == nr and we've just
1530              skipped a character (the last one in buf) that is in
1531              the squeeze set.  */
1532           if (i == nr && in_squeeze_set[to_uchar (buf[i - 1])])
1533             --i;
1534
1535           if (i >= nr)
1536             out_len = nr - begin;
1537           else
1538             {
1539               char_to_squeeze = buf[i];
1540               /* We're about to output buf[begin..i].  */
1541               out_len = i - begin + 1;
1542
1543               /* But since we stepped by 2 in the loop above,
1544                  out_len may be one too large.  */
1545               if (i > 0 && buf[i - 1] == char_to_squeeze)
1546                 --out_len;
1547
1548               /* Advance i to the index of first character to be
1549                  considered when looking for a char different from
1550                  char_to_squeeze.  */
1551               ++i;
1552             }
1553           if (out_len > 0
1554               && fwrite (&buf[begin], 1, out_len, stdout) != out_len)
1555             error (EXIT_FAILURE, errno, _("write error"));
1556         }
1557
1558       if (char_to_squeeze != NOT_A_CHAR)
1559         {
1560           /* Advance i to index of first char != char_to_squeeze
1561              (or to nr if all the rest of the characters in this
1562              buffer are the same as char_to_squeeze).  */
1563           for (; i < nr && buf[i] == char_to_squeeze; i++)
1564             continue;
1565           if (i < nr)
1566             char_to_squeeze = NOT_A_CHAR;
1567           /* If (i >= nr) we've squeezed the last character in this buffer.
1568              So now we have to read a new buffer and continue comparing
1569              characters against char_to_squeeze.  */
1570         }
1571     }
1572 }
1573
1574 static size_t
1575 plain_read (char *buf, size_t size)
1576 {
1577   size_t nr = safe_read (STDIN_FILENO, buf, size);
1578   if (nr == SAFE_READ_ERROR)
1579     error (EXIT_FAILURE, errno, _("read error"));
1580   return nr;
1581 }
1582
1583 /* Read buffers of SIZE bytes from stdin until one is found that
1584    contains at least one character not in the delete set.  Store
1585    in the array BUF, all characters from that buffer that are not
1586    in the delete set, and return the number of characters saved
1587    or 0 upon EOF.  */
1588
1589 static size_t
1590 read_and_delete (char *buf, size_t size)
1591 {
1592   size_t n_saved;
1593
1594   /* This enclosing do-while loop is to make sure that
1595      we don't return zero (indicating EOF) when we've
1596      just deleted all the characters in a buffer.  */
1597   do
1598     {
1599       size_t i;
1600       size_t nr = plain_read (buf, size);
1601
1602       if (nr == 0)
1603         return 0;
1604
1605       /* This first loop may be a waste of code, but gives much
1606          better performance when no characters are deleted in
1607          the beginning of a buffer.  It just avoids the copying
1608          of buf[i] into buf[n_saved] when it would be a NOP.  */
1609
1610       for (i = 0; i < nr && !in_delete_set[to_uchar (buf[i])]; i++)
1611         continue;
1612       n_saved = i;
1613
1614       for (++i; i < nr; i++)
1615         if (!in_delete_set[to_uchar (buf[i])])
1616           buf[n_saved++] = buf[i];
1617     }
1618   while (n_saved == 0);
1619
1620   return n_saved;
1621 }
1622
1623 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1624    perform the in-place and one-to-one mapping specified by the global
1625    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1626
1627 static size_t
1628 read_and_xlate (char *buf, size_t size)
1629 {
1630   size_t bytes_read = plain_read (buf, size);
1631   size_t i;
1632
1633   for (i = 0; i < bytes_read; i++)
1634     buf[i] = xlate[to_uchar (buf[i])];
1635
1636   return bytes_read;
1637 }
1638
1639 /* Initialize a boolean membership set, IN_SET, with the character
1640    values obtained by traversing the linked list of constructs S
1641    using the function `get_next'.  IN_SET is expected to have been
1642    initialized to all zeros by the caller.  If COMPLEMENT_THIS_SET
1643    is true the resulting set is complemented.  */
1644
1645 static void
1646 set_initialize (struct Spec_list *s, bool complement_this_set, bool *in_set)
1647 {
1648   int c;
1649   size_t i;
1650
1651   s->state = BEGIN_STATE;
1652   while ((c = get_next (s, NULL)) != -1)
1653     in_set[c] = true;
1654   if (complement_this_set)
1655     for (i = 0; i < N_CHARS; i++)
1656       in_set[i] = (!in_set[i]);
1657 }
1658
1659 int
1660 main (int argc, char **argv)
1661 {
1662   int c;
1663   int non_option_args;
1664   int min_operands;
1665   int max_operands;
1666   struct Spec_list buf1, buf2;
1667   struct Spec_list *s1 = &buf1;
1668   struct Spec_list *s2 = &buf2;
1669
1670   initialize_main (&argc, &argv);
1671   set_program_name (argv[0]);
1672   setlocale (LC_ALL, "");
1673   bindtextdomain (PACKAGE, LOCALEDIR);
1674   textdomain (PACKAGE);
1675
1676   atexit (close_stdout);
1677
1678   while ((c = getopt_long (argc, argv, "+cCdst", long_options, NULL)) != -1)
1679     {
1680       switch (c)
1681         {
1682         case 'c':
1683         case 'C':
1684           complement = true;
1685           break;
1686
1687         case 'd':
1688           delete = true;
1689           break;
1690
1691         case 's':
1692           squeeze_repeats = true;
1693           break;
1694
1695         case 't':
1696           truncate_set1 = true;
1697           break;
1698
1699         case_GETOPT_HELP_CHAR;
1700
1701         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1702
1703         default:
1704           usage (EXIT_FAILURE);
1705           break;
1706         }
1707     }
1708
1709   non_option_args = argc - optind;
1710   translating = (non_option_args == 2 && !delete);
1711   min_operands = 1 + (delete == squeeze_repeats);
1712   max_operands = 1 + (delete <= squeeze_repeats);
1713
1714   if (non_option_args < min_operands)
1715     {
1716       if (non_option_args == 0)
1717         error (0, 0, _("missing operand"));
1718       else
1719         {
1720           error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
1721           fprintf (stderr, "%s\n",
1722                    _(squeeze_repeats
1723                      ? ("Two strings must be given when "
1724                         "both deleting and squeezing repeats.")
1725                      : "Two strings must be given when translating."));
1726         }
1727       usage (EXIT_FAILURE);
1728     }
1729
1730   if (max_operands < non_option_args)
1731     {
1732       error (0, 0, _("extra operand %s"), quote (argv[optind + max_operands]));
1733       if (non_option_args == 2)
1734         fprintf (stderr, "%s\n",
1735                  _("Only one string may be given when "
1736                    "deleting without squeezing repeats."));
1737       usage (EXIT_FAILURE);
1738     }
1739
1740   spec_init (s1);
1741   if (!parse_str (argv[optind], s1))
1742     exit (EXIT_FAILURE);
1743
1744   if (non_option_args == 2)
1745     {
1746       spec_init (s2);
1747       if (!parse_str (argv[optind + 1], s2))
1748         exit (EXIT_FAILURE);
1749     }
1750   else
1751     s2 = NULL;
1752
1753   validate (s1, s2);
1754
1755   /* Use binary I/O, since `tr' is sometimes used to transliterate
1756      non-printable characters, or characters which are stripped away
1757      by text-mode reads (like CR and ^Z).  */
1758   if (O_BINARY && ! isatty (STDIN_FILENO))
1759     freopen (NULL, "rb", stdin);
1760   if (O_BINARY && ! isatty (STDOUT_FILENO))
1761     freopen (NULL, "wb", stdout);
1762
1763   if (squeeze_repeats && non_option_args == 1)
1764     {
1765       set_initialize (s1, complement, in_squeeze_set);
1766       squeeze_filter (io_buf, sizeof io_buf, plain_read);
1767     }
1768   else if (delete && non_option_args == 1)
1769     {
1770       set_initialize (s1, complement, in_delete_set);
1771
1772       for (;;)
1773         {
1774           size_t nr = read_and_delete (io_buf, sizeof io_buf);
1775           if (nr == 0)
1776             break;
1777           if (fwrite (io_buf, 1, nr, stdout) != nr)
1778             error (EXIT_FAILURE, errno, _("write error"));
1779         }
1780     }
1781   else if (squeeze_repeats && delete && non_option_args == 2)
1782     {
1783       set_initialize (s1, complement, in_delete_set);
1784       set_initialize (s2, false, in_squeeze_set);
1785       squeeze_filter (io_buf, sizeof io_buf, read_and_delete);
1786     }
1787   else if (translating)
1788     {
1789       if (complement)
1790         {
1791           int i;
1792           bool *in_s1 = in_delete_set;
1793
1794           set_initialize (s1, false, in_s1);
1795           s2->state = BEGIN_STATE;
1796           for (i = 0; i < N_CHARS; i++)
1797             xlate[i] = i;
1798           for (i = 0; i < N_CHARS; i++)
1799             {
1800               if (!in_s1[i])
1801                 {
1802                   int ch = get_next (s2, NULL);
1803                   assert (ch != -1 || truncate_set1);
1804                   if (ch == -1)
1805                     {
1806                       /* This will happen when tr is invoked like e.g.
1807                          tr -cs A-Za-z0-9 '\012'.  */
1808                       break;
1809                     }
1810                   xlate[i] = ch;
1811                 }
1812             }
1813         }
1814       else
1815         {
1816           int c1, c2;
1817           int i;
1818           bool case_convert = false;
1819           enum Upper_Lower_class class_s1;
1820           enum Upper_Lower_class class_s2;
1821
1822           for (i = 0; i < N_CHARS; i++)
1823             xlate[i] = i;
1824           s1->state = BEGIN_STATE;
1825           s2->state = BEGIN_STATE;
1826           for (;;)
1827             {
1828               /* When the previous pair identified case-converting classes,
1829                  advance S1 and S2 so that each points to the following
1830                  construct.  */
1831               if (case_convert)
1832                 {
1833                   skip_construct (s1);
1834                   skip_construct (s2);
1835                   case_convert = false;
1836                 }
1837
1838               c1 = get_next (s1, &class_s1);
1839               c2 = get_next (s2, &class_s2);
1840
1841               /* When translating and there is an [:upper:] or [:lower:]
1842                  class in SET2, then there must be a corresponding [:lower:]
1843                  or [:upper:] class in SET1.  */
1844               if (class_s1 == UL_NONE
1845                   && (class_s2 == UL_LOWER || class_s2 == UL_UPPER))
1846                 error (EXIT_FAILURE, 0,
1847                        _("misaligned [:upper:] and/or [:lower:] construct"));
1848
1849               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1850                 {
1851                   case_convert = true;
1852                   for (i = 0; i < N_CHARS; i++)
1853                     if (islower (i))
1854                       xlate[i] = toupper (i);
1855                 }
1856               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1857                 {
1858                   case_convert = true;
1859                   for (i = 0; i < N_CHARS; i++)
1860                     if (isupper (i))
1861                       xlate[i] = tolower (i);
1862                 }
1863               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1864                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1865                 {
1866                   /* POSIX says the behavior of `tr "[:upper:]" "[:upper:]"'
1867                      is undefined.  Treat it as a no-op.  */
1868                 }
1869               else
1870                 {
1871                   /* The following should have been checked by validate...  */
1872                   if (c1 == -1 || c2 == -1)
1873                     break;
1874                   xlate[c1] = c2;
1875                 }
1876             }
1877           assert (c1 == -1 || truncate_set1);
1878         }
1879       if (squeeze_repeats)
1880         {
1881           set_initialize (s2, false, in_squeeze_set);
1882           squeeze_filter (io_buf, sizeof io_buf, read_and_xlate);
1883         }
1884       else
1885         {
1886           for (;;)
1887             {
1888               size_t bytes_read = read_and_xlate (io_buf, sizeof io_buf);
1889               if (bytes_read == 0)
1890                 break;
1891               if (fwrite (io_buf, 1, bytes_read, stdout) != bytes_read)
1892                 error (EXIT_FAILURE, errno, _("write error"));
1893             }
1894         }
1895     }
1896
1897   if (close (STDIN_FILENO) != 0)
1898     error (EXIT_FAILURE, errno, _("standard input"));
1899
1900   exit (EXIT_SUCCESS);
1901 }