src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 1995-2007 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 3, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  17
  18 /* Written by Jim Meyering */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <assert.h>
  24 #include <sys/types.h>
  25 #include <getopt.h>
  26
  27 #include "system.h"
  28 #include "error.h"
  29 #include "quote.h"
  30 #include "safe-read.h"
  31 #include "xstrtol.h"
  32
  33 /* The official name of this program (e.g., no `g' prefix).  */
  34 #define PROGRAM_NAME "tr"
  35
  36 #define AUTHORS "Jim Meyering"
  37
  38 enum { N_CHARS = UCHAR_MAX + 1 };  /* Number of distinct unsigned char values.  */
  39
  40 /* An unsigned integer type big enough to hold a repeat count or an
  41    unsigned character.  POSIX requires support for repeat counts as
  42    high as 2**31 - 1.  Since repeat counts might need to expand to
  43    match the length of an argument string, we need at least size_t to
  44    avoid arbitrary internal limits.  It doesn't cost much to use
  45    uintmax_t, though.  */
  46 typedef uintmax_t count;
  47
  48 /* The value for Spec_list->state that indicates to
  49    get_next that it should initialize the tail pointer.
  50    Its value should be as large as possible to avoid conflict with
  51    a valid value for the state field -- and that may be as
  52    large as any valid repeat_count.  */
  53 #define BEGIN_STATE (UINTMAX_MAX - 1)
  54
  55 /* The value for Spec_list->state that indicates to
  56    get_next that the element pointed to by Spec_list->tail is
  57    being considered for the first time on this pass through the
  58    list -- it indicates that get_next should make any necessary
  59    initializations.  Its value works out to UINTMAX_MAX.  */
  60 #define NEW_ELEMENT (BEGIN_STATE + 1)
  61
  62 /* The maximum possible repeat count.  Due to how the states are
  63    implemented, it can be as much as BEGIN_STATE.  */
  64 #define REPEAT_COUNT_MAXIMUM BEGIN_STATE
  65
  66 /* The following (but not CC_NO_CLASS) are indices into the array of
  67    valid character class strings, char_class_name, in the same order.  */
  68 enum Char_class
  69   {
  70     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  71     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  72     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  73     CC_NO_CLASS = 9999  /* Returned by look_up_char_class for an unknown name.  */
  74   };
  75
  76 /* Character class to which a character (returned by get_next) belonged;
  77    but it is set only if the construct from which the character was obtained
  78    was one of the character classes [:upper:] or [:lower:].  The value
  79    is used only when translating and then, only to make sure that upper
  80    and lower class constructs have the same relative positions in string1
  81    and string2.  */
  82 enum Upper_Lower_class
  83   {
  84     UL_LOWER,  /* The character came from a [:lower:] construct.  */
  85     UL_UPPER,  /* The character came from an [:upper:] construct.  */
  86     UL_NONE    /* Neither [:lower:] nor [:upper:].  */
  87   };
  88
  89 /* The type of a List_element.  See build_spec_list for more details.  */
  90 enum Range_element_type
  91   {
  92     RE_NORMAL_CHAR,    /* c        a character that stands for itself */
  93     RE_RANGE,          /* r-s      a range of characters */
  94     RE_CHAR_CLASS,     /* [:str:]  a character class */
  95     RE_EQUIV_CLASS,    /* [=c=]    an equivalence class */
  96     RE_REPEATED_CHAR   /* [c*n]    a repeated character */
  97   };
  98
  99 /* One construct in one of tr's argument strings.
 100    For example, consider the POSIX version of the classic tr command:
 101        tr -cs 'a-zA-Z_' '[\n*]'
 102    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 103    and a single normal character, `_'.  String2 has one construct.  */
 104 struct List_element
 105   {
 106     enum Range_element_type type;   /* Selects which member of U is valid.  */
 107     struct List_element *next;      /* Next construct; NULL in the last element.  */
 108     union
 109       {
 110         unsigned char normal_char;  /* For RE_NORMAL_CHAR.  */
 111         struct                  /* unnamed */
 112           {
 113             unsigned char first_char;
 114             unsigned char last_char;
 115           }
 116         range;                      /* For RE_RANGE; append_range rejects last_char < first_char.  */
 117         enum Char_class char_class; /* For RE_CHAR_CLASS.  */
 118         unsigned char equiv_code;   /* For RE_EQUIV_CLASS.  */
 119         struct                  /* unnamed */
 120           {
 121             unsigned char the_repeated_char;
 122             count repeat_count;
 123           }
 124         repeated_char;              /* For RE_REPEATED_CHAR.  */
 125       }
 126     u;
 127   };
 128
 129 /* Each of tr's argument strings is parsed into a form that is easier
 130    to work with: a linked list of constructs (struct List_element).
 131    Each Spec_list structure also encapsulates various attributes of
 132    the corresponding argument string.  The attributes are used mainly
 133    to verify that the strings are valid in the context of any options
 134    specified (like -s, -d, or -c).  The main exception is the member
 135    `tail', which is first used to construct the list.  After construction,
 136    it is used by get_next to save its state when traversing the list.
 137    The member `state' serves a similar function.  */
 138 struct Spec_list
 139   {
 140     /* Points to the head of the list of range elements.  The first
 141        struct is a dummy; its members other than next are never used.  */
 142     struct List_element *head;
 143
 144     /* While the list is being built, points to the last element (the
 145        append_* functions link each new element after it).  When
 146        traversing via get_next(), points to the element to process next.
 147        Setting state to BEGIN_STATE makes get_next reset tail to head->next.  */
 148     struct List_element *tail;
 149
 150     /* Used to save state between calls to get_next.  */
 151     count state;
 152
 153     /* Length, in the sense that length ('a-z[:digit:]123abc')
 154        is 42 ( = 26 + 10 + 6).  */
 155     count length;
 156
 157     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 158     size_t n_indefinite_repeats;
 159
 160     /* If n_indefinite_repeats is nonzero, this points to the List_element
 161        corresponding to the last [c*] or [c*0] construct encountered in
 162        this spec.  Otherwise it is undefined.  */
 163     struct List_element *indefinite_repeat_element;
 164
 165     /* True if this spec contains at least one equivalence
 166        class construct e.g. [=c=].  */
 167     bool has_equiv_class;
 168
 169     /* True if this spec contains at least one character class
 170        construct.  E.g. [:digit:].  */
 171     bool has_char_class;
 172
 173     /* True if this spec contains at least one of the character class
 174        constructs (all but upper and lower) that aren't allowed in s2.  */
 175     bool has_restricted_char_class;
 176   };
 177
 178 /* A representation for escaped string1 or string2.  As a string is parsed
 179    by unquote, every character produced by a backslash escape (octal and
 180    \a, \b, \f, \n, etc. included) is marked by setting the corresponding
 181    entry in the ESCAPED vector; a lone trailing backslash is not marked.  */
 182 struct E_string
 183 {
 184   char *s;        /* The unescaped bytes; may contain zero bytes and is not NUL-terminated.  */
 185   bool *escaped;  /* escaped[i] is true if s[i] came from a backslash escape.  */
 186   size_t len;     /* Number of valid entries in S and ESCAPED.  */
 187 };
 188
 189 /* Return nonzero if the Ith character of escaped string ES matches C
 190    and is not escaped itself.  */
 191 static inline bool
 192 es_match (struct E_string const *es, size_t i, char c)
 193 {
 194   return es->s[i] == c && !es->escaped[i];
 195 }
 196
 197 /* The name by which this program was run.  */
 198 char *program_name;
 199
 200 /* When true, each run in the input of a repeated character c that
 201    is a member of the squeeze set is replaced in the output by a
 202    single occurrence of c.  */
 203 static bool squeeze_repeats = false;
 204
 205 /* When true, characters in the delete set are removed from the input.  */
 206 static bool delete = false;
 207
 208 /* Use the complement of set1 in place of set1.  */
 209 static bool complement = false;
 210
 211 /* When tr is performing translation and string1 is longer than string2,
 212    POSIX says that the result is unspecified.  That gives the implementor
 213    of a POSIX conforming version of tr two reasonable choices for the
 214    semantics of this case.
 215
 216    * The BSD tr pads string2 to the length of string1 by
 217    repeating the last character in string2.
 218
 219    * System V tr ignores characters in string1 that have no
 220    corresponding character in string2.  That is, string1 is effectively
 221    truncated to the length of string2.
 222
 223    When true, this flag causes GNU tr to imitate the behavior
 224    of System V tr when translating with string1 longer than string2.
 225    The default is to emulate BSD tr.  This flag is ignored in modes where
 226    no translation is performed.  Emulating the System V tr
 227    in this exceptional case causes the relatively common BSD idiom:
 228
 229        tr -cs A-Za-z0-9 '\012'
 230
 231    to break (it would convert only zero bytes, rather than all
 232    non-alphanumerics, to newlines).
 233
 234    WARNING: This switch does not provide general BSD or System V
 235    compatibility.  For example, it doesn't disable the interpretation
 236    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 237    some unfortunate coincidence you use such constructs in scripts
 238    expecting to use some other version of tr, the scripts will break.  */
 239 static bool truncate_set1 = false;
 240
 241 /* An alias for (!delete && non_option_args == 2).
 242    It is set in main and used there and in validate().  */
 243 static bool translating;
 244
 245 static char io_buf[BUFSIZ];  /* I/O buffer; NOTE(review): its users lie outside this excerpt.  */
 246
 247 static char const *const char_class_name[] =  /* Indexed by enum Char_class (CC_ALNUM .. CC_XDIGIT).  */
 248 {
 249   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 250   "lower", "print", "punct", "space", "upper", "xdigit"
 251 };
 252 enum { N_CHAR_CLASSES = sizeof char_class_name / sizeof char_class_name[0] };
 253
 254 /* Array of boolean values.  A character `c' is a member of the
 255    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 256    set is defined by the last (possibly, the only) string argument
 257    on the command line when the squeeze option is given.  */
 258 static bool in_squeeze_set[N_CHARS];
 259
 260 /* Array of boolean values.  A character `c' is a member of the
 261    delete set if and only if in_delete_set[c] is true.  The delete
 262    set is defined by the first (or only) string argument on the
 263    command line when the delete option is given.  */
 264 static bool in_delete_set[N_CHARS];
 265
 266 /* Array of character values defining the translation (if any) that
 267    tr is to perform.  Translation is performed only when there are
 268    two specification strings and the delete switch is not given.
 269    It has one entry per possible unsigned char value (N_CHARS).  */
 270 static char xlate[N_CHARS];
 271
 272 static struct option const long_options[] =  /* Long options; each maps to the short-option character in its last field.  */
 273 {
 274   {"complement", no_argument, NULL, 'c'},
 275   {"delete", no_argument, NULL, 'd'},
 276   {"squeeze-repeats", no_argument, NULL, 's'},
 277   {"truncate-set1", no_argument, NULL, 't'},
 278   {GETOPT_HELP_OPTION_DECL},
 279   {GETOPT_VERSION_OPTION_DECL},
 280   {NULL, 0, NULL, 0}  /* Terminating all-zero entry.  */
 281 };
 281 \f
 282 void
 283 usage (int status)
 284 {
 285   if (status != EXIT_SUCCESS)
 286     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 287              program_name);
 288   else
 289     {
 290       printf (_("\
 291 Usage: %s [OPTION]... SET1 [SET2]\n\
 292 "),
 293               program_name);
 294       fputs (_("\
 295 Translate, squeeze, and/or delete characters from standard input,\n\
 296 writing to standard output.\n\
 297 \n\
 298   -c, -C, --complement    first complement SET1\n\
 299   -d, --delete            delete characters in SET1, do not translate\n\
 300   -s, --squeeze-repeats   replace each input sequence of a repeated character\n\
 301                             that is listed in SET1 with a single occurrence\n\
 302                             of that character\n\
 303   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 304 "), stdout);
 305       fputs (HELP_OPTION_DESCRIPTION, stdout);
 306       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 307       fputs (_("\
 308 \n\
 309 SETs are specified as strings of characters.  Most represent themselves.\n\
 310 Interpreted sequences are:\n\
 311 \n\
 312   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 313   \\\\              backslash\n\
 314   \\a              audible BEL\n\
 315   \\b              backspace\n\
 316   \\f              form feed\n\
 317   \\n              new line\n\
 318   \\r              return\n\
 319   \\t              horizontal tab\n\
 320 "), stdout);
 321      fputs (_("\
 322   \\v              vertical tab\n\
 323   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 324   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 325   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 326   [:alnum:]       all letters and digits\n\
 327   [:alpha:]       all letters\n\
 328   [:blank:]       all horizontal whitespace\n\
 329   [:cntrl:]       all control characters\n\
 330   [:digit:]       all digits\n\
 331 "), stdout);
 332      fputs (_("\
 333   [:graph:]       all printable characters, not including space\n\
 334   [:lower:]       all lower case letters\n\
 335   [:print:]       all printable characters, including space\n\
 336   [:punct:]       all punctuation characters\n\
 337   [:space:]       all horizontal or vertical whitespace\n\
 338   [:upper:]       all upper case letters\n\
 339   [:xdigit:]      all hexadecimal digits\n\
 340   [=CHAR=]        all characters which are equivalent to CHAR\n\
 341 "), stdout);
 342      fputs (_("\
 343 \n\
 344 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 345 -t may be used only when translating.  SET2 is extended to length of\n\
 346 SET1 by repeating its last character as necessary.  \
 347 "), stdout);
 348      fputs (_("\
 349 Excess characters\n\
 350 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 351 expand in ascending order; used in SET2 while translating, they may\n\
 352 only be used in pairs to specify case conversion.  \
 353 "), stdout);
 354      fputs (_("\
 355 -s uses SET1 if not\n\
 356 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 357 translation or deletion.\n\
 358 "), stdout);
 359       emit_bug_reporting_address ();
 360     }
 361   exit (status);
 362 }
 363
 364 /* Return nonzero if the character C is a member of the
 365    equivalence class containing the character EQUIV_CLASS.  */
 366
 367 static inline bool
 368 is_equiv_class_member (unsigned char equiv_class, unsigned char c)
 369 {
 370   return (equiv_class == c);
 371 }
 372
 373 /* Return true if the character C is a member of the
 374    character class CHAR_CLASS.  */
 375
 376 static bool
 377 is_char_class_member (enum Char_class char_class, unsigned char c)
 378 {
 379   int result;
 380
 381   switch (char_class)
 382     {
 383     case CC_ALNUM:
 384       result = isalnum (c);
 385       break;
 386     case CC_ALPHA:
 387       result = isalpha (c);
 388       break;
 389     case CC_BLANK:
 390       result = isblank (c);
 391       break;
 392     case CC_CNTRL:
 393       result = iscntrl (c);
 394       break;
 395     case CC_DIGIT:
 396       result = isdigit (c);
 397       break;
 398     case CC_GRAPH:
 399       result = isgraph (c);
 400       break;
 401     case CC_LOWER:
 402       result = islower (c);
 403       break;
 404     case CC_PRINT:
 405       result = isprint (c);
 406       break;
 407     case CC_PUNCT:
 408       result = ispunct (c);
 409       break;
 410     case CC_SPACE:
 411       result = isspace (c);
 412       break;
 413     case CC_UPPER:
 414       result = isupper (c);
 415       break;
 416     case CC_XDIGIT:
 417       result = isxdigit (c);
 418       break;
 419     default:
 420       abort ();
 421       break;
 422     }
 423
 424   return !! result;
 425 }
 426
 427 static void
 428 es_free (struct E_string *es)
 429 {
 430   free (es->s);
 431   free (es->escaped);
 432 }
 433
 434 /* Perform the first pass over each range-spec argument S, converting all
 435    \c and \ddd escapes to their one-byte representations.  If an invalid
 436    quote sequence is found print an error message and return false;
 437    Otherwise set *ES to the resulting string and return true.
 438    The resulting array of characters may contain zero-bytes;
 439    however, on input, S is assumed to be null-terminated, and hence
 440    cannot contain actual (non-escaped) zero bytes.  */
 441
 442 static bool
 443 unquote (char const *s, struct E_string *es)
 444 {
 445   size_t i, j;
 446   size_t len = strlen (s);
 447
 448   es->s = xmalloc (len);
 449   es->escaped = xcalloc (len, sizeof es->escaped[0]);
 450
 451   j = 0;
 452   for (i = 0; s[i]; i++)
 453     {
 454       unsigned char c;
 455       int oct_digit;
 456
 457       switch (s[i])
 458         {
 459         case '\\':
 460           es->escaped[j] = true;
 461           switch (s[i + 1])
 462             {
 463             case '\\':
 464               c = '\\';
 465               break;
 466             case 'a':
 467               c = '\a';
 468               break;
 469             case 'b':
 470               c = '\b';
 471               break;
 472             case 'f':
 473               c = '\f';
 474               break;
 475             case 'n':
 476               c = '\n';
 477               break;
 478             case 'r':
 479               c = '\r';
 480               break;
 481             case 't':
 482               c = '\t';
 483               break;
 484             case 'v':
 485               c = '\v';
 486               break;
 487             case '0':
 488             case '1':
 489             case '2':
 490             case '3':
 491             case '4':
 492             case '5':
 493             case '6':
 494             case '7':
 495               c = s[i + 1] - '0';
 496               oct_digit = s[i + 2] - '0';
 497               if (0 <= oct_digit && oct_digit <= 7)
 498                 {
 499                   c = 8 * c + oct_digit;
 500                   ++i;
 501                   oct_digit = s[i + 2] - '0';
 502                   if (0 <= oct_digit && oct_digit <= 7)
 503                     {
 504                       if (8 * c + oct_digit < N_CHARS)
 505                         {
 506                           c = 8 * c + oct_digit;
 507                           ++i;
 508                         }
 509                       else
 510                         {
 511                           /* A 3-digit octal number larger than \377 won't
 512                              fit in 8 bits.  So we stop when adding the
 513                              next digit would put us over the limit and
 514                              give a warning about the ambiguity.  POSIX
 515                              isn't clear on this, and we interpret this
 516                              lack of clarity as meaning the resulting behavior
 517                              is undefined, which means we're allowed to issue
 518                              a warning.  */
 519                           error (0, 0, _("warning: the ambiguous octal escape \
 520 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, %c"),
 521                                  s[i], s[i + 1], s[i + 2],
 522                                  s[i], s[i + 1], s[i + 2]);
 523                         }
 524                     }
 525                 }
 526               break;
 527             case '\0':
 528               error (0, 0, _("warning: an unescaped backslash "
 529                              "at end of string is not portable"));
 530               /* POSIX is not clear about this.  */
 531               es->escaped[j] = false;
 532               i--;
 533               c = '\\';
 534               break;
 535             default:
 536               c = s[i + 1];
 537               break;
 538             }
 539           ++i;
 540           es->s[j++] = c;
 541           break;
 542         default:
 543           es->s[j++] = s[i];
 544           break;
 545         }
 546     }
 547   es->len = j;
 548   return true;
 549 }
 550
 551 /* If CLASS_STR is a valid character class string, return its index
 552    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 553
 554 static enum Char_class
 555 look_up_char_class (char const *class_str, size_t len)
 556 {
 557   enum Char_class i;
 558
 559   for (i = 0; i < N_CHAR_CLASSES; i++)
 560     if (strncmp (class_str, char_class_name[i], len) == 0
 561         && strlen (char_class_name[i]) == len)
 562       return i;
 563   return CC_NO_CLASS;
 564 }
 565
 566 /* Return a newly allocated string with a printable version of C.
 567    This function is used solely for formatting error messages.  */
 568
 569 static char *
 570 make_printable_char (unsigned char c)
 571 {
 572   char *buf = xmalloc (5);
 573
 574   if (isprint (c))
 575     {
 576       buf[0] = c;
 577       buf[1] = '\0';
 578     }
 579   else
 580     {
 581       sprintf (buf, "\\%03o", c);
 582     }
 583   return buf;
 584 }
 585
 586 /* Return a newly allocated copy of S which is suitable for printing.
 587    LEN is the number of characters in S.  Most non-printing
 588    (isprint) characters are represented by a backslash followed by
 589    3 octal digits.  However, the characters represented by \c escapes
 590    where c is one of [abfnrtv] are represented by their 2-character \c
 591    sequences.  This function is used solely for printing error messages.  */
 592
 593 static char *
 594 make_printable_str (char const *s, size_t len)
 595 {
 596   /* Worst case is that every character expands to a backslash
 597      followed by a 3-character octal escape sequence.  */
 598   char *printable_buf = xnmalloc (len + 1, 4);
 599   char *p = printable_buf;
 600   size_t i;
 601
 602   for (i = 0; i < len; i++)
 603     {
 604       char buf[5];
 605       char const *tmp = NULL;
 606       unsigned char c = s[i];
 607
 608       switch (c)
 609         {
 610         case '\\':
 611           tmp = "\\";
 612           break;
 613         case '\a':
 614           tmp = "\\a";
 615           break;
 616         case '\b':
 617           tmp = "\\b";
 618           break;
 619         case '\f':
 620           tmp = "\\f";
 621           break;
 622         case '\n':
 623           tmp = "\\n";
 624           break;
 625         case '\r':
 626           tmp = "\\r";
 627           break;
 628         case '\t':
 629           tmp = "\\t";
 630           break;
 631         case '\v':
 632           tmp = "\\v";
 633           break;
 634         default:
 635           if (isprint (c))
 636             {
 637               buf[0] = c;
 638               buf[1] = '\0';
 639             }
 640           else
 641             sprintf (buf, "\\%03o", c);
 642           tmp = buf;
 643           break;
 644         }
 645       p = stpcpy (p, tmp);
 646     }
 647   return printable_buf;
 648 }
 649
 650 /* Append a newly allocated structure representing a
 651    character C to the specification list LIST.  */
 652
 653 static void
 654 append_normal_char (struct Spec_list *list, unsigned char c)
 655 {
 656   struct List_element *new;
 657
 658   new = xmalloc (sizeof *new);
 659   new->next = NULL;
 660   new->type = RE_NORMAL_CHAR;
 661   new->u.normal_char = c;
 662   assert (list->tail);
 663   list->tail->next = new;
 664   list->tail = new;
 665 }
 666
 667 /* Append a newly allocated structure representing the range
 668    of characters from FIRST to LAST to the specification list LIST.
 669    Return false if LAST precedes FIRST in the collating sequence,
 670    true otherwise.  This means that '[c-c]' is acceptable.  */
 671
 672 static bool
 673 append_range (struct Spec_list *list, unsigned char first, unsigned char last)
 674 {
 675   struct List_element *new;
 676
 677   if (last < first)
 678     {
 679       char *tmp1 = make_printable_char (first);
 680       char *tmp2 = make_printable_char (last);
 681
 682       error (0, 0,
 683        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 684              tmp1, tmp2);
 685       free (tmp1);
 686       free (tmp2);
 687       return false;
 688     }
 689   new = xmalloc (sizeof *new);
 690   new->next = NULL;
 691   new->type = RE_RANGE;
 692   new->u.range.first_char = first;
 693   new->u.range.last_char = last;
 694   assert (list->tail);
 695   list->tail->next = new;
 696   list->tail = new;
 697   return true;
 698 }
 699
 700 /* If CHAR_CLASS_STR is a valid character class string, append a
 701    newly allocated structure representing that character class to the end
 702    of the specification list LIST and return true.  If CHAR_CLASS_STR is not
 703    a valid string return false.  */
 704
 705 static bool
 706 append_char_class (struct Spec_list *list,
 707                    char const *char_class_str, size_t len)
 708 {
 709   enum Char_class char_class;
 710   struct List_element *new;
 711
 712   char_class = look_up_char_class (char_class_str, len);
 713   if (char_class == CC_NO_CLASS)
 714     return false;
 715   new = xmalloc (sizeof *new);
 716   new->next = NULL;
 717   new->type = RE_CHAR_CLASS;
 718   new->u.char_class = char_class;
 719   assert (list->tail);
 720   list->tail->next = new;
 721   list->tail = new;
 722   return true;
 723 }
 724
 725 /* Append a newly allocated structure representing a [c*n]
 726    repeated character construct to the specification list LIST.
 727    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 728    is a non-negative repeat count.  */
 729
 730 static void
 731 append_repeated_char (struct Spec_list *list, unsigned char the_char,
 732                       count repeat_count)
 733 {
 734   struct List_element *new;
 735
 736   new = xmalloc (sizeof *new);
 737   new->next = NULL;
 738   new->type = RE_REPEATED_CHAR;
 739   new->u.repeated_char.the_repeated_char = the_char;
 740   new->u.repeated_char.repeat_count = repeat_count;
 741   assert (list->tail);
 742   list->tail->next = new;
 743   list->tail = new;
 744 }
 745
 746 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 747    the length of that string, LEN, if LEN is exactly one, append
 748    a newly allocated structure representing the specified
 749    equivalence class to the specification list, LIST and return true.
 750    If LEN is not 1, return false.  */
 751
 752 static bool
 753 append_equiv_class (struct Spec_list *list,
 754                     char const *equiv_class_str, size_t len)
 755 {
 756   struct List_element *new;
 757
 758   if (len != 1)
 759     return false;
 760   new = xmalloc (sizeof *new);
 761   new->next = NULL;
 762   new->type = RE_EQUIV_CLASS;
 763   new->u.equiv_code = *equiv_class_str;
 764   assert (list->tail);
 765   list->tail->next = new;
 766   list->tail = new;
 767   return true;
 768 }
 769
 770 /* Search forward starting at START_IDX for the 2-char sequence
 771    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 772    a sequence is found, set *RESULT_IDX to the index of the first
 773    character and return true.  Otherwise return false.  P may contain
 774    zero bytes.  */
 775
 776 static bool
 777 find_closing_delim (const struct E_string *es, size_t start_idx,
 778                     char pre_bracket_char, size_t *result_idx)
 779 {
 780   size_t i;
 781
 782   for (i = start_idx; i < es->len - 1; i++)
 783     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 784         && !es->escaped[i] && !es->escaped[i + 1])
 785       {
 786         *result_idx = i;
 787         return true;
 788       }
 789   return false;
 790 }
 791
 792 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 793    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 794    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 795    and return zero. If the second character following
 796    the opening bracket is not `*' or if no closing bracket can be
 797    found, return -1.  If a closing bracket is found and the
 798    second char is `*', but the string between the `*' and `]' isn't
 799    empty, an octal number, or a decimal number, print an error message
 800    and return -2.  */
 801
 802 static int
 803 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 804                        unsigned char *char_to_repeat, count *repeat_count,
 805                        size_t *closing_bracket_idx)
 806 {
 807   size_t i;
 808
 809   assert (start_idx + 1 < es->len);
 810   if (!es_match (es, start_idx + 1, '*'))
 811     return -1;
 812
 813   for (i = start_idx + 2; i < es->len && !es->escaped[i]; i++)
 814     {
 815       if (es->s[i] == ']')
 816         {
 817           size_t digit_str_len = i - start_idx - 2;
 818
 819           *char_to_repeat = es->s[start_idx];
 820           if (digit_str_len == 0)
 821             {
 822               /* We've matched [c*] -- no explicit repeat count.  */
 823               *repeat_count = 0;
 824             }
 825           else
 826             {
 827               /* Here, we have found [c*s] where s should be a string
 828                  of octal (if it starts with `0') or decimal digits.  */
 829               char const *digit_str = &es->s[start_idx + 2];
 830               char *d_end;
 831               if ((xstrtoumax (digit_str, &d_end, *digit_str == '0' ? 8 : 10,
 832                                repeat_count, NULL)
 833                    != LONGINT_OK)
 834                   || REPEAT_COUNT_MAXIMUM < *repeat_count
 835                   || digit_str + digit_str_len != d_end)
 836                 {
 837                   char *tmp = make_printable_str (digit_str, digit_str_len);
 838                   error (0, 0,
 839                          _("invalid repeat count %s in [c*n] construct"),
 840                          quote (tmp));
 841                   free (tmp);
 842                   return -2;
 843                 }
 844             }
 845           *closing_bracket_idx = i;
 846           return 0;
 847         }
 848     }
 849   return -1;                    /* No bracket found.  */
 850 }
 851
 852 /* Return true if the string at ES->s[IDX] matches the regular
 853    expression `\*[0-9]*\]', false otherwise.  The string does not
 854    match if any of its characters are escaped.  */
 855
 856 static bool
 857 star_digits_closebracket (const struct E_string *es, size_t idx)
 858 {
 859   size_t i;
 860
 861   if (!es_match (es, idx, '*'))
 862     return false;
 863
 864   for (i = idx + 1; i < es->len; i++)
 865     if (!ISDIGIT (to_uchar (es->s[i])) || es->escaped[i])
 866       return es_match (es, i, ']');
 867   return false;
 868 }
 869
 870 /* Convert string UNESCAPED_STRING (which has been preprocessed to
 871    convert backslash-escape sequences) of length LEN characters into
 872    a linked list of the following 5 types of constructs:
 873       - [:str:] Character class where `str' is one of the 12 valid strings.
 874       - [=c=] Equivalence class where `c' is any single character.
 875       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 876           However, if `n' is present, it must be a non-negative octal or
 877           decimal integer.
 878       - r-s Range of characters from `r' to `s'.  The second endpoint must
 879           not precede the first in the current collating sequence.
 880       - c Any other character is interpreted as itself.  */
 881
 882 static bool
 883 build_spec_list (const struct E_string *es, struct Spec_list *result)
 884 {
 885   char const *p;
 886   size_t i;
 887
 888   p = es->s;
 889
 890   /* The main for-loop below recognizes the 4 multi-character constructs.
 891      A character that matches (in its context) none of the multi-character
 892      constructs is classified as `normal'.  Since all multi-character
 893      constructs have at least 3 characters, any strings of length 2 or
 894      less are composed solely of normal characters.  Hence, the index of
 895      the outer for-loop runs only as far as LEN-2.  */
 896
 897   for (i = 0; i + 2 < es->len; /* empty */)
 898     {
 899       if (es_match (es, i, '['))
 900         {
 901           bool matched_multi_char_construct;
 902           size_t closing_bracket_idx;
 903           unsigned char char_to_repeat;
 904           count repeat_count;
 905           int err;
 906
 907           matched_multi_char_construct = true;
 908           if (es_match (es, i + 1, ':') || es_match (es, i + 1, '='))
 909             {
 910               size_t closing_delim_idx;
 911
 912               if (find_closing_delim (es, i + 2, p[i + 1], &closing_delim_idx))
 913                 {
 914                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
 915                   char const *opnd_str = p + i + 2;
 916
 917                   if (opnd_str_len == 0)
 918                     {
 919                       if (p[i + 1] == ':')
 920                         error (0, 0, _("missing character class name `[::]'"));
 921                       else
 922                         error (0, 0,
 923                                _("missing equivalence class character `[==]'"));
 924                       return false;
 925                     }
 926
 927                   if (p[i + 1] == ':')
 928                     {
 929                       /* FIXME: big comment.  */
 930                       if (!append_char_class (result, opnd_str, opnd_str_len))
 931                         {
 932                           if (star_digits_closebracket (es, i + 2))
 933                             goto try_bracketed_repeat;
 934                           else
 935                             {
 936                               char *tmp = make_printable_str (opnd_str,
 937                                                               opnd_str_len);
 938                               error (0, 0, _("invalid character class %s"),
 939                                      quote (tmp));
 940                               free (tmp);
 941                               return false;
 942                             }
 943                         }
 944                     }
 945                   else
 946                     {
 947                       /* FIXME: big comment.  */
 948                       if (!append_equiv_class (result, opnd_str, opnd_str_len))
 949                         {
 950                           if (star_digits_closebracket (es, i + 2))
 951                             goto try_bracketed_repeat;
 952                           else
 953                             {
 954                               char *tmp = make_printable_str (opnd_str,
 955                                                               opnd_str_len);
 956                               error (0, 0,
 957                _("%s: equivalence class operand must be a single character"),
 958                                      tmp);
 959                               free (tmp);
 960                               return false;
 961                             }
 962                         }
 963                     }
 964
 965                   i = closing_delim_idx + 2;
 966                   continue;
 967                 }
 968               /* Else fall through.  This could be [:*] or [=*].  */
 969             }
 970
 971         try_bracketed_repeat:
 972
 973           /* Determine whether this is a bracketed repeat range
 974              matching the RE \[.\*(dec_or_oct_number)?\].  */
 975           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
 976                                        &repeat_count,
 977                                        &closing_bracket_idx);
 978           if (err == 0)
 979             {
 980               append_repeated_char (result, char_to_repeat, repeat_count);
 981               i = closing_bracket_idx + 1;
 982             }
 983           else if (err == -1)
 984             {
 985               matched_multi_char_construct = false;
 986             }
 987           else
 988             {
 989               /* Found a string that looked like [c*n] but the
 990                  numeric part was invalid.  */
 991               return false;
 992             }
 993
 994           if (matched_multi_char_construct)
 995             continue;
 996
 997           /* We reach this point if P does not match [:str:], [=c=],
 998              [c*n], or [c*].  Now, see if P looks like a range `[-c'
 999              (from `[' to `c').  */
1000         }
1001
1002       /* Look ahead one char for ranges like a-z.  */
1003       if (es_match (es, i + 1, '-'))
1004         {
1005           if (!append_range (result, p[i], p[i + 2]))
1006             return false;
1007           i += 3;
1008         }
1009       else
1010         {
1011           append_normal_char (result, p[i]);
1012           ++i;
1013         }
1014     }
1015
1016   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1017   for (; i < es->len; i++)
1018     append_normal_char (result, p[i]);
1019
1020   return true;
1021 }
1022
1023 /* Given a Spec_list S (with its saved state implicit in the values
1024    of its members `tail' and `state'), return the next single character
1025    in the expansion of S's constructs.  If the last character of S was
1026    returned on the previous call or if S was empty, this function
1027    returns -1.  For example, successive calls to get_next where S
1028    represents the spec-string 'a-d[y*3]' will return the sequence
1029    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1030    which the returned character comes is [:upper:] or [:lower:], the
1031    parameter CLASS is given a value to indicate which it was.  Otherwise
1032    CLASS is set to UL_NONE.  This value is used only when constructing
1033    the translation table to verify that any occurrences of upper and
1034    lower class constructs in the spec-strings appear in the same relative
1035    positions.  */
1036
1037 static int
1038 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1039 {
1040   struct List_element *p;
1041   int return_val;
1042   int i;
1043
1044   if (class)
1045     *class = UL_NONE;
1046
1047   if (s->state == BEGIN_STATE)
1048     {
1049       s->tail = s->head->next;
1050       s->state = NEW_ELEMENT;
1051     }
1052
1053   p = s->tail;
1054   if (p == NULL)
1055     return -1;
1056
1057   switch (p->type)
1058     {
1059     case RE_NORMAL_CHAR:
1060       return_val = p->u.normal_char;
1061       s->state = NEW_ELEMENT;
1062       s->tail = p->next;
1063       break;
1064
1065     case RE_RANGE:
1066       if (s->state == NEW_ELEMENT)
1067         s->state = p->u.range.first_char;
1068       else
1069         ++(s->state);
1070       return_val = s->state;
1071       if (s->state == p->u.range.last_char)
1072         {
1073           s->tail = p->next;
1074           s->state = NEW_ELEMENT;
1075         }
1076       break;
1077
1078     case RE_CHAR_CLASS:
1079       if (class)
1080         {
1081           bool upper_or_lower;
1082           switch (p->u.char_class)
1083             {
1084             case CC_LOWER:
1085               *class = UL_LOWER;
1086               upper_or_lower = true;
1087               break;
1088             case CC_UPPER:
1089               *class = UL_UPPER;
1090               upper_or_lower = true;
1091               break;
1092             default:
1093               upper_or_lower = false;
1094               break;
1095             }
1096
1097           if (upper_or_lower)
1098             {
1099               s->tail = p->next;
1100               s->state = NEW_ELEMENT;
1101               return_val = 0;
1102               break;
1103             }
1104         }
1105
1106       if (s->state == NEW_ELEMENT)
1107         {
1108           for (i = 0; i < N_CHARS; i++)
1109             if (is_char_class_member (p->u.char_class, i))
1110               break;
1111           assert (i < N_CHARS);
1112           s->state = i;
1113         }
1114       assert (is_char_class_member (p->u.char_class, s->state));
1115       return_val = s->state;
1116       for (i = s->state + 1; i < N_CHARS; i++)
1117         if (is_char_class_member (p->u.char_class, i))
1118           break;
1119       if (i < N_CHARS)
1120         s->state = i;
1121       else
1122         {
1123           s->tail = p->next;
1124           s->state = NEW_ELEMENT;
1125         }
1126       break;
1127
1128     case RE_EQUIV_CLASS:
1129       /* FIXME: this assumes that each character is alone in its own
1130          equivalence class (which appears to be correct for my
1131          LC_COLLATE.  But I don't know of any function that allows
1132          one to determine a character's equivalence class.  */
1133
1134       return_val = p->u.equiv_code;
1135       s->state = NEW_ELEMENT;
1136       s->tail = p->next;
1137       break;
1138
1139     case RE_REPEATED_CHAR:
1140       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1141       if (p->u.repeated_char.repeat_count == 0)
1142         {
1143           s->tail = p->next;
1144           s->state = NEW_ELEMENT;
1145           return_val = get_next (s, class);
1146         }
1147       else
1148         {
1149           if (s->state == NEW_ELEMENT)
1150             {
1151               s->state = 0;
1152             }
1153           ++(s->state);
1154           return_val = p->u.repeated_char.the_repeated_char;
1155           if (s->state == p->u.repeated_char.repeat_count)
1156             {
1157               s->tail = p->next;
1158               s->state = NEW_ELEMENT;
1159             }
1160         }
1161       break;
1162
1163     default:
1164       abort ();
1165       break;
1166     }
1167
1168   return return_val;
1169 }
1170
1171 /* This is a minor kludge.  This function is called from
1172    get_spec_stats to determine the cardinality of a set derived
1173    from a complemented string.  It's a kludge in that some of the
1174    same operations are (duplicated) performed in set_initialize.  */
1175
1176 static int
1177 card_of_complement (struct Spec_list *s)
1178 {
1179   int c;
1180   int cardinality = N_CHARS;
1181   bool in_set[N_CHARS] = { 0, };
1182
1183   s->state = BEGIN_STATE;
1184   while ((c = get_next (s, NULL)) != -1)
1185     {
1186       cardinality -= (!in_set[c]);
1187       in_set[c] = true;
1188     }
1189   return cardinality;
1190 }
1191
1192 /* Gather statistics about the spec-list S in preparation for the tests
1193    in validate that determine the consistency of the specs.  This function
1194    is called at most twice; once for string1, and again for any string2.
1195    LEN_S1 < 0 indicates that this is the first call and that S represents
1196    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1197    constructs in string1, and we can use its value to resolve any
1198    indefinite repeat construct in S (which represents string2).  Hence,
1199    this function has the side-effect that it converts a valid [c*]
1200    construct in string2 to [c*n] where n is large enough (or 0) to give
1201    string2 the same length as string1.  For example, with the command
1202    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1203    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1204
1205 static void
1206 get_spec_stats (struct Spec_list *s)
1207 {
1208   struct List_element *p;
1209   count length = 0;
1210
1211   s->n_indefinite_repeats = 0;
1212   s->has_equiv_class = false;
1213   s->has_restricted_char_class = false;
1214   s->has_char_class = false;
1215   for (p = s->head->next; p; p = p->next)
1216     {
1217       int i;
1218       count len = 0;
1219       count new_length;
1220
1221       switch (p->type)
1222         {
1223         case RE_NORMAL_CHAR:
1224           len = 1;
1225           break;
1226
1227         case RE_RANGE:
1228           assert (p->u.range.last_char >= p->u.range.first_char);
1229           len = p->u.range.last_char - p->u.range.first_char + 1;
1230           break;
1231
1232         case RE_CHAR_CLASS:
1233           s->has_char_class = true;
1234           for (i = 0; i < N_CHARS; i++)
1235             if (is_char_class_member (p->u.char_class, i))
1236               ++len;
1237           switch (p->u.char_class)
1238             {
1239             case CC_UPPER:
1240             case CC_LOWER:
1241               break;
1242             default:
1243               s->has_restricted_char_class = true;
1244               break;
1245             }
1246           break;
1247
1248         case RE_EQUIV_CLASS:
1249           for (i = 0; i < N_CHARS; i++)
1250             if (is_equiv_class_member (p->u.equiv_code, i))
1251               ++len;
1252           s->has_equiv_class = true;
1253           break;
1254
1255         case RE_REPEATED_CHAR:
1256           if (p->u.repeated_char.repeat_count > 0)
1257             len = p->u.repeated_char.repeat_count;
1258           else
1259             {
1260               s->indefinite_repeat_element = p;
1261               ++(s->n_indefinite_repeats);
1262             }
1263           break;
1264
1265         default:
1266           abort ();
1267           break;
1268         }
1269
1270       /* Check for arithmetic overflow in computing length.  Also, reject
1271          any length greater than the maximum repeat count, in case the
1272          length is later used to compute the repeat count for an
1273          indefinite element.  */
1274       new_length = length + len;
1275       if (! (length <= new_length && new_length <= REPEAT_COUNT_MAXIMUM))
1276         error (EXIT_FAILURE, 0, _("too many characters in set"));
1277       length = new_length;
1278     }
1279
1280   s->length = length;
1281 }
1282
1283 static void
1284 get_s1_spec_stats (struct Spec_list *s1)
1285 {
1286   get_spec_stats (s1);
1287   if (complement)
1288     s1->length = card_of_complement (s1);
1289 }
1290
1291 static void
1292 get_s2_spec_stats (struct Spec_list *s2, count len_s1)
1293 {
1294   get_spec_stats (s2);
1295   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1296     {
1297       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1298         len_s1 - s2->length;
1299       s2->length = len_s1;
1300     }
1301 }
1302
1303 static void
1304 spec_init (struct Spec_list *spec_list)
1305 {
1306   struct List_element *new = xmalloc (sizeof *new);
1307   spec_list->head = spec_list->tail = new;
1308   spec_list->head->next = NULL;
1309 }
1310
1311 /* This function makes two passes over the argument string S.  The first
1312    one converts all \c and \ddd escapes to their one-byte representations.
1313    The second constructs a linked specification list, SPEC_LIST, of the
1314    characters and constructs that comprise the argument string.  If either
1315    of these passes detects an error, this function returns false.  */
1316
1317 static bool
1318 parse_str (char const *s, struct Spec_list *spec_list)
1319 {
1320   struct E_string es;
1321   bool ok = unquote (s, &es) && build_spec_list (&es, spec_list);
1322   es_free (&es);
1323   return ok;
1324 }
1325
1326 /* Given two specification lists, S1 and S2, and assuming that
1327    S1->length > S2->length, append a single [c*n] element to S2 where c
1328    is the last character in the expansion of S2 and n is the difference
1329    between the two lengths.
1330    Upon successful completion, S2->length is set to S1->length.  The only
1331    way this function can fail to make S2 as long as S1 is when S2 has
1332    zero-length, since in that case, there is no last character to repeat.
1333    So S2->length is required to be at least 1.
1334
1335    Providing this functionality allows the user to do some pretty
1336    non-BSD (and non-portable) things:  For example, the command
1337        tr -cs '[:upper:]0-9' '[:lower:]'
1338    is almost guaranteed to give results that depend on your collating
1339    sequence.  */
1340
1341 static void
1342 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1343 {
1344   struct List_element *p;
1345   unsigned char char_to_repeat;
1346   int i;
1347
1348   assert (translating);
1349   assert (s1->length > s2->length);
1350   assert (s2->length > 0);
1351
1352   p = s2->tail;
1353   switch (p->type)
1354     {
1355     case RE_NORMAL_CHAR:
1356       char_to_repeat = p->u.normal_char;
1357       break;
1358     case RE_RANGE:
1359       char_to_repeat = p->u.range.last_char;
1360       break;
1361     case RE_CHAR_CLASS:
1362       for (i = N_CHARS - 1; i >= 0; i--)
1363         if (is_char_class_member (p->u.char_class, i))
1364           break;
1365       assert (i >= 0);
1366       char_to_repeat = i;
1367       break;
1368
1369     case RE_REPEATED_CHAR:
1370       char_to_repeat = p->u.repeated_char.the_repeated_char;
1371       break;
1372
1373     case RE_EQUIV_CLASS:
1374       /* This shouldn't happen, because validate exits with an error
1375          if it finds an equiv class in string2 when translating.  */
1376       abort ();
1377       break;
1378
1379     default:
1380       abort ();
1381       break;
1382     }
1383
1384   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1385   s2->length = s1->length;
1386 }
1387
1388 /* Return true if S is a non-empty list in which exactly one
1389    character (but potentially, many instances of it) appears.
1390    E.g., [X*] or xxxxxxxx.  */
1391
1392 static bool
1393 homogeneous_spec_list (struct Spec_list *s)
1394 {
1395   int b, c;
1396
1397   s->state = BEGIN_STATE;
1398
1399   if ((b = get_next (s, NULL)) == -1)
1400     return false;
1401
1402   while ((c = get_next (s, NULL)) != -1)
1403     if (c != b)
1404       return false;
1405
1406   return true;
1407 }
1408
1409 /* Die with an error message if S1 and S2 describe strings that
1410    are not valid with the given command line switches.
1411    A side effect of this function is that if a valid [c*] or
1412    [c*0] construct appears in string2, it is converted to [c*n]
1413    with a value for n that makes s2->length == s1->length.  By
1414    the same token, if the --truncate-set1 option is not
1415    given, S2 may be extended.  */
1416
1417 static void
1418 validate (struct Spec_list *s1, struct Spec_list *s2)
1419 {
1420   get_s1_spec_stats (s1);
1421   if (s1->n_indefinite_repeats > 0)
1422     {
1423       error (EXIT_FAILURE, 0,
1424              _("the [c*] repeat construct may not appear in string1"));
1425     }
1426
1427   if (s2)
1428     {
1429       get_s2_spec_stats (s2, s1->length);
1430
1431       if (s2->n_indefinite_repeats > 1)
1432         {
1433           error (EXIT_FAILURE, 0,
1434                  _("only one [c*] repeat construct may appear in string2"));
1435         }
1436
1437       if (translating)
1438         {
1439           if (s2->has_equiv_class)
1440             {
1441               error (EXIT_FAILURE, 0,
1442                      _("[=c=] expressions may not appear in string2 \
1443 when translating"));
1444             }
1445
1446           if (s1->length > s2->length)
1447             {
1448               if (!truncate_set1)
1449                 {
1450                   /* string2 must be non-empty unless --truncate-set1 is
1451                      given or string1 is empty.  */
1452
1453                   if (s2->length == 0)
1454                     error (EXIT_FAILURE, 0,
1455                      _("when not truncating set1, string2 must be non-empty"));
1456                   string2_extend (s1, s2);
1457                 }
1458             }
1459
1460           if (complement && s1->has_char_class
1461               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1462             {
1463               error (EXIT_FAILURE, 0,
1464                      _("when translating with complemented character classes,\
1465 \nstring2 must map all characters in the domain to one"));
1466             }
1467
1468           if (s2->has_restricted_char_class)
1469             {
1470               error (EXIT_FAILURE, 0,
1471                      _("when translating, the only character classes that may \
1472 appear in\nstring2 are `upper' and `lower'"));
1473             }
1474         }
1475       else
1476         /* Not translating.  */
1477         {
1478           if (s2->n_indefinite_repeats > 0)
1479             error (EXIT_FAILURE, 0,
1480                    _("the [c*] construct may appear in string2 only \
1481 when translating"));
1482         }
1483     }
1484 }
1485
1486 /* Read buffers of SIZE bytes via the function READER (if READER is
1487    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1488    read_and_delete or read_and_xlate.  After each buffer is read, it is
1489    processed and written to stdout.  The buffers are processed so that
1490    multiple consecutive occurrences of the same character in the input
1491    stream are replaced by a single occurrence of that character if the
1492    character is in the squeeze set.  */
1493
1494 static void
1495 squeeze_filter (char *buf, size_t size, size_t (*reader) (char *, size_t))
1496 {
1497   /* A value distinct from any character that may have been stored in a
1498      buffer as the result of a block-read in the function squeeze_filter.  */
1499   enum { NOT_A_CHAR = CHAR_MAX + 1 };
1500
1501   int char_to_squeeze = NOT_A_CHAR;
1502   size_t i = 0;
1503   size_t nr = 0;
1504
1505   for (;;)
1506     {
1507       size_t begin;
1508
1509       if (i >= nr)
1510         {
1511           nr = reader (buf, size);
1512           if (nr == 0)
1513             break;
1514           i = 0;
1515         }
1516
1517       begin = i;
1518
1519       if (char_to_squeeze == NOT_A_CHAR)
1520         {
1521           size_t out_len;
1522           /* Here, by being a little tricky, we can get a significant
1523              performance increase in most cases when the input is
1524              reasonably large.  Since tr will modify the input only
1525              if two consecutive (and identical) input characters are
1526              in the squeeze set, we can step by two through the data
1527              when searching for a character in the squeeze set.  This
1528              means there may be a little more work in a few cases and
1529              perhaps twice as much work in the worst cases where most
1530              of the input is removed by squeezing repeats.  But most
1531              uses of this functionality seem to remove less than 20-30%
1532              of the input.  */
1533           for (; i < nr && !in_squeeze_set[to_uchar (buf[i])]; i += 2)
1534             continue;
1535
1536           /* There is a special case when i == nr and we've just
1537              skipped a character (the last one in buf) that is in
1538              the squeeze set.  */
1539           if (i == nr && in_squeeze_set[to_uchar (buf[i - 1])])
1540             --i;
1541
1542           if (i >= nr)
1543             out_len = nr - begin;
1544           else
1545             {
1546               char_to_squeeze = buf[i];
1547               /* We're about to output buf[begin..i].  */
1548               out_len = i - begin + 1;
1549
1550               /* But since we stepped by 2 in the loop above,
1551                  out_len may be one too large.  */
1552               if (i > 0 && buf[i - 1] == char_to_squeeze)
1553                 --out_len;
1554
1555               /* Advance i to the index of first character to be
1556                  considered when looking for a char different from
1557                  char_to_squeeze.  */
1558               ++i;
1559             }
1560           if (out_len > 0
1561               && fwrite (&buf[begin], 1, out_len, stdout) != out_len)
1562             error (EXIT_FAILURE, errno, _("write error"));
1563         }
1564
1565       if (char_to_squeeze != NOT_A_CHAR)
1566         {
1567           /* Advance i to index of first char != char_to_squeeze
1568              (or to nr if all the rest of the characters in this
1569              buffer are the same as char_to_squeeze).  */
1570           for (; i < nr && buf[i] == char_to_squeeze; i++)
1571             continue;
1572           if (i < nr)
1573             char_to_squeeze = NOT_A_CHAR;
1574           /* If (i >= nr) we've squeezed the last character in this buffer.
1575              So now we have to read a new buffer and continue comparing
1576              characters against char_to_squeeze.  */
1577         }
1578     }
1579 }
1580
1581 static size_t
1582 plain_read (char *buf, size_t size)
1583 {
1584   size_t nr = safe_read (STDIN_FILENO, buf, size);
1585   if (nr == SAFE_READ_ERROR)
1586     error (EXIT_FAILURE, errno, _("read error"));
1587   return nr;
1588 }
1589
1590 /* Read buffers of SIZE bytes from stdin until one is found that
1591    contains at least one character not in the delete set.  Store
1592    in the array BUF, all characters from that buffer that are not
1593    in the delete set, and return the number of characters saved
1594    or 0 upon EOF.  */
1595
1596 static size_t
1597 read_and_delete (char *buf, size_t size)
1598 {
1599   size_t n_saved;
1600
1601   /* This enclosing do-while loop is to make sure that
1602      we don't return zero (indicating EOF) when we've
1603      just deleted all the characters in a buffer.  */
1604   do
1605     {
1606       size_t i;
1607       size_t nr = plain_read (buf, size);
1608
1609       if (nr == 0)
1610         return 0;
1611
1612       /* This first loop may be a waste of code, but gives much
1613          better performance when no characters are deleted in
1614          the beginning of a buffer.  It just avoids the copying
1615          of buf[i] into buf[n_saved] when it would be a NOP.  */
1616
1617       for (i = 0; i < nr && !in_delete_set[to_uchar (buf[i])]; i++)
1618         continue;
1619       n_saved = i;
1620
1621       for (++i; i < nr; i++)
1622         if (!in_delete_set[to_uchar (buf[i])])
1623           buf[n_saved++] = buf[i];
1624     }
1625   while (n_saved == 0);
1626
1627   return n_saved;
1628 }
1629
1630 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1631    perform the in-place and one-to-one mapping specified by the global
1632    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1633
1634 static size_t
1635 read_and_xlate (char *buf, size_t size)
1636 {
1637   size_t bytes_read = plain_read (buf, size);
1638   size_t i;
1639
1640   for (i = 0; i < bytes_read; i++)
1641     buf[i] = xlate[to_uchar (buf[i])];
1642
1643   return bytes_read;
1644 }
1645
1646 /* Initialize a boolean membership set, IN_SET, with the character
1647    values obtained by traversing the linked list of constructs S
1648    using the function `get_next'.  IN_SET is expected to have been
1649    initialized to all zeros by the caller.  If COMPLEMENT_THIS_SET
1650    is true the resulting set is complemented.  */
1651
1652 static void
1653 set_initialize (struct Spec_list *s, bool complement_this_set, bool *in_set)
1654 {
1655   int c;
1656   size_t i;
1657
1658   s->state = BEGIN_STATE;
1659   while ((c = get_next (s, NULL)) != -1)
1660     in_set[c] = true;
1661   if (complement_this_set)
1662     for (i = 0; i < N_CHARS; i++)
1663       in_set[i] = (!in_set[i]);
1664 }
1665
1666 int
1667 main (int argc, char **argv)
1668 {
1669   int c;
1670   int non_option_args;
1671   int min_operands;
1672   int max_operands;
1673   struct Spec_list buf1, buf2;
1674   struct Spec_list *s1 = &buf1;
1675   struct Spec_list *s2 = &buf2;
1676
1677   initialize_main (&argc, &argv);
1678   program_name = argv[0];
1679   setlocale (LC_ALL, "");
1680   bindtextdomain (PACKAGE, LOCALEDIR);
1681   textdomain (PACKAGE);
1682
1683   atexit (close_stdout);
1684
1685   while ((c = getopt_long (argc, argv, "+cCdst", long_options, NULL)) != -1)
1686     {
1687       switch (c)
1688         {
1689         case 'c':
1690         case 'C':
1691           complement = true;
1692           break;
1693
1694         case 'd':
1695           delete = true;
1696           break;
1697
1698         case 's':
1699           squeeze_repeats = true;
1700           break;
1701
1702         case 't':
1703           truncate_set1 = true;
1704           break;
1705
1706         case_GETOPT_HELP_CHAR;
1707
1708         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1709
1710         default:
1711           usage (EXIT_FAILURE);
1712           break;
1713         }
1714     }
1715
1716   non_option_args = argc - optind;
1717   translating = (non_option_args == 2 && !delete);
1718   min_operands = 1 + (delete == squeeze_repeats);
1719   max_operands = 1 + (delete <= squeeze_repeats);
1720
1721   if (non_option_args < min_operands)
1722     {
1723       if (non_option_args == 0)
1724         error (0, 0, _("missing operand"));
1725       else
1726         {
1727           error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
1728           fprintf (stderr, "%s\n",
1729                    _(squeeze_repeats
1730                      ? ("Two strings must be given when "
1731                         "both deleting and squeezing repeats.")
1732                      : "Two strings must be given when translating."));
1733         }
1734       usage (EXIT_FAILURE);
1735     }
1736
1737   if (max_operands < non_option_args)
1738     {
1739       error (0, 0, _("extra operand %s"), quote (argv[optind + max_operands]));
1740       if (non_option_args == 2)
1741         fprintf (stderr, "%s\n",
1742                  _("Only one string may be given when "
1743                    "deleting without squeezing repeats."));
1744       usage (EXIT_FAILURE);
1745     }
1746
1747   spec_init (s1);
1748   if (!parse_str (argv[optind], s1))
1749     exit (EXIT_FAILURE);
1750
1751   if (non_option_args == 2)
1752     {
1753       spec_init (s2);
1754       if (!parse_str (argv[optind + 1], s2))
1755         exit (EXIT_FAILURE);
1756     }
1757   else
1758     s2 = NULL;
1759
1760   validate (s1, s2);
1761
1762   /* Use binary I/O, since `tr' is sometimes used to transliterate
1763      non-printable characters, or characters which are stripped away
1764      by text-mode reads (like CR and ^Z).  */
1765   if (O_BINARY && ! isatty (STDIN_FILENO))
1766     freopen (NULL, "rb", stdin);
1767   if (O_BINARY && ! isatty (STDOUT_FILENO))
1768     freopen (NULL, "wb", stdout);
1769
1770   if (squeeze_repeats && non_option_args == 1)
1771     {
1772       set_initialize (s1, complement, in_squeeze_set);
1773       squeeze_filter (io_buf, sizeof io_buf, plain_read);
1774     }
1775   else if (delete && non_option_args == 1)
1776     {
1777       set_initialize (s1, complement, in_delete_set);
1778
1779       for (;;)
1780         {
1781           size_t nr = read_and_delete (io_buf, sizeof io_buf);
1782           if (nr == 0)
1783             break;
1784           if (fwrite (io_buf, 1, nr, stdout) != nr)
1785             error (EXIT_FAILURE, errno, _("write error"));
1786         }
1787     }
1788   else if (squeeze_repeats && delete && non_option_args == 2)
1789     {
1790       set_initialize (s1, complement, in_delete_set);
1791       set_initialize (s2, false, in_squeeze_set);
1792       squeeze_filter (io_buf, sizeof io_buf, read_and_delete);
1793     }
1794   else if (translating)
1795     {
1796       if (complement)
1797         {
1798           int i;
1799           bool *in_s1 = in_delete_set;
1800
1801           set_initialize (s1, false, in_s1);
1802           s2->state = BEGIN_STATE;
1803           for (i = 0; i < N_CHARS; i++)
1804             xlate[i] = i;
1805           for (i = 0; i < N_CHARS; i++)
1806             {
1807               if (!in_s1[i])
1808                 {
1809                   int ch = get_next (s2, NULL);
1810                   assert (ch != -1 || truncate_set1);
1811                   if (ch == -1)
1812                     {
1813                       /* This will happen when tr is invoked like e.g.
1814                          tr -cs A-Za-z0-9 '\012'.  */
1815                       break;
1816                     }
1817                   xlate[i] = ch;
1818                 }
1819             }
1820         }
1821       else
1822         {
1823           int c1, c2;
1824           int i;
1825           enum Upper_Lower_class class_s1;
1826           enum Upper_Lower_class class_s2;
1827
1828           for (i = 0; i < N_CHARS; i++)
1829             xlate[i] = i;
1830           s1->state = BEGIN_STATE;
1831           s2->state = BEGIN_STATE;
1832           for (;;)
1833             {
1834               c1 = get_next (s1, &class_s1);
1835               c2 = get_next (s2, &class_s2);
1836
1837               /* When constructing the translation array, either one of the
1838                  values returned by paired calls to get_next must be from
1839                  [:upper:] and the other is [:lower:], or neither can be from
1840                  upper or lower.  */
1841
1842               if ((class_s1 == UL_NONE) != (class_s2 == UL_NONE))
1843                 error (EXIT_FAILURE, 0,
1844                        _("misaligned [:upper:] and/or [:lower:] construct"));
1845
1846               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1847                 {
1848                   for (i = 0; i < N_CHARS; i++)
1849                     if (islower (i))
1850                       xlate[i] = toupper (i);
1851                 }
1852               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1853                 {
1854                   for (i = 0; i < N_CHARS; i++)
1855                     if (isupper (i))
1856                       xlate[i] = tolower (i);
1857                 }
1858               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1859                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1860                 {
1861                   /* POSIX says the behavior of `tr "[:upper:]" "[:upper:]"'
1862                      is undefined.  Treat it as a no-op.  */
1863                 }
1864               else
1865                 {
1866                   /* The following should have been checked by validate...  */
1867                   if (c1 == -1 || c2 == -1)
1868                     break;
1869                   xlate[c1] = c2;
1870                 }
1871             }
1872           assert (c1 == -1 || truncate_set1);
1873         }
1874       if (squeeze_repeats)
1875         {
1876           set_initialize (s2, false, in_squeeze_set);
1877           squeeze_filter (io_buf, sizeof io_buf, read_and_xlate);
1878         }
1879       else
1880         {
1881           for (;;)
1882             {
1883               size_t bytes_read = read_and_xlate (io_buf, sizeof io_buf);
1884               if (bytes_read == 0)
1885                 break;
1886               if (fwrite (io_buf, 1, bytes_read, stdout) != bytes_read)
1887                 error (EXIT_FAILURE, errno, _("write error"));
1888             }
1889         }
1890     }
1891
1892   if (close (STDIN_FILENO) != 0)
1893     error (EXIT_FAILURE, errno, _("standard input"));
1894
1895   exit (EXIT_SUCCESS);
1896 }