src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 1995-1998, 1999, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <assert.h>
  24 #include <errno.h>
  25 #include <sys/types.h>
  26 #include <getopt.h>
  27
  28 #include "system.h"
  29 #include "closeout.h"
  30 #include "error.h"
  31 #include "safe-read.h"
  32 #include "xstrtol.h"
  33
  34 /* The official name of this program (e.g., no `g' prefix).  */
  35 #define PROGRAM_NAME "tr"
  36
  37 #define AUTHORS "Jim Meyering"
  38
  39 #define N_CHARS (UCHAR_MAX + 1)
  40
  41 /* A pointer to a filtering function.  */
  42 typedef size_t (*Filter) (/* unsigned char *, size_t, Filter */);
  43
  44 /* Convert from character C to its index in the collating
  45    sequence array.  Just cast to an unsigned int to avoid
  46    problems with sign-extension.  */
  47 #define ORD(c) (unsigned int)(c)
  48
  49 /* The inverse of ORD.  */
  50 #define CHR(i) (unsigned char)(i)
  51
  52 /* The value for Spec_list->state that indicates to
  53    get_next that it should initialize the tail pointer.
  54    Its value should be as large as possible to avoid conflict
  55    a valid value for the state field -- and that may be as
  56    large as any valid repeat_count.  */
  57 #define BEGIN_STATE (INT_MAX - 1)
  58
  59 /* The value for Spec_list->state that indicates to
  60    get_next that the element pointed to by Spec_list->tail is
  61    being considered for the first time on this pass through the
  62    list -- it indicates that get_next should make any necessary
  63    initializations.  */
  64 #define NEW_ELEMENT (BEGIN_STATE + 1)
  65
  66 /* A value distinct from any character that may have been stored in a
  67    buffer as the result of a block-read in the function squeeze_filter.  */
  68 #define NOT_A_CHAR (unsigned int)(-1)
  69
  70 /* The following (but not CC_NO_CLASS) are indices into the array of
  71    valid character class strings.  */
  72 enum Char_class
  73   {
  74     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  75     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  76     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  77     CC_NO_CLASS = 9999
  78   };
  79
  80 /* Character class to which a character (returned by get_next) belonged;
  81    but it is set only if the construct from which the character was obtained
  82    was one of the character classes [:upper:] or [:lower:].  The value
  83    is used only when translating and then, only to make sure that upper
  84    and lower class constructs have the same relative positions in string1
  85    and string2.  */
  86 enum Upper_Lower_class
  87   {
  88     UL_LOWER = 0,
  89     UL_UPPER = 1,
  90     UL_NONE = 2
  91   };
  92
  93 /* A shortcut to ensure that when constructing the translation array,
  94    one of the values returned by paired calls to get_next (from s1 and s2)
  95    is from [:upper:] and the other is from [:lower:], or neither is from
  96    upper or lower.  By default, GNU tr permits the identity mappings: from
  97    [:upper:] to [:upper:] and [:lower:] to [:lower:].  But when
  98    POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
  99    by values of type enum Upper_Lower_class.  */
 100 static int const class_ok[3][3] =
 101 {
 102   {1, 1, 0},
 103   {1, 1, 0},
 104   {0, 0, 1}
 105 };
 106
 107 /* The type of a List_element.  See build_spec_list for more details.  */
 108 enum Range_element_type
 109   {
 110     RE_NO_TYPE = 0,
 111     RE_NORMAL_CHAR,
 112     RE_RANGE,
 113     RE_CHAR_CLASS,
 114     RE_EQUIV_CLASS,
 115     RE_REPEATED_CHAR
 116   };
 117
 118 /* One construct in one of tr's argument strings.
 119    For example, consider the POSIX version of the classic tr command:
 120        tr -cs 'a-zA-Z_' '[\n*]'
 121    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 122    and a single normal character, `_'.  String2 has one construct.  */
 123 struct List_element
 124   {
 125     enum Range_element_type type;
 126     struct List_element *next;
 127     union
 128       {
 129         int normal_char;
 130         struct                  /* unnamed */
 131           {
 132             unsigned int first_char;
 133             unsigned int last_char;
 134           }
 135         range;
 136         enum Char_class char_class;
 137         int equiv_code;
 138         struct                  /* unnamed */
 139           {
 140             unsigned int the_repeated_char;
 141             size_t repeat_count;
 142           }
 143         repeated_char;
 144       }
 145     u;
 146   };
 147
 148 /* Each of tr's argument strings is parsed into a form that is easier
 149    to work with: a linked list of constructs (struct List_element).
 150    Each Spec_list structure also encapsulates various attributes of
 151    the corresponding argument string.  The attributes are used mainly
 152    to verify that the strings are valid in the context of any options
 153    specified (like -s, -d, or -c).  The main exception is the member
 154    `tail', which is first used to construct the list.  After construction,
 155    it is used by get_next to save its state when traversing the list.
 156    The member `state' serves a similar function.  */
 157 struct Spec_list
 158   {
 159     /* Points to the head of the list of range elements.
 160        The first struct is a dummy; its members are never used.  */
 161     struct List_element *head;
 162
 163     /* When appending, points to the last element.  When traversing via
 164        get_next(), points to the element to process next.  Setting
 165        Spec_list.state to the value BEGIN_STATE before calling get_next
 166        signals get_next to initialize tail to point to head->next.  */
 167     struct List_element *tail;
 168
 169     /* Used to save state between calls to get_next().  */
 170     unsigned int state;
 171
 172     /* Length, in the sense that length ('a-z[:digit:]123abc')
 173        is 42 ( = 26 + 10 + 6).  */
 174     size_t length;
 175
 176     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 177     int n_indefinite_repeats;
 178
 179     /* If n_indefinite_repeats is nonzero, this points to the List_element
 180        corresponding to the last [c*] or [c*0] construct encountered in
 181        this spec.  Otherwise it is undefined.  */
 182     struct List_element *indefinite_repeat_element;
 183
 184     /* Non-zero if this spec contains at least one equivalence
 185        class construct e.g. [=c=].  */
 186     int has_equiv_class;
 187
 188     /* Non-zero if this spec contains at least one character class
 189        construct.  E.g. [:digit:].  */
 190     int has_char_class;
 191
 192     /* Non-zero if this spec contains at least one of the character class
 193        constructs (all but upper and lower) that aren't allowed in s2.  */
 194     int has_restricted_char_class;
 195   };
 196
 197 /* A representation for escaped string1 or string2.  As a string is parsed,
 198    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 199    etc.) are marked as such in this structure by setting the corresponding
 200    entry in the ESCAPED vector.  */
 201 struct E_string
 202 {
 203   unsigned char *s;
 204   int *escaped;
 205   size_t len;
 206 };
 207
 208 /* Return nonzero if the Ith character of escaped string ES matches C
 209    and is not escaped itself.  */
 210 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 211
 212 /* The name by which this program was run.  */
 213 char *program_name;
 214
 215 /* When nonzero, each sequence in the input of a repeated character
 216    (call it c) is replaced (in the output) by a single occurrence of c
 217    for every c in the squeeze set.  */
 218 static int squeeze_repeats = 0;
 219
 220 /* When nonzero, removes characters in the delete set from input.  */
 221 static int delete = 0;
 222
 223 /* Use the complement of set1 in place of set1.  */
 224 static int complement = 0;
 225
 226 /* When nonzero, this flag causes GNU tr to provide strict
 227    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 228    says that when -d is used without -s, string2 (if present)
 229    must be ignored.  Silently ignoring arguments is a bad idea.
 230    The default GNU behavior is to give a usage message and exit.
 231    Additionally, when this flag is nonzero, tr prints warnings
 232    on stderr if it is being used in a manner that is not portable.
 233    Applicable warnings are given by default, but are suppressed
 234    if the environment variable `POSIXLY_CORRECT' is set, since
 235    being POSIX conformant means we can't issue such messages.
 236    Warnings on the following topics are suppressed when this
 237    variable is nonzero:
 238    1. Ambiguous octal escapes.  */
 239 static int posix_pedantic;
 240
 241 /* When tr is performing translation and string1 is longer than string2,
 242    POSIX says that the result is undefined.  That gives the implementor
 243    of a POSIX conforming version of tr two reasonable choices for the
 244    semantics of this case.
 245
 246    * The BSD tr pads string2 to the length of string1 by
 247    repeating the last character in string2.
 248
 249    * System V tr ignores characters in string1 that have no
 250    corresponding character in string2.  That is, string1 is effectively
 251    truncated to the length of string2.
 252
 253    When nonzero, this flag causes GNU tr to imitate the behavior
 254    of System V tr when translating with string1 longer than string2.
 255    The default is to emulate BSD tr.  This flag is ignored in modes where
 256    no translation is performed.  Emulating the System V tr
 257    in this exceptional case causes the relatively common BSD idiom:
 258
 259        tr -cs A-Za-z0-9 '\012'
 260
 261    to break (it would convert only zero bytes, rather than all
 262    non-alphanumerics, to newlines).
 263
 264    WARNING: This switch does not provide general BSD or System V
 265    compatibility.  For example, it doesn't disable the interpretation
 266    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 267    some unfortunate coincidence you use such constructs in scripts
 268    expecting to use some other version of tr, the scripts will break.  */
 269 static int truncate_set1 = 0;
 270
 271 /* An alias for (!delete && non_option_args == 2).
 272    It is set in main and used there and in validate().  */
 273 static int translating;
 274
 275 #ifndef BUFSIZ
 276 # define BUFSIZ 8192
 277 #endif
 278
 279 #define IO_BUF_SIZE BUFSIZ
 280 static unsigned char io_buf[IO_BUF_SIZE];
 281
 282 static char const *const char_class_name[] =
 283 {
 284   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 285   "lower", "print", "punct", "space", "upper", "xdigit"
 286 };
 287 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 288
 289 typedef char SET_TYPE;
 290
 291 /* Array of boolean values.  A character `c' is a member of the
 292    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 293    set is defined by the last (possibly, the only) string argument
 294    on the command line when the squeeze option is given.  */
 295 static SET_TYPE in_squeeze_set[N_CHARS];
 296
 297 /* Array of boolean values.  A character `c' is a member of the
 298    delete set if and only if in_delete_set[c] is true.  The delete
 299    set is defined by the first (or only) string argument on the
 300    command line when the delete option is given.  */
 301 static SET_TYPE in_delete_set[N_CHARS];
 302
 303 /* Array of character values defining the translation (if any) that
 304    tr is to perform.  Translation is performed only when there are
 305    two specification strings and the delete switch is not given.  */
 306 static char xlate[N_CHARS];
 307
 308 static struct option const long_options[] =
 309 {
 310   {"complement", no_argument, NULL, 'c'},
 311   {"delete", no_argument, NULL, 'd'},
 312   {"squeeze-repeats", no_argument, NULL, 's'},
 313   {"truncate-set1", no_argument, NULL, 't'},
 314   {GETOPT_HELP_OPTION_DECL},
 315   {GETOPT_VERSION_OPTION_DECL},
 316   {NULL, 0, NULL, 0}
 317 };
 318 \f
 319 void
 320 usage (int status)
 321 {
 322   if (status != 0)
 323     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 324              program_name);
 325   else
 326     {
 327       printf (_("\
 328 Usage: %s [OPTION]... SET1 [SET2]\n\
 329 "),
 330               program_name);
 331       fputs (_("\
 332 Translate, squeeze, and/or delete characters from standard input,\n\
 333 writing to standard output.\n\
 334 \n\
 335   -c, --complement        first complement SET1\n\
 336   -d, --delete            delete characters in SET1, do not translate\n\
 337   -s, --squeeze-repeats   replace sequence of characters with one\n\
 338   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 339 "), stdout);
 340       fputs (_("\
 341       --help              display this help and exit\n\
 342       --version           output version information and exit\n\
 343 "), stdout);
 344       fputs (_("\
 345 \n\
 346 SETs are specified as strings of characters.  Most represent themselves.\n\
 347 Interpreted sequences are:\n\
 348 \n\
 349   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 350   \\\\              backslash\n\
 351   \\a              audible BEL\n\
 352   \\b              backspace\n\
 353   \\f              form feed\n\
 354   \\n              new line\n\
 355   \\r              return\n\
 356   \\t              horizontal tab\n\
 357 "), stdout);
 358      fputs (_("\
 359   \\v              vertical tab\n\
 360   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 361   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 362   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 363   [:alnum:]       all letters and digits\n\
 364   [:alpha:]       all letters\n\
 365   [:blank:]       all horizontal whitespace\n\
 366   [:cntrl:]       all control characters\n\
 367   [:digit:]       all digits\n\
 368 "), stdout);
 369      fputs (_("\
 370   [:graph:]       all printable characters, not including space\n\
 371   [:lower:]       all lower case letters\n\
 372   [:print:]       all printable characters, including space\n\
 373   [:punct:]       all punctuation characters\n\
 374   [:space:]       all horizontal or vertical whitespace\n\
 375   [:upper:]       all upper case letters\n\
 376   [:xdigit:]      all hexadecimal digits\n\
 377   [=CHAR=]        all characters which are equivalent to CHAR\n\
 378 "), stdout);
 379      fputs (_("\
 380 \n\
 381 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 382 -t may be used only when translating.  SET2 is extended to length of\n\
 383 SET1 by repeating its last character as necessary.  Excess characters\n\
 384 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 385 "), stdout);
 386      fputs (_("\
 387 expand in ascending order; used in SET2 while translating, they may\n\
 388 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 389 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 390 translation or deletion.\n\
 391 "), stdout);
 392       puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
 393     }
 394   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 395 }
 396
 397 /* Return nonzero if the character C is a member of the
 398    equivalence class containing the character EQUIV_CLASS.  */
 399
 400 static int
 401 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
 402 {
 403   return (equiv_class == c);
 404 }
 405
 406 /* Return nonzero if the character C is a member of the
 407    character class CHAR_CLASS.  */
 408
 409 static int
 410 is_char_class_member (enum Char_class char_class, unsigned int c)
 411 {
 412   int result;
 413
 414   switch (char_class)
 415     {
 416     case CC_ALNUM:
 417       result = ISALNUM (c);
 418       break;
 419     case CC_ALPHA:
 420       result = ISALPHA (c);
 421       break;
 422     case CC_BLANK:
 423       result = ISBLANK (c);
 424       break;
 425     case CC_CNTRL:
 426       result = ISCNTRL (c);
 427       break;
 428     case CC_DIGIT:
 429       result = ISDIGIT_LOCALE (c);
 430       break;
 431     case CC_GRAPH:
 432       result = ISGRAPH (c);
 433       break;
 434     case CC_LOWER:
 435       result = ISLOWER (c);
 436       break;
 437     case CC_PRINT:
 438       result = ISPRINT (c);
 439       break;
 440     case CC_PUNCT:
 441       result = ISPUNCT (c);
 442       break;
 443     case CC_SPACE:
 444       result = ISSPACE (c);
 445       break;
 446     case CC_UPPER:
 447       result = ISUPPER (c);
 448       break;
 449     case CC_XDIGIT:
 450       result = ISXDIGIT (c);
 451       break;
 452     default:
 453       abort ();
 454       break;
 455     }
 456   return result;
 457 }
 458
 459 static void
 460 es_free (struct E_string *es)
 461 {
 462   free (es->s);
 463   free (es->escaped);
 464 }
 465
 466 /* Perform the first pass over each range-spec argument S, converting all
 467    \c and \ddd escapes to their one-byte representations.  The conversion
 468    is done in-place, so S must point to writable storage.  If an invalid
 469    quote sequence is found print an error message and return nonzero.
 470    Otherwise set *LEN to the length of the resulting string and return
 471    zero.  The resulting array of characters may contain zero-bytes;
 472    however, on input, S is assumed to be null-terminated, and hence
 473    cannot contain actual (non-escaped) zero bytes.  */
 474
 475 static int
 476 unquote (const unsigned char *s, struct E_string *es)
 477 {
 478   size_t i, j;
 479   size_t len;
 480
 481   len = strlen ((char *) s);
 482
 483   es->s = (unsigned char *) xmalloc (len);
 484   es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
 485   for (i = 0; i < len; i++)
 486     es->escaped[i] = 0;
 487
 488   j = 0;
 489   for (i = 0; s[i]; i++)
 490     {
 491       switch (s[i])
 492         {
 493           int c;
 494         case '\\':
 495           switch (s[i + 1])
 496             {
 497               int oct_digit;
 498             case '\\':
 499               c = '\\';
 500               break;
 501             case 'a':
 502               c = '\007';
 503               break;
 504             case 'b':
 505               c = '\b';
 506               break;
 507             case 'f':
 508               c = '\f';
 509               break;
 510             case 'n':
 511               c = '\n';
 512               break;
 513             case 'r':
 514               c = '\r';
 515               break;
 516             case 't':
 517               c = '\t';
 518               break;
 519             case 'v':
 520               c = '\v';
 521               break;
 522             case '0':
 523             case '1':
 524             case '2':
 525             case '3':
 526             case '4':
 527             case '5':
 528             case '6':
 529             case '7':
 530               c = s[i + 1] - '0';
 531               oct_digit = s[i + 2] - '0';
 532               if (0 <= oct_digit && oct_digit <= 7)
 533                 {
 534                   c = 8 * c + oct_digit;
 535                   ++i;
 536                   oct_digit = s[i + 2] - '0';
 537                   if (0 <= oct_digit && oct_digit <= 7)
 538                     {
 539                       if (8 * c + oct_digit < N_CHARS)
 540                         {
 541                           c = 8 * c + oct_digit;
 542                           ++i;
 543                         }
 544                       else if (!posix_pedantic)
 545                         {
 546                           /* Any octal number larger than 0377 won't
 547                              fit in 8 bits.  So we stop when adding the
 548                              next digit would put us over the limit and
 549                              give a warning about the ambiguity.  POSIX
 550                              isn't clear on this, but one person has said
 551                              that in his interpretation, POSIX says tr
 552                              can't even give a warning.  */
 553                           error (0, 0, _("warning: the ambiguous octal escape \
 554 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 555                                  s[i], s[i + 1], s[i + 2],
 556                                  s[i], s[i + 1], s[i + 2]);
 557                         }
 558                     }
 559                 }
 560               break;
 561             case '\0':
 562               error (0, 0, _("invalid backslash escape at end of string"));
 563               return 1;
 564
 565             default:
 566               if (posix_pedantic)
 567                 {
 568                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 569                   return 1;
 570                 }
 571               else
 572                 {
 573                   c = s[i + 1];
 574                   es->escaped[j] = 1;
 575                 }
 576             }
 577           ++i;
 578           es->s[j++] = c;
 579           break;
 580         default:
 581           es->s[j++] = s[i];
 582           break;
 583         }
 584     }
 585   es->len = j;
 586   return 0;
 587 }
 588
 589 /* If CLASS_STR is a valid character class string, return its index
 590    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 591
 592 static enum Char_class
 593 look_up_char_class (const unsigned char *class_str, size_t len)
 594 {
 595   unsigned int i;
 596
 597   for (i = 0; i < N_CHAR_CLASSES; i++)
 598     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 599         && strlen (char_class_name[i]) == len)
 600       return (enum Char_class) i;
 601   return CC_NO_CLASS;
 602 }
 603
 604 /* Return a newly allocated string with a printable version of C.
 605    This function is used solely for formatting error messages.  */
 606
 607 static char *
 608 make_printable_char (unsigned int c)
 609 {
 610   char *buf = xmalloc (5);
 611
 612   assert (c < N_CHARS);
 613   if (ISPRINT (c))
 614     {
 615       buf[0] = c;
 616       buf[1] = '\0';
 617     }
 618   else
 619     {
 620       sprintf (buf, "\\%03o", c);
 621     }
 622   return buf;
 623 }
 624
 625 /* Return a newly allocated copy of S which is suitable for printing.
 626    LEN is the number of characters in S.  Most non-printing
 627    (isprint) characters are represented by a backslash followed by
 628    3 octal digits.  However, the characters represented by \c escapes
 629    where c is one of [abfnrtv] are represented by their 2-character \c
 630    sequences.  This function is used solely for printing error messages.  */
 631
 632 static char *
 633 make_printable_str (const unsigned char *s, size_t len)
 634 {
 635   /* Worst case is that every character expands to a backslash
 636      followed by a 3-character octal escape sequence.  */
 637   char *printable_buf = xmalloc (4 * len + 1);
 638   char *p = printable_buf;
 639   size_t i;
 640
 641   for (i = 0; i < len; i++)
 642     {
 643       char buf[5];
 644       char *tmp = NULL;
 645
 646       switch (s[i])
 647         {
 648         case '\\':
 649           tmp = "\\";
 650           break;
 651         case '\007':
 652           tmp = "\\a";
 653           break;
 654         case '\b':
 655           tmp = "\\b";
 656           break;
 657         case '\f':
 658           tmp = "\\f";
 659           break;
 660         case '\n':
 661           tmp = "\\n";
 662           break;
 663         case '\r':
 664           tmp = "\\r";
 665           break;
 666         case '\t':
 667           tmp = "\\t";
 668           break;
 669         case '\v':
 670           tmp = "\\v";
 671           break;
 672         default:
 673           if (ISPRINT (s[i]))
 674             {
 675               buf[0] = s[i];
 676               buf[1] = '\0';
 677             }
 678           else
 679             sprintf (buf, "\\%03o", s[i]);
 680           tmp = buf;
 681           break;
 682         }
 683       p = stpcpy (p, tmp);
 684     }
 685   return printable_buf;
 686 }
 687
 688 /* Append a newly allocated structure representing a
 689    character C to the specification list LIST.  */
 690
 691 static void
 692 append_normal_char (struct Spec_list *list, unsigned int c)
 693 {
 694   struct List_element *new;
 695
 696   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 697   new->next = NULL;
 698   new->type = RE_NORMAL_CHAR;
 699   new->u.normal_char = c;
 700   assert (list->tail);
 701   list->tail->next = new;
 702   list->tail = new;
 703 }
 704
 705 /* Append a newly allocated structure representing the range
 706    of characters from FIRST to LAST to the specification list LIST.
 707    Return nonzero if LAST precedes FIRST in the collating sequence,
 708    zero otherwise.  This means that '[c-c]' is acceptable.  */
 709
 710 static int
 711 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 712 {
 713   struct List_element *new;
 714
 715   if (ORD (first) > ORD (last))
 716     {
 717       char *tmp1 = make_printable_char (first);
 718       char *tmp2 = make_printable_char (last);
 719
 720       error (0, 0,
 721        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 722              tmp1, tmp2);
 723       free (tmp1);
 724       free (tmp2);
 725       return 1;
 726     }
 727   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 728   new->next = NULL;
 729   new->type = RE_RANGE;
 730   new->u.range.first_char = first;
 731   new->u.range.last_char = last;
 732   assert (list->tail);
 733   list->tail->next = new;
 734   list->tail = new;
 735   return 0;
 736 }
 737
 738 /* If CHAR_CLASS_STR is a valid character class string, append a
 739    newly allocated structure representing that character class to the end
 740    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 741    a valid string return nonzero.  */
 742
 743 static int
 744 append_char_class (struct Spec_list *list,
 745                    const unsigned char *char_class_str, size_t len)
 746 {
 747   enum Char_class char_class;
 748   struct List_element *new;
 749
 750   char_class = look_up_char_class (char_class_str, len);
 751   if (char_class == CC_NO_CLASS)
 752     return 1;
 753   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 754   new->next = NULL;
 755   new->type = RE_CHAR_CLASS;
 756   new->u.char_class = char_class;
 757   assert (list->tail);
 758   list->tail->next = new;
 759   list->tail = new;
 760   return 0;
 761 }
 762
 763 /* Append a newly allocated structure representing a [c*n]
 764    repeated character construct to the specification list LIST.
 765    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 766    is a non-negative repeat count.  */
 767
 768 static void
 769 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 770                       size_t repeat_count)
 771 {
 772   struct List_element *new;
 773
 774   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 775   new->next = NULL;
 776   new->type = RE_REPEATED_CHAR;
 777   new->u.repeated_char.the_repeated_char = the_char;
 778   new->u.repeated_char.repeat_count = repeat_count;
 779   assert (list->tail);
 780   list->tail->next = new;
 781   list->tail = new;
 782 }
 783
 784 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 785    the length of that string, LEN, if LEN is exactly one, append
 786    a newly allocated structure representing the specified
 787    equivalence class to the specification list, LIST and return zero.
 788    If LEN is not 1, return nonzero.  */
 789
 790 static int
 791 append_equiv_class (struct Spec_list *list,
 792                     const unsigned char *equiv_class_str, size_t len)
 793 {
 794   struct List_element *new;
 795
 796   if (len != 1)
 797     return 1;
 798   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 799   new->next = NULL;
 800   new->type = RE_EQUIV_CLASS;
 801   new->u.equiv_code = *equiv_class_str;
 802   assert (list->tail);
 803   list->tail->next = new;
 804   list->tail = new;
 805   return 0;
 806 }
 807
 808 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 809    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 810    NUL bytes, and is *not* NUL-terminated.  */
 811
 812 static unsigned char *
 813 substr (const unsigned char *p, size_t first_idx, size_t last_idx)
 814 {
 815   size_t len;
 816   unsigned char *tmp;
 817
 818   assert (first_idx <= last_idx);
 819   len = last_idx - first_idx + 1;
 820   tmp = (unsigned char *) xmalloc (len);
 821
 822   assert (first_idx <= last_idx);
 823   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 824   memcpy (tmp, p + first_idx, len);
 825   return tmp;
 826 }
 827
 828 /* Search forward starting at START_IDX for the 2-char sequence
 829    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 830    a sequence is found, set *RESULT_IDX to the index of the first
 831    character and return nonzero. Otherwise return zero.  P may contain
 832    zero bytes.  */
 833
 834 static int
 835 find_closing_delim (const struct E_string *es, size_t start_idx,
 836                     unsigned int pre_bracket_char, size_t *result_idx)
 837 {
 838   size_t i;
 839
 840   for (i = start_idx; i < es->len - 1; i++)
 841     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 842         && !es->escaped[i] && !es->escaped[i + 1])
 843       {
 844         *result_idx = i;
 845         return 1;
 846       }
 847   return 0;
 848 }
 849
 850 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 851    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 852    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 853    and return zero. If the second character following
 854    the opening bracket is not `*' or if no closing bracket can be
 855    found, return -1.  If a closing bracket is found and the
 856    second char is `*', but the string between the `*' and `]' isn't
 857    empty, an octal number, or a decimal number, print an error message
 858    and return -2.  */
 859
 860 static int
 861 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 862                        unsigned int *char_to_repeat, size_t *repeat_count,
 863                        size_t *closing_bracket_idx)
 864 {
 865   size_t i;
 866
 867   assert (start_idx + 1 < es->len);
 868   if (!ES_MATCH (es, start_idx + 1, '*'))
 869     return -1;
 870
 871   for (i = start_idx + 2; i < es->len; i++)
 872     {
 873       if (ES_MATCH (es, i, ']'))
 874         {
 875           size_t digit_str_len = i - start_idx - 2;
 876
 877           *char_to_repeat = es->s[start_idx];
 878           if (digit_str_len == 0)
 879             {
 880               /* We've matched [c*] -- no explicit repeat count.  */
 881               *repeat_count = 0;
 882               *closing_bracket_idx = i;
 883               return 0;
 884             }
 885
 886           /* Here, we have found [c*s] where s should be a string
 887              of octal (if it starts with `0') or decimal digits.  */
 888           {
 889             const char *digit_str = (const char *) &es->s[start_idx + 2];
 890             unsigned long int tmp_ulong;
 891             char *d_end;
 892             int base = 10;
 893             /* Select the base manually so we can be sure it's either 8 or 10.
 894                If the spec allowed it to be interpreted as hexadecimal, we
 895                could have used `0' and let xstrtoul decide.  */
 896             if (*digit_str == '0')
 897               {
 898                 base = 8;
 899                 ++digit_str;
 900                 --digit_str_len;
 901               }
 902             if (xstrtoul (digit_str, &d_end, base, &tmp_ulong, NULL)
 903                   != LONGINT_OK
 904                 || BEGIN_STATE < tmp_ulong
 905                 || d_end - digit_str != digit_str_len)
 906               {
 907                 char *tmp = make_printable_str (es->s + start_idx + 2,
 908                                                 i - start_idx - 2);
 909                 error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 910                        tmp);
 911                 free (tmp);
 912                 return -2;
 913               }
 914             *repeat_count = tmp_ulong;
 915           }
 916           *closing_bracket_idx = i;
 917           return 0;
 918         }
 919     }
 920   return -1;                    /* No bracket found.  */
 921 }
 922
 923 /* Return nonzero if the string at ES->s[IDX] matches the regular
 924    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 925    the `]' must not be escaped.  */
 926
 927 static int
 928 star_digits_closebracket (const struct E_string *es, size_t idx)
 929 {
 930   size_t i;
 931
 932   if (!ES_MATCH (es, idx, '*'))
 933     return 0;
 934
 935   for (i = idx + 1; i < es->len; i++)
 936     {
 937       if (!ISDIGIT (es->s[i]))
 938         {
 939           if (ES_MATCH (es, i, ']'))
 940             return 1;
 941           return 0;
 942         }
 943     }
 944   return 0;
 945 }
 946
 947 /* Convert string UNESCAPED_STRING (which has been preprocessed to
 948    convert backslash-escape sequences) of length LEN characters into
 949    a linked list of the following 5 types of constructs:
 950       - [:str:] Character class where `str' is one of the 12 valid strings.
 951       - [=c=] Equivalence class where `c' is any single character.
 952       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 953           However, if `n' is present, it must be a non-negative octal or
 954           decimal integer.
 955       - r-s Range of characters from `r' to `s'.  The second endpoint must
 956           not precede the first in the current collating sequence.
 957       - c Any other character is interpreted as itself.  */
 958
 959 static int
 960 build_spec_list (const struct E_string *es, struct Spec_list *result)
 961 {
 962   const unsigned char *p;
 963   size_t i;
 964
 965   p = es->s;
 966
 967   /* The main for-loop below recognizes the 4 multi-character constructs.
 968      A character that matches (in its context) none of the multi-character
 969      constructs is classified as `normal'.  Since all multi-character
 970      constructs have at least 3 characters, any strings of length 2 or
 971      less are composed solely of normal characters.  Hence, the index of
 972      the outer for-loop runs only as far as LEN-2.  */
 973
 974   for (i = 0; i + 2 < es->len; /* empty */)
 975     {
 976       if (ES_MATCH (es, i, '['))
 977         {
 978           int matched_multi_char_construct;
 979           size_t closing_bracket_idx;
 980           unsigned int char_to_repeat;
 981           size_t repeat_count;
 982           int err;
 983
 984           matched_multi_char_construct = 1;
 985           if (ES_MATCH (es, i + 1, ':')
 986               || ES_MATCH (es, i + 1, '='))
 987             {
 988               size_t closing_delim_idx;
 989               int found;
 990
 991               found = find_closing_delim (es, i + 2, p[i + 1],
 992                                           &closing_delim_idx);
 993               if (found)
 994                 {
 995                   int parse_failed;
 996                   unsigned char *opnd_str = substr (p, i + 2,
 997                                                     closing_delim_idx - 1);
 998                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
 999
1000                   if (p[i + 1] == ':')
1001                     {
1002                       parse_failed = append_char_class (result, opnd_str,
1003                                                         opnd_str_len);
1004
1005                       /* FIXME: big comment.  */
1006                       if (parse_failed)
1007                         {
1008                           if (star_digits_closebracket (es, i + 2))
1009                             {
1010                               free (opnd_str);
1011                               goto try_bracketed_repeat;
1012                             }
1013                           else
1014                             {
1015                               char *tmp = make_printable_str (opnd_str,
1016                                                               opnd_str_len);
1017                               error (0, 0, _("invalid character class `%s'"),
1018                                      tmp);
1019                               free (tmp);
1020                               return 1;
1021                             }
1022                         }
1023                     }
1024                   else
1025                     {
1026                       parse_failed = append_equiv_class (result, opnd_str,
1027                                                          opnd_str_len);
1028
1029                       /* FIXME: big comment.  */
1030                       if (parse_failed)
1031                         {
1032                           if (star_digits_closebracket (es, i + 2))
1033                             {
1034                               free (opnd_str);
1035                               goto try_bracketed_repeat;
1036                             }
1037                           else
1038                             {
1039                               char *tmp = make_printable_str (opnd_str,
1040                                                               opnd_str_len);
1041                               error (0, 0,
1042                _("%s: equivalence class operand must be a single character"),
1043                                      tmp);
1044                               free (tmp);
1045                               return 1;
1046                             }
1047                         }
1048                     }
1049                   free (opnd_str);
1050
1051                   /* Return nonzero if append_*_class reports a problem.  */
1052                   if (parse_failed)
1053                     return 1;
1054                   else
1055                     i = closing_delim_idx + 2;
1056                   continue;
1057                 }
1058               /* Else fall through.  This could be [:*] or [=*].  */
1059             }
1060
1061         try_bracketed_repeat:
1062
1063           /* Determine whether this is a bracketed repeat range
1064              matching the RE \[.\*(dec_or_oct_number)?\].  */
1065           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1066                                        &repeat_count,
1067                                        &closing_bracket_idx);
1068           if (err == 0)
1069             {
1070               append_repeated_char (result, char_to_repeat, repeat_count);
1071               i = closing_bracket_idx + 1;
1072             }
1073           else if (err == -1)
1074             {
1075               matched_multi_char_construct = 0;
1076             }
1077           else
1078             {
1079               /* Found a string that looked like [c*n] but the
1080                  numeric part was invalid.  */
1081               return 1;
1082             }
1083
1084           if (matched_multi_char_construct)
1085             continue;
1086
1087           /* We reach this point if P does not match [:str:], [=c=],
1088              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1089              (from `[' to `c').  */
1090         }
1091
1092       /* Look ahead one char for ranges like a-z.  */
1093       if (ES_MATCH (es, i + 1, '-'))
1094         {
1095           if (append_range (result, p[i], p[i + 2]))
1096             return 1;
1097           i += 3;
1098         }
1099       else
1100         {
1101           append_normal_char (result, p[i]);
1102           ++i;
1103         }
1104     }
1105
1106   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1107   for (; i < es->len; i++)
1108     append_normal_char (result, p[i]);
1109
1110   return 0;
1111 }
1112
1113 /* Given a Spec_list S (with its saved state implicit in the values
1114    of its members `tail' and `state'), return the next single character
1115    in the expansion of S's constructs.  If the last character of S was
1116    returned on the previous call or if S was empty, this function
1117    returns -1.  For example, successive calls to get_next where S
1118    represents the spec-string 'a-d[y*3]' will return the sequence
1119    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1120    which the returned character comes is [:upper:] or [:lower:], the
1121    parameter CLASS is given a value to indicate which it was.  Otherwise
1122    CLASS is set to UL_NONE.  This value is used only when constructing
1123    the translation table to verify that any occurrences of upper and
1124    lower class constructs in the spec-strings appear in the same relative
1125    positions.  */
1126
1127 static int
1128 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1129 {
1130   struct List_element *p;
1131   int return_val;
1132   int i;
1133
1134   if (class)
1135     *class = UL_NONE;
1136
1137   if (s->state == BEGIN_STATE)
1138     {
1139       s->tail = s->head->next;
1140       s->state = NEW_ELEMENT;
1141     }
1142
1143   p = s->tail;
1144   if (p == NULL)
1145     return -1;
1146
1147   switch (p->type)
1148     {
1149     case RE_NORMAL_CHAR:
1150       return_val = p->u.normal_char;
1151       s->state = NEW_ELEMENT;
1152       s->tail = p->next;
1153       break;
1154
1155     case RE_RANGE:
1156       if (s->state == NEW_ELEMENT)
1157         s->state = ORD (p->u.range.first_char);
1158       else
1159         ++(s->state);
1160       return_val = CHR (s->state);
1161       if (s->state == ORD (p->u.range.last_char))
1162         {
1163           s->tail = p->next;
1164           s->state = NEW_ELEMENT;
1165         }
1166       break;
1167
1168     case RE_CHAR_CLASS:
1169       if (class)
1170         {
1171           int upper_or_lower;
1172           switch (p->u.char_class)
1173             {
1174             case CC_LOWER:
1175               *class = UL_LOWER;
1176               upper_or_lower = 1;
1177               break;
1178             case CC_UPPER:
1179               *class = UL_UPPER;
1180               upper_or_lower = 1;
1181               break;
1182             default:
1183               upper_or_lower = 0;
1184               break;
1185             }
1186
1187           if (upper_or_lower)
1188             {
1189               s->tail = p->next;
1190               s->state = NEW_ELEMENT;
1191               return_val = 0;
1192               break;
1193             }
1194         }
1195
1196       if (s->state == NEW_ELEMENT)
1197         {
1198           for (i = 0; i < N_CHARS; i++)
1199             if (is_char_class_member (p->u.char_class, i))
1200               break;
1201           assert (i < N_CHARS);
1202           s->state = i;
1203         }
1204       assert (is_char_class_member (p->u.char_class, s->state));
1205       return_val = CHR (s->state);
1206       for (i = s->state + 1; i < N_CHARS; i++)
1207         if (is_char_class_member (p->u.char_class, i))
1208           break;
1209       if (i < N_CHARS)
1210         s->state = i;
1211       else
1212         {
1213           s->tail = p->next;
1214           s->state = NEW_ELEMENT;
1215         }
1216       break;
1217
1218     case RE_EQUIV_CLASS:
1219       /* FIXME: this assumes that each character is alone in its own
1220          equivalence class (which appears to be correct for my
1221          LC_COLLATE.  But I don't know of any function that allows
1222          one to determine a character's equivalence class.  */
1223
1224       return_val = p->u.equiv_code;
1225       s->state = NEW_ELEMENT;
1226       s->tail = p->next;
1227       break;
1228
1229     case RE_REPEATED_CHAR:
1230       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1231       if (p->u.repeated_char.repeat_count == 0)
1232         {
1233           s->tail = p->next;
1234           s->state = NEW_ELEMENT;
1235           return_val = get_next (s, class);
1236         }
1237       else
1238         {
1239           if (s->state == NEW_ELEMENT)
1240             {
1241               s->state = 0;
1242             }
1243           ++(s->state);
1244           return_val = p->u.repeated_char.the_repeated_char;
1245           if (p->u.repeated_char.repeat_count > 0
1246               && s->state == p->u.repeated_char.repeat_count)
1247             {
1248               s->tail = p->next;
1249               s->state = NEW_ELEMENT;
1250             }
1251         }
1252       break;
1253
1254     case RE_NO_TYPE:
1255       abort ();
1256       break;
1257
1258     default:
1259       abort ();
1260       break;
1261     }
1262
1263   return return_val;
1264 }
1265
1266 /* This is a minor kludge.  This function is called from
1267    get_spec_stats to determine the cardinality of a set derived
1268    from a complemented string.  It's a kludge in that some of the
1269    same operations are (duplicated) performed in set_initialize.  */
1270
1271 static int
1272 card_of_complement (struct Spec_list *s)
1273 {
1274   int c;
1275   int cardinality = N_CHARS;
1276   SET_TYPE in_set[N_CHARS];
1277
1278   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1279   s->state = BEGIN_STATE;
1280   while ((c = get_next (s, NULL)) != -1)
1281     if (!in_set[c]++)
1282       --cardinality;
1283   return cardinality;
1284 }
1285
1286 /* Gather statistics about the spec-list S in preparation for the tests
1287    in validate that determine the consistency of the specs.  This function
1288    is called at most twice; once for string1, and again for any string2.
1289    LEN_S1 < 0 indicates that this is the first call and that S represents
1290    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1291    constructs in string1, and we can use its value to resolve any
1292    indefinite repeat construct in S (which represents string2).  Hence,
1293    this function has the side-effect that it converts a valid [c*]
1294    construct in string2 to [c*n] where n is large enough (or 0) to give
1295    string2 the same length as string1.  For example, with the command
1296    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1297    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1298
1299 static void
1300 get_spec_stats (struct Spec_list *s)
1301 {
1302   struct List_element *p;
1303   int len = 0;
1304
1305   s->n_indefinite_repeats = 0;
1306   s->has_equiv_class = 0;
1307   s->has_restricted_char_class = 0;
1308   s->has_char_class = 0;
1309   for (p = s->head->next; p; p = p->next)
1310     {
1311       switch (p->type)
1312         {
1313           int i;
1314         case RE_NORMAL_CHAR:
1315           ++len;
1316           break;
1317
1318         case RE_RANGE:
1319           assert (p->u.range.last_char >= p->u.range.first_char);
1320           len += p->u.range.last_char - p->u.range.first_char + 1;
1321           break;
1322
1323         case RE_CHAR_CLASS:
1324           s->has_char_class = 1;
1325           for (i = 0; i < N_CHARS; i++)
1326             if (is_char_class_member (p->u.char_class, i))
1327               ++len;
1328           switch (p->u.char_class)
1329             {
1330             case CC_UPPER:
1331             case CC_LOWER:
1332               break;
1333             default:
1334               s->has_restricted_char_class = 1;
1335               break;
1336             }
1337           break;
1338
1339         case RE_EQUIV_CLASS:
1340           for (i = 0; i < N_CHARS; i++)
1341             if (is_equiv_class_member (p->u.equiv_code, i))
1342               ++len;
1343           s->has_equiv_class = 1;
1344           break;
1345
1346         case RE_REPEATED_CHAR:
1347           if (p->u.repeated_char.repeat_count > 0)
1348             len += p->u.repeated_char.repeat_count;
1349           else if (p->u.repeated_char.repeat_count == 0)
1350             {
1351               s->indefinite_repeat_element = p;
1352               ++(s->n_indefinite_repeats);
1353             }
1354           break;
1355
1356         case RE_NO_TYPE:
1357           assert (0);
1358           break;
1359         }
1360     }
1361
1362   s->length = len;
1363 }
1364
1365 static void
1366 get_s1_spec_stats (struct Spec_list *s1)
1367 {
1368   get_spec_stats (s1);
1369   if (complement)
1370     s1->length = card_of_complement (s1);
1371 }
1372
1373 static void
1374 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1375 {
1376   get_spec_stats (s2);
1377   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1378     {
1379       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1380         len_s1 - s2->length;
1381       s2->length = len_s1;
1382     }
1383 }
1384
1385 static void
1386 spec_init (struct Spec_list *spec_list)
1387 {
1388   spec_list->head = spec_list->tail =
1389     (struct List_element *) xmalloc (sizeof (struct List_element));
1390   spec_list->head->next = NULL;
1391 }
1392
1393 /* This function makes two passes over the argument string S.  The first
1394    one converts all \c and \ddd escapes to their one-byte representations.
1395    The second constructs a linked specification list, SPEC_LIST, of the
1396    characters and constructs that comprise the argument string.  If either
1397    of these passes detects an error, this function returns nonzero.  */
1398
1399 static int
1400 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1401 {
1402   struct E_string es;
1403   int fail;
1404
1405   fail = unquote (s, &es);
1406   if (!fail)
1407     fail = build_spec_list (&es, spec_list);
1408   es_free (&es);
1409   return fail;
1410 }
1411
1412 /* Given two specification lists, S1 and S2, and assuming that
1413    S1->length > S2->length, append a single [c*n] element to S2 where c
1414    is the last character in the expansion of S2 and n is the difference
1415    between the two lengths.
1416    Upon successful completion, S2->length is set to S1->length.  The only
1417    way this function can fail to make S2 as long as S1 is when S2 has
1418    zero-length, since in that case, there is no last character to repeat.
1419    So S2->length is required to be at least 1.
1420
1421    Providing this functionality allows the user to do some pretty
1422    non-BSD (and non-portable) things:  For example, the command
1423        tr -cs '[:upper:]0-9' '[:lower:]'
1424    is almost guaranteed to give results that depend on your collating
1425    sequence.  */
1426
1427 static void
1428 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1429 {
1430   struct List_element *p;
1431   int char_to_repeat;
1432   int i;
1433
1434   assert (translating);
1435   assert (s1->length > s2->length);
1436   assert (s2->length > 0);
1437
1438   p = s2->tail;
1439   switch (p->type)
1440     {
1441     case RE_NORMAL_CHAR:
1442       char_to_repeat = p->u.normal_char;
1443       break;
1444     case RE_RANGE:
1445       char_to_repeat = p->u.range.last_char;
1446       break;
1447     case RE_CHAR_CLASS:
1448       for (i = N_CHARS; i >= 0; i--)
1449         if (is_char_class_member (p->u.char_class, i))
1450           break;
1451       assert (i >= 0);
1452       char_to_repeat = CHR (i);
1453       break;
1454
1455     case RE_REPEATED_CHAR:
1456       char_to_repeat = p->u.repeated_char.the_repeated_char;
1457       break;
1458
1459     case RE_EQUIV_CLASS:
1460       /* This shouldn't happen, because validate exits with an error
1461          if it finds an equiv class in string2 when translating.  */
1462       abort ();
1463       break;
1464
1465     case RE_NO_TYPE:
1466       abort ();
1467       break;
1468
1469     default:
1470       abort ();
1471       break;
1472     }
1473
1474   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1475   s2->length = s1->length;
1476 }
1477
1478 /* Return non-zero if S is a non-empty list in which exactly one
1479    character (but potentially, many instances of it) appears.
1480    E.g.  [X*] or xxxxxxxx.  */
1481
1482 static int
1483 homogeneous_spec_list (struct Spec_list *s)
1484 {
1485   int b, c;
1486
1487   s->state = BEGIN_STATE;
1488
1489   if ((b = get_next (s, NULL)) == -1)
1490     return 0;
1491
1492   while ((c = get_next (s, NULL)) != -1)
1493     if (c != b)
1494       return 0;
1495
1496   return 1;
1497 }
1498
1499 /* Die with an error message if S1 and S2 describe strings that
1500    are not valid with the given command line switches.
1501    A side effect of this function is that if a valid [c*] or
1502    [c*0] construct appears in string2, it is converted to [c*n]
1503    with a value for n that makes s2->length == s1->length.  By
1504    the same token, if the --truncate-set1 option is not
1505    given, S2 may be extended.  */
1506
1507 static void
1508 validate (struct Spec_list *s1, struct Spec_list *s2)
1509 {
1510   get_s1_spec_stats (s1);
1511   if (s1->n_indefinite_repeats > 0)
1512     {
1513       error (EXIT_FAILURE, 0,
1514              _("the [c*] repeat construct may not appear in string1"));
1515     }
1516
1517   if (s2)
1518     {
1519       get_s2_spec_stats (s2, s1->length);
1520
1521       if (s2->n_indefinite_repeats > 1)
1522         {
1523           error (EXIT_FAILURE, 0,
1524                  _("only one [c*] repeat construct may appear in string2"));
1525         }
1526
1527       if (translating)
1528         {
1529           if (s2->has_equiv_class)
1530             {
1531               error (EXIT_FAILURE, 0,
1532                      _("[=c=] expressions may not appear in string2 \
1533 when translating"));
1534             }
1535
1536           if (s1->length > s2->length)
1537             {
1538               if (!truncate_set1)
1539                 {
1540                   /* string2 must be non-empty unless --truncate-set1 is
1541                      given or string1 is empty.  */
1542
1543                   if (s2->length == 0)
1544                     error (EXIT_FAILURE, 0,
1545                      _("when not truncating set1, string2 must be non-empty"));
1546                   string2_extend (s1, s2);
1547                 }
1548             }
1549
1550           if (complement && s1->has_char_class
1551               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1552             {
1553               error (EXIT_FAILURE, 0,
1554                      _("when translating with complemented character classes,\
1555 \nstring2 must map all characters in the domain to one"));
1556             }
1557
1558           if (s2->has_restricted_char_class)
1559             {
1560               error (EXIT_FAILURE, 0,
1561                      _("when translating, the only character classes that may \
1562 appear in\nstring2 are `upper' and `lower'"));
1563             }
1564         }
1565       else
1566         /* Not translating.  */
1567         {
1568           if (s2->n_indefinite_repeats > 0)
1569             error (EXIT_FAILURE, 0,
1570                    _("the [c*] construct may appear in string2 only \
1571 when translating"));
1572         }
1573     }
1574 }
1575
1576 /* Read buffers of SIZE bytes via the function READER (if READER is
1577    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1578    read_and_delete or read_and_xlate.  After each buffer is read, it is
1579    processed and written to stdout.  The buffers are processed so that
1580    multiple consecutive occurrences of the same character in the input
1581    stream are replaced by a single occurrence of that character if the
1582    character is in the squeeze set.  */
1583
1584 static void
1585 squeeze_filter (unsigned char *buf, size_t size, Filter reader)
1586 {
1587   unsigned int char_to_squeeze = NOT_A_CHAR;
1588   size_t i = 0;
1589   ssize_t nr = 0;
1590
1591   for (;;)
1592     {
1593       size_t begin;
1594
1595       if (i >= nr)
1596         {
1597           if (reader == NULL)
1598             {
1599               ssize_t signed_nr = safe_read (0, (char *) buf, size);
1600               if (signed_nr < 0)
1601                 error (EXIT_FAILURE, errno, _("read error"));
1602               nr = signed_nr;
1603             }
1604           else
1605             {
1606               nr = (*reader) (buf, size, NULL);
1607             }
1608
1609           if (nr == 0)
1610             break;
1611           i = 0;
1612         }
1613
1614       begin = i;
1615
1616       if (char_to_squeeze == NOT_A_CHAR)
1617         {
1618           size_t out_len;
1619           /* Here, by being a little tricky, we can get a significant
1620              performance increase in most cases when the input is
1621              reasonably large.  Since tr will modify the input only
1622              if two consecutive (and identical) input characters are
1623              in the squeeze set, we can step by two through the data
1624              when searching for a character in the squeeze set.  This
1625              means there may be a little more work in a few cases and
1626              perhaps twice as much work in the worst cases where most
1627              of the input is removed by squeezing repeats.  But most
1628              uses of this functionality seem to remove less than 20-30%
1629              of the input.  */
1630           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1631             ;                   /* empty */
1632
1633           /* There is a special case when i == nr and we've just
1634              skipped a character (the last one in buf) that is in
1635              the squeeze set.  */
1636           if (i == nr && in_squeeze_set[buf[i - 1]])
1637             --i;
1638
1639           if (i >= nr)
1640             out_len = nr - begin;
1641           else
1642             {
1643               char_to_squeeze = buf[i];
1644               /* We're about to output buf[begin..i].  */
1645               out_len = i - begin + 1;
1646
1647               /* But since we stepped by 2 in the loop above,
1648                  out_len may be one too large.  */
1649               if (i > 0 && buf[i - 1] == char_to_squeeze)
1650                 --out_len;
1651
1652               /* Advance i to the index of first character to be
1653                  considered when looking for a char different from
1654                  char_to_squeeze.  */
1655               ++i;
1656             }
1657           if (out_len > 0
1658               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1659             error (EXIT_FAILURE, errno, _("write error"));
1660         }
1661
1662       if (char_to_squeeze != NOT_A_CHAR)
1663         {
1664           /* Advance i to index of first char != char_to_squeeze
1665              (or to nr if all the rest of the characters in this
1666              buffer are the same as char_to_squeeze).  */
1667           for (; i < nr && buf[i] == char_to_squeeze; i++)
1668             ;                   /* empty */
1669           if (i < nr)
1670             char_to_squeeze = NOT_A_CHAR;
1671           /* If (i >= nr) we've squeezed the last character in this buffer.
1672              So now we have to read a new buffer and continue comparing
1673              characters against char_to_squeeze.  */
1674         }
1675     }
1676 }
1677
1678 /* Read buffers of SIZE bytes from stdin until one is found that
1679    contains at least one character not in the delete set.  Store
1680    in the array BUF, all characters from that buffer that are not
1681    in the delete set, and return the number of characters saved
1682    or 0 upon EOF.  */
1683
1684 static size_t
1685 read_and_delete (unsigned char *buf, size_t size, Filter not_used)
1686 {
1687   size_t n_saved;
1688   static int hit_eof = 0;
1689
1690   assert (not_used == NULL);
1691
1692   if (hit_eof)
1693     return 0;
1694
1695   /* This enclosing do-while loop is to make sure that
1696      we don't return zero (indicating EOF) when we've
1697      just deleted all the characters in a buffer.  */
1698   do
1699     {
1700       size_t i;
1701       ssize_t nr = safe_read (0, (char *) buf, size);
1702
1703       if (nr < 0)
1704         error (EXIT_FAILURE, errno, _("read error"));
1705       if (nr == 0)
1706         {
1707           hit_eof = 1;
1708           return 0;
1709         }
1710
1711       /* This first loop may be a waste of code, but gives much
1712          better performance when no characters are deleted in
1713          the beginning of a buffer.  It just avoids the copying
1714          of buf[i] into buf[n_saved] when it would be a NOP.  */
1715
1716       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1717         /* empty */ ;
1718       n_saved = i;
1719
1720       for (++i; i < nr; i++)
1721         if (!in_delete_set[buf[i]])
1722           buf[n_saved++] = buf[i];
1723     }
1724   while (n_saved == 0);
1725
1726   return n_saved;
1727 }
1728
1729 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1730    perform the in-place and one-to-one mapping specified by the global
1731    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1732
1733 static size_t
1734 read_and_xlate (unsigned char *buf, size_t size, Filter not_used)
1735 {
1736   ssize_t chars_read = 0;
1737   static int hit_eof = 0;
1738   size_t i;
1739
1740   assert (not_used == NULL);
1741
1742   if (hit_eof)
1743     return 0;
1744
1745   chars_read = safe_read (0, (char *) buf, size);
1746   if (chars_read < 0)
1747     error (EXIT_FAILURE, errno, _("read error"));
1748   if (chars_read == 0)
1749     {
1750       hit_eof = 1;
1751       return 0;
1752     }
1753
1754   for (i = 0; i < chars_read; i++)
1755     buf[i] = xlate[buf[i]];
1756
1757   return chars_read;
1758 }
1759
1760 /* Initialize a boolean membership set IN_SET with the character
1761    values obtained by traversing the linked list of constructs S
1762    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1763    nonzero the resulting set is complemented.  */
1764
1765 static void
1766 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1767 {
1768   int c;
1769   size_t i;
1770
1771   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1772   s->state = BEGIN_STATE;
1773   while ((c = get_next (s, NULL)) != -1)
1774     in_set[c] = 1;
1775   if (complement_this_set)
1776     for (i = 0; i < N_CHARS; i++)
1777       in_set[i] = (!in_set[i]);
1778 }
1779
1780 int
1781 main (int argc, char **argv)
1782 {
1783   int c;
1784   int non_option_args;
1785   struct Spec_list buf1, buf2;
1786   struct Spec_list *s1 = &buf1;
1787   struct Spec_list *s2 = &buf2;
1788
1789   program_name = argv[0];
1790   setlocale (LC_ALL, "");
1791   bindtextdomain (PACKAGE, LOCALEDIR);
1792   textdomain (PACKAGE);
1793
1794   atexit (close_stdout);
1795
1796   while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1797     {
1798       switch (c)
1799         {
1800         case 0:
1801           break;
1802
1803         case 'c':
1804           complement = 1;
1805           break;
1806
1807         case 'd':
1808           delete = 1;
1809           break;
1810
1811         case 's':
1812           squeeze_repeats = 1;
1813           break;
1814
1815         case 't':
1816           truncate_set1 = 1;
1817           break;
1818
1819         case_GETOPT_HELP_CHAR;
1820
1821         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1822
1823         default:
1824           usage (2);
1825           break;
1826         }
1827     }
1828
1829   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1830
1831   non_option_args = argc - optind;
1832   translating = (non_option_args == 2 && !delete);
1833
1834   /* Change this test if it is valid to give tr no options and
1835      no args at all.  POSIX doesn't specifically say anything
1836      either way, but it looks like they implied it's invalid
1837      by omission.  If you want to make tr do a slow imitation
1838      of `cat' use `tr a a'.  */
1839   if (non_option_args > 2)
1840     {
1841       error (0, 0, _("too many arguments"));
1842       usage (2);
1843     }
1844
1845   if (!delete && !squeeze_repeats && non_option_args != 2)
1846     error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1847
1848   if (delete && squeeze_repeats && non_option_args != 2)
1849     error (EXIT_FAILURE, 0, _("two strings must be given when both \
1850 deleting and squeezing repeats"));
1851
1852   /* If --delete is given without --squeeze-repeats, then
1853      only one string argument may be specified.  But POSIX
1854      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1855      is set, pretend we never saw string2.  But I think
1856      this deserves a fatal error, so that's the default.  */
1857   if ((delete && !squeeze_repeats) && non_option_args != 1)
1858     {
1859       if (posix_pedantic && non_option_args == 2)
1860         --non_option_args;
1861       else
1862         error (EXIT_FAILURE, 0,
1863                _("only one string may be given when deleting \
1864 without squeezing repeats"));
1865     }
1866
1867   if (squeeze_repeats && non_option_args == 0)
1868     error (EXIT_FAILURE, 0,
1869            _("at least one string must be given when squeezing repeats"));
1870
1871   spec_init (s1);
1872   if (parse_str ((unsigned char *) argv[optind], s1))
1873     exit (EXIT_FAILURE);
1874
1875   if (non_option_args == 2)
1876     {
1877       spec_init (s2);
1878       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1879         exit (EXIT_FAILURE);
1880     }
1881   else
1882     s2 = NULL;
1883
1884   validate (s1, s2);
1885
1886   /* Use binary I/O, since `tr' is sometimes used to transliterate
1887      non-printable characters, or characters which are stripped away
1888      by text-mode reads (like CR and ^Z).  */
1889   SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);
1890
1891   if (squeeze_repeats && non_option_args == 1)
1892     {
1893       set_initialize (s1, complement, in_squeeze_set);
1894       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1895     }
1896   else if (delete && non_option_args == 1)
1897     {
1898       size_t nr;
1899
1900       set_initialize (s1, complement, in_delete_set);
1901       do
1902         {
1903           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1904           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1905             error (EXIT_FAILURE, errno, _("write error"));
1906         }
1907       while (nr > 0);
1908     }
1909   else if (squeeze_repeats && delete && non_option_args == 2)
1910     {
1911       set_initialize (s1, complement, in_delete_set);
1912       set_initialize (s2, 0, in_squeeze_set);
1913       squeeze_filter (io_buf, IO_BUF_SIZE, read_and_delete);
1914     }
1915   else if (translating)
1916     {
1917       if (complement)
1918         {
1919           int i;
1920           SET_TYPE *in_s1 = in_delete_set;
1921
1922           set_initialize (s1, 0, in_s1);
1923           s2->state = BEGIN_STATE;
1924           for (i = 0; i < N_CHARS; i++)
1925             xlate[i] = i;
1926           for (i = 0; i < N_CHARS; i++)
1927             {
1928               if (!in_s1[i])
1929                 {
1930                   int ch = get_next (s2, NULL);
1931                   assert (ch != -1 || truncate_set1);
1932                   if (ch == -1)
1933                     {
1934                       /* This will happen when tr is invoked like e.g.
1935                          tr -cs A-Za-z0-9 '\012'.  */
1936                       break;
1937                     }
1938                   xlate[i] = ch;
1939                 }
1940             }
1941           assert (get_next (s2, NULL) == -1 || truncate_set1);
1942         }
1943       else
1944         {
1945           int c1, c2;
1946           int i;
1947           enum Upper_Lower_class class_s1;
1948           enum Upper_Lower_class class_s2;
1949
1950           for (i = 0; i < N_CHARS; i++)
1951             xlate[i] = i;
1952           s1->state = BEGIN_STATE;
1953           s2->state = BEGIN_STATE;
1954           for (;;)
1955             {
1956               c1 = get_next (s1, &class_s1);
1957               c2 = get_next (s2, &class_s2);
1958               if (!class_ok[(int) class_s1][(int) class_s2])
1959                 error (EXIT_FAILURE, 0,
1960                        _("misaligned [:upper:] and/or [:lower:] construct"));
1961
1962               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1963                 {
1964                   for (i = 0; i < N_CHARS; i++)
1965                     if (ISLOWER (i))
1966                       xlate[i] = toupper (i);
1967                 }
1968               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1969                 {
1970                   for (i = 0; i < N_CHARS; i++)
1971                     if (ISUPPER (i))
1972                       xlate[i] = tolower (i);
1973                 }
1974               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1975                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1976                 {
1977                   /* By default, GNU tr permits the identity mappings: from
1978                      [:upper:] to [:upper:] and [:lower:] to [:lower:].  But
1979                      when POSIXLY_CORRECT is set, those evoke diagnostics.  */
1980                   if (posix_pedantic)
1981                     {
1982                       error (EXIT_FAILURE, 0,
1983                              _("\
1984 invalid identity mapping;  when translating, any [:lower:] or [:upper:]\n\
1985 construct in string1 must be aligned with a corresponding construct\n\
1986 ([:upper:] or [:lower:], respectively) in string2"));
1987                     }
1988                 }
1989               else
1990                 {
1991                   /* The following should have been checked by validate...  */
1992                   if (c1 == -1 || c2 == -1)
1993                     break;
1994                   xlate[c1] = c2;
1995                 }
1996             }
1997           assert (c1 == -1 || truncate_set1);
1998         }
1999       if (squeeze_repeats)
2000         {
2001           set_initialize (s2, 0, in_squeeze_set);
2002           squeeze_filter (io_buf, IO_BUF_SIZE, read_and_xlate);
2003         }
2004       else
2005         {
2006           size_t chars_read;
2007
2008           do
2009             {
2010               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2011               if (chars_read > 0
2012                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2013                 error (EXIT_FAILURE, errno, _("write error"));
2014             }
2015           while (chars_read > 0);
2016         }
2017     }
2018
2019   if (close (0) != 0)
2020     error (EXIT_FAILURE, errno, _("standard input"));
2021
2022   exit (EXIT_SUCCESS);
2023 }