src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 1995-1998, 1999 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <assert.h>
  24 #include <errno.h>
  25 #include <sys/types.h>
  26 #include <getopt.h>
  27
  28 #include "system.h"
  29 #include "error.h"
  30 #include "long-options.h"
  31 #include "safe-read.h"
  32
  33 /* The official name of this program (e.g., no `g' prefix).  */
  34 #define PROGRAM_NAME "tr"
  35
  36 #define N_CHARS (UCHAR_MAX + 1)
  37
  38 /* A pointer to a function that returns an int.  */
  39 typedef int (*PFI) ();
  40
  41 /* Convert from character C to its index in the collating
  42    sequence array.  Just cast to an unsigned int to avoid
  43    problems with sign-extension.  */
  44 #define ORD(c) (unsigned int)(c)
  45
  46 /* The inverse of ORD.  */
  47 #define CHR(i) (unsigned char)(i)
  48
  49 /* The value for Spec_list->state that indicates to
  50    get_next that it should initialize the tail pointer.
  51    Its value should be as large as possible to avoid conflict
  52    a valid value for the state field -- and that may be as
  53    large as any valid repeat_count.  */
  54 #define BEGIN_STATE (INT_MAX - 1)
  55
  56 /* The value for Spec_list->state that indicates to
  57    get_next that the element pointed to by Spec_list->tail is
  58    being considered for the first time on this pass through the
  59    list -- it indicates that get_next should make any necessary
  60    initializations.  */
  61 #define NEW_ELEMENT (BEGIN_STATE + 1)
  62
  63 /* A value distinct from any character that may have been stored in a
  64    buffer as the result of a block-read in the function squeeze_filter.  */
  65 #define NOT_A_CHAR (unsigned int)(-1)
  66
  67 /* The following (but not CC_NO_CLASS) are indices into the array of
  68    valid character class strings.  */
  69 enum Char_class
  70   {
  71     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  72     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  73     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  74     CC_NO_CLASS = 9999
  75   };
  76
  77 /* Character class to which a character (returned by get_next) belonged;
  78    but it is set only if the construct from which the character was obtained
  79    was one of the character classes [:upper:] or [:lower:].  The value
  80    is used only when translating and then, only to make sure that upper
  81    and lower class constructs have the same relative positions in string1
  82    and string2.  */
  83 enum Upper_Lower_class
  84   {
  85     UL_LOWER = 0,
  86     UL_UPPER = 1,
  87     UL_NONE = 2
  88   };
  89
  90 /* A shortcut to ensure that when constructing the translation array,
  91    one of the values returned by paired calls to get_next (from s1 and s2)
  92    is from [:upper:] and the other is from [:lower:], or neither is from
  93    upper or lower.  By default, GNU tr permits the identity mappings: from
  94    [:upper:] to [:upper:] and [:lower:] to [:lower:].  But when
  95    POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
  96    by values of type enum Upper_Lower_class.  */
  97 static int const class_ok[3][3] =
  98 {
  99   {1, 1, 0},
 100   {1, 1, 0},
 101   {0, 0, 1}
 102 };
 103
 104 /* The type of a List_element.  See build_spec_list for more details.  */
 105 enum Range_element_type
 106   {
 107     RE_NO_TYPE = 0,
 108     RE_NORMAL_CHAR,
 109     RE_RANGE,
 110     RE_CHAR_CLASS,
 111     RE_EQUIV_CLASS,
 112     RE_REPEATED_CHAR
 113   };
 114
 115 /* One construct in one of tr's argument strings.
 116    For example, consider the POSIX version of the classic tr command:
 117        tr -cs 'a-zA-Z_' '[\n*]'
 118    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 119    and a single normal character, `_'.  String2 has one construct.  */
 120 struct List_element
 121   {
 122     enum Range_element_type type;
 123     struct List_element *next;
 124     union
 125       {
 126         int normal_char;
 127         struct                  /* unnamed */
 128           {
 129             unsigned int first_char;
 130             unsigned int last_char;
 131           }
 132         range;
 133         enum Char_class char_class;
 134         int equiv_code;
 135         struct                  /* unnamed */
 136           {
 137             unsigned int the_repeated_char;
 138             size_t repeat_count;
 139           }
 140         repeated_char;
 141       }
 142     u;
 143   };
 144
 145 /* Each of tr's argument strings is parsed into a form that is easier
 146    to work with: a linked list of constructs (struct List_element).
 147    Each Spec_list structure also encapsulates various attributes of
 148    the corresponding argument string.  The attributes are used mainly
 149    to verify that the strings are valid in the context of any options
 150    specified (like -s, -d, or -c).  The main exception is the member
 151    `tail', which is first used to construct the list.  After construction,
 152    it is used by get_next to save its state when traversing the list.
 153    The member `state' serves a similar function.  */
 154 struct Spec_list
 155   {
 156     /* Points to the head of the list of range elements.
 157        The first struct is a dummy; its members are never used.  */
 158     struct List_element *head;
 159
 160     /* When appending, points to the last element.  When traversing via
 161        get_next(), points to the element to process next.  Setting
 162        Spec_list.state to the value BEGIN_STATE before calling get_next
 163        signals get_next to initialize tail to point to head->next.  */
 164     struct List_element *tail;
 165
 166     /* Used to save state between calls to get_next().  */
 167     unsigned int state;
 168
 169     /* Length, in the sense that length ('a-z[:digit:]123abc')
 170        is 42 ( = 26 + 10 + 6).  */
 171     size_t length;
 172
 173     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 174     int n_indefinite_repeats;
 175
 176     /* If n_indefinite_repeats is nonzero, this points to the List_element
 177        corresponding to the last [c*] or [c*0] construct encountered in
 178        this spec.  Otherwise it is undefined.  */
 179     struct List_element *indefinite_repeat_element;
 180
 181     /* Non-zero if this spec contains at least one equivalence
 182        class construct e.g. [=c=].  */
 183     int has_equiv_class;
 184
 185     /* Non-zero if this spec contains at least one character class
 186        construct.  E.g. [:digit:].  */
 187     int has_char_class;
 188
 189     /* Non-zero if this spec contains at least one of the character class
 190        constructs (all but upper and lower) that aren't allowed in s2.  */
 191     int has_restricted_char_class;
 192   };
 193
 194 /* A representation for escaped string1 or string2.  As a string is parsed,
 195    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 196    etc.) are marked as such in this structure by setting the corresponding
 197    entry in the ESCAPED vector.  */
 198 struct E_string
 199 {
 200   unsigned char *s;
 201   int *escaped;
 202   size_t len;
 203 };
 204
 205 /* Return nonzero if the Ith character of escaped string ES matches C
 206    and is not escaped itself.  */
 207 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 208
 209 /* The name by which this program was run.  */
 210 char *program_name;
 211
 212 /* When nonzero, each sequence in the input of a repeated character
 213    (call it c) is replaced (in the output) by a single occurrence of c
 214    for every c in the squeeze set.  */
 215 static int squeeze_repeats = 0;
 216
 217 /* When nonzero, removes characters in the delete set from input.  */
 218 static int delete = 0;
 219
 220 /* Use the complement of set1 in place of set1.  */
 221 static int complement = 0;
 222
 223 /* When nonzero, this flag causes GNU tr to provide strict
 224    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 225    says that when -d is used without -s, string2 (if present)
 226    must be ignored.  Silently ignoring arguments is a bad idea.
 227    The default GNU behavior is to give a usage message and exit.
 228    Additionally, when this flag is nonzero, tr prints warnings
 229    on stderr if it is being used in a manner that is not portable.
 230    Applicable warnings are given by default, but are suppressed
 231    if the environment variable `POSIXLY_CORRECT' is set, since
 232    being POSIX conformant means we can't issue such messages.
 233    Warnings on the following topics are suppressed when this
 234    variable is nonzero:
 235    1. Ambiguous octal escapes.  */
 236 static int posix_pedantic;
 237
 238 /* When tr is performing translation and string1 is longer than string2,
 239    POSIX says that the result is undefined.  That gives the implementor
 240    of a POSIX conforming version of tr two reasonable choices for the
 241    semantics of this case.
 242
 243    * The BSD tr pads string2 to the length of string1 by
 244    repeating the last character in string2.
 245
 246    * System V tr ignores characters in string1 that have no
 247    corresponding character in string2.  That is, string1 is effectively
 248    truncated to the length of string2.
 249
 250    When nonzero, this flag causes GNU tr to imitate the behavior
 251    of System V tr when translating with string1 longer than string2.
 252    The default is to emulate BSD tr.  This flag is ignored in modes where
 253    no translation is performed.  Emulating the System V tr
 254    in this exceptional case causes the relatively common BSD idiom:
 255
 256        tr -cs A-Za-z0-9 '\012'
 257
 258    to break (it would convert only zero bytes, rather than all
 259    non-alphanumerics, to newlines).
 260
 261    WARNING: This switch does not provide general BSD or System V
 262    compatibility.  For example, it doesn't disable the interpretation
 263    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 264    some unfortunate coincidence you use such constructs in scripts
 265    expecting to use some other version of tr, the scripts will break.  */
 266 static int truncate_set1 = 0;
 267
 268 /* An alias for (!delete && non_option_args == 2).
 269    It is set in main and used there and in validate().  */
 270 static int translating;
 271
 272 #ifndef BUFSIZ
 273 # define BUFSIZ 8192
 274 #endif
 275
 276 #define IO_BUF_SIZE BUFSIZ
 277 static unsigned char io_buf[IO_BUF_SIZE];
 278
 279 static char const *const char_class_name[] =
 280 {
 281   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 282   "lower", "print", "punct", "space", "upper", "xdigit"
 283 };
 284 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 285
 286 typedef char SET_TYPE;
 287
 288 /* Array of boolean values.  A character `c' is a member of the
 289    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 290    set is defined by the last (possibly, the only) string argument
 291    on the command line when the squeeze option is given.  */
 292 static SET_TYPE in_squeeze_set[N_CHARS];
 293
 294 /* Array of boolean values.  A character `c' is a member of the
 295    delete set if and only if in_delete_set[c] is true.  The delete
 296    set is defined by the first (or only) string argument on the
 297    command line when the delete option is given.  */
 298 static SET_TYPE in_delete_set[N_CHARS];
 299
 300 /* Array of character values defining the translation (if any) that
 301    tr is to perform.  Translation is performed only when there are
 302    two specification strings and the delete switch is not given.  */
 303 static char xlate[N_CHARS];
 304
 305 static struct option const long_options[] =
 306 {
 307   {"complement", no_argument, NULL, 'c'},
 308   {"delete", no_argument, NULL, 'd'},
 309   {"squeeze-repeats", no_argument, NULL, 's'},
 310   {"truncate-set1", no_argument, NULL, 't'},
 311   {NULL, 0, NULL, 0}
 312 };
 313 \f
 314 void
 315 usage (int status)
 316 {
 317   if (status != 0)
 318     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 319              program_name);
 320   else
 321     {
 322       printf (_("\
 323 Usage: %s [OPTION]... SET1 [SET2]\n\
 324 "),
 325               program_name);
 326       printf (_("\
 327 Translate, squeeze, and/or delete characters from standard input,\n\
 328 writing to standard output.\n\
 329 \n\
 330   -c, --complement        first complement SET1\n\
 331   -d, --delete            delete characters in SET1, do not translate\n\
 332   -s, --squeeze-repeats   replace sequence of characters with one\n\
 333   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 334       --help              display this help and exit\n\
 335       --version           output version information and exit\n\
 336 "));
 337       printf (_("\
 338 \n\
 339 SETs are specified as strings of characters.  Most represent themselves.\n\
 340 Interpreted sequences are:\n\
 341 \n\
 342   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 343   \\\\              backslash\n\
 344   \\a              audible BEL\n\
 345   \\b              backspace\n\
 346   \\f              form feed\n\
 347   \\n              new line\n\
 348   \\r              return\n\
 349   \\t              horizontal tab\n\
 350   \\v              vertical tab\n\
 351   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 352   [CHAR1-CHAR2]   same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
 353   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 354   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 355   [:alnum:]       all letters and digits\n\
 356   [:alpha:]       all letters\n\
 357   [:blank:]       all horizontal whitespace\n\
 358   [:cntrl:]       all control characters\n\
 359   [:digit:]       all digits\n\
 360   [:graph:]       all printable characters, not including space\n\
 361   [:lower:]       all lower case letters\n\
 362   [:print:]       all printable characters, including space\n\
 363   [:punct:]       all punctuation characters\n\
 364   [:space:]       all horizontal or vertical whitespace\n\
 365   [:upper:]       all upper case letters\n\
 366   [:xdigit:]      all hexadecimal digits\n\
 367   [=CHAR=]        all characters which are equivalent to CHAR\n\
 368 "));
 369       printf (_("\
 370 \n\
 371 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 372 -t may be used only when translating.  SET2 is extended to length of\n\
 373 SET1 by repeating its last character as necessary.  Excess characters\n\
 374 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 375 expand in ascending order; used in SET2 while translating, they may\n\
 376 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 377 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 378 translation or deletion.\n\
 379 "));
 380       puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
 381     }
 382   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 383 }
 384
 385 /* Return nonzero if the character C is a member of the
 386    equivalence class containing the character EQUIV_CLASS.  */
 387
 388 static int
 389 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
 390 {
 391   return (equiv_class == c);
 392 }
 393
 394 /* Return nonzero if the character C is a member of the
 395    character class CHAR_CLASS.  */
 396
 397 static int
 398 is_char_class_member (enum Char_class char_class, unsigned int c)
 399 {
 400   int result;
 401
 402   switch (char_class)
 403     {
 404     case CC_ALNUM:
 405       result = ISALNUM (c);
 406       break;
 407     case CC_ALPHA:
 408       result = ISALPHA (c);
 409       break;
 410     case CC_BLANK:
 411       result = ISBLANK (c);
 412       break;
 413     case CC_CNTRL:
 414       result = ISCNTRL (c);
 415       break;
 416     case CC_DIGIT:
 417       result = ISDIGIT_LOCALE (c);
 418       break;
 419     case CC_GRAPH:
 420       result = ISGRAPH (c);
 421       break;
 422     case CC_LOWER:
 423       result = ISLOWER (c);
 424       break;
 425     case CC_PRINT:
 426       result = ISPRINT (c);
 427       break;
 428     case CC_PUNCT:
 429       result = ISPUNCT (c);
 430       break;
 431     case CC_SPACE:
 432       result = ISSPACE (c);
 433       break;
 434     case CC_UPPER:
 435       result = ISUPPER (c);
 436       break;
 437     case CC_XDIGIT:
 438       result = ISXDIGIT (c);
 439       break;
 440     default:
 441       abort ();
 442       break;
 443     }
 444   return result;
 445 }
 446
 447 static void
 448 es_free (struct E_string *es)
 449 {
 450   free (es->s);
 451   free (es->escaped);
 452 }
 453
 454 /* Perform the first pass over each range-spec argument S, converting all
 455    \c and \ddd escapes to their one-byte representations.  The conversion
 456    is done in-place, so S must point to writable storage.  If an invalid
 457    quote sequence is found print an error message and return nonzero.
 458    Otherwise set *LEN to the length of the resulting string and return
 459    zero.  The resulting array of characters may contain zero-bytes;
 460    however, on input, S is assumed to be null-terminated, and hence
 461    cannot contain actual (non-escaped) zero bytes.  */
 462
 463 static int
 464 unquote (const unsigned char *s, struct E_string *es)
 465 {
 466   size_t i, j;
 467   size_t len;
 468
 469   len = strlen ((char *) s);
 470
 471   es->s = (unsigned char *) xmalloc (len);
 472   es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
 473   for (i = 0; i < len; i++)
 474     es->escaped[i] = 0;
 475
 476   j = 0;
 477   for (i = 0; s[i]; i++)
 478     {
 479       switch (s[i])
 480         {
 481           int c;
 482         case '\\':
 483           switch (s[i + 1])
 484             {
 485               int oct_digit;
 486             case '\\':
 487               c = '\\';
 488               break;
 489             case 'a':
 490               c = '\007';
 491               break;
 492             case 'b':
 493               c = '\b';
 494               break;
 495             case 'f':
 496               c = '\f';
 497               break;
 498             case 'n':
 499               c = '\n';
 500               break;
 501             case 'r':
 502               c = '\r';
 503               break;
 504             case 't':
 505               c = '\t';
 506               break;
 507             case 'v':
 508               c = '\v';
 509               break;
 510             case '0':
 511             case '1':
 512             case '2':
 513             case '3':
 514             case '4':
 515             case '5':
 516             case '6':
 517             case '7':
 518               c = s[i + 1] - '0';
 519               oct_digit = s[i + 2] - '0';
 520               if (0 <= oct_digit && oct_digit <= 7)
 521                 {
 522                   c = 8 * c + oct_digit;
 523                   ++i;
 524                   oct_digit = s[i + 2] - '0';
 525                   if (0 <= oct_digit && oct_digit <= 7)
 526                     {
 527                       if (8 * c + oct_digit < N_CHARS)
 528                         {
 529                           c = 8 * c + oct_digit;
 530                           ++i;
 531                         }
 532                       else if (!posix_pedantic)
 533                         {
 534                           /* Any octal number larger than 0377 won't
 535                              fit in 8 bits.  So we stop when adding the
 536                              next digit would put us over the limit and
 537                              give a warning about the ambiguity.  POSIX
 538                              isn't clear on this, but one person has said
 539                              that in his interpretation, POSIX says tr
 540                              can't even give a warning.  */
 541                           error (0, 0, _("warning: the ambiguous octal escape \
 542 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 543                                  s[i], s[i + 1], s[i + 2],
 544                                  s[i], s[i + 1], s[i + 2]);
 545                         }
 546                     }
 547                 }
 548               break;
 549             case '\0':
 550               error (0, 0, _("invalid backslash escape at end of string"));
 551               return 1;
 552
 553             default:
 554               if (posix_pedantic)
 555                 {
 556                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 557                   return 1;
 558                 }
 559               else
 560                 {
 561                   c = s[i + 1];
 562                   es->escaped[j] = 1;
 563                 }
 564             }
 565           ++i;
 566           es->s[j++] = c;
 567           break;
 568         default:
 569           es->s[j++] = s[i];
 570           break;
 571         }
 572     }
 573   es->len = j;
 574   return 0;
 575 }
 576
 577 /* If CLASS_STR is a valid character class string, return its index
 578    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 579
 580 static enum Char_class
 581 look_up_char_class (const unsigned char *class_str, size_t len)
 582 {
 583   unsigned int i;
 584
 585   for (i = 0; i < N_CHAR_CLASSES; i++)
 586     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 587         && strlen (char_class_name[i]) == len)
 588       return (enum Char_class) i;
 589   return CC_NO_CLASS;
 590 }
 591
 592 /* Return a newly allocated string with a printable version of C.
 593    This function is used solely for formatting error messages.  */
 594
 595 static char *
 596 make_printable_char (unsigned int c)
 597 {
 598   char *buf = xmalloc (5);
 599
 600   assert (c < N_CHARS);
 601   if (ISPRINT (c))
 602     {
 603       buf[0] = c;
 604       buf[1] = '\0';
 605     }
 606   else
 607     {
 608       sprintf (buf, "\\%03o", c);
 609     }
 610   return buf;
 611 }
 612
 613 /* Return a newly allocated copy of S which is suitable for printing.
 614    LEN is the number of characters in S.  Most non-printing
 615    (isprint) characters are represented by a backslash followed by
 616    3 octal digits.  However, the characters represented by \c escapes
 617    where c is one of [abfnrtv] are represented by their 2-character \c
 618    sequences.  This function is used solely for printing error messages.  */
 619
 620 static char *
 621 make_printable_str (const unsigned char *s, size_t len)
 622 {
 623   /* Worst case is that every character expands to a backslash
 624      followed by a 3-character octal escape sequence.  */
 625   char *printable_buf = xmalloc (4 * len + 1);
 626   char *p = printable_buf;
 627   size_t i;
 628
 629   for (i = 0; i < len; i++)
 630     {
 631       char buf[5];
 632       char *tmp = NULL;
 633
 634       switch (s[i])
 635         {
 636         case '\\':
 637           tmp = "\\";
 638           break;
 639         case '\007':
 640           tmp = "\\a";
 641           break;
 642         case '\b':
 643           tmp = "\\b";
 644           break;
 645         case '\f':
 646           tmp = "\\f";
 647           break;
 648         case '\n':
 649           tmp = "\\n";
 650           break;
 651         case '\r':
 652           tmp = "\\r";
 653           break;
 654         case '\t':
 655           tmp = "\\t";
 656           break;
 657         case '\v':
 658           tmp = "\\v";
 659           break;
 660         default:
 661           if (ISPRINT (s[i]))
 662             {
 663               buf[0] = s[i];
 664               buf[1] = '\0';
 665             }
 666           else
 667             sprintf (buf, "\\%03o", s[i]);
 668           tmp = buf;
 669           break;
 670         }
 671       p = stpcpy (p, tmp);
 672     }
 673   return printable_buf;
 674 }
 675
 676 /* Append a newly allocated structure representing a
 677    character C to the specification list LIST.  */
 678
 679 static void
 680 append_normal_char (struct Spec_list *list, unsigned int c)
 681 {
 682   struct List_element *new;
 683
 684   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 685   new->next = NULL;
 686   new->type = RE_NORMAL_CHAR;
 687   new->u.normal_char = c;
 688   assert (list->tail);
 689   list->tail->next = new;
 690   list->tail = new;
 691 }
 692
 693 /* Append a newly allocated structure representing the range
 694    of characters from FIRST to LAST to the specification list LIST.
 695    Return nonzero if LAST precedes FIRST in the collating sequence,
 696    zero otherwise.  This means that '[c-c]' is acceptable.  */
 697
 698 static int
 699 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 700 {
 701   struct List_element *new;
 702
 703   if (ORD (first) > ORD (last))
 704     {
 705       char *tmp1 = make_printable_char (first);
 706       char *tmp2 = make_printable_char (last);
 707
 708       error (0, 0,
 709        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 710              tmp1, tmp2);
 711       free (tmp1);
 712       free (tmp2);
 713       return 1;
 714     }
 715   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 716   new->next = NULL;
 717   new->type = RE_RANGE;
 718   new->u.range.first_char = first;
 719   new->u.range.last_char = last;
 720   assert (list->tail);
 721   list->tail->next = new;
 722   list->tail = new;
 723   return 0;
 724 }
 725
 726 /* If CHAR_CLASS_STR is a valid character class string, append a
 727    newly allocated structure representing that character class to the end
 728    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 729    a valid string return nonzero.  */
 730
 731 static int
 732 append_char_class (struct Spec_list *list,
 733                    const unsigned char *char_class_str, size_t len)
 734 {
 735   enum Char_class char_class;
 736   struct List_element *new;
 737
 738   char_class = look_up_char_class (char_class_str, len);
 739   if (char_class == CC_NO_CLASS)
 740     return 1;
 741   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 742   new->next = NULL;
 743   new->type = RE_CHAR_CLASS;
 744   new->u.char_class = char_class;
 745   assert (list->tail);
 746   list->tail->next = new;
 747   list->tail = new;
 748   return 0;
 749 }
 750
 751 /* Append a newly allocated structure representing a [c*n]
 752    repeated character construct to the specification list LIST.
 753    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 754    is a non-negative repeat count.  */
 755
 756 static void
 757 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 758                       size_t repeat_count)
 759 {
 760   struct List_element *new;
 761
 762   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 763   new->next = NULL;
 764   new->type = RE_REPEATED_CHAR;
 765   new->u.repeated_char.the_repeated_char = the_char;
 766   new->u.repeated_char.repeat_count = repeat_count;
 767   assert (list->tail);
 768   list->tail->next = new;
 769   list->tail = new;
 770 }
 771
 772 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 773    the length of that string, LEN, if LEN is exactly one, append
 774    a newly allocated structure representing the specified
 775    equivalence class to the specification list, LIST and return zero.
 776    If LEN is not 1, return nonzero.  */
 777
 778 static int
 779 append_equiv_class (struct Spec_list *list,
 780                     const unsigned char *equiv_class_str, size_t len)
 781 {
 782   struct List_element *new;
 783
 784   if (len != 1)
 785     return 1;
 786   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 787   new->next = NULL;
 788   new->type = RE_EQUIV_CLASS;
 789   new->u.equiv_code = *equiv_class_str;
 790   assert (list->tail);
 791   list->tail->next = new;
 792   list->tail = new;
 793   return 0;
 794 }
 795
 796 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 797    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 798    NUL bytes, and is *not* NUL-terminated.  */
 799
 800 static unsigned char *
 801 substr (const unsigned char *p, size_t first_idx, size_t last_idx)
 802 {
 803   size_t len;
 804   unsigned char *tmp;
 805
 806   assert (first_idx <= last_idx);
 807   len = last_idx - first_idx + 1;
 808   tmp = (unsigned char *) xmalloc (len);
 809
 810   assert (first_idx <= last_idx);
 811   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 812   memcpy (tmp, p + first_idx, len);
 813   return tmp;
 814 }
 815
 816 /* Search forward starting at START_IDX for the 2-char sequence
 817    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 818    a sequence is found, set *RESULT_IDX to the index of the first
 819    character and return nonzero. Otherwise return zero.  P may contain
 820    zero bytes.  */
 821
 822 static int
 823 find_closing_delim (const struct E_string *es, size_t start_idx,
 824                     unsigned int pre_bracket_char, size_t *result_idx)
 825 {
 826   size_t i;
 827
 828   for (i = start_idx; i < es->len - 1; i++)
 829     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 830         && !es->escaped[i] && !es->escaped[i + 1])
 831       {
 832         *result_idx = i;
 833         return 1;
 834       }
 835   return 0;
 836 }
 837
 838 /* Convert a string S with explicit length LEN, possibly
 839    containing embedded zero bytes, to a long integer value.
 840    If the string represents a negative value, a value larger
 841    than LONG_MAX, or if all LEN characters do not represent a
 842    valid integer, return nonzero and do not modify *VAL.
 843    Otherwise, return zero and set *VAL to the converted value.  */
 844
 845 static int
 846 non_neg_strtol (const unsigned char *s, size_t len, size_t *val)
 847 {
 848   size_t i;
 849   unsigned long sum = 0;
 850   unsigned int base;
 851
 852   if (len <= 0)
 853     return 1;
 854   if (s[0] == '0')
 855     base = 8;
 856   else if (ISDIGIT (s[0]))
 857     base = 10;
 858   else
 859     return 1;
 860
 861   for (i = 0; i < len; i++)
 862     {
 863       unsigned int c;
 864
 865       if (s[i] < '0')
 866         return 1;
 867
 868       c = s[i] - '0';
 869       if (c >= base)
 870         return 1;
 871
 872       if (sum > (LONG_MAX - c) / base)
 873         return 1;
 874       sum = sum * base + c;
 875     }
 876   *val = sum;
 877   return 0;
 878 }
 879
 880 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 881    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 882    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 883    and return zero. If the second character following
 884    the opening bracket is not `*' or if no closing bracket can be
 885    found, return -1.  If a closing bracket is found and the
 886    second char is `*', but the string between the `*' and `]' isn't
 887    empty, an octal number, or a decimal number, print an error message
 888    and return -2.  */
 889
 890 static int
 891 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 892                        unsigned int *char_to_repeat, size_t *repeat_count,
 893                        size_t *closing_bracket_idx)
 894 {
 895   size_t i;
 896
 897   assert (start_idx + 1 < es->len);
 898   if (!ES_MATCH (es, start_idx + 1, '*'))
 899     return -1;
 900
 901   for (i = start_idx + 2; i < es->len; i++)
 902     {
 903       if (ES_MATCH (es, i, ']'))
 904         {
 905           const unsigned char *digit_str;
 906           size_t digit_str_len = i - start_idx - 2;
 907
 908           *char_to_repeat = es->s[start_idx];
 909           if (digit_str_len == 0)
 910             {
 911               /* We've matched [c*] -- no explicit repeat count.  */
 912               *repeat_count = 0;
 913               *closing_bracket_idx = i;
 914               return 0;
 915             }
 916
 917           /* Here, we have found [c*s] where s should be a string
 918              of octal or decimal digits.  */
 919           digit_str = &es->s[start_idx + 2];
 920           if (non_neg_strtol (digit_str, digit_str_len, repeat_count)
 921               || *repeat_count > BEGIN_STATE)
 922             {
 923               char *tmp = make_printable_str (digit_str, digit_str_len);
 924               error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 925                      tmp);
 926               free (tmp);
 927               return -2;
 928             }
 929           *closing_bracket_idx = i;
 930           return 0;
 931         }
 932     }
 933   return -1;                    /* No bracket found.  */
 934 }
 935
 936 /* Return nonzero if the string at ES->s[IDX] matches the regular
 937    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 938    the `]' must not be escaped.  */
 939
 940 static int
 941 star_digits_closebracket (const struct E_string *es, size_t idx)
 942 {
 943   size_t i;
 944
 945   if (!ES_MATCH (es, idx, '*'))
 946     return 0;
 947
 948   for (i = idx + 1; i < es->len; i++)
 949     {
 950       if (!ISDIGIT (es->s[i]))
 951         {
 952           if (ES_MATCH (es, i, ']'))
 953             return 1;
 954           return 0;
 955         }
 956     }
 957   return 0;
 958 }
 959
 960 /* Convert string UNESACPED_STRING (which has been preprocessed to
 961    convert backslash-escape sequences) of length LEN characters into
 962    a linked list of the following 5 types of constructs:
 963       - [:str:] Character class where `str' is one of the 12 valid strings.
 964       - [=c=] Equivalence class where `c' is any single character.
 965       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 966           However, if `n' is present, it must be a non-negative octal or
 967           decimal integer.
 968       - r-s Range of characters from `r' to `s'.  The second endpoint must
 969           not precede the first in the current collating sequence.
 970       - c Any other character is interpreted as itself.  */
 971
 972 static int
 973 build_spec_list (const struct E_string *es, struct Spec_list *result)
 974 {
 975   const unsigned char *p;
 976   size_t i;
 977
 978   p = es->s;
 979
 980   /* The main for-loop below recognizes the 4 multi-character constructs.
 981      A character that matches (in its context) none of the multi-character
 982      constructs is classified as `normal'.  Since all multi-character
 983      constructs have at least 3 characters, any strings of length 2 or
 984      less are composed solely of normal characters.  Hence, the index of
 985      the outer for-loop runs only as far as LEN-2.  */
 986
 987   for (i = 0; i + 2 < es->len; /* empty */)
 988     {
 989       if (ES_MATCH (es, i, '['))
 990         {
 991           int matched_multi_char_construct;
 992           size_t closing_bracket_idx;
 993           unsigned int char_to_repeat;
 994           size_t repeat_count;
 995           int err;
 996
 997           matched_multi_char_construct = 1;
 998           if (ES_MATCH (es, i + 1, ':')
 999               || ES_MATCH (es, i + 1, '='))
1000             {
1001               size_t closing_delim_idx;
1002               int found;
1003
1004               found = find_closing_delim (es, i + 2, p[i + 1],
1005                                           &closing_delim_idx);
1006               if (found)
1007                 {
1008                   int parse_failed;
1009                   unsigned char *opnd_str = substr (p, i + 2,
1010                                                     closing_delim_idx - 1);
1011                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
1012
1013                   if (p[i + 1] == ':')
1014                     {
1015                       parse_failed = append_char_class (result, opnd_str,
1016                                                         opnd_str_len);
1017
1018                       /* FIXME: big comment.  */
1019                       if (parse_failed)
1020                         {
1021                           if (star_digits_closebracket (es, i + 2))
1022                             {
1023                               free (opnd_str);
1024                               goto try_bracketed_repeat;
1025                             }
1026                           else
1027                             {
1028                               char *tmp = make_printable_str (opnd_str,
1029                                                               opnd_str_len);
1030                               error (0, 0, _("invalid character class `%s'"),
1031                                      tmp);
1032                               free (tmp);
1033                               return 1;
1034                             }
1035                         }
1036                     }
1037                   else
1038                     {
1039                       parse_failed = append_equiv_class (result, opnd_str,
1040                                                          opnd_str_len);
1041
1042                       /* FIXME: big comment.  */
1043                       if (parse_failed)
1044                         {
1045                           if (star_digits_closebracket (es, i + 2))
1046                             {
1047                               free (opnd_str);
1048                               goto try_bracketed_repeat;
1049                             }
1050                           else
1051                             {
1052                               char *tmp = make_printable_str (opnd_str,
1053                                                               opnd_str_len);
1054                               error (0, 0,
1055                _("%s: equivalence class operand must be a single character"),
1056                                      tmp);
1057                               free (tmp);
1058                               return 1;
1059                             }
1060                         }
1061                     }
1062                   free (opnd_str);
1063
1064                   /* Return nonzero if append_*_class reports a problem.  */
1065                   if (parse_failed)
1066                     return 1;
1067                   else
1068                     i = closing_delim_idx + 2;
1069                   continue;
1070                 }
1071               /* Else fall through.  This could be [:*] or [=*].  */
1072             }
1073
1074         try_bracketed_repeat:
1075
1076           /* Determine whether this is a bracketed repeat range
1077              matching the RE \[.\*(dec_or_oct_number)?\].  */
1078           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1079                                        &repeat_count,
1080                                        &closing_bracket_idx);
1081           if (err == 0)
1082             {
1083               append_repeated_char (result, char_to_repeat, repeat_count);
1084               i = closing_bracket_idx + 1;
1085             }
1086           else if (err == -1)
1087             {
1088               matched_multi_char_construct = 0;
1089             }
1090           else
1091             {
1092               /* Found a string that looked like [c*n] but the
1093                  numeric part was invalid.  */
1094               return 1;
1095             }
1096
1097           if (matched_multi_char_construct)
1098             continue;
1099
1100           /* We reach this point if P does not match [:str:], [=c=],
1101              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1102              (from `[' to `c').  */
1103         }
1104
1105       /* Look ahead one char for ranges like a-z.  */
1106       if (ES_MATCH (es, i + 1, '-'))
1107         {
1108           if (append_range (result, p[i], p[i + 2]))
1109             return 1;
1110           i += 3;
1111         }
1112       else
1113         {
1114           append_normal_char (result, p[i]);
1115           ++i;
1116         }
1117     }
1118
1119   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1120   for (; i < es->len; i++)
1121     append_normal_char (result, p[i]);
1122
1123   return 0;
1124 }
1125
1126 /* Given a Spec_list S (with its saved state implicit in the values
1127    of its members `tail' and `state'), return the next single character
1128    in the expansion of S's constructs.  If the last character of S was
1129    returned on the previous call or if S was empty, this function
1130    returns -1.  For example, successive calls to get_next where S
1131    represents the spec-string 'a-d[y*3]' will return the sequence
1132    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1133    which the returned character comes is [:upper:] or [:lower:], the
1134    parameter CLASS is given a value to indicate which it was.  Otherwise
1135    CLASS is set to UL_NONE.  This value is used only when constructing
1136    the translation table to verify that any occurrences of upper and
1137    lower class constructs in the spec-strings appear in the same relative
1138    positions.  */
1139
1140 static int
1141 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1142 {
1143   struct List_element *p;
1144   int return_val;
1145   int i;
1146
1147   if (class)
1148     *class = UL_NONE;
1149
1150   if (s->state == BEGIN_STATE)
1151     {
1152       s->tail = s->head->next;
1153       s->state = NEW_ELEMENT;
1154     }
1155
1156   p = s->tail;
1157   if (p == NULL)
1158     return -1;
1159
1160   switch (p->type)
1161     {
1162     case RE_NORMAL_CHAR:
1163       return_val = p->u.normal_char;
1164       s->state = NEW_ELEMENT;
1165       s->tail = p->next;
1166       break;
1167
1168     case RE_RANGE:
1169       if (s->state == NEW_ELEMENT)
1170         s->state = ORD (p->u.range.first_char);
1171       else
1172         ++(s->state);
1173       return_val = CHR (s->state);
1174       if (s->state == ORD (p->u.range.last_char))
1175         {
1176           s->tail = p->next;
1177           s->state = NEW_ELEMENT;
1178         }
1179       break;
1180
1181     case RE_CHAR_CLASS:
1182       if (class)
1183         {
1184           int upper_or_lower;
1185           switch (p->u.char_class)
1186             {
1187             case CC_LOWER:
1188               *class = UL_LOWER;
1189               upper_or_lower = 1;
1190               break;
1191             case CC_UPPER:
1192               *class = UL_UPPER;
1193               upper_or_lower = 1;
1194               break;
1195             default:
1196               upper_or_lower = 0;
1197               break;
1198             }
1199
1200           if (upper_or_lower)
1201             {
1202               s->tail = p->next;
1203               s->state = NEW_ELEMENT;
1204               return_val = 0;
1205               break;
1206             }
1207         }
1208
1209       if (s->state == NEW_ELEMENT)
1210         {
1211           for (i = 0; i < N_CHARS; i++)
1212             if (is_char_class_member (p->u.char_class, i))
1213               break;
1214           assert (i < N_CHARS);
1215           s->state = i;
1216         }
1217       assert (is_char_class_member (p->u.char_class, s->state));
1218       return_val = CHR (s->state);
1219       for (i = s->state + 1; i < N_CHARS; i++)
1220         if (is_char_class_member (p->u.char_class, i))
1221           break;
1222       if (i < N_CHARS)
1223         s->state = i;
1224       else
1225         {
1226           s->tail = p->next;
1227           s->state = NEW_ELEMENT;
1228         }
1229       break;
1230
1231     case RE_EQUIV_CLASS:
1232       /* FIXME: this assumes that each character is alone in its own
1233          equivalence class (which appears to be correct for my
1234          LC_COLLATE.  But I don't know of any function that allows
1235          one to determine a character's equivalence class.  */
1236
1237       return_val = p->u.equiv_code;
1238       s->state = NEW_ELEMENT;
1239       s->tail = p->next;
1240       break;
1241
1242     case RE_REPEATED_CHAR:
1243       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1244       if (p->u.repeated_char.repeat_count == 0)
1245         {
1246           s->tail = p->next;
1247           s->state = NEW_ELEMENT;
1248           return_val = get_next (s, class);
1249         }
1250       else
1251         {
1252           if (s->state == NEW_ELEMENT)
1253             {
1254               s->state = 0;
1255             }
1256           ++(s->state);
1257           return_val = p->u.repeated_char.the_repeated_char;
1258           if (p->u.repeated_char.repeat_count > 0
1259               && s->state == p->u.repeated_char.repeat_count)
1260             {
1261               s->tail = p->next;
1262               s->state = NEW_ELEMENT;
1263             }
1264         }
1265       break;
1266
1267     case RE_NO_TYPE:
1268       abort ();
1269       break;
1270
1271     default:
1272       abort ();
1273       break;
1274     }
1275
1276   return return_val;
1277 }
1278
1279 /* This is a minor kludge.  This function is called from
1280    get_spec_stats to determine the cardinality of a set derived
1281    from a complemented string.  It's a kludge in that some of the
1282    same operations are (duplicated) performed in set_initialize.  */
1283
1284 static int
1285 card_of_complement (struct Spec_list *s)
1286 {
1287   int c;
1288   int cardinality = N_CHARS;
1289   SET_TYPE in_set[N_CHARS];
1290
1291   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1292   s->state = BEGIN_STATE;
1293   while ((c = get_next (s, NULL)) != -1)
1294     if (!in_set[c]++)
1295       --cardinality;
1296   return cardinality;
1297 }
1298
1299 /* Gather statistics about the spec-list S in preparation for the tests
1300    in validate that determine the consistency of the specs.  This function
1301    is called at most twice; once for string1, and again for any string2.
1302    LEN_S1 < 0 indicates that this is the first call and that S represents
1303    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1304    constructs in string1, and we can use its value to resolve any
1305    indefinite repeat construct in S (which represents string2).  Hence,
1306    this function has the side-effect that it converts a valid [c*]
1307    construct in string2 to [c*n] where n is large enough (or 0) to give
1308    string2 the same length as string1.  For example, with the command
1309    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1310    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1311
1312 static void
1313 get_spec_stats (struct Spec_list *s)
1314 {
1315   struct List_element *p;
1316   int len = 0;
1317
1318   s->n_indefinite_repeats = 0;
1319   s->has_equiv_class = 0;
1320   s->has_restricted_char_class = 0;
1321   s->has_char_class = 0;
1322   for (p = s->head->next; p; p = p->next)
1323     {
1324       switch (p->type)
1325         {
1326           int i;
1327         case RE_NORMAL_CHAR:
1328           ++len;
1329           break;
1330
1331         case RE_RANGE:
1332           assert (p->u.range.last_char >= p->u.range.first_char);
1333           len += p->u.range.last_char - p->u.range.first_char + 1;
1334           break;
1335
1336         case RE_CHAR_CLASS:
1337           s->has_char_class = 1;
1338           for (i = 0; i < N_CHARS; i++)
1339             if (is_char_class_member (p->u.char_class, i))
1340               ++len;
1341           switch (p->u.char_class)
1342             {
1343             case CC_UPPER:
1344             case CC_LOWER:
1345               break;
1346             default:
1347               s->has_restricted_char_class = 1;
1348               break;
1349             }
1350           break;
1351
1352         case RE_EQUIV_CLASS:
1353           for (i = 0; i < N_CHARS; i++)
1354             if (is_equiv_class_member (p->u.equiv_code, i))
1355               ++len;
1356           s->has_equiv_class = 1;
1357           break;
1358
1359         case RE_REPEATED_CHAR:
1360           if (p->u.repeated_char.repeat_count > 0)
1361             len += p->u.repeated_char.repeat_count;
1362           else if (p->u.repeated_char.repeat_count == 0)
1363             {
1364               s->indefinite_repeat_element = p;
1365               ++(s->n_indefinite_repeats);
1366             }
1367           break;
1368
1369         case RE_NO_TYPE:
1370           assert (0);
1371           break;
1372         }
1373     }
1374
1375   s->length = len;
1376 }
1377
1378 static void
1379 get_s1_spec_stats (struct Spec_list *s1)
1380 {
1381   get_spec_stats (s1);
1382   if (complement)
1383     s1->length = card_of_complement (s1);
1384 }
1385
1386 static void
1387 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1388 {
1389   get_spec_stats (s2);
1390   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1391     {
1392       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1393         len_s1 - s2->length;
1394       s2->length = len_s1;
1395     }
1396 }
1397
1398 static void
1399 spec_init (struct Spec_list *spec_list)
1400 {
1401   spec_list->head = spec_list->tail =
1402     (struct List_element *) xmalloc (sizeof (struct List_element));
1403   spec_list->head->next = NULL;
1404 }
1405
1406 /* This function makes two passes over the argument string S.  The first
1407    one converts all \c and \ddd escapes to their one-byte representations.
1408    The second constructs a linked specification list, SPEC_LIST, of the
1409    characters and constructs that comprise the argument string.  If either
1410    of these passes detects an error, this function returns nonzero.  */
1411
1412 static int
1413 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1414 {
1415   struct E_string es;
1416   int fail;
1417
1418   fail = unquote (s, &es);
1419   if (!fail)
1420     fail = build_spec_list (&es, spec_list);
1421   es_free (&es);
1422   return fail;
1423 }
1424
1425 /* Given two specification lists, S1 and S2, and assuming that
1426    S1->length > S2->length, append a single [c*n] element to S2 where c
1427    is the last character in the expansion of S2 and n is the difference
1428    between the two lengths.
1429    Upon successful completion, S2->length is set to S1->length.  The only
1430    way this function can fail to make S2 as long as S1 is when S2 has
1431    zero-length, since in that case, there is no last character to repeat.
1432    So S2->length is required to be at least 1.
1433
1434    Providing this functionality allows the user to do some pretty
1435    non-BSD (and non-portable) things:  For example, the command
1436        tr -cs '[:upper:]0-9' '[:lower:]'
1437    is almost guaranteed to give results that depend on your collating
1438    sequence.  */
1439
1440 static void
1441 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1442 {
1443   struct List_element *p;
1444   int char_to_repeat;
1445   int i;
1446
1447   assert (translating);
1448   assert (s1->length > s2->length);
1449   assert (s2->length > 0);
1450
1451   p = s2->tail;
1452   switch (p->type)
1453     {
1454     case RE_NORMAL_CHAR:
1455       char_to_repeat = p->u.normal_char;
1456       break;
1457     case RE_RANGE:
1458       char_to_repeat = p->u.range.last_char;
1459       break;
1460     case RE_CHAR_CLASS:
1461       for (i = N_CHARS; i >= 0; i--)
1462         if (is_char_class_member (p->u.char_class, i))
1463           break;
1464       assert (i >= 0);
1465       char_to_repeat = CHR (i);
1466       break;
1467
1468     case RE_REPEATED_CHAR:
1469       char_to_repeat = p->u.repeated_char.the_repeated_char;
1470       break;
1471
1472     case RE_EQUIV_CLASS:
1473       /* This shouldn't happen, because validate exits with an error
1474          if it finds an equiv class in string2 when translating.  */
1475       abort ();
1476       break;
1477
1478     case RE_NO_TYPE:
1479       abort ();
1480       break;
1481
1482     default:
1483       abort ();
1484       break;
1485     }
1486
1487   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1488   s2->length = s1->length;
1489 }
1490
1491 /* Return non-zero if S is a non-empty list in which exactly one
1492    character (but potentially, many instances of it) appears.
1493    E.g.  [X*] or xxxxxxxx.  */
1494
1495 static int
1496 homogeneous_spec_list (struct Spec_list *s)
1497 {
1498   int b, c;
1499
1500   s->state = BEGIN_STATE;
1501
1502   if ((b = get_next (s, NULL)) == -1)
1503     return 0;
1504
1505   while ((c = get_next (s, NULL)) != -1)
1506     if (c != b)
1507       return 0;
1508
1509   return 1;
1510 }
1511
1512 /* Die with an error message if S1 and S2 describe strings that
1513    are not valid with the given command line switches.
1514    A side effect of this function is that if a valid [c*] or
1515    [c*0] construct appears in string2, it is converted to [c*n]
1516    with a value for n that makes s2->length == s1->length.  By
1517    the same token, if the --truncate-set1 option is not
1518    given, S2 may be extended.  */
1519
1520 static void
1521 validate (struct Spec_list *s1, struct Spec_list *s2)
1522 {
1523   get_s1_spec_stats (s1);
1524   if (s1->n_indefinite_repeats > 0)
1525     {
1526       error (EXIT_FAILURE, 0,
1527              _("the [c*] repeat construct may not appear in string1"));
1528     }
1529
1530   if (s2)
1531     {
1532       get_s2_spec_stats (s2, s1->length);
1533
1534       if (s2->n_indefinite_repeats > 1)
1535         {
1536           error (EXIT_FAILURE, 0,
1537                  _("only one [c*] repeat construct may appear in string2"));
1538         }
1539
1540       if (translating)
1541         {
1542           if (s2->has_equiv_class)
1543             {
1544               error (EXIT_FAILURE, 0,
1545                      _("[=c=] expressions may not appear in string2 \
1546 when translating"));
1547             }
1548
1549           if (s1->length > s2->length)
1550             {
1551               if (!truncate_set1)
1552                 {
1553                   /* string2 must be non-empty unless --truncate-set1 is
1554                      given or string1 is empty.  */
1555
1556                   if (s2->length == 0)
1557                     error (EXIT_FAILURE, 0,
1558                      _("when not truncating set1, string2 must be non-empty"));
1559                   string2_extend (s1, s2);
1560                 }
1561             }
1562
1563           if (complement && s1->has_char_class
1564               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1565             {
1566               error (EXIT_FAILURE, 0,
1567                      _("when translating with complemented character classes,\
1568 \nstring2 must map all characters in the domain to one"));
1569             }
1570
1571           if (s2->has_restricted_char_class)
1572             {
1573               error (EXIT_FAILURE, 0,
1574                      _("when translating, the only character classes that may \
1575 appear in\nstring2 are `upper' and `lower'"));
1576             }
1577         }
1578       else
1579         /* Not translating.  */
1580         {
1581           if (s2->n_indefinite_repeats > 0)
1582             error (EXIT_FAILURE, 0,
1583                    _("the [c*] construct may appear in string2 only \
1584 when translating"));
1585         }
1586     }
1587 }
1588
1589 /* Read buffers of SIZE bytes via the function READER (if READER is
1590    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1591    read_and_delete or read_and_xlate.  After each buffer is read, it is
1592    processed and written to stdout.  The buffers are processed so that
1593    multiple consecutive occurrences of the same character in the input
1594    stream are replaced by a single occurrence of that character if the
1595    character is in the squeeze set.  */
1596
1597 static void
1598 squeeze_filter (unsigned char *buf, long int size, PFI reader)
1599 {
1600   unsigned int char_to_squeeze = NOT_A_CHAR;
1601   int i = 0;
1602   int nr = 0;
1603
1604   for (;;)
1605     {
1606       int begin;
1607
1608       if (i >= nr)
1609         {
1610           if (reader == NULL)
1611             nr = safe_read (0, (char *) buf, size);
1612           else
1613             nr = (*reader) (buf, size, NULL);
1614
1615           if (nr < 0)
1616             error (EXIT_FAILURE, errno, _("read error"));
1617           if (nr == 0)
1618             break;
1619           i = 0;
1620         }
1621
1622       begin = i;
1623
1624       if (char_to_squeeze == NOT_A_CHAR)
1625         {
1626           int out_len;
1627           /* Here, by being a little tricky, we can get a significant
1628              performance increase in most cases when the input is
1629              reasonably large.  Since tr will modify the input only
1630              if two consecutive (and identical) input characters are
1631              in the squeeze set, we can step by two through the data
1632              when searching for a character in the squeeze set.  This
1633              means there may be a little more work in a few cases and
1634              perhaps twice as much work in the worst cases where most
1635              of the input is removed by squeezing repeats.  But most
1636              uses of this functionality seem to remove less than 20-30%
1637              of the input.  */
1638           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1639             ;                   /* empty */
1640
1641           /* There is a special case when i == nr and we've just
1642              skipped a character (the last one in buf) that is in
1643              the squeeze set.  */
1644           if (i == nr && in_squeeze_set[buf[i - 1]])
1645             --i;
1646
1647           if (i >= nr)
1648             out_len = nr - begin;
1649           else
1650             {
1651               char_to_squeeze = buf[i];
1652               /* We're about to output buf[begin..i].  */
1653               out_len = i - begin + 1;
1654
1655               /* But since we stepped by 2 in the loop above,
1656                  out_len may be one too large.  */
1657               if (i > 0 && buf[i - 1] == char_to_squeeze)
1658                 --out_len;
1659
1660               /* Advance i to the index of first character to be
1661                  considered when looking for a char different from
1662                  char_to_squeeze.  */
1663               ++i;
1664             }
1665           if (out_len > 0
1666               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1667             error (EXIT_FAILURE, errno, _("write error"));
1668         }
1669
1670       if (char_to_squeeze != NOT_A_CHAR)
1671         {
1672           /* Advance i to index of first char != char_to_squeeze
1673              (or to nr if all the rest of the characters in this
1674              buffer are the same as char_to_squeeze).  */
1675           for (; i < nr && buf[i] == char_to_squeeze; i++)
1676             ;                   /* empty */
1677           if (i < nr)
1678             char_to_squeeze = NOT_A_CHAR;
1679           /* If (i >= nr) we've squeezed the last character in this buffer.
1680              So now we have to read a new buffer and continue comparing
1681              characters against char_to_squeeze.  */
1682         }
1683     }
1684 }
1685
1686 /* Read buffers of SIZE bytes from stdin until one is found that
1687    contains at least one character not in the delete set.  Store
1688    in the array BUF, all characters from that buffer that are not
1689    in the delete set, and return the number of characters saved
1690    or 0 upon EOF.  */
1691
1692 static long
1693 read_and_delete (unsigned char *buf, long int size, PFI not_used)
1694 {
1695   long n_saved;
1696   static int hit_eof = 0;
1697
1698   assert (not_used == NULL);
1699   assert (size > 0);
1700
1701   if (hit_eof)
1702     return 0;
1703
1704   /* This enclosing do-while loop is to make sure that
1705      we don't return zero (indicating EOF) when we've
1706      just deleted all the characters in a buffer.  */
1707   do
1708     {
1709       int i;
1710       int nr = safe_read (0, (char *) buf, size);
1711
1712       if (nr < 0)
1713         error (EXIT_FAILURE, errno, _("read error"));
1714       if (nr == 0)
1715         {
1716           hit_eof = 1;
1717           return 0;
1718         }
1719
1720       /* This first loop may be a waste of code, but gives much
1721          better performance when no characters are deleted in
1722          the beginning of a buffer.  It just avoids the copying
1723          of buf[i] into buf[n_saved] when it would be a NOP.  */
1724
1725       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1726         /* empty */ ;
1727       n_saved = i;
1728
1729       for (++i; i < nr; i++)
1730         if (!in_delete_set[buf[i]])
1731           buf[n_saved++] = buf[i];
1732     }
1733   while (n_saved == 0);
1734
1735   return n_saved;
1736 }
1737
1738 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1739    perform the in-place and one-to-one mapping specified by the global
1740    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1741
1742 static long
1743 read_and_xlate (unsigned char *buf, long int size, PFI not_used)
1744 {
1745   long chars_read = 0;
1746   static int hit_eof = 0;
1747   int i;
1748
1749   assert (not_used == NULL);
1750   assert (size > 0);
1751
1752   if (hit_eof)
1753     return 0;
1754
1755   chars_read = safe_read (0, (char *) buf, size);
1756   if (chars_read < 0)
1757     error (EXIT_FAILURE, errno, _("read error"));
1758   if (chars_read == 0)
1759     {
1760       hit_eof = 1;
1761       return 0;
1762     }
1763
1764   for (i = 0; i < chars_read; i++)
1765     buf[i] = xlate[buf[i]];
1766
1767   return chars_read;
1768 }
1769
1770 /* Initialize a boolean membership set IN_SET with the character
1771    values obtained by traversing the linked list of constructs S
1772    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1773    nonzero the resulting set is complemented.  */
1774
1775 static void
1776 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1777 {
1778   int c;
1779   int i;
1780
1781   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1782   s->state = BEGIN_STATE;
1783   while ((c = get_next (s, NULL)) != -1)
1784     in_set[c] = 1;
1785   if (complement_this_set)
1786     for (i = 0; i < N_CHARS; i++)
1787       in_set[i] = (!in_set[i]);
1788 }
1789
1790 int
1791 main (int argc, char **argv)
1792 {
1793   int c;
1794   int non_option_args;
1795   struct Spec_list buf1, buf2;
1796   struct Spec_list *s1 = &buf1;
1797   struct Spec_list *s2 = &buf2;
1798
1799   program_name = argv[0];
1800   setlocale (LC_ALL, "");
1801   bindtextdomain (PACKAGE, LOCALEDIR);
1802   textdomain (PACKAGE);
1803
1804   parse_long_options (argc, argv, PROGRAM_NAME, GNU_PACKAGE, VERSION,
1805                       "Jim Meyering", usage);
1806
1807   while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1808     {
1809       switch (c)
1810         {
1811         case 0:
1812           break;
1813
1814         case 'c':
1815           complement = 1;
1816           break;
1817
1818         case 'd':
1819           delete = 1;
1820           break;
1821
1822         case 's':
1823           squeeze_repeats = 1;
1824           break;
1825
1826         case 't':
1827           truncate_set1 = 1;
1828           break;
1829
1830         default:
1831           usage (2);
1832           break;
1833         }
1834     }
1835
1836   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1837
1838   non_option_args = argc - optind;
1839   translating = (non_option_args == 2 && !delete);
1840
1841   /* Change this test if it is valid to give tr no options and
1842      no args at all.  POSIX doesn't specifically say anything
1843      either way, but it looks like they implied it's invalid
1844      by omission.  If you want to make tr do a slow imitation
1845      of `cat' use `tr a a'.  */
1846   if (non_option_args > 2)
1847     {
1848       error (0, 0, _("too many arguments"));
1849       usage (2);
1850     }
1851
1852   if (!delete && !squeeze_repeats && non_option_args != 2)
1853     error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1854
1855   if (delete && squeeze_repeats && non_option_args != 2)
1856     error (EXIT_FAILURE, 0, _("two strings must be given when both \
1857 deleting and squeezing repeats"));
1858
1859   /* If --delete is given without --squeeze-repeats, then
1860      only one string argument may be specified.  But POSIX
1861      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1862      is set, pretend we never saw string2.  But I think
1863      this deserves a fatal error, so that's the default.  */
1864   if ((delete && !squeeze_repeats) && non_option_args != 1)
1865     {
1866       if (posix_pedantic && non_option_args == 2)
1867         --non_option_args;
1868       else
1869         error (EXIT_FAILURE, 0,
1870                _("only one string may be given when deleting \
1871 without squeezing repeats"));
1872     }
1873
1874   if (squeeze_repeats && non_option_args == 0)
1875     error (EXIT_FAILURE, 0,
1876            _("at least one string must be given when squeezing repeats"));
1877
1878   spec_init (s1);
1879   if (parse_str ((unsigned char *) argv[optind], s1))
1880     exit (EXIT_FAILURE);
1881
1882   if (non_option_args == 2)
1883     {
1884       spec_init (s2);
1885       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1886         exit (EXIT_FAILURE);
1887     }
1888   else
1889     s2 = NULL;
1890
1891   validate (s1, s2);
1892
1893   /* Use binary I/O, since `tr' is sometimes used to transliterate
1894      non-printable characters, or characters which are stripped away
1895      by text-mode reads (like CR and ^Z).  */
1896   SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);
1897
1898   if (squeeze_repeats && non_option_args == 1)
1899     {
1900       set_initialize (s1, complement, in_squeeze_set);
1901       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1902     }
1903   else if (delete && non_option_args == 1)
1904     {
1905       long nr;
1906
1907       set_initialize (s1, complement, in_delete_set);
1908       do
1909         {
1910           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1911           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1912             error (EXIT_FAILURE, errno, _("write error"));
1913         }
1914       while (nr > 0);
1915     }
1916   else if (squeeze_repeats && delete && non_option_args == 2)
1917     {
1918       set_initialize (s1, complement, in_delete_set);
1919       set_initialize (s2, 0, in_squeeze_set);
1920       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1921     }
1922   else if (translating)
1923     {
1924       if (complement)
1925         {
1926           int i;
1927           SET_TYPE *in_s1 = in_delete_set;
1928
1929           set_initialize (s1, 0, in_s1);
1930           s2->state = BEGIN_STATE;
1931           for (i = 0; i < N_CHARS; i++)
1932             xlate[i] = i;
1933           for (i = 0; i < N_CHARS; i++)
1934             {
1935               if (!in_s1[i])
1936                 {
1937                   int ch = get_next (s2, NULL);
1938                   assert (ch != -1 || truncate_set1);
1939                   if (ch == -1)
1940                     {
1941                       /* This will happen when tr is invoked like e.g.
1942                          tr -cs A-Za-z0-9 '\012'.  */
1943                       break;
1944                     }
1945                   xlate[i] = ch;
1946                 }
1947             }
1948           assert (get_next (s2, NULL) == -1 || truncate_set1);
1949         }
1950       else
1951         {
1952           int c1, c2;
1953           int i;
1954           enum Upper_Lower_class class_s1;
1955           enum Upper_Lower_class class_s2;
1956
1957           for (i = 0; i < N_CHARS; i++)
1958             xlate[i] = i;
1959           s1->state = BEGIN_STATE;
1960           s2->state = BEGIN_STATE;
1961           for (;;)
1962             {
1963               c1 = get_next (s1, &class_s1);
1964               c2 = get_next (s2, &class_s2);
1965               if (!class_ok[(int) class_s1][(int) class_s2])
1966                 error (EXIT_FAILURE, 0,
1967                        _("misaligned [:upper:] and/or [:lower:] construct"));
1968
1969               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1970                 {
1971                   for (i = 0; i < N_CHARS; i++)
1972                     if (ISLOWER (i))
1973                       xlate[i] = toupper (i);
1974                 }
1975               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1976                 {
1977                   for (i = 0; i < N_CHARS; i++)
1978                     if (ISUPPER (i))
1979                       xlate[i] = tolower (i);
1980                 }
1981               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1982                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1983                 {
1984                   /* By default, GNU tr permits the identity mappings: from
1985                      [:upper:] to [:upper:] and [:lower:] to [:lower:].  But
1986                      when POSIXLY_CORRECT is set, those evoke diagnostics.  */
1987                   if (posix_pedantic)
1988                     {
1989                       error (EXIT_FAILURE, 0,
1990                              _("\
1991 invalid identity mapping;  when translating, any [:lower:] or [:upper:]\n\
1992 construct in string1 must be aligned with a corresponding construct\n\
1993 ([:upper:] or [:lower:], respectively) in string2"));
1994                     }
1995                 }
1996               else
1997                 {
1998                   /* The following should have been checked by validate...  */
1999                   if (c1 == -1 || c2 == -1)
2000                     break;
2001                   xlate[c1] = c2;
2002                 }
2003             }
2004           assert (c1 == -1 || truncate_set1);
2005         }
2006       if (squeeze_repeats)
2007         {
2008           set_initialize (s2, 0, in_squeeze_set);
2009           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
2010         }
2011       else
2012         {
2013           long chars_read;
2014
2015           do
2016             {
2017               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2018               if (chars_read > 0
2019                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2020                 error (EXIT_FAILURE, errno, _("write error"));
2021             }
2022           while (chars_read > 0);
2023         }
2024     }
2025
2026   if (fclose (stdout) == EOF)
2027     error (EXIT_FAILURE, errno, _("write error"));
2028
2029   if (close (0) != 0)
2030     error (EXIT_FAILURE, errno, _("standard input"));
2031
2032   exit (EXIT_SUCCESS);
2033 }