src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 1995-2003 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <assert.h>
  24 #include <sys/types.h>
  25 #include <getopt.h>
  26
  27 #include "system.h"
  28 #include "error.h"
  29 #include "safe-read.h"
  30 #include "xstrtol.h"
  31
  32 /* The official name of this program (e.g., no `g' prefix).  */
  33 #define PROGRAM_NAME "tr"
  34
  35 #define WRITTEN_BY _("Written by Jim Meyering.")
  36
  37 #define N_CHARS (UCHAR_MAX + 1)
  38
  39 /* A pointer to a filtering function.  */
  40 typedef size_t (*Filter) (/* unsigned char *, size_t, Filter */);
  41
  42 /* Convert from character C to its index in the collating
  43    sequence array.  Just cast to an unsigned int to avoid
  44    problems with sign-extension.  */
  45 #define ORD(c) (unsigned int)(c)
  46
  47 /* The inverse of ORD.  */
  48 #define CHR(i) (unsigned char)(i)
  49
  50 /* The value for Spec_list->state that indicates to
  51    get_next that it should initialize the tail pointer.
  52    Its value should be as large as possible to avoid conflict
  53    a valid value for the state field -- and that may be as
  54    large as any valid repeat_count.  */
  55 #define BEGIN_STATE (INT_MAX - 1)
  56
  57 /* The value for Spec_list->state that indicates to
  58    get_next that the element pointed to by Spec_list->tail is
  59    being considered for the first time on this pass through the
  60    list -- it indicates that get_next should make any necessary
  61    initializations.  */
  62 #define NEW_ELEMENT (BEGIN_STATE + 1)
  63
  64 /* A value distinct from any character that may have been stored in a
  65    buffer as the result of a block-read in the function squeeze_filter.  */
  66 #define NOT_A_CHAR (unsigned int)(-1)
  67
  68 /* The following (but not CC_NO_CLASS) are indices into the array of
  69    valid character class strings.  */
  70 enum Char_class
  71   {
  72     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  73     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  74     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  75     CC_NO_CLASS = 9999
  76   };
  77
  78 /* Character class to which a character (returned by get_next) belonged;
  79    but it is set only if the construct from which the character was obtained
  80    was one of the character classes [:upper:] or [:lower:].  The value
  81    is used only when translating and then, only to make sure that upper
  82    and lower class constructs have the same relative positions in string1
  83    and string2.  */
  84 enum Upper_Lower_class
  85   {
  86     UL_LOWER = 0,
  87     UL_UPPER = 1,
  88     UL_NONE = 2
  89   };
  90
  91 /* A shortcut to ensure that when constructing the translation array,
  92    one of the values returned by paired calls to get_next (from s1 and s2)
  93    is from [:upper:] and the other is from [:lower:], or neither is from
  94    upper or lower.  By default, GNU tr permits the identity mappings: from
  95    [:upper:] to [:upper:] and [:lower:] to [:lower:].  But when
  96    POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
  97    by values of type enum Upper_Lower_class.  */
  98 static int const class_ok[3][3] =
  99 {
 100   {1, 1, 0},
 101   {1, 1, 0},
 102   {0, 0, 1}
 103 };
 104
 105 /* The type of a List_element.  See build_spec_list for more details.  */
 106 enum Range_element_type
 107   {
 108     RE_NO_TYPE = 0,
 109     RE_NORMAL_CHAR,
 110     RE_RANGE,
 111     RE_CHAR_CLASS,
 112     RE_EQUIV_CLASS,
 113     RE_REPEATED_CHAR
 114   };
 115
 116 /* One construct in one of tr's argument strings.
 117    For example, consider the POSIX version of the classic tr command:
 118        tr -cs 'a-zA-Z_' '[\n*]'
 119    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 120    and a single normal character, `_'.  String2 has one construct.  */
 121 struct List_element
 122   {
 123     enum Range_element_type type;
 124     struct List_element *next;
 125     union
 126       {
 127         int normal_char;
 128         struct                  /* unnamed */
 129           {
 130             unsigned int first_char;
 131             unsigned int last_char;
 132           }
 133         range;
 134         enum Char_class char_class;
 135         int equiv_code;
 136         struct                  /* unnamed */
 137           {
 138             unsigned int the_repeated_char;
 139             size_t repeat_count;
 140           }
 141         repeated_char;
 142       }
 143     u;
 144   };
 145
 146 /* Each of tr's argument strings is parsed into a form that is easier
 147    to work with: a linked list of constructs (struct List_element).
 148    Each Spec_list structure also encapsulates various attributes of
 149    the corresponding argument string.  The attributes are used mainly
 150    to verify that the strings are valid in the context of any options
 151    specified (like -s, -d, or -c).  The main exception is the member
 152    `tail', which is first used to construct the list.  After construction,
 153    it is used by get_next to save its state when traversing the list.
 154    The member `state' serves a similar function.  */
 155 struct Spec_list
 156   {
 157     /* Points to the head of the list of range elements.
 158        The first struct is a dummy; its members are never used.  */
 159     struct List_element *head;
 160
 161     /* When appending, points to the last element.  When traversing via
 162        get_next(), points to the element to process next.  Setting
 163        Spec_list.state to the value BEGIN_STATE before calling get_next
 164        signals get_next to initialize tail to point to head->next.  */
 165     struct List_element *tail;
 166
 167     /* Used to save state between calls to get_next().  */
 168     unsigned int state;
 169
 170     /* Length, in the sense that length ('a-z[:digit:]123abc')
 171        is 42 ( = 26 + 10 + 6).  */
 172     size_t length;
 173
 174     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 175     int n_indefinite_repeats;
 176
 177     /* If n_indefinite_repeats is nonzero, this points to the List_element
 178        corresponding to the last [c*] or [c*0] construct encountered in
 179        this spec.  Otherwise it is undefined.  */
 180     struct List_element *indefinite_repeat_element;
 181
 182     /* Non-zero if this spec contains at least one equivalence
 183        class construct e.g. [=c=].  */
 184     int has_equiv_class;
 185
 186     /* Non-zero if this spec contains at least one character class
 187        construct.  E.g. [:digit:].  */
 188     int has_char_class;
 189
 190     /* Non-zero if this spec contains at least one of the character class
 191        constructs (all but upper and lower) that aren't allowed in s2.  */
 192     int has_restricted_char_class;
 193   };
 194
 195 /* A representation for escaped string1 or string2.  As a string is parsed,
 196    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 197    etc.) are marked as such in this structure by setting the corresponding
 198    entry in the ESCAPED vector.  */
 199 struct E_string
 200 {
 201   unsigned char *s;
 202   int *escaped;
 203   size_t len;
 204 };
 205
 206 /* Return nonzero if the Ith character of escaped string ES matches C
 207    and is not escaped itself.  */
 208 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 209
 210 /* The name by which this program was run.  */
 211 char *program_name;
 212
 213 /* When nonzero, each sequence in the input of a repeated character
 214    (call it c) is replaced (in the output) by a single occurrence of c
 215    for every c in the squeeze set.  */
 216 static int squeeze_repeats = 0;
 217
 218 /* When nonzero, removes characters in the delete set from input.  */
 219 static int delete = 0;
 220
 221 /* Use the complement of set1 in place of set1.  */
 222 static int complement = 0;
 223
 224 /* When nonzero, this flag causes GNU tr to provide strict
 225    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 226    says that when -d is used without -s, string2 (if present)
 227    must be ignored.  Silently ignoring arguments is a bad idea.
 228    The default GNU behavior is to give a usage message and exit.
 229    Additionally, when this flag is nonzero, tr prints warnings
 230    on stderr if it is being used in a manner that is not portable.
 231    Applicable warnings are given by default, but are suppressed
 232    if the environment variable `POSIXLY_CORRECT' is set, since
 233    being POSIX conformant means we can't issue such messages.
 234    Warnings on the following topics are suppressed when this
 235    variable is nonzero:
 236    1. Ambiguous octal escapes.  */
 237 static int posix_pedantic;
 238
 239 /* When tr is performing translation and string1 is longer than string2,
 240    POSIX says that the result is undefined.  That gives the implementor
 241    of a POSIX conforming version of tr two reasonable choices for the
 242    semantics of this case.
 243
 244    * The BSD tr pads string2 to the length of string1 by
 245    repeating the last character in string2.
 246
 247    * System V tr ignores characters in string1 that have no
 248    corresponding character in string2.  That is, string1 is effectively
 249    truncated to the length of string2.
 250
 251    When nonzero, this flag causes GNU tr to imitate the behavior
 252    of System V tr when translating with string1 longer than string2.
 253    The default is to emulate BSD tr.  This flag is ignored in modes where
 254    no translation is performed.  Emulating the System V tr
 255    in this exceptional case causes the relatively common BSD idiom:
 256
 257        tr -cs A-Za-z0-9 '\012'
 258
 259    to break (it would convert only zero bytes, rather than all
 260    non-alphanumerics, to newlines).
 261
 262    WARNING: This switch does not provide general BSD or System V
 263    compatibility.  For example, it doesn't disable the interpretation
 264    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 265    some unfortunate coincidence you use such constructs in scripts
 266    expecting to use some other version of tr, the scripts will break.  */
 267 static int truncate_set1 = 0;
 268
 269 /* An alias for (!delete && non_option_args == 2).
 270    It is set in main and used there and in validate().  */
 271 static int translating;
 272
 273 static unsigned char io_buf[BUFSIZ];
 274
 275 static char const *const char_class_name[] =
 276 {
 277   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 278   "lower", "print", "punct", "space", "upper", "xdigit"
 279 };
 280 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 281
 282 typedef char SET_TYPE;
 283
 284 /* Array of boolean values.  A character `c' is a member of the
 285    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 286    set is defined by the last (possibly, the only) string argument
 287    on the command line when the squeeze option is given.  */
 288 static SET_TYPE in_squeeze_set[N_CHARS];
 289
 290 /* Array of boolean values.  A character `c' is a member of the
 291    delete set if and only if in_delete_set[c] is true.  The delete
 292    set is defined by the first (or only) string argument on the
 293    command line when the delete option is given.  */
 294 static SET_TYPE in_delete_set[N_CHARS];
 295
 296 /* Array of character values defining the translation (if any) that
 297    tr is to perform.  Translation is performed only when there are
 298    two specification strings and the delete switch is not given.  */
 299 static char xlate[N_CHARS];
 300
 301 static struct option const long_options[] =
 302 {
 303   {"complement", no_argument, NULL, 'c'},
 304   {"delete", no_argument, NULL, 'd'},
 305   {"squeeze-repeats", no_argument, NULL, 's'},
 306   {"truncate-set1", no_argument, NULL, 't'},
 307   {GETOPT_HELP_OPTION_DECL},
 308   {GETOPT_VERSION_OPTION_DECL},
 309   {NULL, 0, NULL, 0}
 310 };
 311 \f
 312 void
 313 usage (int status)
 314 {
 315   if (status != 0)
 316     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 317              program_name);
 318   else
 319     {
 320       printf (_("\
 321 Usage: %s [OPTION]... SET1 [SET2]\n\
 322 "),
 323               program_name);
 324       fputs (_("\
 325 Translate, squeeze, and/or delete characters from standard input,\n\
 326 writing to standard output.\n\
 327 \n\
 328   -c, --complement        first complement SET1\n\
 329   -d, --delete            delete characters in SET1, do not translate\n\
 330   -s, --squeeze-repeats   replace each input sequence of a repeated character\n\
 331                             that is listed in SET1 with a single occurrence\n\
 332                             of that character\n\
 333   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 334 "), stdout);
 335       fputs (HELP_OPTION_DESCRIPTION, stdout);
 336       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 337       fputs (_("\
 338 \n\
 339 SETs are specified as strings of characters.  Most represent themselves.\n\
 340 Interpreted sequences are:\n\
 341 \n\
 342   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 343   \\\\              backslash\n\
 344   \\a              audible BEL\n\
 345   \\b              backspace\n\
 346   \\f              form feed\n\
 347   \\n              new line\n\
 348   \\r              return\n\
 349   \\t              horizontal tab\n\
 350 "), stdout);
 351      fputs (_("\
 352   \\v              vertical tab\n\
 353   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 354   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 355   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 356   [:alnum:]       all letters and digits\n\
 357   [:alpha:]       all letters\n\
 358   [:blank:]       all horizontal whitespace\n\
 359   [:cntrl:]       all control characters\n\
 360   [:digit:]       all digits\n\
 361 "), stdout);
 362      fputs (_("\
 363   [:graph:]       all printable characters, not including space\n\
 364   [:lower:]       all lower case letters\n\
 365   [:print:]       all printable characters, including space\n\
 366   [:punct:]       all punctuation characters\n\
 367   [:space:]       all horizontal or vertical whitespace\n\
 368   [:upper:]       all upper case letters\n\
 369   [:xdigit:]      all hexadecimal digits\n\
 370   [=CHAR=]        all characters which are equivalent to CHAR\n\
 371 "), stdout);
 372      fputs (_("\
 373 \n\
 374 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 375 -t may be used only when translating.  SET2 is extended to length of\n\
 376 SET1 by repeating its last character as necessary.  \
 377 "), stdout);
 378      fputs (_("\
 379 Excess characters\n\
 380 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 381 expand in ascending order; used in SET2 while translating, they may\n\
 382 only be used in pairs to specify case conversion.  \
 383 "), stdout);
 384      fputs (_("\
 385 -s uses SET1 if not\n\
 386 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 387 translation or deletion.\n\
 388 "), stdout);
 389       printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 390     }
 391   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 392 }
 393
 394 /* Return nonzero if the character C is a member of the
 395    equivalence class containing the character EQUIV_CLASS.  */
 396
 397 static int
 398 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
 399 {
 400   return (equiv_class == c);
 401 }
 402
 403 /* Return nonzero if the character C is a member of the
 404    character class CHAR_CLASS.  */
 405
 406 static int
 407 is_char_class_member (enum Char_class char_class, unsigned int c)
 408 {
 409   int result;
 410
 411   switch (char_class)
 412     {
 413     case CC_ALNUM:
 414       result = ISALNUM (c);
 415       break;
 416     case CC_ALPHA:
 417       result = ISALPHA (c);
 418       break;
 419     case CC_BLANK:
 420       result = ISBLANK (c);
 421       break;
 422     case CC_CNTRL:
 423       result = ISCNTRL (c);
 424       break;
 425     case CC_DIGIT:
 426       result = ISDIGIT_LOCALE (c);
 427       break;
 428     case CC_GRAPH:
 429       result = ISGRAPH (c);
 430       break;
 431     case CC_LOWER:
 432       result = ISLOWER (c);
 433       break;
 434     case CC_PRINT:
 435       result = ISPRINT (c);
 436       break;
 437     case CC_PUNCT:
 438       result = ISPUNCT (c);
 439       break;
 440     case CC_SPACE:
 441       result = ISSPACE (c);
 442       break;
 443     case CC_UPPER:
 444       result = ISUPPER (c);
 445       break;
 446     case CC_XDIGIT:
 447       result = ISXDIGIT (c);
 448       break;
 449     default:
 450       abort ();
 451       break;
 452     }
 453   return result;
 454 }
 455
 456 static void
 457 es_free (struct E_string *es)
 458 {
 459   free (es->s);
 460   free (es->escaped);
 461 }
 462
 463 /* Perform the first pass over each range-spec argument S, converting all
 464    \c and \ddd escapes to their one-byte representations.  The conversion
 465    is done in-place, so S must point to writable storage.  If an invalid
 466    quote sequence is found print an error message and return nonzero.
 467    Otherwise set *LEN to the length of the resulting string and return
 468    zero.  The resulting array of characters may contain zero-bytes;
 469    however, on input, S is assumed to be null-terminated, and hence
 470    cannot contain actual (non-escaped) zero bytes.  */
 471
 472 static int
 473 unquote (const unsigned char *s, struct E_string *es)
 474 {
 475   size_t i, j;
 476   size_t len;
 477
 478   len = strlen ((char *) s);
 479
 480   es->s = xmalloc (len);
 481   es->escaped = xmalloc (len * sizeof (es->escaped[0]));
 482   for (i = 0; i < len; i++)
 483     es->escaped[i] = 0;
 484
 485   j = 0;
 486   for (i = 0; s[i]; i++)
 487     {
 488       switch (s[i])
 489         {
 490           int c;
 491         case '\\':
 492           switch (s[i + 1])
 493             {
 494               int oct_digit;
 495             case '\\':
 496               c = '\\';
 497               break;
 498             case 'a':
 499               c = '\007';
 500               break;
 501             case 'b':
 502               c = '\b';
 503               break;
 504             case 'f':
 505               c = '\f';
 506               break;
 507             case 'n':
 508               c = '\n';
 509               break;
 510             case 'r':
 511               c = '\r';
 512               break;
 513             case 't':
 514               c = '\t';
 515               break;
 516             case 'v':
 517               c = '\v';
 518               break;
 519             case '0':
 520             case '1':
 521             case '2':
 522             case '3':
 523             case '4':
 524             case '5':
 525             case '6':
 526             case '7':
 527               c = s[i + 1] - '0';
 528               oct_digit = s[i + 2] - '0';
 529               if (0 <= oct_digit && oct_digit <= 7)
 530                 {
 531                   c = 8 * c + oct_digit;
 532                   ++i;
 533                   oct_digit = s[i + 2] - '0';
 534                   if (0 <= oct_digit && oct_digit <= 7)
 535                     {
 536                       if (8 * c + oct_digit < N_CHARS)
 537                         {
 538                           c = 8 * c + oct_digit;
 539                           ++i;
 540                         }
 541                       else if (!posix_pedantic)
 542                         {
 543                           /* A 3-digit octal number larger than \377 won't
 544                              fit in 8 bits.  So we stop when adding the
 545                              next digit would put us over the limit and
 546                              give a warning about the ambiguity.  POSIX
 547                              isn't clear on this, but one person has said
 548                              that in his interpretation, POSIX says tr
 549                              can't even give a warning.  */
 550                           error (0, 0, _("warning: the ambiguous octal escape \
 551 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 552                                  s[i], s[i + 1], s[i + 2],
 553                                  s[i], s[i + 1], s[i + 2]);
 554                         }
 555                     }
 556                 }
 557               break;
 558             case '\0':
 559               error (0, 0, _("invalid backslash escape at end of string"));
 560               return 1;
 561
 562             default:
 563               if (posix_pedantic)
 564                 {
 565                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 566                   return 1;
 567                 }
 568               else
 569                 {
 570                   c = s[i + 1];
 571                   es->escaped[j] = 1;
 572                 }
 573             }
 574           ++i;
 575           es->s[j++] = c;
 576           break;
 577         default:
 578           es->s[j++] = s[i];
 579           break;
 580         }
 581     }
 582   es->len = j;
 583   return 0;
 584 }
 585
 586 /* If CLASS_STR is a valid character class string, return its index
 587    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 588
 589 static enum Char_class
 590 look_up_char_class (const unsigned char *class_str, size_t len)
 591 {
 592   unsigned int i;
 593
 594   for (i = 0; i < N_CHAR_CLASSES; i++)
 595     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 596         && strlen (char_class_name[i]) == len)
 597       return (enum Char_class) i;
 598   return CC_NO_CLASS;
 599 }
 600
 601 /* Return a newly allocated string with a printable version of C.
 602    This function is used solely for formatting error messages.  */
 603
 604 static char *
 605 make_printable_char (unsigned int c)
 606 {
 607   char *buf = xmalloc (5);
 608
 609   assert (c < N_CHARS);
 610   if (ISPRINT (c))
 611     {
 612       buf[0] = c;
 613       buf[1] = '\0';
 614     }
 615   else
 616     {
 617       sprintf (buf, "\\%03o", c);
 618     }
 619   return buf;
 620 }
 621
 622 /* Return a newly allocated copy of S which is suitable for printing.
 623    LEN is the number of characters in S.  Most non-printing
 624    (isprint) characters are represented by a backslash followed by
 625    3 octal digits.  However, the characters represented by \c escapes
 626    where c is one of [abfnrtv] are represented by their 2-character \c
 627    sequences.  This function is used solely for printing error messages.  */
 628
 629 static char *
 630 make_printable_str (const unsigned char *s, size_t len)
 631 {
 632   /* Worst case is that every character expands to a backslash
 633      followed by a 3-character octal escape sequence.  */
 634   char *printable_buf = xmalloc (4 * len + 1);
 635   char *p = printable_buf;
 636   size_t i;
 637
 638   for (i = 0; i < len; i++)
 639     {
 640       char buf[5];
 641       char *tmp = NULL;
 642
 643       switch (s[i])
 644         {
 645         case '\\':
 646           tmp = "\\";
 647           break;
 648         case '\007':
 649           tmp = "\\a";
 650           break;
 651         case '\b':
 652           tmp = "\\b";
 653           break;
 654         case '\f':
 655           tmp = "\\f";
 656           break;
 657         case '\n':
 658           tmp = "\\n";
 659           break;
 660         case '\r':
 661           tmp = "\\r";
 662           break;
 663         case '\t':
 664           tmp = "\\t";
 665           break;
 666         case '\v':
 667           tmp = "\\v";
 668           break;
 669         default:
 670           if (ISPRINT (s[i]))
 671             {
 672               buf[0] = s[i];
 673               buf[1] = '\0';
 674             }
 675           else
 676             sprintf (buf, "\\%03o", s[i]);
 677           tmp = buf;
 678           break;
 679         }
 680       p = stpcpy (p, tmp);
 681     }
 682   return printable_buf;
 683 }
 684
 685 /* Append a newly allocated structure representing a
 686    character C to the specification list LIST.  */
 687
 688 static void
 689 append_normal_char (struct Spec_list *list, unsigned int c)
 690 {
 691   struct List_element *new;
 692
 693   new = xmalloc (sizeof (struct List_element));
 694   new->next = NULL;
 695   new->type = RE_NORMAL_CHAR;
 696   new->u.normal_char = c;
 697   assert (list->tail);
 698   list->tail->next = new;
 699   list->tail = new;
 700 }
 701
 702 /* Append a newly allocated structure representing the range
 703    of characters from FIRST to LAST to the specification list LIST.
 704    Return nonzero if LAST precedes FIRST in the collating sequence,
 705    zero otherwise.  This means that '[c-c]' is acceptable.  */
 706
 707 static int
 708 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 709 {
 710   struct List_element *new;
 711
 712   if (ORD (first) > ORD (last))
 713     {
 714       char *tmp1 = make_printable_char (first);
 715       char *tmp2 = make_printable_char (last);
 716
 717       error (0, 0,
 718        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 719              tmp1, tmp2);
 720       free (tmp1);
 721       free (tmp2);
 722       return 1;
 723     }
 724   new = xmalloc (sizeof (struct List_element));
 725   new->next = NULL;
 726   new->type = RE_RANGE;
 727   new->u.range.first_char = first;
 728   new->u.range.last_char = last;
 729   assert (list->tail);
 730   list->tail->next = new;
 731   list->tail = new;
 732   return 0;
 733 }
 734
 735 /* If CHAR_CLASS_STR is a valid character class string, append a
 736    newly allocated structure representing that character class to the end
 737    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 738    a valid string return nonzero.  */
 739
 740 static int
 741 append_char_class (struct Spec_list *list,
 742                    const unsigned char *char_class_str, size_t len)
 743 {
 744   enum Char_class char_class;
 745   struct List_element *new;
 746
 747   char_class = look_up_char_class (char_class_str, len);
 748   if (char_class == CC_NO_CLASS)
 749     return 1;
 750   new = xmalloc (sizeof (struct List_element));
 751   new->next = NULL;
 752   new->type = RE_CHAR_CLASS;
 753   new->u.char_class = char_class;
 754   assert (list->tail);
 755   list->tail->next = new;
 756   list->tail = new;
 757   return 0;
 758 }
 759
 760 /* Append a newly allocated structure representing a [c*n]
 761    repeated character construct to the specification list LIST.
 762    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 763    is a non-negative repeat count.  */
 764
 765 static void
 766 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 767                       size_t repeat_count)
 768 {
 769   struct List_element *new;
 770
 771   new = xmalloc (sizeof (struct List_element));
 772   new->next = NULL;
 773   new->type = RE_REPEATED_CHAR;
 774   new->u.repeated_char.the_repeated_char = the_char;
 775   new->u.repeated_char.repeat_count = repeat_count;
 776   assert (list->tail);
 777   list->tail->next = new;
 778   list->tail = new;
 779 }
 780
 781 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 782    the length of that string, LEN, if LEN is exactly one, append
 783    a newly allocated structure representing the specified
 784    equivalence class to the specification list, LIST and return zero.
 785    If LEN is not 1, return nonzero.  */
 786
 787 static int
 788 append_equiv_class (struct Spec_list *list,
 789                     const unsigned char *equiv_class_str, size_t len)
 790 {
 791   struct List_element *new;
 792
 793   if (len != 1)
 794     return 1;
 795   new = xmalloc (sizeof (struct List_element));
 796   new->next = NULL;
 797   new->type = RE_EQUIV_CLASS;
 798   new->u.equiv_code = *equiv_class_str;
 799   assert (list->tail);
 800   list->tail->next = new;
 801   list->tail = new;
 802   return 0;
 803 }
 804
 805 /* Return a newly allocated copy of the LEN-byte prefix of P.
 806    The returned string may contain NUL bytes and is *not* NUL-terminated.  */
 807
 808 static unsigned char *
 809 xmemdup (const unsigned char *p, size_t len)
 810 {
 811   unsigned char *tmp = xmalloc (len);
 812
 813   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 814   memcpy (tmp, p, len);
 815   return tmp;
 816 }
 817
 818 /* Search forward starting at START_IDX for the 2-char sequence
 819    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 820    a sequence is found, set *RESULT_IDX to the index of the first
 821    character and return nonzero. Otherwise return zero.  P may contain
 822    zero bytes.  */
 823
 824 static int
 825 find_closing_delim (const struct E_string *es, size_t start_idx,
 826                     unsigned int pre_bracket_char, size_t *result_idx)
 827 {
 828   size_t i;
 829
 830   for (i = start_idx; i < es->len - 1; i++)
 831     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 832         && !es->escaped[i] && !es->escaped[i + 1])
 833       {
 834         *result_idx = i;
 835         return 1;
 836       }
 837   return 0;
 838 }
 839
 840 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 841    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 842    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 843    and return zero. If the second character following
 844    the opening bracket is not `*' or if no closing bracket can be
 845    found, return -1.  If a closing bracket is found and the
 846    second char is `*', but the string between the `*' and `]' isn't
 847    empty, an octal number, or a decimal number, print an error message
 848    and return -2.  */
 849
 850 static int
 851 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 852                        unsigned int *char_to_repeat, size_t *repeat_count,
 853                        size_t *closing_bracket_idx)
 854 {
 855   size_t i;
 856
 857   assert (start_idx + 1 < es->len);
 858   if (!ES_MATCH (es, start_idx + 1, '*'))
 859     return -1;
 860
 861   for (i = start_idx + 2; i < es->len; i++)
 862     {
 863       if (ES_MATCH (es, i, ']'))
 864         {
 865           size_t digit_str_len = i - start_idx - 2;
 866
 867           *char_to_repeat = es->s[start_idx];
 868           if (digit_str_len == 0)
 869             {
 870               /* We've matched [c*] -- no explicit repeat count.  */
 871               *repeat_count = 0;
 872               *closing_bracket_idx = i;
 873               return 0;
 874             }
 875
 876           /* Here, we have found [c*s] where s should be a string
 877              of octal (if it starts with `0') or decimal digits.  */
 878           {
 879             const char *digit_str = (const char *) &es->s[start_idx + 2];
 880             unsigned long int tmp_ulong;
 881             char *d_end;
 882             int base = 10;
 883             /* Select the base manually so we can be sure it's either 8 or 10.
 884                If the spec allowed it to be interpreted as hexadecimal, we
 885                could have used `0' and let xstrtoul decide.  */
 886             if (*digit_str == '0')
 887               {
 888                 base = 8;
 889                 ++digit_str;
 890                 --digit_str_len;
 891               }
 892             if (xstrtoul (digit_str, &d_end, base, &tmp_ulong, NULL)
 893                   != LONGINT_OK
 894                 || BEGIN_STATE < tmp_ulong
 895                 || digit_str + digit_str_len != d_end)
 896               {
 897                 char *tmp = make_printable_str (es->s + start_idx + 2,
 898                                                 i - start_idx - 2);
 899                 error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 900                        tmp);
 901                 free (tmp);
 902                 return -2;
 903               }
 904             *repeat_count = tmp_ulong;
 905           }
 906           *closing_bracket_idx = i;
 907           return 0;
 908         }
 909     }
 910   return -1;                    /* No bracket found.  */
 911 }
 912
 913 /* Return nonzero if the string at ES->s[IDX] matches the regular
 914    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 915    the `]' must not be escaped.  */
 916
 917 static int
 918 star_digits_closebracket (const struct E_string *es, size_t idx)
 919 {
 920   size_t i;
 921
 922   if (!ES_MATCH (es, idx, '*'))
 923     return 0;
 924
 925   for (i = idx + 1; i < es->len; i++)
 926     {
 927       if (!ISDIGIT (es->s[i]))
 928         {
 929           if (ES_MATCH (es, i, ']'))
 930             return 1;
 931           return 0;
 932         }
 933     }
 934   return 0;
 935 }
 936
 937 /* Convert string UNESCAPED_STRING (which has been preprocessed to
 938    convert backslash-escape sequences) of length LEN characters into
 939    a linked list of the following 5 types of constructs:
 940       - [:str:] Character class where `str' is one of the 12 valid strings.
 941       - [=c=] Equivalence class where `c' is any single character.
 942       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 943           However, if `n' is present, it must be a non-negative octal or
 944           decimal integer.
 945       - r-s Range of characters from `r' to `s'.  The second endpoint must
 946           not precede the first in the current collating sequence.
 947       - c Any other character is interpreted as itself.  */
 948
 949 static int
 950 build_spec_list (const struct E_string *es, struct Spec_list *result)
 951 {
 952   const unsigned char *p;
 953   size_t i;
 954
 955   p = es->s;
 956
 957   /* The main for-loop below recognizes the 4 multi-character constructs.
 958      A character that matches (in its context) none of the multi-character
 959      constructs is classified as `normal'.  Since all multi-character
 960      constructs have at least 3 characters, any strings of length 2 or
 961      less are composed solely of normal characters.  Hence, the index of
 962      the outer for-loop runs only as far as LEN-2.  */
 963
 964   for (i = 0; i + 2 < es->len; /* empty */)
 965     {
 966       if (ES_MATCH (es, i, '['))
 967         {
 968           int matched_multi_char_construct;
 969           size_t closing_bracket_idx;
 970           unsigned int char_to_repeat;
 971           size_t repeat_count;
 972           int err;
 973
 974           matched_multi_char_construct = 1;
 975           if (ES_MATCH (es, i + 1, ':')
 976               || ES_MATCH (es, i + 1, '='))
 977             {
 978               size_t closing_delim_idx;
 979               int found;
 980
 981               found = find_closing_delim (es, i + 2, p[i + 1],
 982                                           &closing_delim_idx);
 983               if (found)
 984                 {
 985                   int parse_failed;
 986                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
 987                   unsigned char *opnd_str;
 988
 989                   if (opnd_str_len == 0)
 990                     {
 991                       if (p[i + 1] == ':')
 992                         error (0, 0, _("missing character class name `[::]'"));
 993                       else
 994                         error (0, 0,
 995                                _("missing equivalence class character `[==]'"));
 996                       return 1;
 997                     }
 998
 999                   opnd_str = xmemdup (p + i + 2, opnd_str_len);
1000
1001                   if (p[i + 1] == ':')
1002                     {
1003                       parse_failed = append_char_class (result, opnd_str,
1004                                                         opnd_str_len);
1005
1006                       /* FIXME: big comment.  */
1007                       if (parse_failed)
1008                         {
1009                           if (star_digits_closebracket (es, i + 2))
1010                             {
1011                               free (opnd_str);
1012                               goto try_bracketed_repeat;
1013                             }
1014                           else
1015                             {
1016                               char *tmp = make_printable_str (opnd_str,
1017                                                               opnd_str_len);
1018                               error (0, 0, _("invalid character class `%s'"),
1019                                      tmp);
1020                               free (tmp);
1021                               return 1;
1022                             }
1023                         }
1024                     }
1025                   else
1026                     {
1027                       parse_failed = append_equiv_class (result, opnd_str,
1028                                                          opnd_str_len);
1029
1030                       /* FIXME: big comment.  */
1031                       if (parse_failed)
1032                         {
1033                           if (star_digits_closebracket (es, i + 2))
1034                             {
1035                               free (opnd_str);
1036                               goto try_bracketed_repeat;
1037                             }
1038                           else
1039                             {
1040                               char *tmp = make_printable_str (opnd_str,
1041                                                               opnd_str_len);
1042                               error (0, 0,
1043                _("%s: equivalence class operand must be a single character"),
1044                                      tmp);
1045                               free (tmp);
1046                               return 1;
1047                             }
1048                         }
1049                     }
1050                   free (opnd_str);
1051
1052                   /* Return nonzero if append_*_class reports a problem.  */
1053                   if (parse_failed)
1054                     return 1;
1055                   else
1056                     i = closing_delim_idx + 2;
1057                   continue;
1058                 }
1059               /* Else fall through.  This could be [:*] or [=*].  */
1060             }
1061
1062         try_bracketed_repeat:
1063
1064           /* Determine whether this is a bracketed repeat range
1065              matching the RE \[.\*(dec_or_oct_number)?\].  */
1066           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1067                                        &repeat_count,
1068                                        &closing_bracket_idx);
1069           if (err == 0)
1070             {
1071               append_repeated_char (result, char_to_repeat, repeat_count);
1072               i = closing_bracket_idx + 1;
1073             }
1074           else if (err == -1)
1075             {
1076               matched_multi_char_construct = 0;
1077             }
1078           else
1079             {
1080               /* Found a string that looked like [c*n] but the
1081                  numeric part was invalid.  */
1082               return 1;
1083             }
1084
1085           if (matched_multi_char_construct)
1086             continue;
1087
1088           /* We reach this point if P does not match [:str:], [=c=],
1089              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1090              (from `[' to `c').  */
1091         }
1092
1093       /* Look ahead one char for ranges like a-z.  */
1094       if (ES_MATCH (es, i + 1, '-'))
1095         {
1096           if (append_range (result, p[i], p[i + 2]))
1097             return 1;
1098           i += 3;
1099         }
1100       else
1101         {
1102           append_normal_char (result, p[i]);
1103           ++i;
1104         }
1105     }
1106
1107   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1108   for (; i < es->len; i++)
1109     append_normal_char (result, p[i]);
1110
1111   return 0;
1112 }
1113
1114 /* Given a Spec_list S (with its saved state implicit in the values
1115    of its members `tail' and `state'), return the next single character
1116    in the expansion of S's constructs.  If the last character of S was
1117    returned on the previous call or if S was empty, this function
1118    returns -1.  For example, successive calls to get_next where S
1119    represents the spec-string 'a-d[y*3]' will return the sequence
1120    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1121    which the returned character comes is [:upper:] or [:lower:], the
1122    parameter CLASS is given a value to indicate which it was.  Otherwise
1123    CLASS is set to UL_NONE.  This value is used only when constructing
1124    the translation table to verify that any occurrences of upper and
1125    lower class constructs in the spec-strings appear in the same relative
1126    positions.  */
1127
1128 static int
1129 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1130 {
1131   struct List_element *p;
1132   int return_val;
1133   int i;
1134
1135   if (class)
1136     *class = UL_NONE;
1137
1138   if (s->state == BEGIN_STATE)
1139     {
1140       s->tail = s->head->next;
1141       s->state = NEW_ELEMENT;
1142     }
1143
1144   p = s->tail;
1145   if (p == NULL)
1146     return -1;
1147
1148   switch (p->type)
1149     {
1150     case RE_NORMAL_CHAR:
1151       return_val = p->u.normal_char;
1152       s->state = NEW_ELEMENT;
1153       s->tail = p->next;
1154       break;
1155
1156     case RE_RANGE:
1157       if (s->state == NEW_ELEMENT)
1158         s->state = ORD (p->u.range.first_char);
1159       else
1160         ++(s->state);
1161       return_val = CHR (s->state);
1162       if (s->state == ORD (p->u.range.last_char))
1163         {
1164           s->tail = p->next;
1165           s->state = NEW_ELEMENT;
1166         }
1167       break;
1168
1169     case RE_CHAR_CLASS:
1170       if (class)
1171         {
1172           int upper_or_lower;
1173           switch (p->u.char_class)
1174             {
1175             case CC_LOWER:
1176               *class = UL_LOWER;
1177               upper_or_lower = 1;
1178               break;
1179             case CC_UPPER:
1180               *class = UL_UPPER;
1181               upper_or_lower = 1;
1182               break;
1183             default:
1184               upper_or_lower = 0;
1185               break;
1186             }
1187
1188           if (upper_or_lower)
1189             {
1190               s->tail = p->next;
1191               s->state = NEW_ELEMENT;
1192               return_val = 0;
1193               break;
1194             }
1195         }
1196
1197       if (s->state == NEW_ELEMENT)
1198         {
1199           for (i = 0; i < N_CHARS; i++)
1200             if (is_char_class_member (p->u.char_class, i))
1201               break;
1202           assert (i < N_CHARS);
1203           s->state = i;
1204         }
1205       assert (is_char_class_member (p->u.char_class, s->state));
1206       return_val = CHR (s->state);
1207       for (i = s->state + 1; i < N_CHARS; i++)
1208         if (is_char_class_member (p->u.char_class, i))
1209           break;
1210       if (i < N_CHARS)
1211         s->state = i;
1212       else
1213         {
1214           s->tail = p->next;
1215           s->state = NEW_ELEMENT;
1216         }
1217       break;
1218
1219     case RE_EQUIV_CLASS:
1220       /* FIXME: this assumes that each character is alone in its own
1221          equivalence class (which appears to be correct for my
1222          LC_COLLATE.  But I don't know of any function that allows
1223          one to determine a character's equivalence class.  */
1224
1225       return_val = p->u.equiv_code;
1226       s->state = NEW_ELEMENT;
1227       s->tail = p->next;
1228       break;
1229
1230     case RE_REPEATED_CHAR:
1231       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1232       if (p->u.repeated_char.repeat_count == 0)
1233         {
1234           s->tail = p->next;
1235           s->state = NEW_ELEMENT;
1236           return_val = get_next (s, class);
1237         }
1238       else
1239         {
1240           if (s->state == NEW_ELEMENT)
1241             {
1242               s->state = 0;
1243             }
1244           ++(s->state);
1245           return_val = p->u.repeated_char.the_repeated_char;
1246           if (p->u.repeated_char.repeat_count > 0
1247               && s->state == p->u.repeated_char.repeat_count)
1248             {
1249               s->tail = p->next;
1250               s->state = NEW_ELEMENT;
1251             }
1252         }
1253       break;
1254
1255     case RE_NO_TYPE:
1256       abort ();
1257       break;
1258
1259     default:
1260       abort ();
1261       break;
1262     }
1263
1264   return return_val;
1265 }
1266
1267 /* This is a minor kludge.  This function is called from
1268    get_spec_stats to determine the cardinality of a set derived
1269    from a complemented string.  It's a kludge in that some of the
1270    same operations are (duplicated) performed in set_initialize.  */
1271
1272 static int
1273 card_of_complement (struct Spec_list *s)
1274 {
1275   int c;
1276   int cardinality = N_CHARS;
1277   SET_TYPE in_set[N_CHARS];
1278
1279   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1280   s->state = BEGIN_STATE;
1281   while ((c = get_next (s, NULL)) != -1)
1282     if (!in_set[c]++)
1283       --cardinality;
1284   return cardinality;
1285 }
1286
1287 /* Gather statistics about the spec-list S in preparation for the tests
1288    in validate that determine the consistency of the specs.  This function
1289    is called at most twice; once for string1, and again for any string2.
1290    LEN_S1 < 0 indicates that this is the first call and that S represents
1291    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1292    constructs in string1, and we can use its value to resolve any
1293    indefinite repeat construct in S (which represents string2).  Hence,
1294    this function has the side-effect that it converts a valid [c*]
1295    construct in string2 to [c*n] where n is large enough (or 0) to give
1296    string2 the same length as string1.  For example, with the command
1297    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1298    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1299
1300 static void
1301 get_spec_stats (struct Spec_list *s)
1302 {
1303   struct List_element *p;
1304   int len = 0;
1305
1306   s->n_indefinite_repeats = 0;
1307   s->has_equiv_class = 0;
1308   s->has_restricted_char_class = 0;
1309   s->has_char_class = 0;
1310   for (p = s->head->next; p; p = p->next)
1311     {
1312       switch (p->type)
1313         {
1314           int i;
1315         case RE_NORMAL_CHAR:
1316           ++len;
1317           break;
1318
1319         case RE_RANGE:
1320           assert (p->u.range.last_char >= p->u.range.first_char);
1321           len += p->u.range.last_char - p->u.range.first_char + 1;
1322           break;
1323
1324         case RE_CHAR_CLASS:
1325           s->has_char_class = 1;
1326           for (i = 0; i < N_CHARS; i++)
1327             if (is_char_class_member (p->u.char_class, i))
1328               ++len;
1329           switch (p->u.char_class)
1330             {
1331             case CC_UPPER:
1332             case CC_LOWER:
1333               break;
1334             default:
1335               s->has_restricted_char_class = 1;
1336               break;
1337             }
1338           break;
1339
1340         case RE_EQUIV_CLASS:
1341           for (i = 0; i < N_CHARS; i++)
1342             if (is_equiv_class_member (p->u.equiv_code, i))
1343               ++len;
1344           s->has_equiv_class = 1;
1345           break;
1346
1347         case RE_REPEATED_CHAR:
1348           if (p->u.repeated_char.repeat_count > 0)
1349             len += p->u.repeated_char.repeat_count;
1350           else if (p->u.repeated_char.repeat_count == 0)
1351             {
1352               s->indefinite_repeat_element = p;
1353               ++(s->n_indefinite_repeats);
1354             }
1355           break;
1356
1357         case RE_NO_TYPE:
1358           assert (0);
1359           break;
1360         }
1361     }
1362
1363   s->length = len;
1364 }
1365
1366 static void
1367 get_s1_spec_stats (struct Spec_list *s1)
1368 {
1369   get_spec_stats (s1);
1370   if (complement)
1371     s1->length = card_of_complement (s1);
1372 }
1373
1374 static void
1375 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1376 {
1377   get_spec_stats (s2);
1378   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1379     {
1380       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1381         len_s1 - s2->length;
1382       s2->length = len_s1;
1383     }
1384 }
1385
1386 static void
1387 spec_init (struct Spec_list *spec_list)
1388 {
1389   spec_list->head = spec_list->tail =
1390     xmalloc (sizeof (struct List_element));
1391   spec_list->head->next = NULL;
1392 }
1393
1394 /* This function makes two passes over the argument string S.  The first
1395    one converts all \c and \ddd escapes to their one-byte representations.
1396    The second constructs a linked specification list, SPEC_LIST, of the
1397    characters and constructs that comprise the argument string.  If either
1398    of these passes detects an error, this function returns nonzero.  */
1399
1400 static int
1401 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1402 {
1403   struct E_string es;
1404   int fail;
1405
1406   fail = unquote (s, &es);
1407   if (!fail)
1408     fail = build_spec_list (&es, spec_list);
1409   es_free (&es);
1410   return fail;
1411 }
1412
1413 /* Given two specification lists, S1 and S2, and assuming that
1414    S1->length > S2->length, append a single [c*n] element to S2 where c
1415    is the last character in the expansion of S2 and n is the difference
1416    between the two lengths.
1417    Upon successful completion, S2->length is set to S1->length.  The only
1418    way this function can fail to make S2 as long as S1 is when S2 has
1419    zero-length, since in that case, there is no last character to repeat.
1420    So S2->length is required to be at least 1.
1421
1422    Providing this functionality allows the user to do some pretty
1423    non-BSD (and non-portable) things:  For example, the command
1424        tr -cs '[:upper:]0-9' '[:lower:]'
1425    is almost guaranteed to give results that depend on your collating
1426    sequence.  */
1427
1428 static void
1429 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1430 {
1431   struct List_element *p;
1432   int char_to_repeat;
1433   int i;
1434
1435   assert (translating);
1436   assert (s1->length > s2->length);
1437   assert (s2->length > 0);
1438
1439   p = s2->tail;
1440   switch (p->type)
1441     {
1442     case RE_NORMAL_CHAR:
1443       char_to_repeat = p->u.normal_char;
1444       break;
1445     case RE_RANGE:
1446       char_to_repeat = p->u.range.last_char;
1447       break;
1448     case RE_CHAR_CLASS:
1449       for (i = N_CHARS; i >= 0; i--)
1450         if (is_char_class_member (p->u.char_class, i))
1451           break;
1452       assert (i >= 0);
1453       char_to_repeat = CHR (i);
1454       break;
1455
1456     case RE_REPEATED_CHAR:
1457       char_to_repeat = p->u.repeated_char.the_repeated_char;
1458       break;
1459
1460     case RE_EQUIV_CLASS:
1461       /* This shouldn't happen, because validate exits with an error
1462          if it finds an equiv class in string2 when translating.  */
1463       abort ();
1464       break;
1465
1466     case RE_NO_TYPE:
1467       abort ();
1468       break;
1469
1470     default:
1471       abort ();
1472       break;
1473     }
1474
1475   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1476   s2->length = s1->length;
1477 }
1478
1479 /* Return non-zero if S is a non-empty list in which exactly one
1480    character (but potentially, many instances of it) appears.
1481    E.g.  [X*] or xxxxxxxx.  */
1482
1483 static int
1484 homogeneous_spec_list (struct Spec_list *s)
1485 {
1486   int b, c;
1487
1488   s->state = BEGIN_STATE;
1489
1490   if ((b = get_next (s, NULL)) == -1)
1491     return 0;
1492
1493   while ((c = get_next (s, NULL)) != -1)
1494     if (c != b)
1495       return 0;
1496
1497   return 1;
1498 }
1499
1500 /* Die with an error message if S1 and S2 describe strings that
1501    are not valid with the given command line switches.
1502    A side effect of this function is that if a valid [c*] or
1503    [c*0] construct appears in string2, it is converted to [c*n]
1504    with a value for n that makes s2->length == s1->length.  By
1505    the same token, if the --truncate-set1 option is not
1506    given, S2 may be extended.  */
1507
1508 static void
1509 validate (struct Spec_list *s1, struct Spec_list *s2)
1510 {
1511   get_s1_spec_stats (s1);
1512   if (s1->n_indefinite_repeats > 0)
1513     {
1514       error (EXIT_FAILURE, 0,
1515              _("the [c*] repeat construct may not appear in string1"));
1516     }
1517
1518   if (s2)
1519     {
1520       get_s2_spec_stats (s2, s1->length);
1521
1522       if (s2->n_indefinite_repeats > 1)
1523         {
1524           error (EXIT_FAILURE, 0,
1525                  _("only one [c*] repeat construct may appear in string2"));
1526         }
1527
1528       if (translating)
1529         {
1530           if (s2->has_equiv_class)
1531             {
1532               error (EXIT_FAILURE, 0,
1533                      _("[=c=] expressions may not appear in string2 \
1534 when translating"));
1535             }
1536
1537           if (s1->length > s2->length)
1538             {
1539               if (!truncate_set1)
1540                 {
1541                   /* string2 must be non-empty unless --truncate-set1 is
1542                      given or string1 is empty.  */
1543
1544                   if (s2->length == 0)
1545                     error (EXIT_FAILURE, 0,
1546                      _("when not truncating set1, string2 must be non-empty"));
1547                   string2_extend (s1, s2);
1548                 }
1549             }
1550
1551           if (complement && s1->has_char_class
1552               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1553             {
1554               error (EXIT_FAILURE, 0,
1555                      _("when translating with complemented character classes,\
1556 \nstring2 must map all characters in the domain to one"));
1557             }
1558
1559           if (s2->has_restricted_char_class)
1560             {
1561               error (EXIT_FAILURE, 0,
1562                      _("when translating, the only character classes that may \
1563 appear in\nstring2 are `upper' and `lower'"));
1564             }
1565         }
1566       else
1567         /* Not translating.  */
1568         {
1569           if (s2->n_indefinite_repeats > 0)
1570             error (EXIT_FAILURE, 0,
1571                    _("the [c*] construct may appear in string2 only \
1572 when translating"));
1573         }
1574     }
1575 }
1576
1577 /* Read buffers of SIZE bytes via the function READER (if READER is
1578    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1579    read_and_delete or read_and_xlate.  After each buffer is read, it is
1580    processed and written to stdout.  The buffers are processed so that
1581    multiple consecutive occurrences of the same character in the input
1582    stream are replaced by a single occurrence of that character if the
1583    character is in the squeeze set.  */
1584
1585 static void
1586 squeeze_filter (unsigned char *buf, size_t size, Filter reader)
1587 {
1588   unsigned int char_to_squeeze = NOT_A_CHAR;
1589   size_t i = 0;
1590   size_t nr = 0;
1591
1592   for (;;)
1593     {
1594       size_t begin;
1595
1596       if (i >= nr)
1597         {
1598           if (reader == NULL)
1599             {
1600               nr = safe_read (0, (char *) buf, size);
1601               if (nr == SAFE_READ_ERROR)
1602                 error (EXIT_FAILURE, errno, _("read error"));
1603             }
1604           else
1605             {
1606               nr = (*reader) (buf, size, NULL);
1607             }
1608
1609           if (nr == 0)
1610             break;
1611           i = 0;
1612         }
1613
1614       begin = i;
1615
1616       if (char_to_squeeze == NOT_A_CHAR)
1617         {
1618           size_t out_len;
1619           /* Here, by being a little tricky, we can get a significant
1620              performance increase in most cases when the input is
1621              reasonably large.  Since tr will modify the input only
1622              if two consecutive (and identical) input characters are
1623              in the squeeze set, we can step by two through the data
1624              when searching for a character in the squeeze set.  This
1625              means there may be a little more work in a few cases and
1626              perhaps twice as much work in the worst cases where most
1627              of the input is removed by squeezing repeats.  But most
1628              uses of this functionality seem to remove less than 20-30%
1629              of the input.  */
1630           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1631             ;                   /* empty */
1632
1633           /* There is a special case when i == nr and we've just
1634              skipped a character (the last one in buf) that is in
1635              the squeeze set.  */
1636           if (i == nr && in_squeeze_set[buf[i - 1]])
1637             --i;
1638
1639           if (i >= nr)
1640             out_len = nr - begin;
1641           else
1642             {
1643               char_to_squeeze = buf[i];
1644               /* We're about to output buf[begin..i].  */
1645               out_len = i - begin + 1;
1646
1647               /* But since we stepped by 2 in the loop above,
1648                  out_len may be one too large.  */
1649               if (i > 0 && buf[i - 1] == char_to_squeeze)
1650                 --out_len;
1651
1652               /* Advance i to the index of first character to be
1653                  considered when looking for a char different from
1654                  char_to_squeeze.  */
1655               ++i;
1656             }
1657           if (out_len > 0
1658               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1659             error (EXIT_FAILURE, errno, _("write error"));
1660         }
1661
1662       if (char_to_squeeze != NOT_A_CHAR)
1663         {
1664           /* Advance i to index of first char != char_to_squeeze
1665              (or to nr if all the rest of the characters in this
1666              buffer are the same as char_to_squeeze).  */
1667           for (; i < nr && buf[i] == char_to_squeeze; i++)
1668             ;                   /* empty */
1669           if (i < nr)
1670             char_to_squeeze = NOT_A_CHAR;
1671           /* If (i >= nr) we've squeezed the last character in this buffer.
1672              So now we have to read a new buffer and continue comparing
1673              characters against char_to_squeeze.  */
1674         }
1675     }
1676 }
1677
1678 /* Read buffers of SIZE bytes from stdin until one is found that
1679    contains at least one character not in the delete set.  Store
1680    in the array BUF, all characters from that buffer that are not
1681    in the delete set, and return the number of characters saved
1682    or 0 upon EOF.  */
1683
1684 static size_t
1685 read_and_delete (unsigned char *buf, size_t size, Filter not_used)
1686 {
1687   size_t n_saved;
1688   static int hit_eof = 0;
1689
1690   assert (not_used == NULL);
1691
1692   if (hit_eof)
1693     return 0;
1694
1695   /* This enclosing do-while loop is to make sure that
1696      we don't return zero (indicating EOF) when we've
1697      just deleted all the characters in a buffer.  */
1698   do
1699     {
1700       size_t i;
1701       size_t nr = safe_read (0, (char *) buf, size);
1702
1703       if (nr == SAFE_READ_ERROR)
1704         error (EXIT_FAILURE, errno, _("read error"));
1705       if (nr == 0)
1706         {
1707           hit_eof = 1;
1708           return 0;
1709         }
1710
1711       /* This first loop may be a waste of code, but gives much
1712          better performance when no characters are deleted in
1713          the beginning of a buffer.  It just avoids the copying
1714          of buf[i] into buf[n_saved] when it would be a NOP.  */
1715
1716       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1717         /* empty */ ;
1718       n_saved = i;
1719
1720       for (++i; i < nr; i++)
1721         if (!in_delete_set[buf[i]])
1722           buf[n_saved++] = buf[i];
1723     }
1724   while (n_saved == 0);
1725
1726   return n_saved;
1727 }
1728
1729 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1730    perform the in-place and one-to-one mapping specified by the global
1731    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1732
1733 static size_t
1734 read_and_xlate (unsigned char *buf, size_t size, Filter not_used)
1735 {
1736   size_t bytes_read = 0;
1737   static int hit_eof = 0;
1738   size_t i;
1739
1740   assert (not_used == NULL);
1741
1742   if (hit_eof)
1743     return 0;
1744
1745   bytes_read = safe_read (0, (char *) buf, size);
1746   if (bytes_read == SAFE_READ_ERROR)
1747     error (EXIT_FAILURE, errno, _("read error"));
1748   if (bytes_read == 0)
1749     {
1750       hit_eof = 1;
1751       return 0;
1752     }
1753
1754   for (i = 0; i < bytes_read; i++)
1755     buf[i] = xlate[buf[i]];
1756
1757   return bytes_read;
1758 }
1759
1760 /* Initialize a boolean membership set IN_SET with the character
1761    values obtained by traversing the linked list of constructs S
1762    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1763    nonzero the resulting set is complemented.  */
1764
1765 static void
1766 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1767 {
1768   int c;
1769   size_t i;
1770
1771   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1772   s->state = BEGIN_STATE;
1773   while ((c = get_next (s, NULL)) != -1)
1774     in_set[c] = 1;
1775   if (complement_this_set)
1776     for (i = 0; i < N_CHARS; i++)
1777       in_set[i] = (!in_set[i]);
1778 }
1779
1780 int
1781 main (int argc, char **argv)
1782 {
1783   int c;
1784   int non_option_args;
1785   struct Spec_list buf1, buf2;
1786   struct Spec_list *s1 = &buf1;
1787   struct Spec_list *s2 = &buf2;
1788
1789   initialize_main (&argc, &argv);
1790   program_name = argv[0];
1791   setlocale (LC_ALL, "");
1792   bindtextdomain (PACKAGE, LOCALEDIR);
1793   textdomain (PACKAGE);
1794
1795   atexit (close_stdout);
1796
1797   while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1798     {
1799       switch (c)
1800         {
1801         case 0:
1802           break;
1803
1804         case 'c':
1805           complement = 1;
1806           break;
1807
1808         case 'd':
1809           delete = 1;
1810           break;
1811
1812         case 's':
1813           squeeze_repeats = 1;
1814           break;
1815
1816         case 't':
1817           truncate_set1 = 1;
1818           break;
1819
1820         case_GETOPT_HELP_CHAR;
1821
1822         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, WRITTEN_BY);
1823
1824         default:
1825           usage (2);
1826           break;
1827         }
1828     }
1829
1830   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1831
1832   non_option_args = argc - optind;
1833   translating = (non_option_args == 2 && !delete);
1834
1835   /* Change this test if it is valid to give tr no options and
1836      no args at all.  POSIX doesn't specifically say anything
1837      either way, but it looks like they implied it's invalid
1838      by omission.  If you want to make tr do a slow imitation
1839      of `cat' use `tr a a'.  */
1840   if (non_option_args > 2)
1841     {
1842       error (0, 0, _("too many arguments"));
1843       usage (2);
1844     }
1845
1846   if (!delete && !squeeze_repeats && non_option_args != 2)
1847     error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1848
1849   if (delete && squeeze_repeats && non_option_args != 2)
1850     error (EXIT_FAILURE, 0, _("two strings must be given when both \
1851 deleting and squeezing repeats"));
1852
1853   /* If --delete is given without --squeeze-repeats, then
1854      only one string argument may be specified.  But POSIX
1855      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1856      is set, pretend we never saw string2.  But I think
1857      this deserves a fatal error, so that's the default.  */
1858   if ((delete && !squeeze_repeats) && non_option_args != 1)
1859     {
1860       if (posix_pedantic && non_option_args == 2)
1861         --non_option_args;
1862       else
1863         error (EXIT_FAILURE, 0,
1864                _("only one string may be given when deleting \
1865 without squeezing repeats"));
1866     }
1867
1868   if (squeeze_repeats && non_option_args == 0)
1869     error (EXIT_FAILURE, 0,
1870            _("at least one string must be given when squeezing repeats"));
1871
1872   spec_init (s1);
1873   if (parse_str ((unsigned char *) argv[optind], s1))
1874     exit (EXIT_FAILURE);
1875
1876   if (non_option_args == 2)
1877     {
1878       spec_init (s2);
1879       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1880         exit (EXIT_FAILURE);
1881     }
1882   else
1883     s2 = NULL;
1884
1885   validate (s1, s2);
1886
1887   /* Use binary I/O, since `tr' is sometimes used to transliterate
1888      non-printable characters, or characters which are stripped away
1889      by text-mode reads (like CR and ^Z).  */
1890   SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);
1891
1892   if (squeeze_repeats && non_option_args == 1)
1893     {
1894       set_initialize (s1, complement, in_squeeze_set);
1895       squeeze_filter (io_buf, sizeof io_buf, NULL);
1896     }
1897   else if (delete && non_option_args == 1)
1898     {
1899       size_t nr;
1900
1901       set_initialize (s1, complement, in_delete_set);
1902       do
1903         {
1904           nr = read_and_delete (io_buf, sizeof io_buf, NULL);
1905           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1906             error (EXIT_FAILURE, errno, _("write error"));
1907         }
1908       while (nr > 0);
1909     }
1910   else if (squeeze_repeats && delete && non_option_args == 2)
1911     {
1912       set_initialize (s1, complement, in_delete_set);
1913       set_initialize (s2, 0, in_squeeze_set);
1914       squeeze_filter (io_buf, sizeof io_buf, read_and_delete);
1915     }
1916   else if (translating)
1917     {
1918       if (complement)
1919         {
1920           int i;
1921           SET_TYPE *in_s1 = in_delete_set;
1922
1923           set_initialize (s1, 0, in_s1);
1924           s2->state = BEGIN_STATE;
1925           for (i = 0; i < N_CHARS; i++)
1926             xlate[i] = i;
1927           for (i = 0; i < N_CHARS; i++)
1928             {
1929               if (!in_s1[i])
1930                 {
1931                   int ch = get_next (s2, NULL);
1932                   assert (ch != -1 || truncate_set1);
1933                   if (ch == -1)
1934                     {
1935                       /* This will happen when tr is invoked like e.g.
1936                          tr -cs A-Za-z0-9 '\012'.  */
1937                       break;
1938                     }
1939                   xlate[i] = ch;
1940                 }
1941             }
1942           assert (get_next (s2, NULL) == -1 || truncate_set1);
1943         }
1944       else
1945         {
1946           int c1, c2;
1947           int i;
1948           enum Upper_Lower_class class_s1;
1949           enum Upper_Lower_class class_s2;
1950
1951           for (i = 0; i < N_CHARS; i++)
1952             xlate[i] = i;
1953           s1->state = BEGIN_STATE;
1954           s2->state = BEGIN_STATE;
1955           for (;;)
1956             {
1957               c1 = get_next (s1, &class_s1);
1958               c2 = get_next (s2, &class_s2);
1959               if (!class_ok[(int) class_s1][(int) class_s2])
1960                 error (EXIT_FAILURE, 0,
1961                        _("misaligned [:upper:] and/or [:lower:] construct"));
1962
1963               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1964                 {
1965                   for (i = 0; i < N_CHARS; i++)
1966                     if (ISLOWER (i))
1967                       xlate[i] = toupper (i);
1968                 }
1969               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1970                 {
1971                   for (i = 0; i < N_CHARS; i++)
1972                     if (ISUPPER (i))
1973                       xlate[i] = tolower (i);
1974                 }
1975               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1976                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1977                 {
1978                   /* By default, GNU tr permits the identity mappings: from
1979                      [:upper:] to [:upper:] and [:lower:] to [:lower:].  But
1980                      when POSIXLY_CORRECT is set, those evoke diagnostics.  */
1981                   if (posix_pedantic)
1982                     {
1983                       error (EXIT_FAILURE, 0,
1984                              _("\
1985 invalid identity mapping;  when translating, any [:lower:] or [:upper:]\n\
1986 construct in string1 must be aligned with a corresponding construct\n\
1987 ([:upper:] or [:lower:], respectively) in string2"));
1988                     }
1989                 }
1990               else
1991                 {
1992                   /* The following should have been checked by validate...  */
1993                   if (c1 == -1 || c2 == -1)
1994                     break;
1995                   xlate[c1] = c2;
1996                 }
1997             }
1998           assert (c1 == -1 || truncate_set1);
1999         }
2000       if (squeeze_repeats)
2001         {
2002           set_initialize (s2, 0, in_squeeze_set);
2003           squeeze_filter (io_buf, sizeof io_buf, read_and_xlate);
2004         }
2005       else
2006         {
2007           size_t bytes_read;
2008
2009           do
2010             {
2011               bytes_read = read_and_xlate (io_buf, sizeof io_buf, NULL);
2012               if (bytes_read > 0
2013                   && fwrite ((char *) io_buf, 1, bytes_read, stdout) == 0)
2014                 error (EXIT_FAILURE, errno, _("write error"));
2015             }
2016           while (bytes_read > 0);
2017         }
2018     }
2019
2020   if (close (STDIN_FILENO) != 0)
2021     error (EXIT_FAILURE, errno, _("standard input"));
2022
2023   exit (EXIT_SUCCESS);
2024 }