src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 1995-1998, 1999, 2000, 2001 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <assert.h>
  24 #include <errno.h>
  25 #include <sys/types.h>
  26 #include <getopt.h>
  27
  28 #include "system.h"
  29 #include "closeout.h"
  30 #include "error.h"
  31 #include "safe-read.h"
  32 #include "xstrtol.h"
  33
  34 /* The official name of this program (e.g., no `g' prefix).  */
  35 #define PROGRAM_NAME "tr"
  36
  37 #define AUTHORS "Jim Meyering"
  38
  39 #define N_CHARS (UCHAR_MAX + 1)
  40
  41 /* A pointer to a filtering function.  */
  42 typedef size_t (*Filter) (/* unsigned char *, size_t, Filter */);
  43
  44 /* Convert from character C to its index in the collating
  45    sequence array.  Just cast to an unsigned int to avoid
  46    problems with sign-extension.  */
  47 #define ORD(c) (unsigned int)(c)
  48
  49 /* The inverse of ORD.  */
  50 #define CHR(i) (unsigned char)(i)
  51
  52 /* The value for Spec_list->state that indicates to
  53    get_next that it should initialize the tail pointer.
  54    Its value should be as large as possible to avoid conflict
  55    a valid value for the state field -- and that may be as
  56    large as any valid repeat_count.  */
  57 #define BEGIN_STATE (INT_MAX - 1)
  58
  59 /* The value for Spec_list->state that indicates to
  60    get_next that the element pointed to by Spec_list->tail is
  61    being considered for the first time on this pass through the
  62    list -- it indicates that get_next should make any necessary
  63    initializations.  */
  64 #define NEW_ELEMENT (BEGIN_STATE + 1)
  65
  66 /* A value distinct from any character that may have been stored in a
  67    buffer as the result of a block-read in the function squeeze_filter.  */
  68 #define NOT_A_CHAR (unsigned int)(-1)
  69
  70 /* The following (but not CC_NO_CLASS) are indices into the array of
  71    valid character class strings.  */
  72 enum Char_class
  73   {
  74     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  75     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  76     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  77     CC_NO_CLASS = 9999
  78   };
  79
  80 /* Character class to which a character (returned by get_next) belonged;
  81    but it is set only if the construct from which the character was obtained
  82    was one of the character classes [:upper:] or [:lower:].  The value
  83    is used only when translating and then, only to make sure that upper
  84    and lower class constructs have the same relative positions in string1
  85    and string2.  */
  86 enum Upper_Lower_class
  87   {
  88     UL_LOWER = 0,
  89     UL_UPPER = 1,
  90     UL_NONE = 2
  91   };
  92
  93 /* A shortcut to ensure that when constructing the translation array,
  94    one of the values returned by paired calls to get_next (from s1 and s2)
  95    is from [:upper:] and the other is from [:lower:], or neither is from
  96    upper or lower.  By default, GNU tr permits the identity mappings: from
  97    [:upper:] to [:upper:] and [:lower:] to [:lower:].  But when
  98    POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
  99    by values of type enum Upper_Lower_class.  */
 100 static int const class_ok[3][3] =
 101 {
 102   {1, 1, 0},
 103   {1, 1, 0},
 104   {0, 0, 1}
 105 };
 106
 107 /* The type of a List_element.  See build_spec_list for more details.  */
 108 enum Range_element_type
 109   {
 110     RE_NO_TYPE = 0,
 111     RE_NORMAL_CHAR,
 112     RE_RANGE,
 113     RE_CHAR_CLASS,
 114     RE_EQUIV_CLASS,
 115     RE_REPEATED_CHAR
 116   };
 117
 118 /* One construct in one of tr's argument strings.
 119    For example, consider the POSIX version of the classic tr command:
 120        tr -cs 'a-zA-Z_' '[\n*]'
 121    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 122    and a single normal character, `_'.  String2 has one construct.  */
 123 struct List_element
 124   {
 125     enum Range_element_type type;
 126     struct List_element *next;
 127     union
 128       {
 129         int normal_char;
 130         struct                  /* unnamed */
 131           {
 132             unsigned int first_char;
 133             unsigned int last_char;
 134           }
 135         range;
 136         enum Char_class char_class;
 137         int equiv_code;
 138         struct                  /* unnamed */
 139           {
 140             unsigned int the_repeated_char;
 141             size_t repeat_count;
 142           }
 143         repeated_char;
 144       }
 145     u;
 146   };
 147
 148 /* Each of tr's argument strings is parsed into a form that is easier
 149    to work with: a linked list of constructs (struct List_element).
 150    Each Spec_list structure also encapsulates various attributes of
 151    the corresponding argument string.  The attributes are used mainly
 152    to verify that the strings are valid in the context of any options
 153    specified (like -s, -d, or -c).  The main exception is the member
 154    `tail', which is first used to construct the list.  After construction,
 155    it is used by get_next to save its state when traversing the list.
 156    The member `state' serves a similar function.  */
 157 struct Spec_list
 158   {
 159     /* Points to the head of the list of range elements.
 160        The first struct is a dummy; its members are never used.  */
 161     struct List_element *head;
 162
 163     /* When appending, points to the last element.  When traversing via
 164        get_next(), points to the element to process next.  Setting
 165        Spec_list.state to the value BEGIN_STATE before calling get_next
 166        signals get_next to initialize tail to point to head->next.  */
 167     struct List_element *tail;
 168
 169     /* Used to save state between calls to get_next().  */
 170     unsigned int state;
 171
 172     /* Length, in the sense that length ('a-z[:digit:]123abc')
 173        is 42 ( = 26 + 10 + 6).  */
 174     size_t length;
 175
 176     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 177     int n_indefinite_repeats;
 178
 179     /* If n_indefinite_repeats is nonzero, this points to the List_element
 180        corresponding to the last [c*] or [c*0] construct encountered in
 181        this spec.  Otherwise it is undefined.  */
 182     struct List_element *indefinite_repeat_element;
 183
 184     /* Non-zero if this spec contains at least one equivalence
 185        class construct e.g. [=c=].  */
 186     int has_equiv_class;
 187
 188     /* Non-zero if this spec contains at least one character class
 189        construct.  E.g. [:digit:].  */
 190     int has_char_class;
 191
 192     /* Non-zero if this spec contains at least one of the character class
 193        constructs (all but upper and lower) that aren't allowed in s2.  */
 194     int has_restricted_char_class;
 195   };
 196
 197 /* A representation for escaped string1 or string2.  As a string is parsed,
 198    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 199    etc.) are marked as such in this structure by setting the corresponding
 200    entry in the ESCAPED vector.  */
 201 struct E_string
 202 {
 203   unsigned char *s;
 204   int *escaped;
 205   size_t len;
 206 };
 207
 208 /* Return nonzero if the Ith character of escaped string ES matches C
 209    and is not escaped itself.  */
 210 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 211
 212 /* The name by which this program was run.  */
 213 char *program_name;
 214
 215 /* When nonzero, each sequence in the input of a repeated character
 216    (call it c) is replaced (in the output) by a single occurrence of c
 217    for every c in the squeeze set.  */
 218 static int squeeze_repeats = 0;
 219
 220 /* When nonzero, removes characters in the delete set from input.  */
 221 static int delete = 0;
 222
 223 /* Use the complement of set1 in place of set1.  */
 224 static int complement = 0;
 225
 226 /* When nonzero, this flag causes GNU tr to provide strict
 227    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 228    says that when -d is used without -s, string2 (if present)
 229    must be ignored.  Silently ignoring arguments is a bad idea.
 230    The default GNU behavior is to give a usage message and exit.
 231    Additionally, when this flag is nonzero, tr prints warnings
 232    on stderr if it is being used in a manner that is not portable.
 233    Applicable warnings are given by default, but are suppressed
 234    if the environment variable `POSIXLY_CORRECT' is set, since
 235    being POSIX conformant means we can't issue such messages.
 236    Warnings on the following topics are suppressed when this
 237    variable is nonzero:
 238    1. Ambiguous octal escapes.  */
 239 static int posix_pedantic;
 240
 241 /* When tr is performing translation and string1 is longer than string2,
 242    POSIX says that the result is undefined.  That gives the implementor
 243    of a POSIX conforming version of tr two reasonable choices for the
 244    semantics of this case.
 245
 246    * The BSD tr pads string2 to the length of string1 by
 247    repeating the last character in string2.
 248
 249    * System V tr ignores characters in string1 that have no
 250    corresponding character in string2.  That is, string1 is effectively
 251    truncated to the length of string2.
 252
 253    When nonzero, this flag causes GNU tr to imitate the behavior
 254    of System V tr when translating with string1 longer than string2.
 255    The default is to emulate BSD tr.  This flag is ignored in modes where
 256    no translation is performed.  Emulating the System V tr
 257    in this exceptional case causes the relatively common BSD idiom:
 258
 259        tr -cs A-Za-z0-9 '\012'
 260
 261    to break (it would convert only zero bytes, rather than all
 262    non-alphanumerics, to newlines).
 263
 264    WARNING: This switch does not provide general BSD or System V
 265    compatibility.  For example, it doesn't disable the interpretation
 266    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 267    some unfortunate coincidence you use such constructs in scripts
 268    expecting to use some other version of tr, the scripts will break.  */
 269 static int truncate_set1 = 0;
 270
 271 /* An alias for (!delete && non_option_args == 2).
 272    It is set in main and used there and in validate().  */
 273 static int translating;
 274
 275 #ifndef BUFSIZ
 276 # define BUFSIZ 8192
 277 #endif
 278
 279 #define IO_BUF_SIZE BUFSIZ
 280 static unsigned char io_buf[IO_BUF_SIZE];
 281
 282 static char const *const char_class_name[] =
 283 {
 284   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 285   "lower", "print", "punct", "space", "upper", "xdigit"
 286 };
 287 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 288
 289 typedef char SET_TYPE;
 290
 291 /* Array of boolean values.  A character `c' is a member of the
 292    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 293    set is defined by the last (possibly, the only) string argument
 294    on the command line when the squeeze option is given.  */
 295 static SET_TYPE in_squeeze_set[N_CHARS];
 296
 297 /* Array of boolean values.  A character `c' is a member of the
 298    delete set if and only if in_delete_set[c] is true.  The delete
 299    set is defined by the first (or only) string argument on the
 300    command line when the delete option is given.  */
 301 static SET_TYPE in_delete_set[N_CHARS];
 302
 303 /* Array of character values defining the translation (if any) that
 304    tr is to perform.  Translation is performed only when there are
 305    two specification strings and the delete switch is not given.  */
 306 static char xlate[N_CHARS];
 307
 308 static struct option const long_options[] =
 309 {
 310   {"complement", no_argument, NULL, 'c'},
 311   {"delete", no_argument, NULL, 'd'},
 312   {"squeeze-repeats", no_argument, NULL, 's'},
 313   {"truncate-set1", no_argument, NULL, 't'},
 314   {GETOPT_HELP_OPTION_DECL},
 315   {GETOPT_VERSION_OPTION_DECL},
 316   {NULL, 0, NULL, 0}
 317 };
 318 \f
 319 void
 320 usage (int status)
 321 {
 322   if (status != 0)
 323     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 324              program_name);
 325   else
 326     {
 327       printf (_("\
 328 Usage: %s [OPTION]... SET1 [SET2]\n\
 329 "),
 330               program_name);
 331       fputs (_("\
 332 Translate, squeeze, and/or delete characters from standard input,\n\
 333 writing to standard output.\n\
 334 \n\
 335   -c, --complement        first complement SET1\n\
 336   -d, --delete            delete characters in SET1, do not translate\n\
 337   -s, --squeeze-repeats   replace sequence of characters with one\n\
 338   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 339       --help              display this help and exit\n\
 340       --version           output version information and exit\n\
 341 "), stdout);
 342       fputs (_("\
 343 \n\
 344 SETs are specified as strings of characters.  Most represent themselves.\n\
 345 Interpreted sequences are:\n\
 346 \n\
 347   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 348   \\\\              backslash\n\
 349   \\a              audible BEL\n\
 350   \\b              backspace\n\
 351   \\f              form feed\n\
 352   \\n              new line\n\
 353   \\r              return\n\
 354   \\t              horizontal tab\n\
 355 "), stdout);
 356      fputs (_("\
 357   \\v              vertical tab\n\
 358   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 359   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 360   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 361   [:alnum:]       all letters and digits\n\
 362   [:alpha:]       all letters\n\
 363   [:blank:]       all horizontal whitespace\n\
 364   [:cntrl:]       all control characters\n\
 365   [:digit:]       all digits\n\
 366 "), stdout);
 367      fputs (_("\
 368   [:graph:]       all printable characters, not including space\n\
 369   [:lower:]       all lower case letters\n\
 370   [:print:]       all printable characters, including space\n\
 371   [:punct:]       all punctuation characters\n\
 372   [:space:]       all horizontal or vertical whitespace\n\
 373   [:upper:]       all upper case letters\n\
 374   [:xdigit:]      all hexadecimal digits\n\
 375   [=CHAR=]        all characters which are equivalent to CHAR\n\
 376 "), stdout);
 377      fputs (_("\
 378 \n\
 379 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 380 -t may be used only when translating.  SET2 is extended to length of\n\
 381 SET1 by repeating its last character as necessary.  Excess characters\n\
 382 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 383 "), stdout);
 384      fputs (_("\
 385 expand in ascending order; used in SET2 while translating, they may\n\
 386 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 387 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 388 translation or deletion.\n\
 389 "), stdout);
 390       puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
 391     }
 392   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 393 }
 394
 395 /* Return nonzero if the character C is a member of the
 396    equivalence class containing the character EQUIV_CLASS.  */
 397
 398 static int
 399 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
 400 {
 401   return (equiv_class == c);
 402 }
 403
 404 /* Return nonzero if the character C is a member of the
 405    character class CHAR_CLASS.  */
 406
 407 static int
 408 is_char_class_member (enum Char_class char_class, unsigned int c)
 409 {
 410   int result;
 411
 412   switch (char_class)
 413     {
 414     case CC_ALNUM:
 415       result = ISALNUM (c);
 416       break;
 417     case CC_ALPHA:
 418       result = ISALPHA (c);
 419       break;
 420     case CC_BLANK:
 421       result = ISBLANK (c);
 422       break;
 423     case CC_CNTRL:
 424       result = ISCNTRL (c);
 425       break;
 426     case CC_DIGIT:
 427       result = ISDIGIT_LOCALE (c);
 428       break;
 429     case CC_GRAPH:
 430       result = ISGRAPH (c);
 431       break;
 432     case CC_LOWER:
 433       result = ISLOWER (c);
 434       break;
 435     case CC_PRINT:
 436       result = ISPRINT (c);
 437       break;
 438     case CC_PUNCT:
 439       result = ISPUNCT (c);
 440       break;
 441     case CC_SPACE:
 442       result = ISSPACE (c);
 443       break;
 444     case CC_UPPER:
 445       result = ISUPPER (c);
 446       break;
 447     case CC_XDIGIT:
 448       result = ISXDIGIT (c);
 449       break;
 450     default:
 451       abort ();
 452       break;
 453     }
 454   return result;
 455 }
 456
 457 static void
 458 es_free (struct E_string *es)
 459 {
 460   free (es->s);
 461   free (es->escaped);
 462 }
 463
 464 /* Perform the first pass over each range-spec argument S, converting all
 465    \c and \ddd escapes to their one-byte representations.  The conversion
 466    is done in-place, so S must point to writable storage.  If an invalid
 467    quote sequence is found print an error message and return nonzero.
 468    Otherwise set *LEN to the length of the resulting string and return
 469    zero.  The resulting array of characters may contain zero-bytes;
 470    however, on input, S is assumed to be null-terminated, and hence
 471    cannot contain actual (non-escaped) zero bytes.  */
 472
 473 static int
 474 unquote (const unsigned char *s, struct E_string *es)
 475 {
 476   size_t i, j;
 477   size_t len;
 478
 479   len = strlen ((char *) s);
 480
 481   es->s = (unsigned char *) xmalloc (len);
 482   es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
 483   for (i = 0; i < len; i++)
 484     es->escaped[i] = 0;
 485
 486   j = 0;
 487   for (i = 0; s[i]; i++)
 488     {
 489       switch (s[i])
 490         {
 491           int c;
 492         case '\\':
 493           switch (s[i + 1])
 494             {
 495               int oct_digit;
 496             case '\\':
 497               c = '\\';
 498               break;
 499             case 'a':
 500               c = '\007';
 501               break;
 502             case 'b':
 503               c = '\b';
 504               break;
 505             case 'f':
 506               c = '\f';
 507               break;
 508             case 'n':
 509               c = '\n';
 510               break;
 511             case 'r':
 512               c = '\r';
 513               break;
 514             case 't':
 515               c = '\t';
 516               break;
 517             case 'v':
 518               c = '\v';
 519               break;
 520             case '0':
 521             case '1':
 522             case '2':
 523             case '3':
 524             case '4':
 525             case '5':
 526             case '6':
 527             case '7':
 528               c = s[i + 1] - '0';
 529               oct_digit = s[i + 2] - '0';
 530               if (0 <= oct_digit && oct_digit <= 7)
 531                 {
 532                   c = 8 * c + oct_digit;
 533                   ++i;
 534                   oct_digit = s[i + 2] - '0';
 535                   if (0 <= oct_digit && oct_digit <= 7)
 536                     {
 537                       if (8 * c + oct_digit < N_CHARS)
 538                         {
 539                           c = 8 * c + oct_digit;
 540                           ++i;
 541                         }
 542                       else if (!posix_pedantic)
 543                         {
 544                           /* Any octal number larger than 0377 won't
 545                              fit in 8 bits.  So we stop when adding the
 546                              next digit would put us over the limit and
 547                              give a warning about the ambiguity.  POSIX
 548                              isn't clear on this, but one person has said
 549                              that in his interpretation, POSIX says tr
 550                              can't even give a warning.  */
 551                           error (0, 0, _("warning: the ambiguous octal escape \
 552 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 553                                  s[i], s[i + 1], s[i + 2],
 554                                  s[i], s[i + 1], s[i + 2]);
 555                         }
 556                     }
 557                 }
 558               break;
 559             case '\0':
 560               error (0, 0, _("invalid backslash escape at end of string"));
 561               return 1;
 562
 563             default:
 564               if (posix_pedantic)
 565                 {
 566                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 567                   return 1;
 568                 }
 569               else
 570                 {
 571                   c = s[i + 1];
 572                   es->escaped[j] = 1;
 573                 }
 574             }
 575           ++i;
 576           es->s[j++] = c;
 577           break;
 578         default:
 579           es->s[j++] = s[i];
 580           break;
 581         }
 582     }
 583   es->len = j;
 584   return 0;
 585 }
 586
 587 /* If CLASS_STR is a valid character class string, return its index
 588    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 589
 590 static enum Char_class
 591 look_up_char_class (const unsigned char *class_str, size_t len)
 592 {
 593   unsigned int i;
 594
 595   for (i = 0; i < N_CHAR_CLASSES; i++)
 596     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 597         && strlen (char_class_name[i]) == len)
 598       return (enum Char_class) i;
 599   return CC_NO_CLASS;
 600 }
 601
 602 /* Return a newly allocated string with a printable version of C.
 603    This function is used solely for formatting error messages.  */
 604
 605 static char *
 606 make_printable_char (unsigned int c)
 607 {
 608   char *buf = xmalloc (5);
 609
 610   assert (c < N_CHARS);
 611   if (ISPRINT (c))
 612     {
 613       buf[0] = c;
 614       buf[1] = '\0';
 615     }
 616   else
 617     {
 618       sprintf (buf, "\\%03o", c);
 619     }
 620   return buf;
 621 }
 622
 623 /* Return a newly allocated copy of S which is suitable for printing.
 624    LEN is the number of characters in S.  Most non-printing
 625    (isprint) characters are represented by a backslash followed by
 626    3 octal digits.  However, the characters represented by \c escapes
 627    where c is one of [abfnrtv] are represented by their 2-character \c
 628    sequences.  This function is used solely for printing error messages.  */
 629
 630 static char *
 631 make_printable_str (const unsigned char *s, size_t len)
 632 {
 633   /* Worst case is that every character expands to a backslash
 634      followed by a 3-character octal escape sequence.  */
 635   char *printable_buf = xmalloc (4 * len + 1);
 636   char *p = printable_buf;
 637   size_t i;
 638
 639   for (i = 0; i < len; i++)
 640     {
 641       char buf[5];
 642       char *tmp = NULL;
 643
 644       switch (s[i])
 645         {
 646         case '\\':
 647           tmp = "\\";
 648           break;
 649         case '\007':
 650           tmp = "\\a";
 651           break;
 652         case '\b':
 653           tmp = "\\b";
 654           break;
 655         case '\f':
 656           tmp = "\\f";
 657           break;
 658         case '\n':
 659           tmp = "\\n";
 660           break;
 661         case '\r':
 662           tmp = "\\r";
 663           break;
 664         case '\t':
 665           tmp = "\\t";
 666           break;
 667         case '\v':
 668           tmp = "\\v";
 669           break;
 670         default:
 671           if (ISPRINT (s[i]))
 672             {
 673               buf[0] = s[i];
 674               buf[1] = '\0';
 675             }
 676           else
 677             sprintf (buf, "\\%03o", s[i]);
 678           tmp = buf;
 679           break;
 680         }
 681       p = stpcpy (p, tmp);
 682     }
 683   return printable_buf;
 684 }
 685
 686 /* Append a newly allocated structure representing a
 687    character C to the specification list LIST.  */
 688
 689 static void
 690 append_normal_char (struct Spec_list *list, unsigned int c)
 691 {
 692   struct List_element *new;
 693
 694   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 695   new->next = NULL;
 696   new->type = RE_NORMAL_CHAR;
 697   new->u.normal_char = c;
 698   assert (list->tail);
 699   list->tail->next = new;
 700   list->tail = new;
 701 }
 702
 703 /* Append a newly allocated structure representing the range
 704    of characters from FIRST to LAST to the specification list LIST.
 705    Return nonzero if LAST precedes FIRST in the collating sequence,
 706    zero otherwise.  This means that '[c-c]' is acceptable.  */
 707
 708 static int
 709 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 710 {
 711   struct List_element *new;
 712
 713   if (ORD (first) > ORD (last))
 714     {
 715       char *tmp1 = make_printable_char (first);
 716       char *tmp2 = make_printable_char (last);
 717
 718       error (0, 0,
 719        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 720              tmp1, tmp2);
 721       free (tmp1);
 722       free (tmp2);
 723       return 1;
 724     }
 725   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 726   new->next = NULL;
 727   new->type = RE_RANGE;
 728   new->u.range.first_char = first;
 729   new->u.range.last_char = last;
 730   assert (list->tail);
 731   list->tail->next = new;
 732   list->tail = new;
 733   return 0;
 734 }
 735
 736 /* If CHAR_CLASS_STR is a valid character class string, append a
 737    newly allocated structure representing that character class to the end
 738    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 739    a valid string return nonzero.  */
 740
 741 static int
 742 append_char_class (struct Spec_list *list,
 743                    const unsigned char *char_class_str, size_t len)
 744 {
 745   enum Char_class char_class;
 746   struct List_element *new;
 747
 748   char_class = look_up_char_class (char_class_str, len);
 749   if (char_class == CC_NO_CLASS)
 750     return 1;
 751   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 752   new->next = NULL;
 753   new->type = RE_CHAR_CLASS;
 754   new->u.char_class = char_class;
 755   assert (list->tail);
 756   list->tail->next = new;
 757   list->tail = new;
 758   return 0;
 759 }
 760
 761 /* Append a newly allocated structure representing a [c*n]
 762    repeated character construct to the specification list LIST.
 763    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 764    is a non-negative repeat count.  */
 765
 766 static void
 767 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 768                       size_t repeat_count)
 769 {
 770   struct List_element *new;
 771
 772   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 773   new->next = NULL;
 774   new->type = RE_REPEATED_CHAR;
 775   new->u.repeated_char.the_repeated_char = the_char;
 776   new->u.repeated_char.repeat_count = repeat_count;
 777   assert (list->tail);
 778   list->tail->next = new;
 779   list->tail = new;
 780 }
 781
 782 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 783    the length of that string, LEN, if LEN is exactly one, append
 784    a newly allocated structure representing the specified
 785    equivalence class to the specification list, LIST and return zero.
 786    If LEN is not 1, return nonzero.  */
 787
 788 static int
 789 append_equiv_class (struct Spec_list *list,
 790                     const unsigned char *equiv_class_str, size_t len)
 791 {
 792   struct List_element *new;
 793
 794   if (len != 1)
 795     return 1;
 796   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 797   new->next = NULL;
 798   new->type = RE_EQUIV_CLASS;
 799   new->u.equiv_code = *equiv_class_str;
 800   assert (list->tail);
 801   list->tail->next = new;
 802   list->tail = new;
 803   return 0;
 804 }
 805
 806 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 807    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 808    NUL bytes, and is *not* NUL-terminated.  */
 809
 810 static unsigned char *
 811 substr (const unsigned char *p, size_t first_idx, size_t last_idx)
 812 {
 813   size_t len;
 814   unsigned char *tmp;
 815
 816   assert (first_idx <= last_idx);
 817   len = last_idx - first_idx + 1;
 818   tmp = (unsigned char *) xmalloc (len);
 819
 820   assert (first_idx <= last_idx);
 821   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 822   memcpy (tmp, p + first_idx, len);
 823   return tmp;
 824 }
 825
 826 /* Search forward starting at START_IDX for the 2-char sequence
 827    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 828    a sequence is found, set *RESULT_IDX to the index of the first
 829    character and return nonzero. Otherwise return zero.  P may contain
 830    zero bytes.  */
 831
 832 static int
 833 find_closing_delim (const struct E_string *es, size_t start_idx,
 834                     unsigned int pre_bracket_char, size_t *result_idx)
 835 {
 836   size_t i;
 837
 838   for (i = start_idx; i < es->len - 1; i++)
 839     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 840         && !es->escaped[i] && !es->escaped[i + 1])
 841       {
 842         *result_idx = i;
 843         return 1;
 844       }
 845   return 0;
 846 }
 847
 848 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 849    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 850    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 851    and return zero. If the second character following
 852    the opening bracket is not `*' or if no closing bracket can be
 853    found, return -1.  If a closing bracket is found and the
 854    second char is `*', but the string between the `*' and `]' isn't
 855    empty, an octal number, or a decimal number, print an error message
 856    and return -2.  */
 857
 858 static int
 859 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 860                        unsigned int *char_to_repeat, size_t *repeat_count,
 861                        size_t *closing_bracket_idx)
 862 {
 863   size_t i;
 864
 865   assert (start_idx + 1 < es->len);
 866   if (!ES_MATCH (es, start_idx + 1, '*'))
 867     return -1;
 868
 869   for (i = start_idx + 2; i < es->len; i++)
 870     {
 871       if (ES_MATCH (es, i, ']'))
 872         {
 873           size_t digit_str_len = i - start_idx - 2;
 874
 875           *char_to_repeat = es->s[start_idx];
 876           if (digit_str_len == 0)
 877             {
 878               /* We've matched [c*] -- no explicit repeat count.  */
 879               *repeat_count = 0;
 880               *closing_bracket_idx = i;
 881               return 0;
 882             }
 883
 884           /* Here, we have found [c*s] where s should be a string
 885              of octal (if it starts with `0') or decimal digits.  */
 886           {
 887             const char *digit_str = (const char *) &es->s[start_idx + 2];
 888             unsigned long int tmp_ulong;
 889             char *d_end;
 890             int base = 10;
 891             /* Select the base manually so we can be sure it's either 8 or 10.
 892                If the spec allowed it to be interpreted as hexadecimal, we
 893                could have used `0' and let xstrtoul decide.  */
 894             if (*digit_str == '0')
 895               {
 896                 base = 8;
 897                 ++digit_str;
 898                 --digit_str_len;
 899               }
 900             if (xstrtoul (digit_str, &d_end, base, &tmp_ulong, NULL)
 901                   != LONGINT_OK
 902                 || BEGIN_STATE < tmp_ulong
 903                 || d_end - digit_str != digit_str_len)
 904               {
 905                 char *tmp = make_printable_str (es->s + start_idx + 2,
 906                                                 i - start_idx - 2);
 907                 error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 908                        tmp);
 909                 free (tmp);
 910                 return -2;
 911               }
 912             *repeat_count = tmp_ulong;
 913           }
 914           *closing_bracket_idx = i;
 915           return 0;
 916         }
 917     }
 918   return -1;                    /* No bracket found.  */
 919 }
 920
 921 /* Return nonzero if the string at ES->s[IDX] matches the regular
 922    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 923    the `]' must not be escaped.  */
 924
 925 static int
 926 star_digits_closebracket (const struct E_string *es, size_t idx)
 927 {
 928   size_t i;
 929
 930   if (!ES_MATCH (es, idx, '*'))
 931     return 0;
 932
 933   for (i = idx + 1; i < es->len; i++)
 934     {
 935       if (!ISDIGIT (es->s[i]))
 936         {
 937           if (ES_MATCH (es, i, ']'))
 938             return 1;
 939           return 0;
 940         }
 941     }
 942   return 0;
 943 }
 944
 945 /* Convert string UNESCAPED_STRING (which has been preprocessed to
 946    convert backslash-escape sequences) of length LEN characters into
 947    a linked list of the following 5 types of constructs:
 948       - [:str:] Character class where `str' is one of the 12 valid strings.
 949       - [=c=] Equivalence class where `c' is any single character.
 950       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 951           However, if `n' is present, it must be a non-negative octal or
 952           decimal integer.
 953       - r-s Range of characters from `r' to `s'.  The second endpoint must
 954           not precede the first in the current collating sequence.
 955       - c Any other character is interpreted as itself.  */
 956
 957 static int
 958 build_spec_list (const struct E_string *es, struct Spec_list *result)
 959 {
 960   const unsigned char *p;
 961   size_t i;
 962
 963   p = es->s;
 964
 965   /* The main for-loop below recognizes the 4 multi-character constructs.
 966      A character that matches (in its context) none of the multi-character
 967      constructs is classified as `normal'.  Since all multi-character
 968      constructs have at least 3 characters, any strings of length 2 or
 969      less are composed solely of normal characters.  Hence, the index of
 970      the outer for-loop runs only as far as LEN-2.  */
 971
 972   for (i = 0; i + 2 < es->len; /* empty */)
 973     {
 974       if (ES_MATCH (es, i, '['))
 975         {
 976           int matched_multi_char_construct;
 977           size_t closing_bracket_idx;
 978           unsigned int char_to_repeat;
 979           size_t repeat_count;
 980           int err;
 981
 982           matched_multi_char_construct = 1;
 983           if (ES_MATCH (es, i + 1, ':')
 984               || ES_MATCH (es, i + 1, '='))
 985             {
 986               size_t closing_delim_idx;
 987               int found;
 988
 989               found = find_closing_delim (es, i + 2, p[i + 1],
 990                                           &closing_delim_idx);
 991               if (found)
 992                 {
 993                   int parse_failed;
 994                   unsigned char *opnd_str = substr (p, i + 2,
 995                                                     closing_delim_idx - 1);
 996                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
 997
 998                   if (p[i + 1] == ':')
 999                     {
1000                       parse_failed = append_char_class (result, opnd_str,
1001                                                         opnd_str_len);
1002
1003                       /* FIXME: big comment.  */
1004                       if (parse_failed)
1005                         {
1006                           if (star_digits_closebracket (es, i + 2))
1007                             {
1008                               free (opnd_str);
1009                               goto try_bracketed_repeat;
1010                             }
1011                           else
1012                             {
1013                               char *tmp = make_printable_str (opnd_str,
1014                                                               opnd_str_len);
1015                               error (0, 0, _("invalid character class `%s'"),
1016                                      tmp);
1017                               free (tmp);
1018                               return 1;
1019                             }
1020                         }
1021                     }
1022                   else
1023                     {
1024                       parse_failed = append_equiv_class (result, opnd_str,
1025                                                          opnd_str_len);
1026
1027                       /* FIXME: big comment.  */
1028                       if (parse_failed)
1029                         {
1030                           if (star_digits_closebracket (es, i + 2))
1031                             {
1032                               free (opnd_str);
1033                               goto try_bracketed_repeat;
1034                             }
1035                           else
1036                             {
1037                               char *tmp = make_printable_str (opnd_str,
1038                                                               opnd_str_len);
1039                               error (0, 0,
1040                _("%s: equivalence class operand must be a single character"),
1041                                      tmp);
1042                               free (tmp);
1043                               return 1;
1044                             }
1045                         }
1046                     }
1047                   free (opnd_str);
1048
1049                   /* Return nonzero if append_*_class reports a problem.  */
1050                   if (parse_failed)
1051                     return 1;
1052                   else
1053                     i = closing_delim_idx + 2;
1054                   continue;
1055                 }
1056               /* Else fall through.  This could be [:*] or [=*].  */
1057             }
1058
1059         try_bracketed_repeat:
1060
1061           /* Determine whether this is a bracketed repeat range
1062              matching the RE \[.\*(dec_or_oct_number)?\].  */
1063           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1064                                        &repeat_count,
1065                                        &closing_bracket_idx);
1066           if (err == 0)
1067             {
1068               append_repeated_char (result, char_to_repeat, repeat_count);
1069               i = closing_bracket_idx + 1;
1070             }
1071           else if (err == -1)
1072             {
1073               matched_multi_char_construct = 0;
1074             }
1075           else
1076             {
1077               /* Found a string that looked like [c*n] but the
1078                  numeric part was invalid.  */
1079               return 1;
1080             }
1081
1082           if (matched_multi_char_construct)
1083             continue;
1084
1085           /* We reach this point if P does not match [:str:], [=c=],
1086              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1087              (from `[' to `c').  */
1088         }
1089
1090       /* Look ahead one char for ranges like a-z.  */
1091       if (ES_MATCH (es, i + 1, '-'))
1092         {
1093           if (append_range (result, p[i], p[i + 2]))
1094             return 1;
1095           i += 3;
1096         }
1097       else
1098         {
1099           append_normal_char (result, p[i]);
1100           ++i;
1101         }
1102     }
1103
1104   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1105   for (; i < es->len; i++)
1106     append_normal_char (result, p[i]);
1107
1108   return 0;
1109 }
1110
1111 /* Given a Spec_list S (with its saved state implicit in the values
1112    of its members `tail' and `state'), return the next single character
1113    in the expansion of S's constructs.  If the last character of S was
1114    returned on the previous call or if S was empty, this function
1115    returns -1.  For example, successive calls to get_next where S
1116    represents the spec-string 'a-d[y*3]' will return the sequence
1117    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1118    which the returned character comes is [:upper:] or [:lower:], the
1119    parameter CLASS is given a value to indicate which it was.  Otherwise
1120    CLASS is set to UL_NONE.  This value is used only when constructing
1121    the translation table to verify that any occurrences of upper and
1122    lower class constructs in the spec-strings appear in the same relative
1123    positions.  */
1124
1125 static int
1126 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1127 {
1128   struct List_element *p;
1129   int return_val;
1130   int i;
1131
1132   if (class)
1133     *class = UL_NONE;
1134
1135   if (s->state == BEGIN_STATE)
1136     {
1137       s->tail = s->head->next;
1138       s->state = NEW_ELEMENT;
1139     }
1140
1141   p = s->tail;
1142   if (p == NULL)
1143     return -1;
1144
1145   switch (p->type)
1146     {
1147     case RE_NORMAL_CHAR:
1148       return_val = p->u.normal_char;
1149       s->state = NEW_ELEMENT;
1150       s->tail = p->next;
1151       break;
1152
1153     case RE_RANGE:
1154       if (s->state == NEW_ELEMENT)
1155         s->state = ORD (p->u.range.first_char);
1156       else
1157         ++(s->state);
1158       return_val = CHR (s->state);
1159       if (s->state == ORD (p->u.range.last_char))
1160         {
1161           s->tail = p->next;
1162           s->state = NEW_ELEMENT;
1163         }
1164       break;
1165
1166     case RE_CHAR_CLASS:
1167       if (class)
1168         {
1169           int upper_or_lower;
1170           switch (p->u.char_class)
1171             {
1172             case CC_LOWER:
1173               *class = UL_LOWER;
1174               upper_or_lower = 1;
1175               break;
1176             case CC_UPPER:
1177               *class = UL_UPPER;
1178               upper_or_lower = 1;
1179               break;
1180             default:
1181               upper_or_lower = 0;
1182               break;
1183             }
1184
1185           if (upper_or_lower)
1186             {
1187               s->tail = p->next;
1188               s->state = NEW_ELEMENT;
1189               return_val = 0;
1190               break;
1191             }
1192         }
1193
1194       if (s->state == NEW_ELEMENT)
1195         {
1196           for (i = 0; i < N_CHARS; i++)
1197             if (is_char_class_member (p->u.char_class, i))
1198               break;
1199           assert (i < N_CHARS);
1200           s->state = i;
1201         }
1202       assert (is_char_class_member (p->u.char_class, s->state));
1203       return_val = CHR (s->state);
1204       for (i = s->state + 1; i < N_CHARS; i++)
1205         if (is_char_class_member (p->u.char_class, i))
1206           break;
1207       if (i < N_CHARS)
1208         s->state = i;
1209       else
1210         {
1211           s->tail = p->next;
1212           s->state = NEW_ELEMENT;
1213         }
1214       break;
1215
1216     case RE_EQUIV_CLASS:
1217       /* FIXME: this assumes that each character is alone in its own
1218          equivalence class (which appears to be correct for my
1219          LC_COLLATE.  But I don't know of any function that allows
1220          one to determine a character's equivalence class.  */
1221
1222       return_val = p->u.equiv_code;
1223       s->state = NEW_ELEMENT;
1224       s->tail = p->next;
1225       break;
1226
1227     case RE_REPEATED_CHAR:
1228       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1229       if (p->u.repeated_char.repeat_count == 0)
1230         {
1231           s->tail = p->next;
1232           s->state = NEW_ELEMENT;
1233           return_val = get_next (s, class);
1234         }
1235       else
1236         {
1237           if (s->state == NEW_ELEMENT)
1238             {
1239               s->state = 0;
1240             }
1241           ++(s->state);
1242           return_val = p->u.repeated_char.the_repeated_char;
1243           if (p->u.repeated_char.repeat_count > 0
1244               && s->state == p->u.repeated_char.repeat_count)
1245             {
1246               s->tail = p->next;
1247               s->state = NEW_ELEMENT;
1248             }
1249         }
1250       break;
1251
1252     case RE_NO_TYPE:
1253       abort ();
1254       break;
1255
1256     default:
1257       abort ();
1258       break;
1259     }
1260
1261   return return_val;
1262 }
1263
1264 /* This is a minor kludge.  This function is called from
1265    get_spec_stats to determine the cardinality of a set derived
1266    from a complemented string.  It's a kludge in that some of the
1267    same operations are (duplicated) performed in set_initialize.  */
1268
1269 static int
1270 card_of_complement (struct Spec_list *s)
1271 {
1272   int c;
1273   int cardinality = N_CHARS;
1274   SET_TYPE in_set[N_CHARS];
1275
1276   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1277   s->state = BEGIN_STATE;
1278   while ((c = get_next (s, NULL)) != -1)
1279     if (!in_set[c]++)
1280       --cardinality;
1281   return cardinality;
1282 }
1283
1284 /* Gather statistics about the spec-list S in preparation for the tests
1285    in validate that determine the consistency of the specs.  This function
1286    is called at most twice; once for string1, and again for any string2.
1287    LEN_S1 < 0 indicates that this is the first call and that S represents
1288    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1289    constructs in string1, and we can use its value to resolve any
1290    indefinite repeat construct in S (which represents string2).  Hence,
1291    this function has the side-effect that it converts a valid [c*]
1292    construct in string2 to [c*n] where n is large enough (or 0) to give
1293    string2 the same length as string1.  For example, with the command
1294    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1295    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1296
1297 static void
1298 get_spec_stats (struct Spec_list *s)
1299 {
1300   struct List_element *p;
1301   int len = 0;
1302
1303   s->n_indefinite_repeats = 0;
1304   s->has_equiv_class = 0;
1305   s->has_restricted_char_class = 0;
1306   s->has_char_class = 0;
1307   for (p = s->head->next; p; p = p->next)
1308     {
1309       switch (p->type)
1310         {
1311           int i;
1312         case RE_NORMAL_CHAR:
1313           ++len;
1314           break;
1315
1316         case RE_RANGE:
1317           assert (p->u.range.last_char >= p->u.range.first_char);
1318           len += p->u.range.last_char - p->u.range.first_char + 1;
1319           break;
1320
1321         case RE_CHAR_CLASS:
1322           s->has_char_class = 1;
1323           for (i = 0; i < N_CHARS; i++)
1324             if (is_char_class_member (p->u.char_class, i))
1325               ++len;
1326           switch (p->u.char_class)
1327             {
1328             case CC_UPPER:
1329             case CC_LOWER:
1330               break;
1331             default:
1332               s->has_restricted_char_class = 1;
1333               break;
1334             }
1335           break;
1336
1337         case RE_EQUIV_CLASS:
1338           for (i = 0; i < N_CHARS; i++)
1339             if (is_equiv_class_member (p->u.equiv_code, i))
1340               ++len;
1341           s->has_equiv_class = 1;
1342           break;
1343
1344         case RE_REPEATED_CHAR:
1345           if (p->u.repeated_char.repeat_count > 0)
1346             len += p->u.repeated_char.repeat_count;
1347           else if (p->u.repeated_char.repeat_count == 0)
1348             {
1349               s->indefinite_repeat_element = p;
1350               ++(s->n_indefinite_repeats);
1351             }
1352           break;
1353
1354         case RE_NO_TYPE:
1355           assert (0);
1356           break;
1357         }
1358     }
1359
1360   s->length = len;
1361 }
1362
1363 static void
1364 get_s1_spec_stats (struct Spec_list *s1)
1365 {
1366   get_spec_stats (s1);
1367   if (complement)
1368     s1->length = card_of_complement (s1);
1369 }
1370
1371 static void
1372 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1373 {
1374   get_spec_stats (s2);
1375   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1376     {
1377       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1378         len_s1 - s2->length;
1379       s2->length = len_s1;
1380     }
1381 }
1382
1383 static void
1384 spec_init (struct Spec_list *spec_list)
1385 {
1386   spec_list->head = spec_list->tail =
1387     (struct List_element *) xmalloc (sizeof (struct List_element));
1388   spec_list->head->next = NULL;
1389 }
1390
1391 /* This function makes two passes over the argument string S.  The first
1392    one converts all \c and \ddd escapes to their one-byte representations.
1393    The second constructs a linked specification list, SPEC_LIST, of the
1394    characters and constructs that comprise the argument string.  If either
1395    of these passes detects an error, this function returns nonzero.  */
1396
1397 static int
1398 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1399 {
1400   struct E_string es;
1401   int fail;
1402
1403   fail = unquote (s, &es);
1404   if (!fail)
1405     fail = build_spec_list (&es, spec_list);
1406   es_free (&es);
1407   return fail;
1408 }
1409
1410 /* Given two specification lists, S1 and S2, and assuming that
1411    S1->length > S2->length, append a single [c*n] element to S2 where c
1412    is the last character in the expansion of S2 and n is the difference
1413    between the two lengths.
1414    Upon successful completion, S2->length is set to S1->length.  The only
1415    way this function can fail to make S2 as long as S1 is when S2 has
1416    zero-length, since in that case, there is no last character to repeat.
1417    So S2->length is required to be at least 1.
1418
1419    Providing this functionality allows the user to do some pretty
1420    non-BSD (and non-portable) things:  For example, the command
1421        tr -cs '[:upper:]0-9' '[:lower:]'
1422    is almost guaranteed to give results that depend on your collating
1423    sequence.  */
1424
1425 static void
1426 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1427 {
1428   struct List_element *p;
1429   int char_to_repeat;
1430   int i;
1431
1432   assert (translating);
1433   assert (s1->length > s2->length);
1434   assert (s2->length > 0);
1435
1436   p = s2->tail;
1437   switch (p->type)
1438     {
1439     case RE_NORMAL_CHAR:
1440       char_to_repeat = p->u.normal_char;
1441       break;
1442     case RE_RANGE:
1443       char_to_repeat = p->u.range.last_char;
1444       break;
1445     case RE_CHAR_CLASS:
1446       for (i = N_CHARS; i >= 0; i--)
1447         if (is_char_class_member (p->u.char_class, i))
1448           break;
1449       assert (i >= 0);
1450       char_to_repeat = CHR (i);
1451       break;
1452
1453     case RE_REPEATED_CHAR:
1454       char_to_repeat = p->u.repeated_char.the_repeated_char;
1455       break;
1456
1457     case RE_EQUIV_CLASS:
1458       /* This shouldn't happen, because validate exits with an error
1459          if it finds an equiv class in string2 when translating.  */
1460       abort ();
1461       break;
1462
1463     case RE_NO_TYPE:
1464       abort ();
1465       break;
1466
1467     default:
1468       abort ();
1469       break;
1470     }
1471
1472   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1473   s2->length = s1->length;
1474 }
1475
1476 /* Return non-zero if S is a non-empty list in which exactly one
1477    character (but potentially, many instances of it) appears.
1478    E.g.  [X*] or xxxxxxxx.  */
1479
1480 static int
1481 homogeneous_spec_list (struct Spec_list *s)
1482 {
1483   int b, c;
1484
1485   s->state = BEGIN_STATE;
1486
1487   if ((b = get_next (s, NULL)) == -1)
1488     return 0;
1489
1490   while ((c = get_next (s, NULL)) != -1)
1491     if (c != b)
1492       return 0;
1493
1494   return 1;
1495 }
1496
1497 /* Die with an error message if S1 and S2 describe strings that
1498    are not valid with the given command line switches.
1499    A side effect of this function is that if a valid [c*] or
1500    [c*0] construct appears in string2, it is converted to [c*n]
1501    with a value for n that makes s2->length == s1->length.  By
1502    the same token, if the --truncate-set1 option is not
1503    given, S2 may be extended.  */
1504
1505 static void
1506 validate (struct Spec_list *s1, struct Spec_list *s2)
1507 {
1508   get_s1_spec_stats (s1);
1509   if (s1->n_indefinite_repeats > 0)
1510     {
1511       error (EXIT_FAILURE, 0,
1512              _("the [c*] repeat construct may not appear in string1"));
1513     }
1514
1515   if (s2)
1516     {
1517       get_s2_spec_stats (s2, s1->length);
1518
1519       if (s2->n_indefinite_repeats > 1)
1520         {
1521           error (EXIT_FAILURE, 0,
1522                  _("only one [c*] repeat construct may appear in string2"));
1523         }
1524
1525       if (translating)
1526         {
1527           if (s2->has_equiv_class)
1528             {
1529               error (EXIT_FAILURE, 0,
1530                      _("[=c=] expressions may not appear in string2 \
1531 when translating"));
1532             }
1533
1534           if (s1->length > s2->length)
1535             {
1536               if (!truncate_set1)
1537                 {
1538                   /* string2 must be non-empty unless --truncate-set1 is
1539                      given or string1 is empty.  */
1540
1541                   if (s2->length == 0)
1542                     error (EXIT_FAILURE, 0,
1543                      _("when not truncating set1, string2 must be non-empty"));
1544                   string2_extend (s1, s2);
1545                 }
1546             }
1547
1548           if (complement && s1->has_char_class
1549               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1550             {
1551               error (EXIT_FAILURE, 0,
1552                      _("when translating with complemented character classes,\
1553 \nstring2 must map all characters in the domain to one"));
1554             }
1555
1556           if (s2->has_restricted_char_class)
1557             {
1558               error (EXIT_FAILURE, 0,
1559                      _("when translating, the only character classes that may \
1560 appear in\nstring2 are `upper' and `lower'"));
1561             }
1562         }
1563       else
1564         /* Not translating.  */
1565         {
1566           if (s2->n_indefinite_repeats > 0)
1567             error (EXIT_FAILURE, 0,
1568                    _("the [c*] construct may appear in string2 only \
1569 when translating"));
1570         }
1571     }
1572 }
1573
1574 /* Read buffers of SIZE bytes via the function READER (if READER is
1575    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1576    read_and_delete or read_and_xlate.  After each buffer is read, it is
1577    processed and written to stdout.  The buffers are processed so that
1578    multiple consecutive occurrences of the same character in the input
1579    stream are replaced by a single occurrence of that character if the
1580    character is in the squeeze set.  */
1581
1582 static void
1583 squeeze_filter (unsigned char *buf, size_t size, Filter reader)
1584 {
1585   unsigned int char_to_squeeze = NOT_A_CHAR;
1586   size_t i = 0;
1587   ssize_t nr = 0;
1588
1589   for (;;)
1590     {
1591       size_t begin;
1592
1593       if (i >= nr)
1594         {
1595           if (reader == NULL)
1596             {
1597               ssize_t signed_nr = safe_read (0, (char *) buf, size);
1598               if (signed_nr < 0)
1599                 error (EXIT_FAILURE, errno, _("read error"));
1600               nr = signed_nr;
1601             }
1602           else
1603             {
1604               nr = (*reader) (buf, size, NULL);
1605             }
1606
1607           if (nr == 0)
1608             break;
1609           i = 0;
1610         }
1611
1612       begin = i;
1613
1614       if (char_to_squeeze == NOT_A_CHAR)
1615         {
1616           size_t out_len;
1617           /* Here, by being a little tricky, we can get a significant
1618              performance increase in most cases when the input is
1619              reasonably large.  Since tr will modify the input only
1620              if two consecutive (and identical) input characters are
1621              in the squeeze set, we can step by two through the data
1622              when searching for a character in the squeeze set.  This
1623              means there may be a little more work in a few cases and
1624              perhaps twice as much work in the worst cases where most
1625              of the input is removed by squeezing repeats.  But most
1626              uses of this functionality seem to remove less than 20-30%
1627              of the input.  */
1628           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1629             ;                   /* empty */
1630
1631           /* There is a special case when i == nr and we've just
1632              skipped a character (the last one in buf) that is in
1633              the squeeze set.  */
1634           if (i == nr && in_squeeze_set[buf[i - 1]])
1635             --i;
1636
1637           if (i >= nr)
1638             out_len = nr - begin;
1639           else
1640             {
1641               char_to_squeeze = buf[i];
1642               /* We're about to output buf[begin..i].  */
1643               out_len = i - begin + 1;
1644
1645               /* But since we stepped by 2 in the loop above,
1646                  out_len may be one too large.  */
1647               if (i > 0 && buf[i - 1] == char_to_squeeze)
1648                 --out_len;
1649
1650               /* Advance i to the index of first character to be
1651                  considered when looking for a char different from
1652                  char_to_squeeze.  */
1653               ++i;
1654             }
1655           if (out_len > 0
1656               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1657             error (EXIT_FAILURE, errno, _("write error"));
1658         }
1659
1660       if (char_to_squeeze != NOT_A_CHAR)
1661         {
1662           /* Advance i to index of first char != char_to_squeeze
1663              (or to nr if all the rest of the characters in this
1664              buffer are the same as char_to_squeeze).  */
1665           for (; i < nr && buf[i] == char_to_squeeze; i++)
1666             ;                   /* empty */
1667           if (i < nr)
1668             char_to_squeeze = NOT_A_CHAR;
1669           /* If (i >= nr) we've squeezed the last character in this buffer.
1670              So now we have to read a new buffer and continue comparing
1671              characters against char_to_squeeze.  */
1672         }
1673     }
1674 }
1675
1676 /* Read buffers of SIZE bytes from stdin until one is found that
1677    contains at least one character not in the delete set.  Store
1678    in the array BUF, all characters from that buffer that are not
1679    in the delete set, and return the number of characters saved
1680    or 0 upon EOF.  */
1681
1682 static size_t
1683 read_and_delete (unsigned char *buf, size_t size, Filter not_used)
1684 {
1685   size_t n_saved;
1686   static int hit_eof = 0;
1687
1688   assert (not_used == NULL);
1689
1690   if (hit_eof)
1691     return 0;
1692
1693   /* This enclosing do-while loop is to make sure that
1694      we don't return zero (indicating EOF) when we've
1695      just deleted all the characters in a buffer.  */
1696   do
1697     {
1698       size_t i;
1699       ssize_t nr = safe_read (0, (char *) buf, size);
1700
1701       if (nr < 0)
1702         error (EXIT_FAILURE, errno, _("read error"));
1703       if (nr == 0)
1704         {
1705           hit_eof = 1;
1706           return 0;
1707         }
1708
1709       /* This first loop may be a waste of code, but gives much
1710          better performance when no characters are deleted in
1711          the beginning of a buffer.  It just avoids the copying
1712          of buf[i] into buf[n_saved] when it would be a NOP.  */
1713
1714       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1715         /* empty */ ;
1716       n_saved = i;
1717
1718       for (++i; i < nr; i++)
1719         if (!in_delete_set[buf[i]])
1720           buf[n_saved++] = buf[i];
1721     }
1722   while (n_saved == 0);
1723
1724   return n_saved;
1725 }
1726
1727 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1728    perform the in-place and one-to-one mapping specified by the global
1729    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1730
1731 static size_t
1732 read_and_xlate (unsigned char *buf, size_t size, Filter not_used)
1733 {
1734   ssize_t chars_read = 0;
1735   static int hit_eof = 0;
1736   size_t i;
1737
1738   assert (not_used == NULL);
1739
1740   if (hit_eof)
1741     return 0;
1742
1743   chars_read = safe_read (0, (char *) buf, size);
1744   if (chars_read < 0)
1745     error (EXIT_FAILURE, errno, _("read error"));
1746   if (chars_read == 0)
1747     {
1748       hit_eof = 1;
1749       return 0;
1750     }
1751
1752   for (i = 0; i < chars_read; i++)
1753     buf[i] = xlate[buf[i]];
1754
1755   return chars_read;
1756 }
1757
1758 /* Initialize a boolean membership set IN_SET with the character
1759    values obtained by traversing the linked list of constructs S
1760    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1761    nonzero the resulting set is complemented.  */
1762
1763 static void
1764 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1765 {
1766   int c;
1767   size_t i;
1768
1769   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1770   s->state = BEGIN_STATE;
1771   while ((c = get_next (s, NULL)) != -1)
1772     in_set[c] = 1;
1773   if (complement_this_set)
1774     for (i = 0; i < N_CHARS; i++)
1775       in_set[i] = (!in_set[i]);
1776 }
1777
1778 int
1779 main (int argc, char **argv)
1780 {
1781   int c;
1782   int non_option_args;
1783   struct Spec_list buf1, buf2;
1784   struct Spec_list *s1 = &buf1;
1785   struct Spec_list *s2 = &buf2;
1786
1787   program_name = argv[0];
1788   setlocale (LC_ALL, "");
1789   bindtextdomain (PACKAGE, LOCALEDIR);
1790   textdomain (PACKAGE);
1791
1792   atexit (close_stdout);
1793
1794   while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1795     {
1796       switch (c)
1797         {
1798         case 0:
1799           break;
1800
1801         case 'c':
1802           complement = 1;
1803           break;
1804
1805         case 'd':
1806           delete = 1;
1807           break;
1808
1809         case 's':
1810           squeeze_repeats = 1;
1811           break;
1812
1813         case 't':
1814           truncate_set1 = 1;
1815           break;
1816
1817         case_GETOPT_HELP_CHAR;
1818
1819         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1820
1821         default:
1822           usage (2);
1823           break;
1824         }
1825     }
1826
1827   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1828
1829   non_option_args = argc - optind;
1830   translating = (non_option_args == 2 && !delete);
1831
1832   /* Change this test if it is valid to give tr no options and
1833      no args at all.  POSIX doesn't specifically say anything
1834      either way, but it looks like they implied it's invalid
1835      by omission.  If you want to make tr do a slow imitation
1836      of `cat' use `tr a a'.  */
1837   if (non_option_args > 2)
1838     {
1839       error (0, 0, _("too many arguments"));
1840       usage (2);
1841     }
1842
1843   if (!delete && !squeeze_repeats && non_option_args != 2)
1844     error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1845
1846   if (delete && squeeze_repeats && non_option_args != 2)
1847     error (EXIT_FAILURE, 0, _("two strings must be given when both \
1848 deleting and squeezing repeats"));
1849
1850   /* If --delete is given without --squeeze-repeats, then
1851      only one string argument may be specified.  But POSIX
1852      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1853      is set, pretend we never saw string2.  But I think
1854      this deserves a fatal error, so that's the default.  */
1855   if ((delete && !squeeze_repeats) && non_option_args != 1)
1856     {
1857       if (posix_pedantic && non_option_args == 2)
1858         --non_option_args;
1859       else
1860         error (EXIT_FAILURE, 0,
1861                _("only one string may be given when deleting \
1862 without squeezing repeats"));
1863     }
1864
1865   if (squeeze_repeats && non_option_args == 0)
1866     error (EXIT_FAILURE, 0,
1867            _("at least one string must be given when squeezing repeats"));
1868
1869   spec_init (s1);
1870   if (parse_str ((unsigned char *) argv[optind], s1))
1871     exit (EXIT_FAILURE);
1872
1873   if (non_option_args == 2)
1874     {
1875       spec_init (s2);
1876       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1877         exit (EXIT_FAILURE);
1878     }
1879   else
1880     s2 = NULL;
1881
1882   validate (s1, s2);
1883
1884   /* Use binary I/O, since `tr' is sometimes used to transliterate
1885      non-printable characters, or characters which are stripped away
1886      by text-mode reads (like CR and ^Z).  */
1887   SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);
1888
1889   if (squeeze_repeats && non_option_args == 1)
1890     {
1891       set_initialize (s1, complement, in_squeeze_set);
1892       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1893     }
1894   else if (delete && non_option_args == 1)
1895     {
1896       size_t nr;
1897
1898       set_initialize (s1, complement, in_delete_set);
1899       do
1900         {
1901           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1902           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1903             error (EXIT_FAILURE, errno, _("write error"));
1904         }
1905       while (nr > 0);
1906     }
1907   else if (squeeze_repeats && delete && non_option_args == 2)
1908     {
1909       set_initialize (s1, complement, in_delete_set);
1910       set_initialize (s2, 0, in_squeeze_set);
1911       squeeze_filter (io_buf, IO_BUF_SIZE, read_and_delete);
1912     }
1913   else if (translating)
1914     {
1915       if (complement)
1916         {
1917           int i;
1918           SET_TYPE *in_s1 = in_delete_set;
1919
1920           set_initialize (s1, 0, in_s1);
1921           s2->state = BEGIN_STATE;
1922           for (i = 0; i < N_CHARS; i++)
1923             xlate[i] = i;
1924           for (i = 0; i < N_CHARS; i++)
1925             {
1926               if (!in_s1[i])
1927                 {
1928                   int ch = get_next (s2, NULL);
1929                   assert (ch != -1 || truncate_set1);
1930                   if (ch == -1)
1931                     {
1932                       /* This will happen when tr is invoked like e.g.
1933                          tr -cs A-Za-z0-9 '\012'.  */
1934                       break;
1935                     }
1936                   xlate[i] = ch;
1937                 }
1938             }
1939           assert (get_next (s2, NULL) == -1 || truncate_set1);
1940         }
1941       else
1942         {
1943           int c1, c2;
1944           int i;
1945           enum Upper_Lower_class class_s1;
1946           enum Upper_Lower_class class_s2;
1947
1948           for (i = 0; i < N_CHARS; i++)
1949             xlate[i] = i;
1950           s1->state = BEGIN_STATE;
1951           s2->state = BEGIN_STATE;
1952           for (;;)
1953             {
1954               c1 = get_next (s1, &class_s1);
1955               c2 = get_next (s2, &class_s2);
1956               if (!class_ok[(int) class_s1][(int) class_s2])
1957                 error (EXIT_FAILURE, 0,
1958                        _("misaligned [:upper:] and/or [:lower:] construct"));
1959
1960               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1961                 {
1962                   for (i = 0; i < N_CHARS; i++)
1963                     if (ISLOWER (i))
1964                       xlate[i] = toupper (i);
1965                 }
1966               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1967                 {
1968                   for (i = 0; i < N_CHARS; i++)
1969                     if (ISUPPER (i))
1970                       xlate[i] = tolower (i);
1971                 }
1972               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1973                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1974                 {
1975                   /* By default, GNU tr permits the identity mappings: from
1976                      [:upper:] to [:upper:] and [:lower:] to [:lower:].  But
1977                      when POSIXLY_CORRECT is set, those evoke diagnostics.  */
1978                   if (posix_pedantic)
1979                     {
1980                       error (EXIT_FAILURE, 0,
1981                              _("\
1982 invalid identity mapping;  when translating, any [:lower:] or [:upper:]\n\
1983 construct in string1 must be aligned with a corresponding construct\n\
1984 ([:upper:] or [:lower:], respectively) in string2"));
1985                     }
1986                 }
1987               else
1988                 {
1989                   /* The following should have been checked by validate...  */
1990                   if (c1 == -1 || c2 == -1)
1991                     break;
1992                   xlate[c1] = c2;
1993                 }
1994             }
1995           assert (c1 == -1 || truncate_set1);
1996         }
1997       if (squeeze_repeats)
1998         {
1999           set_initialize (s2, 0, in_squeeze_set);
2000           squeeze_filter (io_buf, IO_BUF_SIZE, read_and_xlate);
2001         }
2002       else
2003         {
2004           size_t chars_read;
2005
2006           do
2007             {
2008               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2009               if (chars_read > 0
2010                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2011                 error (EXIT_FAILURE, errno, _("write error"));
2012             }
2013           while (chars_read > 0);
2014         }
2015     }
2016
2017   if (close (0) != 0)
2018     error (EXIT_FAILURE, errno, _("standard input"));
2019
2020   exit (EXIT_SUCCESS);
2021 }