src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 95, 96, 1997, 1998 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering */
  19
  20 #include <config.h>
  21
  22 /* Get isblank from GNU libc.  */
  23 #define _GNU_SOURCE
  24
  25 #include <stdio.h>
  26 #define NDEBUG 1
  27 #include <assert.h>
  28 #include <errno.h>
  29 #include <sys/types.h>
  30 #include <getopt.h>
  31
  32 #include "system.h"
  33 #include "error.h"
  34
  35 #define N_CHARS (UCHAR_MAX + 1)
  36
  37 /* A pointer to a function that returns an int.  */
  38 typedef int (*PFI) ();
  39
  40 /* Convert from character C to its index in the collating
  41    sequence array.  Just cast to an unsigned int to avoid
  42    problems with sign-extension.  */
  43 #define ORD(c) (unsigned int)(c)
  44
  45 /* The inverse of ORD.  */
  46 #define CHR(i) (unsigned char)(i)
  47
  48 /* The value for Spec_list->state that indicates to
  49    get_next that it should initialize the tail pointer.
  50    Its value should be as large as possible to avoid conflict
  51    a valid value for the state field -- and that may be as
  52    large as any valid repeat_count.  */
  53 #define BEGIN_STATE (INT_MAX - 1)
  54
  55 /* The value for Spec_list->state that indicates to
  56    get_next that the element pointed to by Spec_list->tail is
  57    being considered for the first time on this pass through the
  58    list -- it indicates that get_next should make any necessary
  59    initializations.  */
  60 #define NEW_ELEMENT (BEGIN_STATE + 1)
  61
  62 /* A value distinct from any character that may have been stored in a
  63    buffer as the result of a block-read in the function squeeze_filter.  */
  64 #define NOT_A_CHAR (unsigned int)(-1)
  65
  66 /* The following (but not CC_NO_CLASS) are indices into the array of
  67    valid character class strings.  */
  68 enum Char_class
  69   {
  70     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  71     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  72     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  73     CC_NO_CLASS = 9999
  74   };
  75
  76 /* Character class to which a character (returned by get_next) belonged;
  77    but it is set only if the construct from which the character was obtained
  78    was one of the character classes [:upper:] or [:lower:].  The value
  79    is used only when translating and then, only to make sure that upper
  80    and lower class constructs have the same relative positions in string1
  81    and string2.  */
  82 enum Upper_Lower_class
  83   {
  84     UL_LOWER = 0,
  85     UL_UPPER = 1,
  86     UL_NONE = 2
  87   };
  88
  89 /* A shortcut to ensure that when constructing the translation array,
  90    one of the values returned by paired calls to get_next (from s1 and s2)
  91    is from [:upper:] and the other is from [:lower:], or neither is from
  92    upper or lower.  By default, GNU tr permits the identity mappings: from
  93    [:upper:] to [:upper:] and [:lower:] to [:lower:].  But when
  94    POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
  95    by values of type enum Upper_Lower_class.  */
  96 static int const class_ok[3][3] =
  97 {
  98   {1, 1, 0},
  99   {1, 1, 0},
 100   {0, 0, 1}
 101 };
 102
 103 /* The type of a List_element.  See build_spec_list for more details.  */
 104 enum Range_element_type
 105   {
 106     RE_NO_TYPE = 0,
 107     RE_NORMAL_CHAR,
 108     RE_RANGE,
 109     RE_CHAR_CLASS,
 110     RE_EQUIV_CLASS,
 111     RE_REPEATED_CHAR
 112   };
 113
 114 /* One construct in one of tr's argument strings.
 115    For example, consider the POSIX version of the classic tr command:
 116        tr -cs 'a-zA-Z_' '[\n*]'
 117    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 118    and a single normal character, `_'.  String2 has one construct.  */
 119 struct List_element
 120   {
 121     enum Range_element_type type;
 122     struct List_element *next;
 123     union
 124       {
 125         int normal_char;
 126         struct                  /* unnamed */
 127           {
 128             unsigned int first_char;
 129             unsigned int last_char;
 130           }
 131         range;
 132         enum Char_class char_class;
 133         int equiv_code;
 134         struct                  /* unnamed */
 135           {
 136             unsigned int the_repeated_char;
 137             size_t repeat_count;
 138           }
 139         repeated_char;
 140       }
 141     u;
 142   };
 143
 144 /* Each of tr's argument strings is parsed into a form that is easier
 145    to work with: a linked list of constructs (struct List_element).
 146    Each Spec_list structure also encapsulates various attributes of
 147    the corresponding argument string.  The attributes are used mainly
 148    to verify that the strings are valid in the context of any options
 149    specified (like -s, -d, or -c).  The main exception is the member
 150    `tail', which is first used to construct the list.  After construction,
 151    it is used by get_next to save its state when traversing the list.
 152    The member `state' serves a similar function.  */
 153 struct Spec_list
 154   {
 155     /* Points to the head of the list of range elements.
 156        The first struct is a dummy; its members are never used.  */
 157     struct List_element *head;
 158
 159     /* When appending, points to the last element.  When traversing via
 160        get_next(), points to the element to process next.  Setting
 161        Spec_list.state to the value BEGIN_STATE before calling get_next
 162        signals get_next to initialize tail to point to head->next.  */
 163     struct List_element *tail;
 164
 165     /* Used to save state between calls to get_next().  */
 166     unsigned int state;
 167
 168     /* Length, in the sense that length ('a-z[:digit:]123abc')
 169        is 42 ( = 26 + 10 + 6).  */
 170     size_t length;
 171
 172     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 173     int n_indefinite_repeats;
 174
 175     /* If n_indefinite_repeats is nonzero, this points to the List_element
 176        corresponding to the last [c*] or [c*0] construct encountered in
 177        this spec.  Otherwise it is undefined.  */
 178     struct List_element *indefinite_repeat_element;
 179
 180     /* Non-zero if this spec contains at least one equivalence
 181        class construct e.g. [=c=].  */
 182     int has_equiv_class;
 183
 184     /* Non-zero if this spec contains at least one character class
 185        construct.  E.g. [:digit:].  */
 186     int has_char_class;
 187
 188     /* Non-zero if this spec contains at least one of the character class
 189        constructs (all but upper and lower) that aren't allowed in s2.  */
 190     int has_restricted_char_class;
 191   };
 192
 193 /* A representation for escaped string1 or string2.  As a string is parsed,
 194    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 195    etc.) are marked as such in this structure by setting the corresponding
 196    entry in the ESCAPED vector.  */
 197 struct E_string
 198 {
 199   unsigned char *s;
 200   int *escaped;
 201   size_t len;
 202 };
 203
 204 /* Return nonzero if the Ith character of escaped string ES matches C
 205    and is not escaped itself.  */
 206 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 207
 208 int safe_read ();
 209
 210 /* The name by which this program was run.  */
 211 char *program_name;
 212
 213 /* When nonzero, each sequence in the input of a repeated character
 214    (call it c) is replaced (in the output) by a single occurrence of c
 215    for every c in the squeeze set.  */
 216 static int squeeze_repeats = 0;
 217
 218 /* When nonzero, removes characters in the delete set from input.  */
 219 static int delete = 0;
 220
 221 /* Use the complement of set1 in place of set1.  */
 222 static int complement = 0;
 223
 224 /* When nonzero, this flag causes GNU tr to provide strict
 225    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 226    says that when -d is used without -s, string2 (if present)
 227    must be ignored.  Silently ignoring arguments is a bad idea.
 228    The default GNU behavior is to give a usage message and exit.
 229    Additionally, when this flag is nonzero, tr prints warnings
 230    on stderr if it is being used in a manner that is not portable.
 231    Applicable warnings are given by default, but are suppressed
 232    if the environment variable `POSIXLY_CORRECT' is set, since
 233    being POSIX conformant means we can't issue such messages.
 234    Warnings on the following topics are suppressed when this
 235    variable is nonzero:
 236    1. Ambiguous octal escapes.  */
 237 static int posix_pedantic;
 238
 239 /* When tr is performing translation and string1 is longer than string2,
 240    POSIX says that the result is undefined.  That gives the implementor
 241    of a POSIX conforming version of tr two reasonable choices for the
 242    semantics of this case.
 243
 244    * The BSD tr pads string2 to the length of string1 by
 245    repeating the last character in string2.
 246
 247    * System V tr ignores characters in string1 that have no
 248    corresponding character in string2.  That is, string1 is effectively
 249    truncated to the length of string2.
 250
 251    When nonzero, this flag causes GNU tr to imitate the behavior
 252    of System V tr when translating with string1 longer than string2.
 253    The default is to emulate BSD tr.  This flag is ignored in modes where
 254    no translation is performed.  Emulating the System V tr
 255    in this exceptional case causes the relatively common BSD idiom:
 256
 257        tr -cs A-Za-z0-9 '\012'
 258
 259    to break (it would convert only zero bytes, rather than all
 260    non-alphanumerics, to newlines).
 261
 262    WARNING: This switch does not provide general BSD or System V
 263    compatibility.  For example, it doesn't disable the interpretation
 264    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 265    some unfortunate coincidence you use such constructs in scripts
 266    expecting to use some other version of tr, the scripts will break.  */
 267 static int truncate_set1 = 0;
 268
 269 /* An alias for (!delete && non_option_args == 2).
 270    It is set in main and used there and in validate().  */
 271 static int translating;
 272
 273 #ifndef BUFSIZ
 274 # define BUFSIZ 8192
 275 #endif
 276
 277 #define IO_BUF_SIZE BUFSIZ
 278 static unsigned char io_buf[IO_BUF_SIZE];
 279
 280 static char const *const char_class_name[] =
 281 {
 282   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 283   "lower", "print", "punct", "space", "upper", "xdigit"
 284 };
 285 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 286
 287 typedef char SET_TYPE;
 288
 289 /* Array of boolean values.  A character `c' is a member of the
 290    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 291    set is defined by the last (possibly, the only) string argument
 292    on the command line when the squeeze option is given.  */
 293 static SET_TYPE in_squeeze_set[N_CHARS];
 294
 295 /* Array of boolean values.  A character `c' is a member of the
 296    delete set if and only if in_delete_set[c] is true.  The delete
 297    set is defined by the first (or only) string argument on the
 298    command line when the delete option is given.  */
 299 static SET_TYPE in_delete_set[N_CHARS];
 300
 301 /* Array of character values defining the translation (if any) that
 302    tr is to perform.  Translation is performed only when there are
 303    two specification strings and the delete switch is not given.  */
 304 static char xlate[N_CHARS];
 305
 306 /* If nonzero, display usage information and exit.  */
 307 static int show_help;
 308
 309 /* If nonzero, print the version on standard output then exit.  */
 310 static int show_version;
 311
 312 static struct option const long_options[] =
 313 {
 314   {"complement", no_argument, NULL, 'c'},
 315   {"delete", no_argument, NULL, 'd'},
 316   {"squeeze-repeats", no_argument, NULL, 's'},
 317   {"truncate-set1", no_argument, NULL, 't'},
 318   {"help", no_argument, &show_help, 1},
 319   {"version", no_argument, &show_version, 1},
 320   {NULL, 0, NULL, 0}
 321 };
 322 \f
 323 static void
 324 usage (int status)
 325 {
 326   if (status != 0)
 327     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 328              program_name);
 329   else
 330     {
 331       printf (_("\
 332 Usage: %s [OPTION]... SET1 [SET2]\n\
 333 "),
 334               program_name);
 335       printf (_("\
 336 Translate, squeeze, and/or delete characters from standard input,\n\
 337 writing to standard output.\n\
 338 \n\
 339   -c, --complement        first complement SET1\n\
 340   -d, --delete            delete characters in SET1, do not translate\n\
 341   -s, --squeeze-repeats   replace sequence of characters with one\n\
 342   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 343       --help              display this help and exit\n\
 344       --version           output version information and exit\n\
 345 "));
 346       printf (_("\
 347 \n\
 348 SETs are specified as strings of characters.  Most represent themselves.\n\
 349 Interpreted sequences are:\n\
 350 \n\
 351   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 352   \\\\              backslash\n\
 353   \\a              audible BEL\n\
 354   \\b              backspace\n\
 355   \\f              form feed\n\
 356   \\n              new line\n\
 357   \\r              return\n\
 358   \\t              horizontal tab\n\
 359   \\v              vertical tab\n\
 360   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 361   [CHAR1-CHAR2]   same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
 362   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 363   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 364   [:alnum:]       all letters and digits\n\
 365   [:alpha:]       all letters\n\
 366   [:blank:]       all horizontal whitespace\n\
 367   [:cntrl:]       all control characters\n\
 368   [:digit:]       all digits\n\
 369   [:graph:]       all printable characters, not including space\n\
 370   [:lower:]       all lower case letters\n\
 371   [:print:]       all printable characters, including space\n\
 372   [:punct:]       all punctuation characters\n\
 373   [:space:]       all horizontal or vertical whitespace\n\
 374   [:upper:]       all upper case letters\n\
 375   [:xdigit:]      all hexadecimal digits\n\
 376   [=CHAR=]        all characters which are equivalent to CHAR\n\
 377 "));
 378       printf (_("\
 379 \n\
 380 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 381 -t may be used only when translating.  SET2 is extended to length of\n\
 382 SET1 by repeating its last character as necessary.  Excess characters\n\
 383 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 384 expand in ascending order; used in SET2 while translating, they may\n\
 385 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 386 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 387 translation or deletion.\n\
 388 "));
 389       puts (_("\nReport bugs to <textutils-bugs@gnu.org>."));
 390     }
 391   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 392 }
 393
 394 /* Return nonzero if the character C is a member of the
 395    equivalence class containing the character EQUIV_CLASS.  */
 396
 397 static int
 398 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
 399 {
 400   return (equiv_class == c);
 401 }
 402
 403 /* Return nonzero if the character C is a member of the
 404    character class CHAR_CLASS.  */
 405
 406 static int
 407 is_char_class_member (enum Char_class char_class, unsigned int c)
 408 {
 409   int result;
 410
 411   switch (char_class)
 412     {
 413     case CC_ALNUM:
 414       result = ISALNUM (c);
 415       break;
 416     case CC_ALPHA:
 417       result = ISALPHA (c);
 418       break;
 419     case CC_BLANK:
 420       result = ISBLANK (c);
 421       break;
 422     case CC_CNTRL:
 423       result = ISCNTRL (c);
 424       break;
 425     case CC_DIGIT:
 426       result = ISDIGIT_LOCALE (c);
 427       break;
 428     case CC_GRAPH:
 429       result = ISGRAPH (c);
 430       break;
 431     case CC_LOWER:
 432       result = ISLOWER (c);
 433       break;
 434     case CC_PRINT:
 435       result = ISPRINT (c);
 436       break;
 437     case CC_PUNCT:
 438       result = ISPUNCT (c);
 439       break;
 440     case CC_SPACE:
 441       result = ISSPACE (c);
 442       break;
 443     case CC_UPPER:
 444       result = ISUPPER (c);
 445       break;
 446     case CC_XDIGIT:
 447       result = ISXDIGIT (c);
 448       break;
 449     default:
 450       abort ();
 451       break;
 452     }
 453   return result;
 454 }
 455
 456 static void
 457 es_free (struct E_string *es)
 458 {
 459   free (es->s);
 460   free (es->escaped);
 461 }
 462
 463 /* Perform the first pass over each range-spec argument S, converting all
 464    \c and \ddd escapes to their one-byte representations.  The conversion
 465    is done in-place, so S must point to writable storage.  If an invalid
 466    quote sequence is found print an error message and return nonzero.
 467    Otherwise set *LEN to the length of the resulting string and return
 468    zero.  The resulting array of characters may contain zero-bytes;
 469    however, on input, S is assumed to be null-terminated, and hence
 470    cannot contain actual (non-escaped) zero bytes.  */
 471
 472 static int
 473 unquote (const unsigned char *s, struct E_string *es)
 474 {
 475   size_t i, j;
 476   size_t len;
 477
 478   len = strlen ((char *) s);
 479
 480   es->s = (unsigned char *) xmalloc (len);
 481   es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
 482   for (i = 0; i < len; i++)
 483     es->escaped[i] = 0;
 484
 485   j = 0;
 486   for (i = 0; s[i]; i++)
 487     {
 488       switch (s[i])
 489         {
 490           int c;
 491         case '\\':
 492           switch (s[i + 1])
 493             {
 494               int oct_digit;
 495             case '\\':
 496               c = '\\';
 497               break;
 498             case 'a':
 499               c = '\007';
 500               break;
 501             case 'b':
 502               c = '\b';
 503               break;
 504             case 'f':
 505               c = '\f';
 506               break;
 507             case 'n':
 508               c = '\n';
 509               break;
 510             case 'r':
 511               c = '\r';
 512               break;
 513             case 't':
 514               c = '\t';
 515               break;
 516             case 'v':
 517               c = '\v';
 518               break;
 519             case '0':
 520             case '1':
 521             case '2':
 522             case '3':
 523             case '4':
 524             case '5':
 525             case '6':
 526             case '7':
 527               c = s[i + 1] - '0';
 528               oct_digit = s[i + 2] - '0';
 529               if (0 <= oct_digit && oct_digit <= 7)
 530                 {
 531                   c = 8 * c + oct_digit;
 532                   ++i;
 533                   oct_digit = s[i + 2] - '0';
 534                   if (0 <= oct_digit && oct_digit <= 7)
 535                     {
 536                       if (8 * c + oct_digit < N_CHARS)
 537                         {
 538                           c = 8 * c + oct_digit;
 539                           ++i;
 540                         }
 541                       else if (!posix_pedantic)
 542                         {
 543                           /* Any octal number larger than 0377 won't
 544                              fit in 8 bits.  So we stop when adding the
 545                              next digit would put us over the limit and
 546                              give a warning about the ambiguity.  POSIX
 547                              isn't clear on this, but one person has said
 548                              that in his interpretation, POSIX says tr
 549                              can't even give a warning.  */
 550                           error (0, 0, _("warning: the ambiguous octal escape \
 551 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 552                                  s[i], s[i + 1], s[i + 2],
 553                                  s[i], s[i + 1], s[i + 2]);
 554                         }
 555                     }
 556                 }
 557               break;
 558             case '\0':
 559               error (0, 0, _("invalid backslash escape at end of string"));
 560               return 1;
 561
 562             default:
 563               if (posix_pedantic)
 564                 {
 565                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 566                   return 1;
 567                 }
 568               else
 569                 {
 570                   c = s[i + 1];
 571                   es->escaped[j] = 1;
 572                 }
 573             }
 574           ++i;
 575           es->s[j++] = c;
 576           break;
 577         default:
 578           es->s[j++] = s[i];
 579           break;
 580         }
 581     }
 582   es->len = j;
 583   return 0;
 584 }
 585
 586 /* If CLASS_STR is a valid character class string, return its index
 587    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 588
 589 static enum Char_class
 590 look_up_char_class (const unsigned char *class_str, size_t len)
 591 {
 592   unsigned int i;
 593
 594   for (i = 0; i < N_CHAR_CLASSES; i++)
 595     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 596         && strlen (char_class_name[i]) == len)
 597       return (enum Char_class) i;
 598   return CC_NO_CLASS;
 599 }
 600
 601 /* Return a newly allocated string with a printable version of C.
 602    This function is used solely for formatting error messages.  */
 603
 604 static char *
 605 make_printable_char (unsigned int c)
 606 {
 607   char *buf = xmalloc (5);
 608
 609   assert (c < N_CHARS);
 610   if (ISPRINT (c))
 611     {
 612       buf[0] = c;
 613       buf[1] = '\0';
 614     }
 615   else
 616     {
 617       sprintf (buf, "\\%03o", c);
 618     }
 619   return buf;
 620 }
 621
 622 /* Return a newly allocated copy of S which is suitable for printing.
 623    LEN is the number of characters in S.  Most non-printing
 624    (isprint) characters are represented by a backslash followed by
 625    3 octal digits.  However, the characters represented by \c escapes
 626    where c is one of [abfnrtv] are represented by their 2-character \c
 627    sequences.  This function is used solely for printing error messages.  */
 628
 629 static char *
 630 make_printable_str (const unsigned char *s, size_t len)
 631 {
 632   /* Worst case is that every character expands to a backslash
 633      followed by a 3-character octal escape sequence.  */
 634   char *printable_buf = xmalloc (4 * len + 1);
 635   char *p = printable_buf;
 636   size_t i;
 637
 638   for (i = 0; i < len; i++)
 639     {
 640       char buf[5];
 641       char *tmp = NULL;
 642
 643       switch (s[i])
 644         {
 645         case '\\':
 646           tmp = "\\";
 647           break;
 648         case '\007':
 649           tmp = "\\a";
 650           break;
 651         case '\b':
 652           tmp = "\\b";
 653           break;
 654         case '\f':
 655           tmp = "\\f";
 656           break;
 657         case '\n':
 658           tmp = "\\n";
 659           break;
 660         case '\r':
 661           tmp = "\\r";
 662           break;
 663         case '\t':
 664           tmp = "\\t";
 665           break;
 666         case '\v':
 667           tmp = "\\v";
 668           break;
 669         default:
 670           if (ISPRINT (s[i]))
 671             {
 672               buf[0] = s[i];
 673               buf[1] = '\0';
 674             }
 675           else
 676             sprintf (buf, "\\%03o", s[i]);
 677           tmp = buf;
 678           break;
 679         }
 680       p = stpcpy (p, tmp);
 681     }
 682   return printable_buf;
 683 }
 684
 685 /* Append a newly allocated structure representing a
 686    character C to the specification list LIST.  */
 687
 688 static void
 689 append_normal_char (struct Spec_list *list, unsigned int c)
 690 {
 691   struct List_element *new;
 692
 693   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 694   new->next = NULL;
 695   new->type = RE_NORMAL_CHAR;
 696   new->u.normal_char = c;
 697   assert (list->tail);
 698   list->tail->next = new;
 699   list->tail = new;
 700 }
 701
 702 /* Append a newly allocated structure representing the range
 703    of characters from FIRST to LAST to the specification list LIST.
 704    Return nonzero if LAST precedes FIRST in the collating sequence,
 705    zero otherwise.  This means that '[c-c]' is acceptable.  */
 706
 707 static int
 708 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 709 {
 710   struct List_element *new;
 711
 712   if (ORD (first) > ORD (last))
 713     {
 714       char *tmp1 = make_printable_char (first);
 715       char *tmp2 = make_printable_char (last);
 716
 717       error (0, 0,
 718        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 719              tmp1, tmp2);
 720       free (tmp1);
 721       free (tmp2);
 722       return 1;
 723     }
 724   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 725   new->next = NULL;
 726   new->type = RE_RANGE;
 727   new->u.range.first_char = first;
 728   new->u.range.last_char = last;
 729   assert (list->tail);
 730   list->tail->next = new;
 731   list->tail = new;
 732   return 0;
 733 }
 734
 735 /* If CHAR_CLASS_STR is a valid character class string, append a
 736    newly allocated structure representing that character class to the end
 737    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 738    a valid string return nonzero.  */
 739
 740 static int
 741 append_char_class (struct Spec_list *list,
 742                    const unsigned char *char_class_str, size_t len)
 743 {
 744   enum Char_class char_class;
 745   struct List_element *new;
 746
 747   char_class = look_up_char_class (char_class_str, len);
 748   if (char_class == CC_NO_CLASS)
 749     return 1;
 750   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 751   new->next = NULL;
 752   new->type = RE_CHAR_CLASS;
 753   new->u.char_class = char_class;
 754   assert (list->tail);
 755   list->tail->next = new;
 756   list->tail = new;
 757   return 0;
 758 }
 759
 760 /* Append a newly allocated structure representing a [c*n]
 761    repeated character construct to the specification list LIST.
 762    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 763    is a non-negative repeat count.  */
 764
 765 static void
 766 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 767                       size_t repeat_count)
 768 {
 769   struct List_element *new;
 770
 771   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 772   new->next = NULL;
 773   new->type = RE_REPEATED_CHAR;
 774   new->u.repeated_char.the_repeated_char = the_char;
 775   new->u.repeated_char.repeat_count = repeat_count;
 776   assert (list->tail);
 777   list->tail->next = new;
 778   list->tail = new;
 779 }
 780
 781 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 782    the length of that string, LEN, if LEN is exactly one, append
 783    a newly allocated structure representing the specified
 784    equivalence class to the specification list, LIST and return zero.
 785    If LEN is not 1, return nonzero.  */
 786
 787 static int
 788 append_equiv_class (struct Spec_list *list,
 789                     const unsigned char *equiv_class_str, size_t len)
 790 {
 791   struct List_element *new;
 792
 793   if (len != 1)
 794     return 1;
 795   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 796   new->next = NULL;
 797   new->type = RE_EQUIV_CLASS;
 798   new->u.equiv_code = *equiv_class_str;
 799   assert (list->tail);
 800   list->tail->next = new;
 801   list->tail = new;
 802   return 0;
 803 }
 804
 805 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 806    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 807    NUL bytes, and is *not* NUL-terminated.  */
 808
 809 static unsigned char *
 810 substr (const unsigned char *p, size_t first_idx, size_t last_idx)
 811 {
 812   size_t len;
 813   unsigned char *tmp;
 814
 815   assert (first_idx <= last_idx);
 816   len = last_idx - first_idx + 1;
 817   tmp = (unsigned char *) xmalloc (len);
 818
 819   assert (first_idx <= last_idx);
 820   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 821   memcpy (tmp, p + first_idx, len);
 822   return tmp;
 823 }
 824
 825 /* Search forward starting at START_IDX for the 2-char sequence
 826    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 827    a sequence is found, set *RESULT_IDX to the index of the first
 828    character and return nonzero. Otherwise return zero.  P may contain
 829    zero bytes.  */
 830
 831 static int
 832 find_closing_delim (const struct E_string *es, size_t start_idx,
 833                     unsigned int pre_bracket_char, size_t *result_idx)
 834 {
 835   size_t i;
 836
 837   for (i = start_idx; i < es->len - 1; i++)
 838     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 839         && !es->escaped[i] && !es->escaped[i + 1])
 840       {
 841         *result_idx = i;
 842         return 1;
 843       }
 844   return 0;
 845 }
 846
 847 /* Convert a string S with explicit length LEN, possibly
 848    containing embedded zero bytes, to a long integer value.
 849    If the string represents a negative value, a value larger
 850    than LONG_MAX, or if all LEN characters do not represent a
 851    valid integer, return nonzero and do not modify *VAL.
 852    Otherwise, return zero and set *VAL to the converted value.  */
 853
 854 static int
 855 non_neg_strtol (const unsigned char *s, size_t len, size_t *val)
 856 {
 857   size_t i;
 858   unsigned long sum = 0;
 859   unsigned int base;
 860
 861   if (len <= 0)
 862     return 1;
 863   if (s[0] == '0')
 864     base = 8;
 865   else if (ISDIGIT (s[0]))
 866     base = 10;
 867   else
 868     return 1;
 869
 870   for (i = 0; i < len; i++)
 871     {
 872       unsigned int c;
 873
 874       if (s[i] < '0')
 875         return 1;
 876
 877       c = s[i] - '0';
 878       if (c >= base)
 879         return 1;
 880
 881       if (sum > (LONG_MAX - c) / base)
 882         return 1;
 883       sum = sum * base + c;
 884     }
 885   *val = sum;
 886   return 0;
 887 }
 888
 889 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 890    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 891    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 892    and return zero. If the second character following
 893    the opening bracket is not `*' or if no closing bracket can be
 894    found, return -1.  If a closing bracket is found and the
 895    second char is `*', but the string between the `*' and `]' isn't
 896    empty, an octal number, or a decimal number, print an error message
 897    and return -2.  */
 898
 899 static int
 900 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 901                        unsigned int *char_to_repeat, size_t *repeat_count,
 902                        size_t *closing_bracket_idx)
 903 {
 904   size_t i;
 905
 906   assert (start_idx + 1 < es->len);
 907   if (!ES_MATCH (es, start_idx + 1, '*'))
 908     return -1;
 909
 910   for (i = start_idx + 2; i < es->len; i++)
 911     {
 912       if (ES_MATCH (es, i, ']'))
 913         {
 914           const unsigned char *digit_str;
 915           size_t digit_str_len = i - start_idx - 2;
 916
 917           *char_to_repeat = es->s[start_idx];
 918           if (digit_str_len == 0)
 919             {
 920               /* We've matched [c*] -- no explicit repeat count.  */
 921               *repeat_count = 0;
 922               *closing_bracket_idx = i;
 923               return 0;
 924             }
 925
 926           /* Here, we have found [c*s] where s should be a string
 927              of octal or decimal digits.  */
 928           digit_str = &es->s[start_idx + 2];
 929           if (non_neg_strtol (digit_str, digit_str_len, repeat_count)
 930               || *repeat_count > BEGIN_STATE)
 931             {
 932               char *tmp = make_printable_str (digit_str, digit_str_len);
 933               error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 934                      tmp);
 935               free (tmp);
 936               return -2;
 937             }
 938           *closing_bracket_idx = i;
 939           return 0;
 940         }
 941     }
 942   return -1;                    /* No bracket found.  */
 943 }
 944
 945 /* Return nonzero if the string at ES->s[IDX] matches the regular
 946    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 947    the `]' must not be escaped.  */
 948
 949 static int
 950 star_digits_closebracket (const struct E_string *es, size_t idx)
 951 {
 952   size_t i;
 953
 954   if (!ES_MATCH (es, idx, '*'))
 955     return 0;
 956
 957   for (i = idx + 1; i < es->len; i++)
 958     {
 959       if (!ISDIGIT (es->s[i]))
 960         {
 961           if (ES_MATCH (es, i, ']'))
 962             return 1;
 963           return 0;
 964         }
 965     }
 966   return 0;
 967 }
 968
 969 /* Convert string UNESACPED_STRING (which has been preprocessed to
 970    convert backslash-escape sequences) of length LEN characters into
 971    a linked list of the following 5 types of constructs:
 972       - [:str:] Character class where `str' is one of the 12 valid strings.
 973       - [=c=] Equivalence class where `c' is any single character.
 974       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 975           However, if `n' is present, it must be a non-negative octal or
 976           decimal integer.
 977       - r-s Range of characters from `r' to `s'.  The second endpoint must
 978           not precede the first in the current collating sequence.
 979       - c Any other character is interpreted as itself.  */
 980
 981 static int
 982 build_spec_list (const struct E_string *es, struct Spec_list *result)
 983 {
 984   const unsigned char *p;
 985   size_t i;
 986
 987   p = es->s;
 988
 989   /* The main for-loop below recognizes the 4 multi-character constructs.
 990      A character that matches (in its context) none of the multi-character
 991      constructs is classified as `normal'.  Since all multi-character
 992      constructs have at least 3 characters, any strings of length 2 or
 993      less are composed solely of normal characters.  Hence, the index of
 994      the outer for-loop runs only as far as LEN-2.  */
 995
 996   for (i = 0; i + 2 < es->len; /* empty */)
 997     {
 998       if (ES_MATCH (es, i, '['))
 999         {
1000           int matched_multi_char_construct;
1001           size_t closing_bracket_idx;
1002           unsigned int char_to_repeat;
1003           size_t repeat_count;
1004           int err;
1005
1006           matched_multi_char_construct = 1;
1007           if (ES_MATCH (es, i + 1, ':')
1008               || ES_MATCH (es, i + 1, '='))
1009             {
1010               size_t closing_delim_idx;
1011               int found;
1012
1013               found = find_closing_delim (es, i + 2, p[i + 1],
1014                                           &closing_delim_idx);
1015               if (found)
1016                 {
1017                   int parse_failed;
1018                   unsigned char *opnd_str = substr (p, i + 2,
1019                                                     closing_delim_idx - 1);
1020                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
1021
1022                   if (p[i + 1] == ':')
1023                     {
1024                       parse_failed = append_char_class (result, opnd_str,
1025                                                         opnd_str_len);
1026
1027                       /* FIXME: big comment.  */
1028                       if (parse_failed)
1029                         {
1030                           if (star_digits_closebracket (es, i + 2))
1031                             {
1032                               free (opnd_str);
1033                               goto try_bracketed_repeat;
1034                             }
1035                           else
1036                             {
1037                               char *tmp = make_printable_str (opnd_str,
1038                                                               opnd_str_len);
1039                               error (0, 0, _("invalid character class `%s'"),
1040                                      tmp);
1041                               free (tmp);
1042                               return 1;
1043                             }
1044                         }
1045                     }
1046                   else
1047                     {
1048                       parse_failed = append_equiv_class (result, opnd_str,
1049                                                          opnd_str_len);
1050
1051                       /* FIXME: big comment.  */
1052                       if (parse_failed)
1053                         {
1054                           if (star_digits_closebracket (es, i + 2))
1055                             {
1056                               free (opnd_str);
1057                               goto try_bracketed_repeat;
1058                             }
1059                           else
1060                             {
1061                               char *tmp = make_printable_str (opnd_str,
1062                                                               opnd_str_len);
1063                               error (0, 0,
1064                _("%s: equivalence class operand must be a single character"),
1065                                      tmp);
1066                               free (tmp);
1067                               return 1;
1068                             }
1069                         }
1070                     }
1071                   free (opnd_str);
1072
1073                   /* Return nonzero if append_*_class reports a problem.  */
1074                   if (parse_failed)
1075                     return 1;
1076                   else
1077                     i = closing_delim_idx + 2;
1078                   continue;
1079                 }
1080               /* Else fall through.  This could be [:*] or [=*].  */
1081             }
1082
1083         try_bracketed_repeat:
1084
1085           /* Determine whether this is a bracketed repeat range
1086              matching the RE \[.\*(dec_or_oct_number)?\].  */
1087           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1088                                        &repeat_count,
1089                                        &closing_bracket_idx);
1090           if (err == 0)
1091             {
1092               append_repeated_char (result, char_to_repeat, repeat_count);
1093               i = closing_bracket_idx + 1;
1094             }
1095           else if (err == -1)
1096             {
1097               matched_multi_char_construct = 0;
1098             }
1099           else
1100             {
1101               /* Found a string that looked like [c*n] but the
1102                  numeric part was invalid.  */
1103               return 1;
1104             }
1105
1106           if (matched_multi_char_construct)
1107             continue;
1108
1109           /* We reach this point if P does not match [:str:], [=c=],
1110              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1111              (from `[' to `c').  */
1112         }
1113
1114       /* Look ahead one char for ranges like a-z.  */
1115       if (ES_MATCH (es, i + 1, '-'))
1116         {
1117           if (append_range (result, p[i], p[i + 2]))
1118             return 1;
1119           i += 3;
1120         }
1121       else
1122         {
1123           append_normal_char (result, p[i]);
1124           ++i;
1125         }
1126     }
1127
1128   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1129   for (; i < es->len; i++)
1130     append_normal_char (result, p[i]);
1131
1132   return 0;
1133 }
1134
1135 /* Given a Spec_list S (with its saved state implicit in the values
1136    of its members `tail' and `state'), return the next single character
1137    in the expansion of S's constructs.  If the last character of S was
1138    returned on the previous call or if S was empty, this function
1139    returns -1.  For example, successive calls to get_next where S
1140    represents the spec-string 'a-d[y*3]' will return the sequence
1141    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1142    which the returned character comes is [:upper:] or [:lower:], the
1143    parameter CLASS is given a value to indicate which it was.  Otherwise
1144    CLASS is set to UL_NONE.  This value is used only when constructing
1145    the translation table to verify that any occurrences of upper and
1146    lower class constructs in the spec-strings appear in the same relative
1147    positions.  */
1148
1149 static int
1150 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1151 {
1152   struct List_element *p;
1153   int return_val;
1154   int i;
1155
1156   if (class)
1157     *class = UL_NONE;
1158
1159   if (s->state == BEGIN_STATE)
1160     {
1161       s->tail = s->head->next;
1162       s->state = NEW_ELEMENT;
1163     }
1164
1165   p = s->tail;
1166   if (p == NULL)
1167     return -1;
1168
1169   switch (p->type)
1170     {
1171     case RE_NORMAL_CHAR:
1172       return_val = p->u.normal_char;
1173       s->state = NEW_ELEMENT;
1174       s->tail = p->next;
1175       break;
1176
1177     case RE_RANGE:
1178       if (s->state == NEW_ELEMENT)
1179         s->state = ORD (p->u.range.first_char);
1180       else
1181         ++(s->state);
1182       return_val = CHR (s->state);
1183       if (s->state == ORD (p->u.range.last_char))
1184         {
1185           s->tail = p->next;
1186           s->state = NEW_ELEMENT;
1187         }
1188       break;
1189
1190     case RE_CHAR_CLASS:
1191       if (class)
1192         {
1193           int upper_or_lower;
1194           switch (p->u.char_class)
1195             {
1196             case CC_LOWER:
1197               *class = UL_LOWER;
1198               upper_or_lower = 1;
1199               break;
1200             case CC_UPPER:
1201               *class = UL_UPPER;
1202               upper_or_lower = 1;
1203               break;
1204             default:
1205               upper_or_lower = 0;
1206               break;
1207             }
1208
1209           if (upper_or_lower)
1210             {
1211               s->tail = p->next;
1212               s->state = NEW_ELEMENT;
1213               return_val = 0;
1214               break;
1215             }
1216         }
1217
1218       if (s->state == NEW_ELEMENT)
1219         {
1220           for (i = 0; i < N_CHARS; i++)
1221             if (is_char_class_member (p->u.char_class, i))
1222               break;
1223           assert (i < N_CHARS);
1224           s->state = i;
1225         }
1226       assert (is_char_class_member (p->u.char_class, s->state));
1227       return_val = CHR (s->state);
1228       for (i = s->state + 1; i < N_CHARS; i++)
1229         if (is_char_class_member (p->u.char_class, i))
1230           break;
1231       if (i < N_CHARS)
1232         s->state = i;
1233       else
1234         {
1235           s->tail = p->next;
1236           s->state = NEW_ELEMENT;
1237         }
1238       break;
1239
1240     case RE_EQUIV_CLASS:
1241       /* FIXME: this assumes that each character is alone in its own
1242          equivalence class (which appears to be correct for my
1243          LC_COLLATE.  But I don't know of any function that allows
1244          one to determine a character's equivalence class.  */
1245
1246       return_val = p->u.equiv_code;
1247       s->state = NEW_ELEMENT;
1248       s->tail = p->next;
1249       break;
1250
1251     case RE_REPEATED_CHAR:
1252       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1253       if (p->u.repeated_char.repeat_count == 0)
1254         {
1255           s->tail = p->next;
1256           s->state = NEW_ELEMENT;
1257           return_val = get_next (s, class);
1258         }
1259       else
1260         {
1261           if (s->state == NEW_ELEMENT)
1262             {
1263               s->state = 0;
1264             }
1265           ++(s->state);
1266           return_val = p->u.repeated_char.the_repeated_char;
1267           if (p->u.repeated_char.repeat_count > 0
1268               && s->state == p->u.repeated_char.repeat_count)
1269             {
1270               s->tail = p->next;
1271               s->state = NEW_ELEMENT;
1272             }
1273         }
1274       break;
1275
1276     case RE_NO_TYPE:
1277       abort ();
1278       break;
1279
1280     default:
1281       abort ();
1282       break;
1283     }
1284
1285   return return_val;
1286 }
1287
1288 /* This is a minor kludge.  This function is called from
1289    get_spec_stats to determine the cardinality of a set derived
1290    from a complemented string.  It's a kludge in that some of the
1291    same operations are (duplicated) performed in set_initialize.  */
1292
1293 static int
1294 card_of_complement (struct Spec_list *s)
1295 {
1296   int c;
1297   int cardinality = N_CHARS;
1298   SET_TYPE in_set[N_CHARS];
1299
1300   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1301   s->state = BEGIN_STATE;
1302   while ((c = get_next (s, NULL)) != -1)
1303     if (!in_set[c]++)
1304       --cardinality;
1305   return cardinality;
1306 }
1307
1308 /* Gather statistics about the spec-list S in preparation for the tests
1309    in validate that determine the consistency of the specs.  This function
1310    is called at most twice; once for string1, and again for any string2.
1311    LEN_S1 < 0 indicates that this is the first call and that S represents
1312    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1313    constructs in string1, and we can use its value to resolve any
1314    indefinite repeat construct in S (which represents string2).  Hence,
1315    this function has the side-effect that it converts a valid [c*]
1316    construct in string2 to [c*n] where n is large enough (or 0) to give
1317    string2 the same length as string1.  For example, with the command
1318    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1319    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1320
1321 static void
1322 get_spec_stats (struct Spec_list *s)
1323 {
1324   struct List_element *p;
1325   int len = 0;
1326
1327   s->n_indefinite_repeats = 0;
1328   s->has_equiv_class = 0;
1329   s->has_restricted_char_class = 0;
1330   s->has_char_class = 0;
1331   for (p = s->head->next; p; p = p->next)
1332     {
1333       switch (p->type)
1334         {
1335           int i;
1336         case RE_NORMAL_CHAR:
1337           ++len;
1338           break;
1339
1340         case RE_RANGE:
1341           assert (p->u.range.last_char >= p->u.range.first_char);
1342           len += p->u.range.last_char - p->u.range.first_char + 1;
1343           break;
1344
1345         case RE_CHAR_CLASS:
1346           s->has_char_class = 1;
1347           for (i = 0; i < N_CHARS; i++)
1348             if (is_char_class_member (p->u.char_class, i))
1349               ++len;
1350           switch (p->u.char_class)
1351             {
1352             case CC_UPPER:
1353             case CC_LOWER:
1354               break;
1355             default:
1356               s->has_restricted_char_class = 1;
1357               break;
1358             }
1359           break;
1360
1361         case RE_EQUIV_CLASS:
1362           for (i = 0; i < N_CHARS; i++)
1363             if (is_equiv_class_member (p->u.equiv_code, i))
1364               ++len;
1365           s->has_equiv_class = 1;
1366           break;
1367
1368         case RE_REPEATED_CHAR:
1369           if (p->u.repeated_char.repeat_count > 0)
1370             len += p->u.repeated_char.repeat_count;
1371           else if (p->u.repeated_char.repeat_count == 0)
1372             {
1373               s->indefinite_repeat_element = p;
1374               ++(s->n_indefinite_repeats);
1375             }
1376           break;
1377
1378         case RE_NO_TYPE:
1379           assert (0);
1380           break;
1381         }
1382     }
1383
1384   s->length = len;
1385 }
1386
1387 static void
1388 get_s1_spec_stats (struct Spec_list *s1)
1389 {
1390   get_spec_stats (s1);
1391   if (complement)
1392     s1->length = card_of_complement (s1);
1393 }
1394
1395 static void
1396 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1397 {
1398   get_spec_stats (s2);
1399   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1400     {
1401       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1402         len_s1 - s2->length;
1403       s2->length = len_s1;
1404     }
1405 }
1406
1407 static void
1408 spec_init (struct Spec_list *spec_list)
1409 {
1410   spec_list->head = spec_list->tail =
1411     (struct List_element *) xmalloc (sizeof (struct List_element));
1412   spec_list->head->next = NULL;
1413 }
1414
1415 /* This function makes two passes over the argument string S.  The first
1416    one converts all \c and \ddd escapes to their one-byte representations.
1417    The second constructs a linked specification list, SPEC_LIST, of the
1418    characters and constructs that comprise the argument string.  If either
1419    of these passes detects an error, this function returns nonzero.  */
1420
1421 static int
1422 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1423 {
1424   struct E_string es;
1425   int fail;
1426
1427   fail = unquote (s, &es);
1428   if (!fail)
1429     fail = build_spec_list (&es, spec_list);
1430   es_free (&es);
1431   return fail;
1432 }
1433
1434 /* Given two specification lists, S1 and S2, and assuming that
1435    S1->length > S2->length, append a single [c*n] element to S2 where c
1436    is the last character in the expansion of S2 and n is the difference
1437    between the two lengths.
1438    Upon successful completion, S2->length is set to S1->length.  The only
1439    way this function can fail to make S2 as long as S1 is when S2 has
1440    zero-length, since in that case, there is no last character to repeat.
1441    So S2->length is required to be at least 1.
1442
1443    Providing this functionality allows the user to do some pretty
1444    non-BSD (and non-portable) things:  For example, the command
1445        tr -cs '[:upper:]0-9' '[:lower:]'
1446    is almost guaranteed to give results that depend on your collating
1447    sequence.  */
1448
1449 static void
1450 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1451 {
1452   struct List_element *p;
1453   int char_to_repeat;
1454   int i;
1455
1456   assert (translating);
1457   assert (s1->length > s2->length);
1458   assert (s2->length > 0);
1459
1460   p = s2->tail;
1461   switch (p->type)
1462     {
1463     case RE_NORMAL_CHAR:
1464       char_to_repeat = p->u.normal_char;
1465       break;
1466     case RE_RANGE:
1467       char_to_repeat = p->u.range.last_char;
1468       break;
1469     case RE_CHAR_CLASS:
1470       for (i = N_CHARS; i >= 0; i--)
1471         if (is_char_class_member (p->u.char_class, i))
1472           break;
1473       assert (i >= 0);
1474       char_to_repeat = CHR (i);
1475       break;
1476
1477     case RE_REPEATED_CHAR:
1478       char_to_repeat = p->u.repeated_char.the_repeated_char;
1479       break;
1480
1481     case RE_EQUIV_CLASS:
1482       /* This shouldn't happen, because validate exits with an error
1483          if it finds an equiv class in string2 when translating.  */
1484       abort ();
1485       break;
1486
1487     case RE_NO_TYPE:
1488       abort ();
1489       break;
1490
1491     default:
1492       abort ();
1493       break;
1494     }
1495
1496   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1497   s2->length = s1->length;
1498 }
1499
1500 /* Return non-zero if S is a non-empty list in which exactly one
1501    character (but potentially, many instances of it) appears.
1502    E.g.  [X*] or xxxxxxxx.  */
1503
1504 static int
1505 homogeneous_spec_list (struct Spec_list *s)
1506 {
1507   int b, c;
1508
1509   s->state = BEGIN_STATE;
1510
1511   if ((b = get_next (s, NULL)) == -1)
1512     return 0;
1513
1514   while ((c = get_next (s, NULL)) != -1)
1515     if (c != b)
1516       return 0;
1517
1518   return 1;
1519 }
1520
1521 /* Die with an error message if S1 and S2 describe strings that
1522    are not valid with the given command line switches.
1523    A side effect of this function is that if a valid [c*] or
1524    [c*0] construct appears in string2, it is converted to [c*n]
1525    with a value for n that makes s2->length == s1->length.  By
1526    the same token, if the --truncate-set1 option is not
1527    given, S2 may be extended.  */
1528
1529 static void
1530 validate (struct Spec_list *s1, struct Spec_list *s2)
1531 {
1532   get_s1_spec_stats (s1);
1533   if (s1->n_indefinite_repeats > 0)
1534     {
1535       error (EXIT_FAILURE, 0,
1536              _("the [c*] repeat construct may not appear in string1"));
1537     }
1538
1539   if (s2)
1540     {
1541       get_s2_spec_stats (s2, s1->length);
1542
1543       if (s2->n_indefinite_repeats > 1)
1544         {
1545           error (EXIT_FAILURE, 0,
1546                  _("only one [c*] repeat construct may appear in string2"));
1547         }
1548
1549       if (translating)
1550         {
1551           if (s2->has_equiv_class)
1552             {
1553               error (EXIT_FAILURE, 0,
1554                      _("[=c=] expressions may not appear in string2 \
1555 when translating"));
1556             }
1557
1558           if (s1->length > s2->length)
1559             {
1560               if (!truncate_set1)
1561                 {
1562                   /* string2 must be non-empty unless --truncate-set1 is
1563                      given or string1 is empty.  */
1564
1565                   if (s2->length == 0)
1566                     error (EXIT_FAILURE, 0,
1567                      _("when not truncating set1, string2 must be non-empty"));
1568                   string2_extend (s1, s2);
1569                 }
1570             }
1571
1572           if (complement && s1->has_char_class
1573               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1574             {
1575               error (EXIT_FAILURE, 0,
1576                      _("when translating with complemented character classes,\
1577 \nstring2 must map all characters in the domain to one"));
1578             }
1579
1580           if (s2->has_restricted_char_class)
1581             {
1582               error (EXIT_FAILURE, 0,
1583                      _("when translating, the only character classes that may \
1584 appear in\nstring2 are `upper' and `lower'"));
1585             }
1586         }
1587       else
1588         /* Not translating.  */
1589         {
1590           if (s2->n_indefinite_repeats > 0)
1591             error (EXIT_FAILURE, 0,
1592                    _("the [c*] construct may appear in string2 only \
1593 when translating"));
1594         }
1595     }
1596 }
1597
1598 /* Read buffers of SIZE bytes via the function READER (if READER is
1599    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1600    read_and_delete or read_and_xlate.  After each buffer is read, it is
1601    processed and written to stdout.  The buffers are processed so that
1602    multiple consecutive occurrences of the same character in the input
1603    stream are replaced by a single occurrence of that character if the
1604    character is in the squeeze set.  */
1605
1606 static void
1607 squeeze_filter (unsigned char *buf, long int size, PFI reader)
1608 {
1609   unsigned int char_to_squeeze = NOT_A_CHAR;
1610   int i = 0;
1611   int nr = 0;
1612
1613   for (;;)
1614     {
1615       int begin;
1616
1617       if (i >= nr)
1618         {
1619           if (reader == NULL)
1620             nr = safe_read (0, (char *) buf, size);
1621           else
1622             nr = (*reader) (buf, size, NULL);
1623
1624           if (nr < 0)
1625             error (EXIT_FAILURE, errno, _("read error"));
1626           if (nr == 0)
1627             break;
1628           i = 0;
1629         }
1630
1631       begin = i;
1632
1633       if (char_to_squeeze == NOT_A_CHAR)
1634         {
1635           int out_len;
1636           /* Here, by being a little tricky, we can get a significant
1637              performance increase in most cases when the input is
1638              reasonably large.  Since tr will modify the input only
1639              if two consecutive (and identical) input characters are
1640              in the squeeze set, we can step by two through the data
1641              when searching for a character in the squeeze set.  This
1642              means there may be a little more work in a few cases and
1643              perhaps twice as much work in the worst cases where most
1644              of the input is removed by squeezing repeats.  But most
1645              uses of this functionality seem to remove less than 20-30%
1646              of the input.  */
1647           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1648             ;                   /* empty */
1649
1650           /* There is a special case when i == nr and we've just
1651              skipped a character (the last one in buf) that is in
1652              the squeeze set.  */
1653           if (i == nr && in_squeeze_set[buf[i - 1]])
1654             --i;
1655
1656           if (i >= nr)
1657             out_len = nr - begin;
1658           else
1659             {
1660               char_to_squeeze = buf[i];
1661               /* We're about to output buf[begin..i].  */
1662               out_len = i - begin + 1;
1663
1664               /* But since we stepped by 2 in the loop above,
1665                  out_len may be one too large.  */
1666               if (i > 0 && buf[i - 1] == char_to_squeeze)
1667                 --out_len;
1668
1669               /* Advance i to the index of first character to be
1670                  considered when looking for a char different from
1671                  char_to_squeeze.  */
1672               ++i;
1673             }
1674           if (out_len > 0
1675               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1676             error (EXIT_FAILURE, errno, _("write error"));
1677         }
1678
1679       if (char_to_squeeze != NOT_A_CHAR)
1680         {
1681           /* Advance i to index of first char != char_to_squeeze
1682              (or to nr if all the rest of the characters in this
1683              buffer are the same as char_to_squeeze).  */
1684           for (; i < nr && buf[i] == char_to_squeeze; i++)
1685             ;                   /* empty */
1686           if (i < nr)
1687             char_to_squeeze = NOT_A_CHAR;
1688           /* If (i >= nr) we've squeezed the last character in this buffer.
1689              So now we have to read a new buffer and continue comparing
1690              characters against char_to_squeeze.  */
1691         }
1692     }
1693 }
1694
1695 /* Read buffers of SIZE bytes from stdin until one is found that
1696    contains at least one character not in the delete set.  Store
1697    in the array BUF, all characters from that buffer that are not
1698    in the delete set, and return the number of characters saved
1699    or 0 upon EOF.  */
1700
1701 static long
1702 read_and_delete (unsigned char *buf, long int size, PFI not_used)
1703 {
1704   long n_saved;
1705   static int hit_eof = 0;
1706
1707   assert (not_used == NULL);
1708   assert (size > 0);
1709
1710   if (hit_eof)
1711     return 0;
1712
1713   /* This enclosing do-while loop is to make sure that
1714      we don't return zero (indicating EOF) when we've
1715      just deleted all the characters in a buffer.  */
1716   do
1717     {
1718       int i;
1719       int nr = safe_read (0, (char *) buf, size);
1720
1721       if (nr < 0)
1722         error (EXIT_FAILURE, errno, _("read error"));
1723       if (nr == 0)
1724         {
1725           hit_eof = 1;
1726           return 0;
1727         }
1728
1729       /* This first loop may be a waste of code, but gives much
1730          better performance when no characters are deleted in
1731          the beginning of a buffer.  It just avoids the copying
1732          of buf[i] into buf[n_saved] when it would be a NOP.  */
1733
1734       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1735         /* empty */ ;
1736       n_saved = i;
1737
1738       for (++i; i < nr; i++)
1739         if (!in_delete_set[buf[i]])
1740           buf[n_saved++] = buf[i];
1741     }
1742   while (n_saved == 0);
1743
1744   return n_saved;
1745 }
1746
1747 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1748    perform the in-place and one-to-one mapping specified by the global
1749    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1750
1751 static long
1752 read_and_xlate (unsigned char *buf, long int size, PFI not_used)
1753 {
1754   long chars_read = 0;
1755   static int hit_eof = 0;
1756   int i;
1757
1758   assert (not_used == NULL);
1759   assert (size > 0);
1760
1761   if (hit_eof)
1762     return 0;
1763
1764   chars_read = safe_read (0, (char *) buf, size);
1765   if (chars_read < 0)
1766     error (EXIT_FAILURE, errno, _("read error"));
1767   if (chars_read == 0)
1768     {
1769       hit_eof = 1;
1770       return 0;
1771     }
1772
1773   for (i = 0; i < chars_read; i++)
1774     buf[i] = xlate[buf[i]];
1775
1776   return chars_read;
1777 }
1778
1779 /* Initialize a boolean membership set IN_SET with the character
1780    values obtained by traversing the linked list of constructs S
1781    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1782    nonzero the resulting set is complemented.  */
1783
1784 static void
1785 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1786 {
1787   int c;
1788   int i;
1789
1790   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1791   s->state = BEGIN_STATE;
1792   while ((c = get_next (s, NULL)) != -1)
1793     in_set[c] = 1;
1794   if (complement_this_set)
1795     for (i = 0; i < N_CHARS; i++)
1796       in_set[i] = (!in_set[i]);
1797 }
1798
1799 int
1800 main (int argc, char **argv)
1801 {
1802   int c;
1803   int non_option_args;
1804   struct Spec_list buf1, buf2;
1805   struct Spec_list *s1 = &buf1;
1806   struct Spec_list *s2 = &buf2;
1807
1808   program_name = argv[0];
1809   setlocale (LC_ALL, "");
1810   bindtextdomain (PACKAGE, LOCALEDIR);
1811   textdomain (PACKAGE);
1812
1813   while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1814     {
1815       switch (c)
1816         {
1817         case 0:
1818           break;
1819
1820         case 'c':
1821           complement = 1;
1822           break;
1823
1824         case 'd':
1825           delete = 1;
1826           break;
1827
1828         case 's':
1829           squeeze_repeats = 1;
1830           break;
1831
1832         case 't':
1833           truncate_set1 = 1;
1834           break;
1835
1836         default:
1837           usage (2);
1838           break;
1839         }
1840     }
1841
1842   if (show_version)
1843     {
1844       printf ("tr (%s) %s\n", GNU_PACKAGE, VERSION);
1845       exit (EXIT_SUCCESS);
1846     }
1847
1848   if (show_help)
1849     usage (0);
1850
1851   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1852
1853   non_option_args = argc - optind;
1854   translating = (non_option_args == 2 && !delete);
1855
1856   /* Change this test if it is valid to give tr no options and
1857      no args at all.  POSIX doesn't specifically say anything
1858      either way, but it looks like they implied it's invalid
1859      by omission.  If you want to make tr do a slow imitation
1860      of `cat' use `tr a a'.  */
1861   if (non_option_args > 2)
1862     {
1863       error (0, 0, _("too many arguments"));
1864       usage (2);
1865     }
1866
1867   if (!delete && !squeeze_repeats && non_option_args != 2)
1868     error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1869
1870   if (delete && squeeze_repeats && non_option_args != 2)
1871     error (EXIT_FAILURE, 0, _("two strings must be given when both \
1872 deleting and squeezing repeats"));
1873
1874   /* If --delete is given without --squeeze-repeats, then
1875      only one string argument may be specified.  But POSIX
1876      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1877      is set, pretend we never saw string2.  But I think
1878      this deserves a fatal error, so that's the default.  */
1879   if ((delete && !squeeze_repeats) && non_option_args != 1)
1880     {
1881       if (posix_pedantic && non_option_args == 2)
1882         --non_option_args;
1883       else
1884         error (EXIT_FAILURE, 0,
1885                _("only one string may be given when deleting \
1886 without squeezing repeats"));
1887     }
1888
1889   if (squeeze_repeats && non_option_args == 0)
1890     error (EXIT_FAILURE, 0,
1891            _("at least one string must be given when squeezing repeats"));
1892
1893   spec_init (s1);
1894   if (parse_str ((unsigned char *) argv[optind], s1))
1895     exit (EXIT_FAILURE);
1896
1897   if (non_option_args == 2)
1898     {
1899       spec_init (s2);
1900       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1901         exit (EXIT_FAILURE);
1902     }
1903   else
1904     s2 = NULL;
1905
1906   validate (s1, s2);
1907
1908   if (squeeze_repeats && non_option_args == 1)
1909     {
1910       set_initialize (s1, complement, in_squeeze_set);
1911       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1912     }
1913   else if (delete && non_option_args == 1)
1914     {
1915       long nr;
1916
1917       set_initialize (s1, complement, in_delete_set);
1918       do
1919         {
1920           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1921           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1922             error (EXIT_FAILURE, errno, _("write error"));
1923         }
1924       while (nr > 0);
1925     }
1926   else if (squeeze_repeats && delete && non_option_args == 2)
1927     {
1928       set_initialize (s1, complement, in_delete_set);
1929       set_initialize (s2, 0, in_squeeze_set);
1930       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1931     }
1932   else if (translating)
1933     {
1934       if (complement)
1935         {
1936           int i;
1937           SET_TYPE *in_s1 = in_delete_set;
1938
1939           set_initialize (s1, 0, in_s1);
1940           s2->state = BEGIN_STATE;
1941           for (i = 0; i < N_CHARS; i++)
1942             xlate[i] = i;
1943           for (i = 0; i < N_CHARS; i++)
1944             {
1945               if (!in_s1[i])
1946                 {
1947                   int ch = get_next (s2, NULL);
1948                   assert (ch != -1 || truncate_set1);
1949                   if (ch == -1)
1950                     {
1951                       /* This will happen when tr is invoked like e.g.
1952                          tr -cs A-Za-z0-9 '\012'.  */
1953                       break;
1954                     }
1955                   xlate[i] = ch;
1956                 }
1957             }
1958           assert (get_next (s2, NULL) == -1 || truncate_set1);
1959         }
1960       else
1961         {
1962           int c1, c2;
1963           int i;
1964           enum Upper_Lower_class class_s1;
1965           enum Upper_Lower_class class_s2;
1966
1967           for (i = 0; i < N_CHARS; i++)
1968             xlate[i] = i;
1969           s1->state = BEGIN_STATE;
1970           s2->state = BEGIN_STATE;
1971           for (;;)
1972             {
1973               c1 = get_next (s1, &class_s1);
1974               c2 = get_next (s2, &class_s2);
1975               if (!class_ok[(int) class_s1][(int) class_s2])
1976                 error (EXIT_FAILURE, 0,
1977                        _("misaligned [:upper:] and/or [:lower:] construct"));
1978
1979               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1980                 {
1981                   for (i = 0; i < N_CHARS; i++)
1982                     if (ISLOWER (i))
1983                       xlate[i] = toupper (i);
1984                 }
1985               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1986                 {
1987                   for (i = 0; i < N_CHARS; i++)
1988                     if (ISUPPER (i))
1989                       xlate[i] = tolower (i);
1990                 }
1991               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1992                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1993                 {
1994                   /* By default, GNU tr permits the identity mappings: from
1995                      [:upper:] to [:upper:] and [:lower:] to [:lower:].  But
1996                      when POSIXLY_CORRECT is set, those evoke diagnostics.  */
1997                   if (posix_pedantic)
1998                     {
1999                       error (EXIT_FAILURE, 0,
2000                              _("\
2001 invalid identity mapping;  when translating, any [:lower:] or [:upper:]\n\
2002 construct in string1 must be aligned with a corresponding construct\n\
2003 ([:upper:] or [:lower:], respectively) in string2"));
2004                     }
2005                 }
2006               else
2007                 {
2008                   /* The following should have been checked by validate...  */
2009                   if (c1 == -1 || c2 == -1)
2010                     break;
2011                   xlate[c1] = c2;
2012                 }
2013             }
2014           assert (c1 == -1 || truncate_set1);
2015         }
2016       if (squeeze_repeats)
2017         {
2018           set_initialize (s2, 0, in_squeeze_set);
2019           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
2020         }
2021       else
2022         {
2023           long chars_read;
2024
2025           do
2026             {
2027               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2028               if (chars_read > 0
2029                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2030                 error (EXIT_FAILURE, errno, _("write error"));
2031             }
2032           while (chars_read > 0);
2033         }
2034     }
2035
2036   if (fclose (stdout) == EOF)
2037     error (EXIT_FAILURE, errno, _("write error"));
2038
2039   if (close (0) != 0)
2040     error (EXIT_FAILURE, errno, _("standard input"));
2041
2042   exit (EXIT_SUCCESS);
2043 }