src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 95, 96, 1997, 1998 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering */
  19
  20 #include <config.h>
  21
  22 /* Get isblank from GNU libc.  */
  23 #define _GNU_SOURCE
  24
  25 #include <stdio.h>
  26 #define NDEBUG 1
  27 #include <assert.h>
  28 #include <errno.h>
  29 #include <sys/types.h>
  30 #include <getopt.h>
  31
  32 #include "system.h"
  33 #include "error.h"
  34 #include "safe-read.h"
  35
  36 #define N_CHARS (UCHAR_MAX + 1)
  37
  38 /* A pointer to a function that returns an int.  */
  39 typedef int (*PFI) ();
  40
  41 /* Convert from character C to its index in the collating
  42    sequence array.  Just cast to an unsigned int to avoid
  43    problems with sign-extension.  */
  44 #define ORD(c) (unsigned int)(c)
  45
  46 /* The inverse of ORD.  */
  47 #define CHR(i) (unsigned char)(i)
  48
  49 /* The value for Spec_list->state that indicates to
  50    get_next that it should initialize the tail pointer.
  51    Its value should be as large as possible to avoid conflict
  52    a valid value for the state field -- and that may be as
  53    large as any valid repeat_count.  */
  54 #define BEGIN_STATE (INT_MAX - 1)
  55
  56 /* The value for Spec_list->state that indicates to
  57    get_next that the element pointed to by Spec_list->tail is
  58    being considered for the first time on this pass through the
  59    list -- it indicates that get_next should make any necessary
  60    initializations.  */
  61 #define NEW_ELEMENT (BEGIN_STATE + 1)
  62
  63 /* A value distinct from any character that may have been stored in a
  64    buffer as the result of a block-read in the function squeeze_filter.  */
  65 #define NOT_A_CHAR (unsigned int)(-1)
  66
  67 /* The following (but not CC_NO_CLASS) are indices into the array of
  68    valid character class strings.  */
  69 enum Char_class
  70   {
  71     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  72     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  73     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  74     CC_NO_CLASS = 9999
  75   };
  76
  77 /* Character class to which a character (returned by get_next) belonged;
  78    but it is set only if the construct from which the character was obtained
  79    was one of the character classes [:upper:] or [:lower:].  The value
  80    is used only when translating and then, only to make sure that upper
  81    and lower class constructs have the same relative positions in string1
  82    and string2.  */
  83 enum Upper_Lower_class
  84   {
  85     UL_LOWER = 0,
  86     UL_UPPER = 1,
  87     UL_NONE = 2
  88   };
  89
  90 /* A shortcut to ensure that when constructing the translation array,
  91    one of the values returned by paired calls to get_next (from s1 and s2)
  92    is from [:upper:] and the other is from [:lower:], or neither is from
  93    upper or lower.  By default, GNU tr permits the identity mappings: from
  94    [:upper:] to [:upper:] and [:lower:] to [:lower:].  But when
  95    POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
  96    by values of type enum Upper_Lower_class.  */
  97 static int const class_ok[3][3] =
  98 {
  99   {1, 1, 0},
 100   {1, 1, 0},
 101   {0, 0, 1}
 102 };
 103
 104 /* The type of a List_element.  See build_spec_list for more details.  */
 105 enum Range_element_type
 106   {
 107     RE_NO_TYPE = 0,
 108     RE_NORMAL_CHAR,
 109     RE_RANGE,
 110     RE_CHAR_CLASS,
 111     RE_EQUIV_CLASS,
 112     RE_REPEATED_CHAR
 113   };
 114
 115 /* One construct in one of tr's argument strings.
 116    For example, consider the POSIX version of the classic tr command:
 117        tr -cs 'a-zA-Z_' '[\n*]'
 118    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 119    and a single normal character, `_'.  String2 has one construct.  */
 120 struct List_element
 121   {
 122     enum Range_element_type type;
 123     struct List_element *next;
 124     union
 125       {
 126         int normal_char;
 127         struct                  /* unnamed */
 128           {
 129             unsigned int first_char;
 130             unsigned int last_char;
 131           }
 132         range;
 133         enum Char_class char_class;
 134         int equiv_code;
 135         struct                  /* unnamed */
 136           {
 137             unsigned int the_repeated_char;
 138             size_t repeat_count;
 139           }
 140         repeated_char;
 141       }
 142     u;
 143   };
 144
 145 /* Each of tr's argument strings is parsed into a form that is easier
 146    to work with: a linked list of constructs (struct List_element).
 147    Each Spec_list structure also encapsulates various attributes of
 148    the corresponding argument string.  The attributes are used mainly
 149    to verify that the strings are valid in the context of any options
 150    specified (like -s, -d, or -c).  The main exception is the member
 151    `tail', which is first used to construct the list.  After construction,
 152    it is used by get_next to save its state when traversing the list.
 153    The member `state' serves a similar function.  */
 154 struct Spec_list
 155   {
 156     /* Points to the head of the list of range elements.
 157        The first struct is a dummy; its members are never used.  */
 158     struct List_element *head;
 159
 160     /* When appending, points to the last element.  When traversing via
 161        get_next(), points to the element to process next.  Setting
 162        Spec_list.state to the value BEGIN_STATE before calling get_next
 163        signals get_next to initialize tail to point to head->next.  */
 164     struct List_element *tail;
 165
 166     /* Used to save state between calls to get_next().  */
 167     unsigned int state;
 168
 169     /* Length, in the sense that length ('a-z[:digit:]123abc')
 170        is 42 ( = 26 + 10 + 6).  */
 171     size_t length;
 172
 173     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 174     int n_indefinite_repeats;
 175
 176     /* If n_indefinite_repeats is nonzero, this points to the List_element
 177        corresponding to the last [c*] or [c*0] construct encountered in
 178        this spec.  Otherwise it is undefined.  */
 179     struct List_element *indefinite_repeat_element;
 180
 181     /* Non-zero if this spec contains at least one equivalence
 182        class construct e.g. [=c=].  */
 183     int has_equiv_class;
 184
 185     /* Non-zero if this spec contains at least one character class
 186        construct.  E.g. [:digit:].  */
 187     int has_char_class;
 188
 189     /* Non-zero if this spec contains at least one of the character class
 190        constructs (all but upper and lower) that aren't allowed in s2.  */
 191     int has_restricted_char_class;
 192   };
 193
 194 /* A representation for escaped string1 or string2.  As a string is parsed,
 195    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 196    etc.) are marked as such in this structure by setting the corresponding
 197    entry in the ESCAPED vector.  */
 198 struct E_string
 199 {
 200   unsigned char *s;
 201   int *escaped;
 202   size_t len;
 203 };
 204
 205 /* Return nonzero if the Ith character of escaped string ES matches C
 206    and is not escaped itself.  */
 207 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 208
 209 /* The name by which this program was run.  */
 210 char *program_name;
 211
 212 /* When nonzero, each sequence in the input of a repeated character
 213    (call it c) is replaced (in the output) by a single occurrence of c
 214    for every c in the squeeze set.  */
 215 static int squeeze_repeats = 0;
 216
 217 /* When nonzero, removes characters in the delete set from input.  */
 218 static int delete = 0;
 219
 220 /* Use the complement of set1 in place of set1.  */
 221 static int complement = 0;
 222
 223 /* When nonzero, this flag causes GNU tr to provide strict
 224    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 225    says that when -d is used without -s, string2 (if present)
 226    must be ignored.  Silently ignoring arguments is a bad idea.
 227    The default GNU behavior is to give a usage message and exit.
 228    Additionally, when this flag is nonzero, tr prints warnings
 229    on stderr if it is being used in a manner that is not portable.
 230    Applicable warnings are given by default, but are suppressed
 231    if the environment variable `POSIXLY_CORRECT' is set, since
 232    being POSIX conformant means we can't issue such messages.
 233    Warnings on the following topics are suppressed when this
 234    variable is nonzero:
 235    1. Ambiguous octal escapes.  */
 236 static int posix_pedantic;
 237
 238 /* When tr is performing translation and string1 is longer than string2,
 239    POSIX says that the result is undefined.  That gives the implementor
 240    of a POSIX conforming version of tr two reasonable choices for the
 241    semantics of this case.
 242
 243    * The BSD tr pads string2 to the length of string1 by
 244    repeating the last character in string2.
 245
 246    * System V tr ignores characters in string1 that have no
 247    corresponding character in string2.  That is, string1 is effectively
 248    truncated to the length of string2.
 249
 250    When nonzero, this flag causes GNU tr to imitate the behavior
 251    of System V tr when translating with string1 longer than string2.
 252    The default is to emulate BSD tr.  This flag is ignored in modes where
 253    no translation is performed.  Emulating the System V tr
 254    in this exceptional case causes the relatively common BSD idiom:
 255
 256        tr -cs A-Za-z0-9 '\012'
 257
 258    to break (it would convert only zero bytes, rather than all
 259    non-alphanumerics, to newlines).
 260
 261    WARNING: This switch does not provide general BSD or System V
 262    compatibility.  For example, it doesn't disable the interpretation
 263    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 264    some unfortunate coincidence you use such constructs in scripts
 265    expecting to use some other version of tr, the scripts will break.  */
 266 static int truncate_set1 = 0;
 267
 268 /* An alias for (!delete && non_option_args == 2).
 269    It is set in main and used there and in validate().  */
 270 static int translating;
 271
 272 #ifndef BUFSIZ
 273 # define BUFSIZ 8192
 274 #endif
 275
 276 #define IO_BUF_SIZE BUFSIZ
 277 static unsigned char io_buf[IO_BUF_SIZE];
 278
 279 static char const *const char_class_name[] =
 280 {
 281   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 282   "lower", "print", "punct", "space", "upper", "xdigit"
 283 };
 284 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 285
 286 typedef char SET_TYPE;
 287
 288 /* Array of boolean values.  A character `c' is a member of the
 289    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 290    set is defined by the last (possibly, the only) string argument
 291    on the command line when the squeeze option is given.  */
 292 static SET_TYPE in_squeeze_set[N_CHARS];
 293
 294 /* Array of boolean values.  A character `c' is a member of the
 295    delete set if and only if in_delete_set[c] is true.  The delete
 296    set is defined by the first (or only) string argument on the
 297    command line when the delete option is given.  */
 298 static SET_TYPE in_delete_set[N_CHARS];
 299
 300 /* Array of character values defining the translation (if any) that
 301    tr is to perform.  Translation is performed only when there are
 302    two specification strings and the delete switch is not given.  */
 303 static char xlate[N_CHARS];
 304
 305 /* If nonzero, display usage information and exit.  */
 306 static int show_help;
 307
 308 /* If nonzero, print the version on standard output then exit.  */
 309 static int show_version;
 310
 311 static struct option const long_options[] =
 312 {
 313   {"complement", no_argument, NULL, 'c'},
 314   {"delete", no_argument, NULL, 'd'},
 315   {"squeeze-repeats", no_argument, NULL, 's'},
 316   {"truncate-set1", no_argument, NULL, 't'},
 317   {"help", no_argument, &show_help, 1},
 318   {"version", no_argument, &show_version, 1},
 319   {NULL, 0, NULL, 0}
 320 };
 321 \f
 322 static void
 323 usage (int status)
 324 {
 325   if (status != 0)
 326     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 327              program_name);
 328   else
 329     {
 330       printf (_("\
 331 Usage: %s [OPTION]... SET1 [SET2]\n\
 332 "),
 333               program_name);
 334       printf (_("\
 335 Translate, squeeze, and/or delete characters from standard input,\n\
 336 writing to standard output.\n\
 337 \n\
 338   -c, --complement        first complement SET1\n\
 339   -d, --delete            delete characters in SET1, do not translate\n\
 340   -s, --squeeze-repeats   replace sequence of characters with one\n\
 341   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 342       --help              display this help and exit\n\
 343       --version           output version information and exit\n\
 344 "));
 345       printf (_("\
 346 \n\
 347 SETs are specified as strings of characters.  Most represent themselves.\n\
 348 Interpreted sequences are:\n\
 349 \n\
 350   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 351   \\\\              backslash\n\
 352   \\a              audible BEL\n\
 353   \\b              backspace\n\
 354   \\f              form feed\n\
 355   \\n              new line\n\
 356   \\r              return\n\
 357   \\t              horizontal tab\n\
 358   \\v              vertical tab\n\
 359   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 360   [CHAR1-CHAR2]   same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
 361   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 362   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 363   [:alnum:]       all letters and digits\n\
 364   [:alpha:]       all letters\n\
 365   [:blank:]       all horizontal whitespace\n\
 366   [:cntrl:]       all control characters\n\
 367   [:digit:]       all digits\n\
 368   [:graph:]       all printable characters, not including space\n\
 369   [:lower:]       all lower case letters\n\
 370   [:print:]       all printable characters, including space\n\
 371   [:punct:]       all punctuation characters\n\
 372   [:space:]       all horizontal or vertical whitespace\n\
 373   [:upper:]       all upper case letters\n\
 374   [:xdigit:]      all hexadecimal digits\n\
 375   [=CHAR=]        all characters which are equivalent to CHAR\n\
 376 "));
 377       printf (_("\
 378 \n\
 379 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 380 -t may be used only when translating.  SET2 is extended to length of\n\
 381 SET1 by repeating its last character as necessary.  Excess characters\n\
 382 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 383 expand in ascending order; used in SET2 while translating, they may\n\
 384 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 385 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 386 translation or deletion.\n\
 387 "));
 388       puts (_("\nReport bugs to <textutils-bugs@gnu.org>."));
 389     }
 390   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 391 }
 392
 393 /* Return nonzero if the character C is a member of the
 394    equivalence class containing the character EQUIV_CLASS.  */
 395
 396 static int
 397 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
 398 {
 399   return (equiv_class == c);
 400 }
 401
 402 /* Return nonzero if the character C is a member of the
 403    character class CHAR_CLASS.  */
 404
 405 static int
 406 is_char_class_member (enum Char_class char_class, unsigned int c)
 407 {
 408   int result;
 409
 410   switch (char_class)
 411     {
 412     case CC_ALNUM:
 413       result = ISALNUM (c);
 414       break;
 415     case CC_ALPHA:
 416       result = ISALPHA (c);
 417       break;
 418     case CC_BLANK:
 419       result = ISBLANK (c);
 420       break;
 421     case CC_CNTRL:
 422       result = ISCNTRL (c);
 423       break;
 424     case CC_DIGIT:
 425       result = ISDIGIT_LOCALE (c);
 426       break;
 427     case CC_GRAPH:
 428       result = ISGRAPH (c);
 429       break;
 430     case CC_LOWER:
 431       result = ISLOWER (c);
 432       break;
 433     case CC_PRINT:
 434       result = ISPRINT (c);
 435       break;
 436     case CC_PUNCT:
 437       result = ISPUNCT (c);
 438       break;
 439     case CC_SPACE:
 440       result = ISSPACE (c);
 441       break;
 442     case CC_UPPER:
 443       result = ISUPPER (c);
 444       break;
 445     case CC_XDIGIT:
 446       result = ISXDIGIT (c);
 447       break;
 448     default:
 449       abort ();
 450       break;
 451     }
 452   return result;
 453 }
 454
 455 static void
 456 es_free (struct E_string *es)
 457 {
 458   free (es->s);
 459   free (es->escaped);
 460 }
 461
 462 /* Perform the first pass over each range-spec argument S, converting all
 463    \c and \ddd escapes to their one-byte representations.  The conversion
 464    is done in-place, so S must point to writable storage.  If an invalid
 465    quote sequence is found print an error message and return nonzero.
 466    Otherwise set *LEN to the length of the resulting string and return
 467    zero.  The resulting array of characters may contain zero-bytes;
 468    however, on input, S is assumed to be null-terminated, and hence
 469    cannot contain actual (non-escaped) zero bytes.  */
 470
 471 static int
 472 unquote (const unsigned char *s, struct E_string *es)
 473 {
 474   size_t i, j;
 475   size_t len;
 476
 477   len = strlen ((char *) s);
 478
 479   es->s = (unsigned char *) xmalloc (len);
 480   es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
 481   for (i = 0; i < len; i++)
 482     es->escaped[i] = 0;
 483
 484   j = 0;
 485   for (i = 0; s[i]; i++)
 486     {
 487       switch (s[i])
 488         {
 489           int c;
 490         case '\\':
 491           switch (s[i + 1])
 492             {
 493               int oct_digit;
 494             case '\\':
 495               c = '\\';
 496               break;
 497             case 'a':
 498               c = '\007';
 499               break;
 500             case 'b':
 501               c = '\b';
 502               break;
 503             case 'f':
 504               c = '\f';
 505               break;
 506             case 'n':
 507               c = '\n';
 508               break;
 509             case 'r':
 510               c = '\r';
 511               break;
 512             case 't':
 513               c = '\t';
 514               break;
 515             case 'v':
 516               c = '\v';
 517               break;
 518             case '0':
 519             case '1':
 520             case '2':
 521             case '3':
 522             case '4':
 523             case '5':
 524             case '6':
 525             case '7':
 526               c = s[i + 1] - '0';
 527               oct_digit = s[i + 2] - '0';
 528               if (0 <= oct_digit && oct_digit <= 7)
 529                 {
 530                   c = 8 * c + oct_digit;
 531                   ++i;
 532                   oct_digit = s[i + 2] - '0';
 533                   if (0 <= oct_digit && oct_digit <= 7)
 534                     {
 535                       if (8 * c + oct_digit < N_CHARS)
 536                         {
 537                           c = 8 * c + oct_digit;
 538                           ++i;
 539                         }
 540                       else if (!posix_pedantic)
 541                         {
 542                           /* Any octal number larger than 0377 won't
 543                              fit in 8 bits.  So we stop when adding the
 544                              next digit would put us over the limit and
 545                              give a warning about the ambiguity.  POSIX
 546                              isn't clear on this, but one person has said
 547                              that in his interpretation, POSIX says tr
 548                              can't even give a warning.  */
 549                           error (0, 0, _("warning: the ambiguous octal escape \
 550 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 551                                  s[i], s[i + 1], s[i + 2],
 552                                  s[i], s[i + 1], s[i + 2]);
 553                         }
 554                     }
 555                 }
 556               break;
 557             case '\0':
 558               error (0, 0, _("invalid backslash escape at end of string"));
 559               return 1;
 560
 561             default:
 562               if (posix_pedantic)
 563                 {
 564                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 565                   return 1;
 566                 }
 567               else
 568                 {
 569                   c = s[i + 1];
 570                   es->escaped[j] = 1;
 571                 }
 572             }
 573           ++i;
 574           es->s[j++] = c;
 575           break;
 576         default:
 577           es->s[j++] = s[i];
 578           break;
 579         }
 580     }
 581   es->len = j;
 582   return 0;
 583 }
 584
 585 /* If CLASS_STR is a valid character class string, return its index
 586    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 587
 588 static enum Char_class
 589 look_up_char_class (const unsigned char *class_str, size_t len)
 590 {
 591   unsigned int i;
 592
 593   for (i = 0; i < N_CHAR_CLASSES; i++)
 594     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 595         && strlen (char_class_name[i]) == len)
 596       return (enum Char_class) i;
 597   return CC_NO_CLASS;
 598 }
 599
 600 /* Return a newly allocated string with a printable version of C.
 601    This function is used solely for formatting error messages.  */
 602
 603 static char *
 604 make_printable_char (unsigned int c)
 605 {
 606   char *buf = xmalloc (5);
 607
 608   assert (c < N_CHARS);
 609   if (ISPRINT (c))
 610     {
 611       buf[0] = c;
 612       buf[1] = '\0';
 613     }
 614   else
 615     {
 616       sprintf (buf, "\\%03o", c);
 617     }
 618   return buf;
 619 }
 620
 621 /* Return a newly allocated copy of S which is suitable for printing.
 622    LEN is the number of characters in S.  Most non-printing
 623    (isprint) characters are represented by a backslash followed by
 624    3 octal digits.  However, the characters represented by \c escapes
 625    where c is one of [abfnrtv] are represented by their 2-character \c
 626    sequences.  This function is used solely for printing error messages.  */
 627
 628 static char *
 629 make_printable_str (const unsigned char *s, size_t len)
 630 {
 631   /* Worst case is that every character expands to a backslash
 632      followed by a 3-character octal escape sequence.  */
 633   char *printable_buf = xmalloc (4 * len + 1);
 634   char *p = printable_buf;
 635   size_t i;
 636
 637   for (i = 0; i < len; i++)
 638     {
 639       char buf[5];
 640       char *tmp = NULL;
 641
 642       switch (s[i])
 643         {
 644         case '\\':
 645           tmp = "\\";
 646           break;
 647         case '\007':
 648           tmp = "\\a";
 649           break;
 650         case '\b':
 651           tmp = "\\b";
 652           break;
 653         case '\f':
 654           tmp = "\\f";
 655           break;
 656         case '\n':
 657           tmp = "\\n";
 658           break;
 659         case '\r':
 660           tmp = "\\r";
 661           break;
 662         case '\t':
 663           tmp = "\\t";
 664           break;
 665         case '\v':
 666           tmp = "\\v";
 667           break;
 668         default:
 669           if (ISPRINT (s[i]))
 670             {
 671               buf[0] = s[i];
 672               buf[1] = '\0';
 673             }
 674           else
 675             sprintf (buf, "\\%03o", s[i]);
 676           tmp = buf;
 677           break;
 678         }
 679       p = stpcpy (p, tmp);
 680     }
 681   return printable_buf;
 682 }
 683
 684 /* Append a newly allocated structure representing a
 685    character C to the specification list LIST.  */
 686
 687 static void
 688 append_normal_char (struct Spec_list *list, unsigned int c)
 689 {
 690   struct List_element *new;
 691
 692   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 693   new->next = NULL;
 694   new->type = RE_NORMAL_CHAR;
 695   new->u.normal_char = c;
 696   assert (list->tail);
 697   list->tail->next = new;
 698   list->tail = new;
 699 }
 700
 701 /* Append a newly allocated structure representing the range
 702    of characters from FIRST to LAST to the specification list LIST.
 703    Return nonzero if LAST precedes FIRST in the collating sequence,
 704    zero otherwise.  This means that '[c-c]' is acceptable.  */
 705
 706 static int
 707 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 708 {
 709   struct List_element *new;
 710
 711   if (ORD (first) > ORD (last))
 712     {
 713       char *tmp1 = make_printable_char (first);
 714       char *tmp2 = make_printable_char (last);
 715
 716       error (0, 0,
 717        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 718              tmp1, tmp2);
 719       free (tmp1);
 720       free (tmp2);
 721       return 1;
 722     }
 723   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 724   new->next = NULL;
 725   new->type = RE_RANGE;
 726   new->u.range.first_char = first;
 727   new->u.range.last_char = last;
 728   assert (list->tail);
 729   list->tail->next = new;
 730   list->tail = new;
 731   return 0;
 732 }
 733
 734 /* If CHAR_CLASS_STR is a valid character class string, append a
 735    newly allocated structure representing that character class to the end
 736    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 737    a valid string return nonzero.  */
 738
 739 static int
 740 append_char_class (struct Spec_list *list,
 741                    const unsigned char *char_class_str, size_t len)
 742 {
 743   enum Char_class char_class;
 744   struct List_element *new;
 745
 746   char_class = look_up_char_class (char_class_str, len);
 747   if (char_class == CC_NO_CLASS)
 748     return 1;
 749   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 750   new->next = NULL;
 751   new->type = RE_CHAR_CLASS;
 752   new->u.char_class = char_class;
 753   assert (list->tail);
 754   list->tail->next = new;
 755   list->tail = new;
 756   return 0;
 757 }
 758
 759 /* Append a newly allocated structure representing a [c*n]
 760    repeated character construct to the specification list LIST.
 761    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 762    is a non-negative repeat count.  */
 763
 764 static void
 765 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 766                       size_t repeat_count)
 767 {
 768   struct List_element *new;
 769
 770   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 771   new->next = NULL;
 772   new->type = RE_REPEATED_CHAR;
 773   new->u.repeated_char.the_repeated_char = the_char;
 774   new->u.repeated_char.repeat_count = repeat_count;
 775   assert (list->tail);
 776   list->tail->next = new;
 777   list->tail = new;
 778 }
 779
 780 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 781    the length of that string, LEN, if LEN is exactly one, append
 782    a newly allocated structure representing the specified
 783    equivalence class to the specification list, LIST and return zero.
 784    If LEN is not 1, return nonzero.  */
 785
 786 static int
 787 append_equiv_class (struct Spec_list *list,
 788                     const unsigned char *equiv_class_str, size_t len)
 789 {
 790   struct List_element *new;
 791
 792   if (len != 1)
 793     return 1;
 794   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 795   new->next = NULL;
 796   new->type = RE_EQUIV_CLASS;
 797   new->u.equiv_code = *equiv_class_str;
 798   assert (list->tail);
 799   list->tail->next = new;
 800   list->tail = new;
 801   return 0;
 802 }
 803
 804 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 805    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 806    NUL bytes, and is *not* NUL-terminated.  */
 807
 808 static unsigned char *
 809 substr (const unsigned char *p, size_t first_idx, size_t last_idx)
 810 {
 811   size_t len;
 812   unsigned char *tmp;
 813
 814   assert (first_idx <= last_idx);
 815   len = last_idx - first_idx + 1;
 816   tmp = (unsigned char *) xmalloc (len);
 817
 818   assert (first_idx <= last_idx);
 819   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 820   memcpy (tmp, p + first_idx, len);
 821   return tmp;
 822 }
 823
 824 /* Search forward starting at START_IDX for the 2-char sequence
 825    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 826    a sequence is found, set *RESULT_IDX to the index of the first
 827    character and return nonzero. Otherwise return zero.  P may contain
 828    zero bytes.  */
 829
 830 static int
 831 find_closing_delim (const struct E_string *es, size_t start_idx,
 832                     unsigned int pre_bracket_char, size_t *result_idx)
 833 {
 834   size_t i;
 835
 836   for (i = start_idx; i < es->len - 1; i++)
 837     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 838         && !es->escaped[i] && !es->escaped[i + 1])
 839       {
 840         *result_idx = i;
 841         return 1;
 842       }
 843   return 0;
 844 }
 845
 846 /* Convert a string S with explicit length LEN, possibly
 847    containing embedded zero bytes, to a long integer value.
 848    If the string represents a negative value, a value larger
 849    than LONG_MAX, or if all LEN characters do not represent a
 850    valid integer, return nonzero and do not modify *VAL.
 851    Otherwise, return zero and set *VAL to the converted value.  */
 852
 853 static int
 854 non_neg_strtol (const unsigned char *s, size_t len, size_t *val)
 855 {
 856   size_t i;
 857   unsigned long sum = 0;
 858   unsigned int base;
 859
 860   if (len <= 0)
 861     return 1;
 862   if (s[0] == '0')
 863     base = 8;
 864   else if (ISDIGIT (s[0]))
 865     base = 10;
 866   else
 867     return 1;
 868
 869   for (i = 0; i < len; i++)
 870     {
 871       unsigned int c;
 872
 873       if (s[i] < '0')
 874         return 1;
 875
 876       c = s[i] - '0';
 877       if (c >= base)
 878         return 1;
 879
 880       if (sum > (LONG_MAX - c) / base)
 881         return 1;
 882       sum = sum * base + c;
 883     }
 884   *val = sum;
 885   return 0;
 886 }
 887
 888 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 889    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 890    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 891    and return zero. If the second character following
 892    the opening bracket is not `*' or if no closing bracket can be
 893    found, return -1.  If a closing bracket is found and the
 894    second char is `*', but the string between the `*' and `]' isn't
 895    empty, an octal number, or a decimal number, print an error message
 896    and return -2.  */
 897
 898 static int
 899 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 900                        unsigned int *char_to_repeat, size_t *repeat_count,
 901                        size_t *closing_bracket_idx)
 902 {
 903   size_t i;
 904
 905   assert (start_idx + 1 < es->len);
 906   if (!ES_MATCH (es, start_idx + 1, '*'))
 907     return -1;
 908
 909   for (i = start_idx + 2; i < es->len; i++)
 910     {
 911       if (ES_MATCH (es, i, ']'))
 912         {
 913           const unsigned char *digit_str;
 914           size_t digit_str_len = i - start_idx - 2;
 915
 916           *char_to_repeat = es->s[start_idx];
 917           if (digit_str_len == 0)
 918             {
 919               /* We've matched [c*] -- no explicit repeat count.  */
 920               *repeat_count = 0;
 921               *closing_bracket_idx = i;
 922               return 0;
 923             }
 924
 925           /* Here, we have found [c*s] where s should be a string
 926              of octal or decimal digits.  */
 927           digit_str = &es->s[start_idx + 2];
 928           if (non_neg_strtol (digit_str, digit_str_len, repeat_count)
 929               || *repeat_count > BEGIN_STATE)
 930             {
 931               char *tmp = make_printable_str (digit_str, digit_str_len);
 932               error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 933                      tmp);
 934               free (tmp);
 935               return -2;
 936             }
 937           *closing_bracket_idx = i;
 938           return 0;
 939         }
 940     }
 941   return -1;                    /* No bracket found.  */
 942 }
 943
 944 /* Return nonzero if the string at ES->s[IDX] matches the regular
 945    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 946    the `]' must not be escaped.  */
 947
 948 static int
 949 star_digits_closebracket (const struct E_string *es, size_t idx)
 950 {
 951   size_t i;
 952
 953   if (!ES_MATCH (es, idx, '*'))
 954     return 0;
 955
 956   for (i = idx + 1; i < es->len; i++)
 957     {
 958       if (!ISDIGIT (es->s[i]))
 959         {
 960           if (ES_MATCH (es, i, ']'))
 961             return 1;
 962           return 0;
 963         }
 964     }
 965   return 0;
 966 }
 967
 968 /* Convert string UNESACPED_STRING (which has been preprocessed to
 969    convert backslash-escape sequences) of length LEN characters into
 970    a linked list of the following 5 types of constructs:
 971       - [:str:] Character class where `str' is one of the 12 valid strings.
 972       - [=c=] Equivalence class where `c' is any single character.
 973       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 974           However, if `n' is present, it must be a non-negative octal or
 975           decimal integer.
 976       - r-s Range of characters from `r' to `s'.  The second endpoint must
 977           not precede the first in the current collating sequence.
 978       - c Any other character is interpreted as itself.  */
 979
 980 static int
 981 build_spec_list (const struct E_string *es, struct Spec_list *result)
 982 {
 983   const unsigned char *p;
 984   size_t i;
 985
 986   p = es->s;
 987
 988   /* The main for-loop below recognizes the 4 multi-character constructs.
 989      A character that matches (in its context) none of the multi-character
 990      constructs is classified as `normal'.  Since all multi-character
 991      constructs have at least 3 characters, any strings of length 2 or
 992      less are composed solely of normal characters.  Hence, the index of
 993      the outer for-loop runs only as far as LEN-2.  */
 994
 995   for (i = 0; i + 2 < es->len; /* empty */)
 996     {
 997       if (ES_MATCH (es, i, '['))
 998         {
 999           int matched_multi_char_construct;
1000           size_t closing_bracket_idx;
1001           unsigned int char_to_repeat;
1002           size_t repeat_count;
1003           int err;
1004
1005           matched_multi_char_construct = 1;
1006           if (ES_MATCH (es, i + 1, ':')
1007               || ES_MATCH (es, i + 1, '='))
1008             {
1009               size_t closing_delim_idx;
1010               int found;
1011
1012               found = find_closing_delim (es, i + 2, p[i + 1],
1013                                           &closing_delim_idx);
1014               if (found)
1015                 {
1016                   int parse_failed;
1017                   unsigned char *opnd_str = substr (p, i + 2,
1018                                                     closing_delim_idx - 1);
1019                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
1020
1021                   if (p[i + 1] == ':')
1022                     {
1023                       parse_failed = append_char_class (result, opnd_str,
1024                                                         opnd_str_len);
1025
1026                       /* FIXME: big comment.  */
1027                       if (parse_failed)
1028                         {
1029                           if (star_digits_closebracket (es, i + 2))
1030                             {
1031                               free (opnd_str);
1032                               goto try_bracketed_repeat;
1033                             }
1034                           else
1035                             {
1036                               char *tmp = make_printable_str (opnd_str,
1037                                                               opnd_str_len);
1038                               error (0, 0, _("invalid character class `%s'"),
1039                                      tmp);
1040                               free (tmp);
1041                               return 1;
1042                             }
1043                         }
1044                     }
1045                   else
1046                     {
1047                       parse_failed = append_equiv_class (result, opnd_str,
1048                                                          opnd_str_len);
1049
1050                       /* FIXME: big comment.  */
1051                       if (parse_failed)
1052                         {
1053                           if (star_digits_closebracket (es, i + 2))
1054                             {
1055                               free (opnd_str);
1056                               goto try_bracketed_repeat;
1057                             }
1058                           else
1059                             {
1060                               char *tmp = make_printable_str (opnd_str,
1061                                                               opnd_str_len);
1062                               error (0, 0,
1063                _("%s: equivalence class operand must be a single character"),
1064                                      tmp);
1065                               free (tmp);
1066                               return 1;
1067                             }
1068                         }
1069                     }
1070                   free (opnd_str);
1071
1072                   /* Return nonzero if append_*_class reports a problem.  */
1073                   if (parse_failed)
1074                     return 1;
1075                   else
1076                     i = closing_delim_idx + 2;
1077                   continue;
1078                 }
1079               /* Else fall through.  This could be [:*] or [=*].  */
1080             }
1081
1082         try_bracketed_repeat:
1083
1084           /* Determine whether this is a bracketed repeat range
1085              matching the RE \[.\*(dec_or_oct_number)?\].  */
1086           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1087                                        &repeat_count,
1088                                        &closing_bracket_idx);
1089           if (err == 0)
1090             {
1091               append_repeated_char (result, char_to_repeat, repeat_count);
1092               i = closing_bracket_idx + 1;
1093             }
1094           else if (err == -1)
1095             {
1096               matched_multi_char_construct = 0;
1097             }
1098           else
1099             {
1100               /* Found a string that looked like [c*n] but the
1101                  numeric part was invalid.  */
1102               return 1;
1103             }
1104
1105           if (matched_multi_char_construct)
1106             continue;
1107
1108           /* We reach this point if P does not match [:str:], [=c=],
1109              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1110              (from `[' to `c').  */
1111         }
1112
1113       /* Look ahead one char for ranges like a-z.  */
1114       if (ES_MATCH (es, i + 1, '-'))
1115         {
1116           if (append_range (result, p[i], p[i + 2]))
1117             return 1;
1118           i += 3;
1119         }
1120       else
1121         {
1122           append_normal_char (result, p[i]);
1123           ++i;
1124         }
1125     }
1126
1127   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1128   for (; i < es->len; i++)
1129     append_normal_char (result, p[i]);
1130
1131   return 0;
1132 }
1133
1134 /* Given a Spec_list S (with its saved state implicit in the values
1135    of its members `tail' and `state'), return the next single character
1136    in the expansion of S's constructs.  If the last character of S was
1137    returned on the previous call or if S was empty, this function
1138    returns -1.  For example, successive calls to get_next where S
1139    represents the spec-string 'a-d[y*3]' will return the sequence
1140    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1141    which the returned character comes is [:upper:] or [:lower:], the
1142    parameter CLASS is given a value to indicate which it was.  Otherwise
1143    CLASS is set to UL_NONE.  This value is used only when constructing
1144    the translation table to verify that any occurrences of upper and
1145    lower class constructs in the spec-strings appear in the same relative
1146    positions.  */
1147
1148 static int
1149 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1150 {
1151   struct List_element *p;
1152   int return_val;
1153   int i;
1154
1155   if (class)
1156     *class = UL_NONE;
1157
1158   if (s->state == BEGIN_STATE)
1159     {
1160       s->tail = s->head->next;
1161       s->state = NEW_ELEMENT;
1162     }
1163
1164   p = s->tail;
1165   if (p == NULL)
1166     return -1;
1167
1168   switch (p->type)
1169     {
1170     case RE_NORMAL_CHAR:
1171       return_val = p->u.normal_char;
1172       s->state = NEW_ELEMENT;
1173       s->tail = p->next;
1174       break;
1175
1176     case RE_RANGE:
1177       if (s->state == NEW_ELEMENT)
1178         s->state = ORD (p->u.range.first_char);
1179       else
1180         ++(s->state);
1181       return_val = CHR (s->state);
1182       if (s->state == ORD (p->u.range.last_char))
1183         {
1184           s->tail = p->next;
1185           s->state = NEW_ELEMENT;
1186         }
1187       break;
1188
1189     case RE_CHAR_CLASS:
1190       if (class)
1191         {
1192           int upper_or_lower;
1193           switch (p->u.char_class)
1194             {
1195             case CC_LOWER:
1196               *class = UL_LOWER;
1197               upper_or_lower = 1;
1198               break;
1199             case CC_UPPER:
1200               *class = UL_UPPER;
1201               upper_or_lower = 1;
1202               break;
1203             default:
1204               upper_or_lower = 0;
1205               break;
1206             }
1207
1208           if (upper_or_lower)
1209             {
1210               s->tail = p->next;
1211               s->state = NEW_ELEMENT;
1212               return_val = 0;
1213               break;
1214             }
1215         }
1216
1217       if (s->state == NEW_ELEMENT)
1218         {
1219           for (i = 0; i < N_CHARS; i++)
1220             if (is_char_class_member (p->u.char_class, i))
1221               break;
1222           assert (i < N_CHARS);
1223           s->state = i;
1224         }
1225       assert (is_char_class_member (p->u.char_class, s->state));
1226       return_val = CHR (s->state);
1227       for (i = s->state + 1; i < N_CHARS; i++)
1228         if (is_char_class_member (p->u.char_class, i))
1229           break;
1230       if (i < N_CHARS)
1231         s->state = i;
1232       else
1233         {
1234           s->tail = p->next;
1235           s->state = NEW_ELEMENT;
1236         }
1237       break;
1238
1239     case RE_EQUIV_CLASS:
1240       /* FIXME: this assumes that each character is alone in its own
1241          equivalence class (which appears to be correct for my
1242          LC_COLLATE.  But I don't know of any function that allows
1243          one to determine a character's equivalence class.  */
1244
1245       return_val = p->u.equiv_code;
1246       s->state = NEW_ELEMENT;
1247       s->tail = p->next;
1248       break;
1249
1250     case RE_REPEATED_CHAR:
1251       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1252       if (p->u.repeated_char.repeat_count == 0)
1253         {
1254           s->tail = p->next;
1255           s->state = NEW_ELEMENT;
1256           return_val = get_next (s, class);
1257         }
1258       else
1259         {
1260           if (s->state == NEW_ELEMENT)
1261             {
1262               s->state = 0;
1263             }
1264           ++(s->state);
1265           return_val = p->u.repeated_char.the_repeated_char;
1266           if (p->u.repeated_char.repeat_count > 0
1267               && s->state == p->u.repeated_char.repeat_count)
1268             {
1269               s->tail = p->next;
1270               s->state = NEW_ELEMENT;
1271             }
1272         }
1273       break;
1274
1275     case RE_NO_TYPE:
1276       abort ();
1277       break;
1278
1279     default:
1280       abort ();
1281       break;
1282     }
1283
1284   return return_val;
1285 }
1286
1287 /* This is a minor kludge.  This function is called from
1288    get_spec_stats to determine the cardinality of a set derived
1289    from a complemented string.  It's a kludge in that some of the
1290    same operations are (duplicated) performed in set_initialize.  */
1291
1292 static int
1293 card_of_complement (struct Spec_list *s)
1294 {
1295   int c;
1296   int cardinality = N_CHARS;
1297   SET_TYPE in_set[N_CHARS];
1298
1299   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1300   s->state = BEGIN_STATE;
1301   while ((c = get_next (s, NULL)) != -1)
1302     if (!in_set[c]++)
1303       --cardinality;
1304   return cardinality;
1305 }
1306
1307 /* Gather statistics about the spec-list S in preparation for the tests
1308    in validate that determine the consistency of the specs.  This function
1309    is called at most twice; once for string1, and again for any string2.
1310    LEN_S1 < 0 indicates that this is the first call and that S represents
1311    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1312    constructs in string1, and we can use its value to resolve any
1313    indefinite repeat construct in S (which represents string2).  Hence,
1314    this function has the side-effect that it converts a valid [c*]
1315    construct in string2 to [c*n] where n is large enough (or 0) to give
1316    string2 the same length as string1.  For example, with the command
1317    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1318    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1319
1320 static void
1321 get_spec_stats (struct Spec_list *s)
1322 {
1323   struct List_element *p;
1324   int len = 0;
1325
1326   s->n_indefinite_repeats = 0;
1327   s->has_equiv_class = 0;
1328   s->has_restricted_char_class = 0;
1329   s->has_char_class = 0;
1330   for (p = s->head->next; p; p = p->next)
1331     {
1332       switch (p->type)
1333         {
1334           int i;
1335         case RE_NORMAL_CHAR:
1336           ++len;
1337           break;
1338
1339         case RE_RANGE:
1340           assert (p->u.range.last_char >= p->u.range.first_char);
1341           len += p->u.range.last_char - p->u.range.first_char + 1;
1342           break;
1343
1344         case RE_CHAR_CLASS:
1345           s->has_char_class = 1;
1346           for (i = 0; i < N_CHARS; i++)
1347             if (is_char_class_member (p->u.char_class, i))
1348               ++len;
1349           switch (p->u.char_class)
1350             {
1351             case CC_UPPER:
1352             case CC_LOWER:
1353               break;
1354             default:
1355               s->has_restricted_char_class = 1;
1356               break;
1357             }
1358           break;
1359
1360         case RE_EQUIV_CLASS:
1361           for (i = 0; i < N_CHARS; i++)
1362             if (is_equiv_class_member (p->u.equiv_code, i))
1363               ++len;
1364           s->has_equiv_class = 1;
1365           break;
1366
1367         case RE_REPEATED_CHAR:
1368           if (p->u.repeated_char.repeat_count > 0)
1369             len += p->u.repeated_char.repeat_count;
1370           else if (p->u.repeated_char.repeat_count == 0)
1371             {
1372               s->indefinite_repeat_element = p;
1373               ++(s->n_indefinite_repeats);
1374             }
1375           break;
1376
1377         case RE_NO_TYPE:
1378           assert (0);
1379           break;
1380         }
1381     }
1382
1383   s->length = len;
1384 }
1385
1386 static void
1387 get_s1_spec_stats (struct Spec_list *s1)
1388 {
1389   get_spec_stats (s1);
1390   if (complement)
1391     s1->length = card_of_complement (s1);
1392 }
1393
1394 static void
1395 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1396 {
1397   get_spec_stats (s2);
1398   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1399     {
1400       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1401         len_s1 - s2->length;
1402       s2->length = len_s1;
1403     }
1404 }
1405
1406 static void
1407 spec_init (struct Spec_list *spec_list)
1408 {
1409   spec_list->head = spec_list->tail =
1410     (struct List_element *) xmalloc (sizeof (struct List_element));
1411   spec_list->head->next = NULL;
1412 }
1413
1414 /* This function makes two passes over the argument string S.  The first
1415    one converts all \c and \ddd escapes to their one-byte representations.
1416    The second constructs a linked specification list, SPEC_LIST, of the
1417    characters and constructs that comprise the argument string.  If either
1418    of these passes detects an error, this function returns nonzero.  */
1419
1420 static int
1421 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1422 {
1423   struct E_string es;
1424   int fail;
1425
1426   fail = unquote (s, &es);
1427   if (!fail)
1428     fail = build_spec_list (&es, spec_list);
1429   es_free (&es);
1430   return fail;
1431 }
1432
1433 /* Given two specification lists, S1 and S2, and assuming that
1434    S1->length > S2->length, append a single [c*n] element to S2 where c
1435    is the last character in the expansion of S2 and n is the difference
1436    between the two lengths.
1437    Upon successful completion, S2->length is set to S1->length.  The only
1438    way this function can fail to make S2 as long as S1 is when S2 has
1439    zero-length, since in that case, there is no last character to repeat.
1440    So S2->length is required to be at least 1.
1441
1442    Providing this functionality allows the user to do some pretty
1443    non-BSD (and non-portable) things:  For example, the command
1444        tr -cs '[:upper:]0-9' '[:lower:]'
1445    is almost guaranteed to give results that depend on your collating
1446    sequence.  */
1447
1448 static void
1449 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1450 {
1451   struct List_element *p;
1452   int char_to_repeat;
1453   int i;
1454
1455   assert (translating);
1456   assert (s1->length > s2->length);
1457   assert (s2->length > 0);
1458
1459   p = s2->tail;
1460   switch (p->type)
1461     {
1462     case RE_NORMAL_CHAR:
1463       char_to_repeat = p->u.normal_char;
1464       break;
1465     case RE_RANGE:
1466       char_to_repeat = p->u.range.last_char;
1467       break;
1468     case RE_CHAR_CLASS:
1469       for (i = N_CHARS; i >= 0; i--)
1470         if (is_char_class_member (p->u.char_class, i))
1471           break;
1472       assert (i >= 0);
1473       char_to_repeat = CHR (i);
1474       break;
1475
1476     case RE_REPEATED_CHAR:
1477       char_to_repeat = p->u.repeated_char.the_repeated_char;
1478       break;
1479
1480     case RE_EQUIV_CLASS:
1481       /* This shouldn't happen, because validate exits with an error
1482          if it finds an equiv class in string2 when translating.  */
1483       abort ();
1484       break;
1485
1486     case RE_NO_TYPE:
1487       abort ();
1488       break;
1489
1490     default:
1491       abort ();
1492       break;
1493     }
1494
1495   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1496   s2->length = s1->length;
1497 }
1498
1499 /* Return non-zero if S is a non-empty list in which exactly one
1500    character (but potentially, many instances of it) appears.
1501    E.g.  [X*] or xxxxxxxx.  */
1502
1503 static int
1504 homogeneous_spec_list (struct Spec_list *s)
1505 {
1506   int b, c;
1507
1508   s->state = BEGIN_STATE;
1509
1510   if ((b = get_next (s, NULL)) == -1)
1511     return 0;
1512
1513   while ((c = get_next (s, NULL)) != -1)
1514     if (c != b)
1515       return 0;
1516
1517   return 1;
1518 }
1519
1520 /* Die with an error message if S1 and S2 describe strings that
1521    are not valid with the given command line switches.
1522    A side effect of this function is that if a valid [c*] or
1523    [c*0] construct appears in string2, it is converted to [c*n]
1524    with a value for n that makes s2->length == s1->length.  By
1525    the same token, if the --truncate-set1 option is not
1526    given, S2 may be extended.  */
1527
1528 static void
1529 validate (struct Spec_list *s1, struct Spec_list *s2)
1530 {
1531   get_s1_spec_stats (s1);
1532   if (s1->n_indefinite_repeats > 0)
1533     {
1534       error (EXIT_FAILURE, 0,
1535              _("the [c*] repeat construct may not appear in string1"));
1536     }
1537
1538   if (s2)
1539     {
1540       get_s2_spec_stats (s2, s1->length);
1541
1542       if (s2->n_indefinite_repeats > 1)
1543         {
1544           error (EXIT_FAILURE, 0,
1545                  _("only one [c*] repeat construct may appear in string2"));
1546         }
1547
1548       if (translating)
1549         {
1550           if (s2->has_equiv_class)
1551             {
1552               error (EXIT_FAILURE, 0,
1553                      _("[=c=] expressions may not appear in string2 \
1554 when translating"));
1555             }
1556
1557           if (s1->length > s2->length)
1558             {
1559               if (!truncate_set1)
1560                 {
1561                   /* string2 must be non-empty unless --truncate-set1 is
1562                      given or string1 is empty.  */
1563
1564                   if (s2->length == 0)
1565                     error (EXIT_FAILURE, 0,
1566                      _("when not truncating set1, string2 must be non-empty"));
1567                   string2_extend (s1, s2);
1568                 }
1569             }
1570
1571           if (complement && s1->has_char_class
1572               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1573             {
1574               error (EXIT_FAILURE, 0,
1575                      _("when translating with complemented character classes,\
1576 \nstring2 must map all characters in the domain to one"));
1577             }
1578
1579           if (s2->has_restricted_char_class)
1580             {
1581               error (EXIT_FAILURE, 0,
1582                      _("when translating, the only character classes that may \
1583 appear in\nstring2 are `upper' and `lower'"));
1584             }
1585         }
1586       else
1587         /* Not translating.  */
1588         {
1589           if (s2->n_indefinite_repeats > 0)
1590             error (EXIT_FAILURE, 0,
1591                    _("the [c*] construct may appear in string2 only \
1592 when translating"));
1593         }
1594     }
1595 }
1596
1597 /* Read buffers of SIZE bytes via the function READER (if READER is
1598    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1599    read_and_delete or read_and_xlate.  After each buffer is read, it is
1600    processed and written to stdout.  The buffers are processed so that
1601    multiple consecutive occurrences of the same character in the input
1602    stream are replaced by a single occurrence of that character if the
1603    character is in the squeeze set.  */
1604
1605 static void
1606 squeeze_filter (unsigned char *buf, long int size, PFI reader)
1607 {
1608   unsigned int char_to_squeeze = NOT_A_CHAR;
1609   int i = 0;
1610   int nr = 0;
1611
1612   for (;;)
1613     {
1614       int begin;
1615
1616       if (i >= nr)
1617         {
1618           if (reader == NULL)
1619             nr = safe_read (0, (char *) buf, size);
1620           else
1621             nr = (*reader) (buf, size, NULL);
1622
1623           if (nr < 0)
1624             error (EXIT_FAILURE, errno, _("read error"));
1625           if (nr == 0)
1626             break;
1627           i = 0;
1628         }
1629
1630       begin = i;
1631
1632       if (char_to_squeeze == NOT_A_CHAR)
1633         {
1634           int out_len;
1635           /* Here, by being a little tricky, we can get a significant
1636              performance increase in most cases when the input is
1637              reasonably large.  Since tr will modify the input only
1638              if two consecutive (and identical) input characters are
1639              in the squeeze set, we can step by two through the data
1640              when searching for a character in the squeeze set.  This
1641              means there may be a little more work in a few cases and
1642              perhaps twice as much work in the worst cases where most
1643              of the input is removed by squeezing repeats.  But most
1644              uses of this functionality seem to remove less than 20-30%
1645              of the input.  */
1646           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1647             ;                   /* empty */
1648
1649           /* There is a special case when i == nr and we've just
1650              skipped a character (the last one in buf) that is in
1651              the squeeze set.  */
1652           if (i == nr && in_squeeze_set[buf[i - 1]])
1653             --i;
1654
1655           if (i >= nr)
1656             out_len = nr - begin;
1657           else
1658             {
1659               char_to_squeeze = buf[i];
1660               /* We're about to output buf[begin..i].  */
1661               out_len = i - begin + 1;
1662
1663               /* But since we stepped by 2 in the loop above,
1664                  out_len may be one too large.  */
1665               if (i > 0 && buf[i - 1] == char_to_squeeze)
1666                 --out_len;
1667
1668               /* Advance i to the index of first character to be
1669                  considered when looking for a char different from
1670                  char_to_squeeze.  */
1671               ++i;
1672             }
1673           if (out_len > 0
1674               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1675             error (EXIT_FAILURE, errno, _("write error"));
1676         }
1677
1678       if (char_to_squeeze != NOT_A_CHAR)
1679         {
1680           /* Advance i to index of first char != char_to_squeeze
1681              (or to nr if all the rest of the characters in this
1682              buffer are the same as char_to_squeeze).  */
1683           for (; i < nr && buf[i] == char_to_squeeze; i++)
1684             ;                   /* empty */
1685           if (i < nr)
1686             char_to_squeeze = NOT_A_CHAR;
1687           /* If (i >= nr) we've squeezed the last character in this buffer.
1688              So now we have to read a new buffer and continue comparing
1689              characters against char_to_squeeze.  */
1690         }
1691     }
1692 }
1693
1694 /* Read buffers of SIZE bytes from stdin until one is found that
1695    contains at least one character not in the delete set.  Store
1696    in the array BUF, all characters from that buffer that are not
1697    in the delete set, and return the number of characters saved
1698    or 0 upon EOF.  */
1699
1700 static long
1701 read_and_delete (unsigned char *buf, long int size, PFI not_used)
1702 {
1703   long n_saved;
1704   static int hit_eof = 0;
1705
1706   assert (not_used == NULL);
1707   assert (size > 0);
1708
1709   if (hit_eof)
1710     return 0;
1711
1712   /* This enclosing do-while loop is to make sure that
1713      we don't return zero (indicating EOF) when we've
1714      just deleted all the characters in a buffer.  */
1715   do
1716     {
1717       int i;
1718       int nr = safe_read (0, (char *) buf, size);
1719
1720       if (nr < 0)
1721         error (EXIT_FAILURE, errno, _("read error"));
1722       if (nr == 0)
1723         {
1724           hit_eof = 1;
1725           return 0;
1726         }
1727
1728       /* This first loop may be a waste of code, but gives much
1729          better performance when no characters are deleted in
1730          the beginning of a buffer.  It just avoids the copying
1731          of buf[i] into buf[n_saved] when it would be a NOP.  */
1732
1733       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1734         /* empty */ ;
1735       n_saved = i;
1736
1737       for (++i; i < nr; i++)
1738         if (!in_delete_set[buf[i]])
1739           buf[n_saved++] = buf[i];
1740     }
1741   while (n_saved == 0);
1742
1743   return n_saved;
1744 }
1745
1746 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1747    perform the in-place and one-to-one mapping specified by the global
1748    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1749
1750 static long
1751 read_and_xlate (unsigned char *buf, long int size, PFI not_used)
1752 {
1753   long chars_read = 0;
1754   static int hit_eof = 0;
1755   int i;
1756
1757   assert (not_used == NULL);
1758   assert (size > 0);
1759
1760   if (hit_eof)
1761     return 0;
1762
1763   chars_read = safe_read (0, (char *) buf, size);
1764   if (chars_read < 0)
1765     error (EXIT_FAILURE, errno, _("read error"));
1766   if (chars_read == 0)
1767     {
1768       hit_eof = 1;
1769       return 0;
1770     }
1771
1772   for (i = 0; i < chars_read; i++)
1773     buf[i] = xlate[buf[i]];
1774
1775   return chars_read;
1776 }
1777
1778 /* Initialize a boolean membership set IN_SET with the character
1779    values obtained by traversing the linked list of constructs S
1780    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1781    nonzero the resulting set is complemented.  */
1782
1783 static void
1784 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1785 {
1786   int c;
1787   int i;
1788
1789   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1790   s->state = BEGIN_STATE;
1791   while ((c = get_next (s, NULL)) != -1)
1792     in_set[c] = 1;
1793   if (complement_this_set)
1794     for (i = 0; i < N_CHARS; i++)
1795       in_set[i] = (!in_set[i]);
1796 }
1797
1798 int
1799 main (int argc, char **argv)
1800 {
1801   int c;
1802   int non_option_args;
1803   struct Spec_list buf1, buf2;
1804   struct Spec_list *s1 = &buf1;
1805   struct Spec_list *s2 = &buf2;
1806
1807   program_name = argv[0];
1808   setlocale (LC_ALL, "");
1809   bindtextdomain (PACKAGE, LOCALEDIR);
1810   textdomain (PACKAGE);
1811
1812   while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1813     {
1814       switch (c)
1815         {
1816         case 0:
1817           break;
1818
1819         case 'c':
1820           complement = 1;
1821           break;
1822
1823         case 'd':
1824           delete = 1;
1825           break;
1826
1827         case 's':
1828           squeeze_repeats = 1;
1829           break;
1830
1831         case 't':
1832           truncate_set1 = 1;
1833           break;
1834
1835         default:
1836           usage (2);
1837           break;
1838         }
1839     }
1840
1841   if (show_version)
1842     {
1843       printf ("tr (%s) %s\n", GNU_PACKAGE, VERSION);
1844       exit (EXIT_SUCCESS);
1845     }
1846
1847   if (show_help)
1848     usage (0);
1849
1850   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1851
1852   non_option_args = argc - optind;
1853   translating = (non_option_args == 2 && !delete);
1854
1855   /* Change this test if it is valid to give tr no options and
1856      no args at all.  POSIX doesn't specifically say anything
1857      either way, but it looks like they implied it's invalid
1858      by omission.  If you want to make tr do a slow imitation
1859      of `cat' use `tr a a'.  */
1860   if (non_option_args > 2)
1861     {
1862       error (0, 0, _("too many arguments"));
1863       usage (2);
1864     }
1865
1866   if (!delete && !squeeze_repeats && non_option_args != 2)
1867     error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1868
1869   if (delete && squeeze_repeats && non_option_args != 2)
1870     error (EXIT_FAILURE, 0, _("two strings must be given when both \
1871 deleting and squeezing repeats"));
1872
1873   /* If --delete is given without --squeeze-repeats, then
1874      only one string argument may be specified.  But POSIX
1875      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1876      is set, pretend we never saw string2.  But I think
1877      this deserves a fatal error, so that's the default.  */
1878   if ((delete && !squeeze_repeats) && non_option_args != 1)
1879     {
1880       if (posix_pedantic && non_option_args == 2)
1881         --non_option_args;
1882       else
1883         error (EXIT_FAILURE, 0,
1884                _("only one string may be given when deleting \
1885 without squeezing repeats"));
1886     }
1887
1888   if (squeeze_repeats && non_option_args == 0)
1889     error (EXIT_FAILURE, 0,
1890            _("at least one string must be given when squeezing repeats"));
1891
1892   spec_init (s1);
1893   if (parse_str ((unsigned char *) argv[optind], s1))
1894     exit (EXIT_FAILURE);
1895
1896   if (non_option_args == 2)
1897     {
1898       spec_init (s2);
1899       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1900         exit (EXIT_FAILURE);
1901     }
1902   else
1903     s2 = NULL;
1904
1905   validate (s1, s2);
1906
1907   if (squeeze_repeats && non_option_args == 1)
1908     {
1909       set_initialize (s1, complement, in_squeeze_set);
1910       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1911     }
1912   else if (delete && non_option_args == 1)
1913     {
1914       long nr;
1915
1916       set_initialize (s1, complement, in_delete_set);
1917       do
1918         {
1919           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1920           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1921             error (EXIT_FAILURE, errno, _("write error"));
1922         }
1923       while (nr > 0);
1924     }
1925   else if (squeeze_repeats && delete && non_option_args == 2)
1926     {
1927       set_initialize (s1, complement, in_delete_set);
1928       set_initialize (s2, 0, in_squeeze_set);
1929       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1930     }
1931   else if (translating)
1932     {
1933       if (complement)
1934         {
1935           int i;
1936           SET_TYPE *in_s1 = in_delete_set;
1937
1938           set_initialize (s1, 0, in_s1);
1939           s2->state = BEGIN_STATE;
1940           for (i = 0; i < N_CHARS; i++)
1941             xlate[i] = i;
1942           for (i = 0; i < N_CHARS; i++)
1943             {
1944               if (!in_s1[i])
1945                 {
1946                   int ch = get_next (s2, NULL);
1947                   assert (ch != -1 || truncate_set1);
1948                   if (ch == -1)
1949                     {
1950                       /* This will happen when tr is invoked like e.g.
1951                          tr -cs A-Za-z0-9 '\012'.  */
1952                       break;
1953                     }
1954                   xlate[i] = ch;
1955                 }
1956             }
1957           assert (get_next (s2, NULL) == -1 || truncate_set1);
1958         }
1959       else
1960         {
1961           int c1, c2;
1962           int i;
1963           enum Upper_Lower_class class_s1;
1964           enum Upper_Lower_class class_s2;
1965
1966           for (i = 0; i < N_CHARS; i++)
1967             xlate[i] = i;
1968           s1->state = BEGIN_STATE;
1969           s2->state = BEGIN_STATE;
1970           for (;;)
1971             {
1972               c1 = get_next (s1, &class_s1);
1973               c2 = get_next (s2, &class_s2);
1974               if (!class_ok[(int) class_s1][(int) class_s2])
1975                 error (EXIT_FAILURE, 0,
1976                        _("misaligned [:upper:] and/or [:lower:] construct"));
1977
1978               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1979                 {
1980                   for (i = 0; i < N_CHARS; i++)
1981                     if (ISLOWER (i))
1982                       xlate[i] = toupper (i);
1983                 }
1984               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1985                 {
1986                   for (i = 0; i < N_CHARS; i++)
1987                     if (ISUPPER (i))
1988                       xlate[i] = tolower (i);
1989                 }
1990               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1991                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1992                 {
1993                   /* By default, GNU tr permits the identity mappings: from
1994                      [:upper:] to [:upper:] and [:lower:] to [:lower:].  But
1995                      when POSIXLY_CORRECT is set, those evoke diagnostics.  */
1996                   if (posix_pedantic)
1997                     {
1998                       error (EXIT_FAILURE, 0,
1999                              _("\
2000 invalid identity mapping;  when translating, any [:lower:] or [:upper:]\n\
2001 construct in string1 must be aligned with a corresponding construct\n\
2002 ([:upper:] or [:lower:], respectively) in string2"));
2003                     }
2004                 }
2005               else
2006                 {
2007                   /* The following should have been checked by validate...  */
2008                   if (c1 == -1 || c2 == -1)
2009                     break;
2010                   xlate[c1] = c2;
2011                 }
2012             }
2013           assert (c1 == -1 || truncate_set1);
2014         }
2015       if (squeeze_repeats)
2016         {
2017           set_initialize (s2, 0, in_squeeze_set);
2018           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
2019         }
2020       else
2021         {
2022           long chars_read;
2023
2024           do
2025             {
2026               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2027               if (chars_read > 0
2028                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2029                 error (EXIT_FAILURE, errno, _("write error"));
2030             }
2031           while (chars_read > 0);
2032         }
2033     }
2034
2035   if (fclose (stdout) == EOF)
2036     error (EXIT_FAILURE, errno, _("write error"));
2037
2038   if (close (0) != 0)
2039     error (EXIT_FAILURE, errno, _("standard input"));
2040
2041   exit (EXIT_SUCCESS);
2042 }