1 /* tr -- a filter to translate characters
2 Copyright (C) 91, 1995-2002 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by Jim Meyering */
25 #include <sys/types.h>
31 #include "safe-read.h"
34 /* The official name of this program (e.g., no `g' prefix). */
35 #define PROGRAM_NAME "tr"
37 #define AUTHORS "Jim Meyering"
39 #define N_CHARS (UCHAR_MAX + 1)
41 /* A pointer to a filtering function. */
42 typedef size_t (*Filter) (/* unsigned char *, size_t, Filter */);
44 /* Convert from character C to its index in the collating
45 sequence array. Just cast to an unsigned int to avoid
46 problems with sign-extension. */
47 #define ORD(c) (unsigned int)(c)
49 /* The inverse of ORD. */
50 #define CHR(i) (unsigned char)(i)
52 /* The value for Spec_list->state that indicates to
53 get_next that it should initialize the tail pointer.
54 Its value should be as large as possible to avoid conflict
55 a valid value for the state field -- and that may be as
56 large as any valid repeat_count. */
57 #define BEGIN_STATE (INT_MAX - 1)
59 /* The value for Spec_list->state that indicates to
60 get_next that the element pointed to by Spec_list->tail is
61 being considered for the first time on this pass through the
62 list -- it indicates that get_next should make any necessary
64 #define NEW_ELEMENT (BEGIN_STATE + 1)
66 /* A value distinct from any character that may have been stored in a
67 buffer as the result of a block-read in the function squeeze_filter. */
68 #define NOT_A_CHAR (unsigned int)(-1)
70 /* The following (but not CC_NO_CLASS) are indices into the array of
71 valid character class strings. */
74 CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
75 CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
76 CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
80 /* Character class to which a character (returned by get_next) belonged;
81 but it is set only if the construct from which the character was obtained
82 was one of the character classes [:upper:] or [:lower:]. The value
83 is used only when translating and then, only to make sure that upper
84 and lower class constructs have the same relative positions in string1
86 enum Upper_Lower_class
93 /* A shortcut to ensure that when constructing the translation array,
94 one of the values returned by paired calls to get_next (from s1 and s2)
95 is from [:upper:] and the other is from [:lower:], or neither is from
96 upper or lower. By default, GNU tr permits the identity mappings: from
97 [:upper:] to [:upper:] and [:lower:] to [:lower:]. But when
98 POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
99 by values of type enum Upper_Lower_class. */
100 static int const class_ok[3][3] =
107 /* The type of a List_element. See build_spec_list for more details. */
108 enum Range_element_type
118 /* One construct in one of tr's argument strings.
119 For example, consider the POSIX version of the classic tr command:
120 tr -cs 'a-zA-Z_' '[\n*]'
121 String1 has 3 constructs, two of which are ranges (a-z and A-Z),
122 and a single normal character, `_'. String2 has one construct. */
125 enum Range_element_type type;
126 struct List_element *next;
132 unsigned int first_char;
133 unsigned int last_char;
136 enum Char_class char_class;
140 unsigned int the_repeated_char;
148 /* Each of tr's argument strings is parsed into a form that is easier
149 to work with: a linked list of constructs (struct List_element).
150 Each Spec_list structure also encapsulates various attributes of
151 the corresponding argument string. The attributes are used mainly
152 to verify that the strings are valid in the context of any options
153 specified (like -s, -d, or -c). The main exception is the member
154 `tail', which is first used to construct the list. After construction,
155 it is used by get_next to save its state when traversing the list.
156 The member `state' serves a similar function. */
159 /* Points to the head of the list of range elements.
160 The first struct is a dummy; its members are never used. */
161 struct List_element *head;
163 /* When appending, points to the last element. When traversing via
164 get_next(), points to the element to process next. Setting
165 Spec_list.state to the value BEGIN_STATE before calling get_next
166 signals get_next to initialize tail to point to head->next. */
167 struct List_element *tail;
169 /* Used to save state between calls to get_next(). */
172 /* Length, in the sense that length ('a-z[:digit:]123abc')
173 is 42 ( = 26 + 10 + 6). */
176 /* The number of [c*] and [c*0] constructs that appear in this spec. */
177 int n_indefinite_repeats;
179 /* If n_indefinite_repeats is nonzero, this points to the List_element
180 corresponding to the last [c*] or [c*0] construct encountered in
181 this spec. Otherwise it is undefined. */
182 struct List_element *indefinite_repeat_element;
184 /* Non-zero if this spec contains at least one equivalence
185 class construct e.g. [=c=]. */
188 /* Non-zero if this spec contains at least one character class
189 construct. E.g. [:digit:]. */
192 /* Non-zero if this spec contains at least one of the character class
193 constructs (all but upper and lower) that aren't allowed in s2. */
194 int has_restricted_char_class;
197 /* A representation for escaped string1 or string2. As a string is parsed,
198 any backslash-escaped characters (other than octal or \a, \b, \f, \n,
199 etc.) are marked as such in this structure by setting the corresponding
200 entry in the ESCAPED vector. */
208 /* Return nonzero if the Ith character of escaped string ES matches C
209 and is not escaped itself. */
210 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
212 /* The name by which this program was run. */
215 /* When nonzero, each sequence in the input of a repeated character
216 (call it c) is replaced (in the output) by a single occurrence of c
217 for every c in the squeeze set. */
218 static int squeeze_repeats = 0;
220 /* When nonzero, removes characters in the delete set from input. */
221 static int delete = 0;
223 /* Use the complement of set1 in place of set1. */
224 static int complement = 0;
226 /* When nonzero, this flag causes GNU tr to provide strict
227 compliance with POSIX draft 1003.2.11.2. The POSIX spec
228 says that when -d is used without -s, string2 (if present)
229 must be ignored. Silently ignoring arguments is a bad idea.
230 The default GNU behavior is to give a usage message and exit.
231 Additionally, when this flag is nonzero, tr prints warnings
232 on stderr if it is being used in a manner that is not portable.
233 Applicable warnings are given by default, but are suppressed
234 if the environment variable `POSIXLY_CORRECT' is set, since
235 being POSIX conformant means we can't issue such messages.
236 Warnings on the following topics are suppressed when this
238 1. Ambiguous octal escapes. */
239 static int posix_pedantic;
241 /* When tr is performing translation and string1 is longer than string2,
242 POSIX says that the result is undefined. That gives the implementor
243 of a POSIX conforming version of tr two reasonable choices for the
244 semantics of this case.
246 * The BSD tr pads string2 to the length of string1 by
247 repeating the last character in string2.
249 * System V tr ignores characters in string1 that have no
250 corresponding character in string2. That is, string1 is effectively
251 truncated to the length of string2.
253 When nonzero, this flag causes GNU tr to imitate the behavior
254 of System V tr when translating with string1 longer than string2.
255 The default is to emulate BSD tr. This flag is ignored in modes where
256 no translation is performed. Emulating the System V tr
257 in this exceptional case causes the relatively common BSD idiom:
259 tr -cs A-Za-z0-9 '\012'
261 to break (it would convert only zero bytes, rather than all
262 non-alphanumerics, to newlines).
264 WARNING: This switch does not provide general BSD or System V
265 compatibility. For example, it doesn't disable the interpretation
266 of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
267 some unfortunate coincidence you use such constructs in scripts
268 expecting to use some other version of tr, the scripts will break. */
269 static int truncate_set1 = 0;
271 /* An alias for (!delete && non_option_args == 2).
272 It is set in main and used there and in validate(). */
273 static int translating;
279 #define IO_BUF_SIZE BUFSIZ
280 static unsigned char io_buf[IO_BUF_SIZE];
282 static char const *const char_class_name[] =
284 "alnum", "alpha", "blank", "cntrl", "digit", "graph",
285 "lower", "print", "punct", "space", "upper", "xdigit"
287 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
289 typedef char SET_TYPE;
291 /* Array of boolean values. A character `c' is a member of the
292 squeeze set if and only if in_squeeze_set[c] is true. The squeeze
293 set is defined by the last (possibly, the only) string argument
294 on the command line when the squeeze option is given. */
295 static SET_TYPE in_squeeze_set[N_CHARS];
297 /* Array of boolean values. A character `c' is a member of the
298 delete set if and only if in_delete_set[c] is true. The delete
299 set is defined by the first (or only) string argument on the
300 command line when the delete option is given. */
301 static SET_TYPE in_delete_set[N_CHARS];
303 /* Array of character values defining the translation (if any) that
304 tr is to perform. Translation is performed only when there are
305 two specification strings and the delete switch is not given. */
306 static char xlate[N_CHARS];
308 static struct option const long_options[] =
310 {"complement", no_argument, NULL, 'c'},
311 {"delete", no_argument, NULL, 'd'},
312 {"squeeze-repeats", no_argument, NULL, 's'},
313 {"truncate-set1", no_argument, NULL, 't'},
314 {GETOPT_HELP_OPTION_DECL},
315 {GETOPT_VERSION_OPTION_DECL},
323 fprintf (stderr, _("Try `%s --help' for more information.\n"),
328 Usage: %s [OPTION]... SET1 [SET2]\n\
332 Translate, squeeze, and/or delete characters from standard input,\n\
333 writing to standard output.\n\
335 -c, --complement first complement SET1\n\
336 -d, --delete delete characters in SET1, do not translate\n\
337 -s, --squeeze-repeats replace each input sequence of a repeated character\n\
338 that is listed in SET1 with a single occurrence\n\
340 -t, --truncate-set1 first truncate SET1 to length of SET2\n\
342 fputs (HELP_OPTION_DESCRIPTION, stdout);
343 fputs (VERSION_OPTION_DESCRIPTION, stdout);
346 SETs are specified as strings of characters. Most represent themselves.\n\
347 Interpreted sequences are:\n\
349 \\NNN character with octal value NNN (1 to 3 octal digits)\n\
356 \\t horizontal tab\n\
360 CHAR1-CHAR2 all characters from CHAR1 to CHAR2 in ascending order\n\
361 [CHAR*] in SET2, copies of CHAR until length of SET1\n\
362 [CHAR*REPEAT] REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
363 [:alnum:] all letters and digits\n\
364 [:alpha:] all letters\n\
365 [:blank:] all horizontal whitespace\n\
366 [:cntrl:] all control characters\n\
367 [:digit:] all digits\n\
370 [:graph:] all printable characters, not including space\n\
371 [:lower:] all lower case letters\n\
372 [:print:] all printable characters, including space\n\
373 [:punct:] all punctuation characters\n\
374 [:space:] all horizontal or vertical whitespace\n\
375 [:upper:] all upper case letters\n\
376 [:xdigit:] all hexadecimal digits\n\
377 [=CHAR=] all characters which are equivalent to CHAR\n\
381 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
382 -t may be used only when translating. SET2 is extended to length of\n\
383 SET1 by repeating its last character as necessary. \
387 of SET2 are ignored. Only [:lower:] and [:upper:] are guaranteed to\n\
388 expand in ascending order; used in SET2 while translating, they may\n\
389 only be used in pairs to specify case conversion. \
392 -s uses SET1 if not\n\
393 translating nor deleting; else squeezing uses SET2 and occurs after\n\
394 translation or deletion.\n\
396 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
398 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
401 /* Return nonzero if the character C is a member of the
402 equivalence class containing the character EQUIV_CLASS. */
405 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
407 return (equiv_class == c);
410 /* Return nonzero if the character C is a member of the
411 character class CHAR_CLASS. */
414 is_char_class_member (enum Char_class char_class, unsigned int c)
421 result = ISALNUM (c);
424 result = ISALPHA (c);
427 result = ISBLANK (c);
430 result = ISCNTRL (c);
433 result = ISDIGIT_LOCALE (c);
436 result = ISGRAPH (c);
439 result = ISLOWER (c);
442 result = ISPRINT (c);
445 result = ISPUNCT (c);
448 result = ISSPACE (c);
451 result = ISUPPER (c);
454 result = ISXDIGIT (c);
464 es_free (struct E_string *es)
470 /* Perform the first pass over each range-spec argument S, converting all
471 \c and \ddd escapes to their one-byte representations. The conversion
472 is done in-place, so S must point to writable storage. If an invalid
473 quote sequence is found print an error message and return nonzero.
474 Otherwise set *LEN to the length of the resulting string and return
475 zero. The resulting array of characters may contain zero-bytes;
476 however, on input, S is assumed to be null-terminated, and hence
477 cannot contain actual (non-escaped) zero bytes. */
480 unquote (const unsigned char *s, struct E_string *es)
485 len = strlen ((char *) s);
487 es->s = (unsigned char *) xmalloc (len);
488 es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
489 for (i = 0; i < len; i++)
493 for (i = 0; s[i]; i++)
535 oct_digit = s[i + 2] - '0';
536 if (0 <= oct_digit && oct_digit <= 7)
538 c = 8 * c + oct_digit;
540 oct_digit = s[i + 2] - '0';
541 if (0 <= oct_digit && oct_digit <= 7)
543 if (8 * c + oct_digit < N_CHARS)
545 c = 8 * c + oct_digit;
548 else if (!posix_pedantic)
550 /* A 3-digit octal number larger than \377 won't
551 fit in 8 bits. So we stop when adding the
552 next digit would put us over the limit and
553 give a warning about the ambiguity. POSIX
554 isn't clear on this, but one person has said
555 that in his interpretation, POSIX says tr
556 can't even give a warning. */
557 error (0, 0, _("warning: the ambiguous octal escape \
558 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
559 s[i], s[i + 1], s[i + 2],
560 s[i], s[i + 1], s[i + 2]);
566 error (0, 0, _("invalid backslash escape at end of string"));
572 error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
593 /* If CLASS_STR is a valid character class string, return its index
594 in the global char_class_name array. Otherwise, return CC_NO_CLASS. */
596 static enum Char_class
597 look_up_char_class (const unsigned char *class_str, size_t len)
601 for (i = 0; i < N_CHAR_CLASSES; i++)
602 if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
603 && strlen (char_class_name[i]) == len)
604 return (enum Char_class) i;
608 /* Return a newly allocated string with a printable version of C.
609 This function is used solely for formatting error messages. */
612 make_printable_char (unsigned int c)
614 char *buf = xmalloc (5);
616 assert (c < N_CHARS);
624 sprintf (buf, "\\%03o", c);
629 /* Return a newly allocated copy of S which is suitable for printing.
630 LEN is the number of characters in S. Most non-printing
631 (isprint) characters are represented by a backslash followed by
632 3 octal digits. However, the characters represented by \c escapes
633 where c is one of [abfnrtv] are represented by their 2-character \c
634 sequences. This function is used solely for printing error messages. */
637 make_printable_str (const unsigned char *s, size_t len)
639 /* Worst case is that every character expands to a backslash
640 followed by a 3-character octal escape sequence. */
641 char *printable_buf = xmalloc (4 * len + 1);
642 char *p = printable_buf;
645 for (i = 0; i < len; i++)
683 sprintf (buf, "\\%03o", s[i]);
689 return printable_buf;
692 /* Append a newly allocated structure representing a
693 character C to the specification list LIST. */
696 append_normal_char (struct Spec_list *list, unsigned int c)
698 struct List_element *new;
700 new = (struct List_element *) xmalloc (sizeof (struct List_element));
702 new->type = RE_NORMAL_CHAR;
703 new->u.normal_char = c;
705 list->tail->next = new;
709 /* Append a newly allocated structure representing the range
710 of characters from FIRST to LAST to the specification list LIST.
711 Return nonzero if LAST precedes FIRST in the collating sequence,
712 zero otherwise. This means that '[c-c]' is acceptable. */
715 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
717 struct List_element *new;
719 if (ORD (first) > ORD (last))
721 char *tmp1 = make_printable_char (first);
722 char *tmp2 = make_printable_char (last);
725 _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
731 new = (struct List_element *) xmalloc (sizeof (struct List_element));
733 new->type = RE_RANGE;
734 new->u.range.first_char = first;
735 new->u.range.last_char = last;
737 list->tail->next = new;
742 /* If CHAR_CLASS_STR is a valid character class string, append a
743 newly allocated structure representing that character class to the end
744 of the specification list LIST and return 0. If CHAR_CLASS_STR is not
745 a valid string return nonzero. */
748 append_char_class (struct Spec_list *list,
749 const unsigned char *char_class_str, size_t len)
751 enum Char_class char_class;
752 struct List_element *new;
754 char_class = look_up_char_class (char_class_str, len);
755 if (char_class == CC_NO_CLASS)
757 new = (struct List_element *) xmalloc (sizeof (struct List_element));
759 new->type = RE_CHAR_CLASS;
760 new->u.char_class = char_class;
762 list->tail->next = new;
767 /* Append a newly allocated structure representing a [c*n]
768 repeated character construct to the specification list LIST.
769 THE_CHAR is the single character to be repeated, and REPEAT_COUNT
770 is a non-negative repeat count. */
773 append_repeated_char (struct Spec_list *list, unsigned int the_char,
776 struct List_element *new;
778 new = (struct List_element *) xmalloc (sizeof (struct List_element));
780 new->type = RE_REPEATED_CHAR;
781 new->u.repeated_char.the_repeated_char = the_char;
782 new->u.repeated_char.repeat_count = repeat_count;
784 list->tail->next = new;
788 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
789 the length of that string, LEN, if LEN is exactly one, append
790 a newly allocated structure representing the specified
791 equivalence class to the specification list, LIST and return zero.
792 If LEN is not 1, return nonzero. */
795 append_equiv_class (struct Spec_list *list,
796 const unsigned char *equiv_class_str, size_t len)
798 struct List_element *new;
802 new = (struct List_element *) xmalloc (sizeof (struct List_element));
804 new->type = RE_EQUIV_CLASS;
805 new->u.equiv_code = *equiv_class_str;
807 list->tail->next = new;
812 /* Return a newly allocated copy of the LEN-byte prefix of P.
813 The returned string may contain NUL bytes and is *not* NUL-terminated. */
815 static unsigned char *
816 xmemdup (const unsigned char *p, size_t len)
818 unsigned char *tmp = (unsigned char *) xmalloc (len);
820 /* Use memcpy rather than strncpy because `p' may contain zero-bytes. */
821 memcpy (tmp, p, len);
825 /* Search forward starting at START_IDX for the 2-char sequence
826 (PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such
827 a sequence is found, set *RESULT_IDX to the index of the first
828 character and return nonzero. Otherwise return zero. P may contain
832 find_closing_delim (const struct E_string *es, size_t start_idx,
833 unsigned int pre_bracket_char, size_t *result_idx)
837 for (i = start_idx; i < es->len - 1; i++)
838 if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
839 && !es->escaped[i] && !es->escaped[i + 1])
847 /* Parse the bracketed repeat-char syntax. If the P_LEN characters
848 beginning with P[ START_IDX ] comprise a valid [c*n] construct,
849 then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
850 and return zero. If the second character following
851 the opening bracket is not `*' or if no closing bracket can be
852 found, return -1. If a closing bracket is found and the
853 second char is `*', but the string between the `*' and `]' isn't
854 empty, an octal number, or a decimal number, print an error message
858 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
859 unsigned int *char_to_repeat, size_t *repeat_count,
860 size_t *closing_bracket_idx)
864 assert (start_idx + 1 < es->len);
865 if (!ES_MATCH (es, start_idx + 1, '*'))
868 for (i = start_idx + 2; i < es->len; i++)
870 if (ES_MATCH (es, i, ']'))
872 size_t digit_str_len = i - start_idx - 2;
874 *char_to_repeat = es->s[start_idx];
875 if (digit_str_len == 0)
877 /* We've matched [c*] -- no explicit repeat count. */
879 *closing_bracket_idx = i;
883 /* Here, we have found [c*s] where s should be a string
884 of octal (if it starts with `0') or decimal digits. */
886 const char *digit_str = (const char *) &es->s[start_idx + 2];
887 unsigned long int tmp_ulong;
890 /* Select the base manually so we can be sure it's either 8 or 10.
891 If the spec allowed it to be interpreted as hexadecimal, we
892 could have used `0' and let xstrtoul decide. */
893 if (*digit_str == '0')
899 if (xstrtoul (digit_str, &d_end, base, &tmp_ulong, NULL)
901 || BEGIN_STATE < tmp_ulong
902 || digit_str + digit_str_len != d_end)
904 char *tmp = make_printable_str (es->s + start_idx + 2,
906 error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
911 *repeat_count = tmp_ulong;
913 *closing_bracket_idx = i;
917 return -1; /* No bracket found. */
920 /* Return nonzero if the string at ES->s[IDX] matches the regular
921 expression `\*[0-9]*\]', zero otherwise. To match, the `*' and
922 the `]' must not be escaped. */
925 star_digits_closebracket (const struct E_string *es, size_t idx)
929 if (!ES_MATCH (es, idx, '*'))
932 for (i = idx + 1; i < es->len; i++)
934 if (!ISDIGIT (es->s[i]))
936 if (ES_MATCH (es, i, ']'))
944 /* Convert string UNESCAPED_STRING (which has been preprocessed to
945 convert backslash-escape sequences) of length LEN characters into
946 a linked list of the following 5 types of constructs:
947 - [:str:] Character class where `str' is one of the 12 valid strings.
948 - [=c=] Equivalence class where `c' is any single character.
949 - [c*n] Repeat the single character `c' `n' times. n may be omitted.
950 However, if `n' is present, it must be a non-negative octal or
952 - r-s Range of characters from `r' to `s'. The second endpoint must
953 not precede the first in the current collating sequence.
954 - c Any other character is interpreted as itself. */
957 build_spec_list (const struct E_string *es, struct Spec_list *result)
959 const unsigned char *p;
964 /* The main for-loop below recognizes the 4 multi-character constructs.
965 A character that matches (in its context) none of the multi-character
966 constructs is classified as `normal'. Since all multi-character
967 constructs have at least 3 characters, any strings of length 2 or
968 less are composed solely of normal characters. Hence, the index of
969 the outer for-loop runs only as far as LEN-2. */
971 for (i = 0; i + 2 < es->len; /* empty */)
973 if (ES_MATCH (es, i, '['))
975 int matched_multi_char_construct;
976 size_t closing_bracket_idx;
977 unsigned int char_to_repeat;
981 matched_multi_char_construct = 1;
982 if (ES_MATCH (es, i + 1, ':')
983 || ES_MATCH (es, i + 1, '='))
985 size_t closing_delim_idx;
988 found = find_closing_delim (es, i + 2, p[i + 1],
993 size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
994 unsigned char *opnd_str;
996 if (opnd_str_len == 0)
999 error (0, 0, _("missing character class name `[::]'"));
1002 _("missing equivalence class character `[==]'"));
1006 opnd_str = xmemdup (p + i + 2, opnd_str_len);
1008 if (p[i + 1] == ':')
1010 parse_failed = append_char_class (result, opnd_str,
1013 /* FIXME: big comment. */
1016 if (star_digits_closebracket (es, i + 2))
1019 goto try_bracketed_repeat;
1023 char *tmp = make_printable_str (opnd_str,
1025 error (0, 0, _("invalid character class `%s'"),
1034 parse_failed = append_equiv_class (result, opnd_str,
1037 /* FIXME: big comment. */
1040 if (star_digits_closebracket (es, i + 2))
1043 goto try_bracketed_repeat;
1047 char *tmp = make_printable_str (opnd_str,
1050 _("%s: equivalence class operand must be a single character"),
1059 /* Return nonzero if append_*_class reports a problem. */
1063 i = closing_delim_idx + 2;
1066 /* Else fall through. This could be [:*] or [=*]. */
1069 try_bracketed_repeat:
1071 /* Determine whether this is a bracketed repeat range
1072 matching the RE \[.\*(dec_or_oct_number)?\]. */
1073 err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1075 &closing_bracket_idx);
1078 append_repeated_char (result, char_to_repeat, repeat_count);
1079 i = closing_bracket_idx + 1;
1083 matched_multi_char_construct = 0;
1087 /* Found a string that looked like [c*n] but the
1088 numeric part was invalid. */
1092 if (matched_multi_char_construct)
1095 /* We reach this point if P does not match [:str:], [=c=],
1096 [c*n], or [c*]. Now, see if P looks like a range `[-c'
1097 (from `[' to `c'). */
1100 /* Look ahead one char for ranges like a-z. */
1101 if (ES_MATCH (es, i + 1, '-'))
1103 if (append_range (result, p[i], p[i + 2]))
1109 append_normal_char (result, p[i]);
1114 /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1]. */
1115 for (; i < es->len; i++)
1116 append_normal_char (result, p[i]);
1121 /* Given a Spec_list S (with its saved state implicit in the values
1122 of its members `tail' and `state'), return the next single character
1123 in the expansion of S's constructs. If the last character of S was
1124 returned on the previous call or if S was empty, this function
1125 returns -1. For example, successive calls to get_next where S
1126 represents the spec-string 'a-d[y*3]' will return the sequence
1127 of values a, b, c, d, y, y, y, -1. Finally, if the construct from
1128 which the returned character comes is [:upper:] or [:lower:], the
1129 parameter CLASS is given a value to indicate which it was. Otherwise
1130 CLASS is set to UL_NONE. This value is used only when constructing
1131 the translation table to verify that any occurrences of upper and
1132 lower class constructs in the spec-strings appear in the same relative
1136 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1138 struct List_element *p;
1145 if (s->state == BEGIN_STATE)
1147 s->tail = s->head->next;
1148 s->state = NEW_ELEMENT;
1157 case RE_NORMAL_CHAR:
1158 return_val = p->u.normal_char;
1159 s->state = NEW_ELEMENT;
1164 if (s->state == NEW_ELEMENT)
1165 s->state = ORD (p->u.range.first_char);
1168 return_val = CHR (s->state);
1169 if (s->state == ORD (p->u.range.last_char))
1172 s->state = NEW_ELEMENT;
1180 switch (p->u.char_class)
1198 s->state = NEW_ELEMENT;
1204 if (s->state == NEW_ELEMENT)
1206 for (i = 0; i < N_CHARS; i++)
1207 if (is_char_class_member (p->u.char_class, i))
1209 assert (i < N_CHARS);
1212 assert (is_char_class_member (p->u.char_class, s->state));
1213 return_val = CHR (s->state);
1214 for (i = s->state + 1; i < N_CHARS; i++)
1215 if (is_char_class_member (p->u.char_class, i))
1222 s->state = NEW_ELEMENT;
1226 case RE_EQUIV_CLASS:
1227 /* FIXME: this assumes that each character is alone in its own
1228 equivalence class (which appears to be correct for my
1229 LC_COLLATE. But I don't know of any function that allows
1230 one to determine a character's equivalence class. */
1232 return_val = p->u.equiv_code;
1233 s->state = NEW_ELEMENT;
1237 case RE_REPEATED_CHAR:
1238 /* Here, a repeat count of n == 0 means don't repeat at all. */
1239 if (p->u.repeated_char.repeat_count == 0)
1242 s->state = NEW_ELEMENT;
1243 return_val = get_next (s, class);
1247 if (s->state == NEW_ELEMENT)
1252 return_val = p->u.repeated_char.the_repeated_char;
1253 if (p->u.repeated_char.repeat_count > 0
1254 && s->state == p->u.repeated_char.repeat_count)
1257 s->state = NEW_ELEMENT;
1274 /* This is a minor kludge. This function is called from
1275 get_spec_stats to determine the cardinality of a set derived
1276 from a complemented string. It's a kludge in that some of the
1277 same operations are (duplicated) performed in set_initialize. */
1280 card_of_complement (struct Spec_list *s)
1283 int cardinality = N_CHARS;
1284 SET_TYPE in_set[N_CHARS];
1286 memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1287 s->state = BEGIN_STATE;
1288 while ((c = get_next (s, NULL)) != -1)
1294 /* Gather statistics about the spec-list S in preparation for the tests
1295 in validate that determine the consistency of the specs. This function
1296 is called at most twice; once for string1, and again for any string2.
1297 LEN_S1 < 0 indicates that this is the first call and that S represents
1298 string1. When LEN_S1 >= 0, it is the length of the expansion of the
1299 constructs in string1, and we can use its value to resolve any
1300 indefinite repeat construct in S (which represents string2). Hence,
1301 this function has the side-effect that it converts a valid [c*]
1302 construct in string2 to [c*n] where n is large enough (or 0) to give
1303 string2 the same length as string1. For example, with the command
1304 tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1305 be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */
1308 get_spec_stats (struct Spec_list *s)
1310 struct List_element *p;
1313 s->n_indefinite_repeats = 0;
1314 s->has_equiv_class = 0;
1315 s->has_restricted_char_class = 0;
1316 s->has_char_class = 0;
1317 for (p = s->head->next; p; p = p->next)
1322 case RE_NORMAL_CHAR:
1327 assert (p->u.range.last_char >= p->u.range.first_char);
1328 len += p->u.range.last_char - p->u.range.first_char + 1;
1332 s->has_char_class = 1;
1333 for (i = 0; i < N_CHARS; i++)
1334 if (is_char_class_member (p->u.char_class, i))
1336 switch (p->u.char_class)
1342 s->has_restricted_char_class = 1;
1347 case RE_EQUIV_CLASS:
1348 for (i = 0; i < N_CHARS; i++)
1349 if (is_equiv_class_member (p->u.equiv_code, i))
1351 s->has_equiv_class = 1;
1354 case RE_REPEATED_CHAR:
1355 if (p->u.repeated_char.repeat_count > 0)
1356 len += p->u.repeated_char.repeat_count;
1357 else if (p->u.repeated_char.repeat_count == 0)
1359 s->indefinite_repeat_element = p;
1360 ++(s->n_indefinite_repeats);
1374 get_s1_spec_stats (struct Spec_list *s1)
1376 get_spec_stats (s1);
1378 s1->length = card_of_complement (s1);
1382 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1384 get_spec_stats (s2);
1385 if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1387 s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1388 len_s1 - s2->length;
1389 s2->length = len_s1;
1394 spec_init (struct Spec_list *spec_list)
1396 spec_list->head = spec_list->tail =
1397 (struct List_element *) xmalloc (sizeof (struct List_element));
1398 spec_list->head->next = NULL;
1401 /* This function makes two passes over the argument string S. The first
1402 one converts all \c and \ddd escapes to their one-byte representations.
1403 The second constructs a linked specification list, SPEC_LIST, of the
1404 characters and constructs that comprise the argument string. If either
1405 of these passes detects an error, this function returns nonzero. */
1408 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1413 fail = unquote (s, &es);
1415 fail = build_spec_list (&es, spec_list);
1420 /* Given two specification lists, S1 and S2, and assuming that
1421 S1->length > S2->length, append a single [c*n] element to S2 where c
1422 is the last character in the expansion of S2 and n is the difference
1423 between the two lengths.
1424 Upon successful completion, S2->length is set to S1->length. The only
1425 way this function can fail to make S2 as long as S1 is when S2 has
1426 zero-length, since in that case, there is no last character to repeat.
1427 So S2->length is required to be at least 1.
1429 Providing this functionality allows the user to do some pretty
1430 non-BSD (and non-portable) things: For example, the command
1431 tr -cs '[:upper:]0-9' '[:lower:]'
1432 is almost guaranteed to give results that depend on your collating
1436 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1438 struct List_element *p;
1442 assert (translating);
1443 assert (s1->length > s2->length);
1444 assert (s2->length > 0);
1449 case RE_NORMAL_CHAR:
1450 char_to_repeat = p->u.normal_char;
1453 char_to_repeat = p->u.range.last_char;
1456 for (i = N_CHARS; i >= 0; i--)
1457 if (is_char_class_member (p->u.char_class, i))
1460 char_to_repeat = CHR (i);
1463 case RE_REPEATED_CHAR:
1464 char_to_repeat = p->u.repeated_char.the_repeated_char;
1467 case RE_EQUIV_CLASS:
1468 /* This shouldn't happen, because validate exits with an error
1469 if it finds an equiv class in string2 when translating. */
1482 append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1483 s2->length = s1->length;
1486 /* Return non-zero if S is a non-empty list in which exactly one
1487 character (but potentially, many instances of it) appears.
1488 E.g. [X*] or xxxxxxxx. */
1491 homogeneous_spec_list (struct Spec_list *s)
1495 s->state = BEGIN_STATE;
1497 if ((b = get_next (s, NULL)) == -1)
1500 while ((c = get_next (s, NULL)) != -1)
1507 /* Die with an error message if S1 and S2 describe strings that
1508 are not valid with the given command line switches.
1509 A side effect of this function is that if a valid [c*] or
1510 [c*0] construct appears in string2, it is converted to [c*n]
1511 with a value for n that makes s2->length == s1->length. By
1512 the same token, if the --truncate-set1 option is not
1513 given, S2 may be extended. */
1516 validate (struct Spec_list *s1, struct Spec_list *s2)
1518 get_s1_spec_stats (s1);
1519 if (s1->n_indefinite_repeats > 0)
1521 error (EXIT_FAILURE, 0,
1522 _("the [c*] repeat construct may not appear in string1"));
1527 get_s2_spec_stats (s2, s1->length);
1529 if (s2->n_indefinite_repeats > 1)
1531 error (EXIT_FAILURE, 0,
1532 _("only one [c*] repeat construct may appear in string2"));
1537 if (s2->has_equiv_class)
1539 error (EXIT_FAILURE, 0,
1540 _("[=c=] expressions may not appear in string2 \
1541 when translating"));
1544 if (s1->length > s2->length)
1548 /* string2 must be non-empty unless --truncate-set1 is
1549 given or string1 is empty. */
1551 if (s2->length == 0)
1552 error (EXIT_FAILURE, 0,
1553 _("when not truncating set1, string2 must be non-empty"));
1554 string2_extend (s1, s2);
1558 if (complement && s1->has_char_class
1559 && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1561 error (EXIT_FAILURE, 0,
1562 _("when translating with complemented character classes,\
1563 \nstring2 must map all characters in the domain to one"));
1566 if (s2->has_restricted_char_class)
1568 error (EXIT_FAILURE, 0,
1569 _("when translating, the only character classes that may \
1570 appear in\nstring2 are `upper' and `lower'"));
1574 /* Not translating. */
1576 if (s2->n_indefinite_repeats > 0)
1577 error (EXIT_FAILURE, 0,
1578 _("the [c*] construct may appear in string2 only \
1579 when translating"));
1584 /* Read buffers of SIZE bytes via the function READER (if READER is
1585 NULL, read from stdin) until EOF. When non-NULL, READER is either
1586 read_and_delete or read_and_xlate. After each buffer is read, it is
1587 processed and written to stdout. The buffers are processed so that
1588 multiple consecutive occurrences of the same character in the input
1589 stream are replaced by a single occurrence of that character if the
1590 character is in the squeeze set. */
1593 squeeze_filter (unsigned char *buf, size_t size, Filter reader)
1595 unsigned int char_to_squeeze = NOT_A_CHAR;
1607 ssize_t signed_nr = safe_read (0, (char *) buf, size);
1609 error (EXIT_FAILURE, errno, _("read error"));
1614 nr = (*reader) (buf, size, NULL);
1624 if (char_to_squeeze == NOT_A_CHAR)
1627 /* Here, by being a little tricky, we can get a significant
1628 performance increase in most cases when the input is
1629 reasonably large. Since tr will modify the input only
1630 if two consecutive (and identical) input characters are
1631 in the squeeze set, we can step by two through the data
1632 when searching for a character in the squeeze set. This
1633 means there may be a little more work in a few cases and
1634 perhaps twice as much work in the worst cases where most
1635 of the input is removed by squeezing repeats. But most
1636 uses of this functionality seem to remove less than 20-30%
1638 for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1641 /* There is a special case when i == nr and we've just
1642 skipped a character (the last one in buf) that is in
1644 if (i == nr && in_squeeze_set[buf[i - 1]])
1648 out_len = nr - begin;
1651 char_to_squeeze = buf[i];
1652 /* We're about to output buf[begin..i]. */
1653 out_len = i - begin + 1;
1655 /* But since we stepped by 2 in the loop above,
1656 out_len may be one too large. */
1657 if (i > 0 && buf[i - 1] == char_to_squeeze)
1660 /* Advance i to the index of first character to be
1661 considered when looking for a char different from
1666 && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1667 error (EXIT_FAILURE, errno, _("write error"));
1670 if (char_to_squeeze != NOT_A_CHAR)
1672 /* Advance i to index of first char != char_to_squeeze
1673 (or to nr if all the rest of the characters in this
1674 buffer are the same as char_to_squeeze). */
1675 for (; i < nr && buf[i] == char_to_squeeze; i++)
1678 char_to_squeeze = NOT_A_CHAR;
1679 /* If (i >= nr) we've squeezed the last character in this buffer.
1680 So now we have to read a new buffer and continue comparing
1681 characters against char_to_squeeze. */
1686 /* Read buffers of SIZE bytes from stdin until one is found that
1687 contains at least one character not in the delete set. Store
1688 in the array BUF, all characters from that buffer that are not
1689 in the delete set, and return the number of characters saved
1693 read_and_delete (unsigned char *buf, size_t size, Filter not_used)
1696 static int hit_eof = 0;
1698 assert (not_used == NULL);
1703 /* This enclosing do-while loop is to make sure that
1704 we don't return zero (indicating EOF) when we've
1705 just deleted all the characters in a buffer. */
1709 ssize_t nr = safe_read (0, (char *) buf, size);
1712 error (EXIT_FAILURE, errno, _("read error"));
1719 /* This first loop may be a waste of code, but gives much
1720 better performance when no characters are deleted in
1721 the beginning of a buffer. It just avoids the copying
1722 of buf[i] into buf[n_saved] when it would be a NOP. */
1724 for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1728 for (++i; i < nr; i++)
1729 if (!in_delete_set[buf[i]])
1730 buf[n_saved++] = buf[i];
1732 while (n_saved == 0);
1737 /* Read at most SIZE bytes from stdin into the array BUF. Then
1738 perform the in-place and one-to-one mapping specified by the global
1739 array `xlate'. Return the number of characters read, or 0 upon EOF. */
1742 read_and_xlate (unsigned char *buf, size_t size, Filter not_used)
1744 ssize_t chars_read = 0;
1745 static int hit_eof = 0;
1748 assert (not_used == NULL);
1753 chars_read = safe_read (0, (char *) buf, size);
1755 error (EXIT_FAILURE, errno, _("read error"));
1756 if (chars_read == 0)
1762 for (i = 0; i < chars_read; i++)
1763 buf[i] = xlate[buf[i]];
1768 /* Initialize a boolean membership set IN_SET with the character
1769 values obtained by traversing the linked list of constructs S
1770 using the function `get_next'. If COMPLEMENT_THIS_SET is
1771 nonzero the resulting set is complemented. */
1774 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1779 memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1780 s->state = BEGIN_STATE;
1781 while ((c = get_next (s, NULL)) != -1)
1783 if (complement_this_set)
1784 for (i = 0; i < N_CHARS; i++)
1785 in_set[i] = (!in_set[i]);
1789 main (int argc, char **argv)
1792 int non_option_args;
1793 struct Spec_list buf1, buf2;
1794 struct Spec_list *s1 = &buf1;
1795 struct Spec_list *s2 = &buf2;
1797 program_name = argv[0];
1798 setlocale (LC_ALL, "");
1799 bindtextdomain (PACKAGE, LOCALEDIR);
1800 textdomain (PACKAGE);
1802 atexit (close_stdout);
1804 while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1820 squeeze_repeats = 1;
1827 case_GETOPT_HELP_CHAR;
1829 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1837 posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1839 non_option_args = argc - optind;
1840 translating = (non_option_args == 2 && !delete);
1842 /* Change this test if it is valid to give tr no options and
1843 no args at all. POSIX doesn't specifically say anything
1844 either way, but it looks like they implied it's invalid
1845 by omission. If you want to make tr do a slow imitation
1846 of `cat' use `tr a a'. */
1847 if (non_option_args > 2)
1849 error (0, 0, _("too many arguments"));
1853 if (!delete && !squeeze_repeats && non_option_args != 2)
1854 error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1856 if (delete && squeeze_repeats && non_option_args != 2)
1857 error (EXIT_FAILURE, 0, _("two strings must be given when both \
1858 deleting and squeezing repeats"));
1860 /* If --delete is given without --squeeze-repeats, then
1861 only one string argument may be specified. But POSIX
1862 says to ignore any string2 in this case, so if POSIXLY_CORRECT
1863 is set, pretend we never saw string2. But I think
1864 this deserves a fatal error, so that's the default. */
1865 if ((delete && !squeeze_repeats) && non_option_args != 1)
1867 if (posix_pedantic && non_option_args == 2)
1870 error (EXIT_FAILURE, 0,
1871 _("only one string may be given when deleting \
1872 without squeezing repeats"));
1875 if (squeeze_repeats && non_option_args == 0)
1876 error (EXIT_FAILURE, 0,
1877 _("at least one string must be given when squeezing repeats"));
1880 if (parse_str ((unsigned char *) argv[optind], s1))
1881 exit (EXIT_FAILURE);
1883 if (non_option_args == 2)
1886 if (parse_str ((unsigned char *) argv[optind + 1], s2))
1887 exit (EXIT_FAILURE);
1894 /* Use binary I/O, since `tr' is sometimes used to transliterate
1895 non-printable characters, or characters which are stripped away
1896 by text-mode reads (like CR and ^Z). */
1897 SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);
1899 if (squeeze_repeats && non_option_args == 1)
1901 set_initialize (s1, complement, in_squeeze_set);
1902 squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1904 else if (delete && non_option_args == 1)
1908 set_initialize (s1, complement, in_delete_set);
1911 nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1912 if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1913 error (EXIT_FAILURE, errno, _("write error"));
1917 else if (squeeze_repeats && delete && non_option_args == 2)
1919 set_initialize (s1, complement, in_delete_set);
1920 set_initialize (s2, 0, in_squeeze_set);
1921 squeeze_filter (io_buf, IO_BUF_SIZE, read_and_delete);
1923 else if (translating)
1928 SET_TYPE *in_s1 = in_delete_set;
1930 set_initialize (s1, 0, in_s1);
1931 s2->state = BEGIN_STATE;
1932 for (i = 0; i < N_CHARS; i++)
1934 for (i = 0; i < N_CHARS; i++)
1938 int ch = get_next (s2, NULL);
1939 assert (ch != -1 || truncate_set1);
1942 /* This will happen when tr is invoked like e.g.
1943 tr -cs A-Za-z0-9 '\012'. */
1949 assert (get_next (s2, NULL) == -1 || truncate_set1);
1955 enum Upper_Lower_class class_s1;
1956 enum Upper_Lower_class class_s2;
1958 for (i = 0; i < N_CHARS; i++)
1960 s1->state = BEGIN_STATE;
1961 s2->state = BEGIN_STATE;
1964 c1 = get_next (s1, &class_s1);
1965 c2 = get_next (s2, &class_s2);
1966 if (!class_ok[(int) class_s1][(int) class_s2])
1967 error (EXIT_FAILURE, 0,
1968 _("misaligned [:upper:] and/or [:lower:] construct"));
1970 if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1972 for (i = 0; i < N_CHARS; i++)
1974 xlate[i] = toupper (i);
1976 else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1978 for (i = 0; i < N_CHARS; i++)
1980 xlate[i] = tolower (i);
1982 else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1983 || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1985 /* By default, GNU tr permits the identity mappings: from
1986 [:upper:] to [:upper:] and [:lower:] to [:lower:]. But
1987 when POSIXLY_CORRECT is set, those evoke diagnostics. */
1990 error (EXIT_FAILURE, 0,
1992 invalid identity mapping; when translating, any [:lower:] or [:upper:]\n\
1993 construct in string1 must be aligned with a corresponding construct\n\
1994 ([:upper:] or [:lower:], respectively) in string2"));
1999 /* The following should have been checked by validate... */
2000 if (c1 == -1 || c2 == -1)
2005 assert (c1 == -1 || truncate_set1);
2007 if (squeeze_repeats)
2009 set_initialize (s2, 0, in_squeeze_set);
2010 squeeze_filter (io_buf, IO_BUF_SIZE, read_and_xlate);
2018 chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2020 && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2021 error (EXIT_FAILURE, errno, _("write error"));
2023 while (chars_read > 0);
2027 if (close (STDIN_FILENO) != 0)
2028 error (EXIT_FAILURE, errno, _("standard input"));
2030 exit (EXIT_SUCCESS);