1 /* tr -- a filter to translate characters
2 Copyright (C) 91, 1995-1998, 1999, 2000, 2001 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by Jim Meyering */
25 #include <sys/types.h>
31 #include "safe-read.h"
34 /* The official name of this program (e.g., no `g' prefix). */
35 #define PROGRAM_NAME "tr"
37 #define AUTHORS "Jim Meyering"
39 #define N_CHARS (UCHAR_MAX + 1)
41 /* A pointer to a filtering function. */
42 typedef size_t (*Filter) (/* unsigned char *, size_t, Filter */);
44 /* Convert from character C to its index in the collating
45 sequence array. Just cast to an unsigned int to avoid
46 problems with sign-extension. */
47 #define ORD(c) (unsigned int)(c)
49 /* The inverse of ORD. */
50 #define CHR(i) (unsigned char)(i)
52 /* The value for Spec_list->state that indicates to
53 get_next that it should initialize the tail pointer.
54 Its value should be as large as possible to avoid conflict
55 a valid value for the state field -- and that may be as
56 large as any valid repeat_count. */
57 #define BEGIN_STATE (INT_MAX - 1)
59 /* The value for Spec_list->state that indicates to
60 get_next that the element pointed to by Spec_list->tail is
61 being considered for the first time on this pass through the
62 list -- it indicates that get_next should make any necessary
64 #define NEW_ELEMENT (BEGIN_STATE + 1)
66 /* A value distinct from any character that may have been stored in a
67 buffer as the result of a block-read in the function squeeze_filter. */
68 #define NOT_A_CHAR (unsigned int)(-1)
70 /* The following (but not CC_NO_CLASS) are indices into the array of
71 valid character class strings. */
74 CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
75 CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
76 CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
80 /* Character class to which a character (returned by get_next) belonged;
81 but it is set only if the construct from which the character was obtained
82 was one of the character classes [:upper:] or [:lower:]. The value
83 is used only when translating and then, only to make sure that upper
84 and lower class constructs have the same relative positions in string1
86 enum Upper_Lower_class
93 /* A shortcut to ensure that when constructing the translation array,
94 one of the values returned by paired calls to get_next (from s1 and s2)
95 is from [:upper:] and the other is from [:lower:], or neither is from
96 upper or lower. By default, GNU tr permits the identity mappings: from
97 [:upper:] to [:upper:] and [:lower:] to [:lower:]. But when
98 POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
99 by values of type enum Upper_Lower_class. */
100 static int const class_ok[3][3] =
107 /* The type of a List_element. See build_spec_list for more details. */
108 enum Range_element_type
118 /* One construct in one of tr's argument strings.
119 For example, consider the POSIX version of the classic tr command:
120 tr -cs 'a-zA-Z_' '[\n*]'
121 String1 has 3 constructs, two of which are ranges (a-z and A-Z),
122 and a single normal character, `_'. String2 has one construct. */
125 enum Range_element_type type;
126 struct List_element *next;
132 unsigned int first_char;
133 unsigned int last_char;
136 enum Char_class char_class;
140 unsigned int the_repeated_char;
148 /* Each of tr's argument strings is parsed into a form that is easier
149 to work with: a linked list of constructs (struct List_element).
150 Each Spec_list structure also encapsulates various attributes of
151 the corresponding argument string. The attributes are used mainly
152 to verify that the strings are valid in the context of any options
153 specified (like -s, -d, or -c). The main exception is the member
154 `tail', which is first used to construct the list. After construction,
155 it is used by get_next to save its state when traversing the list.
156 The member `state' serves a similar function. */
159 /* Points to the head of the list of range elements.
160 The first struct is a dummy; its members are never used. */
161 struct List_element *head;
163 /* When appending, points to the last element. When traversing via
164 get_next(), points to the element to process next. Setting
165 Spec_list.state to the value BEGIN_STATE before calling get_next
166 signals get_next to initialize tail to point to head->next. */
167 struct List_element *tail;
169 /* Used to save state between calls to get_next(). */
172 /* Length, in the sense that length ('a-z[:digit:]123abc')
173 is 42 ( = 26 + 10 + 6). */
176 /* The number of [c*] and [c*0] constructs that appear in this spec. */
177 int n_indefinite_repeats;
179 /* If n_indefinite_repeats is nonzero, this points to the List_element
180 corresponding to the last [c*] or [c*0] construct encountered in
181 this spec. Otherwise it is undefined. */
182 struct List_element *indefinite_repeat_element;
184 /* Non-zero if this spec contains at least one equivalence
185 class construct e.g. [=c=]. */
188 /* Non-zero if this spec contains at least one character class
189 construct. E.g. [:digit:]. */
192 /* Non-zero if this spec contains at least one of the character class
193 constructs (all but upper and lower) that aren't allowed in s2. */
194 int has_restricted_char_class;
197 /* A representation for escaped string1 or string2. As a string is parsed,
198 any backslash-escaped characters (other than octal or \a, \b, \f, \n,
199 etc.) are marked as such in this structure by setting the corresponding
200 entry in the ESCAPED vector. */
208 /* Return nonzero if the Ith character of escaped string ES matches C
209 and is not escaped itself. */
210 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
212 /* The name by which this program was run. */
215 /* When nonzero, each sequence in the input of a repeated character
216 (call it c) is replaced (in the output) by a single occurrence of c
217 for every c in the squeeze set. */
218 static int squeeze_repeats = 0;
220 /* When nonzero, removes characters in the delete set from input. */
221 static int delete = 0;
223 /* Use the complement of set1 in place of set1. */
224 static int complement = 0;
226 /* When nonzero, this flag causes GNU tr to provide strict
227 compliance with POSIX draft 1003.2.11.2. The POSIX spec
228 says that when -d is used without -s, string2 (if present)
229 must be ignored. Silently ignoring arguments is a bad idea.
230 The default GNU behavior is to give a usage message and exit.
231 Additionally, when this flag is nonzero, tr prints warnings
232 on stderr if it is being used in a manner that is not portable.
233 Applicable warnings are given by default, but are suppressed
234 if the environment variable `POSIXLY_CORRECT' is set, since
235 being POSIX conformant means we can't issue such messages.
236 Warnings on the following topics are suppressed when this
238 1. Ambiguous octal escapes. */
239 static int posix_pedantic;
241 /* When tr is performing translation and string1 is longer than string2,
242 POSIX says that the result is undefined. That gives the implementor
243 of a POSIX conforming version of tr two reasonable choices for the
244 semantics of this case.
246 * The BSD tr pads string2 to the length of string1 by
247 repeating the last character in string2.
249 * System V tr ignores characters in string1 that have no
250 corresponding character in string2. That is, string1 is effectively
251 truncated to the length of string2.
253 When nonzero, this flag causes GNU tr to imitate the behavior
254 of System V tr when translating with string1 longer than string2.
255 The default is to emulate BSD tr. This flag is ignored in modes where
256 no translation is performed. Emulating the System V tr
257 in this exceptional case causes the relatively common BSD idiom:
259 tr -cs A-Za-z0-9 '\012'
261 to break (it would convert only zero bytes, rather than all
262 non-alphanumerics, to newlines).
264 WARNING: This switch does not provide general BSD or System V
265 compatibility. For example, it doesn't disable the interpretation
266 of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
267 some unfortunate coincidence you use such constructs in scripts
268 expecting to use some other version of tr, the scripts will break. */
269 static int truncate_set1 = 0;
271 /* An alias for (!delete && non_option_args == 2).
272 It is set in main and used there and in validate(). */
273 static int translating;
279 #define IO_BUF_SIZE BUFSIZ
280 static unsigned char io_buf[IO_BUF_SIZE];
282 static char const *const char_class_name[] =
284 "alnum", "alpha", "blank", "cntrl", "digit", "graph",
285 "lower", "print", "punct", "space", "upper", "xdigit"
287 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
289 typedef char SET_TYPE;
291 /* Array of boolean values. A character `c' is a member of the
292 squeeze set if and only if in_squeeze_set[c] is true. The squeeze
293 set is defined by the last (possibly, the only) string argument
294 on the command line when the squeeze option is given. */
295 static SET_TYPE in_squeeze_set[N_CHARS];
297 /* Array of boolean values. A character `c' is a member of the
298 delete set if and only if in_delete_set[c] is true. The delete
299 set is defined by the first (or only) string argument on the
300 command line when the delete option is given. */
301 static SET_TYPE in_delete_set[N_CHARS];
303 /* Array of character values defining the translation (if any) that
304 tr is to perform. Translation is performed only when there are
305 two specification strings and the delete switch is not given. */
306 static char xlate[N_CHARS];
308 static struct option const long_options[] =
310 {"complement", no_argument, NULL, 'c'},
311 {"delete", no_argument, NULL, 'd'},
312 {"squeeze-repeats", no_argument, NULL, 's'},
313 {"truncate-set1", no_argument, NULL, 't'},
314 {GETOPT_HELP_OPTION_DECL},
315 {GETOPT_VERSION_OPTION_DECL},
323 fprintf (stderr, _("Try `%s --help' for more information.\n"),
328 Usage: %s [OPTION]... SET1 [SET2]\n\
332 Translate, squeeze, and/or delete characters from standard input,\n\
333 writing to standard output.\n\
335 -c, --complement first complement SET1\n\
336 -d, --delete delete characters in SET1, do not translate\n\
337 -s, --squeeze-repeats replace sequence of characters with one\n\
338 -t, --truncate-set1 first truncate SET1 to length of SET2\n\
339 --help display this help and exit\n\
340 --version output version information and exit\n\
344 SETs are specified as strings of characters. Most represent themselves.\n\
345 Interpreted sequences are:\n\
347 \\NNN character with octal value NNN (1 to 3 octal digits)\n\
354 \\t horizontal tab\n\
358 CHAR1-CHAR2 all characters from CHAR1 to CHAR2 in ascending order\n\
359 [CHAR*] in SET2, copies of CHAR until length of SET1\n\
360 [CHAR*REPEAT] REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
361 [:alnum:] all letters and digits\n\
362 [:alpha:] all letters\n\
363 [:blank:] all horizontal whitespace\n\
364 [:cntrl:] all control characters\n\
365 [:digit:] all digits\n\
368 [:graph:] all printable characters, not including space\n\
369 [:lower:] all lower case letters\n\
370 [:print:] all printable characters, including space\n\
371 [:punct:] all punctuation characters\n\
372 [:space:] all horizontal or vertical whitespace\n\
373 [:upper:] all upper case letters\n\
374 [:xdigit:] all hexadecimal digits\n\
375 [=CHAR=] all characters which are equivalent to CHAR\n\
379 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
380 -t may be used only when translating. SET2 is extended to length of\n\
381 SET1 by repeating its last character as necessary. Excess characters\n\
382 of SET2 are ignored. Only [:lower:] and [:upper:] are guaranteed to\n\
385 expand in ascending order; used in SET2 while translating, they may\n\
386 only be used in pairs to specify case conversion. -s uses SET1 if not\n\
387 translating nor deleting; else squeezing uses SET2 and occurs after\n\
388 translation or deletion.\n\
390 puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
392 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
395 /* Return nonzero if the character C is a member of the
396 equivalence class containing the character EQUIV_CLASS. */
399 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
401 return (equiv_class == c);
404 /* Return nonzero if the character C is a member of the
405 character class CHAR_CLASS. */
408 is_char_class_member (enum Char_class char_class, unsigned int c)
415 result = ISALNUM (c);
418 result = ISALPHA (c);
421 result = ISBLANK (c);
424 result = ISCNTRL (c);
427 result = ISDIGIT_LOCALE (c);
430 result = ISGRAPH (c);
433 result = ISLOWER (c);
436 result = ISPRINT (c);
439 result = ISPUNCT (c);
442 result = ISSPACE (c);
445 result = ISUPPER (c);
448 result = ISXDIGIT (c);
458 es_free (struct E_string *es)
464 /* Perform the first pass over each range-spec argument S, converting all
465 \c and \ddd escapes to their one-byte representations. The conversion
466 is done in-place, so S must point to writable storage. If an invalid
467 quote sequence is found print an error message and return nonzero.
468 Otherwise set *LEN to the length of the resulting string and return
469 zero. The resulting array of characters may contain zero-bytes;
470 however, on input, S is assumed to be null-terminated, and hence
471 cannot contain actual (non-escaped) zero bytes. */
474 unquote (const unsigned char *s, struct E_string *es)
479 len = strlen ((char *) s);
481 es->s = (unsigned char *) xmalloc (len);
482 es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
483 for (i = 0; i < len; i++)
487 for (i = 0; s[i]; i++)
529 oct_digit = s[i + 2] - '0';
530 if (0 <= oct_digit && oct_digit <= 7)
532 c = 8 * c + oct_digit;
534 oct_digit = s[i + 2] - '0';
535 if (0 <= oct_digit && oct_digit <= 7)
537 if (8 * c + oct_digit < N_CHARS)
539 c = 8 * c + oct_digit;
542 else if (!posix_pedantic)
544 /* Any octal number larger than 0377 won't
545 fit in 8 bits. So we stop when adding the
546 next digit would put us over the limit and
547 give a warning about the ambiguity. POSIX
548 isn't clear on this, but one person has said
549 that in his interpretation, POSIX says tr
550 can't even give a warning. */
551 error (0, 0, _("warning: the ambiguous octal escape \
552 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
553 s[i], s[i + 1], s[i + 2],
554 s[i], s[i + 1], s[i + 2]);
560 error (0, 0, _("invalid backslash escape at end of string"));
566 error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
587 /* If CLASS_STR is a valid character class string, return its index
588 in the global char_class_name array. Otherwise, return CC_NO_CLASS. */
590 static enum Char_class
591 look_up_char_class (const unsigned char *class_str, size_t len)
595 for (i = 0; i < N_CHAR_CLASSES; i++)
596 if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
597 && strlen (char_class_name[i]) == len)
598 return (enum Char_class) i;
602 /* Return a newly allocated string with a printable version of C.
603 This function is used solely for formatting error messages. */
606 make_printable_char (unsigned int c)
608 char *buf = xmalloc (5);
610 assert (c < N_CHARS);
618 sprintf (buf, "\\%03o", c);
623 /* Return a newly allocated copy of S which is suitable for printing.
624 LEN is the number of characters in S. Most non-printing
625 (isprint) characters are represented by a backslash followed by
626 3 octal digits. However, the characters represented by \c escapes
627 where c is one of [abfnrtv] are represented by their 2-character \c
628 sequences. This function is used solely for printing error messages. */
631 make_printable_str (const unsigned char *s, size_t len)
633 /* Worst case is that every character expands to a backslash
634 followed by a 3-character octal escape sequence. */
635 char *printable_buf = xmalloc (4 * len + 1);
636 char *p = printable_buf;
639 for (i = 0; i < len; i++)
677 sprintf (buf, "\\%03o", s[i]);
683 return printable_buf;
686 /* Append a newly allocated structure representing a
687 character C to the specification list LIST. */
690 append_normal_char (struct Spec_list *list, unsigned int c)
692 struct List_element *new;
694 new = (struct List_element *) xmalloc (sizeof (struct List_element));
696 new->type = RE_NORMAL_CHAR;
697 new->u.normal_char = c;
699 list->tail->next = new;
703 /* Append a newly allocated structure representing the range
704 of characters from FIRST to LAST to the specification list LIST.
705 Return nonzero if LAST precedes FIRST in the collating sequence,
706 zero otherwise. This means that '[c-c]' is acceptable. */
709 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
711 struct List_element *new;
713 if (ORD (first) > ORD (last))
715 char *tmp1 = make_printable_char (first);
716 char *tmp2 = make_printable_char (last);
719 _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
725 new = (struct List_element *) xmalloc (sizeof (struct List_element));
727 new->type = RE_RANGE;
728 new->u.range.first_char = first;
729 new->u.range.last_char = last;
731 list->tail->next = new;
736 /* If CHAR_CLASS_STR is a valid character class string, append a
737 newly allocated structure representing that character class to the end
738 of the specification list LIST and return 0. If CHAR_CLASS_STR is not
739 a valid string return nonzero. */
742 append_char_class (struct Spec_list *list,
743 const unsigned char *char_class_str, size_t len)
745 enum Char_class char_class;
746 struct List_element *new;
748 char_class = look_up_char_class (char_class_str, len);
749 if (char_class == CC_NO_CLASS)
751 new = (struct List_element *) xmalloc (sizeof (struct List_element));
753 new->type = RE_CHAR_CLASS;
754 new->u.char_class = char_class;
756 list->tail->next = new;
761 /* Append a newly allocated structure representing a [c*n]
762 repeated character construct to the specification list LIST.
763 THE_CHAR is the single character to be repeated, and REPEAT_COUNT
764 is a non-negative repeat count. */
767 append_repeated_char (struct Spec_list *list, unsigned int the_char,
770 struct List_element *new;
772 new = (struct List_element *) xmalloc (sizeof (struct List_element));
774 new->type = RE_REPEATED_CHAR;
775 new->u.repeated_char.the_repeated_char = the_char;
776 new->u.repeated_char.repeat_count = repeat_count;
778 list->tail->next = new;
782 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
783 the length of that string, LEN, if LEN is exactly one, append
784 a newly allocated structure representing the specified
785 equivalence class to the specification list, LIST and return zero.
786 If LEN is not 1, return nonzero. */
789 append_equiv_class (struct Spec_list *list,
790 const unsigned char *equiv_class_str, size_t len)
792 struct List_element *new;
796 new = (struct List_element *) xmalloc (sizeof (struct List_element));
798 new->type = RE_EQUIV_CLASS;
799 new->u.equiv_code = *equiv_class_str;
801 list->tail->next = new;
806 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
807 The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
808 NUL bytes, and is *not* NUL-terminated. */
810 static unsigned char *
811 substr (const unsigned char *p, size_t first_idx, size_t last_idx)
816 assert (first_idx <= last_idx);
817 len = last_idx - first_idx + 1;
818 tmp = (unsigned char *) xmalloc (len);
820 assert (first_idx <= last_idx);
821 /* Use memcpy rather than strncpy because `p' may contain zero-bytes. */
822 memcpy (tmp, p + first_idx, len);
826 /* Search forward starting at START_IDX for the 2-char sequence
827 (PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such
828 a sequence is found, set *RESULT_IDX to the index of the first
829 character and return nonzero. Otherwise return zero. P may contain
833 find_closing_delim (const struct E_string *es, size_t start_idx,
834 unsigned int pre_bracket_char, size_t *result_idx)
838 for (i = start_idx; i < es->len - 1; i++)
839 if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
840 && !es->escaped[i] && !es->escaped[i + 1])
848 /* Parse the bracketed repeat-char syntax. If the P_LEN characters
849 beginning with P[ START_IDX ] comprise a valid [c*n] construct,
850 then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
851 and return zero. If the second character following
852 the opening bracket is not `*' or if no closing bracket can be
853 found, return -1. If a closing bracket is found and the
854 second char is `*', but the string between the `*' and `]' isn't
855 empty, an octal number, or a decimal number, print an error message
859 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
860 unsigned int *char_to_repeat, size_t *repeat_count,
861 size_t *closing_bracket_idx)
865 assert (start_idx + 1 < es->len);
866 if (!ES_MATCH (es, start_idx + 1, '*'))
869 for (i = start_idx + 2; i < es->len; i++)
871 if (ES_MATCH (es, i, ']'))
873 size_t digit_str_len = i - start_idx - 2;
875 *char_to_repeat = es->s[start_idx];
876 if (digit_str_len == 0)
878 /* We've matched [c*] -- no explicit repeat count. */
880 *closing_bracket_idx = i;
884 /* Here, we have found [c*s] where s should be a string
885 of octal (if it starts with `0') or decimal digits. */
887 const char *digit_str = (const char *) &es->s[start_idx + 2];
888 unsigned long int tmp_ulong;
891 /* Select the base manually so we can be sure it's either 8 or 10.
892 If the spec allowed it to be interpreted as hexadecimal, we
893 could have used `0' and let xstrtoul decide. */
894 if (*digit_str == '0')
900 if (xstrtoul (digit_str, &d_end, base, &tmp_ulong, NULL)
902 || BEGIN_STATE < tmp_ulong
903 || d_end - digit_str != digit_str_len)
905 char *tmp = make_printable_str (es->s + start_idx + 2,
907 error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
912 *repeat_count = tmp_ulong;
914 *closing_bracket_idx = i;
918 return -1; /* No bracket found. */
921 /* Return nonzero if the string at ES->s[IDX] matches the regular
922 expression `\*[0-9]*\]', zero otherwise. To match, the `*' and
923 the `]' must not be escaped. */
926 star_digits_closebracket (const struct E_string *es, size_t idx)
930 if (!ES_MATCH (es, idx, '*'))
933 for (i = idx + 1; i < es->len; i++)
935 if (!ISDIGIT (es->s[i]))
937 if (ES_MATCH (es, i, ']'))
945 /* Convert string UNESCAPED_STRING (which has been preprocessed to
946 convert backslash-escape sequences) of length LEN characters into
947 a linked list of the following 5 types of constructs:
948 - [:str:] Character class where `str' is one of the 12 valid strings.
949 - [=c=] Equivalence class where `c' is any single character.
950 - [c*n] Repeat the single character `c' `n' times. n may be omitted.
951 However, if `n' is present, it must be a non-negative octal or
953 - r-s Range of characters from `r' to `s'. The second endpoint must
954 not precede the first in the current collating sequence.
955 - c Any other character is interpreted as itself. */
958 build_spec_list (const struct E_string *es, struct Spec_list *result)
960 const unsigned char *p;
965 /* The main for-loop below recognizes the 4 multi-character constructs.
966 A character that matches (in its context) none of the multi-character
967 constructs is classified as `normal'. Since all multi-character
968 constructs have at least 3 characters, any strings of length 2 or
969 less are composed solely of normal characters. Hence, the index of
970 the outer for-loop runs only as far as LEN-2. */
972 for (i = 0; i + 2 < es->len; /* empty */)
974 if (ES_MATCH (es, i, '['))
976 int matched_multi_char_construct;
977 size_t closing_bracket_idx;
978 unsigned int char_to_repeat;
982 matched_multi_char_construct = 1;
983 if (ES_MATCH (es, i + 1, ':')
984 || ES_MATCH (es, i + 1, '='))
986 size_t closing_delim_idx;
989 found = find_closing_delim (es, i + 2, p[i + 1],
994 unsigned char *opnd_str = substr (p, i + 2,
995 closing_delim_idx - 1);
996 size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
1000 parse_failed = append_char_class (result, opnd_str,
1003 /* FIXME: big comment. */
1006 if (star_digits_closebracket (es, i + 2))
1009 goto try_bracketed_repeat;
1013 char *tmp = make_printable_str (opnd_str,
1015 error (0, 0, _("invalid character class `%s'"),
1024 parse_failed = append_equiv_class (result, opnd_str,
1027 /* FIXME: big comment. */
1030 if (star_digits_closebracket (es, i + 2))
1033 goto try_bracketed_repeat;
1037 char *tmp = make_printable_str (opnd_str,
1040 _("%s: equivalence class operand must be a single character"),
1049 /* Return nonzero if append_*_class reports a problem. */
1053 i = closing_delim_idx + 2;
1056 /* Else fall through. This could be [:*] or [=*]. */
1059 try_bracketed_repeat:
1061 /* Determine whether this is a bracketed repeat range
1062 matching the RE \[.\*(dec_or_oct_number)?\]. */
1063 err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1065 &closing_bracket_idx);
1068 append_repeated_char (result, char_to_repeat, repeat_count);
1069 i = closing_bracket_idx + 1;
1073 matched_multi_char_construct = 0;
1077 /* Found a string that looked like [c*n] but the
1078 numeric part was invalid. */
1082 if (matched_multi_char_construct)
1085 /* We reach this point if P does not match [:str:], [=c=],
1086 [c*n], or [c*]. Now, see if P looks like a range `[-c'
1087 (from `[' to `c'). */
1090 /* Look ahead one char for ranges like a-z. */
1091 if (ES_MATCH (es, i + 1, '-'))
1093 if (append_range (result, p[i], p[i + 2]))
1099 append_normal_char (result, p[i]);
1104 /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1]. */
1105 for (; i < es->len; i++)
1106 append_normal_char (result, p[i]);
1111 /* Given a Spec_list S (with its saved state implicit in the values
1112 of its members `tail' and `state'), return the next single character
1113 in the expansion of S's constructs. If the last character of S was
1114 returned on the previous call or if S was empty, this function
1115 returns -1. For example, successive calls to get_next where S
1116 represents the spec-string 'a-d[y*3]' will return the sequence
1117 of values a, b, c, d, y, y, y, -1. Finally, if the construct from
1118 which the returned character comes is [:upper:] or [:lower:], the
1119 parameter CLASS is given a value to indicate which it was. Otherwise
1120 CLASS is set to UL_NONE. This value is used only when constructing
1121 the translation table to verify that any occurrences of upper and
1122 lower class constructs in the spec-strings appear in the same relative
1126 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1128 struct List_element *p;
1135 if (s->state == BEGIN_STATE)
1137 s->tail = s->head->next;
1138 s->state = NEW_ELEMENT;
1147 case RE_NORMAL_CHAR:
1148 return_val = p->u.normal_char;
1149 s->state = NEW_ELEMENT;
1154 if (s->state == NEW_ELEMENT)
1155 s->state = ORD (p->u.range.first_char);
1158 return_val = CHR (s->state);
1159 if (s->state == ORD (p->u.range.last_char))
1162 s->state = NEW_ELEMENT;
1170 switch (p->u.char_class)
1188 s->state = NEW_ELEMENT;
1194 if (s->state == NEW_ELEMENT)
1196 for (i = 0; i < N_CHARS; i++)
1197 if (is_char_class_member (p->u.char_class, i))
1199 assert (i < N_CHARS);
1202 assert (is_char_class_member (p->u.char_class, s->state));
1203 return_val = CHR (s->state);
1204 for (i = s->state + 1; i < N_CHARS; i++)
1205 if (is_char_class_member (p->u.char_class, i))
1212 s->state = NEW_ELEMENT;
1216 case RE_EQUIV_CLASS:
1217 /* FIXME: this assumes that each character is alone in its own
1218 equivalence class (which appears to be correct for my
1219 LC_COLLATE. But I don't know of any function that allows
1220 one to determine a character's equivalence class. */
1222 return_val = p->u.equiv_code;
1223 s->state = NEW_ELEMENT;
1227 case RE_REPEATED_CHAR:
1228 /* Here, a repeat count of n == 0 means don't repeat at all. */
1229 if (p->u.repeated_char.repeat_count == 0)
1232 s->state = NEW_ELEMENT;
1233 return_val = get_next (s, class);
1237 if (s->state == NEW_ELEMENT)
1242 return_val = p->u.repeated_char.the_repeated_char;
1243 if (p->u.repeated_char.repeat_count > 0
1244 && s->state == p->u.repeated_char.repeat_count)
1247 s->state = NEW_ELEMENT;
1264 /* This is a minor kludge. This function is called from
1265 get_spec_stats to determine the cardinality of a set derived
1266 from a complemented string. It's a kludge in that some of the
1267 same operations are (duplicated) performed in set_initialize. */
1270 card_of_complement (struct Spec_list *s)
1273 int cardinality = N_CHARS;
1274 SET_TYPE in_set[N_CHARS];
1276 memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1277 s->state = BEGIN_STATE;
1278 while ((c = get_next (s, NULL)) != -1)
1284 /* Gather statistics about the spec-list S in preparation for the tests
1285 in validate that determine the consistency of the specs. This function
1286 is called at most twice; once for string1, and again for any string2.
1287 LEN_S1 < 0 indicates that this is the first call and that S represents
1288 string1. When LEN_S1 >= 0, it is the length of the expansion of the
1289 constructs in string1, and we can use its value to resolve any
1290 indefinite repeat construct in S (which represents string2). Hence,
1291 this function has the side-effect that it converts a valid [c*]
1292 construct in string2 to [c*n] where n is large enough (or 0) to give
1293 string2 the same length as string1. For example, with the command
1294 tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1295 be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */
1298 get_spec_stats (struct Spec_list *s)
1300 struct List_element *p;
1303 s->n_indefinite_repeats = 0;
1304 s->has_equiv_class = 0;
1305 s->has_restricted_char_class = 0;
1306 s->has_char_class = 0;
1307 for (p = s->head->next; p; p = p->next)
1312 case RE_NORMAL_CHAR:
1317 assert (p->u.range.last_char >= p->u.range.first_char);
1318 len += p->u.range.last_char - p->u.range.first_char + 1;
1322 s->has_char_class = 1;
1323 for (i = 0; i < N_CHARS; i++)
1324 if (is_char_class_member (p->u.char_class, i))
1326 switch (p->u.char_class)
1332 s->has_restricted_char_class = 1;
1337 case RE_EQUIV_CLASS:
1338 for (i = 0; i < N_CHARS; i++)
1339 if (is_equiv_class_member (p->u.equiv_code, i))
1341 s->has_equiv_class = 1;
1344 case RE_REPEATED_CHAR:
1345 if (p->u.repeated_char.repeat_count > 0)
1346 len += p->u.repeated_char.repeat_count;
1347 else if (p->u.repeated_char.repeat_count == 0)
1349 s->indefinite_repeat_element = p;
1350 ++(s->n_indefinite_repeats);
1364 get_s1_spec_stats (struct Spec_list *s1)
1366 get_spec_stats (s1);
1368 s1->length = card_of_complement (s1);
1372 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1374 get_spec_stats (s2);
1375 if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1377 s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1378 len_s1 - s2->length;
1379 s2->length = len_s1;
1384 spec_init (struct Spec_list *spec_list)
1386 spec_list->head = spec_list->tail =
1387 (struct List_element *) xmalloc (sizeof (struct List_element));
1388 spec_list->head->next = NULL;
1391 /* This function makes two passes over the argument string S. The first
1392 one converts all \c and \ddd escapes to their one-byte representations.
1393 The second constructs a linked specification list, SPEC_LIST, of the
1394 characters and constructs that comprise the argument string. If either
1395 of these passes detects an error, this function returns nonzero. */
1398 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1403 fail = unquote (s, &es);
1405 fail = build_spec_list (&es, spec_list);
1410 /* Given two specification lists, S1 and S2, and assuming that
1411 S1->length > S2->length, append a single [c*n] element to S2 where c
1412 is the last character in the expansion of S2 and n is the difference
1413 between the two lengths.
1414 Upon successful completion, S2->length is set to S1->length. The only
1415 way this function can fail to make S2 as long as S1 is when S2 has
1416 zero-length, since in that case, there is no last character to repeat.
1417 So S2->length is required to be at least 1.
1419 Providing this functionality allows the user to do some pretty
1420 non-BSD (and non-portable) things: For example, the command
1421 tr -cs '[:upper:]0-9' '[:lower:]'
1422 is almost guaranteed to give results that depend on your collating
1426 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1428 struct List_element *p;
1432 assert (translating);
1433 assert (s1->length > s2->length);
1434 assert (s2->length > 0);
1439 case RE_NORMAL_CHAR:
1440 char_to_repeat = p->u.normal_char;
1443 char_to_repeat = p->u.range.last_char;
1446 for (i = N_CHARS; i >= 0; i--)
1447 if (is_char_class_member (p->u.char_class, i))
1450 char_to_repeat = CHR (i);
1453 case RE_REPEATED_CHAR:
1454 char_to_repeat = p->u.repeated_char.the_repeated_char;
1457 case RE_EQUIV_CLASS:
1458 /* This shouldn't happen, because validate exits with an error
1459 if it finds an equiv class in string2 when translating. */
1472 append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1473 s2->length = s1->length;
1476 /* Return non-zero if S is a non-empty list in which exactly one
1477 character (but potentially, many instances of it) appears.
1478 E.g. [X*] or xxxxxxxx. */
1481 homogeneous_spec_list (struct Spec_list *s)
1485 s->state = BEGIN_STATE;
1487 if ((b = get_next (s, NULL)) == -1)
1490 while ((c = get_next (s, NULL)) != -1)
1497 /* Die with an error message if S1 and S2 describe strings that
1498 are not valid with the given command line switches.
1499 A side effect of this function is that if a valid [c*] or
1500 [c*0] construct appears in string2, it is converted to [c*n]
1501 with a value for n that makes s2->length == s1->length. By
1502 the same token, if the --truncate-set1 option is not
1503 given, S2 may be extended. */
1506 validate (struct Spec_list *s1, struct Spec_list *s2)
1508 get_s1_spec_stats (s1);
1509 if (s1->n_indefinite_repeats > 0)
1511 error (EXIT_FAILURE, 0,
1512 _("the [c*] repeat construct may not appear in string1"));
1517 get_s2_spec_stats (s2, s1->length);
1519 if (s2->n_indefinite_repeats > 1)
1521 error (EXIT_FAILURE, 0,
1522 _("only one [c*] repeat construct may appear in string2"));
1527 if (s2->has_equiv_class)
1529 error (EXIT_FAILURE, 0,
1530 _("[=c=] expressions may not appear in string2 \
1531 when translating"));
1534 if (s1->length > s2->length)
1538 /* string2 must be non-empty unless --truncate-set1 is
1539 given or string1 is empty. */
1541 if (s2->length == 0)
1542 error (EXIT_FAILURE, 0,
1543 _("when not truncating set1, string2 must be non-empty"));
1544 string2_extend (s1, s2);
1548 if (complement && s1->has_char_class
1549 && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1551 error (EXIT_FAILURE, 0,
1552 _("when translating with complemented character classes,\
1553 \nstring2 must map all characters in the domain to one"));
1556 if (s2->has_restricted_char_class)
1558 error (EXIT_FAILURE, 0,
1559 _("when translating, the only character classes that may \
1560 appear in\nstring2 are `upper' and `lower'"));
1564 /* Not translating. */
1566 if (s2->n_indefinite_repeats > 0)
1567 error (EXIT_FAILURE, 0,
1568 _("the [c*] construct may appear in string2 only \
1569 when translating"));
1574 /* Read buffers of SIZE bytes via the function READER (if READER is
1575 NULL, read from stdin) until EOF. When non-NULL, READER is either
1576 read_and_delete or read_and_xlate. After each buffer is read, it is
1577 processed and written to stdout. The buffers are processed so that
1578 multiple consecutive occurrences of the same character in the input
1579 stream are replaced by a single occurrence of that character if the
1580 character is in the squeeze set. */
1583 squeeze_filter (unsigned char *buf, size_t size, Filter reader)
1585 unsigned int char_to_squeeze = NOT_A_CHAR;
1597 ssize_t signed_nr = safe_read (0, (char *) buf, size);
1599 error (EXIT_FAILURE, errno, _("read error"));
1604 nr = (*reader) (buf, size, NULL);
1614 if (char_to_squeeze == NOT_A_CHAR)
1617 /* Here, by being a little tricky, we can get a significant
1618 performance increase in most cases when the input is
1619 reasonably large. Since tr will modify the input only
1620 if two consecutive (and identical) input characters are
1621 in the squeeze set, we can step by two through the data
1622 when searching for a character in the squeeze set. This
1623 means there may be a little more work in a few cases and
1624 perhaps twice as much work in the worst cases where most
1625 of the input is removed by squeezing repeats. But most
1626 uses of this functionality seem to remove less than 20-30%
1628 for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1631 /* There is a special case when i == nr and we've just
1632 skipped a character (the last one in buf) that is in
1634 if (i == nr && in_squeeze_set[buf[i - 1]])
1638 out_len = nr - begin;
1641 char_to_squeeze = buf[i];
1642 /* We're about to output buf[begin..i]. */
1643 out_len = i - begin + 1;
1645 /* But since we stepped by 2 in the loop above,
1646 out_len may be one too large. */
1647 if (i > 0 && buf[i - 1] == char_to_squeeze)
1650 /* Advance i to the index of first character to be
1651 considered when looking for a char different from
1656 && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1657 error (EXIT_FAILURE, errno, _("write error"));
1660 if (char_to_squeeze != NOT_A_CHAR)
1662 /* Advance i to index of first char != char_to_squeeze
1663 (or to nr if all the rest of the characters in this
1664 buffer are the same as char_to_squeeze). */
1665 for (; i < nr && buf[i] == char_to_squeeze; i++)
1668 char_to_squeeze = NOT_A_CHAR;
1669 /* If (i >= nr) we've squeezed the last character in this buffer.
1670 So now we have to read a new buffer and continue comparing
1671 characters against char_to_squeeze. */
1676 /* Read buffers of SIZE bytes from stdin until one is found that
1677 contains at least one character not in the delete set. Store
1678 in the array BUF, all characters from that buffer that are not
1679 in the delete set, and return the number of characters saved
1683 read_and_delete (unsigned char *buf, size_t size, Filter not_used)
1686 static int hit_eof = 0;
1688 assert (not_used == NULL);
1693 /* This enclosing do-while loop is to make sure that
1694 we don't return zero (indicating EOF) when we've
1695 just deleted all the characters in a buffer. */
1699 ssize_t nr = safe_read (0, (char *) buf, size);
1702 error (EXIT_FAILURE, errno, _("read error"));
1709 /* This first loop may be a waste of code, but gives much
1710 better performance when no characters are deleted in
1711 the beginning of a buffer. It just avoids the copying
1712 of buf[i] into buf[n_saved] when it would be a NOP. */
1714 for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1718 for (++i; i < nr; i++)
1719 if (!in_delete_set[buf[i]])
1720 buf[n_saved++] = buf[i];
1722 while (n_saved == 0);
1727 /* Read at most SIZE bytes from stdin into the array BUF. Then
1728 perform the in-place and one-to-one mapping specified by the global
1729 array `xlate'. Return the number of characters read, or 0 upon EOF. */
1732 read_and_xlate (unsigned char *buf, size_t size, Filter not_used)
1734 ssize_t chars_read = 0;
1735 static int hit_eof = 0;
1738 assert (not_used == NULL);
1743 chars_read = safe_read (0, (char *) buf, size);
1745 error (EXIT_FAILURE, errno, _("read error"));
1746 if (chars_read == 0)
1752 for (i = 0; i < chars_read; i++)
1753 buf[i] = xlate[buf[i]];
1758 /* Initialize a boolean membership set IN_SET with the character
1759 values obtained by traversing the linked list of constructs S
1760 using the function `get_next'. If COMPLEMENT_THIS_SET is
1761 nonzero the resulting set is complemented. */
1764 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1769 memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1770 s->state = BEGIN_STATE;
1771 while ((c = get_next (s, NULL)) != -1)
1773 if (complement_this_set)
1774 for (i = 0; i < N_CHARS; i++)
1775 in_set[i] = (!in_set[i]);
1779 main (int argc, char **argv)
1782 int non_option_args;
1783 struct Spec_list buf1, buf2;
1784 struct Spec_list *s1 = &buf1;
1785 struct Spec_list *s2 = &buf2;
1787 program_name = argv[0];
1788 setlocale (LC_ALL, "");
1789 bindtextdomain (PACKAGE, LOCALEDIR);
1790 textdomain (PACKAGE);
1792 atexit (close_stdout);
1794 while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1810 squeeze_repeats = 1;
1817 case_GETOPT_HELP_CHAR;
1819 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1827 posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1829 non_option_args = argc - optind;
1830 translating = (non_option_args == 2 && !delete);
1832 /* Change this test if it is valid to give tr no options and
1833 no args at all. POSIX doesn't specifically say anything
1834 either way, but it looks like they implied it's invalid
1835 by omission. If you want to make tr do a slow imitation
1836 of `cat' use `tr a a'. */
1837 if (non_option_args > 2)
1839 error (0, 0, _("too many arguments"));
1843 if (!delete && !squeeze_repeats && non_option_args != 2)
1844 error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1846 if (delete && squeeze_repeats && non_option_args != 2)
1847 error (EXIT_FAILURE, 0, _("two strings must be given when both \
1848 deleting and squeezing repeats"));
1850 /* If --delete is given without --squeeze-repeats, then
1851 only one string argument may be specified. But POSIX
1852 says to ignore any string2 in this case, so if POSIXLY_CORRECT
1853 is set, pretend we never saw string2. But I think
1854 this deserves a fatal error, so that's the default. */
1855 if ((delete && !squeeze_repeats) && non_option_args != 1)
1857 if (posix_pedantic && non_option_args == 2)
1860 error (EXIT_FAILURE, 0,
1861 _("only one string may be given when deleting \
1862 without squeezing repeats"));
1865 if (squeeze_repeats && non_option_args == 0)
1866 error (EXIT_FAILURE, 0,
1867 _("at least one string must be given when squeezing repeats"));
1870 if (parse_str ((unsigned char *) argv[optind], s1))
1871 exit (EXIT_FAILURE);
1873 if (non_option_args == 2)
1876 if (parse_str ((unsigned char *) argv[optind + 1], s2))
1877 exit (EXIT_FAILURE);
1884 /* Use binary I/O, since `tr' is sometimes used to transliterate
1885 non-printable characters, or characters which are stripped away
1886 by text-mode reads (like CR and ^Z). */
1887 SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);
1889 if (squeeze_repeats && non_option_args == 1)
1891 set_initialize (s1, complement, in_squeeze_set);
1892 squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1894 else if (delete && non_option_args == 1)
1898 set_initialize (s1, complement, in_delete_set);
1901 nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1902 if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1903 error (EXIT_FAILURE, errno, _("write error"));
1907 else if (squeeze_repeats && delete && non_option_args == 2)
1909 set_initialize (s1, complement, in_delete_set);
1910 set_initialize (s2, 0, in_squeeze_set);
1911 squeeze_filter (io_buf, IO_BUF_SIZE, read_and_delete);
1913 else if (translating)
1918 SET_TYPE *in_s1 = in_delete_set;
1920 set_initialize (s1, 0, in_s1);
1921 s2->state = BEGIN_STATE;
1922 for (i = 0; i < N_CHARS; i++)
1924 for (i = 0; i < N_CHARS; i++)
1928 int ch = get_next (s2, NULL);
1929 assert (ch != -1 || truncate_set1);
1932 /* This will happen when tr is invoked like e.g.
1933 tr -cs A-Za-z0-9 '\012'. */
1939 assert (get_next (s2, NULL) == -1 || truncate_set1);
1945 enum Upper_Lower_class class_s1;
1946 enum Upper_Lower_class class_s2;
1948 for (i = 0; i < N_CHARS; i++)
1950 s1->state = BEGIN_STATE;
1951 s2->state = BEGIN_STATE;
1954 c1 = get_next (s1, &class_s1);
1955 c2 = get_next (s2, &class_s2);
1956 if (!class_ok[(int) class_s1][(int) class_s2])
1957 error (EXIT_FAILURE, 0,
1958 _("misaligned [:upper:] and/or [:lower:] construct"));
1960 if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1962 for (i = 0; i < N_CHARS; i++)
1964 xlate[i] = toupper (i);
1966 else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1968 for (i = 0; i < N_CHARS; i++)
1970 xlate[i] = tolower (i);
1972 else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1973 || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1975 /* By default, GNU tr permits the identity mappings: from
1976 [:upper:] to [:upper:] and [:lower:] to [:lower:]. But
1977 when POSIXLY_CORRECT is set, those evoke diagnostics. */
1980 error (EXIT_FAILURE, 0,
1982 invalid identity mapping; when translating, any [:lower:] or [:upper:]\n\
1983 construct in string1 must be aligned with a corresponding construct\n\
1984 ([:upper:] or [:lower:], respectively) in string2"));
1989 /* The following should have been checked by validate... */
1990 if (c1 == -1 || c2 == -1)
1995 assert (c1 == -1 || truncate_set1);
1997 if (squeeze_repeats)
1999 set_initialize (s2, 0, in_squeeze_set);
2000 squeeze_filter (io_buf, IO_BUF_SIZE, read_and_xlate);
2008 chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2010 && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2011 error (EXIT_FAILURE, errno, _("write error"));
2013 while (chars_read > 0);
2018 error (EXIT_FAILURE, errno, _("standard input"));
2020 exit (EXIT_SUCCESS);