1 /* tr -- a filter to translate characters
2 Copyright (C) 1991 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
18 /* Written by Jim Meyering, meyering@cs.utexas.edu. */
20 /* Get isblank from GNU libc. */
26 #include <sys/types.h>
32 #define LONG_MAX 0x7FFFFFFF
36 #define UCHAR_MAX 0xFF
39 #define N_CHARS (UCHAR_MAX + 1)
41 /* A pointer to a function that returns an int. */
42 typedef int (*PFI) ();
44 /* Convert from character C to its index in the collating
45 sequence array. Just cast to an unsigned int to avoid
46 problems with sign-extension. */
47 #define ORD(c) (unsigned int)(c)
49 /* The inverse of ORD. */
50 #define CHR(i) (unsigned char)(i)
52 /* The value for Spec_list->state that indicates to
53 get_next that it should initialize the tail pointer.
54 Its value doesn't matter as long as it can't be
55 confused with a valid character code. */
56 #define BEGIN_STATE (2 * N_CHARS)
58 /* The value for Spec_list->state that indicates to
59 get_next that the element pointed to by Spec_list->tail is
60 being considered for the first time on this pass through the
61 list -- it indicates that get_next should make any necessary
63 #define NEW_ELEMENT (BEGIN_STATE + 1)
65 /* A value distinct from any character that may have been stored in a
66 buffer as the result of a block-read in the function squeeze_filter. */
67 #define NOT_A_CHAR (unsigned int)(-1)
69 /* The following (but not CC_NO_CLASS) are indices into the array of
70 valid character class strings. */
73 CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
74 CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
75 CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
79 /* Character class to which a character (returned by get_next) belonged;
80 but it is set only if the construct from which the character was obtained
81 was one of the character classes [:upper:] or [:lower:]. The value
82 is used only when translating and then, only to make sure that upper
83 and lower class constructs have the same relative positions in string1
85 enum Upper_Lower_class
92 /* A shortcut to ensure that when constructing the translation array,
93 one of the values returned by paired calls to get_next (from s1 and s2)
94 is from [:upper:] and the other is from [:lower:], or neither is from
95 upper or lower. In fact, no other character classes are allowed when
96 translating, but that condition is tested elsewhere. This array is
97 indexed by values of type enum Upper_Lower_class. */
98 static int const class_ok[3][3] =
105 /* The type of a List_element. See build_spec_list for more details. */
106 enum Range_element_type
116 /* One construct in one of tr's argument strings.
117 For example, consider the POSIX version of the classic tr command:
118 tr -cs 'a-zA-Z_' '[\n*]'
119 String1 has 3 constructs, two of which are ranges (a-z and A-Z),
120 and a single normal character, `_'. String2 has one construct. */
123 enum Range_element_type type;
124 struct List_element *next;
130 unsigned int first_char;
131 unsigned int last_char;
134 enum Char_class char_class;
138 unsigned int the_repeated_char;
146 /* Each of tr's argument strings is parsed into a form that is easier
147 to work with: a linked list of constructs (struct List_element).
148 Each Spec_list structure also encapsulates various attributes of
149 the corresponding argument string. The attributes are used mainly
150 to verify that the strings are legal in the context of any options
151 specified (like -s, -d, or -c). The main exception is the member
152 `tail', which is first used to construct the list. After construction,
153 it is used by get_next to save its state when traversing the list.
154 The member `state' serves a similar function. */
157 /* Points to the head of the list of range elements.
158 The first struct is a dummy; its members are never used. */
159 struct List_element *head;
161 /* When appending, points to the last element. When traversing via
162 get_next(), points to the element to process next. Setting
163 Spec_list.state to the value BEGIN_STATE before calling get_next
164 signals get_next to initialize tail to point to head->next. */
165 struct List_element *tail;
167 /* Used to save state between calls to get_next(). */
170 /* Length, in the sense that length('a-z[:digit:]123abc')
171 is 42 ( = 26 + 10 + 6). */
174 /* The number of [c*] and [c*0] constructs that appear in this spec. */
175 int n_indefinite_repeats;
177 /* Non-zero if this spec contains at least one equivalence
178 class construct e.g. [=c=]. */
181 /* Non-zero if this spec contains at least one of [:upper:] or
182 [:lower:] class constructs. */
183 int has_upper_or_lower;
185 /* Non-zero if this spec contains at least one of the character class
186 constructs (all but upper and lower) that aren't allowed in s2. */
187 int has_restricted_char_class;
194 /* The name by which this program was run. */
197 /* When non-zero, each sequence in the input of a repeated character
198 (call it c) is replaced (in the output) by a single occurrence of c
199 for every c in the squeeze set. */
200 static int squeeze_repeats = 0;
202 /* When non-zero, removes characters in the delete set from input. */
203 static int delete = 0;
205 /* Use the complement of set1 in place of set1. */
206 static int complement = 0;
208 /* When non-zero, this flag causes GNU tr to provide strict
209 compliance with POSIX draft 1003.2.11.2. The POSIX spec
210 says that when -d is used without -s, string2 (if present)
211 must be ignored. Silently ignoring arguments is a bad idea.
212 The default GNU behavior is to give a usage message and exit.
213 Additionally, when this flag is non-zero, tr prints warnings
214 on stderr if it is being used in a manner that is not portable.
215 Applicable warnings are given by default, but are suppressed
216 if the environment variable `POSIXLY_CORRECT' is set, since
217 being POSIX conformant means we can't issue such messages.
218 Warnings on the following topics are suppressed when this
219 variable is non-zero:
220 1. Ambiguous octal escapes. */
221 static int posix_pedantic;
223 /* When tr is performing translation and string1 is longer than string2,
224 POSIX says that the result is undefined. That gives the implementor
225 of a POSIX conforming version of tr two reasonable choices for the
226 semantics of this case.
228 * The BSD tr pads string2 to the length of string1 by
229 repeating the last character in string2.
231 * System V tr ignores characters in string1 that have no
232 corresponding character in string2. That is, string1 is effectively
233 truncated to the length of string2.
235 When non-zero, this flag causes GNU tr to imitate the behavior
236 of System V tr when translating with string1 longer than string2.
237 The default is to emulate BSD tr. This flag is ignored in modes where
238 no translation is performed. Emulating the System V tr
239 in this exceptional case causes the relatively common BSD idiom:
241 tr -cs A-Za-z0-9 '\012'
243 to break (it would convert only zero bytes, rather than all
244 non-alphanumerics, to newlines).
246 WARNING: This switch does not provide general BSD or System V
247 compatibility. For example, it doesn't disable the interpretation
248 of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
249 some unfortunate coincidence you use such constructs in scripts
250 expecting to use some other version of tr, the scripts will break. */
251 static int truncate_set1 = 0;
253 /* An alias for (!delete && non_option_args == 2).
254 It is set in main and used there and in validate(). */
255 static int translating;
261 #define IO_BUF_SIZE BUFSIZ
262 static unsigned char io_buf[IO_BUF_SIZE];
264 static char const *const char_class_name[] =
266 "alnum", "alpha", "blank", "cntrl", "digit", "graph",
267 "lower", "print", "punct", "space", "upper", "xdigit"
269 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
271 typedef char SET_TYPE;
273 /* Array of boolean values. A character `c' is a member of the
274 squeeze set if and only if in_squeeze_set[c] is true. The squeeze
275 set is defined by the last (possibly, the only) string argument
276 on the command line when the squeeze option is given. */
277 static SET_TYPE in_squeeze_set[N_CHARS];
279 /* Array of boolean values. A character `c' is a member of the
280 delete set if and only if in_delete_set[c] is true. The delete
281 set is defined by the first (or only) string argument on the
282 command line when the delete option is given. */
283 static SET_TYPE in_delete_set[N_CHARS];
285 /* Array of character values defining the translation (if any) that
286 tr is to perform. Translation is performed only when there are
287 two specification strings and the delete switch is not given. */
288 static char xlate[N_CHARS];
290 /* If non-zero, display usage information and exit. */
291 static int flag_help;
293 /* If non-zero, print the version on standard error. */
294 static int flag_version;
296 static struct option const long_options[] =
298 {"complement", no_argument, NULL, 'c'},
299 {"delete", no_argument, NULL, 'd'},
300 {"squeeze-repeats", no_argument, NULL, 's'},
301 {"truncate-set1", no_argument, NULL, 't'},
302 {"help", no_argument, &flag_help, 1},
303 {"version", no_argument, &flag_version, 1},
311 Usage: %s [-cdst] [--complement] [--delete] [--squeeze-repeats]\n\
312 [--truncate-set1] [--help] [--version] string1 [string2]\n",
317 /* Return non-zero if the character C is a member of the
318 equivalence class containing the character EQUIV_CLASS. */
321 is_equiv_class_member (equiv_class, c)
322 unsigned int equiv_class;
325 return (equiv_class == c);
328 /* Return non-zero if the character C is a member of the
329 character class CHAR_CLASS. */
332 is_char_class_member (char_class, c)
333 enum Char_class char_class;
380 /* Perform the first pass over each range-spec argument S,
381 converting all \c and \ddd escapes to their one-byte representations.
382 The conversion is done in-place, so S must point to writable
383 storage. If an illegal quote sequence is found, an error message is
384 printed and the function returns non-zero. Otherwise the length of
385 the resulting string is returned through LEN and the function returns 0.
386 The resulting array of characters may contain zero-bytes; however,
387 on input, S is assumed to be null-terminated, and hence
388 cannot contain actual (non-escaped) zero bytes. */
398 for (i = 0; s[i]; i++)
440 oct_digit = s[i + 2] - '0';
441 if (0 <= oct_digit && oct_digit <= 7)
443 c = 8 * c + oct_digit;
445 oct_digit = s[i + 2] - '0';
446 if (0 <= oct_digit && oct_digit <= 7)
448 if (8 * c + oct_digit < N_CHARS)
450 c = 8 * c + oct_digit;
453 else if (!posix_pedantic)
455 /* Any octal number larger than 0377 won't
456 fit in 8 bits. So we stop when adding the
457 next digit would put us over the limit and
458 give a warning about the ambiguity. POSIX
459 isn't clear on this, but one person has said
460 that in his interpretation, POSIX says tr
461 can't even give a warning. */
462 error (0, 0, "warning: the ambiguous octal escape \
463 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'",
464 s[i], s[i + 1], s[i + 2],
465 s[i], s[i + 1], s[i + 2]);
471 error (0, 0, "invalid backslash escape at end of string");
475 error (0, 0, "invalid backslash escape `\\%c'", s[i + 1]);
491 /* If CLASS_STR is a valid character class string, return its index
492 in the global char_class_name array. Otherwise, return CC_NO_CLASS. */
494 static enum Char_class
495 look_up_char_class (class_str)
496 unsigned char *class_str;
500 for (i = 0; i < N_CHAR_CLASSES; i++)
501 if (strcmp ((const char *) class_str, char_class_name[i]) == 0)
502 return (enum Char_class) i;
506 /* Return a newly allocated string with a printable version of C.
507 This function is used solely for formatting error messages. */
510 make_printable_char (c)
513 char *buf = xmalloc (5);
515 assert (c < N_CHARS);
523 sprintf (buf, "\\%03o", c);
528 /* Return a newly allocated copy of S which is suitable for printing.
529 LEN is the number of characters in S. Most non-printing
530 (isprint) characters are represented by a backslash followed by
531 3 octal digits. However, the characters represented by \c escapes
532 where c is one of [abfnrtv] are represented by their 2-character \c
533 sequences. This function is used solely for printing error messages. */
536 make_printable_str (s, len)
540 /* Worst case is that every character expands to a backslash
541 followed by a 3-character octal escape sequence. */
542 char *printable_buf = xmalloc (4 * len + 1);
543 char *p = printable_buf;
546 for (i = 0; i < len; i++)
584 sprintf (buf, "\\%03o", s[i]);
590 return printable_buf;
593 /* Append a newly allocated structure representing a
594 character C to the specification list LIST. */
597 append_normal_char (list, c)
598 struct Spec_list *list;
601 struct List_element *new;
603 new = (struct List_element *) xmalloc (sizeof (struct List_element));
605 new->type = RE_NORMAL_CHAR;
606 new->u.normal_char = c;
608 list->tail->next = new;
612 /* Append a newly allocated structure representing the range
613 of characters from FIRST to LAST to the specification list LIST.
614 Return non-zero if LAST precedes FIRST in the collating sequence,
615 zero otherwise. This means that '[c-c]' is acceptable. */
618 append_range (list, first, last)
619 struct Spec_list *list;
623 struct List_element *new;
625 if (ORD (first) > ORD (last))
627 char *tmp1 = make_printable_char (first);
628 char *tmp2 = make_printable_char (last);
631 "range-endpoints of `%s-%s' are in reverse collating sequence order",
637 new = (struct List_element *) xmalloc (sizeof (struct List_element));
639 new->type = RE_RANGE;
640 new->u.range.first_char = first;
641 new->u.range.last_char = last;
643 list->tail->next = new;
648 /* If CHAR_CLASS_STR is a valid character class string, append a
649 newly allocated structure representing that character class to the end
650 of the specification list LIST and return 0. If CHAR_CLASS_STR is not
651 a valid string, give an error message and return non-zero. */
654 append_char_class (list, char_class_str, len)
655 struct Spec_list *list;
656 unsigned char *char_class_str;
659 enum Char_class char_class;
660 struct List_element *new;
662 char_class = look_up_char_class (char_class_str);
663 if (char_class == CC_NO_CLASS)
665 char *tmp = make_printable_str (char_class_str, len);
667 error (0, 0, "invalid character class `%s'", tmp);
671 new = (struct List_element *) xmalloc (sizeof (struct List_element));
673 new->type = RE_CHAR_CLASS;
674 new->u.char_class = char_class;
676 list->tail->next = new;
681 /* Append a newly allocated structure representing a [c*n]
682 repeated character construct, to the specification list LIST.
683 THE_CHAR is the single character to be repeated, and REPEAT_COUNT
684 is non-negative repeat count. */
687 append_repeated_char (list, the_char, repeat_count)
688 struct Spec_list *list;
689 unsigned int the_char;
690 long int repeat_count;
692 struct List_element *new;
694 new = (struct List_element *) xmalloc (sizeof (struct List_element));
696 new->type = RE_REPEATED_CHAR;
697 new->u.repeated_char.the_repeated_char = the_char;
698 new->u.repeated_char.repeat_count = repeat_count;
700 list->tail->next = new;
704 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
705 the length of that string, LEN, if LEN is exactly one, append
706 a newly allocated structure representing the specified
707 equivalence class to the specification list, LIST and return zero.
708 If LEN is not 1, issue an error message and return non-zero. */
711 append_equiv_class (list, equiv_class_str, len)
712 struct Spec_list *list;
713 unsigned char *equiv_class_str;
716 struct List_element *new;
720 char *tmp = make_printable_str (equiv_class_str, len);
722 error (0, 0, "%s: equivalence class operand must be a single character",
727 new = (struct List_element *) xmalloc (sizeof (struct List_element));
729 new->type = RE_EQUIV_CLASS;
730 new->u.equiv_code = *equiv_class_str;
732 list->tail->next = new;
737 /* Return a newly allocated copy of P[FIRST_IDX..LAST_IDX]. */
739 static unsigned char *
740 substr (p, first_idx, last_idx)
745 int len = last_idx - first_idx + 1;
746 unsigned char *tmp = (unsigned char *) xmalloc (len);
748 assert (first_idx <= last_idx);
749 /* We must use bcopy or memcopy rather than strncpy
750 because `p' may contain zero-bytes. */
751 bcopy (p + first_idx, tmp, len);
756 /* Search forward starting at START_IDX for the 2-char sequence
757 (PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such
758 a sequence is found, return the index of the first character,
759 otherwise return -1. P may contain zero bytes. */
762 find_closing_delim (p, start_idx, p_len, pre_bracket_char)
766 unsigned int pre_bracket_char;
770 for (i = start_idx; i < p_len - 1; i++)
771 if (p[i] == pre_bracket_char && p[i + 1] == ']')
776 /* Convert a string S with explicit length LEN, possibly
777 containing embedded zero bytes, to a long integer value.
778 If the string represents a negative value, a value larger
779 than LONG_MAX, or if all LEN characters do not represent a
780 valid integer, return non-zero and do not modify *VAL.
781 Otherwise, return zero and set *VAL to the converted value. */
784 non_neg_strtol (s, len, val)
797 else if (ISDIGIT (s[0]))
802 for (i = 0; i < len; i++)
806 if (c >= base || c < 0)
808 if (i > 8 && sum > (LONG_MAX - c) / base)
810 sum = sum * base + c;
816 /* Parse the bracketed repeat-char syntax. If the P_LEN characters
817 beginning with P[ START_IDX ] comprise a valid [c*n] construct,
818 return the character and the repeat count through the arg pointers,
819 CHAR_TO_REPEAT and N, and then return the index of the closing
820 bracket as the function value. If the second character following
821 the opening bracket is not `*' or if no closing bracket can be
822 found, return -1. If a closing bracket is found and the
823 second char is `*', but the string between the `*' and `]' isn't
824 empty, an octal number, or a decimal number, print an error message
828 find_bracketed_repeat (p, start_idx, p_len, char_to_repeat, n)
832 unsigned int *char_to_repeat;
837 assert (start_idx + 1 < p_len);
838 if (p[start_idx + 1] != '*')
841 for (i = start_idx + 2; i < p_len; i++)
845 unsigned char *digit_str;
846 int digit_str_len = i - start_idx - 2;
848 *char_to_repeat = p[start_idx];
849 if (digit_str_len == 0)
851 /* We've matched [c*] -- no explicit repeat count. */
856 /* Here, we have found [c*s] where s should be a string
857 of octal or decimal digits. */
858 digit_str = &p[start_idx + 2];
859 if (non_neg_strtol (digit_str, digit_str_len, n))
861 char *tmp = make_printable_str (digit_str, digit_str_len);
862 error (0, 0, "invalid repeat count `%s' in [c*n] construct", tmp);
869 return -1; /* No bracket found. */
872 /* Convert string UNESACPED_STRING (which has been preprocessed to
873 convert backslash-escape sequences) of length LEN characters into
874 a linked list of the following 5 types of constructs:
875 - [:str:] Character class where `str' is one of the 12 valid strings.
876 - [=c=] Equivalence class where `c' is any single character.
877 - [c*n] Repeat the single character `c' `n' times. n may be omitted.
878 However, if `n' is present, it must be a non-negative octal or
880 - r-s Range of characters from `r' to `s'. The second endpoint must
881 not precede the first in the current collating sequence.
882 - c Any other character is interpreted as itself. */
885 build_spec_list (unescaped_string, len, result)
886 unsigned char *unescaped_string;
888 struct Spec_list *result;
893 p = unescaped_string;
895 /* The main for-loop below recognizes the 4 multi-character constructs.
896 A character that matches (in its context) none of the multi-character
897 constructs is classified as `normal'. Since all multi-character
898 constructs have at least 3 characters, any strings of length 2 or
899 less are composed solely of normal characters. Hence, the index of
900 the outer for-loop runs only as far as LEN-2. */
902 for (i = 0; i < len - 2;)
911 int closing_delim_idx;
912 int closing_bracket_idx;
913 unsigned int char_to_repeat;
917 closing_delim_idx = find_closing_delim (p, i + 2, len, p[i + 1]);
918 if (closing_delim_idx >= 0)
921 unsigned char *opnd_str = substr (p, i + 2, closing_delim_idx - 1);
923 parse_failed = append_char_class (result, opnd_str,
924 (closing_delim_idx - 1) - (i + 2) + 1);
926 parse_failed = append_equiv_class (result, opnd_str,
927 (closing_delim_idx - 1) - (i + 2) + 1);
930 /* Return non-zero if append_*_class reports a problem. */
934 i = closing_delim_idx + 2;
937 /* Else fall through. This could be [:*] or [=*]. */
939 /* Determine whether this is a bracketed repeat range
940 matching the RE \[.\*(dec_or_oct_number)?\]. */
941 closing_bracket_idx = find_bracketed_repeat (p, i + 1,
942 len, &char_to_repeat, &repeat_count);
943 if (closing_bracket_idx >= 0)
945 append_repeated_char (result, char_to_repeat, repeat_count);
946 i = closing_bracket_idx + 1;
949 else if (closing_bracket_idx == -1)
954 /* Found a string that looked like [c*n] but the
955 numeric part was invalid. */
962 /* Here if we've tried to match [c*n], [:str:], and [=c=]
963 and none of them fit. So we still have to consider the
964 range `[-c' (from `[' to `c'). */
966 /* Look ahead one char for ranges like a-z. */
969 if (append_range (result, p[i], p[i + 2]))
975 append_normal_char (result, p[i]);
982 /* Now handle the (2 or fewer) remaining characters p[i]..p[len - 1]. */
984 append_normal_char (result, p[i]);
989 /* Given a Spec_list S (with its saved state implicit in the values
990 of its members `tail' and `state'), return the next single character
991 in the expansion of S's constructs. If the last character of S was
992 returned on the previous call or if S was empty, this function
993 returns -1. For example, successive calls to get_next where S
994 represents the spec-string 'a-d[y*3]' will return the sequence
995 of values a, b, c, d, y, y, y, -1. Finally, if the construct from
996 which the returned character comes is [:upper:] or [:lower:], the
997 parameter CLASS is given a value to indicate which it was. Otherwise
998 CLASS is set to UL_NONE. This value is used only when constructing
999 the translation table to verify that any occurrences of upper and
1000 lower class constructs in the spec-strings appear in the same relative
1005 struct Spec_list *s;
1006 enum Upper_Lower_class *class;
1008 struct List_element *p;
1015 if (s->state == BEGIN_STATE)
1017 s->tail = s->head->next;
1018 s->state = NEW_ELEMENT;
1027 case RE_NORMAL_CHAR:
1028 return_val = p->u.normal_char;
1029 s->state = NEW_ELEMENT;
1034 if (s->state == NEW_ELEMENT)
1035 s->state = ORD (p->u.range.first_char);
1038 return_val = CHR (s->state);
1039 if (s->state == ORD (p->u.range.last_char))
1042 s->state = NEW_ELEMENT;
1047 if (s->state == NEW_ELEMENT)
1049 for (i = 0; i < N_CHARS; i++)
1050 if (is_char_class_member (p->u.char_class, i))
1052 assert (i < N_CHARS);
1055 assert (is_char_class_member (p->u.char_class, s->state));
1056 return_val = CHR (s->state);
1057 for (i = s->state + 1; i < N_CHARS; i++)
1058 if (is_char_class_member (p->u.char_class, i))
1065 s->state = NEW_ELEMENT;
1069 switch (p->u.char_class)
1084 case RE_EQUIV_CLASS:
1085 /* FIXME: this assumes that each character is alone in its own
1086 equivalence class (which appears to be correct for my
1087 LC_COLLATE. But I don't know of any function that allows
1088 one to determine a character's equivalence class. */
1090 return_val = p->u.equiv_code;
1091 s->state = NEW_ELEMENT;
1095 case RE_REPEATED_CHAR:
1096 /* Here, a repeat count of n == 0 means don't repeat at all. */
1097 assert (p->u.repeated_char.repeat_count >= 0);
1098 if (p->u.repeated_char.repeat_count == 0)
1101 s->state = NEW_ELEMENT;
1102 return_val = get_next (s, class);
1106 if (s->state == NEW_ELEMENT)
1111 return_val = p->u.repeated_char.the_repeated_char;
1112 if (p->u.repeated_char.repeat_count > 0
1113 && s->state == p->u.repeated_char.repeat_count)
1116 s->state = NEW_ELEMENT;
1128 /* This is a minor kludge. This function is called from
1129 get_spec_stats to determine the cardinality of a set derived
1130 from a complemented string. It's a kludge in that some of
1131 the same operations are (duplicated) performed in set_initialize. */
1134 card_of_complement (s)
1135 struct Spec_list *s;
1138 int cardinality = N_CHARS;
1139 SET_TYPE in_set[N_CHARS];
1141 bzero (in_set, N_CHARS * sizeof (in_set[0]));
1142 s->state = BEGIN_STATE;
1143 while ((c = get_next (s, NULL)) != -1)
1149 /* Gather statistics about the spec-list S in preparation for the tests
1150 in validate that determine the legality of the specs. This function
1151 is called at most twice; once for string1, and again for any string2.
1152 LEN_S1 < 0 indicates that this is the first call and that S represents
1153 string1. When LEN_S1 >= 0, it is the length of the expansion of the
1154 constructs in string1, and we can use its value to resolve any
1155 indefinite repeat construct in S (which represents string2). Hence,
1156 this function has the side-effect that it converts a valid [c*]
1157 construct in string2 to [c*n] where n is large enough (or 0) to give
1158 string2 the same length as string1. For example, with the command
1159 tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1160 be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */
1163 get_spec_stats (s, len_s1)
1164 struct Spec_list *s;
1167 struct List_element *p;
1168 struct List_element *indefinite_repeat_element = NULL;
1171 s->n_indefinite_repeats = 0;
1172 s->has_equiv_class = 0;
1173 s->has_restricted_char_class = 0;
1174 s->has_upper_or_lower = 0;
1175 for (p = s->head->next; p; p = p->next)
1180 case RE_NORMAL_CHAR:
1185 assert (p->u.range.last_char >= p->u.range.first_char);
1186 len += p->u.range.last_char - p->u.range.first_char + 1;
1190 for (i = 0; i < N_CHARS; i++)
1191 if (is_char_class_member (p->u.char_class, i))
1193 switch (p->u.char_class)
1197 s->has_upper_or_lower = 1;
1200 s->has_restricted_char_class = 1;
1205 case RE_EQUIV_CLASS:
1206 for (i = 0; i < N_CHARS; i++)
1207 if (is_equiv_class_member (p->u.equiv_code, i))
1209 s->has_equiv_class = 1;
1212 case RE_REPEATED_CHAR:
1213 if (p->u.repeated_char.repeat_count > 0)
1214 len += p->u.repeated_char.repeat_count;
1215 else if (p->u.repeated_char.repeat_count == 0)
1217 indefinite_repeat_element = p;
1218 ++(s->n_indefinite_repeats);
1228 if (len_s1 >= len && s->n_indefinite_repeats == 1)
1230 indefinite_repeat_element->u.repeated_char.repeat_count = len_s1 - len;
1233 if (complement && len_s1 < 0)
1234 s->length = card_of_complement (s);
1241 spec_init (spec_list)
1242 struct Spec_list *spec_list;
1244 spec_list->head = spec_list->tail =
1245 (struct List_element *) xmalloc (sizeof (struct List_element));
1246 spec_list->head->next = NULL;
1249 /* This function makes two passes over the argument string S. The first
1250 one converts all \c and \ddd escapes to their one-byte representations.
1251 The second constructs a linked specification list, SPEC_LIST, of the
1252 characters and constructs that comprise the argument string. If either
1253 of these passes detects an error, this function returns non-zero. */
1256 parse_str (s, spec_list)
1258 struct Spec_list *spec_list;
1262 if (unquote (s, &len))
1264 if (build_spec_list (s, len, spec_list))
1269 /* Given two specification lists, S1 and S2, and assuming that
1270 S1->length > S2->length, append a single [c*n] element to S2 where c
1271 is the last character in the expansion of S2 and n is the difference
1272 between the two lengths.
1273 Upon successful completion, S2->length is set to S1->length. The only
1274 way this function can fail to make S2 as long as S1 is when S2 has
1275 zero-length, since in that case, there is no last character to repeat.
1276 So S2->length is required to be at least 1.
1278 Providing this functionality allows the user to do some pretty
1279 non-BSD (and non-portable) things: For example, the command
1280 tr -cs '[:upper:]0-9' '[:lower:]'
1281 is almost guaranteed to give results that depend on your collating
1285 string2_extend (s1, s2)
1286 struct Spec_list *s1;
1287 struct Spec_list *s2;
1289 struct List_element *p;
1293 assert (translating);
1294 assert (s1->length > s2->length);
1295 assert (s2->length > 0);
1300 case RE_NORMAL_CHAR:
1301 char_to_repeat = p->u.normal_char;
1304 char_to_repeat = p->u.range.last_char;
1307 for (i = N_CHARS; i >= 0; i--)
1308 if (is_char_class_member (p->u.char_class, i))
1311 char_to_repeat = CHR (i);
1314 case RE_REPEATED_CHAR:
1315 char_to_repeat = p->u.repeated_char.the_repeated_char;
1318 case RE_EQUIV_CLASS:
1319 /* This shouldn't happen, because validate exits with an error
1320 if it finds an equiv class in string2 when translating. */
1328 append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1329 s2->length = s1->length;
1333 /* Die with an error message if S1 and S2 describe strings that
1334 are not valid with the given command line switches.
1335 A side effect of this function is that if a legal [c*] or
1336 [c*0] construct appears in string2, it is converted to [c*n]
1337 with a value for n that makes s2->length == s1->length. By
1338 the same token, if the --truncate-set1 option is not
1339 given, S2 may be extended. */
1343 struct Spec_list *s1;
1344 struct Spec_list *s2;
1346 get_spec_stats (s1, -1);
1347 if (s1->n_indefinite_repeats > 0)
1349 error (1, 0, "the [c*] repeat construct may not appear in string1");
1352 /* FIXME: it isn't clear from the POSIX spec that this is illegal,
1353 but in the spirit of the other restrictions put on translation
1354 with character classes, this seems a logical interpretation. */
1355 if (complement && s1->has_upper_or_lower)
1358 "character classes may not be used when translating and complementing");
1363 get_spec_stats (s2, s1->length);
1364 if (s2->has_restricted_char_class)
1367 "when translating, the only character classes that may appear in\n\
1368 \tstring2 are `upper' and `lower'");
1371 if (s2->n_indefinite_repeats > 1)
1373 error (1, 0, "only one [c*] repeat construct may appear in string2");
1378 if (s2->has_equiv_class)
1381 "[=c=] expressions may not appear in string2 when translating");
1384 if (s1->length > s2->length)
1388 /* string2 must be non-empty unless --truncate-set1 is
1389 given or string1 is empty. */
1391 if (s2->length == 0)
1393 "when not truncating set1, string2 must be non-empty");
1394 string2_extend (s1, s2);
1398 if (complement && s2->has_upper_or_lower)
1400 "character classes may not be used when translating and complementing");
1403 /* Not translating. */
1405 if (s2->n_indefinite_repeats > 0)
1407 "the [c*] construct may appear in string2 only when translating");
1412 /* Read buffers of SIZE bytes via the function READER (if READER is
1413 NULL, read from stdin) until EOF. When non-NULL, READER is either
1414 read_and_delete or read_and_xlate. After each buffer is read, it is
1415 processed and written to stdout. The buffers are processed so that
1416 multiple consecutive occurrences of the same character in the input
1417 stream are replaced by a single occurrence of that character if the
1418 character is in the squeeze set. */
1421 squeeze_filter (buf, size, reader)
1426 unsigned int char_to_squeeze = NOT_A_CHAR;
1437 nr = read (0, (char *) buf, size);
1439 nr = (*reader) (buf, size, NULL);
1442 error (1, errno, "read error");
1450 if (char_to_squeeze == NOT_A_CHAR)
1453 /* Here, by being a little tricky, we can get a significant
1454 performance increase in most cases when the input is
1455 reasonably large. Since tr will modify the input only
1456 if two consecutive (and identical) input characters are
1457 in the squeeze set, we can step by two through the data
1458 when searching for a character in the squeeze set. This
1459 means there may be a little more work in a few cases and
1460 perhaps twice as much work in the worst cases where most
1461 of the input is removed by squeezing repeats. But most
1462 uses of this functionality seem to remove less than 20-30%
1464 for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1467 /* There is a special case when i == nr and we've just
1468 skipped a character (the last one in buf) that is in
1470 if (i == nr && in_squeeze_set[buf[i - 1]])
1474 out_len = nr - begin;
1477 char_to_squeeze = buf[i];
1478 /* We're about to output buf[begin..i]. */
1479 out_len = i - begin + 1;
1481 /* But since we stepped by 2 in the loop above,
1482 out_len may be one too large. */
1483 if (i > 0 && buf[i - 1] == char_to_squeeze)
1486 /* Advance i to the index of first character to be
1487 considered when looking for a char different from
1492 && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1493 error (1, errno, "write error");
1496 if (char_to_squeeze != NOT_A_CHAR)
1498 /* Advance i to index of first char != char_to_squeeze
1499 (or to nr if all the rest of the characters in this
1500 buffer are the same as char_to_squeeze). */
1501 for (; i < nr && buf[i] == char_to_squeeze; i++)
1504 char_to_squeeze = NOT_A_CHAR;
1505 /* If (i >= nr) we've squeezed the last character in this buffer.
1506 So now we have to read a new buffer and continue comparing
1507 characters against char_to_squeeze. */
1512 /* Read buffers of SIZE bytes from stdin until one is found that
1513 contains at least one character not in the delete set. Store
1514 in the array BUF, all characters from that buffer that are not
1515 in the delete set, and return the number of characters saved
1519 read_and_delete (buf, size, not_used)
1525 static int hit_eof = 0;
1527 assert (not_used == NULL);
1533 /* This enclosing do-while loop is to make sure that
1534 we don't return zero (indicating EOF) when we've
1535 just deleted all the characters in a buffer. */
1539 int nr = read (0, (char *) buf, size);
1542 error (1, errno, "read error");
1549 /* This first loop may be a waste of code, but gives much
1550 better performance when no characters are deleted in
1551 the beginning of a buffer. It just avoids the copying
1552 of buf[i] into buf[n_saved] when it would be a NOP. */
1554 for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1558 for (++i; i < nr; i++)
1559 if (!in_delete_set[buf[i]])
1560 buf[n_saved++] = buf[i];
1562 while (n_saved == 0);
1567 /* Read at most SIZE bytes from stdin into the array BUF. Then
1568 perform the in-place and one-to-one mapping specified by the global
1569 array `xlate'. Return the number of characters read, or 0 upon EOF. */
1572 read_and_xlate (buf, size, not_used)
1577 long chars_read = 0;
1578 static int hit_eof = 0;
1581 assert (not_used == NULL);
1587 chars_read = read (0, (char *) buf, size);
1589 error (1, errno, "read error");
1590 if (chars_read == 0)
1596 for (i = 0; i < chars_read; i++)
1597 buf[i] = xlate[buf[i]];
1602 /* Initialize a boolean membership set IN_SET with the character
1603 values obtained by traversing the linked list of constructs S
1604 using the function `get_next'. If COMPLEMENT_THIS_SET is
1605 non-zero the resulting set is complemented. */
1608 set_initialize (s, complement_this_set, in_set)
1609 struct Spec_list *s;
1610 int complement_this_set;
1616 bzero (in_set, N_CHARS * sizeof (in_set[0]));
1617 s->state = BEGIN_STATE;
1618 while ((c = get_next (s, NULL)) != -1)
1620 if (complement_this_set)
1621 for (i = 0; i < N_CHARS; i++)
1622 in_set[i] = (!in_set[i]);
1631 int non_option_args;
1632 struct Spec_list buf1, buf2;
1633 struct Spec_list *s1 = &buf1;
1634 struct Spec_list *s2 = &buf2;
1636 program_name = argv[0];
1638 while ((c = getopt_long (argc, argv, "cdst", long_options,
1655 squeeze_repeats = 1;
1669 fprintf (stderr, "%s\n", version_string);
1674 posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1676 non_option_args = argc - optind;
1677 translating = (non_option_args == 2 && !delete);
1679 /* Change this test if it is legal to give tr no options and
1680 no args at all. POSIX doesn't specifically say anything
1681 either way, but it looks like they implied it's illegal
1682 by omission. If you want to make tr do a slow imitation
1683 of `cat' use `tr a a'. */
1684 if (non_option_args > 2)
1687 if (!delete && !squeeze_repeats && non_option_args != 2)
1688 error (1, 0, "two strings must be given when translating");
1690 if (delete && squeeze_repeats && non_option_args != 2)
1691 error (1, 0, "two strings must be given when both \
1692 deleting and squeezing repeats");
1694 /* If --delete is given without --squeeze-repeats, then
1695 only one string argument may be specified. But POSIX
1696 says to ignore any string2 in this case, so if POSIXLY_CORRECT
1697 is set, pretend we never saw string2. But I think
1698 this deserves a fatal error, so that's the default. */
1699 if ((delete && !squeeze_repeats) && non_option_args != 1)
1701 if (posix_pedantic && non_option_args == 2)
1705 "only one string may be given when deleting without squeezing repeats");
1709 if (parse_str ((unsigned char *) argv[optind], s1))
1712 if (non_option_args == 2)
1715 if (parse_str ((unsigned char *) argv[optind + 1], s2))
1723 if (squeeze_repeats && non_option_args == 1)
1725 set_initialize (s1, complement, in_squeeze_set);
1726 squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1728 else if (delete && non_option_args == 1)
1732 set_initialize (s1, complement, in_delete_set);
1735 nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1736 if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1737 error (1, errno, "write error");
1741 else if (squeeze_repeats && delete && non_option_args == 2)
1743 set_initialize (s1, complement, in_delete_set);
1744 set_initialize (s2, 0, in_squeeze_set);
1745 squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1747 else if (translating)
1752 SET_TYPE *in_s1 = in_delete_set;
1754 set_initialize (s1, 0, in_s1);
1755 s2->state = BEGIN_STATE;
1756 for (i = 0; i < N_CHARS; i++)
1758 for (i = 0; i < N_CHARS; i++)
1762 int ch = get_next (s2, NULL);
1763 assert (ch != -1 || truncate_set1);
1766 /* This will happen when tr is invoked like e.g.
1767 tr -cs A-Za-z0-9 '\012'. */
1773 assert (get_next (s2, NULL) == -1 || truncate_set1);
1779 enum Upper_Lower_class class_s1;
1780 enum Upper_Lower_class class_s2;
1782 for (i = 0; i < N_CHARS; i++)
1784 s1->state = BEGIN_STATE;
1785 s2->state = BEGIN_STATE;
1788 c1 = get_next (s1, &class_s1);
1789 c2 = get_next (s2, &class_s2);
1790 if (!class_ok[(int) class_s1][(int) class_s2])
1792 "misaligned or mismatched upper and/or lower classes");
1793 /* The following should have been checked by validate... */
1798 assert (c1 == -1 || truncate_set1);
1800 if (squeeze_repeats)
1802 set_initialize (s2, 0, in_squeeze_set);
1803 squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
1811 chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
1813 && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
1814 error (1, errno, "write error");
1816 while (chars_read > 0);
1820 if (fclose (stdout) == EOF)
1821 error (2, errno, "write error");
1824 error (2, errno, "standard input");