1 /* cut - remove parts of lines of files
2 Copyright (C) 1984, 1997, 1998, 1999, 2000, 2001 by David M. Ihnat
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by David Ihnat. */
20 /* POSIX changes, bug fixes, long-named options, and cleanup
21 by David MacKenzie <djm@gnu.ai.mit.edu>.
23 Rewrite cut_fields and cut_bytes -- Jim Meyering. */
30 #include <sys/types.h>
36 /* The official name of this program (e.g., no `g' prefix). */
37 #define PROGRAM_NAME "cut"
39 #define AUTHORS N_ ("David Ihnat, David MacKenzie, and Jim Meyering")
/* Report MESSAGE through error () with a zero status and a zero errnum.
   With GNU error () a zero status means the call returns to the caller;
   whatever this macro does afterwards is on lines not shown in this
   excerpt.  */
41 #define FATAL_ERROR(Message) \
44 error (0, 0, (Message)); \
49 /* Append LOW, HIGH to the list RP of range pairs, allocating additional
50 space if necessary. Update local variable N_RP. When allocating,
51 update N_RP_ALLOCATED, doubling the capacity each time the list is full.
   N_RP and N_RP_ALLOCATED are not macro parameters: they must be
   variables in scope wherever the macro is expanded.
   NOTE(review): the two stores at the end index `rp' without the
   parentheses used around it in the xrealloc call, so RP should only be
   passed as a plain identifier.  */
53 #define ADD_RANGE_PAIR(rp, low, high) \
56 if (n_rp >= n_rp_allocated) \
58 n_rp_allocated *= 2; \
59 (rp) = (struct range_pair *) xrealloc ((char *) (rp), \
60 n_rp_allocated * sizeof (*(rp))); \
62 rp[n_rp].lo = (low); \
63 rp[n_rp].hi = (high); \
74 /* This buffer is used to support the semantics of the -s option
75 (or lack of same) when the specified field list includes (does
76 not include) the first field. In both of those cases, the entire
77 first field must be read into this buffer to determine whether it
78 is followed by a delimiter or a newline before any of it may be
79 output. Otherwise, cut_fields can do the job without using this
81 static char *field_1_buffer;
83 /* The number of bytes allocated for FIELD_1_BUFFER. */
84 static size_t field_1_bufsize;
86 /* The largest field or byte index used as an endpoint of a closed
87 or degenerate range specification; this doesn't include the starting
88 index of right-open-ended ranges. For example, with either range spec
89 `2-5,9-', `2-3,5,9-' this variable would be set to 5. */
90 static unsigned int max_range_endpoint;
92 /* If nonzero, this is the index of the first field in a range that goes
94 static unsigned int eol_range_start;
96 /* In byte mode, which bytes to output.
97 In field mode, which DELIM-separated fields to output.
98 Both bytes and fields are numbered starting with 1,
99 so the zeroth element of this array is unused.
100 A field or byte K has been selected if
101 (K <= MAX_RANGE_ENDPOINT and PRINTABLE_FIELD[K])
102 || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */
103 static int *printable_field;
109 /* Output characters that are in the given bytes. */
112 /* Output the given delimiter-separated fields. */
116 /* The name this program was run with. */
119 static enum operating_mode operating_mode;
121 /* If nonzero do not output lines containing no delimiter characters.
122 Otherwise, all such lines are printed. This option is valid only
124 static int suppress_non_delimited;
126 /* The delimiter character for field mode. */
129 /* The length of output_delimiter_string. */
130 static size_t output_delimiter_length;
132 /* The output field separator string. Defaults to the 1-character
133 string consisting of the input delimiter. */
134 static char *output_delimiter_string;
136 /* Nonzero if we have ever read standard input. */
137 static int have_read_stdin;
139 /* For long options that have no equivalent short option, use a
140 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
143 OUTPUT_DELIMITER_OPTION = CHAR_MAX + 1
146 static struct option const longopts[] =
148 {"bytes", required_argument, 0, 'b'},
149 {"characters", required_argument, 0, 'c'},
150 {"fields", required_argument, 0, 'f'},
151 {"delimiter", required_argument, 0, 'd'},
152 {"only-delimited", no_argument, 0, 's'},
153 {"output-delimiter", required_argument, 0, OUTPUT_DELIMITER_OPTION},
154 {GETOPT_HELP_OPTION_DECL},
155 {GETOPT_VERSION_OPTION_DECL},
163 fprintf (stderr, _("Try `%s --help' for more information.\n"),
168 Usage: %s [OPTION]... [FILE]...\n\
172 Print selected parts of lines from each FILE to standard output.\n\
174 Mandatory arguments to long options are mandatory for short options too.\n\
175 -b, --bytes=LIST output only these bytes\n\
176 -c, --characters=LIST output only these characters\n\
177 -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
180 -f, --fields=LIST output only these fields; also print any line\n\
181 that contains no delimiter character, unless\n\
182 the -s option is specified\n\
186 -s, --only-delimited do not print lines not containing delimiters\n\
187 --output-delimiter=STRING use STRING as the output delimiter\n\
188 the default is to use the input delimiter\n\
189 --help display this help and exit\n\
190 --version output version information and exit\n\
194 Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
195 range, or many ranges separated by commas. Each range is one of:\n\
197 N N'th byte, character or field, counted from 1\n\
198 N- from N'th byte, character or field, to end of line\n\
199 N-M from N'th to M'th (included) byte, character or field\n\
200 -M from first to M'th (included) byte, character or field\n\
202 With no FILE, or when FILE is -, read standard input.\n\
204 puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
206 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
/* Return nonzero if the K'th field or byte has been selected: either K
   lies in the open-ended range that starts at EOL_RANGE_START (zero
   means there is no such range), or K is no larger than
   MAX_RANGE_ENDPOINT and is marked in PRINTABLE_FIELD.  The bound on K
   is tested before PRINTABLE_FIELD is indexed.  */
210 print_kth (unsigned int k)
212 return ((0 < eol_range_start && eol_range_start <= k)
213 || (k <= max_range_endpoint && printable_field[k]));
216 /* Given the list of field or byte range specifications FIELDSTR, set
217 MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
218 array. If there is a right-open-ended range, set EOL_RANGE_START
219 to its starting index. FIELDSTR should be composed of one or more
220 numbers or ranges of numbers, separated by blanks or commas.
221 Incomplete ranges may be given: `-m' means `1-m'; `n-' means `n'
222 through end of line. Return nonzero if FIELDSTR contains at least
223 one field specification, zero otherwise. */
225 /* FIXME-someday: What if the user wants to cut out the 1,000,000-th field
226 of some huge input file? This function shouldn't have to allocate a table
227 of a million ints just so we can test every field < 10^6 with an array
228 dereference. Instead, consider using a dynamic hash table. It would be
229 simpler and nearly as good a solution to use a 32K x 4-byte table with
230 one bit per field index instead of a whole `int' per index. */
233 set_fields (const char *fieldstr)
235 unsigned int initial = 1; /* Value of first number in a range. */
236 unsigned int value = 0; /* If nonzero, a number being accumulated. */
237 int dash_found = 0; /* Nonzero if a '-' is found in this field. */
238 int field_found = 0; /* Non-zero if at least one field spec
239 has been processed. */
241 struct range_pair *rp;
243 unsigned int n_rp_allocated;
/* RP is a scratch list of closed ranges; below it is only used to
   compute MAX_RANGE_ENDPOINT and to fill PRINTABLE_FIELD.  */
248 rp = (struct range_pair *) xmalloc (n_rp_allocated * sizeof (*rp));
250 /* Collect and store in RP the range end points.
251 It also sets EOL_RANGE_START if appropriate. */
255 if (*fieldstr == '-')
257 /* Starting a range. */
259 FATAL_ERROR (_("invalid byte or field list"));
271 else if (*fieldstr == ',' || ISBLANK (*fieldstr) || *fieldstr == '\0')
273 /* Ending the string, or this field/byte sublist. */
278 /* A range. Possibilities: -n, m-n, n-.
279 In any case, `initial' contains the start of the range. */
282 /* `n-'. From `initial' to end of line. */
283 eol_range_start = initial;
288 /* `m-n' or `-n' (1-n). */
290 FATAL_ERROR (_("invalid byte or field list"));
292 /* Is there already a range going to end of line? */
293 if (eol_range_start != 0)
295 /* Yes. Is the new sequence already contained
296 in the old one? If so, no processing is
298 if (initial < eol_range_start)
300 /* No, the new sequence starts before the
301 old. Does the old range going to end of line
302 extend into the new range? */
/* The `+ 1' also catches a range that ends immediately before the
   open-ended one, so adjacent ranges are merged as well.  */
303 if (value + 1 >= eol_range_start)
305 /* Yes. Simply move the end of line marker. */
306 eol_range_start = initial;
310 /* No. A simple range, before and disjoint from
311 the range going to end of line. Fill it. */
312 ADD_RANGE_PAIR (rp, initial, value);
315 /* In any case, some fields were selected. */
321 /* There is no range going to end of line. */
322 ADD_RANGE_PAIR (rp, initial, value);
330 /* A simple field number, not a range. */
331 ADD_RANGE_PAIR (rp, value, value);
336 if (*fieldstr == '\0')
343 else if (ISDIGIT (*fieldstr))
345 /* FIXME: detect overflow? VALUE is an unsigned int, so an overly
   long digit string wraps around silently instead of being rejected. */
346 value = 10 * value + *fieldstr - '0';
350 FATAL_ERROR (_("invalid byte or field list"));
/* Find the largest upper bound among the collected closed ranges.  */
353 max_range_endpoint = 0;
354 for (i = 0; i < n_rp; i++)
356 if (rp[i].hi > max_range_endpoint)
357 max_range_endpoint = rp[i].hi;
360 /* Allocate an array large enough so that it may be indexed by
361 the field numbers corresponding to all finite ranges
362 (i.e. `2-6' or `-4', but not `5-') in FIELDSTR. */
/* NOTE(review): MAX_RANGE_ENDPOINT is an unsigned int, so the `+ 1'
   would wrap to zero if an endpoint equal to UINT_MAX were accepted --
   confirm whether the parser can produce such a value.  */
364 printable_field = (int *) xmalloc ((max_range_endpoint + 1) * sizeof (int));
365 memset (printable_field, 0, (max_range_endpoint + 1) * sizeof (int));
367 /* Set the array entries corresponding to integers in the ranges of RP. */
368 for (i = 0; i < n_rp; i++)
371 for (j = rp[i].lo; j <= rp[i].hi; j++)
373 printable_field[j] = 1;
382 /* Read from stream STREAM, printing to standard output any selected bytes. */
385 cut_bytes (FILE *stream)
387 unsigned int byte_idx; /* Number of chars in the line so far. */
392 register int c; /* Each character from the file. */
/* BYTE_IDX is also the position handed to print_kth, which decides
   whether the current byte has been selected.  */
410 if (print_kth (byte_idx))
418 /* Read from stream STREAM, printing to standard output any selected fields. */
421 cut_fields (FILE *stream)
424 unsigned int field_idx;
425 int found_any_selected_field;
426 int buffer_first_field;
/* FOUND_ANY_SELECTED_FIELD records whether a field has already been
   written on the current line; it is cleared again once the end of the
   line has been handled.  */
429 found_any_selected_field = 0;
433 empty_input = (c == EOF);
437 /* To support the semantics of the -s flag, we may have to buffer
438 all of the first field to determine whether it is `delimited.'
439 But that is unnecessary if all non-delimited lines must be printed
440 and the first field has been selected, or if non-delimited lines
441 must be suppressed and the first field has *not* been selected.
442 That is because a non-delimited line has exactly one field. */
/* Both operands of the XOR are 0 or 1 (main only ever stores 0 or 1 in
   SUPPRESS_NON_DELIMITED), so buffering happens in exactly two cases:
   -s was given and field 1 is selected, or -s was not given and field 1
   is not selected.  */
443 buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
447 if (field_idx == 1 && buffer_first_field)
451 len = getstr (&field_1_buffer, &field_1_bufsize, stream,
455 if (ferror (stream) || feof (stream))
462 /* If the first field extends to the end of line (it is not
463 delimited) and we are printing all non-delimited lines,
465 if ((unsigned char) field_1_buffer[len - 1] != delim)
467 if (suppress_non_delimited)
473 fwrite (field_1_buffer, sizeof (char), len, stdout);
474 /* Make sure the output line is newline terminated. */
475 if (field_1_buffer[len - 1] != '\n')
482 /* Print the field, but not the trailing delimiter. */
483 fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
484 found_any_selected_field = 1;
491 if (print_kth (field_idx))
/* The output delimiter is written only between selected fields, never
   before the first selected field of a line.  */
493 if (found_any_selected_field)
495 fwrite (output_delimiter_string, sizeof (char),
496 output_delimiter_length, stdout);
498 found_any_selected_field = 1;
500 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
507 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
526 else if (c == '\n' || c == EOF)
528 if (found_any_selected_field
529 || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
/* Start the next line with no field written yet.  */
534 found_any_selected_field = 0;
/* Process STREAM according to OPERATING_MODE.  Byte mode is tested
   first; the branch bodies are not shown in this excerpt (presumably
   they call cut_bytes and cut_fields -- confirm against the full
   file).  */
540 cut_stream (FILE *stream)
542 if (operating_mode == byte_mode)
548 /* Process file FILE to standard output.
549 Return 0 if successful, 1 if not. */
552 cut_file (char *file)
/* The operand "-" is treated specially instead of being passed to
   fopen; the usage text documents it as standard input.  */
556 if (STREQ (file, "-"))
563 stream = fopen (file, "r");
566 error (0, errno, "%s", file);
575 error (0, errno, "%s", file);
/* For "-" the stream is only reset, not closed; main closes stdin once
   after all operands have been processed.  */
578 if (STREQ (file, "-"))
579 clearerr (stream); /* Also clear EOF. */
580 else if (fclose (stream) == EOF)
582 error (0, errno, "%s", file);
589 main (int argc, char **argv)
591 int optc, exit_status = 0;
592 int delim_specified = 0;
594 program_name = argv[0];
595 setlocale (LC_ALL, "");
596 bindtextdomain (PACKAGE, LOCALEDIR);
597 textdomain (PACKAGE);
599 atexit (close_stdout);
601 operating_mode = undefined_mode;
603 /* By default, all non-delimited lines are printed. */
604 suppress_non_delimited = 0;
609 while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, NULL)) != -1)
618 /* Build the byte list. */
/* Only one list option is accepted: a second one finds OPERATING_MODE
   already set and is rejected.  */
619 if (operating_mode != undefined_mode)
620 FATAL_ERROR (_("only one type of list may be specified"));
621 operating_mode = byte_mode;
622 if (set_fields (optarg) == 0)
623 FATAL_ERROR (_("missing list of positions"));
627 /* Build the field list. */
628 if (operating_mode != undefined_mode)
629 FATAL_ERROR (_("only one type of list may be specified"));
630 operating_mode = field_mode;
631 if (set_fields (optarg) == 0)
632 FATAL_ERROR (_("missing list of fields"));
637 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
/* The test short-circuits on an empty argument, so optarg[1] is read
   only when optarg[0] is not the terminating NUL.  */
638 if (optarg[0] != '\0' && optarg[1] != '\0')
639 FATAL_ERROR (_("the delimiter must be a single character"));
640 delim = (unsigned char) optarg[0];
644 case OUTPUT_DELIMITER_OPTION:
645 /* Interpret --output-delimiter='' to mean
646 `use the NUL byte as the delimiter.' */
647 output_delimiter_length = (optarg[0] == '\0'
648 ? 1 : strlen (optarg));
649 output_delimiter_string = xstrdup (optarg);
656 suppress_non_delimited = 1;
659 case_GETOPT_HELP_CHAR;
661 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
668 if (operating_mode == undefined_mode)
669 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
/* NOTE(review): this test looks at DELIM rather than DELIM_SPECIFIED,
   so `-d ""' (which stores NUL in DELIM, see the -d handling above)
   combined with a byte or character list is not rejected here --
   confirm whether that is intended.  */
671 if (delim != '\0' && operating_mode != field_mode)
672 FATAL_ERROR (_("a delimiter may be specified only when operating on fields"));
674 if (suppress_non_delimited && operating_mode != field_mode)
675 FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
676 \tonly when operating on fields"));
678 if (!delim_specified)
/* Without --output-delimiter, fall back to a one-byte separator kept in
   DUMMY.  The lines that fill DUMMY are not shown; presumably they
   store the input delimiter, as the comment on
   output_delimiter_string says.  */
681 if (output_delimiter_string == NULL)
683 static char dummy[2];
686 output_delimiter_string = dummy;
687 output_delimiter_length = 1;
691 exit_status |= cut_file ("-");
/* Each operand's status is OR-ed in, so a failure on one file does not
   stop the remaining files but still yields EXIT_FAILURE at the end.  */
693 for (; optind < argc; optind++)
694 exit_status |= cut_file (argv[optind]);
/* Standard input is closed once, after every operand has been handled,
   and only if it was actually read.  */
696 if (have_read_stdin && fclose (stdin) == EOF)
698 error (0, errno, "-");
702 exit (exit_status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);