1 /* cut - remove parts of lines of files
2 Copyright (C) 1984 by David M. Ihnat
4 This program is a total rewrite of the Bell Laboratories Unix(Tm)
5 command of the same name, as of System V. It contains no proprietary
6 code, and therefore may be used without violation of any proprietary
7 agreements whatsoever. However, you will notice that the program is
8 copyrighted by me. This is to assure the program does *not* fall
9 into the public domain. Thus, I may specify just what I am now:
10 This program may be freely copied and distributed, provided this notice
11 remains; it may not be sold for profit without express written consent of
13 Please note that I recreated the behavior of the Unix(Tm) 'cut' command
14 as faithfully as possible; however, I haven't run a full set of regression
15 tests. Thus, the user of this program accepts full responsibility for any
16 effects or loss; in particular, the author is not responsible for any losses,
17 explicit or incidental, that may be incurred through use of this program.
19 I ask that any bugs (and, if possible, fixes) be reported to me when
20 possible. -David Ihnat (312) 784-4544 ignatz@homebru.chi.il.us
22 POSIX changes, bug fixes, long-named options, and cleanup
23 by David MacKenzie <djm@gnu.ai.mit.edu>.
25 Rewrite cut_fields and cut_bytes -- Jim Meyering (meyering@comco.com).
29 -b byte-list Print only the bytes in positions listed
31 Tabs and backspaces are treated like any
32 other character; they take up 1 byte.
34 --characters=character-list
35 -c character-list Print only characters in positions listed
37 The same as -b for now, but
38 internationalization will change that.
39 Tabs and backspaces are treated like any
40 other character; they take up 1 character.
43 -f field-list Print only the fields listed in FIELD-LIST.
44 Fields are separated by a TAB by default.
47 -d delim For -f, fields are separated by the first
48 character in DELIM instead of TAB.
50 -n Do not split multibyte chars (no-op for now).
53 -s For -f, do not print lines that do not contain
54 the field separator character.
56 The BYTE-LIST, CHARACTER-LIST, and FIELD-LIST are one or more numbers
57 or ranges separated by commas. The first byte, character, and field
60 A FILE of `-' means standard input. */
64 /* Get isblank from GNU libc. */
73 #include <sys/types.h>
78 #define FATAL_ERROR(s) \
86 /* Append LOW, HIGH to the list RP of range pairs, allocating additional
87 space if necessary. Update local variable N_RP. When allocating,
88 update global variable N_RP_ALLOCATED. */
90 #define ADD_RANGE_PAIR(rp, low, high) \
93 if (n_rp >= n_rp_allocated) \
95 n_rp_allocated *= 2; \
96 (rp) = (struct range_pair *) xrealloc ((rp), \
97 n_rp_allocated * sizeof (*(rp))); \
99 rp[n_rp].lo = (low); \
100 rp[n_rp].hi = (high); \
114 /* This buffer is used to support the semantics of the -s option
115 (or lack of same) when the specified field list includes (does
116 not include) the first field. In both of those cases, the entire
117 first field must be read into this buffer to determine whether it
118 is followed by a delimiter or a newline before any of it may be
119 output. Otherwise, cut_fields can do the job without using this
121 static char *field_1_buffer;
123 /* The number of bytes allocated for FIELD_1_BUFFER. */
124 static int field_1_bufsize;
126 /* The largest field or byte index used as an endpoint of a closed
127 or degenerate range specification; this doesn't include the starting
128 index of right-open-ended ranges. For example, with either range spec
129 `2-5,9-', `2-3,5,9-' this variable would be set to 5. */
130 static int max_range_endpoint;
132 /* If nonzero, this is the index of the first field in a range that goes
134 static int eol_range_start;
136 /* In byte mode, which bytes to output.
137 In field mode, which DELIM-separated fields to output.
138 Both bytes and fields are numbered starting with 1,
139 so the zeroth element of this array is unused.
140 A field or byte K has been selected if
141 (K <= MAX_RANGE_ENDPOINT and PRINTABLE_FIELD[K])
142 || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */
143 static int *printable_field;
149 /* Output characters that are in the given bytes. */
152 /* Output the given delimeter-separated fields. */
156 /* The name this program was run with. */
159 static enum operating_mode operating_mode;
161 /* If non-zero do not output lines containing no delimeter characters.
162 Otherwise, all such lines are printed. This option is valid only
164 static int suppress_non_delimited;
166 /* The delimeter character for field mode. */
169 /* Nonzero if we have ever read standard input. */
170 static int have_read_stdin;
172 /* If non-zero, display usage information and exit. */
173 static int show_help;
175 /* If non-zero, print the version on standard output then exit. */
176 static int show_version;
178 static struct option const longopts[] =
180 {"bytes", required_argument, 0, 'b'},
181 {"characters", required_argument, 0, 'c'},
182 {"fields", required_argument, 0, 'f'},
183 {"delimiter", required_argument, 0, 'd'},
184 {"only-delimited", no_argument, 0, 's'},
185 {"help", no_argument, &show_help, 1},
186 {"version", no_argument, &show_version, 1},
195 fprintf (stderr, "Try `%s --help' for more information.\n",
200 Usage: %s [OPTION]... [FILE]...\n\
205 -b, --bytes=LIST output only these bytes\n\
206 -c, --characters=LIST output only these characters\n\
207 -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
208 -f, --fields=LIST output only these fields\n\
210 -s, --only-delimited do not print lines not containing delimiters\n\
211 --help display this help and exit\n\
212 --version output version information and exit\n\
214 Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
215 range, or many ranges separated by commas. Each range is one of:\n\
217 N N'th byte, character or field, counted from 1\n\
218 N- from N'th byte, character or field, to end of line\n\
219 N-M from N'th to M'th (included) byte, character or field\n\
220 -M from first to M'th (included) byte, character or field\n\
222 With no FILE, or when FILE is -, read standard input.\n\
228 /* The following function was copied from getline.c, but with these changes:
229 - Read up to and including a newline or TERMINATOR, whichever comes first.
230 The original does not treat newline specially.
231 - Remove unused argument, OFFSET.
232 - Use xmalloc and xrealloc instead of malloc and realloc.
233 - Declare this function static. */
235 /* Always add at least this many bytes when extending the buffer. */
238 /* Read up to (and including) a newline or TERMINATOR from STREAM into
239 *LINEPTR (and null-terminate it). *LINEPTR is a pointer returned from
240 xmalloc (or NULL), pointing to *N characters of space. It is
241 xrealloc'd as necessary. Return the number of characters read (not
242 including the null terminator), or -1 on error or EOF. */
245 getstr (lineptr, n, stream, terminator)
251 int nchars_avail; /* Allocated but unused chars in *LINEPTR. */
252 char *read_pos; /* Where we're reading into *LINEPTR. */
254 if (!lineptr || !n || !stream)
260 *lineptr = xmalloc (*n);
270 register int c = getc (stream);
272 /* We always want at least one char left in the buffer, since we
273 always (unless we get an error while reading the first char)
274 NUL-terminate the line buffer. */
276 assert (*n - nchars_avail == read_pos - *lineptr);
277 if (nchars_avail < 1)
284 nchars_avail = *n + *lineptr - read_pos;
285 *lineptr = xrealloc (*lineptr, *n);
288 read_pos = *n - nchars_avail + *lineptr;
289 assert (*n - nchars_avail == read_pos - *lineptr);
292 if (feof (stream) || ferror (stream))
294 /* Return partial line, if any. */
295 if (read_pos == *lineptr)
304 if (c == terminator || c == '\n')
305 /* Return the line. */
309 /* Done - NUL terminate and return the number of chars read. */
312 return read_pos - *lineptr;
319 return ((eol_range_start > 0 && eol_range_start <= k)
320 || (k <= max_range_endpoint && printable_field[k]));
323 /* Given the list of field or byte range specifications FIELDSTR, set
324 MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
325 array. If there is a right-open-ended range, set EOL_RANGE_START
326 to its starting index. FIELDSTR should be composed of one or more
327 numbers or ranges of numbers, separated by blanks or commas.
328 Incomplete ranges may be given: `-m' means `1-m'; `n-' means `n'
329 through end of line. Return non-zero if FIELDSTR contains at least
330 one field specification, zero otherwise. */
332 /* FIXME-someday: What if the user wants to cut out the 1,000,000-th field
333 of some huge input file? This function shouldn't have to alloate a table
334 of a million ints just so we can test every field < 10^6 with an array
335 dereference. Instead, consider using a dynamic hash table. It would be
336 simpler and nearly as good a solution to use a 32K x 4-byte table with
337 one bit per field index instead of a whole `int' per index. */
340 set_fields (fieldstr)
341 const char *fieldstr;
343 int initial = 1; /* Value of first number in a range. */
344 int dash_found = 0; /* Nonzero if a '-' is found in this field. */
345 int value = 0; /* If nonzero, a number being accumulated. */
346 int field_found = 0; /* Non-zero if at least one field spec
347 has been processed. */
349 struct range_pair *rp;
351 unsigned int n_rp_allocated;
356 rp = (struct range_pair *) xmalloc (n_rp_allocated * sizeof (*rp));
358 /* Collect and store in RP the range end points.
359 It also sets EOL_RANGE_START if appropriate. */
363 if (*fieldstr == '-')
365 /* Starting a range. */
367 FATAL_ERROR ("invalid byte or field list");
379 else if (*fieldstr == ',' || ISBLANK (*fieldstr) || *fieldstr == '\0')
381 /* Ending the string, or this field/byte sublist. */
386 /* A range. Possibilites: -n, m-n, n-.
387 In any case, `initial' contains the start of the range. */
390 /* `n-'. From `initial' to end of line. */
391 eol_range_start = initial;
396 /* `m-n' or `-n' (1-n). */
398 FATAL_ERROR ("invalid byte or field list");
400 /* Is there already a range going to end of line? */
401 if (eol_range_start != 0)
403 /* Yes. Is the new sequence already contained
404 in the old one? If so, no processing is
406 if (initial < eol_range_start)
408 /* No, the new sequence starts before the
409 old. Does the old range going to end of line
410 extend into the new range? */
411 if (value >= eol_range_start - 1)
413 /* Yes. Simply move the end of line marker. */
414 eol_range_start = initial;
418 /* No. A simple range, before and disjoint from
419 the range going to end of line. Fill it. */
420 ADD_RANGE_PAIR (rp, initial, value);
423 /* In any case, some fields were selected. */
429 /* There is no range going to end of line. */
430 ADD_RANGE_PAIR (rp, initial, value);
438 /* A simple field number, not a range. */
439 ADD_RANGE_PAIR (rp, value, value);
444 if (*fieldstr == '\0')
451 else if (ISDIGIT (*fieldstr))
453 /* FIXME: detect overflow? */
454 value = 10 * value + *fieldstr - '0';
458 FATAL_ERROR ("invalid byte or field list");
461 max_range_endpoint = 0;
462 for (i = 0; i < n_rp; i++)
464 if (rp[i].hi > max_range_endpoint)
465 max_range_endpoint = rp[i].hi;
468 /* Allocate an array large enough so that it may be indexed by
469 the field numbers corresponding to all finite ranges
470 (i.e. `2-6' or `-4', but not `5-') in FIELDSTR. */
472 printable_field = (int *) xmalloc ((max_range_endpoint + 1) * sizeof (int));
473 for (i = 1; i <= max_range_endpoint; i++)
474 printable_field[i] = 0;
476 /* Set the array entries corresponding to integers in the ranges of RP. */
477 for (i = 0; i < n_rp; i++)
480 for (j = rp[i].lo; j <= rp[i].hi; j++)
482 printable_field[j] = 1;
491 /* Read from stream STREAM, printing to standard output any selected bytes. */
497 int byte_idx; /* Number of chars in the line so far. */
502 register int c; /* Each character from the file. */
520 if (print_kth (byte_idx))
528 /* Read from stream STREAM, printing to standard output any selected fields. */
536 int found_any_selected_field;
537 int buffer_first_field;
539 found_any_selected_field = 0;
542 /* To support the semantics of the -s flag, we may have to buffer
543 all of the first field to determine whether it is `delimited.'
544 But that is unnecessary if all non-delimited lines must be printed
545 and the first field has been selected, or if non-delimited lines
546 must be suppressed and the first field has *not* been selected.
547 That is because a non-delimited line has exactly one field. */
548 buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
552 if (field_idx == 1 && buffer_first_field)
556 len = getstr (&field_1_buffer, &field_1_bufsize, stream, delim);
562 /* If the first field extends to the end of line (it is not
563 delimited) and we are printing all non-delimited lines,
565 if (field_1_buffer[len - 1] != delim)
567 if (suppress_non_delimited)
573 fwrite (field_1_buffer, sizeof (char), len, stdout);
574 /* Make sure the output line is newline terminated. */
575 if (field_1_buffer[len - 1] != '\n')
582 /* Print the field, but not the trailing delimiter. */
583 fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
584 found_any_selected_field = 1;
589 if (print_kth (field_idx))
591 if (found_any_selected_field)
593 found_any_selected_field = 1;
595 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
602 while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
620 else if (c == '\n' || c == EOF)
622 if (found_any_selected_field
623 || !(suppress_non_delimited && field_idx == 1))
628 found_any_selected_field = 0;
637 if (operating_mode == byte_mode)
643 /* Process file FILE to standard output.
644 Return 0 if successful, 1 if not. */
652 if (!strcmp (file, "-"))
659 stream = fopen (file, "r");
662 error (0, errno, "%s", file);
671 error (0, errno, "%s", file);
674 if (!strcmp (file, "-"))
675 clearerr (stream); /* Also clear EOF. */
676 else if (fclose (stream) == EOF)
678 error (0, errno, "%s", file);
689 int optc, exit_status = 0;
691 program_name = argv[0];
693 operating_mode = undefined_mode;
695 /* By default, all non-delimited lines are printed. */
696 suppress_non_delimited = 0;
701 while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, (int *) 0))
711 /* Build the byte list. */
712 if (operating_mode != undefined_mode)
713 FATAL_ERROR ("only one type of list may be specified");
714 operating_mode = byte_mode;
715 if (set_fields (optarg) == 0)
716 FATAL_ERROR ("missing list of positions");
720 /* Build the field list. */
721 if (operating_mode != undefined_mode)
722 FATAL_ERROR ("only one type of list may be specified");
723 operating_mode = field_mode;
724 if (set_fields (optarg) == 0)
725 FATAL_ERROR ("missing list of fields");
730 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
731 if (optarg[0] != '\0' && optarg[1] != '\0')
732 FATAL_ERROR ("the delimiter must be a single character");
740 suppress_non_delimited = 1;
750 printf ("cut - %s\n", version_string);
757 if (operating_mode == undefined_mode)
758 FATAL_ERROR ("you must specify a list of bytes, characters, or fields");
760 if (delim != '\0' && operating_mode != field_mode)
761 FATAL_ERROR ("a delimiter may be specified only when operating on fields");
763 if (suppress_non_delimited && operating_mode != field_mode)
764 FATAL_ERROR ("suppressing non-delimited lines makes sense\n\
765 \tonly when operating on fields");
771 exit_status |= cut_file ("-");
773 for (; optind < argc; optind++)
774 exit_status |= cut_file (argv[optind]);
776 if (have_read_stdin && fclose (stdin) == EOF)
778 error (0, errno, "-");
781 if (ferror (stdout) || fclose (stdout) == EOF)
782 error (1, errno, "write error");