1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 86, 91, 1995-2001, Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by Richard Stallman and David MacKenzie. */
24 #include <sys/types.h>
29 #include "linebuffer.h"
32 #include "memcasecmp.h"
34 /* The official name of this program (e.g., no `g' prefix). */
35 #define PROGRAM_NAME "uniq"
37 #define AUTHORS N_ ("Richard Stallman and David MacKenzie")
/* NOTE(review): only the first lines of this macro are visible here.  It
   declares a struct linebuffer pointer _tmp and is invoked in check_file
   as SWAP_LINES (prevline, thisline), so presumably it exchanges A and B
   through _tmp -- confirm against the full definition.  */
39 #define SWAP_LINES(A, B) \
42 struct linebuffer *_tmp; \
49 /* The name this program was run with. */
52 /* Number of fields to skip on each line when doing comparisons.
   Set by -f/--skip-fields, or built up one decimal digit at a time from
   the -DIGIT options handled in main.  Read by find_field.  */
53 static size_t skip_fields;
55 /* Number of chars to skip after skipping any fields.
   Set by -s/--skip-chars.  Read by find_field.  */
56 static size_t skip_chars;
58 /* Number of chars to compare.
   Set by -w/--check-chars; main initializes it to SIZE_MAX, in which case
   the clamping done in `different' leaves both lengths untouched.  */
59 static size_t check_chars;
63 count_occurrences, /* -c Print count before output lines. */
64 count_none /* Default. Do not print counts. */
67 /* Whether and how to precede the output lines with a count of the number of
68 times they occurred in the input.
   main initializes it to count_none; writeline tests it.  */
69 static enum countmode countmode;
73 output_repeated, /* -d Only lines that are repeated. */
74 output_all_repeated, /* -D All lines that are repeated. */
75 output_unique, /* -u Only lines that are not repeated. */
76 output_all /* Default. Print first copy of each line. */
79 /* Which lines to output.  Tested by writeline and check_file.  */
80 static enum output_mode mode;
82 /* If nonzero, ignore case when comparing. */
83 static int ignore_case;
87 /* No delimiters output. --all-repeated[=none] */
90 /* Delimiter precedes all groups. --all-repeated=prepend */
93 /* Delimit all groups. --all-repeated=separate */
/* Argument spellings accepted for --all-repeated; main hands this table to
   XARGMATCH.  The list ends with a 0 entry.  */
97 static char const *const delimit_method_string[] =
99 "none", "prepend", "separate", 0
/* Enum values listed in the same order as delimit_method_string, so entry
   i presumably corresponds to string i -- confirm against XARGMATCH.  */
102 static enum delimit_method const delimit_method_map[] =
104 DM_NONE, DM_PREPEND, DM_SEPARATE
107 /* Select whether/how to delimit groups of duplicate lines.
   main initializes it to DM_NONE; check_file consults it in -D mode.  */
108 static enum delimit_method delimit_groups;
/* Long options passed to getopt_long in main.  The last field of each
   entry is the short-option character that main's option handling sees.
   --all-repeated is the only entry whose argument is optional; the three
   numeric options (--skip-fields, --skip-chars, --check-chars) require
   one.  */
110 static struct option const longopts[] =
112 {"count", no_argument, NULL, 'c'},
113 {"repeated", no_argument, NULL, 'd'},
114 {"all-repeated", optional_argument, NULL, 'D'},
115 {"ignore-case", no_argument, NULL, 'i'},
116 {"unique", no_argument, NULL, 'u'},
117 {"skip-fields", required_argument, NULL, 'f'},
118 {"skip-chars", required_argument, NULL, 's'},
119 {"check-chars", required_argument, NULL, 'w'},
120 {GETOPT_HELP_OPTION_DECL},
121 {GETOPT_VERSION_OPTION_DECL},
/* Visible part of the usage routine: it emits the short "Try --help" hint
   on stderr, the full option summary, and the bug-report address, then
   exits with EXIT_SUCCESS when STATUS is 0 and EXIT_FAILURE otherwise.
   The delimit-method line of the help text used to read "separate)}"
   with an unbalanced parenthesis; it now reads "separate}".  */
129 fprintf (stderr, _("Try `%s --help' for more information.\n"),
134 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
138 Discard all but one of successive identical lines from INPUT (or\n\
139 standard input), writing to OUTPUT (or standard output).\n\
141 -c, --count prefix lines by the number of occurrences\n\
142 -d, --repeated only print duplicate lines\n\
143 -D, --all-repeated[=delimit-method] print all duplicate lines\n\
144 delimit-method={none(default),prepend,separate}\n\
145 Delimiting is done with blank lines.\n\
146 -f, --skip-fields=N avoid comparing the first N fields\n\
147 -i, --ignore-case ignore differences in case when comparing\n\
148 -s, --skip-chars=N avoid comparing the first N characters\n\
149 -u, --unique only print unique lines\n\
150 -w, --check-chars=N compare no more than N characters in lines\n\
152 +N same as -s N (obsolescent; will be withdrawn)\n\
153 --help display this help and exit\n\
154 --version output version information and exit\n\
156 A field is a run of whitespace, then non-whitespace characters.\n\
157 Fields are skipped before chars.\n\
159 puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
161 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
164 /* Convert OPT to size_t, reporting an error using MSGID if it does
   not convert successfully.  */
168 size_opt (char const *opt, char const *msgid)
/* OPT is the option's argument text.  MSGID is an untranslated message
   (the callers in main wrap it in N_); it is translated with _() only
   at the point where it is reported.  */
170 unsigned long int size;
/* Parse OPT in base 10 with xstrtoul.  A failure is reported through
   error with status EXIT_FAILURE, printing OPT followed by the translated
   message.  NOTE(review): the empty string passed as the last argument
   presumably means "no suffixes accepted" -- confirm against the
   xstrtoul declaration.  */
171 if (xstrtoul (opt, NULL, 10, &size, "") != LONGINT_OK
173 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
177 /* Given a linebuffer LINE,
178 return a pointer to the beginning of the line's field to be compared.
   The position is found by first passing over up to skip_fields fields
   and then up to skip_chars further characters.  */
181 find_field (const struct linebuffer *line)
183 register size_t count;
184 register char *lp = line->buffer;
/* SIZE is one less than the stored length, so the scan below never moves
   past the final byte of the line (presumably the trailing newline --
   confirm against readline).  */
185 register size_t size = line->length - 1;
186 register size_t i = 0;
/* Field skipping: each pass scans a run of blanks and then a run of
   non-blanks, which matches the definition of a field given in the usage
   text.  The loop stops early once I reaches SIZE.  */
188 for (count = 0; count < skip_fields && i < size; count++)
190 while (i < size && ISBLANK (lp[i]))
192 while (i < size && !ISBLANK (lp[i]))
/* Character skipping: runs at most skip_chars times, still bounded by
   SIZE.  */
196 for (count = 0; count < skip_chars && i < size; count++)
202 /* Return zero if two strings OLD and NEW match, nonzero if not.
203 OLD and NEW point not to the beginnings of the lines
204 but rather to the beginnings of the fields to compare.
205 OLDLEN and NEWLEN are their lengths. */
208 different (const char *old, const char *new, size_t oldlen, size_t newlen)
/* Clamp both lengths to check_chars (the -w limit).  main sets
   check_chars to SIZE_MAX by default, so without -w nothing is clamped.  */
210 if (check_chars < oldlen)
211 oldlen = check_chars;
212 if (check_chars < newlen)
213 newlen = check_chars;
/* The clamped lengths are compared before the contents; the contents are
   then compared over OLDLEN bytes with memcasecmp or memcmp (presumably
   selected by ignore_case).
   NOTE(review): in this listing the comment that follows has lost its
   closing line.  */
215 if (oldlen != newlen)
218 /* Use an if-statement here rather than a function variable to
219 avoid portability hassles of getting a non-conflicting declaration
222 return memcasecmp (old, new, oldlen);
224 return memcmp (old, new, oldlen);
227 /* Output the line in linebuffer LINE to stream STREAM
228 provided that the switches say it should be output.
229 If requested, print the number of times it occurred, as well;
230 LINECOUNT + 1 is the number of times that the line occurred. */
233 writeline (const struct linebuffer *line, FILE *stream, int linecount)
/* Lines the selected mode excludes: -u excludes any line that was
   repeated (LINECOUNT != 0); -d and -D exclude any line that was not
   (LINECOUNT == 0).  Presumably the function returns here without
   printing -- the statement guarded by this test is not visible.  */
235 if ((mode == output_unique && linecount != 0)
236 || (mode == output_repeated && linecount == 0)
237 || (mode == output_all_repeated && linecount == 0))
/* With -c the occurrence count is printed first, right-aligned in a
   7-column field and followed by a tab.  */
240 if (countmode == count_occurrences)
241 fprintf (stream, "%7d\t", linecount + 1);
/* Exactly line->length bytes are written; nothing is appended here.  */
243 fwrite (line->buffer, sizeof (char), line->length, stream);
246 /* Process input file INFILE with output to OUTFILE.
247 If either is "-", use the standard I/O stream for it instead. */
250 check_file (const char *infile, const char *outfile)
254 struct linebuffer lb1, lb2;
255 struct linebuffer *thisline, *prevline;
/* Open the streams.  A named input is opened for reading and a named
   output for writing; a failed fopen is reported through error with
   status EXIT_FAILURE and the errno of the failure.  */
257 if (STREQ (infile, "-"))
260 istream = fopen (infile, "r");
262 error (EXIT_FAILURE, errno, "%s", infile);
264 if (STREQ (outfile, "-"))
267 ostream = fopen (outfile, "w");
269 error (EXIT_FAILURE, errno, "%s", outfile);
/* THISLINE and PREVLINE are the two line buffers that get exchanged with
   SWAP_LINES as input is consumed (presumably they point at lb1 and
   lb2 -- the assignments are not visible).  */
274 initbuffer (thisline);
275 initbuffer (prevline);
277 /* The duplication in the following `if' and `else' blocks is an
278 optimization to distinguish the common case (in which none of
279 the following options has been specified: --count, --repeated,
280 --all-repeated, --unique) from the others. In the common case,
281 this optimization lets uniq output each different line right away,
282 without waiting to see if the next one is different. */
284 if (mode == output_all && countmode == count_none)
/* NOTE(review): the IF_LINT initializers presumably exist only to quiet
   uninitialized-variable warnings -- confirm against its definition.  */
286 char *prevfield IF_LINT (= NULL);
287 size_t prevlen IF_LINT (= 0);
289 while (!feof (istream))
293 if (readline (thisline, istream) == 0)
/* THISLEN is the number of bytes from the start of the compared field to
   the end of the stored line.  */
295 thisfield = find_field (thisline);
296 thislen = thisline->length - (thisfield - thisline->buffer);
/* A line is written immediately when there is no previous line to compare
   against (prevline->length == 0, presumably only before the first line
   has been stored) or when its compared field differs from the previous
   line's.  */
297 if (prevline->length == 0
298 || different (thisfield, prevfield, thislen, prevlen))
300 fwrite (thisline->buffer, sizeof (char),
301 thisline->length, ostream);
303 SWAP_LINES (prevline, thisline);
304 prevfield = thisfield;
/* General case: the first line is read into PREVLINE up front, and each
   later line is compared against it before anything is written.  */
314 int first_delimiter = 1;
316 if (readline (prevline, istream) == 0)
318 prevfield = find_field (prevline);
319 prevlen = prevline->length - (prevfield - prevline->buffer);
321 while (!feof (istream))
326 if (readline (thisline, istream) == 0)
328 thisfield = find_field (thisline);
329 thislen = thisline->length - (thisfield - thisline->buffer);
330 match = !different (thisfield, prevfield, thislen, prevlen);
/* Group delimiting for --all-repeated: a newline is written when
   delimit_groups is DM_PREPEND, or when it is DM_SEPARATE and
   first_delimiter has already been cleared.  */
335 if (mode == output_all_repeated && delimit_groups != DM_NONE)
339 if (match_count) /* a previous match */
340 first_delimiter = 0; /* Only used when DM_SEPARATE */
342 else if (match_count == 1)
344 if ((delimit_groups == DM_PREPEND)
345 || (delimit_groups == DM_SEPARATE
346 && !first_delimiter))
347 putc ('\n', ostream);
/* On a mismatch -- or for every line in -D mode -- hand PREVLINE and its
   repeat count to writeline, then make the current line the new
   PREVLINE.  */
351 if (!match || mode == output_all_repeated)
353 writeline (prevline, ostream, match_count);
354 SWAP_LINES (prevline, thisline);
355 prevfield = thisfield;
/* The line still held in PREVLINE is handed to writeline as well.  */
362 writeline (prevline, ostream, match_count);
/* NOTE(review): when ferror (istream) is what triggers this report, errno
   may no longer describe the read that failed.  */
366 if (ferror (istream) || fclose (istream) == EOF)
367 error (EXIT_FAILURE, errno, _("error reading %s"), infile);
369 /* Close ostream only if it's not stdout -- the latter is closed
370 via the atexit-invoked close_stdout. */
371 if (ostream != stdout && (ferror (ostream) || fclose (ostream) == EOF))
372 error (EXIT_FAILURE, errno, _("error writing %s"), outfile);
379 main (int argc, char **argv)
/* Entry point: set the option defaults, parse the command line into the
   file-scope option variables and at most two file operands, then pass
   the operands to check_file.  */
382 int posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
/* Both operands default to "-", which check_file treats as the standard
   stream.  */
386 file[0] = file[1] = "-";
387 program_name = argv[0];
388 setlocale (LC_ALL, "");
389 bindtextdomain (PACKAGE, LOCALEDIR);
390 textdomain (PACKAGE);
/* close_stdout is registered to run at exit; check_file relies on this
   and does not close stdout itself.  */
392 atexit (close_stdout);
/* Defaults: no limit on compared characters, no counts, no group
   delimiters.  */
396 check_chars = SIZE_MAX;
398 countmode = count_none;
399 delimit_groups = DM_NONE;
403 /* Parse an operand with leading "+" as a file after "--" was
404 seen; or if pedantic and a file was seen. POSIX 1003.1-200x
405 d7 removes support for such operands, so when it becomes
406 official the code will need to be changed. */
409 || (posixly_correct && nfiles != 0)
410 || ((optc = getopt_long (argc, argv,
411 "-0123456789cdDf:is:uw:", longopts, NULL))
418 error (0, 0, _("extra operand `%s'"), argv[optind]);
421 file[nfiles++] = argv[optind++];
/* NOTE(review): the start of the condition below is not visible.  Going by
   the usage text ("+N same as -s N"), this is presumably where a "+N"
   operand is parsed as a character-skip count; anything else is stored
   as a file operand.  A third file operand is diagnosed as an extra
   operand.  */
427 unsigned long int size;
429 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
432 else if (nfiles == 2)
434 error (0, 0, _("extra operand `%s'"), optarg);
438 file[nfiles++] = optarg;
/* Digit options accumulate a decimal value in skip_fields, one digit per
   option character.  The test rejects both an overflowing multiplication
   by 10 (SIZE_MAX / 10 < s) and a wrapped addition (skip_fields < s).  */
453 size_t s = skip_fields;
454 skip_fields = s * 10 + optc - '0';
455 if (SIZE_MAX / 10 < s || skip_fields < s)
456 error (EXIT_FAILURE, 0, "%s",
457 _("invalid number of fields to skip"));
462 countmode = count_occurrences;
466 mode = output_repeated;
/* -D / --all-repeated: the argument is optional (see longopts).  DM_NONE
   is one outcome; otherwise the argument is matched against
   delimit_method_string through XARGMATCH.  */
470 mode = output_all_repeated;
472 delimit_groups = DM_NONE;
474 delimit_groups = XARGMATCH ("--all-repeated", optarg,
475 delimit_method_string,
479 case 'f': /* Like '-#'. */
480 skip_fields = size_opt (optarg,
481 N_("invalid number of fields to skip"));
488 case 's': /* Like '+#'. */
489 skip_chars = size_opt (optarg,
490 N_("invalid number of bytes to skip"));
494 mode = output_unique;
498 check_chars = size_opt (optarg,
499 N_("invalid number of bytes to compare"));
502 case_GETOPT_HELP_CHAR;
504 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
/* -c combined with -D is diagnosed as meaningless.  */
511 if (countmode == count_occurrences && mode == output_all_repeated)
514 _("printing all duplicated lines and repeat counts is meaningless"));
/* file[0] is the input operand and file[1] the output operand.  */
518 check_file (file[0], file[1]);