1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 86, 91, 1995-2001, Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by Richard Stallman and David MacKenzie. */
24 #include <sys/types.h>
29 #include "linebuffer.h"
32 #include "memcasecmp.h"
34 /* The official name of this program (e.g., no `g' prefix). */
35 #define PROGRAM_NAME "uniq"
37 #define AUTHORS N_ ("Richard Stallman and David MacKenzie")
39 #define SWAP_LINES(A, B) \
42 struct linebuffer *_tmp; \
49 /* The name this program was run with. */
52 /* Number of fields to skip on each line when doing comparisons. */
53 static size_t skip_fields;
55 /* Number of chars to skip after skipping any fields. */
56 static size_t skip_chars;
58 /* Number of chars to compare. */
59 static size_t check_chars;
63 count_occurrences, /* -c Print count before output lines. */
64 count_none /* Default. Do not print counts. */
67 /* Whether and how to precede the output lines with a count of the number of
68 times they occurred in the input. */
69 static enum countmode countmode;
73 output_repeated, /* -d Only lines that are repeated. */
74 output_all_repeated, /* -D All lines that are repeated. */
75 output_unique, /* -u Only lines that are not repeated. */
76 output_all /* Default. Print first copy of each line. */
79 /* Which lines to output. */
80 static enum output_mode mode;
82 /* If nonzero, ignore case when comparing. */
83 static int ignore_case;
87 /* No delimiters output. --all-repeated[=none] */
90 /* Delimiter precedes all groups. --all-repeated=prepend */
93 /* Delimit all groups. --all-repeated=separate */
/* Names accepted as the argument of --all-repeated (they are passed to
   XARGMATCH together with the option argument); the list ends with 0.  */
97 static char const *const delimit_method_string[] =
99 "none", "prepend", "separate", 0
/* Enumeration values listed in the same order as the names above.  */
102 static enum delimit_method const delimit_method_map[] =
104 DM_NONE, DM_PREPEND, DM_SEPARATE
107 /* Select whether/how to delimit groups of duplicate lines. */
108 static enum delimit_method delimit_groups;
/* Long options handed to getopt_long.  The last field of each entry is
   the corresponding short-option character; --all-repeated is the only
   entry whose argument is optional.  */
110 static struct option const longopts[] =
112 {"count", no_argument, NULL, 'c'},
113 {"repeated", no_argument, NULL, 'd'},
114 {"all-repeated", optional_argument, NULL, 'D'},
115 {"ignore-case", no_argument, NULL, 'i'},
116 {"unique", no_argument, NULL, 'u'},
117 {"skip-fields", required_argument, NULL, 'f'},
118 {"skip-chars", required_argument, NULL, 's'},
119 {"check-chars", required_argument, NULL, 'w'},
120 {GETOPT_HELP_OPTION_DECL},
121 {GETOPT_VERSION_OPTION_DECL},
/* Help output.  A short hint that points at --help is written to
   stderr; the option summary below is the text shown otherwise.  The
   last statement exits with EXIT_SUCCESS when STATUS is zero and with
   EXIT_FAILURE for any other value.  */
129 fprintf (stderr, _("Try `%s --help' for more information.\n"),
134 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
138 Discard all but one of successive identical lines from INPUT (or\n\
139 standard input), writing to OUTPUT (or standard output).\n\
141 Mandatory arguments to long options are mandatory for short options too.\n\
142 -c, --count prefix lines by the number of occurrences\n\
143 -d, --repeated only print duplicate lines\n\
146 -D, --all-repeated[=delimit-method] print all duplicate lines\n\
147 delimit-method={none(default),prepend,separate}\n\
148 Delimiting is done with blank lines.\n\
149 -f, --skip-fields=N avoid comparing the first N fields\n\
150 -i, --ignore-case ignore differences in case when comparing\n\
151 -s, --skip-chars=N avoid comparing the first N characters\n\
152 -u, --unique only print unique lines\n\
155 -w, --check-chars=N compare no more than N characters in lines\n\
157 +N same as -s N (obsolescent; will be withdrawn)\n\
158 --help display this help and exit\n\
159 --version output version information and exit\n\
161 A field is a run of whitespace, then non-whitespace characters.\n\
162 Fields are skipped before chars.\n\
164 puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
166 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
169 /* Convert OPT to size_t, reporting an error using MSGID if it does not fit. */
173 size_opt (char const *opt, char const *msgid)
175 unsigned long int size;
176 if (xstrtoul (opt, NULL, 10, &size, "") != LONGINT_OK
178 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
182 /* Given a linebuffer LINE,
183 return a pointer to the beginning of the line's field to be compared. */
186 find_field (const struct linebuffer *line)
188 register size_t count;
189 register char *lp = line->buffer;
/* SIZE leaves out the last byte of the line, so the scans below never
   step onto it (presumably the trailing newline -- confirm against
   readline).  */
190 register size_t size = line->length - 1;
191 register size_t i = 0;
/* Skip at most skip_fields fields; each field is a run of blanks
   followed by a run of non-blanks.  */
193 for (count = 0; count < skip_fields && i < size; count++)
195 while (i < size && ISBLANK (lp[i]))
197 while (i < size && !ISBLANK (lp[i]))
/* Then skip at most skip_chars further characters, never going past
   SIZE.  */
201 for (count = 0; count < skip_chars && i < size; count++)
207 /* Return zero if two strings OLD and NEW match, nonzero if not.
208 OLD and NEW point not to the beginnings of the lines
209 but rather to the beginnings of the fields to compare.
210 OLDLEN and NEWLEN are their lengths. */
213 different (const char *old, const char *new, size_t oldlen, size_t newlen)
/* Clamp both lengths to check_chars so that at most that many
   characters take part in the comparison.  */
215 if (check_chars < oldlen)
216 oldlen = check_chars;
217 if (check_chars < newlen)
218 newlen = check_chars;
/* The clamped lengths are tested for equality before any bytes are
   compared; the byte comparison that follows uses OLDLEN as the
   length for both strings.  */
220 if (oldlen != newlen)
223 /* Use an if-statement here rather than a function variable to
224 avoid portability hassles of getting a non-conflicting declaration of memcmp. */
227 return memcasecmp (old, new, oldlen);
229 return memcmp (old, new, oldlen);
232 /* Output the line in linebuffer LINE to stream STREAM
233 provided that the switches say it should be output.
234 If requested, print the number of times it occurred, as well;
235 LINECOUNT + 1 is the number of times that the line occurred. */
238 writeline (const struct linebuffer *line, FILE *stream, int linecount)
/* The combinations tested here are the ones for which the switches do
   not call for output: -u with a line that occurred more than once,
   and -d or -D with a line that occurred exactly once.  */
240 if ((mode == output_unique && linecount != 0)
241 || (mode == output_repeated && linecount == 0)
242 || (mode == output_all_repeated && linecount == 0))
/* With -c the line is preceded by its occurrence count
   (LINECOUNT + 1), formatted with "%7d" and followed by a tab.  */
245 if (countmode == count_occurrences)
246 fprintf (stream, "%7d\t", linecount + 1);
/* Write the line's LENGTH bytes exactly as they are stored in the
   buffer.  */
248 fwrite (line->buffer, sizeof (char), line->length, stream);
251 /* Process input file INFILE with output to OUTFILE.
252 If either is "-", use the standard I/O stream for it instead. */
255 check_file (const char *infile, const char *outfile)
259 struct linebuffer lb1, lb2;
260 struct linebuffer *thisline, *prevline;
/* An INFILE other than "-" is opened for reading; failure to open it
   is fatal and reported with errno.  */
262 if (STREQ (infile, "-"))
265 istream = fopen (infile, "r");
267 error (EXIT_FAILURE, errno, "%s", infile);
/* Likewise, an OUTFILE other than "-" is opened for writing.  */
269 if (STREQ (outfile, "-"))
272 ostream = fopen (outfile, "w");
274 error (EXIT_FAILURE, errno, "%s", outfile);
279 initbuffer (thisline);
280 initbuffer (prevline);
282 /* The duplication in the following `if' and `else' blocks is an
283 optimization to distinguish the common case (in which none of
284 the following options has been specified: --count, --repeated,
285 --all-repeated, --unique) from the others. In the common case,
286 this optimization lets uniq output each different line right away,
287 without waiting to see if the next one is different. */
289 if (mode == output_all && countmode == count_none)
291 char *prevfield IF_LINT (= NULL);
292 size_t prevlen IF_LINT (= 0);
294 while (!feof (istream))
298 if (readline (thisline, istream) == 0)
300 thisfield = find_field (thisline);
301 thislen = thisline->length - (thisfield - thisline->buffer);
/* A line is written directly with fwrite when PREVLINE is still empty
   (nothing has been stored yet) or when its field differs from the
   previous line's field; the two buffers then trade places so the
   line just written becomes PREVLINE.  */
302 if (prevline->length == 0
303 || different (thisfield, prevfield, thislen, prevlen))
305 fwrite (thisline->buffer, sizeof (char),
306 thisline->length, ostream);
308 SWAP_LINES (prevline, thisline);
309 prevfield = thisfield;
319 int first_delimiter = 1;
/* The first line is read into PREVLINE before the loop starts, so
   every later line has something to be compared against.  */
321 if (readline (prevline, istream) == 0)
323 prevfield = find_field (prevline);
324 prevlen = prevline->length - (prevfield - prevline->buffer);
326 while (!feof (istream))
331 if (readline (thisline, istream) == 0)
333 thisfield = find_field (thisline);
334 thislen = thisline->length - (thisfield - thisline->buffer);
335 match = !different (thisfield, prevfield, thislen, prevlen);
/* Group delimiting applies only to -D with a method other than
   DM_NONE.  A newline is written when the method is DM_PREPEND, or
   when it is DM_SEPARATE and first_delimiter has already been
   cleared.  */
340 if (mode == output_all_repeated && delimit_groups != DM_NONE)
344 if (match_count) /* a previous match */
345 first_delimiter = 0; /* Only used when DM_SEPARATE */
347 else if (match_count == 1)
349 if ((delimit_groups == DM_PREPEND)
350 || (delimit_groups == DM_SEPARATE
351 && !first_delimiter))
352 putc ('\n', ostream);
/* PREVLINE is handed to writeline when the new line does not match
   it, and for every line in -D mode; the buffers are then swapped so
   that the new line becomes PREVLINE.  */
356 if (!match || mode == output_all_repeated)
358 writeline (prevline, ostream, match_count);
359 SWAP_LINES (prevline, thisline);
360 prevfield = thisfield;
/* The line still held in PREVLINE is passed to writeline together
   with the accumulated match_count.  */
367 writeline (prevline, ostream, match_count);
/* A read error on the input, or a failure to close it, is fatal.  */
371 if (ferror (istream) || fclose (istream) == EOF)
372 error (EXIT_FAILURE, errno, _("error reading %s"), infile);
374 /* Close ostream only if it's not stdout -- the latter is closed
375 via the atexit-invoked close_stdout. */
376 if (ostream != stdout && (ferror (ostream) || fclose (ostream) == EOF))
377 error (EXIT_FAILURE, errno, _("error writing %s"), outfile);
384 main (int argc, char **argv)
/* Nonzero when POSIXLY_CORRECT is set in the environment; it is
   consulted by the operand handling in the option loop below.  */
387 int posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
/* Both file names default to "-"; file[0] and file[1] are later passed
   to check_file as INFILE and OUTFILE respectively.  */
391 file[0] = file[1] = "-";
392 program_name = argv[0];
393 setlocale (LC_ALL, "");
394 bindtextdomain (PACKAGE, LOCALEDIR);
395 textdomain (PACKAGE);
397 atexit (close_stdout);
/* Defaults: no limit on the number of characters compared, no counts,
   and no delimiting of groups.  */
401 check_chars = SIZE_MAX;
403 countmode = count_none;
404 delimit_groups = DM_NONE;
408 /* Parse an operand with leading "+" as a file after "--" was
409 seen; or if pedantic and a file was seen. POSIX 1003.1-200x
410 d7 removes support for such operands, so when it becomes
411 official the code will need to be changed. */
414 || (posixly_correct && nfiles != 0)
415 || ((optc = getopt_long (argc, argv,
416 "-0123456789cdDf:is:uw:", longopts, NULL))
423 error (0, 0, _("extra operand `%s'"), argv[optind]);
426 file[nfiles++] = argv[optind++];
432 unsigned long int size;
434 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
437 else if (nfiles == 2)
439 error (0, 0, _("extra operand `%s'"), optarg);
443 file[nfiles++] = optarg;
/* A digit option appends one decimal digit to skip_fields.  The check
   that follows catches both a multiplication that would exceed
   SIZE_MAX and a sum that wrapped around below the old value.  */
458 size_t s = skip_fields;
459 skip_fields = s * 10 + optc - '0';
460 if (SIZE_MAX / 10 < s || skip_fields < s)
461 error (EXIT_FAILURE, 0, "%s",
462 _("invalid number of fields to skip"));
467 countmode = count_occurrences;
471 mode = output_repeated;
/* -D selects all-repeated mode; delimit_groups is either DM_NONE or
   the value XARGMATCH looks up for the option argument.  */
475 mode = output_all_repeated;
477 delimit_groups = DM_NONE;
479 delimit_groups = XARGMATCH ("--all-repeated", optarg,
480 delimit_method_string,
484 case 'f': /* Like '-#'. */
485 skip_fields = size_opt (optarg,
486 N_("invalid number of fields to skip"));
493 case 's': /* Like '+#'. */
494 skip_chars = size_opt (optarg,
495 N_("invalid number of bytes to skip"));
499 mode = output_unique;
503 check_chars = size_opt (optarg,
504 N_("invalid number of bytes to compare"));
507 case_GETOPT_HELP_CHAR;
509 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
/* Counting (-c) combined with all-repeated mode (-D) is diagnosed as
   meaningless.  */
516 if (countmode == count_occurrences && mode == output_all_repeated)
519 _("printing all duplicated lines and repeat counts is meaningless"));
523 check_file (file[0], file[1]);