1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 86, 91, 1995-2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
23 #include <sys/types.h>
27 #include "linebuffer.h"
29 #include "hard-locale.h"
34 #include "memcasecmp.h"
36 /* The official name of this program (e.g., no `g' prefix). */
37 #define PROGRAM_NAME "uniq"
40 proper_name ("Richard M. Stallman"), \
41 proper_name ("David MacKenzie")
43 #define SWAP_LINES(A, B) \
46 struct linebuffer *_tmp; \
53 /* True if the LC_COLLATE locale is hard. */
54 static bool hard_LC_COLLATE;
56 /* Number of fields to skip on each line when doing comparisons. */
57 static size_t skip_fields;
59 /* Number of chars to skip after skipping any fields. */
60 static size_t skip_chars;
62 /* Number of chars to compare. */
63 static size_t check_chars;
67 count_occurrences, /* -c Print count before output lines. */
68 count_none /* Default. Do not print counts. */
71 /* Whether and how to precede the output lines with a count of the number of
72 times they occurred in the input. */
73 static enum countmode countmode;
75 /* Which lines to output: unique lines, the first of a group of
76 repeated lines, and the second and subsequent of a group of
78 static bool output_unique;
79 static bool output_first_repeated;
80 static bool output_later_repeated;
82 /* If true, ignore case when comparing. */
83 static bool ignore_case;
87 /* No delimiters output. --all-repeated[=none] */
90 /* Delimiter precedes all groups. --all-repeated=prepend */
93 /* Delimit all groups. --all-repeated=separate */
97 static char const *const delimit_method_string[] =
99 "none", "prepend", "separate", NULL
102 static enum delimit_method const delimit_method_map[] =
104 DM_NONE, DM_PREPEND, DM_SEPARATE
107 /* Select whether/how to delimit groups of duplicate lines. */
108 static enum delimit_method delimit_groups;
110 static struct option const longopts[] =
112 {"count", no_argument, NULL, 'c'},
113 {"repeated", no_argument, NULL, 'd'},
114 {"all-repeated", optional_argument, NULL, 'D'},
115 {"ignore-case", no_argument, NULL, 'i'},
116 {"unique", no_argument, NULL, 'u'},
117 {"skip-fields", required_argument, NULL, 'f'},
118 {"skip-chars", required_argument, NULL, 's'},
119 {"check-chars", required_argument, NULL, 'w'},
120 {"zero-terminated", no_argument, NULL, 'z'},
121 {GETOPT_HELP_OPTION_DECL},
122 {GETOPT_VERSION_OPTION_DECL},
129 if (status != EXIT_SUCCESS)
130 fprintf (stderr, _("Try `%s --help' for more information.\n"),
135 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
139 Discard all but one of successive identical lines from INPUT (or\n\
140 standard input), writing to OUTPUT (or standard output).\n\
144 Mandatory arguments to long options are mandatory for short options too.\n\
147 -c, --count prefix lines by the number of occurrences\n\
148 -d, --repeated only print duplicate lines\n\
151 -D, --all-repeated[=delimit-method] print all duplicate lines\n\
152 delimit-method={none(default),prepend,separate}\n\
153 Delimiting is done with blank lines.\n\
154 -f, --skip-fields=N avoid comparing the first N fields\n\
155 -i, --ignore-case ignore differences in case when comparing\n\
156 -s, --skip-chars=N avoid comparing the first N characters\n\
157 -u, --unique only print unique lines\n\
158 -z, --zero-terminated end lines with 0 byte, not newline\n\
161 -w, --check-chars=N compare no more than N characters in lines\n\
163 fputs (HELP_OPTION_DESCRIPTION, stdout);
164 fputs (VERSION_OPTION_DESCRIPTION, stdout);
167 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
168 characters. Fields are skipped before chars.\n\
172 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
173 You may want to sort the input first, or use `sort -u' without `uniq'.\n\
175 emit_bug_reporting_address ();
180 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
181 invalid. Silently convert too-large values to SIZE_MAX. */
184 size_opt (char const *opt, char const *msgid)
186 unsigned long int size;
187 verify (SIZE_MAX <= ULONG_MAX);
189 switch (xstrtoul (opt, NULL, 10, &size, ""))
192 case LONGINT_OVERFLOW:
196 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
199 return MIN (size, SIZE_MAX);
202 /* Given a linebuffer LINE,
203 return a pointer to the beginning of the line's field to be compared. */
206 find_field (struct linebuffer const *line)
209 char const *lp = line->buffer;
210 size_t size = line->length - 1;
213 for (count = 0; count < skip_fields; count++)
215 while (i < size && isblank (to_uchar (lp[i])))
217 while (i < size && !isblank (to_uchar (lp[i])))
221 for (count = 0; count < skip_chars && i < size; count++)
224 return line->buffer + i;
227 /* Return false if two strings OLD and NEW match, true if not.
228 OLD and NEW point not to the beginnings of the lines
229 but rather to the beginnings of the fields to compare.
230 OLDLEN and NEWLEN are their lengths. */
233 different (char *old, char *new, size_t oldlen, size_t newlen)
235 if (check_chars < oldlen)
236 oldlen = check_chars;
237 if (check_chars < newlen)
238 newlen = check_chars;
242 /* FIXME: This should invoke strcoll somehow. */
243 return oldlen != newlen || memcasecmp (old, new, oldlen);
245 else if (hard_LC_COLLATE)
246 return xmemcoll (old, oldlen, new, newlen) != 0;
248 return oldlen != newlen || memcmp (old, new, oldlen);
251 /* Output the line in linebuffer LINE to standard output
252 provided that the switches say it should be output.
253 MATCH is true if the line matches the previous line.
254 If requested, print the number of times it occurred, as well;
255 LINECOUNT + 1 is the number of times that the line occurred. */
258 writeline (struct linebuffer const *line,
259 bool match, uintmax_t linecount)
261 if (! (linecount == 0 ? output_unique
262 : !match ? output_first_repeated
263 : output_later_repeated))
266 if (countmode == count_occurrences)
267 printf ("%7" PRIuMAX " ", linecount + 1);
269 fwrite (line->buffer, sizeof (char), line->length, stdout);
272 /* Process input file INFILE with output to OUTFILE.
273 If either is "-", use the standard I/O stream for it instead. */
276 check_file (const char *infile, const char *outfile, char delimiter)
278 struct linebuffer lb1, lb2;
279 struct linebuffer *thisline, *prevline;
281 if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
282 error (EXIT_FAILURE, errno, "%s", infile);
283 if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
284 error (EXIT_FAILURE, errno, "%s", outfile);
289 initbuffer (thisline);
290 initbuffer (prevline);
292 /* The duplication in the following `if' and `else' blocks is an
293 optimization to distinguish the common case (in which none of
294 the following options has been specified: --count, --repeated,
295 --all-repeated, --unique) from the others. In the common case,
296 this optimization lets uniq output each different line right away,
297 without waiting to see if the next one is different. */
299 if (output_unique && output_first_repeated && countmode == count_none)
301 char *prevfield IF_LINT (= NULL);
302 size_t prevlen IF_LINT (= 0);
304 while (!feof (stdin))
308 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
310 thisfield = find_field (thisline);
311 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
312 if (prevline->length == 0
313 || different (thisfield, prevfield, thislen, prevlen))
315 fwrite (thisline->buffer, sizeof (char),
316 thisline->length, stdout);
318 SWAP_LINES (prevline, thisline);
319 prevfield = thisfield;
328 uintmax_t match_count = 0;
329 bool first_delimiter = true;
331 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
333 prevfield = find_field (prevline);
334 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
336 while (!feof (stdin))
341 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
347 thisfield = find_field (thisline);
348 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
349 match = !different (thisfield, prevfield, thislen, prevlen);
350 match_count += match;
352 if (match_count == UINTMAX_MAX)
354 if (countmode == count_occurrences) /* test the selected mode; the bare enumerator is a constant, not a flag */
355 error (EXIT_FAILURE, 0, _("too many repeated lines"));
359 if (delimit_groups != DM_NONE)
363 if (match_count) /* a previous match */
364 first_delimiter = false; /* Only used when DM_SEPARATE */
366 else if (match_count == 1)
368 if ((delimit_groups == DM_PREPEND)
369 || (delimit_groups == DM_SEPARATE
370 && !first_delimiter))
375 if (!match || output_later_repeated)
377 writeline (prevline, match, match_count);
378 SWAP_LINES (prevline, thisline);
379 prevfield = thisfield;
386 writeline (prevline, false, match_count);
390 if (ferror (stdin) || fclose (stdin) != 0)
391 error (EXIT_FAILURE, 0, _("error reading %s"), infile);
393 /* stdout is handled via the atexit-invoked close_stdout function. */
399 enum Skip_field_option_type
407 main (int argc, char **argv)
410 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
411 enum Skip_field_option_type skip_field_option_type = SFO_NONE;
414 char delimiter = '\n'; /* change with --zero-terminated, -z */
416 file[0] = file[1] = "-";
417 initialize_main (&argc, &argv);
418 set_program_name (argv[0]);
419 setlocale (LC_ALL, "");
420 bindtextdomain (PACKAGE, LOCALEDIR);
421 textdomain (PACKAGE);
422 hard_LC_COLLATE = hard_locale (LC_COLLATE);
424 atexit (close_stdout);
428 check_chars = SIZE_MAX;
429 output_unique = output_first_repeated = true;
430 output_later_repeated = false;
431 countmode = count_none;
432 delimit_groups = DM_NONE;
436 /* Parse an operand with leading "+" as a file after "--" was
437 seen; or if pedantic and a file was seen; or if not
441 || (posixly_correct && nfiles != 0)
442 || ((optc = getopt_long (argc, argv,
443 "-0123456789Dcdf:is:uw:z", longopts, NULL))
450 error (0, 0, _("extra operand %s"), quote (argv[optind]));
451 usage (EXIT_FAILURE);
453 file[nfiles++] = argv[optind++];
459 unsigned long int size;
461 && posix2_version () < 200112
462 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
465 else if (nfiles == 2)
467 error (0, 0, _("extra operand %s"), quote (optarg));
468 usage (EXIT_FAILURE);
471 file[nfiles++] = optarg;
486 if (skip_field_option_type == SFO_NEW)
489 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
490 skip_fields = SIZE_MAX;
492 skip_field_option_type = SFO_OBSOLETE;
497 countmode = count_occurrences;
501 output_unique = false;
505 output_unique = false;
506 output_later_repeated = true;
508 delimit_groups = DM_NONE;
510 delimit_groups = XARGMATCH ("--all-repeated", optarg,
511 delimit_method_string,
516 skip_field_option_type = SFO_NEW;
517 skip_fields = size_opt (optarg,
518 N_("invalid number of fields to skip"));
526 skip_chars = size_opt (optarg,
527 N_("invalid number of bytes to skip"));
531 output_first_repeated = false;
535 check_chars = size_opt (optarg,
536 N_("invalid number of bytes to compare"));
543 case_GETOPT_HELP_CHAR;
545 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
548 usage (EXIT_FAILURE);
552 if (countmode == count_occurrences && output_later_repeated)
555 _("printing all duplicated lines and repeat counts is meaningless"));
556 usage (EXIT_FAILURE);
559 check_file (file[0], file[1], delimiter);