src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 86, 91, 1995-2006 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  17
  18 /* Written by Richard Stallman and David MacKenzie. */
  19 \f
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <getopt.h>
  24 #include <sys/types.h>
  25
  26 #include "system.h"
  27 #include "argmatch.h"
  28 #include "linebuffer.h"
  29 #include "error.h"
  30 #include "hard-locale.h"
  31 #include "posixver.h"
  32 #include "quote.h"
  33 #include "xmemcoll.h"
  34 #include "xstrtol.h"
  35 #include "memcasecmp.h"
  36
/* The official name of this program (e.g., no `g' prefix).  */
#define PROGRAM_NAME "uniq"

/* Author list; main passes it, together with PROGRAM_NAME, to
   case_GETOPT_VERSION_CHAR.  */
#define AUTHORS "Richard Stallman", "David MacKenzie"
  41
  42 #define SWAP_LINES(A, B)                        \
  43   do                                            \
  44     {                                           \
  45       struct linebuffer *_tmp;                  \
  46       _tmp = (A);                               \
  47       (A) = (B);                                \
  48       (B) = _tmp;                               \
  49     }                                           \
  50   while (0)
  51
/* The name this program was run with.  main sets it from argv[0];
   usage prints it.  */
char *program_name;

/* True if the LC_COLLATE locale is hard.  When set (and case is not
   being ignored), `different' compares with xmemcoll rather than
   memcmp.  */
static bool hard_LC_COLLATE;

/* Number of fields to skip on each line when doing comparisons.
   Set by -f/--skip-fields or by the obsolete -N digit options.  */
static size_t skip_fields;

/* Number of chars to skip after skipping any fields.
   Set by -s/--skip-chars or by an obsolete +N operand.  */
static size_t skip_chars;

/* Number of chars to compare.  main defaults this to SIZE_MAX;
   -w/--check-chars sets it.  */
static size_t check_chars;

enum countmode
{
  count_occurrences,            /* -c Print count before output lines. */
  count_none                    /* Default.  Do not print counts. */
};

/* Whether and how to precede the output lines with a count of the number of
   times they occurred in the input. */
static enum countmode countmode;

/* Which lines to output: unique lines, the first of a group of
   repeated lines, and the second and subsequent lines of a group of
   repeated lines.  main starts with the first two true and the third
   false; -d clears output_unique, -u clears output_first_repeated, and
   -D clears output_unique and sets output_later_repeated.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;

/* If true, ignore case when comparing.  Set by -i/--ignore-case.  */
static bool ignore_case;
  86
/* How groups of duplicate lines are delimited when --all-repeated is
   in effect.  The delimiter written by check_file is a blank line.  */
enum delimit_method
{
  /* No delimiters output.  --all-repeated[=none] */
  DM_NONE,

  /* Delimiter precedes all groups.  --all-repeated=prepend */
  DM_PREPEND,

  /* Delimiter precedes every group except the first, i.e. it separates
     consecutive groups.  --all-repeated=separate */
  DM_SEPARATE
};

/* Accepted argument spellings for --all-repeated, NULL-terminated.
   Passed to XARGMATCH in main together with delimit_method_map;
   presumably the two arrays are indexed in parallel, so their order
   must be kept in sync.  */
static char const *const delimit_method_string[] =
{
  "none", "prepend", "separate", NULL
};

/* Values corresponding, position by position, to the names in
   delimit_method_string.  */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE, DM_PREPEND, DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.  */
static enum delimit_method delimit_groups;
 111
/* Long options accepted by getopt_long in main.  Each entry maps to the
   short-option character that main's switch handles; --all-repeated
   takes an optional delimit-method argument.  */
static struct option const longopts[] =
{
  {"count", no_argument, NULL, 'c'},
  {"repeated", no_argument, NULL, 'd'},
  {"all-repeated", optional_argument, NULL, 'D'},
  {"ignore-case", no_argument, NULL, 'i'},
  {"unique", no_argument, NULL, 'u'},
  {"skip-fields", required_argument, NULL, 'f'},
  {"skip-chars", required_argument, NULL, 's'},
  {"check-chars", required_argument, NULL, 'w'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {NULL, 0, NULL, 0}
};
 126
 127 void
 128 usage (int status)
 129 {
 130   if (status != EXIT_SUCCESS)
 131     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 132              program_name);
 133   else
 134     {
 135       printf (_("\
 136 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 137 "),
 138               program_name);
 139       fputs (_("\
 140 Discard all but one of successive identical lines from INPUT (or\n\
 141 standard input), writing to OUTPUT (or standard output).\n\
 142 \n\
 143 "), stdout);
 144      fputs (_("\
 145 Mandatory arguments to long options are mandatory for short options too.\n\
 146 "), stdout);
 147      fputs (_("\
 148   -c, --count           prefix lines by the number of occurrences\n\
 149   -d, --repeated        only print duplicate lines\n\
 150 "), stdout);
 151      fputs (_("\
 152   -D, --all-repeated[=delimit-method]  print all duplicate lines\n\
 153                         delimit-method={none(default),prepend,separate}\n\
 154                         Delimiting is done with blank lines.\n\
 155   -f, --skip-fields=N   avoid comparing the first N fields\n\
 156   -i, --ignore-case     ignore differences in case when comparing\n\
 157   -s, --skip-chars=N    avoid comparing the first N characters\n\
 158   -u, --unique          only print unique lines\n\
 159 "), stdout);
 160      fputs (_("\
 161   -w, --check-chars=N   compare no more than N characters in lines\n\
 162 "), stdout);
 163      fputs (HELP_OPTION_DESCRIPTION, stdout);
 164      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 165      fputs (_("\
 166 \n\
 167 A field is a run of whitespace, then non-whitespace characters.\n\
 168 Fields are skipped before chars.\n\
 169 "), stdout);
 170       printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 171     }
 172   exit (status);
 173 }
 174
 175 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
 176    invalid.  Silently convert too-large values to SIZE_MAX.  */
 177
 178 static size_t
 179 size_opt (char const *opt, char const *msgid)
 180 {
 181   unsigned long int size;
 182   verify (SIZE_MAX <= ULONG_MAX);
 183
 184   switch (xstrtoul (opt, NULL, 10, &size, ""))
 185     {
 186     case LONGINT_OK:
 187     case LONGINT_OVERFLOW:
 188       break;
 189
 190     default:
 191       error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 192     }
 193
 194   return MIN (size, SIZE_MAX);
 195 }
 196
 197 /* Given a linebuffer LINE,
 198    return a pointer to the beginning of the line's field to be compared. */
 199
 200 static char *
 201 find_field (const struct linebuffer *line)
 202 {
 203   size_t count;
 204   char *lp = line->buffer;
 205   size_t size = line->length - 1;
 206   size_t i = 0;
 207
 208   for (count = 0; count < skip_fields && i < size; count++)
 209     {
 210       while (i < size && isblank (lp[i]))
 211         i++;
 212       while (i < size && !isblank (lp[i]))
 213         i++;
 214     }
 215
 216   for (count = 0; count < skip_chars && i < size; count++)
 217     i++;
 218
 219   return lp + i;
 220 }
 221
 222 /* Return false if two strings OLD and NEW match, true if not.
 223    OLD and NEW point not to the beginnings of the lines
 224    but rather to the beginnings of the fields to compare.
 225    OLDLEN and NEWLEN are their lengths. */
 226
 227 static bool
 228 different (char *old, char *new, size_t oldlen, size_t newlen)
 229 {
 230   if (check_chars < oldlen)
 231     oldlen = check_chars;
 232   if (check_chars < newlen)
 233     newlen = check_chars;
 234
 235   if (ignore_case)
 236     {
 237       /* FIXME: This should invoke strcoll somehow.  */
 238       return oldlen != newlen || memcasecmp (old, new, oldlen);
 239     }
 240   else if (hard_LC_COLLATE)
 241     return xmemcoll (old, oldlen, new, newlen) != 0;
 242   else
 243     return oldlen != newlen || memcmp (old, new, oldlen);
 244 }
 245
 246 /* Output the line in linebuffer LINE to standard output
 247    provided that the switches say it should be output.
 248    MATCH is true if the line matches the previous line.
 249    If requested, print the number of times it occurred, as well;
 250    LINECOUNT + 1 is the number of times that the line occurred. */
 251
 252 static void
 253 writeline (struct linebuffer const *line,
 254            bool match, uintmax_t linecount)
 255 {
 256   if (! (linecount == 0 ? output_unique
 257          : !match ? output_first_repeated
 258          : output_later_repeated))
 259     return;
 260
 261   if (countmode == count_occurrences)
 262     printf ("%7" PRIuMAX " ", linecount + 1);
 263
 264   fwrite (line->buffer, sizeof (char), line->length, stdout);
 265 }
 266
 267 /* Process input file INFILE with output to OUTFILE.
 268    If either is "-", use the standard I/O stream for it instead. */
 269
 270 static void
 271 check_file (const char *infile, const char *outfile)
 272 {
 273   struct linebuffer lb1, lb2;
 274   struct linebuffer *thisline, *prevline;
 275
 276   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 277     error (EXIT_FAILURE, errno, "%s", infile);
 278   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 279     error (EXIT_FAILURE, errno, "%s", outfile);
 280
 281   thisline = &lb1;
 282   prevline = &lb2;
 283
 284   initbuffer (thisline);
 285   initbuffer (prevline);
 286
 287   /* The duplication in the following `if' and `else' blocks is an
 288      optimization to distinguish the common case (in which none of
 289      the following options has been specified: --count, -repeated,
 290      --all-repeated, --unique) from the others.  In the common case,
 291      this optimization lets uniq output each different line right away,
 292      without waiting to see if the next one is different.  */
 293
 294   if (output_unique && output_first_repeated && countmode == count_none)
 295     {
 296       char *prevfield IF_LINT (= NULL);
 297       size_t prevlen IF_LINT (= 0);
 298
 299       while (!feof (stdin))
 300         {
 301           char *thisfield;
 302           size_t thislen;
 303           if (readlinebuffer (thisline, stdin) == 0)
 304             break;
 305           thisfield = find_field (thisline);
 306           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 307           if (prevline->length == 0
 308               || different (thisfield, prevfield, thislen, prevlen))
 309             {
 310               fwrite (thisline->buffer, sizeof (char),
 311                       thisline->length, stdout);
 312
 313               SWAP_LINES (prevline, thisline);
 314               prevfield = thisfield;
 315               prevlen = thislen;
 316             }
 317         }
 318     }
 319   else
 320     {
 321       char *prevfield;
 322       size_t prevlen;
 323       uintmax_t match_count = 0;
 324       bool first_delimiter = true;
 325
 326       if (readlinebuffer (prevline, stdin) == 0)
 327         goto closefiles;
 328       prevfield = find_field (prevline);
 329       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
 330
 331       while (!feof (stdin))
 332         {
 333           bool match;
 334           char *thisfield;
 335           size_t thislen;
 336           if (readlinebuffer (thisline, stdin) == 0)
 337             {
 338               if (ferror (stdin))
 339                 goto closefiles;
 340               break;
 341             }
 342           thisfield = find_field (thisline);
 343           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 344           match = !different (thisfield, prevfield, thislen, prevlen);
 345           match_count += match;
 346
 347           if (match_count == UINTMAX_MAX)
 348             {
 349               if (count_occurrences)
 350                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
 351               match_count--;
 352             }
 353
 354           if (delimit_groups != DM_NONE)
 355             {
 356               if (!match)
 357                 {
 358                   if (match_count) /* a previous match */
 359                     first_delimiter = false; /* Only used when DM_SEPARATE */
 360                 }
 361               else if (match_count == 1)
 362                 {
 363                   if ((delimit_groups == DM_PREPEND)
 364                       || (delimit_groups == DM_SEPARATE
 365                           && !first_delimiter))
 366                     putchar ('\n');
 367                 }
 368             }
 369
 370           if (!match || output_later_repeated)
 371             {
 372               writeline (prevline, match, match_count);
 373               SWAP_LINES (prevline, thisline);
 374               prevfield = thisfield;
 375               prevlen = thislen;
 376               if (!match)
 377                 match_count = 0;
 378             }
 379         }
 380
 381       writeline (prevline, false, match_count);
 382     }
 383
 384  closefiles:
 385   if (ferror (stdin) || fclose (stdin) != 0)
 386     error (EXIT_FAILURE, 0, _("error reading %s"), infile);
 387
 388   /* stdout is handled via the atexit-invoked close_stdout function.  */
 389
 390   free (lb1.buffer);
 391   free (lb2.buffer);
 392 }
 393
/* Records how `skip_fields' was most recently specified, so that main
   can tell whether an obsolete digit option continues a number already
   being accumulated or must start a fresh one.  */
enum Skip_field_option_type
  {
    /* No skip-fields option has been seen.  */
    SFO_NONE,
    /* Last set by the obsolete -N digit options.  */
    SFO_OBSOLETE,
    /* Last set by -f N / --skip-fields=N.  */
    SFO_NEW
  };
 400
 401 int
 402 main (int argc, char **argv)
 403 {
 404   int optc = 0;
 405   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
 406   enum Skip_field_option_type skip_field_option_type = SFO_NONE;
 407   int nfiles = 0;
 408   char const *file[2];
 409
 410   file[0] = file[1] = "-";
 411   initialize_main (&argc, &argv);
 412   program_name = argv[0];
 413   setlocale (LC_ALL, "");
 414   bindtextdomain (PACKAGE, LOCALEDIR);
 415   textdomain (PACKAGE);
 416   hard_LC_COLLATE = hard_locale (LC_COLLATE);
 417
 418   atexit (close_stdout);
 419
 420   skip_chars = 0;
 421   skip_fields = 0;
 422   check_chars = SIZE_MAX;
 423   output_unique = output_first_repeated = true;
 424   output_later_repeated = false;
 425   countmode = count_none;
 426   delimit_groups = DM_NONE;
 427
 428   for (;;)
 429     {
 430       /* Parse an operand with leading "+" as a file after "--" was
 431          seen; or if pedantic and a file was seen; or if not
 432          obsolete.  */
 433
 434       if (optc == -1
 435           || (posixly_correct && nfiles != 0)
 436           || ((optc = getopt_long (argc, argv,
 437                                    "-0123456789Dcdf:is:uw:", longopts, NULL))
 438               == -1))
 439         {
 440           if (argc <= optind)
 441             break;
 442           if (nfiles == 2)
 443             {
 444               error (0, 0, _("extra operand %s"), quote (argv[optind]));
 445               usage (EXIT_FAILURE);
 446             }
 447           file[nfiles++] = argv[optind++];
 448         }
 449       else switch (optc)
 450         {
 451         case 1:
 452           {
 453             unsigned long int size;
 454             if (optarg[0] == '+'
 455                 && posix2_version () < 200112
 456                 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
 457                 && size <= SIZE_MAX)
 458               skip_chars = size;
 459             else if (nfiles == 2)
 460               {
 461                 error (0, 0, _("extra operand %s"), quote (optarg));
 462                 usage (EXIT_FAILURE);
 463               }
 464             else
 465               file[nfiles++] = optarg;
 466           }
 467           break;
 468
 469         case '0':
 470         case '1':
 471         case '2':
 472         case '3':
 473         case '4':
 474         case '5':
 475         case '6':
 476         case '7':
 477         case '8':
 478         case '9':
 479           {
 480             if (skip_field_option_type == SFO_NEW)
 481               skip_fields = 0;
 482
 483             if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
 484               skip_fields = SIZE_MAX;
 485
 486             skip_field_option_type = SFO_OBSOLETE;
 487           }
 488           break;
 489
 490         case 'c':
 491           countmode = count_occurrences;
 492           break;
 493
 494         case 'd':
 495           output_unique = false;
 496           break;
 497
 498         case 'D':
 499           output_unique = false;
 500           output_later_repeated = true;
 501           if (optarg == NULL)
 502             delimit_groups = DM_NONE;
 503           else
 504             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 505                                         delimit_method_string,
 506                                         delimit_method_map);
 507           break;
 508
 509         case 'f':
 510           skip_field_option_type = SFO_NEW;
 511           skip_fields = size_opt (optarg,
 512                                   N_("invalid number of fields to skip"));
 513           break;
 514
 515         case 'i':
 516           ignore_case = true;
 517           break;
 518
 519         case 's':
 520           skip_chars = size_opt (optarg,
 521                                  N_("invalid number of bytes to skip"));
 522           break;
 523
 524         case 'u':
 525           output_first_repeated = false;
 526           break;
 527
 528         case 'w':
 529           check_chars = size_opt (optarg,
 530                                   N_("invalid number of bytes to compare"));
 531           break;
 532
 533         case_GETOPT_HELP_CHAR;
 534
 535         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 536
 537         default:
 538           usage (EXIT_FAILURE);
 539         }
 540     }
 541
 542   if (countmode == count_occurrences && output_later_repeated)
 543     {
 544       error (0, 0,
 545            _("printing all duplicated lines and repeat counts is meaningless"));
 546       usage (EXIT_FAILURE);
 547     }
 548
 549   check_file (file[0], file[1]);
 550
 551   exit (EXIT_SUCCESS);
 552 }