src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 86, 91, 1995-2008 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Richard M. Stallman and David MacKenzie. */
  18 \f
  19 #include <config.h>
  20
  21 #include <stdio.h>
  22 #include <getopt.h>
  23 #include <sys/types.h>
  24
  25 #include "system.h"
  26 #include "argmatch.h"
  27 #include "linebuffer.h"
  28 #include "error.h"
  29 #include "hard-locale.h"
  30 #include "posixver.h"
  31 #include "quote.h"
  32 #include "xmemcoll.h"
  33 #include "xstrtol.h"
  34 #include "memcasecmp.h"
  35
  36 /* The official name of this program (e.g., no `g' prefix).  */
  37 #define PROGRAM_NAME "uniq"
  38
  39 #define AUTHORS \
  40   proper_name ("Richard M. Stallman"), \
  41   proper_name ("David MacKenzie")
  42
  43 #define SWAP_LINES(A, B)                        \
  44   do                                            \
  45     {                                           \
  46       struct linebuffer *_tmp;                  \
  47       _tmp = (A);                               \
  48       (A) = (B);                                \
  49       (B) = _tmp;                               \
  50     }                                           \
  51   while (0)
  52
  53 /* True if the LC_COLLATE locale is hard.  */
  54 static bool hard_LC_COLLATE;
  55
  56 /* Number of fields to skip on each line when doing comparisons. */
  57 static size_t skip_fields;
  58
  59 /* Number of chars to skip after skipping any fields. */
  60 static size_t skip_chars;
  61
  62 /* Number of chars to compare. */
  63 static size_t check_chars;
  64
  65 enum countmode
  66 {
  67   count_occurrences,            /* -c Print count before output lines. */
  68   count_none                    /* Default.  Do not print counts. */
  69 };
  70
  71 /* Whether and how to precede the output lines with a count of the number of
  72    times they occurred in the input. */
  73 static enum countmode countmode;
  74
  75 /* Which lines to output: unique lines, the first of a group of
  76    repeated lines, and the second and subsequented of a group of
  77    repeated lines.  */
  78 static bool output_unique;
  79 static bool output_first_repeated;
  80 static bool output_later_repeated;
  81
  82 /* If true, ignore case when comparing.  */
  83 static bool ignore_case;
  84
  85 enum delimit_method
  86 {
  87   /* No delimiters output.  --all-repeated[=none] */
  88   DM_NONE,
  89
  90   /* Delimiter precedes all groups.  --all-repeated=prepend */
  91   DM_PREPEND,
  92
  93   /* Delimit all groups.  --all-repeated=separate */
  94   DM_SEPARATE
  95 };
  96
  97 static char const *const delimit_method_string[] =
  98 {
  99   "none", "prepend", "separate", NULL
 100 };
 101
 102 static enum delimit_method const delimit_method_map[] =
 103 {
 104   DM_NONE, DM_PREPEND, DM_SEPARATE
 105 };
 106
 107 /* Select whether/how to delimit groups of duplicate lines.  */
 108 static enum delimit_method delimit_groups;
 109
 110 static struct option const longopts[] =
 111 {
 112   {"count", no_argument, NULL, 'c'},
 113   {"repeated", no_argument, NULL, 'd'},
 114   {"all-repeated", optional_argument, NULL, 'D'},
 115   {"ignore-case", no_argument, NULL, 'i'},
 116   {"unique", no_argument, NULL, 'u'},
 117   {"skip-fields", required_argument, NULL, 'f'},
 118   {"skip-chars", required_argument, NULL, 's'},
 119   {"check-chars", required_argument, NULL, 'w'},
 120   {"zero-terminated", no_argument, NULL, 'z'},
 121   {GETOPT_HELP_OPTION_DECL},
 122   {GETOPT_VERSION_OPTION_DECL},
 123   {NULL, 0, NULL, 0}
 124 };
 125
 126 void
 127 usage (int status)
 128 {
 129   if (status != EXIT_SUCCESS)
 130     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 131              program_name);
 132   else
 133     {
 134       printf (_("\
 135 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 136 "),
 137               program_name);
 138       fputs (_("\
 139 Discard all but one of successive identical lines from INPUT (or\n\
 140 standard input), writing to OUTPUT (or standard output).\n\
 141 \n\
 142 "), stdout);
 143      fputs (_("\
 144 Mandatory arguments to long options are mandatory for short options too.\n\
 145 "), stdout);
 146      fputs (_("\
 147   -c, --count           prefix lines by the number of occurrences\n\
 148   -d, --repeated        only print duplicate lines\n\
 149 "), stdout);
 150      fputs (_("\
 151   -D, --all-repeated[=delimit-method]  print all duplicate lines\n\
 152                         delimit-method={none(default),prepend,separate}\n\
 153                         Delimiting is done with blank lines.\n\
 154   -f, --skip-fields=N   avoid comparing the first N fields\n\
 155   -i, --ignore-case     ignore differences in case when comparing\n\
 156   -s, --skip-chars=N    avoid comparing the first N characters\n\
 157   -u, --unique          only print unique lines\n\
 158   -z, --zero-terminated  end lines with 0 byte, not newline\n\
 159 "), stdout);
 160      fputs (_("\
 161   -w, --check-chars=N   compare no more than N characters in lines\n\
 162 "), stdout);
 163      fputs (HELP_OPTION_DESCRIPTION, stdout);
 164      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 165      fputs (_("\
 166 \n\
 167 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
 168 characters.  Fields are skipped before chars.\n\
 169 "), stdout);
 170      fputs (_("\
 171 \n\
 172 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
 173 You may want to sort the input first, or use `sort -u' without `uniq'.\n\
 174 "), stdout);
 175       emit_bug_reporting_address ();
 176     }
 177   exit (status);
 178 }
 179
 180 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
 181    invalid.  Silently convert too-large values to SIZE_MAX.  */
 182
 183 static size_t
 184 size_opt (char const *opt, char const *msgid)
 185 {
 186   unsigned long int size;
 187   verify (SIZE_MAX <= ULONG_MAX);
 188
 189   switch (xstrtoul (opt, NULL, 10, &size, ""))
 190     {
 191     case LONGINT_OK:
 192     case LONGINT_OVERFLOW:
 193       break;
 194
 195     default:
 196       error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 197     }
 198
 199   return MIN (size, SIZE_MAX);
 200 }
 201
 202 /* Given a linebuffer LINE,
 203    return a pointer to the beginning of the line's field to be compared. */
 204
 205 static char *
 206 find_field (struct linebuffer const *line)
 207 {
 208   size_t count;
 209   char const *lp = line->buffer;
 210   size_t size = line->length - 1;
 211   size_t i = 0;
 212
 213   for (count = 0; count < skip_fields; count++)
 214     {
 215       while (i < size && isblank (to_uchar (lp[i])))
 216         i++;
 217       while (i < size && !isblank (to_uchar (lp[i])))
 218         i++;
 219     }
 220
 221   for (count = 0; count < skip_chars && i < size; count++)
 222     i++;
 223
 224   return line->buffer + i;
 225 }
 226
 227 /* Return false if two strings OLD and NEW match, true if not.
 228    OLD and NEW point not to the beginnings of the lines
 229    but rather to the beginnings of the fields to compare.
 230    OLDLEN and NEWLEN are their lengths. */
 231
 232 static bool
 233 different (char *old, char *new, size_t oldlen, size_t newlen)
 234 {
 235   if (check_chars < oldlen)
 236     oldlen = check_chars;
 237   if (check_chars < newlen)
 238     newlen = check_chars;
 239
 240   if (ignore_case)
 241     {
 242       /* FIXME: This should invoke strcoll somehow.  */
 243       return oldlen != newlen || memcasecmp (old, new, oldlen);
 244     }
 245   else if (hard_LC_COLLATE)
 246     return xmemcoll (old, oldlen, new, newlen) != 0;
 247   else
 248     return oldlen != newlen || memcmp (old, new, oldlen);
 249 }
 250
 251 /* Output the line in linebuffer LINE to standard output
 252    provided that the switches say it should be output.
 253    MATCH is true if the line matches the previous line.
 254    If requested, print the number of times it occurred, as well;
 255    LINECOUNT + 1 is the number of times that the line occurred. */
 256
 257 static void
 258 writeline (struct linebuffer const *line,
 259            bool match, uintmax_t linecount)
 260 {
 261   if (! (linecount == 0 ? output_unique
 262          : !match ? output_first_repeated
 263          : output_later_repeated))
 264     return;
 265
 266   if (countmode == count_occurrences)
 267     printf ("%7" PRIuMAX " ", linecount + 1);
 268
 269   fwrite (line->buffer, sizeof (char), line->length, stdout);
 270 }
 271
 272 /* Process input file INFILE with output to OUTFILE.
 273    If either is "-", use the standard I/O stream for it instead. */
 274
 275 static void
 276 check_file (const char *infile, const char *outfile, char delimiter)
 277 {
 278   struct linebuffer lb1, lb2;
 279   struct linebuffer *thisline, *prevline;
 280
 281   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 282     error (EXIT_FAILURE, errno, "%s", infile);
 283   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 284     error (EXIT_FAILURE, errno, "%s", outfile);
 285
 286   thisline = &lb1;
 287   prevline = &lb2;
 288
 289   initbuffer (thisline);
 290   initbuffer (prevline);
 291
 292   /* The duplication in the following `if' and `else' blocks is an
 293      optimization to distinguish the common case (in which none of
 294      the following options has been specified: --count, -repeated,
 295      --all-repeated, --unique) from the others.  In the common case,
 296      this optimization lets uniq output each different line right away,
 297      without waiting to see if the next one is different.  */
 298
 299   if (output_unique && output_first_repeated && countmode == count_none)
 300     {
 301       char *prevfield IF_LINT (= NULL);
 302       size_t prevlen IF_LINT (= 0);
 303
 304       while (!feof (stdin))
 305         {
 306           char *thisfield;
 307           size_t thislen;
 308           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 309             break;
 310           thisfield = find_field (thisline);
 311           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 312           if (prevline->length == 0
 313               || different (thisfield, prevfield, thislen, prevlen))
 314             {
 315               fwrite (thisline->buffer, sizeof (char),
 316                       thisline->length, stdout);
 317
 318               SWAP_LINES (prevline, thisline);
 319               prevfield = thisfield;
 320               prevlen = thislen;
 321             }
 322         }
 323     }
 324   else
 325     {
 326       char *prevfield;
 327       size_t prevlen;
 328       uintmax_t match_count = 0;
 329       bool first_delimiter = true;
 330
 331       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
 332         goto closefiles;
 333       prevfield = find_field (prevline);
 334       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
 335
 336       while (!feof (stdin))
 337         {
 338           bool match;
 339           char *thisfield;
 340           size_t thislen;
 341           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 342             {
 343               if (ferror (stdin))
 344                 goto closefiles;
 345               break;
 346             }
 347           thisfield = find_field (thisline);
 348           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 349           match = !different (thisfield, prevfield, thislen, prevlen);
 350           match_count += match;
 351
 352           if (match_count == UINTMAX_MAX)
 353             {
 354               if (count_occurrences)
 355                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
 356               match_count--;
 357             }
 358
 359           if (delimit_groups != DM_NONE)
 360             {
 361               if (!match)
 362                 {
 363                   if (match_count) /* a previous match */
 364                     first_delimiter = false; /* Only used when DM_SEPARATE */
 365                 }
 366               else if (match_count == 1)
 367                 {
 368                   if ((delimit_groups == DM_PREPEND)
 369                       || (delimit_groups == DM_SEPARATE
 370                           && !first_delimiter))
 371                     putchar (delimiter);
 372                 }
 373             }
 374
 375           if (!match || output_later_repeated)
 376             {
 377               writeline (prevline, match, match_count);
 378               SWAP_LINES (prevline, thisline);
 379               prevfield = thisfield;
 380               prevlen = thislen;
 381               if (!match)
 382                 match_count = 0;
 383             }
 384         }
 385
 386       writeline (prevline, false, match_count);
 387     }
 388
 389  closefiles:
 390   if (ferror (stdin) || fclose (stdin) != 0)
 391     error (EXIT_FAILURE, 0, _("error reading %s"), infile);
 392
 393   /* stdout is handled via the atexit-invoked close_stdout function.  */
 394
 395   free (lb1.buffer);
 396   free (lb2.buffer);
 397 }
 398
 399 enum Skip_field_option_type
 400   {
 401     SFO_NONE,
 402     SFO_OBSOLETE,
 403     SFO_NEW
 404   };
 405
 406 int
 407 main (int argc, char **argv)
 408 {
 409   int optc = 0;
 410   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
 411   enum Skip_field_option_type skip_field_option_type = SFO_NONE;
 412   int nfiles = 0;
 413   char const *file[2];
 414   char delimiter = '\n';        /* change with --zero-terminated, -z */
 415
 416   file[0] = file[1] = "-";
 417   initialize_main (&argc, &argv);
 418   set_program_name (argv[0]);
 419   setlocale (LC_ALL, "");
 420   bindtextdomain (PACKAGE, LOCALEDIR);
 421   textdomain (PACKAGE);
 422   hard_LC_COLLATE = hard_locale (LC_COLLATE);
 423
 424   atexit (close_stdout);
 425
 426   skip_chars = 0;
 427   skip_fields = 0;
 428   check_chars = SIZE_MAX;
 429   output_unique = output_first_repeated = true;
 430   output_later_repeated = false;
 431   countmode = count_none;
 432   delimit_groups = DM_NONE;
 433
 434   for (;;)
 435     {
 436       /* Parse an operand with leading "+" as a file after "--" was
 437          seen; or if pedantic and a file was seen; or if not
 438          obsolete.  */
 439
 440       if (optc == -1
 441           || (posixly_correct && nfiles != 0)
 442           || ((optc = getopt_long (argc, argv,
 443                                    "-0123456789Dcdf:is:uw:z", longopts, NULL))
 444               == -1))
 445         {
 446           if (argc <= optind)
 447             break;
 448           if (nfiles == 2)
 449             {
 450               error (0, 0, _("extra operand %s"), quote (argv[optind]));
 451               usage (EXIT_FAILURE);
 452             }
 453           file[nfiles++] = argv[optind++];
 454         }
 455       else switch (optc)
 456         {
 457         case 1:
 458           {
 459             unsigned long int size;
 460             if (optarg[0] == '+'
 461                 && posix2_version () < 200112
 462                 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
 463                 && size <= SIZE_MAX)
 464               skip_chars = size;
 465             else if (nfiles == 2)
 466               {
 467                 error (0, 0, _("extra operand %s"), quote (optarg));
 468                 usage (EXIT_FAILURE);
 469               }
 470             else
 471               file[nfiles++] = optarg;
 472           }
 473           break;
 474
 475         case '0':
 476         case '1':
 477         case '2':
 478         case '3':
 479         case '4':
 480         case '5':
 481         case '6':
 482         case '7':
 483         case '8':
 484         case '9':
 485           {
 486             if (skip_field_option_type == SFO_NEW)
 487               skip_fields = 0;
 488
 489             if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
 490               skip_fields = SIZE_MAX;
 491
 492             skip_field_option_type = SFO_OBSOLETE;
 493           }
 494           break;
 495
 496         case 'c':
 497           countmode = count_occurrences;
 498           break;
 499
 500         case 'd':
 501           output_unique = false;
 502           break;
 503
 504         case 'D':
 505           output_unique = false;
 506           output_later_repeated = true;
 507           if (optarg == NULL)
 508             delimit_groups = DM_NONE;
 509           else
 510             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 511                                         delimit_method_string,
 512                                         delimit_method_map);
 513           break;
 514
 515         case 'f':
 516           skip_field_option_type = SFO_NEW;
 517           skip_fields = size_opt (optarg,
 518                                   N_("invalid number of fields to skip"));
 519           break;
 520
 521         case 'i':
 522           ignore_case = true;
 523           break;
 524
 525         case 's':
 526           skip_chars = size_opt (optarg,
 527                                  N_("invalid number of bytes to skip"));
 528           break;
 529
 530         case 'u':
 531           output_first_repeated = false;
 532           break;
 533
 534         case 'w':
 535           check_chars = size_opt (optarg,
 536                                   N_("invalid number of bytes to compare"));
 537           break;
 538
 539         case 'z':
 540           delimiter = '\0';
 541           break;
 542
 543         case_GETOPT_HELP_CHAR;
 544
 545         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 546
 547         default:
 548           usage (EXIT_FAILURE);
 549         }
 550     }
 551
 552   if (countmode == count_occurrences && output_later_repeated)
 553     {
 554       error (0, 0,
 555            _("printing all duplicated lines and repeat counts is meaningless"));
 556       usage (EXIT_FAILURE);
 557     }
 558
 559   check_file (file[0], file[1], delimiter);
 560
 561   exit (EXIT_SUCCESS);
 562 }