src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 86, 91, 1995-2006 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  17
  18 /* Written by Richard Stallman and David MacKenzie. */
  19 \f
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <getopt.h>
  24 #include <sys/types.h>
  25
  26 #include "system.h"
  27 #include "argmatch.h"
  28 #include "linebuffer.h"
  29 #include "error.h"
  30 #include "hard-locale.h"
  31 #include "posixver.h"
  32 #include "quote.h"
  33 #include "xmemcoll.h"
  34 #include "xstrtol.h"
  35 #include "memcasecmp.h"
  36
/* The official name of this program (e.g., no `g' prefix).  */
#define PROGRAM_NAME "uniq"

/* Author names; main hands them, together with PROGRAM_NAME, to
   case_GETOPT_VERSION_CHAR.  */
#define AUTHORS "Richard Stallman", "David MacKenzie"
  41
  42 #define SWAP_LINES(A, B)                        \
  43   do                                            \
  44     {                                           \
  45       struct linebuffer *_tmp;                  \
  46       _tmp = (A);                               \
  47       (A) = (B);                                \
  48       (B) = _tmp;                               \
  49     }                                           \
  50   while (0)
  51
/* The name this program was run with (main sets it to argv[0]). */
char *program_name;

/* True if the LC_COLLATE locale is hard.  Set in main from
   hard_locale (LC_COLLATE); when true (and case is not being ignored)
   `different' compares with xmemcoll instead of memcmp.  */
static bool hard_LC_COLLATE;

/* Number of fields to skip on each line when doing comparisons.
   Set by -f N or by the obsolete -N digit options; defaults to 0. */
static size_t skip_fields;

/* Number of chars to skip after skipping any fields.
   Set by -s N or by the obsolete +N operand; defaults to 0. */
static size_t skip_chars;

/* Number of chars to compare.  Set by -w N; main's default is SIZE_MAX,
   i.e. effectively no limit. */
static size_t check_chars;
  66
  67 enum countmode
  68 {
  69   count_occurrences,            /* -c Print count before output lines. */
  70   count_none                    /* Default.  Do not print counts. */
  71 };
  72
  73 /* Whether and how to precede the output lines with a count of the number of
  74    times they occurred in the input. */
  75 static enum countmode countmode;
  76
/* Which lines to output: unique lines, the first of a group of
   repeated lines, and the second and subsequent lines of a group of
   repeated lines.  main starts with the first two true and the third
   false; -d and -D clear output_unique, -u clears
   output_first_repeated, and -D sets output_later_repeated.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;

/* If true, ignore case when comparing.  Set by -i.  */
static bool ignore_case;
  86
  87 enum delimit_method
  88 {
  89   /* No delimiters output.  --all-repeated[=none] */
  90   DM_NONE,
  91
  92   /* Delimiter precedes all groups.  --all-repeated=prepend */
  93   DM_PREPEND,
  94
  95   /* Delimit all groups.  --all-repeated=separate */
  96   DM_SEPARATE
  97 };
  98
  99 static char const *const delimit_method_string[] =
 100 {
 101   "none", "prepend", "separate", NULL
 102 };
 103
 104 static enum delimit_method const delimit_method_map[] =
 105 {
 106   DM_NONE, DM_PREPEND, DM_SEPARATE
 107 };
 108
 109 /* Select whether/how to delimit groups of duplicate lines.  */
 110 static enum delimit_method delimit_groups;
 111
/* Long options handed to getopt_long in main.  Each entry maps to the
   short-option character handled by main's switch; --all-repeated is
   the only one whose argument is optional.  The help and version
   entries come from the GETOPT_*_OPTION_DECL macros, which are defined
   outside this file.  */
static struct option const longopts[] =
{
  {"count", no_argument, NULL, 'c'},
  {"repeated", no_argument, NULL, 'd'},
  {"all-repeated", optional_argument, NULL, 'D'},
  {"ignore-case", no_argument, NULL, 'i'},
  {"unique", no_argument, NULL, 'u'},
  {"skip-fields", required_argument, NULL, 'f'},
  {"skip-chars", required_argument, NULL, 's'},
  {"check-chars", required_argument, NULL, 'w'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {NULL, 0, NULL, 0}
};
 126
 127 void
 128 usage (int status)
 129 {
 130   if (status != EXIT_SUCCESS)
 131     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 132              program_name);
 133   else
 134     {
 135       printf (_("\
 136 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 137 "),
 138               program_name);
 139       fputs (_("\
 140 Discard all but one of successive identical lines from INPUT (or\n\
 141 standard input), writing to OUTPUT (or standard output).\n\
 142 \n\
 143 "), stdout);
 144      fputs (_("\
 145 Mandatory arguments to long options are mandatory for short options too.\n\
 146 "), stdout);
 147      fputs (_("\
 148   -c, --count           prefix lines by the number of occurrences\n\
 149   -d, --repeated        only print duplicate lines\n\
 150 "), stdout);
 151      fputs (_("\
 152   -D, --all-repeated[=delimit-method]  print all duplicate lines\n\
 153                         delimit-method={none(default),prepend,separate}\n\
 154                         Delimiting is done with blank lines.\n\
 155   -f, --skip-fields=N   avoid comparing the first N fields\n\
 156   -i, --ignore-case     ignore differences in case when comparing\n\
 157   -s, --skip-chars=N    avoid comparing the first N characters\n\
 158   -u, --unique          only print unique lines\n\
 159 "), stdout);
 160      fputs (_("\
 161   -w, --check-chars=N   compare no more than N characters in lines\n\
 162 "), stdout);
 163      fputs (HELP_OPTION_DESCRIPTION, stdout);
 164      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 165      fputs (_("\
 166 \n\
 167 A field is a run of whitespace, then non-whitespace characters.\n\
 168 Fields are skipped before chars.\n\
 169 "), stdout);
 170       printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 171     }
 172   exit (status);
 173 }
 174
 175 /* Convert OPT to size_t, reporting an error using MSGID if it does
 176    not fit.  */
 177
 178 static size_t
 179 size_opt (char const *opt, char const *msgid)
 180 {
 181   unsigned long int size;
 182   if (xstrtoul (opt, NULL, 10, &size, "") != LONGINT_OK
 183       || SIZE_MAX < size)
 184     error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 185   return size;
 186 }
 187
 188 /* Given a linebuffer LINE,
 189    return a pointer to the beginning of the line's field to be compared. */
 190
 191 static char *
 192 find_field (const struct linebuffer *line)
 193 {
 194   size_t count;
 195   char *lp = line->buffer;
 196   size_t size = line->length - 1;
 197   size_t i = 0;
 198
 199   for (count = 0; count < skip_fields && i < size; count++)
 200     {
 201       while (i < size && isblank (lp[i]))
 202         i++;
 203       while (i < size && !isblank (lp[i]))
 204         i++;
 205     }
 206
 207   for (count = 0; count < skip_chars && i < size; count++)
 208     i++;
 209
 210   return lp + i;
 211 }
 212
 213 /* Return false if two strings OLD and NEW match, true if not.
 214    OLD and NEW point not to the beginnings of the lines
 215    but rather to the beginnings of the fields to compare.
 216    OLDLEN and NEWLEN are their lengths. */
 217
 218 static bool
 219 different (char *old, char *new, size_t oldlen, size_t newlen)
 220 {
 221   if (check_chars < oldlen)
 222     oldlen = check_chars;
 223   if (check_chars < newlen)
 224     newlen = check_chars;
 225
 226   if (ignore_case)
 227     {
 228       /* FIXME: This should invoke strcoll somehow.  */
 229       return oldlen != newlen || memcasecmp (old, new, oldlen);
 230     }
 231   else if (hard_LC_COLLATE)
 232     return xmemcoll (old, oldlen, new, newlen) != 0;
 233   else
 234     return oldlen != newlen || memcmp (old, new, oldlen);
 235 }
 236
 237 /* Output the line in linebuffer LINE to standard output
 238    provided that the switches say it should be output.
 239    MATCH is true if the line matches the previous line.
 240    If requested, print the number of times it occurred, as well;
 241    LINECOUNT + 1 is the number of times that the line occurred. */
 242
 243 static void
 244 writeline (struct linebuffer const *line,
 245            bool match, uintmax_t linecount)
 246 {
 247   if (! (linecount == 0 ? output_unique
 248          : !match ? output_first_repeated
 249          : output_later_repeated))
 250     return;
 251
 252   if (countmode == count_occurrences)
 253     printf ("%7" PRIuMAX " ", linecount + 1);
 254
 255   fwrite (line->buffer, sizeof (char), line->length, stdout);
 256 }
 257
 258 /* Process input file INFILE with output to OUTFILE.
 259    If either is "-", use the standard I/O stream for it instead. */
 260
 261 static void
 262 check_file (const char *infile, const char *outfile)
 263 {
 264   struct linebuffer lb1, lb2;
 265   struct linebuffer *thisline, *prevline;
 266
 267   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 268     error (EXIT_FAILURE, errno, "%s", infile);
 269   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 270     error (EXIT_FAILURE, errno, "%s", outfile);
 271
 272   thisline = &lb1;
 273   prevline = &lb2;
 274
 275   initbuffer (thisline);
 276   initbuffer (prevline);
 277
 278   /* The duplication in the following `if' and `else' blocks is an
 279      optimization to distinguish the common case (in which none of
 280      the following options has been specified: --count, -repeated,
 281      --all-repeated, --unique) from the others.  In the common case,
 282      this optimization lets uniq output each different line right away,
 283      without waiting to see if the next one is different.  */
 284
 285   if (output_unique && output_first_repeated && countmode == count_none)
 286     {
 287       char *prevfield IF_LINT (= NULL);
 288       size_t prevlen IF_LINT (= 0);
 289
 290       while (!feof (stdin))
 291         {
 292           char *thisfield;
 293           size_t thislen;
 294           if (readlinebuffer (thisline, stdin) == 0)
 295             break;
 296           thisfield = find_field (thisline);
 297           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 298           if (prevline->length == 0
 299               || different (thisfield, prevfield, thislen, prevlen))
 300             {
 301               fwrite (thisline->buffer, sizeof (char),
 302                       thisline->length, stdout);
 303
 304               SWAP_LINES (prevline, thisline);
 305               prevfield = thisfield;
 306               prevlen = thislen;
 307             }
 308         }
 309     }
 310   else
 311     {
 312       char *prevfield;
 313       size_t prevlen;
 314       uintmax_t match_count = 0;
 315       bool first_delimiter = true;
 316
 317       if (readlinebuffer (prevline, stdin) == 0)
 318         goto closefiles;
 319       prevfield = find_field (prevline);
 320       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
 321
 322       while (!feof (stdin))
 323         {
 324           bool match;
 325           char *thisfield;
 326           size_t thislen;
 327           if (readlinebuffer (thisline, stdin) == 0)
 328             {
 329               if (ferror (stdin))
 330                 goto closefiles;
 331               break;
 332             }
 333           thisfield = find_field (thisline);
 334           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 335           match = !different (thisfield, prevfield, thislen, prevlen);
 336           match_count += match;
 337
 338           if (match_count == UINTMAX_MAX)
 339             {
 340               if (count_occurrences)
 341                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
 342               match_count--;
 343             }
 344
 345           if (delimit_groups != DM_NONE)
 346             {
 347               if (!match)
 348                 {
 349                   if (match_count) /* a previous match */
 350                     first_delimiter = false; /* Only used when DM_SEPARATE */
 351                 }
 352               else if (match_count == 1)
 353                 {
 354                   if ((delimit_groups == DM_PREPEND)
 355                       || (delimit_groups == DM_SEPARATE
 356                           && !first_delimiter))
 357                     putchar ('\n');
 358                 }
 359             }
 360
 361           if (!match || output_later_repeated)
 362             {
 363               writeline (prevline, match, match_count);
 364               SWAP_LINES (prevline, thisline);
 365               prevfield = thisfield;
 366               prevlen = thislen;
 367               if (!match)
 368                 match_count = 0;
 369             }
 370         }
 371
 372       writeline (prevline, false, match_count);
 373     }
 374
 375  closefiles:
 376   if (ferror (stdin) || fclose (stdin) != 0)
 377     error (EXIT_FAILURE, 0, _("error reading %s"), infile);
 378
 379   /* stdout is handled via the atexit-invoked close_stdout function.  */
 380
 381   free (lb1.buffer);
 382   free (lb2.buffer);
 383 }
 384
/* Which syntax, if any, most recently supplied the skip-fields count
   while main parses the command line.  */
enum Skip_field_option_type
  {
    /* No skip-fields option seen yet (main's initial state).  */
    SFO_NONE,
    /* Last seen as obsolete digit options (-0 ... -9).  */
    SFO_OBSOLETE,
    /* Last seen as -f N / --skip-fields=N.  */
    SFO_NEW
  };
 391
 392 int
 393 main (int argc, char **argv)
 394 {
 395   int optc = 0;
 396   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
 397   enum Skip_field_option_type skip_field_option_type = SFO_NONE;
 398   int nfiles = 0;
 399   char const *file[2];
 400
 401   file[0] = file[1] = "-";
 402   initialize_main (&argc, &argv);
 403   program_name = argv[0];
 404   setlocale (LC_ALL, "");
 405   bindtextdomain (PACKAGE, LOCALEDIR);
 406   textdomain (PACKAGE);
 407   hard_LC_COLLATE = hard_locale (LC_COLLATE);
 408
 409   atexit (close_stdout);
 410
 411   skip_chars = 0;
 412   skip_fields = 0;
 413   check_chars = SIZE_MAX;
 414   output_unique = output_first_repeated = true;
 415   output_later_repeated = false;
 416   countmode = count_none;
 417   delimit_groups = DM_NONE;
 418
 419   for (;;)
 420     {
 421       /* Parse an operand with leading "+" as a file after "--" was
 422          seen; or if pedantic and a file was seen; or if not
 423          obsolete.  */
 424
 425       if (optc == -1
 426           || (posixly_correct && nfiles != 0)
 427           || ((optc = getopt_long (argc, argv,
 428                                    "-0123456789Dcdf:is:uw:", longopts, NULL))
 429               == -1))
 430         {
 431           if (argc <= optind)
 432             break;
 433           if (nfiles == 2)
 434             {
 435               error (0, 0, _("extra operand %s"), quote (argv[optind]));
 436               usage (EXIT_FAILURE);
 437             }
 438           file[nfiles++] = argv[optind++];
 439         }
 440       else switch (optc)
 441         {
 442         case 1:
 443           {
 444             unsigned long int size;
 445             if (optarg[0] == '+'
 446                 && posix2_version () < 200112
 447                 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
 448                 && size <= SIZE_MAX)
 449               skip_chars = size;
 450             else if (nfiles == 2)
 451               {
 452                 error (0, 0, _("extra operand %s"), quote (optarg));
 453                 usage (EXIT_FAILURE);
 454               }
 455             else
 456               file[nfiles++] = optarg;
 457           }
 458           break;
 459
 460         case '0':
 461         case '1':
 462         case '2':
 463         case '3':
 464         case '4':
 465         case '5':
 466         case '6':
 467         case '7':
 468         case '8':
 469         case '9':
 470           {
 471             if (skip_field_option_type == SFO_NEW)
 472               skip_fields = 0;
 473
 474             if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
 475               error (EXIT_FAILURE, 0, "%s",
 476                      _("invalid number of fields to skip"));
 477             skip_field_option_type = SFO_OBSOLETE;
 478           }
 479           break;
 480
 481         case 'c':
 482           countmode = count_occurrences;
 483           break;
 484
 485         case 'd':
 486           output_unique = false;
 487           break;
 488
 489         case 'D':
 490           output_unique = false;
 491           output_later_repeated = true;
 492           if (optarg == NULL)
 493             delimit_groups = DM_NONE;
 494           else
 495             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 496                                         delimit_method_string,
 497                                         delimit_method_map);
 498           break;
 499
 500         case 'f':
 501           skip_field_option_type = SFO_NEW;
 502           skip_fields = size_opt (optarg,
 503                                   N_("invalid number of fields to skip"));
 504           break;
 505
 506         case 'i':
 507           ignore_case = true;
 508           break;
 509
 510         case 's':
 511           skip_chars = size_opt (optarg,
 512                                  N_("invalid number of bytes to skip"));
 513           break;
 514
 515         case 'u':
 516           output_first_repeated = false;
 517           break;
 518
 519         case 'w':
 520           check_chars = size_opt (optarg,
 521                                   N_("invalid number of bytes to compare"));
 522           break;
 523
 524         case_GETOPT_HELP_CHAR;
 525
 526         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 527
 528         default:
 529           usage (EXIT_FAILURE);
 530         }
 531     }
 532
 533   if (countmode == count_occurrences && output_later_repeated)
 534     {
 535       error (0, 0,
 536            _("printing all duplicated lines and repeat counts is meaningless"));
 537       usage (EXIT_FAILURE);
 538     }
 539
 540   check_file (file[0], file[1]);
 541
 542   exit (EXIT_SUCCESS);
 543 }