src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 86, 91, 1995-2001, Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Richard Stallman and David MacKenzie. */
  19 \f
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <getopt.h>
  24 #include <sys/types.h>
  25
  26 #include "system.h"
  27 #include "closeout.h"
  28 #include "argmatch.h"
  29 #include "linebuffer.h"
  30 #include "error.h"
  31 #include "xstrtol.h"
  32 #include "memcasecmp.h"
  33
  34 /* The official name of this program (e.g., no `g' prefix).  */
  35 #define PROGRAM_NAME "uniq"
  36
  37 #define AUTHORS N_ ("Richard Stallman and David MacKenzie")
  38
  39 #define SWAP_LINES(A, B)                        \
  40   do                                            \
  41     {                                           \
  42       struct linebuffer *_tmp;                  \
  43       _tmp = (A);                               \
  44       (A) = (B);                                \
  45       (B) = _tmp;                               \
  46     }                                           \
  47   while (0)
  48
/* The name this program was run with.  main sets it from argv[0];
   usage prints it in the help and "Try ... --help" messages. */
char *program_name;

/* Number of fields to skip on each line when doing comparisons.
   Set by -f N / --skip-fields=N or the obsolescent -N form;
   main resets it to 0 before parsing options. */
static size_t skip_fields;

/* Number of chars to skip after skipping any fields.
   Set by -s N / --skip-chars=N or the obsolescent +N form;
   main resets it to 0 before parsing options. */
static size_t skip_chars;

/* Number of chars to compare.  Set by -w N / --check-chars=N;
   main initializes it to SIZE_MAX, i.e. no limit. */
static size_t check_chars;
  60
/* The possible settings for prefixing output lines with a repeat count. */
enum countmode
{
  count_occurrences,            /* -c Print count before output lines. */
  count_none                    /* Default.  Do not print counts. */
};

/* Whether and how to precede the output lines with a count of the number of
   times they occurred in the input.  main initializes this to count_none;
   the -c option switches it to count_occurrences. */
static enum countmode countmode;
  70
/* The possible selections of which input lines get written out. */
enum output_mode
{
  output_repeated,              /* -d Only lines that are repeated. */
  output_all_repeated,          /* -D All lines that are repeated. */
  output_unique,                /* -u Only lines that are not repeated. */
  output_all                    /* Default.  Print first copy of each line. */
};

/* Which lines to output.  main initializes this to output_all; the
   -d, -D and -u options each overwrite it, so the last one given wins. */
static enum output_mode mode;
  81
/* If nonzero, ignore case when comparing.  Set to 1 by -i / --ignore-case;
   `different' then compares with memcasecmp instead of memcmp.  */
static int ignore_case;
  84
/* How groups of duplicate lines are delimited in --all-repeated output. */
enum delimit_method
{
  /* No delimiters output.  --all-repeated[=none] */
  DM_NONE,

  /* Delimiter precedes all groups.  --all-repeated=prepend */
  DM_PREPEND,

  /* Delimiter precedes every group except the first one output.
     --all-repeated=separate */
  DM_SEPARATE
};

/* Names accepted as the argument of --all-repeated, ending with a
   0 sentinel.  Passed to XARGMATCH in main. */
static char const *const delimit_method_string[] =
{
  "none", "prepend", "separate", 0
};

/* The enum values for the names above, listed in the same order as
   delimit_method_string.  Passed to XARGMATCH in main alongside it. */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE, DM_PREPEND, DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.
   main initializes this to DM_NONE; only -D / --all-repeated changes it. */
static enum delimit_method delimit_groups;
 109
/* Long option table handed to getopt_long in main.  Each entry's last
   member is the short option character that main's switch handles, so
   long and short spellings share one code path.  Only --all-repeated
   takes an optional argument.  The all-zero entry terminates the table. */
static struct option const longopts[] =
{
  {"count", no_argument, NULL, 'c'},
  {"repeated", no_argument, NULL, 'd'},
  {"all-repeated", optional_argument, NULL, 'D'},
  {"ignore-case", no_argument, NULL, 'i'},
  {"unique", no_argument, NULL, 'u'},
  {"skip-fields", required_argument, NULL, 'f'},
  {"skip-chars", required_argument, NULL, 's'},
  {"check-chars", required_argument, NULL, 'w'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {NULL, 0, NULL, 0}
};
 124
 125 void
 126 usage (int status)
 127 {
 128   if (status != 0)
 129     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 130              program_name);
 131   else
 132     {
 133       printf (_("\
 134 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 135 "),
 136               program_name);
 137       printf (_("\
 138 Discard all but one of successive identical lines from INPUT (or\n\
 139 standard input), writing to OUTPUT (or standard output).\n\
 140 \n\
 141 Mandatory arguments to long options are mandatory for short options too.\n\
 142   -c, --count           prefix lines by the number of occurrences\n\
 143   -d, --repeated        only print duplicate lines\n\
 144   -D, --all-repeated[=delimit-method] print all duplicate lines\n\
 145                         delimit-method={none(default),prepend,separate)}\n\
 146                         Delimiting is done with blank lines.\n\
 147   -f, --skip-fields=N   avoid comparing the first N fields\n\
 148   -i, --ignore-case     ignore differences in case when comparing\n\
 149   -s, --skip-chars=N    avoid comparing the first N characters\n\
 150   -u, --unique          only print unique lines\n\
 151   -w, --check-chars=N   compare no more than N characters in lines\n\
 152   -N                    same as -f N\n\
 153   +N                    same as -s N (obsolescent; will be withdrawn)\n\
 154       --help            display this help and exit\n\
 155       --version         output version information and exit\n\
 156 \n\
 157 A field is a run of whitespace, then non-whitespace characters.\n\
 158 Fields are skipped before chars.\n\
 159 "));
 160       puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
 161     }
 162   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 163 }
 164
 165 /* Convert OPT to size_t, reporting an error using MSGID if it does
 166    not fit.  */
 167
 168 static size_t
 169 size_opt (char const *opt, char const *msgid)
 170 {
 171   unsigned long int size;
 172   if (xstrtoul (opt, NULL, 10, &size, "") != LONGINT_OK
 173       || SIZE_MAX < size)
 174     error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 175   return size;
 176 }
 177
 178 /* Given a linebuffer LINE,
 179    return a pointer to the beginning of the line's field to be compared. */
 180
 181 static char *
 182 find_field (const struct linebuffer *line)
 183 {
 184   register size_t count;
 185   register char *lp = line->buffer;
 186   register size_t size = line->length - 1;
 187   register size_t i = 0;
 188
 189   for (count = 0; count < skip_fields && i < size; count++)
 190     {
 191       while (i < size && ISBLANK (lp[i]))
 192         i++;
 193       while (i < size && !ISBLANK (lp[i]))
 194         i++;
 195     }
 196
 197   for (count = 0; count < skip_chars && i < size; count++)
 198     i++;
 199
 200   return lp + i;
 201 }
 202
 203 /* Return zero if two strings OLD and NEW match, nonzero if not.
 204    OLD and NEW point not to the beginnings of the lines
 205    but rather to the beginnings of the fields to compare.
 206    OLDLEN and NEWLEN are their lengths. */
 207
 208 static int
 209 different (const char *old, const char *new, size_t oldlen, size_t newlen)
 210 {
 211   if (check_chars < oldlen)
 212     oldlen = check_chars;
 213   if (check_chars < newlen)
 214     newlen = check_chars;
 215
 216   if (oldlen != newlen)
 217     return 1;
 218
 219   /* Use an if-statement here rather than a function variable to
 220      avoid portability hassles of getting a non-conflicting declaration
 221      of memcmp.  */
 222   if (ignore_case)
 223     return memcasecmp (old, new, oldlen);
 224   else
 225     return memcmp (old, new, oldlen);
 226 }
 227
 228 /* Output the line in linebuffer LINE to stream STREAM
 229    provided that the switches say it should be output.
 230    If requested, print the number of times it occurred, as well;
 231    LINECOUNT + 1 is the number of times that the line occurred. */
 232
 233 static void
 234 writeline (const struct linebuffer *line, FILE *stream, int linecount)
 235 {
 236   if ((mode == output_unique && linecount != 0)
 237       || (mode == output_repeated && linecount == 0)
 238       || (mode == output_all_repeated && linecount == 0))
 239     return;
 240
 241   if (countmode == count_occurrences)
 242     fprintf (stream, "%7d\t", linecount + 1);
 243
 244   fwrite (line->buffer, sizeof (char), line->length, stream);
 245 }
 246
 247 /* Process input file INFILE with output to OUTFILE.
 248    If either is "-", use the standard I/O stream for it instead. */
 249
 250 static void
 251 check_file (const char *infile, const char *outfile)
 252 {
 253   FILE *istream;
 254   FILE *ostream;
 255   struct linebuffer lb1, lb2;
 256   struct linebuffer *thisline, *prevline;
 257
 258   if (STREQ (infile, "-"))
 259     istream = stdin;
 260   else
 261     istream = fopen (infile, "r");
 262   if (istream == NULL)
 263     error (EXIT_FAILURE, errno, "%s", infile);
 264
 265   if (STREQ (outfile, "-"))
 266     ostream = stdout;
 267   else
 268     ostream = fopen (outfile, "w");
 269   if (ostream == NULL)
 270     error (EXIT_FAILURE, errno, "%s", outfile);
 271
 272   thisline = &lb1;
 273   prevline = &lb2;
 274
 275   initbuffer (thisline);
 276   initbuffer (prevline);
 277
 278   /* The duplication in the following `if' and `else' blocks is an
 279      optimization to distinguish the common case (in which none of
 280      the following options has been specified: --count, -repeated,
 281      --all-repeated, --unique) from the others.  In the common case,
 282      this optimization lets uniq output each different line right away,
 283      without waiting to see if the next one is different.  */
 284
 285   if (mode == output_all && countmode == count_none)
 286     {
 287       char *prevfield IF_LINT (= NULL);
 288       size_t prevlen IF_LINT (= 0);
 289
 290       while (!feof (istream))
 291         {
 292           char *thisfield;
 293           size_t thislen;
 294           if (readline (thisline, istream) == 0)
 295             break;
 296           thisfield = find_field (thisline);
 297           thislen = thisline->length - (thisfield - thisline->buffer);
 298           if (prevline->length == 0
 299               || different (thisfield, prevfield, thislen, prevlen))
 300             {
 301               fwrite (thisline->buffer, sizeof (char),
 302                       thisline->length, ostream);
 303
 304               SWAP_LINES (prevline, thisline);
 305               prevfield = thisfield;
 306               prevlen = thislen;
 307             }
 308         }
 309     }
 310   else
 311     {
 312       char *prevfield;
 313       size_t prevlen;
 314       int match_count = 0;
 315       int first_delimiter = 1;
 316
 317       if (readline (prevline, istream) == 0)
 318         goto closefiles;
 319       prevfield = find_field (prevline);
 320       prevlen = prevline->length - (prevfield - prevline->buffer);
 321
 322       while (!feof (istream))
 323         {
 324           int match;
 325           char *thisfield;
 326           size_t thislen;
 327           if (readline (thisline, istream) == 0)
 328             break;
 329           thisfield = find_field (thisline);
 330           thislen = thisline->length - (thisfield - thisline->buffer);
 331           match = !different (thisfield, prevfield, thislen, prevlen);
 332
 333           if (match)
 334             ++match_count;
 335
 336           if (mode == output_all_repeated && delimit_groups != DM_NONE)
 337             {
 338               if (!match)
 339                 {
 340                   if (match_count) /* a previous match */
 341                     first_delimiter = 0; /* Only used when DM_SEPARATE */
 342                 }
 343               else if (match_count == 1)
 344                 {
 345                   if ((delimit_groups == DM_PREPEND)
 346                       || (delimit_groups == DM_SEPARATE
 347                           && !first_delimiter))
 348                     putc ('\n', ostream);
 349                 }
 350             }
 351
 352           if (!match || mode == output_all_repeated)
 353             {
 354               writeline (prevline, ostream, match_count);
 355               SWAP_LINES (prevline, thisline);
 356               prevfield = thisfield;
 357               prevlen = thislen;
 358               if (!match)
 359                 match_count = 0;
 360             }
 361         }
 362
 363       writeline (prevline, ostream, match_count);
 364     }
 365
 366  closefiles:
 367   if (ferror (istream) || fclose (istream) == EOF)
 368     error (EXIT_FAILURE, errno, _("error reading %s"), infile);
 369
 370   /* Close ostream only if it's not stdout -- the latter is closed
 371      via the atexit-invoked close_stdout.  */
 372   if (ostream != stdout && (ferror (ostream) || fclose (ostream) == EOF))
 373     error (EXIT_FAILURE, errno, _("error writing %s"), outfile);
 374
 375   free (lb1.buffer);
 376   free (lb2.buffer);
 377 }
 378
 379 int
 380 main (int argc, char **argv)
 381 {
 382   int optc = 0;
 383   int posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
 384   int nfiles = 0;
 385   char const *file[2];
 386
 387   file[0] = file[1] = "-";
 388   program_name = argv[0];
 389   setlocale (LC_ALL, "");
 390   bindtextdomain (PACKAGE, LOCALEDIR);
 391   textdomain (PACKAGE);
 392
 393   atexit (close_stdout);
 394
 395   skip_chars = 0;
 396   skip_fields = 0;
 397   check_chars = SIZE_MAX;
 398   mode = output_all;
 399   countmode = count_none;
 400   delimit_groups = DM_NONE;
 401
 402   for (;;)
 403     {
 404       /* Parse an operand with leading "+" as a file after "--" was
 405          seen; or if pedantic and a file was seen.  POSIX 1003.1-200x
 406          d7 removes support for such operands, so when it becomes
 407          official the code will need to be changed.  */
 408
 409       if (optc == -1
 410           || (posixly_correct && nfiles != 0)
 411           || ((optc = getopt_long (argc, argv,
 412                                    "-0123456789cdDf:is:uw:", longopts, NULL))
 413               == -1))
 414         {
 415           if (optind == argc)
 416             break;
 417           if (nfiles == 2)
 418             {
 419               error (0, 0, _("extra operand `%s'"), argv[optind]);
 420               usage (1);
 421             }
 422           file[nfiles++] = argv[optind++];
 423         }
 424       else switch (optc)
 425         {
 426         case 1:
 427           {
 428             unsigned long int size;
 429             if (optarg[0] == '+'
 430                 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
 431                 && size <= SIZE_MAX)
 432               skip_chars = size;
 433             else if (nfiles == 2)
 434               {
 435                 error (0, 0, _("extra operand `%s'"), optarg);
 436                 usage (1);
 437               }
 438             else
 439               file[nfiles++] = optarg;
 440           }
 441           break;
 442
 443         case '0':
 444         case '1':
 445         case '2':
 446         case '3':
 447         case '4':
 448         case '5':
 449         case '6':
 450         case '7':
 451         case '8':
 452         case '9':
 453           {
 454             size_t s = skip_fields;
 455             skip_fields = s * 10 + optc - '0';
 456             if (SIZE_MAX / 10 < s || skip_fields < s)
 457               error (EXIT_FAILURE, 0, "%s",
 458                      _("invalid number of fields to skip"));
 459           }
 460           break;
 461
 462         case 'c':
 463           countmode = count_occurrences;
 464           break;
 465
 466         case 'd':
 467           mode = output_repeated;
 468           break;
 469
 470         case 'D':
 471           mode = output_all_repeated;
 472           if (optarg == NULL)
 473             delimit_groups = DM_NONE;
 474           else
 475             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 476                                         delimit_method_string,
 477                                         delimit_method_map);
 478           break;
 479
 480         case 'f':               /* Like '-#'. */
 481           skip_fields = size_opt (optarg,
 482                                   N_("invalid number of fields to skip"));
 483           break;
 484
 485         case 'i':
 486           ignore_case = 1;
 487           break;
 488
 489         case 's':               /* Like '+#'. */
 490           skip_chars = size_opt (optarg,
 491                                  N_("invalid number of bytes to skip"));
 492           break;
 493
 494         case 'u':
 495           mode = output_unique;
 496           break;
 497
 498         case 'w':
 499           check_chars = size_opt (optarg,
 500                                   N_("invalid number of bytes to compare"));
 501           break;
 502
 503         case_GETOPT_HELP_CHAR;
 504
 505         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 506
 507         default:
 508           usage (1);
 509         }
 510     }
 511
 512   if (countmode == count_occurrences && mode == output_all_repeated)
 513     {
 514       error (0, 0,
 515            _("printing all duplicated lines and repeat counts is meaningless"));
 516       usage (1);
 517     }
 518
 519   check_file (file[0], file[1]);
 520
 521   exit (EXIT_SUCCESS);
 522 }