src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 86, 91, 1995-2001, Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Richard Stallman and David MacKenzie. */
  19 \f
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <getopt.h>
  24 #include <sys/types.h>
  25
  26 #include "system.h"
  27 #include "closeout.h"
  28 #include "argmatch.h"
  29 #include "linebuffer.h"
  30 #include "error.h"
  31 #include "xstrtol.h"
  32 #include "memcasecmp.h"
  33
  34 /* The official name of this program (e.g., no `g' prefix).  */
  35 #define PROGRAM_NAME "uniq"
  36
  37 #define AUTHORS N_ ("Richard Stallman and David MacKenzie")
  38
  39 #define SWAP_LINES(A, B)                        \
  40   do                                            \
  41     {                                           \
  42       struct linebuffer *_tmp;                  \
  43       _tmp = (A);                               \
  44       (A) = (B);                                \
  45       (B) = _tmp;                               \
  46     }                                           \
  47   while (0)
  48
  49 /* The name this program was run with. */
  50 char *program_name;
  51
  52 /* Number of fields to skip on each line when doing comparisons. */
  53 static size_t skip_fields;
  54
  55 /* Number of chars to skip after skipping any fields. */
  56 static size_t skip_chars;
  57
  58 /* Number of chars to compare. */
  59 static size_t check_chars;
  60
  61 enum countmode
  62 {
  63   count_occurrences,            /* -c Print count before output lines. */
  64   count_none                    /* Default.  Do not print counts. */
  65 };
  66
  67 /* Whether and how to precede the output lines with a count of the number of
  68    times they occurred in the input. */
  69 static enum countmode countmode;
  70
  71 enum output_mode
  72 {
  73   output_repeated,              /* -d Only lines that are repeated. */
  74   output_all_repeated,          /* -D All lines that are repeated. */
  75   output_unique,                /* -u Only lines that are not repeated. */
  76   output_all                    /* Default.  Print first copy of each line. */
  77 };
  78
  79 /* Which lines to output. */
  80 static enum output_mode mode;
  81
  82 /* If nonzero, ignore case when comparing.  */
  83 static int ignore_case;
  84
  85 enum delimit_method
  86 {
  87   /* No delimiters output.  --all-repeated[=none] */
  88   DM_NONE,
  89
  90   /* Delimiter precedes all groups.  --all-repeated=prepend */
  91   DM_PREPEND,
  92
  93   /* Delimit all groups.  --all-repeated=separate */
  94   DM_SEPARATE
  95 };
  96
  97 static char const *const delimit_method_string[] =
  98 {
  99   "none", "prepend", "separate", 0
 100 };
 101
 102 static enum delimit_method const delimit_method_map[] =
 103 {
 104   DM_NONE, DM_PREPEND, DM_SEPARATE
 105 };
 106
 107 /* Select whether/how to delimit groups of duplicate lines.  */
 108 static enum delimit_method delimit_groups;
 109
 110 static struct option const longopts[] =
 111 {
 112   {"count", no_argument, NULL, 'c'},
 113   {"repeated", no_argument, NULL, 'd'},
 114   {"all-repeated", optional_argument, NULL, 'D'},
 115   {"ignore-case", no_argument, NULL, 'i'},
 116   {"unique", no_argument, NULL, 'u'},
 117   {"skip-fields", required_argument, NULL, 'f'},
 118   {"skip-chars", required_argument, NULL, 's'},
 119   {"check-chars", required_argument, NULL, 'w'},
 120   {GETOPT_HELP_OPTION_DECL},
 121   {GETOPT_VERSION_OPTION_DECL},
 122   {NULL, 0, NULL, 0}
 123 };
 124
 125 void
 126 usage (int status)
 127 {
 128   if (status != 0)
 129     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 130              program_name);
 131   else
 132     {
 133       printf (_("\
 134 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 135 "),
 136               program_name);
 137       fputs (_("\
 138 Discard all but one of successive identical lines from INPUT (or\n\
 139 standard input), writing to OUTPUT (or standard output).\n\
 140 \n\
 141 Mandatory arguments to long options are mandatory for short options too.\n\
 142   -c, --count           prefix lines by the number of occurrences\n\
 143   -d, --repeated        only print duplicate lines\n\
 144 "), stdout);
 145      fputs (_("\
 146   -D, --all-repeated[=delimit-method] print all duplicate lines\n\
 147                         delimit-method={none(default),prepend,separate)}\n\
 148                         Delimiting is done with blank lines.\n\
 149   -f, --skip-fields=N   avoid comparing the first N fields\n\
 150   -i, --ignore-case     ignore differences in case when comparing\n\
 151   -s, --skip-chars=N    avoid comparing the first N characters\n\
 152   -u, --unique          only print unique lines\n\
 153 "), stdout);
 154      fputs (_("\
 155   -w, --check-chars=N   compare no more than N characters in lines\n\
 156   -N                    same as -f N\n\
 157   +N                    same as -s N (obsolescent; will be withdrawn)\n\
 158       --help            display this help and exit\n\
 159       --version         output version information and exit\n\
 160 \n\
 161 A field is a run of whitespace, then non-whitespace characters.\n\
 162 Fields are skipped before chars.\n\
 163 "), stdout);
 164       puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
 165     }
 166   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 167 }
 168
 169 /* Convert OPT to size_t, reporting an error using MSGID if it does
 170    not fit.  */
 171
 172 static size_t
 173 size_opt (char const *opt, char const *msgid)
 174 {
 175   unsigned long int size;
 176   if (xstrtoul (opt, NULL, 10, &size, "") != LONGINT_OK
 177       || SIZE_MAX < size)
 178     error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 179   return size;
 180 }
 181
 182 /* Given a linebuffer LINE,
 183    return a pointer to the beginning of the line's field to be compared. */
 184
 185 static char *
 186 find_field (const struct linebuffer *line)
 187 {
 188   register size_t count;
 189   register char *lp = line->buffer;
 190   register size_t size = line->length - 1;
 191   register size_t i = 0;
 192
 193   for (count = 0; count < skip_fields && i < size; count++)
 194     {
 195       while (i < size && ISBLANK (lp[i]))
 196         i++;
 197       while (i < size && !ISBLANK (lp[i]))
 198         i++;
 199     }
 200
 201   for (count = 0; count < skip_chars && i < size; count++)
 202     i++;
 203
 204   return lp + i;
 205 }
 206
 207 /* Return zero if two strings OLD and NEW match, nonzero if not.
 208    OLD and NEW point not to the beginnings of the lines
 209    but rather to the beginnings of the fields to compare.
 210    OLDLEN and NEWLEN are their lengths. */
 211
 212 static int
 213 different (const char *old, const char *new, size_t oldlen, size_t newlen)
 214 {
 215   if (check_chars < oldlen)
 216     oldlen = check_chars;
 217   if (check_chars < newlen)
 218     newlen = check_chars;
 219
 220   if (oldlen != newlen)
 221     return 1;
 222
 223   /* Use an if-statement here rather than a function variable to
 224      avoid portability hassles of getting a non-conflicting declaration
 225      of memcmp.  */
 226   if (ignore_case)
 227     return memcasecmp (old, new, oldlen);
 228   else
 229     return memcmp (old, new, oldlen);
 230 }
 231
 232 /* Output the line in linebuffer LINE to stream STREAM
 233    provided that the switches say it should be output.
 234    If requested, print the number of times it occurred, as well;
 235    LINECOUNT + 1 is the number of times that the line occurred. */
 236
 237 static void
 238 writeline (const struct linebuffer *line, FILE *stream, int linecount)
 239 {
 240   if ((mode == output_unique && linecount != 0)
 241       || (mode == output_repeated && linecount == 0)
 242       || (mode == output_all_repeated && linecount == 0))
 243     return;
 244
 245   if (countmode == count_occurrences)
 246     fprintf (stream, "%7d\t", linecount + 1);
 247
 248   fwrite (line->buffer, sizeof (char), line->length, stream);
 249 }
 250
 251 /* Process input file INFILE with output to OUTFILE.
 252    If either is "-", use the standard I/O stream for it instead. */
 253
 254 static void
 255 check_file (const char *infile, const char *outfile)
 256 {
 257   FILE *istream;
 258   FILE *ostream;
 259   struct linebuffer lb1, lb2;
 260   struct linebuffer *thisline, *prevline;
 261
 262   if (STREQ (infile, "-"))
 263     istream = stdin;
 264   else
 265     istream = fopen (infile, "r");
 266   if (istream == NULL)
 267     error (EXIT_FAILURE, errno, "%s", infile);
 268
 269   if (STREQ (outfile, "-"))
 270     ostream = stdout;
 271   else
 272     ostream = fopen (outfile, "w");
 273   if (ostream == NULL)
 274     error (EXIT_FAILURE, errno, "%s", outfile);
 275
 276   thisline = &lb1;
 277   prevline = &lb2;
 278
 279   initbuffer (thisline);
 280   initbuffer (prevline);
 281
 282   /* The duplication in the following `if' and `else' blocks is an
 283      optimization to distinguish the common case (in which none of
 284      the following options has been specified: --count, -repeated,
 285      --all-repeated, --unique) from the others.  In the common case,
 286      this optimization lets uniq output each different line right away,
 287      without waiting to see if the next one is different.  */
 288
 289   if (mode == output_all && countmode == count_none)
 290     {
 291       char *prevfield IF_LINT (= NULL);
 292       size_t prevlen IF_LINT (= 0);
 293
 294       while (!feof (istream))
 295         {
 296           char *thisfield;
 297           size_t thislen;
 298           if (readline (thisline, istream) == 0)
 299             break;
 300           thisfield = find_field (thisline);
 301           thislen = thisline->length - (thisfield - thisline->buffer);
 302           if (prevline->length == 0
 303               || different (thisfield, prevfield, thislen, prevlen))
 304             {
 305               fwrite (thisline->buffer, sizeof (char),
 306                       thisline->length, ostream);
 307
 308               SWAP_LINES (prevline, thisline);
 309               prevfield = thisfield;
 310               prevlen = thislen;
 311             }
 312         }
 313     }
 314   else
 315     {
 316       char *prevfield;
 317       size_t prevlen;
 318       int match_count = 0;
 319       int first_delimiter = 1;
 320
 321       if (readline (prevline, istream) == 0)
 322         goto closefiles;
 323       prevfield = find_field (prevline);
 324       prevlen = prevline->length - (prevfield - prevline->buffer);
 325
 326       while (!feof (istream))
 327         {
 328           int match;
 329           char *thisfield;
 330           size_t thislen;
 331           if (readline (thisline, istream) == 0)
 332             break;
 333           thisfield = find_field (thisline);
 334           thislen = thisline->length - (thisfield - thisline->buffer);
 335           match = !different (thisfield, prevfield, thislen, prevlen);
 336
 337           if (match)
 338             ++match_count;
 339
 340           if (mode == output_all_repeated && delimit_groups != DM_NONE)
 341             {
 342               if (!match)
 343                 {
 344                   if (match_count) /* a previous match */
 345                     first_delimiter = 0; /* Only used when DM_SEPARATE */
 346                 }
 347               else if (match_count == 1)
 348                 {
 349                   if ((delimit_groups == DM_PREPEND)
 350                       || (delimit_groups == DM_SEPARATE
 351                           && !first_delimiter))
 352                     putc ('\n', ostream);
 353                 }
 354             }
 355
 356           if (!match || mode == output_all_repeated)
 357             {
 358               writeline (prevline, ostream, match_count);
 359               SWAP_LINES (prevline, thisline);
 360               prevfield = thisfield;
 361               prevlen = thislen;
 362               if (!match)
 363                 match_count = 0;
 364             }
 365         }
 366
 367       writeline (prevline, ostream, match_count);
 368     }
 369
 370  closefiles:
 371   if (ferror (istream) || fclose (istream) == EOF)
 372     error (EXIT_FAILURE, errno, _("error reading %s"), infile);
 373
 374   /* Close ostream only if it's not stdout -- the latter is closed
 375      via the atexit-invoked close_stdout.  */
 376   if (ostream != stdout && (ferror (ostream) || fclose (ostream) == EOF))
 377     error (EXIT_FAILURE, errno, _("error writing %s"), outfile);
 378
 379   free (lb1.buffer);
 380   free (lb2.buffer);
 381 }
 382
 383 int
 384 main (int argc, char **argv)
 385 {
 386   int optc = 0;
 387   int posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
 388   int nfiles = 0;
 389   char const *file[2];
 390
 391   file[0] = file[1] = "-";
 392   program_name = argv[0];
 393   setlocale (LC_ALL, "");
 394   bindtextdomain (PACKAGE, LOCALEDIR);
 395   textdomain (PACKAGE);
 396
 397   atexit (close_stdout);
 398
 399   skip_chars = 0;
 400   skip_fields = 0;
 401   check_chars = SIZE_MAX;
 402   mode = output_all;
 403   countmode = count_none;
 404   delimit_groups = DM_NONE;
 405
 406   for (;;)
 407     {
 408       /* Parse an operand with leading "+" as a file after "--" was
 409          seen; or if pedantic and a file was seen.  POSIX 1003.1-200x
 410          d7 removes support for such operands, so when it becomes
 411          official the code will need to be changed.  */
 412
 413       if (optc == -1
 414           || (posixly_correct && nfiles != 0)
 415           || ((optc = getopt_long (argc, argv,
 416                                    "-0123456789cdDf:is:uw:", longopts, NULL))
 417               == -1))
 418         {
 419           if (optind == argc)
 420             break;
 421           if (nfiles == 2)
 422             {
 423               error (0, 0, _("extra operand `%s'"), argv[optind]);
 424               usage (1);
 425             }
 426           file[nfiles++] = argv[optind++];
 427         }
 428       else switch (optc)
 429         {
 430         case 1:
 431           {
 432             unsigned long int size;
 433             if (optarg[0] == '+'
 434                 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
 435                 && size <= SIZE_MAX)
 436               skip_chars = size;
 437             else if (nfiles == 2)
 438               {
 439                 error (0, 0, _("extra operand `%s'"), optarg);
 440                 usage (1);
 441               }
 442             else
 443               file[nfiles++] = optarg;
 444           }
 445           break;
 446
 447         case '0':
 448         case '1':
 449         case '2':
 450         case '3':
 451         case '4':
 452         case '5':
 453         case '6':
 454         case '7':
 455         case '8':
 456         case '9':
 457           {
 458             size_t s = skip_fields;
 459             skip_fields = s * 10 + optc - '0';
 460             if (SIZE_MAX / 10 < s || skip_fields < s)
 461               error (EXIT_FAILURE, 0, "%s",
 462                      _("invalid number of fields to skip"));
 463           }
 464           break;
 465
 466         case 'c':
 467           countmode = count_occurrences;
 468           break;
 469
 470         case 'd':
 471           mode = output_repeated;
 472           break;
 473
 474         case 'D':
 475           mode = output_all_repeated;
 476           if (optarg == NULL)
 477             delimit_groups = DM_NONE;
 478           else
 479             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 480                                         delimit_method_string,
 481                                         delimit_method_map);
 482           break;
 483
 484         case 'f':               /* Like '-#'. */
 485           skip_fields = size_opt (optarg,
 486                                   N_("invalid number of fields to skip"));
 487           break;
 488
 489         case 'i':
 490           ignore_case = 1;
 491           break;
 492
 493         case 's':               /* Like '+#'. */
 494           skip_chars = size_opt (optarg,
 495                                  N_("invalid number of bytes to skip"));
 496           break;
 497
 498         case 'u':
 499           mode = output_unique;
 500           break;
 501
 502         case 'w':
 503           check_chars = size_opt (optarg,
 504                                   N_("invalid number of bytes to compare"));
 505           break;
 506
 507         case_GETOPT_HELP_CHAR;
 508
 509         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 510
 511         default:
 512           usage (1);
 513         }
 514     }
 515
 516   if (countmode == count_occurrences && mode == output_all_repeated)
 517     {
 518       error (0, 0,
 519            _("printing all duplicated lines and repeat counts is meaningless"));
 520       usage (1);
 521     }
 522
 523   check_file (file[0], file[1]);
 524
 525   exit (EXIT_SUCCESS);
 526 }