src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 86, 91, 1995-2001, Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Richard Stallman and David MacKenzie. */
  19 \f
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <getopt.h>
  24 #include <sys/types.h>
  25
  26 #include "system.h"
  27 #include "closeout.h"
  28 #include "argmatch.h"
  29 #include "linebuffer.h"
  30 #include "error.h"
  31 #include "xstrtol.h"
  32 #include "memcasecmp.h"
  33
  34 /* The official name of this program (e.g., no `g' prefix).  */
  35 #define PROGRAM_NAME "uniq"
  36
  37 #define AUTHORS N_ ("Richard Stallman and David MacKenzie")
  38
  39 #define SWAP_LINES(A, B)                        \
  40   do                                            \
  41     {                                           \
  42       struct linebuffer *_tmp;                  \
  43       _tmp = (A);                               \
  44       (A) = (B);                                \
  45       (B) = _tmp;                               \
  46     }                                           \
  47   while (0)
  48
  49 /* The name this program was run with. */
  50 char *program_name;
  51
  52 /* Number of fields to skip on each line when doing comparisons. */
  53 static size_t skip_fields;
  54
  55 /* Number of chars to skip after skipping any fields. */
  56 static size_t skip_chars;
  57
  58 /* Number of chars to compare. */
  59 static size_t check_chars;
  60
  61 enum countmode
  62 {
  63   count_occurrences,            /* -c Print count before output lines. */
  64   count_none                    /* Default.  Do not print counts. */
  65 };
  66
  67 /* Whether and how to precede the output lines with a count of the number of
  68    times they occurred in the input. */
  69 static enum countmode countmode;
  70
  71 enum output_mode
  72 {
  73   output_repeated,              /* -d Only lines that are repeated. */
  74   output_all_repeated,          /* -D All lines that are repeated. */
  75   output_unique,                /* -u Only lines that are not repeated. */
  76   output_all                    /* Default.  Print first copy of each line. */
  77 };
  78
  79 /* Which lines to output. */
  80 static enum output_mode mode;
  81
  82 /* If nonzero, ignore case when comparing.  */
  83 static int ignore_case;
  84
  85 enum delimit_method
  86 {
  87   /* No delimiters output.  --all-repeated[=none] */
  88   DM_NONE,
  89
  90   /* Delimiter precedes all groups.  --all-repeated=prepend */
  91   DM_PREPEND,
  92
  93   /* Delimit all groups.  --all-repeated=separate */
  94   DM_SEPARATE
  95 };
  96
  97 static char const *const delimit_method_string[] =
  98 {
  99   "none", "prepend", "separate", 0
 100 };
 101
 102 static enum delimit_method const delimit_method_map[] =
 103 {
 104   DM_NONE, DM_PREPEND, DM_SEPARATE
 105 };
 106
 107 /* Select whether/how to delimit groups of duplicate lines.  */
 108 static enum delimit_method delimit_groups;
 109
 110 static struct option const longopts[] =
 111 {
 112   {"count", no_argument, NULL, 'c'},
 113   {"repeated", no_argument, NULL, 'd'},
 114   {"all-repeated", optional_argument, NULL, 'D'},
 115   {"ignore-case", no_argument, NULL, 'i'},
 116   {"unique", no_argument, NULL, 'u'},
 117   {"skip-fields", required_argument, NULL, 'f'},
 118   {"skip-chars", required_argument, NULL, 's'},
 119   {"check-chars", required_argument, NULL, 'w'},
 120   {GETOPT_HELP_OPTION_DECL},
 121   {GETOPT_VERSION_OPTION_DECL},
 122   {NULL, 0, NULL, 0}
 123 };
 124
 125 void
 126 usage (int status)
 127 {
 128   if (status != 0)
 129     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 130              program_name);
 131   else
 132     {
 133       printf (_("\
 134 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 135 "),
 136               program_name);
 137       printf (_("\
 138 Discard all but one of successive identical lines from INPUT (or\n\
 139 standard input), writing to OUTPUT (or standard output).\n\
 140 \n\
 141   -c, --count           prefix lines by the number of occurrences\n\
 142   -d, --repeated        only print duplicate lines\n\
 143   -D, --all-repeated[=delimit-method] print all duplicate lines\n\
 144                         delimit-method={none(default),prepend,separate)}\n\
 145                         Delimiting is done with blank lines.\n\
 146   -f, --skip-fields=N   avoid comparing the first N fields\n\
 147   -i, --ignore-case     ignore differences in case when comparing\n\
 148   -s, --skip-chars=N    avoid comparing the first N characters\n\
 149   -u, --unique          only print unique lines\n\
 150   -w, --check-chars=N   compare no more than N characters in lines\n\
 151   -N                    same as -f N\n\
 152   +N                    same as -s N (obsolescent; will be withdrawn)\n\
 153       --help            display this help and exit\n\
 154       --version         output version information and exit\n\
 155 \n\
 156 A field is a run of whitespace, then non-whitespace characters.\n\
 157 Fields are skipped before chars.\n\
 158 "));
 159       puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
 160     }
 161   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 162 }
 163
 164 /* Convert OPT to size_t, reporting an error using MSGID if it does
 165    not fit.  */
 166
 167 static size_t
 168 size_opt (char const *opt, char const *msgid)
 169 {
 170   unsigned long int size;
 171   if (xstrtoul (opt, NULL, 10, &size, "") != LONGINT_OK
 172       || SIZE_MAX < size)
 173     error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 174   return size;
 175 }
 176
 177 /* Given a linebuffer LINE,
 178    return a pointer to the beginning of the line's field to be compared. */
 179
 180 static char *
 181 find_field (const struct linebuffer *line)
 182 {
 183   register size_t count;
 184   register char *lp = line->buffer;
 185   register size_t size = line->length - 1;
 186   register size_t i = 0;
 187
 188   for (count = 0; count < skip_fields && i < size; count++)
 189     {
 190       while (i < size && ISBLANK (lp[i]))
 191         i++;
 192       while (i < size && !ISBLANK (lp[i]))
 193         i++;
 194     }
 195
 196   for (count = 0; count < skip_chars && i < size; count++)
 197     i++;
 198
 199   return lp + i;
 200 }
 201
 202 /* Return zero if two strings OLD and NEW match, nonzero if not.
 203    OLD and NEW point not to the beginnings of the lines
 204    but rather to the beginnings of the fields to compare.
 205    OLDLEN and NEWLEN are their lengths. */
 206
 207 static int
 208 different (const char *old, const char *new, size_t oldlen, size_t newlen)
 209 {
 210   if (check_chars < oldlen)
 211     oldlen = check_chars;
 212   if (check_chars < newlen)
 213     newlen = check_chars;
 214
 215   if (oldlen != newlen)
 216     return 1;
 217
 218   /* Use an if-statement here rather than a function variable to
 219      avoid portability hassles of getting a non-conflicting declaration
 220      of memcmp.  */
 221   if (ignore_case)
 222     return memcasecmp (old, new, oldlen);
 223   else
 224     return memcmp (old, new, oldlen);
 225 }
 226
 227 /* Output the line in linebuffer LINE to stream STREAM
 228    provided that the switches say it should be output.
 229    If requested, print the number of times it occurred, as well;
 230    LINECOUNT + 1 is the number of times that the line occurred. */
 231
 232 static void
 233 writeline (const struct linebuffer *line, FILE *stream, int linecount)
 234 {
 235   if ((mode == output_unique && linecount != 0)
 236       || (mode == output_repeated && linecount == 0)
 237       || (mode == output_all_repeated && linecount == 0))
 238     return;
 239
 240   if (countmode == count_occurrences)
 241     fprintf (stream, "%7d\t", linecount + 1);
 242
 243   fwrite (line->buffer, sizeof (char), line->length, stream);
 244 }
 245
 246 /* Process input file INFILE with output to OUTFILE.
 247    If either is "-", use the standard I/O stream for it instead. */
 248
 249 static void
 250 check_file (const char *infile, const char *outfile)
 251 {
 252   FILE *istream;
 253   FILE *ostream;
 254   struct linebuffer lb1, lb2;
 255   struct linebuffer *thisline, *prevline;
 256
 257   if (STREQ (infile, "-"))
 258     istream = stdin;
 259   else
 260     istream = fopen (infile, "r");
 261   if (istream == NULL)
 262     error (EXIT_FAILURE, errno, "%s", infile);
 263
 264   if (STREQ (outfile, "-"))
 265     ostream = stdout;
 266   else
 267     ostream = fopen (outfile, "w");
 268   if (ostream == NULL)
 269     error (EXIT_FAILURE, errno, "%s", outfile);
 270
 271   thisline = &lb1;
 272   prevline = &lb2;
 273
 274   initbuffer (thisline);
 275   initbuffer (prevline);
 276
 277   /* The duplication in the following `if' and `else' blocks is an
 278      optimization to distinguish the common case (in which none of
 279      the following options has been specified: --count, -repeated,
 280      --all-repeated, --unique) from the others.  In the common case,
 281      this optimization lets uniq output each different line right away,
 282      without waiting to see if the next one is different.  */
 283
 284   if (mode == output_all && countmode == count_none)
 285     {
 286       char *prevfield IF_LINT (= NULL);
 287       size_t prevlen IF_LINT (= 0);
 288
 289       while (!feof (istream))
 290         {
 291           char *thisfield;
 292           size_t thislen;
 293           if (readline (thisline, istream) == 0)
 294             break;
 295           thisfield = find_field (thisline);
 296           thislen = thisline->length - (thisfield - thisline->buffer);
 297           if (prevline->length == 0
 298               || different (thisfield, prevfield, thislen, prevlen))
 299             {
 300               fwrite (thisline->buffer, sizeof (char),
 301                       thisline->length, ostream);
 302
 303               SWAP_LINES (prevline, thisline);
 304               prevfield = thisfield;
 305               prevlen = thislen;
 306             }
 307         }
 308     }
 309   else
 310     {
 311       char *prevfield;
 312       size_t prevlen;
 313       int match_count = 0;
 314       int first_delimiter = 1;
 315
 316       if (readline (prevline, istream) == 0)
 317         goto closefiles;
 318       prevfield = find_field (prevline);
 319       prevlen = prevline->length - (prevfield - prevline->buffer);
 320
 321       while (!feof (istream))
 322         {
 323           int match;
 324           char *thisfield;
 325           size_t thislen;
 326           if (readline (thisline, istream) == 0)
 327             break;
 328           thisfield = find_field (thisline);
 329           thislen = thisline->length - (thisfield - thisline->buffer);
 330           match = !different (thisfield, prevfield, thislen, prevlen);
 331
 332           if (match)
 333             ++match_count;
 334
 335           if (mode == output_all_repeated && delimit_groups != DM_NONE)
 336             {
 337               if (!match)
 338                 {
 339                   if (match_count) /* a previous match */
 340                     first_delimiter = 0; /* Only used when DM_SEPARATE */
 341                 }
 342               else if (match_count == 1)
 343                 {
 344                   if ((delimit_groups == DM_PREPEND)
 345                       || (delimit_groups == DM_SEPARATE
 346                           && !first_delimiter))
 347                     putc ('\n', ostream);
 348                 }
 349             }
 350
 351           if (!match || mode == output_all_repeated)
 352             {
 353               writeline (prevline, ostream, match_count);
 354               SWAP_LINES (prevline, thisline);
 355               prevfield = thisfield;
 356               prevlen = thislen;
 357               if (!match)
 358                 match_count = 0;
 359             }
 360         }
 361
 362       writeline (prevline, ostream, match_count);
 363     }
 364
 365  closefiles:
 366   if (ferror (istream) || fclose (istream) == EOF)
 367     error (EXIT_FAILURE, errno, _("error reading %s"), infile);
 368
 369   /* Close ostream only if it's not stdout -- the latter is closed
 370      via the atexit-invoked close_stdout.  */
 371   if (ostream != stdout && (ferror (ostream) || fclose (ostream) == EOF))
 372     error (EXIT_FAILURE, errno, _("error writing %s"), outfile);
 373
 374   free (lb1.buffer);
 375   free (lb2.buffer);
 376 }
 377
 378 int
 379 main (int argc, char **argv)
 380 {
 381   int optc = 0;
 382   int posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
 383   int nfiles = 0;
 384   char const *file[2];
 385
 386   file[0] = file[1] = "-";
 387   program_name = argv[0];
 388   setlocale (LC_ALL, "");
 389   bindtextdomain (PACKAGE, LOCALEDIR);
 390   textdomain (PACKAGE);
 391
 392   atexit (close_stdout);
 393
 394   skip_chars = 0;
 395   skip_fields = 0;
 396   check_chars = SIZE_MAX;
 397   mode = output_all;
 398   countmode = count_none;
 399   delimit_groups = DM_NONE;
 400
 401   for (;;)
 402     {
 403       /* Parse an operand with leading "+" as a file after "--" was
 404          seen; or if pedantic and a file was seen.  POSIX 1003.1-200x
 405          d7 removes support for such operands, so when it becomes
 406          official the code will need to be changed.  */
 407
 408       if (optc == -1
 409           || (posixly_correct && nfiles != 0)
 410           || ((optc = getopt_long (argc, argv,
 411                                    "-0123456789cdDf:is:uw:", longopts, NULL))
 412               == -1))
 413         {
 414           if (optind == argc)
 415             break;
 416           if (nfiles == 2)
 417             {
 418               error (0, 0, _("extra operand `%s'"), argv[optind]);
 419               usage (1);
 420             }
 421           file[nfiles++] = argv[optind++];
 422         }
 423       else switch (optc)
 424         {
 425         case 1:
 426           {
 427             unsigned long int size;
 428             if (optarg[0] == '+'
 429                 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
 430                 && size <= SIZE_MAX)
 431               skip_chars = size;
 432             else if (nfiles == 2)
 433               {
 434                 error (0, 0, _("extra operand `%s'"), optarg);
 435                 usage (1);
 436               }
 437             else
 438               file[nfiles++] = optarg;
 439           }
 440           break;
 441
 442         case '0':
 443         case '1':
 444         case '2':
 445         case '3':
 446         case '4':
 447         case '5':
 448         case '6':
 449         case '7':
 450         case '8':
 451         case '9':
 452           {
 453             size_t s = skip_fields;
 454             skip_fields = s * 10 + optc - '0';
 455             if (SIZE_MAX / 10 < s || skip_fields < s)
 456               error (EXIT_FAILURE, 0, "%s",
 457                      _("invalid number of fields to skip"));
 458           }
 459           break;
 460
 461         case 'c':
 462           countmode = count_occurrences;
 463           break;
 464
 465         case 'd':
 466           mode = output_repeated;
 467           break;
 468
 469         case 'D':
 470           mode = output_all_repeated;
 471           if (optarg == NULL)
 472             delimit_groups = DM_NONE;
 473           else
 474             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 475                                         delimit_method_string,
 476                                         delimit_method_map);
 477           break;
 478
 479         case 'f':               /* Like '-#'. */
 480           skip_fields = size_opt (optarg,
 481                                   N_("invalid number of fields to skip"));
 482           break;
 483
 484         case 'i':
 485           ignore_case = 1;
 486           break;
 487
 488         case 's':               /* Like '+#'. */
 489           skip_chars = size_opt (optarg,
 490                                  N_("invalid number of bytes to skip"));
 491           break;
 492
 493         case 'u':
 494           mode = output_unique;
 495           break;
 496
 497         case 'w':
 498           check_chars = size_opt (optarg,
 499                                   N_("invalid number of bytes to compare"));
 500           break;
 501
 502         case_GETOPT_HELP_CHAR;
 503
 504         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 505
 506         default:
 507           usage (1);
 508         }
 509     }
 510
 511   if (countmode == count_occurrences && mode == output_all_repeated)
 512     {
 513       error (0, 0,
 514            _("printing all duplicated lines and repeat counts is meaningless"));
 515       usage (1);
 516     }
 517
 518   check_file (file[0], file[1]);
 519
 520   exit (EXIT_SUCCESS);
 521 }