src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 1988, 1991-2005 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  17
  18    Written December 1988 by Mike Haertel.
  19    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  20    or (US mail) as Mike Haertel c/o Free Software Foundation.
  21
  22    Ørn E. Hansen added NLS support in 1997.  */
  23
  24 #include <config.h>
  25
  26 #include <getopt.h>
  27 #include <sys/types.h>
  28 #include <signal.h>
  29 #include "system.h"
  30 #include "error.h"
  31 #include "hard-locale.h"
  32 #include "inttostr.h"
  33 #include "physmem.h"
  34 #include "posixver.h"
  35 #include "quote.h"
  36 #include "stdlib--.h"
  37 #include "stdio--.h"
  38 #include "strnumcmp.h"
  39 #include "xmemcoll.h"
  40 #include "xstrtol.h"
  41
  42 #if HAVE_SYS_RESOURCE_H
  43 # include <sys/resource.h>
  44 #endif
  45 #ifndef RLIMIT_DATA
  46 struct rlimit { size_t rlim_cur; };
  47 # define getrlimit(Resource, Rlp) (-1)
  48 #endif
  49
  50 /* The official name of this program (e.g., no `g' prefix).  */
  51 #define PROGRAM_NAME "sort"
  52
  53 #define AUTHORS "Mike Haertel", "Paul Eggert"
  54
  55 #if HAVE_LANGINFO_CODESET
  56 # include <langinfo.h>
  57 #endif
  58
  59 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
  60    present.  */
  61 #ifndef SA_NOCLDSTOP
  62 # define SA_NOCLDSTOP 0
  63 # define sigprocmask(How, Set, Oset) /* empty */
  64 # define sigset_t int
  65 # if ! HAVE_SIGINTERRUPT
  66 #  define siginterrupt(sig, flag) /* empty */
  67 # endif
  68 #endif
  69
  70 #ifndef STDC_HEADERS
  71 double strtod ();
  72 #endif
  73
  74 #define UCHAR_LIM (UCHAR_MAX + 1)
  75
  76 #ifndef DEFAULT_TMPDIR
  77 # define DEFAULT_TMPDIR "/tmp"
  78 #endif
  79
  80 #include "rand-isaac.c"
  81
  82 /* Exit statuses.  */
  83 enum
  84   {
  85     /* POSIX says to exit with status 1 if invoked with -c and the
  86        input is not properly sorted.  */
  87     SORT_OUT_OF_ORDER = 1,
  88
  89     /* POSIX says any other irregular exit must exit with a status
  90        code greater than 1.  */
  91     SORT_FAILURE = 2
  92   };
  93
  94 /* The representation of the decimal point in the current locale.  */
  95 static int decimal_point;
  96
  97 /* Thousands separator; if -1, then there isn't one.  */
  98 static int thousands_sep;
  99
 100 /* Nonzero if the corresponding locales are hard.  */
 101 static bool hard_LC_COLLATE;
 102 #if HAVE_NL_LANGINFO
 103 static bool hard_LC_TIME;
 104 #endif
 105
 106 #define NONZERO(x) ((x) != 0)
 107
 108 /* The kind of blanks for '-b' to skip in various options. */
 109 enum blanktype { bl_start, bl_end, bl_both };
 110
 111 /* The character marking end of line. Default to \n. */
 112 static char eolchar = '\n';
 113
 114 /* Lines are held in core as counted strings. */
 115 struct line
 116 {
 117   char *text;                   /* Text of the line. */
 118   size_t length;                /* Length including final newline. */
 119   char *keybeg;                 /* Start of first key. */
 120   char *keylim;                 /* Limit of first key. */
 121 };
 122
 123 /* Input buffers. */
 124 struct buffer
 125 {
 126   char *buf;                    /* Dynamically allocated buffer,
 127                                    partitioned into 3 regions:
 128                                    - input data;
 129                                    - unused area;
 130                                    - an array of lines, in reverse order.  */
 131   size_t used;                  /* Number of bytes used for input data.  */
 132   size_t nlines;                /* Number of lines in the line array.  */
 133   size_t alloc;                 /* Number of bytes allocated. */
 134   size_t left;                  /* Number of bytes left from previous reads. */
 135   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 136   bool eof;                     /* An EOF has been read.  */
 137 };
 138
 139 struct keyfield
 140 {
 141   size_t sword;                 /* Zero-origin 'word' to start at. */
 142   size_t schar;                 /* Additional characters to skip. */
 143   size_t eword;                 /* Zero-origin first word after field. */
 144   size_t echar;                 /* Additional characters in field. */
 145   bool const *ignore;           /* Boolean array of characters to ignore. */
 146   char const *translate;        /* Translation applied to characters. */
 147   bool skipsblanks;             /* Skip leading blanks when finding start.  */
 148   bool skipeblanks;             /* Skip leading blanks when finding end.  */
 149   bool numeric;                 /* Flag for numeric comparison.  Handle
 150                                    strings of digits with optional decimal
 151                                    point, but no exponential notation. */
 152   bool random;                  /* Sort by random hash of key.  */
 153   bool general_numeric;         /* Flag for general, numeric comparison.
 154                                    Handle numbers in exponential notation. */
 155   bool month;                   /* Flag for comparison by month name. */
 156   bool reverse;                 /* Reverse the sense of comparison. */
 157   struct keyfield *next;        /* Next keyfield to try. */
 158 };
 159
 160 struct month
 161 {
 162   char const *name;
 163   int val;
 164 };
 165
 166 /* The name this program was run with. */
 167 char *program_name;
 168
 169 /* FIXME: None of these tables work with multibyte character sets.
 170    Also, there are many other bugs when handling multibyte characters.
 171    One way to fix this is to rewrite `sort' to use wide characters
 172    internally, but doing this with good performance is a bit
 173    tricky.  */
 174
 175 /* Table of blanks.  */
 176 static bool blanks[UCHAR_LIM];
 177
 178 /* Table of non-printing characters. */
 179 static bool nonprinting[UCHAR_LIM];
 180
 181 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 182 static bool nondictionary[UCHAR_LIM];
 183
 184 /* Translation table folding lower case to upper.  */
 185 static char fold_toupper[UCHAR_LIM];
 186
 187 #define MONTHS_PER_YEAR 12
 188
 189 /* Table mapping month names to integers.
 190    Alphabetic order allows binary search. */
 191 static struct month monthtab[] =
 192 {
 193   {"APR", 4},
 194   {"AUG", 8},
 195   {"DEC", 12},
 196   {"FEB", 2},
 197   {"JAN", 1},
 198   {"JUL", 7},
 199   {"JUN", 6},
 200   {"MAR", 3},
 201   {"MAY", 5},
 202   {"NOV", 11},
 203   {"OCT", 10},
 204   {"SEP", 9}
 205 };
 206
 207 /* During the merge phase, the number of files to merge at once. */
 208 #define NMERGE 16
 209
 210 /* Minimum size for a merge or check buffer.  */
 211 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 212
 213 /* Minimum sort size; the code might not work with smaller sizes.  */
 214 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
 215
 216 /* The number of bytes needed for a merge or check buffer, which can
 217    function relatively efficiently even if it holds only one line.  If
 218    a longer line is seen, this value is increased.  */
 219 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 220
 221 /* The approximate maximum number of bytes of main memory to use, as
 222    specified by the user.  Zero if the user has not specified a size.  */
 223 static size_t sort_size;
 224
 225 /* The guessed size for non-regular files.  */
 226 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 227
 228 /* Array of directory names in which any temporary files are to be created. */
 229 static char const **temp_dirs;
 230
 231 /* Number of temporary directory names used.  */
 232 static size_t temp_dir_count;
 233
 234 /* Number of allocated slots in temp_dirs.  */
 235 static size_t temp_dir_alloc;
 236
 237 /* Flag to reverse the order of all comparisons. */
 238 static bool reverse;
 239
 240 /* Flag for stable sort.  This turns off the last ditch bytewise
 241    comparison of lines, and instead leaves lines in the same order
 242    they were read if all keys compare equal.  */
 243 static bool stable;
 244
 245 /* If TAB has this value, blanks separate fields.  */
 246 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 247
 248 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 249    separated by the empty string between a non-blank character and a blank
 250    character. */
 251 static int tab = TAB_DEFAULT;
 252
 253 /* Flag to remove consecutive duplicate lines from the output.
 254    Only the last of a sequence of equal lines will be output. */
 255 static bool unique;
 256
 257 /* Nonzero if any of the input files are the standard input. */
 258 static bool have_read_stdin;
 259
 260 /* List of key field comparisons to be tried.  */
 261 static struct keyfield *keylist;
 262
 263 static void sortlines_temp (struct line *, size_t, struct line *);
 264
 265 /* Report MESSAGE for FILE, then clean up and exit.
 266    If FILE is null, it represents standard output.  */
 267
 268 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 269 static void
 270 die (char const *message, char const *file)
 271 {
 272   error (0, errno, "%s: %s", message, file ? file : _("standard output"));
 273   exit (SORT_FAILURE);
 274 }
 275
 276 void
 277 usage (int status)
 278 {
 279   if (status != EXIT_SUCCESS)
 280     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 281              program_name);
 282   else
 283     {
 284       printf (_("\
 285 Usage: %s [OPTION]... [FILE]...\n\
 286 "),
 287               program_name);
 288       fputs (_("\
 289 Write sorted concatenation of all FILE(s) to standard output.\n\
 290 \n\
 291 "), stdout);
 292       fputs (_("\
 293 Mandatory arguments to long options are mandatory for short options too.\n\
 294 "), stdout);
 295       fputs (_("\
 296 Ordering options:\n\
 297 \n\
 298 "), stdout);
 299       fputs (_("\
 300   -b, --ignore-leading-blanks  ignore leading blanks\n\
 301   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 302   -f, --ignore-case           fold lower case to upper case characters\n\
 303 "), stdout);
 304       fputs (_("\
 305   -g, --general-numeric-sort  compare according to general numerical value\n\
 306   -i, --ignore-nonprinting    consider only printable characters\n\
 307   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 308   -n, --numeric-sort          compare according to string numerical value\n\
 309   -R, --random-sort           sort by random hash of keys\n\
 310   -r, --reverse               reverse the result of comparisons\n\
 311 \n\
 312 "), stdout);
 313       fputs (_("\
 314 Other options:\n\
 315 \n\
 316   -c, --check               check whether input is sorted; do not sort\n\
 317   -k, --key=POS1[,POS2]     start a key at POS1, end it at POS2 (origin 1)\n\
 318   -m, --merge               merge already sorted files; do not sort\n\
 319   -o, --output=FILE         write result to FILE instead of standard output\n\
 320   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 321   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 322 "), stdout);
 323       printf (_("\
 324   -t, --field-separator=SEP  use SEP instead of non-blank to blank transition\n\
 325   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 326                               multiple options specify multiple directories\n\
 327   -u, --unique              with -c, check for strict ordering;\n\
 328                               without -c, output only the first of an equal run\n\
 329 "), DEFAULT_TMPDIR);
 330       fputs (_("\
 331   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 332 "), stdout);
 333       fputs (HELP_OPTION_DESCRIPTION, stdout);
 334       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 335       fputs (_("\
 336 \n\
 337 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 338 in the field.  If neither the -t nor the -b option is in effect, the characters\n\
 339 in a field are counted from the beginning of the preceding whitespace.  OPTS is\n\
 340 one or more single-letter ordering options, which override global ordering\n\
 341 options for that key.  If no key is given, use the entire line as the key.\n\
 342 \n\
 343 SIZE may be followed by the following multiplicative suffixes:\n\
 344 "), stdout);
 345       fputs (_("\
 346 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 347 \n\
 348 With no FILE, or when FILE is -, read standard input.\n\
 349 \n\
 350 *** WARNING ***\n\
 351 The locale specified by the environment affects sort order.\n\
 352 Set LC_ALL=C to get the traditional sort order that uses\n\
 353 native byte values.\n\
 354 "), stdout );
 355       printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 356     }
 357
 358   exit (status);
 359 }
 360
 361 /* For long options that have no equivalent short option, use a
 362    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 363 enum
 364 {
 365   SEED_OPTION = CHAR_MAX + 1
 366 };
 367
 368 static char const short_options[] = "-bcdfgik:mMno:rRsS:t:T:uy:z";
 369
 370 static struct option const long_options[] =
 371 {
 372   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 373   {"check", no_argument, NULL, 'c'},
 374   {"dictionary-order", no_argument, NULL, 'd'},
 375   {"ignore-case", no_argument, NULL, 'f'},
 376   {"general-numeric-sort", no_argument, NULL, 'g'},
 377   {"ignore-nonprinting", no_argument, NULL, 'i'},
 378   {"key", required_argument, NULL, 'k'},
 379   {"merge", no_argument, NULL, 'm'},
 380   {"month-sort", no_argument, NULL, 'M'},
 381   {"numeric-sort", no_argument, NULL, 'n'},
 382   {"random-sort", no_argument, NULL, 'R'},
 383   {"output", required_argument, NULL, 'o'},
 384   {"reverse", no_argument, NULL, 'r'},
 385   {"stable", no_argument, NULL, 's'},
 386   {"buffer-size", required_argument, NULL, 'S'},
 387   {"field-separator", required_argument, NULL, 't'},
 388   {"temporary-directory", required_argument, NULL, 'T'},
 389   {"unique", no_argument, NULL, 'u'},
 390   {"zero-terminated", no_argument, NULL, 'z'},
 391   {"seed", required_argument, NULL, SEED_OPTION}, /* This will go away soon.  */
 392   {GETOPT_HELP_OPTION_DECL},
 393   {GETOPT_VERSION_OPTION_DECL},
 394   {NULL, 0, NULL, 0},
 395 };
 396
 397 /* The set of signals that are caught.  */
 398 static sigset_t caught_signals;
 399
 400 /* The list of temporary files. */
 401 struct tempnode
 402 {
 403   struct tempnode *volatile next;
 404   char name[1];  /* Actual size is 1 + file name length.  */
 405 };
 406 static struct tempnode *volatile temphead;
 407 static struct tempnode *volatile *temptail = &temphead;
 408
 409 /* Clean up any remaining temporary files.  */
 410
 411 static void
 412 cleanup (void)
 413 {
 414   struct tempnode const *node;
 415
 416   for (node = temphead; node; node = node->next)
 417     unlink (node->name);
 418 }
 419
 420 /* Create a new temporary file, returning its newly allocated name.
 421    Store into *PFP a stream open for writing.  */
 422
 423 static char *
 424 create_temp_file (FILE **pfp)
 425 {
 426   static char const slashbase[] = "/sortXXXXXX";
 427   static size_t temp_dir_index;
 428   sigset_t oldset;
 429   int fd;
 430   int saved_errno;
 431   char const *temp_dir = temp_dirs[temp_dir_index];
 432   size_t len = strlen (temp_dir);
 433   struct tempnode *node =
 434     xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
 435   char *file = node->name;
 436
 437   memcpy (file, temp_dir, len);
 438   memcpy (file + len, slashbase, sizeof slashbase);
 439   node->next = NULL;
 440   if (++temp_dir_index == temp_dir_count)
 441     temp_dir_index = 0;
 442
 443   /* Create the temporary file in a critical section, to avoid races.  */
 444   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 445   fd = mkstemp (file);
 446   if (0 <= fd)
 447     {
 448       *temptail = node;
 449       temptail = &node->next;
 450     }
 451   saved_errno = errno;
 452   sigprocmask (SIG_SETMASK, &oldset, NULL);
 453   errno = saved_errno;
 454
 455   if (fd < 0 || (*pfp = fdopen (fd, "w")) == NULL)
 456     die (_("cannot create temporary file"), file);
 457
 458   return file;
 459 }
 460
 461 /* Return a stream for FILE, opened with mode HOW.  A null FILE means
 462    standard output; HOW should be "w".  When opening for input, "-"
 463    means standard input.  To avoid confusion, do not return file
 464    descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
 465    opening an ordinary FILE.  */
 466
 467 static FILE *
 468 xfopen (const char *file, const char *how)
 469 {
 470   FILE *fp;
 471
 472   if (!file)
 473     fp = stdout;
 474   else if (STREQ (file, "-") && *how == 'r')
 475     {
 476       have_read_stdin = true;
 477       fp = stdin;
 478     }
 479   else
 480     {
 481       fp = fopen (file, how);
 482       if (! fp)
 483         die (_("open failed"), file);
 484     }
 485
 486   return fp;
 487 }
 488
 489 /* Close FP, whose name is FILE, and report any errors.  */
 490
 491 static void
 492 xfclose (FILE *fp, char const *file)
 493 {
 494   switch (fileno (fp))
 495     {
 496     case STDIN_FILENO:
 497       /* Allow reading stdin from tty more than once.  */
 498       if (feof (fp))
 499         clearerr (fp);
 500       break;
 501
 502     case STDOUT_FILENO:
 503       /* Don't close stdout just yet.  close_stdout does that.  */
 504       if (fflush (fp) != 0)
 505         die (_("fflush failed"), file);
 506       break;
 507
 508     default:
 509       if (fclose (fp) != 0)
 510         die (_("close failed"), file);
 511       break;
 512     }
 513 }
 514
 515 static void
 516 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
 517 {
 518   if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
 519     die (_("write failed"), output_file);
 520 }
 521
 522 /* Append DIR to the array of temporary directory names.  */
 523 static void
 524 add_temp_dir (char const *dir)
 525 {
 526   if (temp_dir_count == temp_dir_alloc)
 527     temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
 528
 529   temp_dirs[temp_dir_count++] = dir;
 530 }
 531
 532 /* Remove NAME from the list of temporary files.  */
 533
 534 static void
 535 zaptemp (const char *name)
 536 {
 537   struct tempnode *volatile *pnode;
 538   struct tempnode *node;
 539   struct tempnode *next;
 540   sigset_t oldset;
 541   int unlink_status;
 542   int unlink_errno = 0;
 543
 544   for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
 545     continue;
 546
 547   /* Unlink the temporary file in a critical section to avoid races.  */
 548   next = node->next;
 549   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 550   unlink_status = unlink (name);
 551   unlink_errno = errno;
 552   *pnode = next;
 553   sigprocmask (SIG_SETMASK, &oldset, NULL);
 554
 555   if (unlink_status != 0)
 556     error (0, unlink_errno, _("warning: cannot remove: %s"), name);
 557   if (! next)
 558     temptail = pnode;
 559   free (node);
 560 }
 561
 562 #if HAVE_NL_LANGINFO
 563
 564 static int
 565 struct_month_cmp (const void *m1, const void *m2)
 566 {
 567   struct month const *month1 = m1;
 568   struct month const *month2 = m2;
 569   return strcmp (month1->name, month2->name);
 570 }
 571
 572 #endif
 573
 574 /* Initialize the character class tables. */
 575
 576 static void
 577 inittables (void)
 578 {
 579   size_t i;
 580
 581   for (i = 0; i < UCHAR_LIM; ++i)
 582     {
 583       blanks[i] = !!ISBLANK (i);
 584       nonprinting[i] = !ISPRINT (i);
 585       nondictionary[i] = !ISALNUM (i) && !ISBLANK (i);
 586       fold_toupper[i] = (ISLOWER (i) ? toupper (i) : i);
 587     }
 588
 589 #if HAVE_NL_LANGINFO
 590   /* If we're not in the "C" locale, read different names for months.  */
 591   if (hard_LC_TIME)
 592     {
 593       for (i = 0; i < MONTHS_PER_YEAR; i++)
 594         {
 595           char const *s;
 596           size_t s_len;
 597           size_t j;
 598           char *name;
 599
 600           s = (char *) nl_langinfo (ABMON_1 + i);
 601           s_len = strlen (s);
 602           monthtab[i].name = name = xmalloc (s_len + 1);
 603           monthtab[i].val = i + 1;
 604
 605           for (j = 0; j < s_len; j++)
 606             name[j] = fold_toupper[to_uchar (s[j])];
 607           name[j] = '\0';
 608         }
 609       qsort ((void *) monthtab, MONTHS_PER_YEAR,
 610              sizeof *monthtab, struct_month_cmp);
 611     }
 612 #endif
 613 }
 614
 615 /* Specify the amount of main memory to use when sorting.  */
 616 static void
 617 specify_sort_size (char const *s)
 618 {
 619   uintmax_t n;
 620   char *suffix;
 621   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
 622
 623   /* The default unit is KiB.  */
 624   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
 625     {
 626       if (n <= UINTMAX_MAX / 1024)
 627         n *= 1024;
 628       else
 629         e = LONGINT_OVERFLOW;
 630     }
 631
 632   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
 633   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
 634     switch (suffix[0])
 635       {
 636       case 'b':
 637         e = LONGINT_OK;
 638         break;
 639
 640       case '%':
 641         {
 642           double mem = physmem_total () * n / 100;
 643
 644           /* Use "<", not "<=", to avoid problems with rounding.  */
 645           if (mem < UINTMAX_MAX)
 646             {
 647               n = mem;
 648               e = LONGINT_OK;
 649             }
 650           else
 651             e = LONGINT_OVERFLOW;
 652         }
 653         break;
 654       }
 655
 656   if (e == LONGINT_OK)
 657     {
 658       /* If multiple sort sizes are specified, take the maximum, so
 659          that option order does not matter.  */
 660       if (n < sort_size)
 661         return;
 662
 663       sort_size = n;
 664       if (sort_size == n)
 665         {
 666           sort_size = MAX (sort_size, MIN_SORT_SIZE);
 667           return;
 668         }
 669
 670       e = LONGINT_OVERFLOW;
 671     }
 672
 673   STRTOL_FATAL_ERROR (s, _("sort size"), e);
 674 }
 675
 676 /* Return the default sort size.  */
 677 static size_t
 678 default_sort_size (void)
 679 {
 680   /* Let MEM be available memory or 1/8 of total memory, whichever
 681      is greater.  */
 682   double avail = physmem_available ();
 683   double total = physmem_total ();
 684   double mem = MAX (avail, total / 8);
 685   struct rlimit rlimit;
 686
 687   /* Let SIZE be MEM, but no more than the maximum object size or
 688      system resource limits.  Avoid the MIN macro here, as it is not
 689      quite right when only one argument is floating point.  Don't
 690      bother to check for values like RLIM_INFINITY since in practice
 691      they are not much less than SIZE_MAX.  */
 692   size_t size = SIZE_MAX;
 693   if (mem < size)
 694     size = mem;
 695   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
 696     size = rlimit.rlim_cur;
 697 #ifdef RLIMIT_AS
 698   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
 699     size = rlimit.rlim_cur;
 700 #endif
 701
 702   /* Leave a large safety margin for the above limits, as failure can
 703      occur when they are exceeded.  */
 704   size /= 2;
 705
 706 #ifdef RLIMIT_RSS
 707   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
 708      Exceeding RSS is not fatal, but can be quite slow.  */
 709   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
 710     size = rlimit.rlim_cur / 16 * 15;
 711 #endif
 712
 713   /* Use no less than the minimum.  */
 714   return MAX (size, MIN_SORT_SIZE);
 715 }
 716
 717 /* Return the sort buffer size to use with the input files identified
 718    by FPS and FILES, which are alternate names of the same files.
 719    NFILES gives the number of input files; NFPS may be less.  Assume
 720    that each input line requires LINE_BYTES extra bytes' worth of line
 721    information.  Do not exceed the size bound specified by the user
 722    (or a default size bound, if the user does not specify one).  */
 723
 724 static size_t
 725 sort_buffer_size (FILE *const *fps, size_t nfps,
 726                   char *const *files, size_t nfiles,
 727                   size_t line_bytes)
 728 {
 729   /* A bound on the input size.  If zero, the bound hasn't been
 730      determined yet.  */
 731   static size_t size_bound;
 732
 733   /* In the worst case, each input byte is a newline.  */
 734   size_t worst_case_per_input_byte = line_bytes + 1;
 735
 736   /* Keep enough room for one extra input line and an extra byte.
 737      This extra room might be needed when preparing to read EOF.  */
 738   size_t size = worst_case_per_input_byte + 1;
 739
 740   size_t i;
 741
 742   for (i = 0; i < nfiles; i++)
 743     {
 744       struct stat st;
 745       off_t file_size;
 746       size_t worst_case;
 747
 748       if ((i < nfps ? fstat (fileno (fps[i]), &st)
 749            : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
 750            : stat (files[i], &st))
 751           != 0)
 752         die (_("stat failed"), files[i]);
 753
 754       if (S_ISREG (st.st_mode))
 755         file_size = st.st_size;
 756       else
 757         {
 758           /* The file has unknown size.  If the user specified a sort
 759              buffer size, use that; otherwise, guess the size.  */
 760           if (sort_size)
 761             return sort_size;
 762           file_size = INPUT_FILE_SIZE_GUESS;
 763         }
 764
 765       if (! size_bound)
 766         {
 767           size_bound = sort_size;
 768           if (! size_bound)
 769             size_bound = default_sort_size ();
 770         }
 771
 772       /* Add the amount of memory needed to represent the worst case
 773          where the input consists entirely of newlines followed by a
 774          single non-newline.  Check for overflow.  */
 775       worst_case = file_size * worst_case_per_input_byte + 1;
 776       if (file_size != worst_case / worst_case_per_input_byte
 777           || size_bound - size <= worst_case)
 778         return size_bound;
 779       size += worst_case;
 780     }
 781
 782   return size;
 783 }
 784
 785 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
 786    must be at least sizeof (struct line).  Allocate ALLOC bytes
 787    initially.  */
 788
 789 static void
 790 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
 791 {
 792   /* Ensure that the line array is properly aligned.  If the desired
 793      size cannot be allocated, repeatedly halve it until allocation
 794      succeeds.  The smaller allocation may hurt overall performance,
 795      but that's better than failing.  */
 796   for (;;)
 797     {
 798       alloc += sizeof (struct line) - alloc % sizeof (struct line);
 799       buf->buf = malloc (alloc);
 800       if (buf->buf)
 801         break;
 802       alloc /= 2;
 803       if (alloc <= line_bytes + 1)
 804         xalloc_die ();
 805     }
 806
 807   buf->line_bytes = line_bytes;
 808   buf->alloc = alloc;
 809   buf->used = buf->left = buf->nlines = 0;
 810   buf->eof = false;
 811 }
 812
 813 /* Return one past the limit of the line array.  */
 814
 815 static inline struct line *
 816 buffer_linelim (struct buffer const *buf)
 817 {
 818   return (struct line *) (buf->buf + buf->alloc);
 819 }
 820
 821 /* Return a pointer to the first character of the field specified
 822    by KEY in LINE. */
 823
 824 static char *
 825 begfield (const struct line *line, const struct keyfield *key)
 826 {
 827   char *ptr = line->text, *lim = ptr + line->length - 1;
 828   size_t sword = key->sword;
 829   size_t schar = key->schar;
 830   size_t remaining_bytes;
 831
 832   /* The leading field separator itself is included in a field when -t
 833      is absent.  */
 834
 835   if (tab != TAB_DEFAULT)
 836     while (ptr < lim && sword--)
 837       {
 838         while (ptr < lim && *ptr != tab)
 839           ++ptr;
 840         if (ptr < lim)
 841           ++ptr;
 842       }
 843   else
 844     while (ptr < lim && sword--)
 845       {
 846         while (ptr < lim && blanks[to_uchar (*ptr)])
 847           ++ptr;
 848         while (ptr < lim && !blanks[to_uchar (*ptr)])
 849           ++ptr;
 850       }
 851
 852   if (key->skipsblanks)
 853     while (ptr < lim && blanks[to_uchar (*ptr)])
 854       ++ptr;
 855
 856   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
 857   remaining_bytes = lim - ptr;
 858   if (schar < remaining_bytes)
 859     ptr += schar;
 860   else
 861     ptr = lim;
 862
 863   return ptr;
 864 }
 865
 866 /* Return the limit of (a pointer to the first character after) the field
 867    in LINE specified by KEY. */
 868
 869 static char *
 870 limfield (const struct line *line, const struct keyfield *key)
 871 {
 872   char *ptr = line->text, *lim = ptr + line->length - 1;
 873   size_t eword = key->eword, echar = key->echar;
 874   size_t remaining_bytes;
 875
 876   /* Move PTR past EWORD fields or to one past the last byte on LINE,
 877      whichever comes first.  If there are more than EWORD fields, leave
 878      PTR pointing at the beginning of the field having zero-based index,
 879      EWORD.  If a delimiter character was specified (via -t), then that
 880      `beginning' is the first character following the delimiting TAB.
 881      Otherwise, leave PTR pointing at the first `blank' character after
 882      the preceding field.  */
 883   if (tab != TAB_DEFAULT)
 884     while (ptr < lim && eword--)
 885       {
 886         while (ptr < lim && *ptr != tab)
 887           ++ptr;
 888         if (ptr < lim && (eword | echar))
 889           ++ptr;
 890       }
 891   else
 892     while (ptr < lim && eword--)
 893       {
 894         while (ptr < lim && blanks[to_uchar (*ptr)])
 895           ++ptr;
 896         while (ptr < lim && !blanks[to_uchar (*ptr)])
 897           ++ptr;
 898       }
 899
 900 #ifdef POSIX_UNSPECIFIED
 901   /* The following block of code makes GNU sort incompatible with
 902      standard Unix sort, so it's ifdef'd out for now.
 903      The POSIX spec isn't clear on how to interpret this.
 904      FIXME: request clarification.
 905
 906      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
 907      Date: Thu, 30 May 96 12:20:41 -0400
 908      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
 909
 910      [...]I believe I've found another bug in `sort'.
 911
 912      $ cat /tmp/sort.in
 913      a b c 2 d
 914      pq rs 1 t
 915      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
 916      a b c 2 d
 917      pq rs 1 t
 918      $ /bin/sort -k1.7,1.7 </tmp/sort.in
 919      pq rs 1 t
 920      a b c 2 d
 921
 922      Unix sort produced the answer I expected: sort on the single character
 923      in column 7.  GNU sort produced different results, because it disagrees
 924      on the interpretation of the key-end spec "M.N".  Unix sort reads this
 925      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
 926      "skip M-1 fields, then either N-1 characters or the rest of the current
 927      field, whichever comes first".  This extra clause applies only to
 928      key-ends, not key-starts.
 929      */
 930
 931   /* Make LIM point to the end of (one byte past) the current field.  */
 932   if (tab != TAB_DEFAULT)
 933     {
 934       char *newlim;
 935       newlim = memchr (ptr, tab, lim - ptr);
 936       if (newlim)
 937         lim = newlim;
 938     }
 939   else
 940     {
 941       char *newlim;
 942       newlim = ptr;
 943       while (newlim < lim && blanks[to_uchar (*newlim)])
 944         ++newlim;
 945       while (newlim < lim && !blanks[to_uchar (*newlim)])
 946         ++newlim;
 947       lim = newlim;
 948     }
 949 #endif
 950
 951   /* If we're ignoring leading blanks when computing the End
 952      of the field, don't start counting bytes until after skipping
 953      past any leading blanks. */
 954   if (key->skipeblanks)
 955     while (ptr < lim && blanks[to_uchar (*ptr)])
 956       ++ptr;
 957
 958   /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
 959   remaining_bytes = lim - ptr;
 960   if (echar < remaining_bytes)
 961     ptr += echar;
 962   else
 963     ptr = lim;
 964
 965   return ptr;
 966 }
 967
 968 /* Fill BUF reading from FP, moving buf->left bytes from the end
 969    of buf->buf to the beginning first.  If EOF is reached and the
 970    file wasn't terminated by a newline, supply one.  Set up BUF's line
 971    table too.  FILE is the name of the file corresponding to FP.
 972    Return true if some input was read.  */
 973
 974 static bool
 975 fillbuf (struct buffer *buf, FILE *fp, char const *file)
 976 {
 977   struct keyfield const *key = keylist;
 978   char eol = eolchar;
 979   size_t line_bytes = buf->line_bytes;
 980   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
 981
 982   if (buf->eof)
 983     return false;
 984
 985   if (buf->used != buf->left)
 986     {
 987       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
 988       buf->used = buf->left;
 989       buf->nlines = 0;
 990     }
 991
 992   for (;;)
 993     {
 994       char *ptr = buf->buf + buf->used;
 995       struct line *linelim = buffer_linelim (buf);
 996       struct line *line = linelim - buf->nlines;
 997       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
 998       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
 999
1000       while (line_bytes + 1 < avail)
1001         {
1002           /* Read as many bytes as possible, but do not read so many
1003              bytes that there might not be enough room for the
1004              corresponding line array.  The worst case is when the
1005              rest of the input file consists entirely of newlines,
1006              except that the last byte is not a newline.  */
1007           size_t readsize = (avail - 1) / (line_bytes + 1);
1008           size_t bytes_read = fread (ptr, 1, readsize, fp);
1009           char *ptrlim = ptr + bytes_read;
1010           char *p;
1011           avail -= bytes_read;
1012
1013           if (bytes_read != readsize)
1014             {
1015               if (ferror (fp))
1016                 die (_("read failed"), file);
1017               if (feof (fp))
1018                 {
1019                   buf->eof = true;
1020                   if (buf->buf == ptrlim)
1021                     return false;
1022                   if (ptrlim[-1] != eol)
1023                     *ptrlim++ = eol;
1024                 }
1025             }
1026
1027           /* Find and record each line in the just-read input.  */
1028           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1029             {
1030               ptr = p + 1;
1031               line--;
1032               line->text = line_start;
1033               line->length = ptr - line_start;
1034               mergesize = MAX (mergesize, line->length);
1035               avail -= line_bytes;
1036
1037               if (key)
1038                 {
1039                   /* Precompute the position of the first key for
1040                      efficiency.  */
1041                   line->keylim = (key->eword == SIZE_MAX
1042                                   ? p
1043                                   : limfield (line, key));
1044
1045                   if (key->sword != SIZE_MAX)
1046                     line->keybeg = begfield (line, key);
1047                   else
1048                     {
1049                       if (key->skipsblanks)
1050                         while (blanks[to_uchar (*line_start)])
1051                           line_start++;
1052                       line->keybeg = line_start;
1053                     }
1054                 }
1055
1056               line_start = ptr;
1057             }
1058
1059           ptr = ptrlim;
1060           if (buf->eof)
1061             break;
1062         }
1063
1064       buf->used = ptr - buf->buf;
1065       buf->nlines = buffer_linelim (buf) - line;
1066       if (buf->nlines != 0)
1067         {
1068           buf->left = ptr - line_start;
1069           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1070           return true;
1071         }
1072
1073       /* The current input line is too long to fit in the buffer.
1074          Double the buffer size and try again.  */
1075       buf->buf = X2REALLOC (buf->buf, &buf->alloc);
1076     }
1077 }
1078
1079 /* Compare strings A and B as numbers without explicitly converting them to
1080    machine numbers.  Comparatively slow for short strings, but asymptotically
1081    hideously fast. */
1082
1083 static int
1084 numcompare (const char *a, const char *b)
1085 {
1086   while (blanks[to_uchar (*a)])
1087     a++;
1088   while (blanks[to_uchar (*b)])
1089     b++;
1090
1091   return strnumcmp (a, b, decimal_point, thousands_sep);
1092 }
1093
1094 static int
1095 general_numcompare (const char *sa, const char *sb)
1096 {
1097   /* FIXME: add option to warn about failed conversions.  */
1098   /* FIXME: maybe add option to try expensive FP conversion
1099      only if A and B can't be compared more cheaply/accurately.  */
1100
1101   char *ea;
1102   char *eb;
1103   double a = strtod (sa, &ea);
1104   double b = strtod (sb, &eb);
1105
1106   /* Put conversion errors at the start of the collating sequence.  */
1107   if (sa == ea)
1108     return sb == eb ? 0 : -1;
1109   if (sb == eb)
1110     return 1;
1111
1112   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1113      conversion errors but before numbers; sort them by internal
1114      bit-pattern, for lack of a more portable alternative.  */
1115   return (a < b ? -1
1116           : a > b ? 1
1117           : a == b ? 0
1118           : b == b ? -1
1119           : a == a ? 1
1120           : memcmp ((char *) &a, (char *) &b, sizeof a));
1121 }
1122
1123 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1124    Return 0 if the name in S is not recognized.  */
1125
1126 static int
1127 getmonth (char const *month, size_t len)
1128 {
1129   size_t lo = 0;
1130   size_t hi = MONTHS_PER_YEAR;
1131   char const *monthlim = month + len;
1132
1133   for (;;)
1134     {
1135       if (month == monthlim)
1136         return 0;
1137       if (!blanks[to_uchar (*month)])
1138         break;
1139       ++month;
1140     }
1141
1142   do
1143     {
1144       size_t ix = (lo + hi) / 2;
1145       char const *m = month;
1146       char const *n = monthtab[ix].name;
1147
1148       for (;; m++, n++)
1149         {
1150           if (!*n)
1151             return monthtab[ix].val;
1152           if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1153             {
1154               hi = ix;
1155               break;
1156             }
1157           else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1158             {
1159               lo = ix + 1;
1160               break;
1161             }
1162         }
1163     }
1164   while (lo < hi);
1165
1166   return 0;
1167 }
1168
1169 /* The ISAAC state resulting from the user-specified seed, or from
1170    random data taken from the environment.  */
1171 static struct isaac_state rand_state;
1172
1173 /* Set *RESULT to the ISAAC state resulting from applying TEXT (with
1174    length LEN) to rand_state.  */
1175 static void
1176 get_hash (char const *text, size_t len, uint32_t resbuf[ISAAC_WORDS])
1177 {
1178   struct isaac_state s = rand_state;
1179   isaac_seed_data (&s, text, len);
1180   isaac_seed_finish (&s);
1181   isaac_refill (&s, resbuf);
1182 }
1183
1184 /* Compare two lines A and B trying every key in sequence until there
1185    are no more keys or a difference is found. */
1186
1187 static int
1188 keycompare (const struct line *a, const struct line *b)
1189 {
1190   struct keyfield const *key = keylist;
1191
1192   /* For the first iteration only, the key positions have been
1193      precomputed for us. */
1194   char *texta = a->keybeg;
1195   char *textb = b->keybeg;
1196   char *lima = a->keylim;
1197   char *limb = b->keylim;
1198
1199   int diff;
1200
1201   for (;;)
1202     {
1203       char const *translate = key->translate;
1204       bool const *ignore = key->ignore;
1205
1206       /* Find the lengths. */
1207       size_t lena = lima <= texta ? 0 : lima - texta;
1208       size_t lenb = limb <= textb ? 0 : limb - textb;
1209
1210       /* Actually compare the fields. */
1211
1212       if (key->random)
1213         {
1214           int i;
1215           uint32_t diga[ISAAC_WORDS];
1216           uint32_t digb[ISAAC_WORDS];
1217           get_hash (texta, lena, diga);
1218           get_hash (textb, lenb, digb);
1219
1220           /* Go backwards, since the last words are marginally better
1221              mixed.  */
1222           for (i = ISAAC_WORDS - 1; 0 <= i; i--)
1223             if (diga[i] != digb[i])
1224               {
1225                 diff = (diga[i] < digb[i] ? -1 : 1);
1226                 goto not_equal;
1227               }
1228
1229           /* FIXME: Should break ties by rehashing with a different
1230              random hashing function (and repeat until the tie is
1231              broken) unless --seed was specified.  The probability of
1232              this being needed should be infinitesimal.  */
1233         }
1234
1235       if (key->numeric | key->general_numeric)
1236         {
1237           char savea = *lima, saveb = *limb;
1238
1239           *lima = *limb = '\0';
1240           diff = ((key->numeric ? numcompare : general_numcompare)
1241                   (texta, textb));
1242           *lima = savea, *limb = saveb;
1243         }
1244       else if (key->month)
1245         diff = getmonth (texta, lena) - getmonth (textb, lenb);
1246       /* Sorting like this may become slow, so in a simple locale the user
1247          can select a faster sort that is similar to ascii sort.  */
1248       else if (hard_LC_COLLATE)
1249         {
1250           if (ignore || translate)
1251             {
1252               char buf[4000];
1253               size_t size = lena + 1 + lenb + 1;
1254               char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1255               char *copy_b = copy_a + lena + 1;
1256               size_t new_len_a, new_len_b, i;
1257
1258               /* Ignore and/or translate chars before comparing.  */
1259               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1260                 {
1261                   if (i < lena)
1262                     {
1263                       copy_a[new_len_a] = (translate
1264                                            ? translate[to_uchar (texta[i])]
1265                                            : texta[i]);
1266                       if (!ignore || !ignore[to_uchar (texta[i])])
1267                         ++new_len_a;
1268                     }
1269                   if (i < lenb)
1270                     {
1271                       copy_b[new_len_b] = (translate
1272                                            ? translate[to_uchar (textb[i])]
1273                                            : textb [i]);
1274                       if (!ignore || !ignore[to_uchar (textb[i])])
1275                         ++new_len_b;
1276                     }
1277                 }
1278
1279               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1280
1281               if (sizeof buf < size)
1282                 free (copy_a);
1283             }
1284           else if (lena == 0)
1285             diff = - NONZERO (lenb);
1286           else if (lenb == 0)
1287             goto greater;
1288           else
1289             diff = xmemcoll (texta, lena, textb, lenb);
1290         }
1291       else if (ignore)
1292         {
1293 #define CMP_WITH_IGNORE(A, B)                                           \
1294   do                                                                    \
1295     {                                                                   \
1296           for (;;)                                                      \
1297             {                                                           \
1298               while (texta < lima && ignore[to_uchar (*texta)])         \
1299                 ++texta;                                                \
1300               while (textb < limb && ignore[to_uchar (*textb)])         \
1301                 ++textb;                                                \
1302               if (! (texta < lima && textb < limb))                     \
1303                 break;                                                  \
1304               diff = to_uchar (A) - to_uchar (B);                       \
1305               if (diff)                                                 \
1306                 goto not_equal;                                         \
1307               ++texta;                                                  \
1308               ++textb;                                                  \
1309             }                                                           \
1310                                                                         \
1311           diff = (texta < lima) - (textb < limb);                       \
1312     }                                                                   \
1313   while (0)
1314
1315           if (translate)
1316             CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1317                              translate[to_uchar (*textb)]);
1318           else
1319             CMP_WITH_IGNORE (*texta, *textb);
1320         }
1321       else if (lena == 0)
1322         diff = - NONZERO (lenb);
1323       else if (lenb == 0)
1324         goto greater;
1325       else
1326         {
1327           if (translate)
1328             {
1329               while (texta < lima && textb < limb)
1330                 {
1331                   diff = (to_uchar (translate[to_uchar (*texta++)])
1332                           - to_uchar (translate[to_uchar (*textb++)]));
1333                   if (diff)
1334                     goto not_equal;
1335                 }
1336             }
1337           else
1338             {
1339               diff = memcmp (texta, textb, MIN (lena, lenb));
1340               if (diff)
1341                 goto not_equal;
1342             }
1343           diff = lena < lenb ? -1 : lena != lenb;
1344         }
1345
1346       if (diff)
1347         goto not_equal;
1348
1349       key = key->next;
1350       if (! key)
1351         break;
1352
1353       /* Find the beginning and limit of the next field.  */
1354       if (key->eword != SIZE_MAX)
1355         lima = limfield (a, key), limb = limfield (b, key);
1356       else
1357         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1358
1359       if (key->sword != SIZE_MAX)
1360         texta = begfield (a, key), textb = begfield (b, key);
1361       else
1362         {
1363           texta = a->text, textb = b->text;
1364           if (key->skipsblanks)
1365             {
1366               while (texta < lima && blanks[to_uchar (*texta)])
1367                 ++texta;
1368               while (textb < limb && blanks[to_uchar (*textb)])
1369                 ++textb;
1370             }
1371         }
1372     }
1373
1374   return 0;
1375
1376  greater:
1377   diff = 1;
1378  not_equal:
1379   return key->reverse ? -diff : diff;
1380 }
1381
1382 /* Compare two lines A and B, returning negative, zero, or positive
1383    depending on whether A compares less than, equal to, or greater than B. */
1384
1385 static int
1386 compare (const struct line *a, const struct line *b)
1387 {
1388   int diff;
1389   size_t alen, blen;
1390
1391   /* First try to compare on the specified keys (if any).
1392      The only two cases with no key at all are unadorned sort,
1393      and unadorned sort -r. */
1394   if (keylist)
1395     {
1396       diff = keycompare (a, b);
1397       if (diff | unique | stable)
1398         return diff;
1399     }
1400
1401   /* If the keys all compare equal (or no keys were specified)
1402      fall through to the default comparison.  */
1403   alen = a->length - 1, blen = b->length - 1;
1404
1405   if (alen == 0)
1406     diff = - NONZERO (blen);
1407   else if (blen == 0)
1408     diff = 1;
1409   else if (hard_LC_COLLATE)
1410     diff = xmemcoll (a->text, alen, b->text, blen);
1411   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1412     diff = alen < blen ? -1 : alen != blen;
1413
1414   return reverse ? -diff : diff;
1415 }
1416
1417 /* Check that the lines read from FILE_NAME come in order.  Print a
1418    diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1419    false if they are not in order.  Otherwise, print no diagnostic
1420    and return true.  */
1421
1422 static bool
1423 check (char const *file_name)
1424 {
1425   FILE *fp = xfopen (file_name, "r");
1426   struct buffer buf;            /* Input buffer. */
1427   struct line temp;             /* Copy of previous line. */
1428   size_t alloc = 0;
1429   uintmax_t line_number = 0;
1430   struct keyfield const *key = keylist;
1431   bool nonunique = ! unique;
1432   bool ordered = true;
1433
1434   initbuf (&buf, sizeof (struct line),
1435            MAX (merge_buffer_size, sort_size));
1436   temp.text = NULL;
1437
1438   while (fillbuf (&buf, fp, file_name))
1439     {
1440       struct line const *line = buffer_linelim (&buf);
1441       struct line const *linebase = line - buf.nlines;
1442
1443       /* Make sure the line saved from the old buffer contents is
1444          less than or equal to the first line of the new buffer. */
1445       if (alloc && nonunique <= compare (&temp, line - 1))
1446         {
1447         found_disorder:
1448           {
1449             struct line const *disorder_line = line - 1;
1450             uintmax_t disorder_line_number =
1451               buffer_linelim (&buf) - disorder_line + line_number;
1452             char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1453             fprintf (stderr, _("%s: %s:%s: disorder: "),
1454                      program_name, file_name,
1455                      umaxtostr (disorder_line_number, hr_buf));
1456             write_bytes (disorder_line->text, disorder_line->length, stderr,
1457                          _("standard error"));
1458             ordered = false;
1459             break;
1460           }
1461         }
1462
1463       /* Compare each line in the buffer with its successor.  */
1464       while (linebase < --line)
1465         if (nonunique <= compare (line, line - 1))
1466           goto found_disorder;
1467
1468       line_number += buf.nlines;
1469
1470       /* Save the last line of the buffer.  */
1471       if (alloc < line->length)
1472         {
1473           do
1474             {
1475               alloc *= 2;
1476               if (! alloc)
1477                 {
1478                   alloc = line->length;
1479                   break;
1480                 }
1481             }
1482           while (alloc < line->length);
1483
1484           temp.text = xrealloc (temp.text, alloc);
1485         }
1486       memcpy (temp.text, line->text, line->length);
1487       temp.length = line->length;
1488       if (key)
1489         {
1490           temp.keybeg = temp.text + (line->keybeg - line->text);
1491           temp.keylim = temp.text + (line->keylim - line->text);
1492         }
1493     }
1494
1495   xfclose (fp, file_name);
1496   free (buf.buf);
1497   free (temp.text);
1498   return ordered;
1499 }
1500
1501 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
1502    files (all of which are at the start of the FILES array), and
1503    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
1504    Close input and output files before returning.
1505    OUTPUT_FILE gives the name of the output file.  If it is NULL,
1506    the output file is standard output.  If OFP is NULL, the output
1507    file has not been opened yet (or written to, if standard output).  */
1508
1509 static void
1510 mergefps (char **files, size_t ntemps, size_t nfiles,
1511           FILE *ofp, char const *output_file)
1512 {
1513   FILE *fps[NMERGE];            /* Input streams for each file.  */
1514   struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1515   struct line saved;            /* Saved line storage for unique check. */
1516   struct line const *savedline = NULL;
1517                                 /* &saved if there is a saved line. */
1518   size_t savealloc = 0;         /* Size allocated for the saved line. */
1519   struct line const *cur[NMERGE]; /* Current line in each line table. */
1520   struct line const *base[NMERGE]; /* Base of each line table.  */
1521   size_t ord[NMERGE];           /* Table representing a permutation of fps,
1522                                    such that cur[ord[0]] is the smallest line
1523                                    and will be next output. */
1524   size_t i;
1525   size_t j;
1526   size_t t;
1527   struct keyfield const *key = keylist;
1528   saved.text = NULL;
1529
1530   /* Read initial lines from each input file. */
1531   for (i = 0; i < nfiles; )
1532     {
1533       fps[i] = xfopen (files[i], "r");
1534       initbuf (&buffer[i], sizeof (struct line),
1535                MAX (merge_buffer_size, sort_size / nfiles));
1536       if (fillbuf (&buffer[i], fps[i], files[i]))
1537         {
1538           struct line const *linelim = buffer_linelim (&buffer[i]);
1539           cur[i] = linelim - 1;
1540           base[i] = linelim - buffer[i].nlines;
1541           i++;
1542         }
1543       else
1544         {
1545           /* fps[i] is empty; eliminate it from future consideration.  */
1546           xfclose (fps[i], files[i]);
1547           if (i < ntemps)
1548             {
1549               ntemps--;
1550               zaptemp (files[i]);
1551             }
1552           free (buffer[i].buf);
1553           --nfiles;
1554           for (j = i; j < nfiles; ++j)
1555             files[j] = files[j + 1];
1556         }
1557     }
1558
1559   if (! ofp)
1560     ofp = xfopen (output_file, "w");
1561
1562   /* Set up the ord table according to comparisons among input lines.
1563      Since this only reorders two items if one is strictly greater than
1564      the other, it is stable. */
1565   for (i = 0; i < nfiles; ++i)
1566     ord[i] = i;
1567   for (i = 1; i < nfiles; ++i)
1568     if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
1569       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1570
1571   /* Repeatedly output the smallest line until no input remains. */
1572   while (nfiles)
1573     {
1574       struct line const *smallest = cur[ord[0]];
1575
1576       /* If uniquified output is turned on, output only the first of
1577          an identical series of lines. */
1578       if (unique)
1579         {
1580           if (savedline && compare (savedline, smallest))
1581             {
1582               savedline = NULL;
1583               write_bytes (saved.text, saved.length, ofp, output_file);
1584             }
1585           if (!savedline)
1586             {
1587               savedline = &saved;
1588               if (savealloc < smallest->length)
1589                 {
1590                   do
1591                     if (! savealloc)
1592                       {
1593                         savealloc = smallest->length;
1594                         break;
1595                       }
1596                   while ((savealloc *= 2) < smallest->length);
1597
1598                   saved.text = xrealloc (saved.text, savealloc);
1599                 }
1600               saved.length = smallest->length;
1601               memcpy (saved.text, smallest->text, saved.length);
1602               if (key)
1603                 {
1604                   saved.keybeg =
1605                     saved.text + (smallest->keybeg - smallest->text);
1606                   saved.keylim =
1607                     saved.text + (smallest->keylim - smallest->text);
1608                 }
1609             }
1610         }
1611       else
1612         write_bytes (smallest->text, smallest->length, ofp, output_file);
1613
1614       /* Check if we need to read more lines into core. */
1615       if (base[ord[0]] < smallest)
1616         cur[ord[0]] = smallest - 1;
1617       else
1618         {
1619           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]]))
1620             {
1621               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
1622               cur[ord[0]] = linelim - 1;
1623               base[ord[0]] = linelim - buffer[ord[0]].nlines;
1624             }
1625           else
1626             {
1627               /* We reached EOF on fps[ord[0]].  */
1628               for (i = 1; i < nfiles; ++i)
1629                 if (ord[i] > ord[0])
1630                   --ord[i];
1631               --nfiles;
1632               xfclose (fps[ord[0]], files[ord[0]]);
1633               if (ord[0] < ntemps)
1634                 {
1635                   ntemps--;
1636                   zaptemp (files[ord[0]]);
1637                 }
1638               free (buffer[ord[0]].buf);
1639               for (i = ord[0]; i < nfiles; ++i)
1640                 {
1641                   fps[i] = fps[i + 1];
1642                   files[i] = files[i + 1];
1643                   buffer[i] = buffer[i + 1];
1644                   cur[i] = cur[i + 1];
1645                   base[i] = base[i + 1];
1646                 }
1647               for (i = 0; i < nfiles; ++i)
1648                 ord[i] = ord[i + 1];
1649               continue;
1650             }
1651         }
1652
1653       /* The new line just read in may be larger than other lines
1654          already in main memory; push it back in the queue until we
1655          encounter a line larger than it.  Optimize for the common
1656          case where the new line is smallest.  */
1657       {
1658         size_t lo = 1;
1659         size_t hi = nfiles;
1660         size_t probe = lo;
1661         size_t ord0 = ord[0];
1662         size_t count_of_smaller_lines;
1663
1664         while (lo < hi)
1665           {
1666             int cmp = compare (cur[ord0], cur[ord[probe]]);
1667             if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
1668               hi = probe;
1669             else
1670               lo = probe + 1;
1671             probe = (lo + hi) / 2;
1672           }
1673
1674         count_of_smaller_lines = lo - 1;
1675         for (j = 0; j < count_of_smaller_lines; j++)
1676           ord[j] = ord[j + 1];
1677         ord[count_of_smaller_lines] = ord0;
1678       }
1679     }
1680
1681   if (unique && savedline)
1682     {
1683       write_bytes (saved.text, saved.length, ofp, output_file);
1684       free (saved.text);
1685     }
1686
1687   xfclose (ofp, output_file);
1688 }
1689
1690 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1691    and HI (with NHI members).  T, LO, and HI point just past their
1692    respective arrays, and the arrays are in reverse order.  NLO and
1693    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
1694
1695 static inline void
1696 mergelines (struct line *t,
1697             struct line const *lo, size_t nlo,
1698             struct line const *hi, size_t nhi)
1699 {
1700   for (;;)
1701     if (compare (lo - 1, hi - 1) <= 0)
1702       {
1703         *--t = *--lo;
1704         if (! --nlo)
1705           {
1706             /* HI - NHI equalled T - (NLO + NHI) when this function
1707                began.  Therefore HI must equal T now, and there is no
1708                need to copy from HI to T.  */
1709             return;
1710           }
1711       }
1712     else
1713       {
1714         *--t = *--hi;
1715         if (! --nhi)
1716           {
1717             do
1718               *--t = *--lo;
1719             while (--nlo);
1720
1721             return;
1722           }
1723       }
1724 }
1725
1726 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1727    NLINES must be at least 2.
1728    The input and output arrays are in reverse order, and LINES and
1729    TEMP point just past the end of their respective arrays.
1730
1731    Use a recursive divide-and-conquer algorithm, in the style
1732    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
1733    the optimization suggested by exercise 5.2.4-10; this requires room
1734    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
1735    writes that this memory optimization was originally published by
1736    D. A. Bell, Comp J. 1 (1958), 75.  */
1737
1738 static void
1739 sortlines (struct line *lines, size_t nlines, struct line *temp)
1740 {
1741   if (nlines == 2)
1742     {
1743       if (0 < compare (&lines[-1], &lines[-2]))
1744         {
1745           struct line tmp = lines[-1];
1746           lines[-1] = lines[-2];
1747           lines[-2] = tmp;
1748         }
1749     }
1750   else
1751     {
1752       size_t nlo = nlines / 2;
1753       size_t nhi = nlines - nlo;
1754       struct line *lo = lines;
1755       struct line *hi = lines - nlo;
1756       struct line *sorted_lo = temp;
1757
1758       sortlines (hi, nhi, temp);
1759       if (1 < nlo)
1760         sortlines_temp (lo, nlo, sorted_lo);
1761       else
1762         sorted_lo[-1] = lo[-1];
1763
1764       mergelines (lines, sorted_lo, nlo, hi, nhi);
1765     }
1766 }
1767
1768 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1769    rather than sorting in place.  */
1770
1771 static void
1772 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
1773 {
1774   if (nlines == 2)
1775     {
1776       /* Declare `swap' as int, not bool, to work around a bug
1777          <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
1778          in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
1779       int swap = (0 < compare (&lines[-1], &lines[-2]));
1780       temp[-1] = lines[-1 - swap];
1781       temp[-2] = lines[-2 + swap];
1782     }
1783   else
1784     {
1785       size_t nlo = nlines / 2;
1786       size_t nhi = nlines - nlo;
1787       struct line *lo = lines;
1788       struct line *hi = lines - nlo;
1789       struct line *sorted_hi = temp - nlo;
1790
1791       sortlines_temp (hi, nhi, sorted_hi);
1792       if (1 < nlo)
1793         sortlines (lo, nlo, temp);
1794
1795       mergelines (temp, lo, nlo, sorted_hi, nhi);
1796     }
1797 }
1798
1799 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
1800    the same as OUTFILE.  If found, merge the found instances (and perhaps
1801    some other files) into a temporary file so that it can in turn be
1802    merged into OUTFILE without destroying OUTFILE before it is completely
1803    read.  Return the new value of NFILES, which differs from the old if
1804    some merging occurred.
1805
1806    This test ensures that an otherwise-erroneous use like
1807    "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
1808    It's not clear that POSIX requires this nicety.
1809    Detect common error cases, but don't try to catch obscure cases like
1810    "cat ... FILE ... | sort -m -o FILE"
1811    where traditional "sort" doesn't copy the input and where
1812    people should know that they're getting into trouble anyway.
1813    Catching these obscure cases would slow down performance in
1814    common cases.  */
1815
1816 static size_t
1817 avoid_trashing_input (char **files, size_t ntemps, size_t nfiles,
1818                       char const *outfile)
1819 {
1820   size_t i;
1821   bool got_outstat = false;
1822   struct stat outstat;
1823
1824   for (i = ntemps; i < nfiles; i++)
1825     {
1826       bool is_stdin = STREQ (files[i], "-");
1827       bool same;
1828       struct stat instat;
1829
1830       if (outfile && STREQ (outfile, files[i]) && !is_stdin)
1831         same = true;
1832       else
1833         {
1834           if (! got_outstat)
1835             {
1836               if ((outfile
1837                    ? stat (outfile, &outstat)
1838                    : fstat (STDOUT_FILENO, &outstat))
1839                   != 0)
1840                 break;
1841               got_outstat = true;
1842             }
1843
1844           same = (((is_stdin
1845                     ? fstat (STDIN_FILENO, &instat)
1846                     : stat (files[i], &instat))
1847                    == 0)
1848                   && SAME_INODE (instat, outstat));
1849         }
1850
1851       if (same)
1852         {
1853           FILE *tftp;
1854           char *temp = create_temp_file (&tftp);
1855           mergefps (&files[i], 0, nfiles - i, tftp, temp);
1856           files[i] = temp;
1857           return i + 1;
1858         }
1859     }
1860
1861   return nfiles;
1862 }
1863
1864 /* Merge the input FILES.  NTEMPS is the number of files at the
1865    start of FILES that are temporary; it is zero at the top level.
1866    NFILES is the total number of files.  Put the output in
1867    OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
1868
1869 static void
1870 merge (char **files, size_t ntemps, size_t nfiles, char const *output_file)
1871 {
1872   while (NMERGE < nfiles)
1873     {
1874       /* Number of input files processed so far.  */
1875       size_t in;
1876
1877       /* Number of output files generated so far.  */
1878       size_t out;
1879
1880       /* nfiles % NMERGE; this counts input files that are left over
1881          after all full-sized merges have been done.  */
1882       size_t remainder;
1883
1884       /* Number of easily-available slots at the next loop iteration.  */
1885       size_t cheap_slots;
1886
1887       /* Do as many NMERGE-size merges as possible.  */
1888       for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
1889         {
1890           FILE *tfp;
1891           char *temp = create_temp_file (&tfp);
1892           size_t nt = MIN (ntemps, NMERGE);
1893           ntemps -= nt;
1894           mergefps (&files[in], nt, NMERGE, tfp, temp);
1895           files[out] = temp;
1896         }
1897
1898       remainder = nfiles - in;
1899       cheap_slots = NMERGE - out % NMERGE;
1900
1901       if (cheap_slots < remainder)
1902         {
1903           /* So many files remain that they can't all be put into the last
1904              NMERGE-sized output window.  Do one more merge.  Merge as few
1905              files as possible, to avoid needless I/O.  */
1906           size_t nshortmerge = remainder - cheap_slots + 1;
1907           FILE *tfp;
1908           char *temp = create_temp_file (&tfp);
1909           size_t nt = MIN (ntemps, nshortmerge);
1910           ntemps -= nt;
1911           mergefps (&files[in], nt, nshortmerge, tfp, temp);
1912           files[out++] = temp;
1913           in += nshortmerge;
1914         }
1915
1916       /* Put the remaining input files into the last NMERGE-sized output
1917          window, so they will be merged in the next pass.  */
1918       memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
1919       ntemps += out;
1920       nfiles -= in - out;
1921     }
1922
1923   nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
1924   mergefps (files, ntemps, nfiles, NULL, output_file);
1925 }
1926
1927 /* Sort NFILES FILES onto OUTPUT_FILE. */
1928
1929 static void
1930 sort (char * const *files, size_t nfiles, char const *output_file)
1931 {
1932   struct buffer buf;
1933   size_t ntemps = 0;
1934   bool output_file_created = false;
1935
1936   buf.alloc = 0;
1937
1938   while (nfiles)
1939     {
1940       char const *temp_output;
1941       char const *file = *files;
1942       FILE *fp = xfopen (file, "r");
1943       FILE *tfp;
1944       size_t bytes_per_line = (2 * sizeof (struct line)
1945                                - sizeof (struct line) / 2);
1946
1947       if (! buf.alloc)
1948         initbuf (&buf, bytes_per_line,
1949                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
1950       buf.eof = false;
1951       files++;
1952       nfiles--;
1953
1954       while (fillbuf (&buf, fp, file))
1955         {
1956           struct line *line;
1957           struct line *linebase;
1958
1959           if (buf.eof && nfiles
1960               && (bytes_per_line + 1
1961                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
1962             {
1963               /* End of file, but there is more input and buffer room.
1964                  Concatenate the next input file; this is faster in
1965                  the usual case.  */
1966               buf.left = buf.used;
1967               break;
1968             }
1969
1970           line = buffer_linelim (&buf);
1971           linebase = line - buf.nlines;
1972           if (1 < buf.nlines)
1973             sortlines (line, buf.nlines, linebase);
1974           if (buf.eof && !nfiles && !ntemps && !buf.left)
1975             {
1976               xfclose (fp, file);
1977               tfp = xfopen (output_file, "w");
1978               temp_output = output_file;
1979               output_file_created = true;
1980             }
1981           else
1982             {
1983               ++ntemps;
1984               temp_output = create_temp_file (&tfp);
1985             }
1986
1987           do
1988             {
1989               line--;
1990               write_bytes (line->text, line->length, tfp, temp_output);
1991               if (unique)
1992                 while (linebase < line && compare (line, line - 1) == 0)
1993                   line--;
1994             }
1995           while (linebase < line);
1996
1997           xfclose (tfp, temp_output);
1998
1999           if (output_file_created)
2000             goto finish;
2001         }
2002       xfclose (fp, file);
2003     }
2004
2005  finish:
2006   free (buf.buf);
2007
2008   if (! output_file_created)
2009     {
2010       size_t i;
2011       struct tempnode *node = temphead;
2012       char **tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2013       for (i = 0; node; i++)
2014         {
2015           tempfiles[i] = node->name;
2016           node = node->next;
2017         }
2018       merge (tempfiles, ntemps, ntemps, output_file);
2019       free (tempfiles);
2020     }
2021 }
2022
2023 /* Insert key KEY at the end of the key list.  */
2024
2025 static void
2026 insertkey (struct keyfield *key)
2027 {
2028   struct keyfield **p;
2029
2030   for (p = &keylist; *p; p = &(*p)->next)
2031     continue;
2032   *p = key;
2033   key->next = NULL;
2034 }
2035
2036 /* Report a bad field specification SPEC, with extra info MSGID.  */
2037
2038 static void badfieldspec (char const *, char const *)
2039      ATTRIBUTE_NORETURN;
2040 static void
2041 badfieldspec (char const *spec, char const *msgid)
2042 {
2043   error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
2044          _(msgid), quote (spec));
2045   abort ();
2046 }
2047
2048 /* Report incompatible options.  */
2049
2050 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
2051 static void
2052 incompatible_options (char const *opts)
2053 {
2054   error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
2055   abort ();
2056 }
2057
2058 /* Check compatibility of ordering options.  */
2059
2060 static void
2061 check_ordering_compatibility (void)
2062 {
2063   struct keyfield const *key;
2064
2065   for (key = keylist; key; key = key->next)
2066     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
2067               + !!key->ignore))
2068         || (key->random && key->translate))
2069       {
2070         char opts[7];
2071         char *p = opts;
2072         if (key->ignore == nondictionary)
2073           *p++ = 'd';
2074         if (key->translate)
2075           *p++ = 'f';
2076         if (key->general_numeric)
2077           *p++ = 'g';
2078         if (key->ignore == nonprinting)
2079           *p++ = 'i';
2080         if (key->month)
2081           *p++ = 'M';
2082         if (key->numeric)
2083           *p++ = 'n';
2084         if (key->random)
2085           *p++ = 'R';
2086         *p = '\0';
2087         incompatible_options (opts);
2088       }
2089 }
2090
2091 /* Parse the leading integer in STRING and store the resulting value
2092    (which must fit into size_t) into *VAL.  Return the address of the
2093    suffix after the integer.  If MSGID is NULL, return NULL after
2094    failure; otherwise, report MSGID and exit on failure.  */
2095
2096 static char const *
2097 parse_field_count (char const *string, size_t *val, char const *msgid)
2098 {
2099   char *suffix;
2100   uintmax_t n;
2101
2102   switch (xstrtoumax (string, &suffix, 10, &n, ""))
2103     {
2104     case LONGINT_OK:
2105     case LONGINT_INVALID_SUFFIX_CHAR:
2106       *val = n;
2107       if (*val == n)
2108         break;
2109       /* Fall through.  */
2110     case LONGINT_OVERFLOW:
2111     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2112       if (msgid)
2113         error (SORT_FAILURE, 0, _("%s: count `%.*s' too large"),
2114                _(msgid), (int) (suffix - string), string);
2115       return NULL;
2116
2117     case LONGINT_INVALID:
2118       if (msgid)
2119         error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
2120                _(msgid), quote (string));
2121       return NULL;
2122     }
2123
2124   return suffix;
2125 }
2126
2127 /* Handle interrupts and hangups. */
2128
2129 static void
2130 sighandler (int sig)
2131 {
2132   if (! SA_NOCLDSTOP)
2133     signal (sig, SIG_IGN);
2134
2135   cleanup ();
2136
2137   signal (sig, SIG_DFL);
2138   raise (sig);
2139 }
2140
2141 /* Set the ordering options for KEY specified in S.
2142    Return the address of the first character in S that
2143    is not a valid ordering option.
2144    BLANKTYPE is the kind of blanks that 'b' should skip. */
2145
2146 static char *
2147 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2148 {
2149   while (*s)
2150     {
2151       switch (*s)
2152         {
2153         case 'b':
2154           if (blanktype == bl_start || blanktype == bl_both)
2155             key->skipsblanks = true;
2156           if (blanktype == bl_end || blanktype == bl_both)
2157             key->skipeblanks = true;
2158           break;
2159         case 'd':
2160           key->ignore = nondictionary;
2161           break;
2162         case 'f':
2163           key->translate = fold_toupper;
2164           break;
2165         case 'g':
2166           key->general_numeric = true;
2167           break;
2168         case 'i':
2169           /* Option order should not matter, so don't let -i override
2170              -d.  -d implies -i, but -i does not imply -d.  */
2171           if (! key->ignore)
2172             key->ignore = nonprinting;
2173           break;
2174         case 'M':
2175           key->month = true;
2176           break;
2177         case 'n':
2178           key->numeric = true;
2179           break;
2180         case 'R':
2181           key->random = true;
2182           break;
2183         case 'r':
2184           key->reverse = true;
2185           break;
2186         default:
2187           return (char *) s;
2188         }
2189       ++s;
2190     }
2191   return (char *) s;
2192 }
2193
2194 static struct keyfield *
2195 new_key (void)
2196 {
2197   struct keyfield *key = xzalloc (sizeof *key);
2198   key->eword = SIZE_MAX;
2199   return key;
2200 }
2201
2202 int
2203 main (int argc, char **argv)
2204 {
2205   struct keyfield *key;
2206   struct keyfield gkey;
2207   char const *s;
2208   int c = 0;
2209   bool checkonly = false;
2210   bool mergeonly = false;
2211   char const *seed = NULL;
2212   bool need_random = false;
2213   size_t nfiles = 0;
2214   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2215   bool obsolete_usage = (posix2_version () < 200112);
2216   char *minus = "-", **files;
2217   char const *outfile = NULL;
2218
2219   initialize_main (&argc, &argv);
2220   program_name = argv[0];
2221   setlocale (LC_ALL, "");
2222   bindtextdomain (PACKAGE, LOCALEDIR);
2223   textdomain (PACKAGE);
2224
2225   atexit (cleanup);
2226
2227   initialize_exit_failure (SORT_FAILURE);
2228   atexit (close_stdout);
2229
2230   hard_LC_COLLATE = hard_locale (LC_COLLATE);
2231 #if HAVE_NL_LANGINFO
2232   hard_LC_TIME = hard_locale (LC_TIME);
2233 #endif
2234
2235   /* Get locale's representation of the decimal point.  */
2236   {
2237     struct lconv const *locale = localeconv ();
2238
2239     /* If the locale doesn't define a decimal point, or if the decimal
2240        point is multibyte, use the C locale's decimal point.  FIXME:
2241        add support for multibyte decimal points.  */
2242     decimal_point = to_uchar (locale->decimal_point[0]);
2243     if (! decimal_point || locale->decimal_point[1])
2244       decimal_point = '.';
2245
2246     /* FIXME: add support for multibyte thousands separators.  */
2247     thousands_sep = to_uchar (*locale->thousands_sep);
2248     if (! thousands_sep || locale->thousands_sep[1])
2249       thousands_sep = -1;
2250   }
2251
2252   have_read_stdin = false;
2253   inittables ();
2254
2255   {
2256     size_t i;
2257     static int const sig[] = { SIGHUP, SIGINT, SIGPIPE, SIGTERM };
2258     enum { nsigs = sizeof sig / sizeof sig[0] };
2259
2260 #if SA_NOCLDSTOP
2261     struct sigaction act;
2262
2263     sigemptyset (&caught_signals);
2264     for (i = 0; i < nsigs; i++)
2265       {
2266         sigaction (sig[i], NULL, &act);
2267         if (act.sa_handler != SIG_IGN)
2268           sigaddset (&caught_signals, sig[i]);
2269       }
2270
2271     act.sa_handler = sighandler;
2272     act.sa_mask = caught_signals;
2273     act.sa_flags = 0;
2274
2275     for (i = 0; i < nsigs; i++)
2276       if (sigismember (&caught_signals, sig[i]))
2277         sigaction (sig[i], &act, NULL);
2278 #else
2279     for (i = 0; i < nsigs; i++)
2280       if (signal (sig[i], SIG_IGN) != SIG_IGN)
2281         {
2282           signal (sig[i], sighandler);
2283           siginterrupt (sig[i], 1);
2284         }
2285 #endif
2286   }
2287
2288   gkey.sword = gkey.eword = SIZE_MAX;
2289   gkey.ignore = NULL;
2290   gkey.translate = NULL;
2291   gkey.numeric = gkey.general_numeric = gkey.random = false;
2292   gkey.month = gkey.reverse = false;
2293   gkey.skipsblanks = gkey.skipeblanks = false;
2294
2295   files = xnmalloc (argc, sizeof *files);
2296
2297   for (;;)
2298     {
2299       /* Parse an operand as a file after "--" was seen; or if
2300          pedantic and a file was seen, unless the POSIX version
2301          predates 1003.1-2001 and -c was not seen and the operand is
2302          "-o FILE" or "-oFILE".  */
2303
2304       if (c == -1
2305           || (posixly_correct && nfiles != 0
2306               && ! (obsolete_usage
2307                     && ! checkonly
2308                     && optind != argc
2309                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
2310                     && (argv[optind][2] || optind + 1 != argc)))
2311           || ((c = getopt_long (argc, argv, short_options,
2312                                 long_options, NULL))
2313               == -1))
2314         {
2315           if (argc <= optind)
2316             break;
2317           files[nfiles++] = argv[optind++];
2318         }
2319       else switch (c)
2320         {
2321         case 1:
2322           key = NULL;
2323           if (optarg[0] == '+')
2324             {
2325               bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
2326                                       && ISDIGIT (argv[optind][1]));
2327               obsolete_usage |= minus_pos_usage & ~posixly_correct;
2328               if (obsolete_usage)
2329                 {
2330                   /* Treat +POS1 [-POS2] as a key if possible; but silently
2331                      treat an operand as a file if it is not a valid +POS1.  */
2332                   key = new_key ();
2333                   s = parse_field_count (optarg + 1, &key->sword, NULL);
2334                   if (s && *s == '.')
2335                     s = parse_field_count (s + 1, &key->schar, NULL);
2336                   if (! (key->sword | key->schar))
2337                     key->sword = SIZE_MAX;
2338                   if (! s || *set_ordering (s, key, bl_start))
2339                     {
2340                       free (key);
2341                       key = NULL;
2342                     }
2343                   else
2344                     {
2345                       if (minus_pos_usage)
2346                         {
2347                           char const *optarg1 = argv[optind++];
2348                           s = parse_field_count (optarg1 + 1, &key->eword,
2349                                              N_("invalid number after `-'"));
2350                           if (*s == '.')
2351                             s = parse_field_count (s + 1, &key->echar,
2352                                                N_("invalid number after `.'"));
2353                           if (*set_ordering (s, key, bl_end))
2354                             badfieldspec (optarg1,
2355                                       N_("stray character in field spec"));
2356                         }
2357                       insertkey (key);
2358                     }
2359                 }
2360             }
2361           if (! key)
2362             files[nfiles++] = optarg;
2363           break;
2364
2365         case 'b':
2366         case 'd':
2367         case 'f':
2368         case 'g':
2369         case 'i':
2370         case 'M':
2371         case 'n':
2372         case 'r':
2373         case 'R':
2374           {
2375             char str[2];
2376             str[0] = c;
2377             str[1] = '\0';
2378             set_ordering (str, &gkey, bl_both);
2379           }
2380           break;
2381
2382         case 'c':
2383           checkonly = true;
2384           break;
2385
2386         case 'k':
2387           key = new_key ();
2388
2389           /* Get POS1. */
2390           s = parse_field_count (optarg, &key->sword,
2391                                  N_("invalid number at field start"));
2392           if (! key->sword--)
2393             {
2394               /* Provoke with `sort -k0' */
2395               badfieldspec (optarg, N_("field number is zero"));
2396             }
2397           if (*s == '.')
2398             {
2399               s = parse_field_count (s + 1, &key->schar,
2400                                      N_("invalid number after `.'"));
2401               if (! key->schar--)
2402                 {
2403                   /* Provoke with `sort -k1.0' */
2404                   badfieldspec (optarg, N_("character offset is zero"));
2405                 }
2406             }
2407           if (! (key->sword | key->schar))
2408             key->sword = SIZE_MAX;
2409           s = set_ordering (s, key, bl_start);
2410           if (*s != ',')
2411             {
2412               key->eword = SIZE_MAX;
2413               key->echar = 0;
2414             }
2415           else
2416             {
2417               /* Get POS2. */
2418               s = parse_field_count (s + 1, &key->eword,
2419                                      N_("invalid number after `,'"));
2420               if (! key->eword--)
2421                 {
2422                   /* Provoke with `sort -k1,0' */
2423                   badfieldspec (optarg, N_("field number is zero"));
2424                 }
2425               if (*s == '.')
2426                 s = parse_field_count (s + 1, &key->echar,
2427                                        N_("invalid number after `.'"));
2428               else
2429                 {
2430                   /* `-k 2,3' is equivalent to `+1 -3'.  */
2431                   key->eword++;
2432                 }
2433               s = set_ordering (s, key, bl_end);
2434             }
2435           if (*s)
2436             badfieldspec (optarg, N_("stray character in field spec"));
2437           insertkey (key);
2438           break;
2439
2440         case 'm':
2441           mergeonly = true;
2442           break;
2443
2444         case 'o':
2445           if (outfile && !STREQ (outfile, optarg))
2446             error (SORT_FAILURE, 0, _("multiple output files specified"));
2447           outfile = optarg;
2448           break;
2449
2450         case SEED_OPTION:
2451           seed = optarg;
2452           break;
2453
2454         case 's':
2455           stable = true;
2456           break;
2457
2458         case 'S':
2459           specify_sort_size (optarg);
2460           break;
2461
2462         case 't':
2463           {
2464             char newtab = optarg[0];
2465             if (! newtab)
2466               error (SORT_FAILURE, 0, _("empty tab"));
2467             if (optarg[1])
2468               {
2469                 if (STREQ (optarg, "\\0"))
2470                   newtab = '\0';
2471                 else
2472                   {
2473                     /* Provoke with `sort -txx'.  Complain about
2474                        "multi-character tab" instead of "multibyte tab", so
2475                        that the diagnostic's wording does not need to be
2476                        changed once multibyte characters are supported.  */
2477                     error (SORT_FAILURE, 0, _("multi-character tab %s"),
2478                            quote (optarg));
2479                   }
2480               }
2481             if (tab != TAB_DEFAULT && tab != newtab)
2482               error (SORT_FAILURE, 0, _("incompatible tabs"));
2483             tab = newtab;
2484           }
2485           break;
2486
2487         case 'T':
2488           add_temp_dir (optarg);
2489           break;
2490
2491         case 'u':
2492           unique = true;
2493           break;
2494
2495         case 'y':
2496           /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
2497              through Solaris 7.  It is also accepted by many non-Solaris
2498              "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
2499              -y is marked as obsolete starting with Solaris 8 (1999), but is
2500              still accepted as of Solaris 10 prerelease (2004).
2501
2502              Solaris 2.5.1 "sort -y 100" reads the input file "100", but
2503              emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
2504              and which in general ignores the argument after "-y" if it
2505              consists entirely of digits (it can even be empty).  */
2506           if (optarg == argv[optind - 1])
2507             {
2508               char const *p;
2509               for (p = optarg; ISDIGIT (*p); p++)
2510                 continue;
2511               optind -= (*p != '\0');
2512             }
2513           break;
2514
2515         case 'z':
2516           eolchar = 0;
2517           break;
2518
2519         case_GETOPT_HELP_CHAR;
2520
2521         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2522
2523         default:
2524           usage (SORT_FAILURE);
2525         }
2526     }
2527
2528   /* Inheritance of global options to individual keys. */
2529   for (key = keylist; key; key = key->next)
2530     {
2531       if (! (key->ignore || key->translate
2532              || (key->skipsblanks | key->reverse
2533                  | key->skipeblanks | key->month | key->numeric
2534                  | key->general_numeric
2535                  | key->random)))
2536         {
2537           key->ignore = gkey.ignore;
2538           key->translate = gkey.translate;
2539           key->skipsblanks = gkey.skipsblanks;
2540           key->skipeblanks = gkey.skipeblanks;
2541           key->month = gkey.month;
2542           key->numeric = gkey.numeric;
2543           key->general_numeric = gkey.general_numeric;
2544           key->random = gkey.random;
2545           key->reverse = gkey.reverse;
2546         }
2547
2548       need_random |= key->random;
2549     }
2550
2551   if (!keylist && (gkey.ignore || gkey.translate
2552                    || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
2553                        | gkey.numeric | gkey.general_numeric
2554                        | gkey.random)))
2555     {
2556       insertkey (&gkey);
2557       need_random |= gkey.random;
2558     }
2559
2560   check_ordering_compatibility ();
2561
2562   reverse = gkey.reverse;
2563
2564   if (need_random)
2565     {
2566       if (seed)
2567         {
2568           isaac_seed_start (&rand_state);
2569           isaac_seed_data (&rand_state, seed, strlen (seed));
2570         }
2571       else
2572         isaac_seed (&rand_state);
2573     }
2574
2575   if (temp_dir_count == 0)
2576     {
2577       char const *tmp_dir = getenv ("TMPDIR");
2578       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
2579     }
2580
2581   if (nfiles == 0)
2582     {
2583       nfiles = 1;
2584       files = &minus;
2585     }
2586
2587   if (checkonly)
2588     {
2589       if (nfiles > 1)
2590         error (SORT_FAILURE, 0, _("extra operand %s not allowed with -c"),
2591                quote (files[1]));
2592
2593       if (outfile)
2594         incompatible_options ("co");
2595
2596       /* POSIX requires that sort return 1 IFF invoked with -c and the
2597          input is not properly sorted.  */
2598       exit (check (files[0]) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
2599     }
2600
2601   if (mergeonly)
2602     merge (files, 0, nfiles, outfile);
2603   else
2604     sort (files, nfiles, outfile);
2605
2606   if (have_read_stdin && fclose (stdin) == EOF)
2607     die (_("close failed"), "-");
2608
2609   exit (EXIT_SUCCESS);
2610 }