src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 1988, 1991-2006 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  17
  18    Written December 1988 by Mike Haertel.
  19    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  20    or (US mail) as Mike Haertel c/o Free Software Foundation.
  21
  22    Ørn E. Hansen added NLS support in 1997.  */
  23
  24 #include <config.h>
  25
  26 #include <getopt.h>
  27 #include <sys/types.h>
  28 #include <signal.h>
  29 #include "system.h"
  30 #include "error.h"
  31 #include "hard-locale.h"
  32 #include "inttostr.h"
  33 #include "md5.h"
  34 #include "physmem.h"
  35 #include "posixver.h"
  36 #include "quote.h"
  37 #include "randread.h"
  38 #include "stdio--.h"
  39 #include "stdlib--.h"
  40 #include "strnumcmp.h"
  41 #include "xmemcoll.h"
  42 #include "xmemxfrm.h"
  43 #include "xstrtol.h"
  44
  45 #if HAVE_SYS_RESOURCE_H
  46 # include <sys/resource.h>
  47 #endif
  48 #ifndef RLIMIT_DATA
  49 struct rlimit { size_t rlim_cur; };
  50 # define getrlimit(Resource, Rlp) (-1)
  51 #endif
  52
  53 /* The official name of this program (e.g., no `g' prefix).  */
  54 #define PROGRAM_NAME "sort"
  55
  56 #define AUTHORS "Mike Haertel", "Paul Eggert"
  57
  58 #if HAVE_LANGINFO_CODESET
  59 # include <langinfo.h>
  60 #endif
  61
  62 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
  63    present.  */
  64 #ifndef SA_NOCLDSTOP
  65 # define SA_NOCLDSTOP 0
  66 # define sigprocmask(How, Set, Oset) /* empty */
  67 # define sigset_t int
  68 # if ! HAVE_SIGINTERRUPT
  69 #  define siginterrupt(sig, flag) /* empty */
  70 # endif
  71 #endif
  72
  73 #ifndef STDC_HEADERS
  74 double strtod ();
  75 #endif
  76
  77 #define UCHAR_LIM (UCHAR_MAX + 1)
  78
  79 #ifndef DEFAULT_TMPDIR
  80 # define DEFAULT_TMPDIR "/tmp"
  81 #endif
  82
  83 /* Exit statuses.  */
  84 enum
  85   {
  86     /* POSIX says to exit with status 1 if invoked with -c and the
  87        input is not properly sorted.  */
  88     SORT_OUT_OF_ORDER = 1,
  89
  90     /* POSIX says any other irregular exit must exit with a status
  91        code greater than 1.  */
  92     SORT_FAILURE = 2
  93   };
  94
  95 /* The representation of the decimal point in the current locale.  */
  96 static int decimal_point;
  97
  98 /* Thousands separator; if -1, then there isn't one.  */
  99 static int thousands_sep;
 100
 101 /* Nonzero if the corresponding locales are hard.  */
 102 static bool hard_LC_COLLATE;
 103 #if HAVE_NL_LANGINFO
 104 static bool hard_LC_TIME;
 105 #endif
 106
 107 #define NONZERO(x) ((x) != 0)
 108
 109 /* The kind of blanks for '-b' to skip in various options. */
 110 enum blanktype { bl_start, bl_end, bl_both };
 111
 112 /* The character marking end of line. Default to \n. */
 113 static char eolchar = '\n';
 114
 115 /* Lines are held in core as counted strings. */
 116 struct line
 117 {
 118   char *text;                   /* Text of the line. */
 119   size_t length;                /* Length including final newline. */
 120   char *keybeg;                 /* Start of first key. */
 121   char *keylim;                 /* Limit of first key. */
 122 };
 123
 124 /* Input buffers. */
 125 struct buffer
 126 {
 127   char *buf;                    /* Dynamically allocated buffer,
 128                                    partitioned into 3 regions:
 129                                    - input data;
 130                                    - unused area;
 131                                    - an array of lines, in reverse order.  */
 132   size_t used;                  /* Number of bytes used for input data.  */
 133   size_t nlines;                /* Number of lines in the line array.  */
 134   size_t alloc;                 /* Number of bytes allocated. */
 135   size_t left;                  /* Number of bytes left from previous reads. */
 136   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 137   bool eof;                     /* An EOF has been read.  */
 138 };
 139
 140 struct keyfield
 141 {
 142   size_t sword;                 /* Zero-origin 'word' to start at. */
 143   size_t schar;                 /* Additional characters to skip. */
 144   size_t eword;                 /* Zero-origin first word after field. */
 145   size_t echar;                 /* Additional characters in field. */
 146   bool const *ignore;           /* Boolean array of characters to ignore. */
 147   char const *translate;        /* Translation applied to characters. */
 148   bool skipsblanks;             /* Skip leading blanks when finding start.  */
 149   bool skipeblanks;             /* Skip leading blanks when finding end.  */
 150   bool numeric;                 /* Flag for numeric comparison.  Handle
 151                                    strings of digits with optional decimal
 152                                    point, but no exponential notation. */
 153   bool random;                  /* Sort by random hash of key.  */
 154   bool general_numeric;         /* Flag for general, numeric comparison.
 155                                    Handle numbers in exponential notation. */
 156   bool month;                   /* Flag for comparison by month name. */
 157   bool reverse;                 /* Reverse the sense of comparison. */
 158   struct keyfield *next;        /* Next keyfield to try. */
 159 };
 160
 161 struct month
 162 {
 163   char const *name;
 164   int val;
 165 };
 166
 167 /* The name this program was run with. */
 168 char *program_name;
 169
 170 /* FIXME: None of these tables work with multibyte character sets.
 171    Also, there are many other bugs when handling multibyte characters.
 172    One way to fix this is to rewrite `sort' to use wide characters
 173    internally, but doing this with good performance is a bit
 174    tricky.  */
 175
 176 /* Table of blanks.  */
 177 static bool blanks[UCHAR_LIM];
 178
 179 /* Table of non-printing characters. */
 180 static bool nonprinting[UCHAR_LIM];
 181
 182 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 183 static bool nondictionary[UCHAR_LIM];
 184
 185 /* Translation table folding lower case to upper.  */
 186 static char fold_toupper[UCHAR_LIM];
 187
 188 #define MONTHS_PER_YEAR 12
 189
 190 /* Table mapping month names to integers.
 191    Alphabetic order allows binary search. */
 192 static struct month monthtab[] =
 193 {
 194   {"APR", 4},
 195   {"AUG", 8},
 196   {"DEC", 12},
 197   {"FEB", 2},
 198   {"JAN", 1},
 199   {"JUL", 7},
 200   {"JUN", 6},
 201   {"MAR", 3},
 202   {"MAY", 5},
 203   {"NOV", 11},
 204   {"OCT", 10},
 205   {"SEP", 9}
 206 };
 207
 208 /* During the merge phase, the number of files to merge at once. */
 209 #define NMERGE 16
 210
 211 /* Minimum size for a merge or check buffer.  */
 212 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 213
 214 /* Minimum sort size; the code might not work with smaller sizes.  */
 215 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
 216
 217 /* The number of bytes needed for a merge or check buffer, which can
 218    function relatively efficiently even if it holds only one line.  If
 219    a longer line is seen, this value is increased.  */
 220 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 221
 222 /* The approximate maximum number of bytes of main memory to use, as
 223    specified by the user.  Zero if the user has not specified a size.  */
 224 static size_t sort_size;
 225
 226 /* The guessed size for non-regular files.  */
 227 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 228
 229 /* Array of directory names in which any temporary files are to be created. */
 230 static char const **temp_dirs;
 231
 232 /* Number of temporary directory names used.  */
 233 static size_t temp_dir_count;
 234
 235 /* Number of allocated slots in temp_dirs.  */
 236 static size_t temp_dir_alloc;
 237
 238 /* Flag to reverse the order of all comparisons. */
 239 static bool reverse;
 240
 241 /* Flag for stable sort.  This turns off the last ditch bytewise
 242    comparison of lines, and instead leaves lines in the same order
 243    they were read if all keys compare equal.  */
 244 static bool stable;
 245
 246 /* If TAB has this value, blanks separate fields.  */
 247 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 248
 249 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 250    separated by the empty string between a non-blank character and a blank
 251    character. */
 252 static int tab = TAB_DEFAULT;
 253
 254 /* Flag to remove consecutive duplicate lines from the output.
 255    Only the last of a sequence of equal lines will be output. */
 256 static bool unique;
 257
 258 /* Nonzero if any of the input files are the standard input. */
 259 static bool have_read_stdin;
 260
 261 /* List of key field comparisons to be tried.  */
 262 static struct keyfield *keylist;
 263
 264 static void sortlines_temp (struct line *, size_t, struct line *);
 265
 266 /* Report MESSAGE for FILE, then clean up and exit.
 267    If FILE is null, it represents standard output.  */
 268
 269 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 270 static void
 271 die (char const *message, char const *file)
 272 {
 273   error (0, errno, "%s: %s", message, file ? file : _("standard output"));
 274   exit (SORT_FAILURE);
 275 }
 276
 277 void
 278 usage (int status)
 279 {
 280   if (status != EXIT_SUCCESS)
 281     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 282              program_name);
 283   else
 284     {
 285       printf (_("\
 286 Usage: %s [OPTION]... [FILE]...\n\
 287 "),
 288               program_name);
 289       fputs (_("\
 290 Write sorted concatenation of all FILE(s) to standard output.\n\
 291 \n\
 292 "), stdout);
 293       fputs (_("\
 294 Mandatory arguments to long options are mandatory for short options too.\n\
 295 "), stdout);
 296       fputs (_("\
 297 Ordering options:\n\
 298 \n\
 299 "), stdout);
 300       fputs (_("\
 301   -b, --ignore-leading-blanks  ignore leading blanks\n\
 302   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 303   -f, --ignore-case           fold lower case to upper case characters\n\
 304 "), stdout);
 305       fputs (_("\
 306   -g, --general-numeric-sort  compare according to general numerical value\n\
 307   -i, --ignore-nonprinting    consider only printable characters\n\
 308   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 309   -n, --numeric-sort          compare according to string numerical value\n\
 310   -R, --random-sort           sort by random hash of keys\n\
 311       --random-source=FILE    get random bytes from FILE (default /dev/urandom)\n\
 312   -r, --reverse               reverse the result of comparisons\n\
 313 \n\
 314 "), stdout);
 315       fputs (_("\
 316 Other options:\n\
 317 \n\
 318   -c, --check               check whether input is sorted; do not sort\n\
 319   -k, --key=POS1[,POS2]     start a key at POS1, end it at POS2 (origin 1)\n\
 320   -m, --merge               merge already sorted files; do not sort\n\
 321   -o, --output=FILE         write result to FILE instead of standard output\n\
 322   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 323   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 324 "), stdout);
 325       printf (_("\
 326   -t, --field-separator=SEP  use SEP instead of non-blank to blank transition\n\
 327   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 328                               multiple options specify multiple directories\n\
 329   -u, --unique              with -c, check for strict ordering;\n\
 330                               without -c, output only the first of an equal run\n\
 331 "), DEFAULT_TMPDIR);
 332       fputs (_("\
 333   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 334 "), stdout);
 335       fputs (HELP_OPTION_DESCRIPTION, stdout);
 336       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 337       fputs (_("\
 338 \n\
 339 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 340 in the field; both are origin 1.  If neither -t nor -b is in effect, characters\n\
 341 in a field are counted from the beginning of the preceding whitespace.  OPTS is\n\
 342 one or more single-letter ordering options, which override global ordering\n\
 343 options for that key.  If no key is given, use the entire line as the key.\n\
 344 \n\
 345 SIZE may be followed by the following multiplicative suffixes:\n\
 346 "), stdout);
 347       fputs (_("\
 348 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 349 \n\
 350 With no FILE, or when FILE is -, read standard input.\n\
 351 \n\
 352 *** WARNING ***\n\
 353 The locale specified by the environment affects sort order.\n\
 354 Set LC_ALL=C to get the traditional sort order that uses\n\
 355 native byte values.\n\
 356 "), stdout );
 357       printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 358     }
 359
 360   exit (status);
 361 }
 362
 363 /* For long options that have no equivalent short option, use a
 364    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 365 enum
 366 {
 367   RANDOM_SOURCE_OPTION = CHAR_MAX + 1
 368 };
 369
 370 static char const short_options[] = "-bcdfgik:mMno:rRsS:t:T:uy:z";
 371
 372 static struct option const long_options[] =
 373 {
 374   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 375   {"check", no_argument, NULL, 'c'},
 376   {"dictionary-order", no_argument, NULL, 'd'},
 377   {"ignore-case", no_argument, NULL, 'f'},
 378   {"general-numeric-sort", no_argument, NULL, 'g'},
 379   {"ignore-nonprinting", no_argument, NULL, 'i'},
 380   {"key", required_argument, NULL, 'k'},
 381   {"merge", no_argument, NULL, 'm'},
 382   {"month-sort", no_argument, NULL, 'M'},
 383   {"numeric-sort", no_argument, NULL, 'n'},
 384   {"random-sort", no_argument, NULL, 'R'},
 385   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
 386   {"output", required_argument, NULL, 'o'},
 387   {"reverse", no_argument, NULL, 'r'},
 388   {"stable", no_argument, NULL, 's'},
 389   {"buffer-size", required_argument, NULL, 'S'},
 390   {"field-separator", required_argument, NULL, 't'},
 391   {"temporary-directory", required_argument, NULL, 'T'},
 392   {"unique", no_argument, NULL, 'u'},
 393   {"zero-terminated", no_argument, NULL, 'z'},
 394   {GETOPT_HELP_OPTION_DECL},
 395   {GETOPT_VERSION_OPTION_DECL},
 396   {NULL, 0, NULL, 0},
 397 };
 398
 399 /* The set of signals that are caught.  */
 400 static sigset_t caught_signals;
 401
 402 /* The list of temporary files. */
 403 struct tempnode
 404 {
 405   struct tempnode *volatile next;
 406   char name[1];  /* Actual size is 1 + file name length.  */
 407 };
 408 static struct tempnode *volatile temphead;
 409 static struct tempnode *volatile *temptail = &temphead;
 410
 411 /* Clean up any remaining temporary files.  */
 412
 413 static void
 414 cleanup (void)
 415 {
 416   struct tempnode const *node;
 417
 418   for (node = temphead; node; node = node->next)
 419     unlink (node->name);
 420 }
 421
 422 /* Create a new temporary file, returning its newly allocated name.
 423    Store into *PFP a stream open for writing.  */
 424
 425 static char *
 426 create_temp_file (FILE **pfp)
 427 {
 428   static char const slashbase[] = "/sortXXXXXX";
 429   static size_t temp_dir_index;
 430   sigset_t oldset;
 431   int fd;
 432   int saved_errno;
 433   char const *temp_dir = temp_dirs[temp_dir_index];
 434   size_t len = strlen (temp_dir);
 435   struct tempnode *node =
 436     xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
 437   char *file = node->name;
 438
 439   memcpy (file, temp_dir, len);
 440   memcpy (file + len, slashbase, sizeof slashbase);
 441   node->next = NULL;
 442   if (++temp_dir_index == temp_dir_count)
 443     temp_dir_index = 0;
 444
 445   /* Create the temporary file in a critical section, to avoid races.  */
 446   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 447   fd = mkstemp (file);
 448   if (0 <= fd)
 449     {
 450       *temptail = node;
 451       temptail = &node->next;
 452     }
 453   saved_errno = errno;
 454   sigprocmask (SIG_SETMASK, &oldset, NULL);
 455   errno = saved_errno;
 456
 457   if (fd < 0 || (*pfp = fdopen (fd, "w")) == NULL)
 458     die (_("cannot create temporary file"), file);
 459
 460   return file;
 461 }
 462
 463 /* Return a stream for FILE, opened with mode HOW.  A null FILE means
 464    standard output; HOW should be "w".  When opening for input, "-"
 465    means standard input.  To avoid confusion, do not return file
 466    descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
 467    opening an ordinary FILE.  */
 468
 469 static FILE *
 470 xfopen (const char *file, const char *how)
 471 {
 472   FILE *fp;
 473
 474   if (!file)
 475     fp = stdout;
 476   else if (STREQ (file, "-") && *how == 'r')
 477     {
 478       have_read_stdin = true;
 479       fp = stdin;
 480     }
 481   else
 482     {
 483       fp = fopen (file, how);
 484       if (! fp)
 485         die (_("open failed"), file);
 486     }
 487
 488   return fp;
 489 }
 490
 491 /* Close FP, whose name is FILE, and report any errors.  */
 492
 493 static void
 494 xfclose (FILE *fp, char const *file)
 495 {
 496   switch (fileno (fp))
 497     {
 498     case STDIN_FILENO:
 499       /* Allow reading stdin from tty more than once.  */
 500       if (feof (fp))
 501         clearerr (fp);
 502       break;
 503
 504     case STDOUT_FILENO:
 505       /* Don't close stdout just yet.  close_stdout does that.  */
 506       if (fflush (fp) != 0)
 507         die (_("fflush failed"), file);
 508       break;
 509
 510     default:
 511       if (fclose (fp) != 0)
 512         die (_("close failed"), file);
 513       break;
 514     }
 515 }
 516
 517 static void
 518 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
 519 {
 520   if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
 521     die (_("write failed"), output_file);
 522 }
 523
 524 /* Append DIR to the array of temporary directory names.  */
 525 static void
 526 add_temp_dir (char const *dir)
 527 {
 528   if (temp_dir_count == temp_dir_alloc)
 529     temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
 530
 531   temp_dirs[temp_dir_count++] = dir;
 532 }
 533
 534 /* Remove NAME from the list of temporary files.  */
 535
 536 static void
 537 zaptemp (const char *name)
 538 {
 539   struct tempnode *volatile *pnode;
 540   struct tempnode *node;
 541   struct tempnode *next;
 542   sigset_t oldset;
 543   int unlink_status;
 544   int unlink_errno = 0;
 545
 546   for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
 547     continue;
 548
 549   /* Unlink the temporary file in a critical section to avoid races.  */
 550   next = node->next;
 551   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 552   unlink_status = unlink (name);
 553   unlink_errno = errno;
 554   *pnode = next;
 555   sigprocmask (SIG_SETMASK, &oldset, NULL);
 556
 557   if (unlink_status != 0)
 558     error (0, unlink_errno, _("warning: cannot remove: %s"), name);
 559   if (! next)
 560     temptail = pnode;
 561   free (node);
 562 }
 563
 564 #if HAVE_NL_LANGINFO
 565
 566 static int
 567 struct_month_cmp (const void *m1, const void *m2)
 568 {
 569   struct month const *month1 = m1;
 570   struct month const *month2 = m2;
 571   return strcmp (month1->name, month2->name);
 572 }
 573
 574 #endif
 575
 576 /* Initialize the character class tables. */
 577
 578 static void
 579 inittables (void)
 580 {
 581   size_t i;
 582
 583   for (i = 0; i < UCHAR_LIM; ++i)
 584     {
 585       blanks[i] = !! isblank (i);
 586       nonprinting[i] = ! isprint (i);
 587       nondictionary[i] = ! isalnum (i) && ! isblank (i);
 588       fold_toupper[i] = toupper (i);
 589     }
 590
 591 #if HAVE_NL_LANGINFO
 592   /* If we're not in the "C" locale, read different names for months.  */
 593   if (hard_LC_TIME)
 594     {
 595       for (i = 0; i < MONTHS_PER_YEAR; i++)
 596         {
 597           char const *s;
 598           size_t s_len;
 599           size_t j;
 600           char *name;
 601
 602           s = (char *) nl_langinfo (ABMON_1 + i);
 603           s_len = strlen (s);
 604           monthtab[i].name = name = xmalloc (s_len + 1);
 605           monthtab[i].val = i + 1;
 606
 607           for (j = 0; j < s_len; j++)
 608             name[j] = fold_toupper[to_uchar (s[j])];
 609           name[j] = '\0';
 610         }
 611       qsort ((void *) monthtab, MONTHS_PER_YEAR,
 612              sizeof *monthtab, struct_month_cmp);
 613     }
 614 #endif
 615 }
 616
 617 /* Specify the amount of main memory to use when sorting.  */
 618 static void
 619 specify_sort_size (char const *s)
 620 {
 621   uintmax_t n;
 622   char *suffix;
 623   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
 624
 625   /* The default unit is KiB.  */
 626   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
 627     {
 628       if (n <= UINTMAX_MAX / 1024)
 629         n *= 1024;
 630       else
 631         e = LONGINT_OVERFLOW;
 632     }
 633
 634   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
 635   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
 636     switch (suffix[0])
 637       {
 638       case 'b':
 639         e = LONGINT_OK;
 640         break;
 641
 642       case '%':
 643         {
 644           double mem = physmem_total () * n / 100;
 645
 646           /* Use "<", not "<=", to avoid problems with rounding.  */
 647           if (mem < UINTMAX_MAX)
 648             {
 649               n = mem;
 650               e = LONGINT_OK;
 651             }
 652           else
 653             e = LONGINT_OVERFLOW;
 654         }
 655         break;
 656       }
 657
 658   if (e == LONGINT_OK)
 659     {
 660       /* If multiple sort sizes are specified, take the maximum, so
 661          that option order does not matter.  */
 662       if (n < sort_size)
 663         return;
 664
 665       sort_size = n;
 666       if (sort_size == n)
 667         {
 668           sort_size = MAX (sort_size, MIN_SORT_SIZE);
 669           return;
 670         }
 671
 672       e = LONGINT_OVERFLOW;
 673     }
 674
 675   STRTOL_FATAL_ERROR (s, _("sort size"), e);
 676 }
 677
 678 /* Return the default sort size.  */
 679 static size_t
 680 default_sort_size (void)
 681 {
 682   /* Let MEM be available memory or 1/8 of total memory, whichever
 683      is greater.  */
 684   double avail = physmem_available ();
 685   double total = physmem_total ();
 686   double mem = MAX (avail, total / 8);
 687   struct rlimit rlimit;
 688
 689   /* Let SIZE be MEM, but no more than the maximum object size or
 690      system resource limits.  Avoid the MIN macro here, as it is not
 691      quite right when only one argument is floating point.  Don't
 692      bother to check for values like RLIM_INFINITY since in practice
 693      they are not much less than SIZE_MAX.  */
 694   size_t size = SIZE_MAX;
 695   if (mem < size)
 696     size = mem;
 697   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
 698     size = rlimit.rlim_cur;
 699 #ifdef RLIMIT_AS
 700   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
 701     size = rlimit.rlim_cur;
 702 #endif
 703
 704   /* Leave a large safety margin for the above limits, as failure can
 705      occur when they are exceeded.  */
 706   size /= 2;
 707
 708 #ifdef RLIMIT_RSS
 709   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
 710      Exceeding RSS is not fatal, but can be quite slow.  */
 711   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
 712     size = rlimit.rlim_cur / 16 * 15;
 713 #endif
 714
 715   /* Use no less than the minimum.  */
 716   return MAX (size, MIN_SORT_SIZE);
 717 }
 718
 719 /* Return the sort buffer size to use with the input files identified
 720    by FPS and FILES, which are alternate names of the same files.
 721    NFILES gives the number of input files; NFPS may be less.  Assume
 722    that each input line requires LINE_BYTES extra bytes' worth of line
 723    information.  Do not exceed the size bound specified by the user
 724    (or a default size bound, if the user does not specify one).  */
 725
 726 static size_t
 727 sort_buffer_size (FILE *const *fps, size_t nfps,
 728                   char *const *files, size_t nfiles,
 729                   size_t line_bytes)
 730 {
 731   /* A bound on the input size.  If zero, the bound hasn't been
 732      determined yet.  */
 733   static size_t size_bound;
 734
 735   /* In the worst case, each input byte is a newline.  */
 736   size_t worst_case_per_input_byte = line_bytes + 1;
 737
 738   /* Keep enough room for one extra input line and an extra byte.
 739      This extra room might be needed when preparing to read EOF.  */
 740   size_t size = worst_case_per_input_byte + 1;
 741
 742   size_t i;
 743
 744   for (i = 0; i < nfiles; i++)
 745     {
 746       struct stat st;
 747       off_t file_size;
 748       size_t worst_case;
 749
 750       if ((i < nfps ? fstat (fileno (fps[i]), &st)
 751            : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
 752            : stat (files[i], &st))
 753           != 0)
 754         die (_("stat failed"), files[i]);
 755
 756       if (S_ISREG (st.st_mode))
 757         file_size = st.st_size;
 758       else
 759         {
 760           /* The file has unknown size.  If the user specified a sort
 761              buffer size, use that; otherwise, guess the size.  */
 762           if (sort_size)
 763             return sort_size;
 764           file_size = INPUT_FILE_SIZE_GUESS;
 765         }
 766
 767       if (! size_bound)
 768         {
 769           size_bound = sort_size;
 770           if (! size_bound)
 771             size_bound = default_sort_size ();
 772         }
 773
 774       /* Add the amount of memory needed to represent the worst case
 775          where the input consists entirely of newlines followed by a
 776          single non-newline.  Check for overflow.  */
 777       worst_case = file_size * worst_case_per_input_byte + 1;
 778       if (file_size != worst_case / worst_case_per_input_byte
 779           || size_bound - size <= worst_case)
 780         return size_bound;
 781       size += worst_case;
 782     }
 783
 784   return size;
 785 }
 786
 787 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
 788    must be at least sizeof (struct line).  Allocate ALLOC bytes
 789    initially.  */
 790
 791 static void
 792 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
 793 {
 794   /* Ensure that the line array is properly aligned.  If the desired
 795      size cannot be allocated, repeatedly halve it until allocation
 796      succeeds.  The smaller allocation may hurt overall performance,
 797      but that's better than failing.  */
 798   for (;;)
 799     {
 800       alloc += sizeof (struct line) - alloc % sizeof (struct line);
 801       buf->buf = malloc (alloc);
 802       if (buf->buf)
 803         break;
 804       alloc /= 2;
 805       if (alloc <= line_bytes + 1)
 806         xalloc_die ();
 807     }
 808
 809   buf->line_bytes = line_bytes;
 810   buf->alloc = alloc;
 811   buf->used = buf->left = buf->nlines = 0;
 812   buf->eof = false;
 813 }
 814
 815 /* Return one past the limit of the line array.  */
 816
 817 static inline struct line *
 818 buffer_linelim (struct buffer const *buf)
 819 {
 820   return (struct line *) (buf->buf + buf->alloc);
 821 }
 822
 823 /* Return a pointer to the first character of the field specified
 824    by KEY in LINE. */
 825
 826 static char *
 827 begfield (const struct line *line, const struct keyfield *key)
 828 {
 829   char *ptr = line->text, *lim = ptr + line->length - 1;
 830   size_t sword = key->sword;
 831   size_t schar = key->schar;
 832   size_t remaining_bytes;
 833
 834   /* The leading field separator itself is included in a field when -t
 835      is absent.  */
 836
 837   if (tab != TAB_DEFAULT)
 838     while (ptr < lim && sword--)
 839       {
 840         while (ptr < lim && *ptr != tab)
 841           ++ptr;
 842         if (ptr < lim)
 843           ++ptr;
 844       }
 845   else
 846     while (ptr < lim && sword--)
 847       {
 848         while (ptr < lim && blanks[to_uchar (*ptr)])
 849           ++ptr;
 850         while (ptr < lim && !blanks[to_uchar (*ptr)])
 851           ++ptr;
 852       }
 853
 854   if (key->skipsblanks)
 855     while (ptr < lim && blanks[to_uchar (*ptr)])
 856       ++ptr;
 857
 858   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
 859   remaining_bytes = lim - ptr;
 860   if (schar < remaining_bytes)
 861     ptr += schar;
 862   else
 863     ptr = lim;
 864
 865   return ptr;
 866 }
 867
 868 /* Return the limit of (a pointer to the first character after) the field
 869    in LINE specified by KEY. */
 870
 871 static char *
 872 limfield (const struct line *line, const struct keyfield *key)
 873 {
 874   char *ptr = line->text, *lim = ptr + line->length - 1;
 875   size_t eword = key->eword, echar = key->echar;
 876   size_t remaining_bytes;
 877
 878   /* Move PTR past EWORD fields or to one past the last byte on LINE,
 879      whichever comes first.  If there are more than EWORD fields, leave
 880      PTR pointing at the beginning of the field having zero-based index,
 881      EWORD.  If a delimiter character was specified (via -t), then that
 882      `beginning' is the first character following the delimiting TAB.
 883      Otherwise, leave PTR pointing at the first `blank' character after
 884      the preceding field.  */
 885   if (tab != TAB_DEFAULT)
 886     while (ptr < lim && eword--)
 887       {
 888         while (ptr < lim && *ptr != tab)
 889           ++ptr;
 890         if (ptr < lim && (eword | echar))
 891           ++ptr;
 892       }
 893   else
 894     while (ptr < lim && eword--)
 895       {
 896         while (ptr < lim && blanks[to_uchar (*ptr)])
 897           ++ptr;
 898         while (ptr < lim && !blanks[to_uchar (*ptr)])
 899           ++ptr;
 900       }
 901
 902 #ifdef POSIX_UNSPECIFIED
 903   /* The following block of code makes GNU sort incompatible with
 904      standard Unix sort, so it's ifdef'd out for now.
 905      The POSIX spec isn't clear on how to interpret this.
 906      FIXME: request clarification.
 907
 908      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
 909      Date: Thu, 30 May 96 12:20:41 -0400
 910      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
 911
 912      [...]I believe I've found another bug in `sort'.
 913
 914      $ cat /tmp/sort.in
 915      a b c 2 d
 916      pq rs 1 t
 917      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
 918      a b c 2 d
 919      pq rs 1 t
 920      $ /bin/sort -k1.7,1.7 </tmp/sort.in
 921      pq rs 1 t
 922      a b c 2 d
 923
 924      Unix sort produced the answer I expected: sort on the single character
 925      in column 7.  GNU sort produced different results, because it disagrees
 926      on the interpretation of the key-end spec "M.N".  Unix sort reads this
 927      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
 928      "skip M-1 fields, then either N-1 characters or the rest of the current
 929      field, whichever comes first".  This extra clause applies only to
 930      key-ends, not key-starts.
 931      */
 932
 933   /* Make LIM point to the end of (one byte past) the current field.  */
 934   if (tab != TAB_DEFAULT)
 935     {
 936       char *newlim;
 937       newlim = memchr (ptr, tab, lim - ptr);
 938       if (newlim)
 939         lim = newlim;
 940     }
 941   else
 942     {
 943       char *newlim;
 944       newlim = ptr;
 945       while (newlim < lim && blanks[to_uchar (*newlim)])
 946         ++newlim;
 947       while (newlim < lim && !blanks[to_uchar (*newlim)])
 948         ++newlim;
 949       lim = newlim;
 950     }
 951 #endif
 952
 953   /* If we're ignoring leading blanks when computing the End
 954      of the field, don't start counting bytes until after skipping
 955      past any leading blanks. */
 956   if (key->skipeblanks)
 957     while (ptr < lim && blanks[to_uchar (*ptr)])
 958       ++ptr;
 959
 960   /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
 961   remaining_bytes = lim - ptr;
 962   if (echar < remaining_bytes)
 963     ptr += echar;
 964   else
 965     ptr = lim;
 966
 967   return ptr;
 968 }
 969
 970 /* Fill BUF reading from FP, moving buf->left bytes from the end
 971    of buf->buf to the beginning first.  If EOF is reached and the
 972    file wasn't terminated by a newline, supply one.  Set up BUF's line
 973    table too.  FILE is the name of the file corresponding to FP.
 974    Return true if some input was read.  */
 975
 976 static bool
 977 fillbuf (struct buffer *buf, FILE *fp, char const *file)
 978 {
 979   struct keyfield const *key = keylist;
 980   char eol = eolchar;
 981   size_t line_bytes = buf->line_bytes;
 982   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
 983
 984   if (buf->eof)
 985     return false;
 986
 987   if (buf->used != buf->left)
 988     {
 989       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
 990       buf->used = buf->left;
 991       buf->nlines = 0;
 992     }
 993
 994   for (;;)
 995     {
 996       char *ptr = buf->buf + buf->used;
 997       struct line *linelim = buffer_linelim (buf);
 998       struct line *line = linelim - buf->nlines;
 999       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1000       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1001
1002       while (line_bytes + 1 < avail)
1003         {
1004           /* Read as many bytes as possible, but do not read so many
1005              bytes that there might not be enough room for the
1006              corresponding line array.  The worst case is when the
1007              rest of the input file consists entirely of newlines,
1008              except that the last byte is not a newline.  */
1009           size_t readsize = (avail - 1) / (line_bytes + 1);
1010           size_t bytes_read = fread (ptr, 1, readsize, fp);
1011           char *ptrlim = ptr + bytes_read;
1012           char *p;
1013           avail -= bytes_read;
1014
1015           if (bytes_read != readsize)
1016             {
1017               if (ferror (fp))
1018                 die (_("read failed"), file);
1019               if (feof (fp))
1020                 {
1021                   buf->eof = true;
1022                   if (buf->buf == ptrlim)
1023                     return false;
1024                   if (ptrlim[-1] != eol)
1025                     *ptrlim++ = eol;
1026                 }
1027             }
1028
1029           /* Find and record each line in the just-read input.  */
1030           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1031             {
1032               ptr = p + 1;
1033               line--;
1034               line->text = line_start;
1035               line->length = ptr - line_start;
1036               mergesize = MAX (mergesize, line->length);
1037               avail -= line_bytes;
1038
1039               if (key)
1040                 {
1041                   /* Precompute the position of the first key for
1042                      efficiency.  */
1043                   line->keylim = (key->eword == SIZE_MAX
1044                                   ? p
1045                                   : limfield (line, key));
1046
1047                   if (key->sword != SIZE_MAX)
1048                     line->keybeg = begfield (line, key);
1049                   else
1050                     {
1051                       if (key->skipsblanks)
1052                         while (blanks[to_uchar (*line_start)])
1053                           line_start++;
1054                       line->keybeg = line_start;
1055                     }
1056                 }
1057
1058               line_start = ptr;
1059             }
1060
1061           ptr = ptrlim;
1062           if (buf->eof)
1063             break;
1064         }
1065
1066       buf->used = ptr - buf->buf;
1067       buf->nlines = buffer_linelim (buf) - line;
1068       if (buf->nlines != 0)
1069         {
1070           buf->left = ptr - line_start;
1071           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1072           return true;
1073         }
1074
1075       /* The current input line is too long to fit in the buffer.
1076          Double the buffer size and try again.  */
1077       buf->buf = X2REALLOC (buf->buf, &buf->alloc);
1078     }
1079 }
1080
1081 /* Compare strings A and B as numbers without explicitly converting them to
1082    machine numbers.  Comparatively slow for short strings, but asymptotically
1083    hideously fast. */
1084
1085 static int
1086 numcompare (const char *a, const char *b)
1087 {
1088   while (blanks[to_uchar (*a)])
1089     a++;
1090   while (blanks[to_uchar (*b)])
1091     b++;
1092
1093   return strnumcmp (a, b, decimal_point, thousands_sep);
1094 }
1095
1096 static int
1097 general_numcompare (const char *sa, const char *sb)
1098 {
1099   /* FIXME: add option to warn about failed conversions.  */
1100   /* FIXME: maybe add option to try expensive FP conversion
1101      only if A and B can't be compared more cheaply/accurately.  */
1102
1103   char *ea;
1104   char *eb;
1105   double a = strtod (sa, &ea);
1106   double b = strtod (sb, &eb);
1107
1108   /* Put conversion errors at the start of the collating sequence.  */
1109   if (sa == ea)
1110     return sb == eb ? 0 : -1;
1111   if (sb == eb)
1112     return 1;
1113
1114   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1115      conversion errors but before numbers; sort them by internal
1116      bit-pattern, for lack of a more portable alternative.  */
1117   return (a < b ? -1
1118           : a > b ? 1
1119           : a == b ? 0
1120           : b == b ? -1
1121           : a == a ? 1
1122           : memcmp ((char *) &a, (char *) &b, sizeof a));
1123 }
1124
1125 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1126    Return 0 if the name in S is not recognized.  */
1127
1128 static int
1129 getmonth (char const *month, size_t len)
1130 {
1131   size_t lo = 0;
1132   size_t hi = MONTHS_PER_YEAR;
1133   char const *monthlim = month + len;
1134
1135   for (;;)
1136     {
1137       if (month == monthlim)
1138         return 0;
1139       if (!blanks[to_uchar (*month)])
1140         break;
1141       ++month;
1142     }
1143
1144   do
1145     {
1146       size_t ix = (lo + hi) / 2;
1147       char const *m = month;
1148       char const *n = monthtab[ix].name;
1149
1150       for (;; m++, n++)
1151         {
1152           if (!*n)
1153             return monthtab[ix].val;
1154           if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1155             {
1156               hi = ix;
1157               break;
1158             }
1159           else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1160             {
1161               lo = ix + 1;
1162               break;
1163             }
1164         }
1165     }
1166   while (lo < hi);
1167
1168   return 0;
1169 }
1170
1171 /* A source of random data.  */
1172 static struct randread_source *randread_source;
1173
1174 /* Return the Ith randomly-generated state.  The caller must invoke
1175    random_state (H) for all H less than I before invoking random_state
1176    (I).  */
1177
1178 static struct md5_ctx
1179 random_state (size_t i)
1180 {
1181   /* An array of states resulting from the random data, and counts of
1182      its used and allocated members.  */
1183   static struct md5_ctx *state;
1184   static size_t used;
1185   static size_t allocated;
1186
1187   struct md5_ctx *s = &state[i];
1188
1189   if (used <= i)
1190     {
1191       unsigned char buf[MD5_DIGEST_SIZE];
1192
1193       used++;
1194
1195       if (allocated <= i)
1196         {
1197           state = X2NREALLOC (state, &allocated);
1198           s = &state[i];
1199         }
1200
1201       randread (randread_source, buf, sizeof buf);
1202       md5_init_ctx (s);
1203       md5_process_bytes (buf, sizeof buf, s);
1204     }
1205
1206   return *s;
1207 }
1208
1209 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1210    with length LENGTHB.  Return negative if less, zero if equal,
1211    positive if greater.  */
1212
1213 static int
1214 cmp_hashes (char const *texta, size_t lena,
1215             char const *textb, size_t lenb)
1216 {
1217   /* Try random hashes until a pair of hashes disagree.  But if the
1218      first pair of random hashes agree, check whether the keys are
1219      identical and if so report no difference.  */
1220   int diff;
1221   size_t i;
1222   for (i = 0; ; i++)
1223     {
1224       uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
1225       struct md5_ctx s[2];
1226       s[0] = s[1] = random_state (i);
1227       md5_process_bytes (texta, lena, &s[0]);  md5_finish_ctx (&s[0], dig[0]);
1228       md5_process_bytes (textb, lenb, &s[1]);  md5_finish_ctx (&s[1], dig[1]);
1229       diff = memcmp (dig[0], dig[1], sizeof dig[0]);
1230       if (diff != 0)
1231         break;
1232       if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
1233         break;
1234     }
1235
1236   return diff;
1237 }
1238
1239 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1240    using one or more random hash functions.  */
1241
1242 static int
1243 compare_random (char *restrict texta, size_t lena,
1244                 char *restrict textb, size_t lenb)
1245 {
1246   int diff;
1247
1248   if (! hard_LC_COLLATE)
1249     diff = cmp_hashes (texta, lena, textb, lenb);
1250   else
1251     {
1252       /* Transform the text into the basis of comparison, so that byte
1253          strings that would otherwise considered to be equal are
1254          considered equal here even if their bytes differ.  */
1255
1256       char *buf = NULL;
1257       char stackbuf[4000];
1258       size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
1259       bool a_fits = tlena <= sizeof stackbuf;
1260       size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
1261                                (a_fits ? sizeof stackbuf - tlena : 0),
1262                                textb, lenb);
1263
1264       if (a_fits && tlena + tlenb <= sizeof stackbuf)
1265         buf = stackbuf;
1266       else
1267         {
1268           /* Adding 1 to the buffer size lets xmemxfrm run a bit
1269              faster by avoiding the need for an extra buffer copy.  */
1270           buf = xmalloc (tlena + tlenb + 1);
1271           xmemxfrm (buf, tlena + 1, texta, lena);
1272           xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
1273         }
1274
1275       diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
1276
1277       if (buf != stackbuf)
1278         free (buf);
1279     }
1280
1281   return diff;
1282 }
1283
1284 /* Compare two lines A and B trying every key in sequence until there
1285    are no more keys or a difference is found. */
1286
1287 static int
1288 keycompare (const struct line *a, const struct line *b)
1289 {
1290   struct keyfield const *key = keylist;
1291
1292   /* For the first iteration only, the key positions have been
1293      precomputed for us. */
1294   char *texta = a->keybeg;
1295   char *textb = b->keybeg;
1296   char *lima = a->keylim;
1297   char *limb = b->keylim;
1298
1299   int diff;
1300
1301   for (;;)
1302     {
1303       char const *translate = key->translate;
1304       bool const *ignore = key->ignore;
1305
1306       /* Find the lengths. */
1307       size_t lena = lima <= texta ? 0 : lima - texta;
1308       size_t lenb = limb <= textb ? 0 : limb - textb;
1309
1310       /* Actually compare the fields. */
1311
1312       if (key->random)
1313         diff = compare_random (texta, lena, textb, lenb);
1314       else if (key->numeric | key->general_numeric)
1315         {
1316           char savea = *lima, saveb = *limb;
1317
1318           *lima = *limb = '\0';
1319           diff = ((key->numeric ? numcompare : general_numcompare)
1320                   (texta, textb));
1321           *lima = savea, *limb = saveb;
1322         }
1323       else if (key->month)
1324         diff = getmonth (texta, lena) - getmonth (textb, lenb);
1325       /* Sorting like this may become slow, so in a simple locale the user
1326          can select a faster sort that is similar to ascii sort.  */
1327       else if (hard_LC_COLLATE)
1328         {
1329           if (ignore || translate)
1330             {
1331               char buf[4000];
1332               size_t size = lena + 1 + lenb + 1;
1333               char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1334               char *copy_b = copy_a + lena + 1;
1335               size_t new_len_a, new_len_b, i;
1336
1337               /* Ignore and/or translate chars before comparing.  */
1338               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1339                 {
1340                   if (i < lena)
1341                     {
1342                       copy_a[new_len_a] = (translate
1343                                            ? translate[to_uchar (texta[i])]
1344                                            : texta[i]);
1345                       if (!ignore || !ignore[to_uchar (texta[i])])
1346                         ++new_len_a;
1347                     }
1348                   if (i < lenb)
1349                     {
1350                       copy_b[new_len_b] = (translate
1351                                            ? translate[to_uchar (textb[i])]
1352                                            : textb [i]);
1353                       if (!ignore || !ignore[to_uchar (textb[i])])
1354                         ++new_len_b;
1355                     }
1356                 }
1357
1358               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1359
1360               if (sizeof buf < size)
1361                 free (copy_a);
1362             }
1363           else if (lena == 0)
1364             diff = - NONZERO (lenb);
1365           else if (lenb == 0)
1366             goto greater;
1367           else
1368             diff = xmemcoll (texta, lena, textb, lenb);
1369         }
1370       else if (ignore)
1371         {
1372 #define CMP_WITH_IGNORE(A, B)                                           \
1373   do                                                                    \
1374     {                                                                   \
1375           for (;;)                                                      \
1376             {                                                           \
1377               while (texta < lima && ignore[to_uchar (*texta)])         \
1378                 ++texta;                                                \
1379               while (textb < limb && ignore[to_uchar (*textb)])         \
1380                 ++textb;                                                \
1381               if (! (texta < lima && textb < limb))                     \
1382                 break;                                                  \
1383               diff = to_uchar (A) - to_uchar (B);                       \
1384               if (diff)                                                 \
1385                 goto not_equal;                                         \
1386               ++texta;                                                  \
1387               ++textb;                                                  \
1388             }                                                           \
1389                                                                         \
1390           diff = (texta < lima) - (textb < limb);                       \
1391     }                                                                   \
1392   while (0)
1393
1394           if (translate)
1395             CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1396                              translate[to_uchar (*textb)]);
1397           else
1398             CMP_WITH_IGNORE (*texta, *textb);
1399         }
1400       else if (lena == 0)
1401         diff = - NONZERO (lenb);
1402       else if (lenb == 0)
1403         goto greater;
1404       else
1405         {
1406           if (translate)
1407             {
1408               while (texta < lima && textb < limb)
1409                 {
1410                   diff = (to_uchar (translate[to_uchar (*texta++)])
1411                           - to_uchar (translate[to_uchar (*textb++)]));
1412                   if (diff)
1413                     goto not_equal;
1414                 }
1415             }
1416           else
1417             {
1418               diff = memcmp (texta, textb, MIN (lena, lenb));
1419               if (diff)
1420                 goto not_equal;
1421             }
1422           diff = lena < lenb ? -1 : lena != lenb;
1423         }
1424
1425       if (diff)
1426         goto not_equal;
1427
1428       key = key->next;
1429       if (! key)
1430         break;
1431
1432       /* Find the beginning and limit of the next field.  */
1433       if (key->eword != SIZE_MAX)
1434         lima = limfield (a, key), limb = limfield (b, key);
1435       else
1436         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1437
1438       if (key->sword != SIZE_MAX)
1439         texta = begfield (a, key), textb = begfield (b, key);
1440       else
1441         {
1442           texta = a->text, textb = b->text;
1443           if (key->skipsblanks)
1444             {
1445               while (texta < lima && blanks[to_uchar (*texta)])
1446                 ++texta;
1447               while (textb < limb && blanks[to_uchar (*textb)])
1448                 ++textb;
1449             }
1450         }
1451     }
1452
1453   return 0;
1454
1455  greater:
1456   diff = 1;
1457  not_equal:
1458   return key->reverse ? -diff : diff;
1459 }
1460
1461 /* Compare two lines A and B, returning negative, zero, or positive
1462    depending on whether A compares less than, equal to, or greater than B. */
1463
1464 static int
1465 compare (const struct line *a, const struct line *b)
1466 {
1467   int diff;
1468   size_t alen, blen;
1469
1470   /* First try to compare on the specified keys (if any).
1471      The only two cases with no key at all are unadorned sort,
1472      and unadorned sort -r. */
1473   if (keylist)
1474     {
1475       diff = keycompare (a, b);
1476       if (diff | unique | stable)
1477         return diff;
1478     }
1479
1480   /* If the keys all compare equal (or no keys were specified)
1481      fall through to the default comparison.  */
1482   alen = a->length - 1, blen = b->length - 1;
1483
1484   if (alen == 0)
1485     diff = - NONZERO (blen);
1486   else if (blen == 0)
1487     diff = 1;
1488   else if (hard_LC_COLLATE)
1489     diff = xmemcoll (a->text, alen, b->text, blen);
1490   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1491     diff = alen < blen ? -1 : alen != blen;
1492
1493   return reverse ? -diff : diff;
1494 }
1495
1496 /* Check that the lines read from FILE_NAME come in order.  Print a
1497    diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1498    false if they are not in order.  Otherwise, print no diagnostic
1499    and return true.  */
1500
1501 static bool
1502 check (char const *file_name)
1503 {
1504   FILE *fp = xfopen (file_name, "r");
1505   struct buffer buf;            /* Input buffer. */
1506   struct line temp;             /* Copy of previous line. */
1507   size_t alloc = 0;
1508   uintmax_t line_number = 0;
1509   struct keyfield const *key = keylist;
1510   bool nonunique = ! unique;
1511   bool ordered = true;
1512
1513   initbuf (&buf, sizeof (struct line),
1514            MAX (merge_buffer_size, sort_size));
1515   temp.text = NULL;
1516
1517   while (fillbuf (&buf, fp, file_name))
1518     {
1519       struct line const *line = buffer_linelim (&buf);
1520       struct line const *linebase = line - buf.nlines;
1521
1522       /* Make sure the line saved from the old buffer contents is
1523          less than or equal to the first line of the new buffer. */
1524       if (alloc && nonunique <= compare (&temp, line - 1))
1525         {
1526         found_disorder:
1527           {
1528             struct line const *disorder_line = line - 1;
1529             uintmax_t disorder_line_number =
1530               buffer_linelim (&buf) - disorder_line + line_number;
1531             char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1532             fprintf (stderr, _("%s: %s:%s: disorder: "),
1533                      program_name, file_name,
1534                      umaxtostr (disorder_line_number, hr_buf));
1535             write_bytes (disorder_line->text, disorder_line->length, stderr,
1536                          _("standard error"));
1537             ordered = false;
1538             break;
1539           }
1540         }
1541
1542       /* Compare each line in the buffer with its successor.  */
1543       while (linebase < --line)
1544         if (nonunique <= compare (line, line - 1))
1545           goto found_disorder;
1546
1547       line_number += buf.nlines;
1548
1549       /* Save the last line of the buffer.  */
1550       if (alloc < line->length)
1551         {
1552           do
1553             {
1554               alloc *= 2;
1555               if (! alloc)
1556                 {
1557                   alloc = line->length;
1558                   break;
1559                 }
1560             }
1561           while (alloc < line->length);
1562
1563           temp.text = xrealloc (temp.text, alloc);
1564         }
1565       memcpy (temp.text, line->text, line->length);
1566       temp.length = line->length;
1567       if (key)
1568         {
1569           temp.keybeg = temp.text + (line->keybeg - line->text);
1570           temp.keylim = temp.text + (line->keylim - line->text);
1571         }
1572     }
1573
1574   xfclose (fp, file_name);
1575   free (buf.buf);
1576   free (temp.text);
1577   return ordered;
1578 }
1579
1580 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
1581    files (all of which are at the start of the FILES array), and
1582    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
1583    Close input and output files before returning.
1584    OUTPUT_FILE gives the name of the output file.  If it is NULL,
1585    the output file is standard output.  If OFP is NULL, the output
1586    file has not been opened yet (or written to, if standard output).  */
1587
1588 static void
1589 mergefps (char **files, size_t ntemps, size_t nfiles,
1590           FILE *ofp, char const *output_file)
1591 {
1592   FILE *fps[NMERGE];            /* Input streams for each file.  */
1593   struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1594   struct line saved;            /* Saved line storage for unique check. */
1595   struct line const *savedline = NULL;
1596                                 /* &saved if there is a saved line. */
1597   size_t savealloc = 0;         /* Size allocated for the saved line. */
1598   struct line const *cur[NMERGE]; /* Current line in each line table. */
1599   struct line const *base[NMERGE]; /* Base of each line table.  */
1600   size_t ord[NMERGE];           /* Table representing a permutation of fps,
1601                                    such that cur[ord[0]] is the smallest line
1602                                    and will be next output. */
1603   size_t i;
1604   size_t j;
1605   size_t t;
1606   struct keyfield const *key = keylist;
1607   saved.text = NULL;
1608
1609   /* Read initial lines from each input file. */
1610   for (i = 0; i < nfiles; )
1611     {
1612       fps[i] = xfopen (files[i], "r");
1613       initbuf (&buffer[i], sizeof (struct line),
1614                MAX (merge_buffer_size, sort_size / nfiles));
1615       if (fillbuf (&buffer[i], fps[i], files[i]))
1616         {
1617           struct line const *linelim = buffer_linelim (&buffer[i]);
1618           cur[i] = linelim - 1;
1619           base[i] = linelim - buffer[i].nlines;
1620           i++;
1621         }
1622       else
1623         {
1624           /* fps[i] is empty; eliminate it from future consideration.  */
1625           xfclose (fps[i], files[i]);
1626           if (i < ntemps)
1627             {
1628               ntemps--;
1629               zaptemp (files[i]);
1630             }
1631           free (buffer[i].buf);
1632           --nfiles;
1633           for (j = i; j < nfiles; ++j)
1634             files[j] = files[j + 1];
1635         }
1636     }
1637
1638   if (! ofp)
1639     ofp = xfopen (output_file, "w");
1640
1641   /* Set up the ord table according to comparisons among input lines.
1642      Since this only reorders two items if one is strictly greater than
1643      the other, it is stable. */
1644   for (i = 0; i < nfiles; ++i)
1645     ord[i] = i;
1646   for (i = 1; i < nfiles; ++i)
1647     if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
1648       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1649
1650   /* Repeatedly output the smallest line until no input remains. */
1651   while (nfiles)
1652     {
1653       struct line const *smallest = cur[ord[0]];
1654
1655       /* If uniquified output is turned on, output only the first of
1656          an identical series of lines. */
1657       if (unique)
1658         {
1659           if (savedline && compare (savedline, smallest))
1660             {
1661               savedline = NULL;
1662               write_bytes (saved.text, saved.length, ofp, output_file);
1663             }
1664           if (!savedline)
1665             {
1666               savedline = &saved;
1667               if (savealloc < smallest->length)
1668                 {
1669                   do
1670                     if (! savealloc)
1671                       {
1672                         savealloc = smallest->length;
1673                         break;
1674                       }
1675                   while ((savealloc *= 2) < smallest->length);
1676
1677                   saved.text = xrealloc (saved.text, savealloc);
1678                 }
1679               saved.length = smallest->length;
1680               memcpy (saved.text, smallest->text, saved.length);
1681               if (key)
1682                 {
1683                   saved.keybeg =
1684                     saved.text + (smallest->keybeg - smallest->text);
1685                   saved.keylim =
1686                     saved.text + (smallest->keylim - smallest->text);
1687                 }
1688             }
1689         }
1690       else
1691         write_bytes (smallest->text, smallest->length, ofp, output_file);
1692
1693       /* Check if we need to read more lines into core. */
1694       if (base[ord[0]] < smallest)
1695         cur[ord[0]] = smallest - 1;
1696       else
1697         {
1698           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]]))
1699             {
1700               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
1701               cur[ord[0]] = linelim - 1;
1702               base[ord[0]] = linelim - buffer[ord[0]].nlines;
1703             }
1704           else
1705             {
1706               /* We reached EOF on fps[ord[0]].  */
1707               for (i = 1; i < nfiles; ++i)
1708                 if (ord[i] > ord[0])
1709                   --ord[i];
1710               --nfiles;
1711               xfclose (fps[ord[0]], files[ord[0]]);
1712               if (ord[0] < ntemps)
1713                 {
1714                   ntemps--;
1715                   zaptemp (files[ord[0]]);
1716                 }
1717               free (buffer[ord[0]].buf);
1718               for (i = ord[0]; i < nfiles; ++i)
1719                 {
1720                   fps[i] = fps[i + 1];
1721                   files[i] = files[i + 1];
1722                   buffer[i] = buffer[i + 1];
1723                   cur[i] = cur[i + 1];
1724                   base[i] = base[i + 1];
1725                 }
1726               for (i = 0; i < nfiles; ++i)
1727                 ord[i] = ord[i + 1];
1728               continue;
1729             }
1730         }
1731
1732       /* The new line just read in may be larger than other lines
1733          already in main memory; push it back in the queue until we
1734          encounter a line larger than it.  Optimize for the common
1735          case where the new line is smallest.  */
1736       {
1737         size_t lo = 1;
1738         size_t hi = nfiles;
1739         size_t probe = lo;
1740         size_t ord0 = ord[0];
1741         size_t count_of_smaller_lines;
1742
1743         while (lo < hi)
1744           {
1745             int cmp = compare (cur[ord0], cur[ord[probe]]);
1746             if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
1747               hi = probe;
1748             else
1749               lo = probe + 1;
1750             probe = (lo + hi) / 2;
1751           }
1752
1753         count_of_smaller_lines = lo - 1;
1754         for (j = 0; j < count_of_smaller_lines; j++)
1755           ord[j] = ord[j + 1];
1756         ord[count_of_smaller_lines] = ord0;
1757       }
1758     }
1759
1760   if (unique && savedline)
1761     {
1762       write_bytes (saved.text, saved.length, ofp, output_file);
1763       free (saved.text);
1764     }
1765
1766   xfclose (ofp, output_file);
1767 }
1768
1769 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1770    and HI (with NHI members).  T, LO, and HI point just past their
1771    respective arrays, and the arrays are in reverse order.  NLO and
1772    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
1773
1774 static inline void
1775 mergelines (struct line *t,
1776             struct line const *lo, size_t nlo,
1777             struct line const *hi, size_t nhi)
1778 {
1779   for (;;)
1780     if (compare (lo - 1, hi - 1) <= 0)
1781       {
1782         *--t = *--lo;
1783         if (! --nlo)
1784           {
1785             /* HI - NHI equalled T - (NLO + NHI) when this function
1786                began.  Therefore HI must equal T now, and there is no
1787                need to copy from HI to T.  */
1788             return;
1789           }
1790       }
1791     else
1792       {
1793         *--t = *--hi;
1794         if (! --nhi)
1795           {
1796             do
1797               *--t = *--lo;
1798             while (--nlo);
1799
1800             return;
1801           }
1802       }
1803 }
1804
1805 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1806    NLINES must be at least 2.
1807    The input and output arrays are in reverse order, and LINES and
1808    TEMP point just past the end of their respective arrays.
1809
1810    Use a recursive divide-and-conquer algorithm, in the style
1811    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
1812    the optimization suggested by exercise 5.2.4-10; this requires room
1813    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
1814    writes that this memory optimization was originally published by
1815    D. A. Bell, Comp J. 1 (1958), 75.  */
1816
1817 static void
1818 sortlines (struct line *lines, size_t nlines, struct line *temp)
1819 {
1820   if (nlines == 2)
1821     {
1822       if (0 < compare (&lines[-1], &lines[-2]))
1823         {
1824           struct line tmp = lines[-1];
1825           lines[-1] = lines[-2];
1826           lines[-2] = tmp;
1827         }
1828     }
1829   else
1830     {
1831       size_t nlo = nlines / 2;
1832       size_t nhi = nlines - nlo;
1833       struct line *lo = lines;
1834       struct line *hi = lines - nlo;
1835       struct line *sorted_lo = temp;
1836
1837       sortlines (hi, nhi, temp);
1838       if (1 < nlo)
1839         sortlines_temp (lo, nlo, sorted_lo);
1840       else
1841         sorted_lo[-1] = lo[-1];
1842
1843       mergelines (lines, sorted_lo, nlo, hi, nhi);
1844     }
1845 }
1846
1847 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1848    rather than sorting in place.  */
1849
1850 static void
1851 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
1852 {
1853   if (nlines == 2)
1854     {
1855       /* Declare `swap' as int, not bool, to work around a bug
1856          <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
1857          in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
1858       int swap = (0 < compare (&lines[-1], &lines[-2]));
1859       temp[-1] = lines[-1 - swap];
1860       temp[-2] = lines[-2 + swap];
1861     }
1862   else
1863     {
1864       size_t nlo = nlines / 2;
1865       size_t nhi = nlines - nlo;
1866       struct line *lo = lines;
1867       struct line *hi = lines - nlo;
1868       struct line *sorted_hi = temp - nlo;
1869
1870       sortlines_temp (hi, nhi, sorted_hi);
1871       if (1 < nlo)
1872         sortlines (lo, nlo, temp);
1873
1874       mergelines (temp, lo, nlo, sorted_hi, nhi);
1875     }
1876 }
1877
1878 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
1879    the same as OUTFILE.  If found, merge the found instances (and perhaps
1880    some other files) into a temporary file so that it can in turn be
1881    merged into OUTFILE without destroying OUTFILE before it is completely
1882    read.  Return the new value of NFILES, which differs from the old if
1883    some merging occurred.
1884
1885    This test ensures that an otherwise-erroneous use like
1886    "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
1887    It's not clear that POSIX requires this nicety.
1888    Detect common error cases, but don't try to catch obscure cases like
1889    "cat ... FILE ... | sort -m -o FILE"
1890    where traditional "sort" doesn't copy the input and where
1891    people should know that they're getting into trouble anyway.
1892    Catching these obscure cases would slow down performance in
1893    common cases.  */
1894
1895 static size_t
1896 avoid_trashing_input (char **files, size_t ntemps, size_t nfiles,
1897                       char const *outfile)
1898 {
1899   size_t i;
1900   bool got_outstat = false;
1901   struct stat outstat;
1902
1903   for (i = ntemps; i < nfiles; i++)
1904     {
1905       bool is_stdin = STREQ (files[i], "-");
1906       bool same;
1907       struct stat instat;
1908
1909       if (outfile && STREQ (outfile, files[i]) && !is_stdin)
1910         same = true;
1911       else
1912         {
1913           if (! got_outstat)
1914             {
1915               if ((outfile
1916                    ? stat (outfile, &outstat)
1917                    : fstat (STDOUT_FILENO, &outstat))
1918                   != 0)
1919                 break;
1920               got_outstat = true;
1921             }
1922
1923           same = (((is_stdin
1924                     ? fstat (STDIN_FILENO, &instat)
1925                     : stat (files[i], &instat))
1926                    == 0)
1927                   && SAME_INODE (instat, outstat));
1928         }
1929
1930       if (same)
1931         {
1932           FILE *tftp;
1933           char *temp = create_temp_file (&tftp);
1934           mergefps (&files[i], 0, nfiles - i, tftp, temp);
1935           files[i] = temp;
1936           return i + 1;
1937         }
1938     }
1939
1940   return nfiles;
1941 }
1942
1943 /* Merge the input FILES.  NTEMPS is the number of files at the
1944    start of FILES that are temporary; it is zero at the top level.
1945    NFILES is the total number of files.  Put the output in
1946    OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
1947
1948 static void
1949 merge (char **files, size_t ntemps, size_t nfiles, char const *output_file)
1950 {
1951   while (NMERGE < nfiles)
1952     {
1953       /* Number of input files processed so far.  */
1954       size_t in;
1955
1956       /* Number of output files generated so far.  */
1957       size_t out;
1958
1959       /* nfiles % NMERGE; this counts input files that are left over
1960          after all full-sized merges have been done.  */
1961       size_t remainder;
1962
1963       /* Number of easily-available slots at the next loop iteration.  */
1964       size_t cheap_slots;
1965
1966       /* Do as many NMERGE-size merges as possible.  */
1967       for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
1968         {
1969           FILE *tfp;
1970           char *temp = create_temp_file (&tfp);
1971           size_t nt = MIN (ntemps, NMERGE);
1972           ntemps -= nt;
1973           mergefps (&files[in], nt, NMERGE, tfp, temp);
1974           files[out] = temp;
1975         }
1976
1977       remainder = nfiles - in;
1978       cheap_slots = NMERGE - out % NMERGE;
1979
1980       if (cheap_slots < remainder)
1981         {
1982           /* So many files remain that they can't all be put into the last
1983              NMERGE-sized output window.  Do one more merge.  Merge as few
1984              files as possible, to avoid needless I/O.  */
1985           size_t nshortmerge = remainder - cheap_slots + 1;
1986           FILE *tfp;
1987           char *temp = create_temp_file (&tfp);
1988           size_t nt = MIN (ntemps, nshortmerge);
1989           ntemps -= nt;
1990           mergefps (&files[in], nt, nshortmerge, tfp, temp);
1991           files[out++] = temp;
1992           in += nshortmerge;
1993         }
1994
1995       /* Put the remaining input files into the last NMERGE-sized output
1996          window, so they will be merged in the next pass.  */
1997       memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
1998       ntemps += out;
1999       nfiles -= in - out;
2000     }
2001
2002   nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2003   mergefps (files, ntemps, nfiles, NULL, output_file);
2004 }
2005
2006 /* Sort NFILES FILES onto OUTPUT_FILE. */
2007
2008 static void
2009 sort (char * const *files, size_t nfiles, char const *output_file)
2010 {
2011   struct buffer buf;
2012   size_t ntemps = 0;
2013   bool output_file_created = false;
2014
2015   buf.alloc = 0;
2016
2017   while (nfiles)
2018     {
2019       char const *temp_output;
2020       char const *file = *files;
2021       FILE *fp = xfopen (file, "r");
2022       FILE *tfp;
2023       size_t bytes_per_line = (2 * sizeof (struct line)
2024                                - sizeof (struct line) / 2);
2025
2026       if (! buf.alloc)
2027         initbuf (&buf, bytes_per_line,
2028                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2029       buf.eof = false;
2030       files++;
2031       nfiles--;
2032
2033       while (fillbuf (&buf, fp, file))
2034         {
2035           struct line *line;
2036           struct line *linebase;
2037
2038           if (buf.eof && nfiles
2039               && (bytes_per_line + 1
2040                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2041             {
2042               /* End of file, but there is more input and buffer room.
2043                  Concatenate the next input file; this is faster in
2044                  the usual case.  */
2045               buf.left = buf.used;
2046               break;
2047             }
2048
2049           line = buffer_linelim (&buf);
2050           linebase = line - buf.nlines;
2051           if (1 < buf.nlines)
2052             sortlines (line, buf.nlines, linebase);
2053           if (buf.eof && !nfiles && !ntemps && !buf.left)
2054             {
2055               xfclose (fp, file);
2056               tfp = xfopen (output_file, "w");
2057               temp_output = output_file;
2058               output_file_created = true;
2059             }
2060           else
2061             {
2062               ++ntemps;
2063               temp_output = create_temp_file (&tfp);
2064             }
2065
2066           do
2067             {
2068               line--;
2069               write_bytes (line->text, line->length, tfp, temp_output);
2070               if (unique)
2071                 while (linebase < line && compare (line, line - 1) == 0)
2072                   line--;
2073             }
2074           while (linebase < line);
2075
2076           xfclose (tfp, temp_output);
2077
2078           if (output_file_created)
2079             goto finish;
2080         }
2081       xfclose (fp, file);
2082     }
2083
2084  finish:
2085   free (buf.buf);
2086
2087   if (! output_file_created)
2088     {
2089       size_t i;
2090       struct tempnode *node = temphead;
2091       char **tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2092       for (i = 0; node; i++)
2093         {
2094           tempfiles[i] = node->name;
2095           node = node->next;
2096         }
2097       merge (tempfiles, ntemps, ntemps, output_file);
2098       free (tempfiles);
2099     }
2100 }
2101
2102 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list.  */
2103
2104 static void
2105 insertkey (struct keyfield *key_arg)
2106 {
2107   struct keyfield **p;
2108   struct keyfield *key = xmalloc (sizeof *key);
2109
2110   *key = *key_arg;
2111   for (p = &keylist; *p; p = &(*p)->next)
2112     continue;
2113   *p = key;
2114   key->next = NULL;
2115 }
2116
2117 /* Report a bad field specification SPEC, with extra info MSGID.  */
2118
2119 static void badfieldspec (char const *, char const *)
2120      ATTRIBUTE_NORETURN;
2121 static void
2122 badfieldspec (char const *spec, char const *msgid)
2123 {
2124   error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
2125          _(msgid), quote (spec));
2126   abort ();
2127 }
2128
2129 /* Report incompatible options.  */
2130
2131 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
2132 static void
2133 incompatible_options (char const *opts)
2134 {
2135   error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
2136   abort ();
2137 }
2138
2139 /* Check compatibility of ordering options.  */
2140
2141 static void
2142 check_ordering_compatibility (void)
2143 {
2144   struct keyfield const *key;
2145
2146   for (key = keylist; key; key = key->next)
2147     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
2148               + !!key->ignore))
2149         || (key->random && key->translate))
2150       {
2151         char opts[7];
2152         char *p = opts;
2153         if (key->ignore == nondictionary)
2154           *p++ = 'd';
2155         if (key->translate)
2156           *p++ = 'f';
2157         if (key->general_numeric)
2158           *p++ = 'g';
2159         if (key->ignore == nonprinting)
2160           *p++ = 'i';
2161         if (key->month)
2162           *p++ = 'M';
2163         if (key->numeric)
2164           *p++ = 'n';
2165         if (key->random)
2166           *p++ = 'R';
2167         *p = '\0';
2168         incompatible_options (opts);
2169       }
2170 }
2171
2172 /* Parse the leading integer in STRING and store the resulting value
2173    (which must fit into size_t) into *VAL.  Return the address of the
2174    suffix after the integer.  If MSGID is NULL, return NULL after
2175    failure; otherwise, report MSGID and exit on failure.  */
2176
2177 static char const *
2178 parse_field_count (char const *string, size_t *val, char const *msgid)
2179 {
2180   char *suffix;
2181   uintmax_t n;
2182
2183   switch (xstrtoumax (string, &suffix, 10, &n, ""))
2184     {
2185     case LONGINT_OK:
2186     case LONGINT_INVALID_SUFFIX_CHAR:
2187       *val = n;
2188       if (*val == n)
2189         break;
2190       /* Fall through.  */
2191     case LONGINT_OVERFLOW:
2192     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2193       if (msgid)
2194         error (SORT_FAILURE, 0, _("%s: count `%.*s' too large"),
2195                _(msgid), (int) (suffix - string), string);
2196       return NULL;
2197
2198     case LONGINT_INVALID:
2199       if (msgid)
2200         error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
2201                _(msgid), quote (string));
2202       return NULL;
2203     }
2204
2205   return suffix;
2206 }
2207
2208 /* Handle interrupts and hangups. */
2209
2210 static void
2211 sighandler (int sig)
2212 {
2213   if (! SA_NOCLDSTOP)
2214     signal (sig, SIG_IGN);
2215
2216   cleanup ();
2217
2218   signal (sig, SIG_DFL);
2219   raise (sig);
2220 }
2221
2222 /* Set the ordering options for KEY specified in S.
2223    Return the address of the first character in S that
2224    is not a valid ordering option.
2225    BLANKTYPE is the kind of blanks that 'b' should skip. */
2226
2227 static char *
2228 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2229 {
2230   while (*s)
2231     {
2232       switch (*s)
2233         {
2234         case 'b':
2235           if (blanktype == bl_start || blanktype == bl_both)
2236             key->skipsblanks = true;
2237           if (blanktype == bl_end || blanktype == bl_both)
2238             key->skipeblanks = true;
2239           break;
2240         case 'd':
2241           key->ignore = nondictionary;
2242           break;
2243         case 'f':
2244           key->translate = fold_toupper;
2245           break;
2246         case 'g':
2247           key->general_numeric = true;
2248           break;
2249         case 'i':
2250           /* Option order should not matter, so don't let -i override
2251              -d.  -d implies -i, but -i does not imply -d.  */
2252           if (! key->ignore)
2253             key->ignore = nonprinting;
2254           break;
2255         case 'M':
2256           key->month = true;
2257           break;
2258         case 'n':
2259           key->numeric = true;
2260           break;
2261         case 'R':
2262           key->random = true;
2263           break;
2264         case 'r':
2265           key->reverse = true;
2266           break;
2267         default:
2268           return (char *) s;
2269         }
2270       ++s;
2271     }
2272   return (char *) s;
2273 }
2274
2275 static struct keyfield *
2276 key_init (struct keyfield *key)
2277 {
2278   memset (key, 0, sizeof *key);
2279   key->eword = SIZE_MAX;
2280   return key;
2281 }
2282
2283 int
2284 main (int argc, char **argv)
2285 {
2286   struct keyfield *key;
2287   struct keyfield key_buf;
2288   struct keyfield gkey;
2289   char const *s;
2290   int c = 0;
2291   bool checkonly = false;
2292   bool mergeonly = false;
2293   char *random_source = NULL;
2294   bool need_random = false;
2295   size_t nfiles = 0;
2296   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2297   bool obsolete_usage = (posix2_version () < 200112);
2298   char **files;
2299   char const *outfile = NULL;
2300
2301   initialize_main (&argc, &argv);
2302   program_name = argv[0];
2303   setlocale (LC_ALL, "");
2304   bindtextdomain (PACKAGE, LOCALEDIR);
2305   textdomain (PACKAGE);
2306
2307   atexit (cleanup);
2308
2309   initialize_exit_failure (SORT_FAILURE);
2310   atexit (close_stdout);
2311
2312   hard_LC_COLLATE = hard_locale (LC_COLLATE);
2313 #if HAVE_NL_LANGINFO
2314   hard_LC_TIME = hard_locale (LC_TIME);
2315 #endif
2316
2317   /* Get locale's representation of the decimal point.  */
2318   {
2319     struct lconv const *locale = localeconv ();
2320
2321     /* If the locale doesn't define a decimal point, or if the decimal
2322        point is multibyte, use the C locale's decimal point.  FIXME:
2323        add support for multibyte decimal points.  */
2324     decimal_point = to_uchar (locale->decimal_point[0]);
2325     if (! decimal_point || locale->decimal_point[1])
2326       decimal_point = '.';
2327
2328     /* FIXME: add support for multibyte thousands separators.  */
2329     thousands_sep = to_uchar (*locale->thousands_sep);
2330     if (! thousands_sep || locale->thousands_sep[1])
2331       thousands_sep = -1;
2332   }
2333
2334   have_read_stdin = false;
2335   inittables ();
2336
2337   {
2338     size_t i;
2339     static int const sig[] = { SIGHUP, SIGINT, SIGPIPE, SIGTERM };
2340     enum { nsigs = sizeof sig / sizeof sig[0] };
2341
2342 #if SA_NOCLDSTOP
2343     struct sigaction act;
2344
2345     sigemptyset (&caught_signals);
2346     for (i = 0; i < nsigs; i++)
2347       {
2348         sigaction (sig[i], NULL, &act);
2349         if (act.sa_handler != SIG_IGN)
2350           sigaddset (&caught_signals, sig[i]);
2351       }
2352
2353     act.sa_handler = sighandler;
2354     act.sa_mask = caught_signals;
2355     act.sa_flags = 0;
2356
2357     for (i = 0; i < nsigs; i++)
2358       if (sigismember (&caught_signals, sig[i]))
2359         sigaction (sig[i], &act, NULL);
2360 #else
2361     for (i = 0; i < nsigs; i++)
2362       if (signal (sig[i], SIG_IGN) != SIG_IGN)
2363         {
2364           signal (sig[i], sighandler);
2365           siginterrupt (sig[i], 1);
2366         }
2367 #endif
2368   }
2369
2370   gkey.sword = gkey.eword = SIZE_MAX;
2371   gkey.ignore = NULL;
2372   gkey.translate = NULL;
2373   gkey.numeric = gkey.general_numeric = gkey.random = false;
2374   gkey.month = gkey.reverse = false;
2375   gkey.skipsblanks = gkey.skipeblanks = false;
2376
2377   files = xnmalloc (argc, sizeof *files);
2378
2379   for (;;)
2380     {
2381       /* Parse an operand as a file after "--" was seen; or if
2382          pedantic and a file was seen, unless the POSIX version
2383          predates 1003.1-2001 and -c was not seen and the operand is
2384          "-o FILE" or "-oFILE".  */
2385
2386       if (c == -1
2387           || (posixly_correct && nfiles != 0
2388               && ! (obsolete_usage
2389                     && ! checkonly
2390                     && optind != argc
2391                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
2392                     && (argv[optind][2] || optind + 1 != argc)))
2393           || ((c = getopt_long (argc, argv, short_options,
2394                                 long_options, NULL))
2395               == -1))
2396         {
2397           if (argc <= optind)
2398             break;
2399           files[nfiles++] = argv[optind++];
2400         }
2401       else switch (c)
2402         {
2403         case 1:
2404           key = NULL;
2405           if (optarg[0] == '+')
2406             {
2407               bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
2408                                       && ISDIGIT (argv[optind][1]));
2409               obsolete_usage |= minus_pos_usage & ~posixly_correct;
2410               if (obsolete_usage)
2411                 {
2412                   /* Treat +POS1 [-POS2] as a key if possible; but silently
2413                      treat an operand as a file if it is not a valid +POS1.  */
2414                   key = key_init (&key_buf);
2415                   s = parse_field_count (optarg + 1, &key->sword, NULL);
2416                   if (s && *s == '.')
2417                     s = parse_field_count (s + 1, &key->schar, NULL);
2418                   if (! (key->sword | key->schar))
2419                     key->sword = SIZE_MAX;
2420                   if (! s || *set_ordering (s, key, bl_start))
2421                     {
2422                       free (key);
2423                       key = NULL;
2424                     }
2425                   else
2426                     {
2427                       if (minus_pos_usage)
2428                         {
2429                           char const *optarg1 = argv[optind++];
2430                           s = parse_field_count (optarg1 + 1, &key->eword,
2431                                              N_("invalid number after `-'"));
2432                           if (*s == '.')
2433                             s = parse_field_count (s + 1, &key->echar,
2434                                                N_("invalid number after `.'"));
2435                           if (*set_ordering (s, key, bl_end))
2436                             badfieldspec (optarg1,
2437                                       N_("stray character in field spec"));
2438                         }
2439                       insertkey (key);
2440                     }
2441                 }
2442             }
2443           if (! key)
2444             files[nfiles++] = optarg;
2445           break;
2446
2447         case 'b':
2448         case 'd':
2449         case 'f':
2450         case 'g':
2451         case 'i':
2452         case 'M':
2453         case 'n':
2454         case 'r':
2455         case 'R':
2456           {
2457             char str[2];
2458             str[0] = c;
2459             str[1] = '\0';
2460             set_ordering (str, &gkey, bl_both);
2461           }
2462           break;
2463
2464         case 'c':
2465           checkonly = true;
2466           break;
2467
2468         case 'k':
2469           key = key_init (&key_buf);
2470
2471           /* Get POS1. */
2472           s = parse_field_count (optarg, &key->sword,
2473                                  N_("invalid number at field start"));
2474           if (! key->sword--)
2475             {
2476               /* Provoke with `sort -k0' */
2477               badfieldspec (optarg, N_("field number is zero"));
2478             }
2479           if (*s == '.')
2480             {
2481               s = parse_field_count (s + 1, &key->schar,
2482                                      N_("invalid number after `.'"));
2483               if (! key->schar--)
2484                 {
2485                   /* Provoke with `sort -k1.0' */
2486                   badfieldspec (optarg, N_("character offset is zero"));
2487                 }
2488             }
2489           if (! (key->sword | key->schar))
2490             key->sword = SIZE_MAX;
2491           s = set_ordering (s, key, bl_start);
2492           if (*s != ',')
2493             {
2494               key->eword = SIZE_MAX;
2495               key->echar = 0;
2496             }
2497           else
2498             {
2499               /* Get POS2. */
2500               s = parse_field_count (s + 1, &key->eword,
2501                                      N_("invalid number after `,'"));
2502               if (! key->eword--)
2503                 {
2504                   /* Provoke with `sort -k1,0' */
2505                   badfieldspec (optarg, N_("field number is zero"));
2506                 }
2507               if (*s == '.')
2508                 s = parse_field_count (s + 1, &key->echar,
2509                                        N_("invalid number after `.'"));
2510               else
2511                 {
2512                   /* `-k 2,3' is equivalent to `+1 -3'.  */
2513                   key->eword++;
2514                 }
2515               s = set_ordering (s, key, bl_end);
2516             }
2517           if (*s)
2518             badfieldspec (optarg, N_("stray character in field spec"));
2519           insertkey (key);
2520           break;
2521
2522         case 'm':
2523           mergeonly = true;
2524           break;
2525
2526         case 'o':
2527           if (outfile && !STREQ (outfile, optarg))
2528             error (SORT_FAILURE, 0, _("multiple output files specified"));
2529           outfile = optarg;
2530           break;
2531
2532         case RANDOM_SOURCE_OPTION:
2533           if (random_source && !STREQ (random_source, optarg))
2534             error (SORT_FAILURE, 0, _("multiple random sources specified"));
2535           random_source = optarg;
2536           break;
2537
2538         case 's':
2539           stable = true;
2540           break;
2541
2542         case 'S':
2543           specify_sort_size (optarg);
2544           break;
2545
2546         case 't':
2547           {
2548             char newtab = optarg[0];
2549             if (! newtab)
2550               error (SORT_FAILURE, 0, _("empty tab"));
2551             if (optarg[1])
2552               {
2553                 if (STREQ (optarg, "\\0"))
2554                   newtab = '\0';
2555                 else
2556                   {
2557                     /* Provoke with `sort -txx'.  Complain about
2558                        "multi-character tab" instead of "multibyte tab", so
2559                        that the diagnostic's wording does not need to be
2560                        changed once multibyte characters are supported.  */
2561                     error (SORT_FAILURE, 0, _("multi-character tab %s"),
2562                            quote (optarg));
2563                   }
2564               }
2565             if (tab != TAB_DEFAULT && tab != newtab)
2566               error (SORT_FAILURE, 0, _("incompatible tabs"));
2567             tab = newtab;
2568           }
2569           break;
2570
2571         case 'T':
2572           add_temp_dir (optarg);
2573           break;
2574
2575         case 'u':
2576           unique = true;
2577           break;
2578
2579         case 'y':
2580           /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
2581              through Solaris 7.  It is also accepted by many non-Solaris
2582              "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
2583              -y is marked as obsolete starting with Solaris 8 (1999), but is
2584              still accepted as of Solaris 10 prerelease (2004).
2585
2586              Solaris 2.5.1 "sort -y 100" reads the input file "100", but
2587              emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
2588              and which in general ignores the argument after "-y" if it
2589              consists entirely of digits (it can even be empty).  */
2590           if (optarg == argv[optind - 1])
2591             {
2592               char const *p;
2593               for (p = optarg; ISDIGIT (*p); p++)
2594                 continue;
2595               optind -= (*p != '\0');
2596             }
2597           break;
2598
2599         case 'z':
2600           eolchar = 0;
2601           break;
2602
2603         case_GETOPT_HELP_CHAR;
2604
2605         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2606
2607         default:
2608           usage (SORT_FAILURE);
2609         }
2610     }
2611
2612   /* Inheritance of global options to individual keys. */
2613   for (key = keylist; key; key = key->next)
2614     {
2615       if (! (key->ignore || key->translate
2616              || (key->skipsblanks | key->reverse
2617                  | key->skipeblanks | key->month | key->numeric
2618                  | key->general_numeric
2619                  | key->random)))
2620         {
2621           key->ignore = gkey.ignore;
2622           key->translate = gkey.translate;
2623           key->skipsblanks = gkey.skipsblanks;
2624           key->skipeblanks = gkey.skipeblanks;
2625           key->month = gkey.month;
2626           key->numeric = gkey.numeric;
2627           key->general_numeric = gkey.general_numeric;
2628           key->random = gkey.random;
2629           key->reverse = gkey.reverse;
2630         }
2631
2632       need_random |= key->random;
2633     }
2634
2635   if (!keylist && (gkey.ignore || gkey.translate
2636                    || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
2637                        | gkey.numeric | gkey.general_numeric
2638                        | gkey.random)))
2639     {
2640       insertkey (&gkey);
2641       need_random |= gkey.random;
2642     }
2643
2644   check_ordering_compatibility ();
2645
2646   reverse = gkey.reverse;
2647
2648   if (need_random)
2649     {
2650       randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
2651       if (! randread_source)
2652         die (_("open failed"), random_source);
2653     }
2654
2655   if (temp_dir_count == 0)
2656     {
2657       char const *tmp_dir = getenv ("TMPDIR");
2658       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
2659     }
2660
2661   if (nfiles == 0)
2662     {
2663       static char *minus = "-";
2664       nfiles = 1;
2665       free (files);
2666       files = &minus;
2667     }
2668
2669   if (checkonly)
2670     {
2671       if (nfiles > 1)
2672         error (SORT_FAILURE, 0, _("extra operand %s not allowed with -c"),
2673                quote (files[1]));
2674
2675       if (outfile)
2676         incompatible_options ("co");
2677
2678       /* POSIX requires that sort return 1 IFF invoked with -c and the
2679          input is not properly sorted.  */
2680       exit (check (files[0]) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
2681     }
2682
2683   if (mergeonly)
2684     merge (files, 0, nfiles, outfile);
2685   else
2686     sort (files, nfiles, outfile);
2687
2688   if (have_read_stdin && fclose (stdin) == EOF)
2689     die (_("close failed"), "-");
2690
2691   exit (EXIT_SUCCESS);
2692 }