src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 1988, 1991-2008 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17    Written December 1988 by Mike Haertel.
  18    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  19    or (US mail) as Mike Haertel c/o Free Software Foundation.
  20
  21    Ørn E. Hansen added NLS support in 1997.  */
  22
  23 #include <config.h>
  24
  25 #include <getopt.h>
  26 #include <sys/types.h>
  27 #include <sys/wait.h>
  28 #include <signal.h>
  29 #include "system.h"
  30 #include "argmatch.h"
  31 #include "error.h"
  32 #include "hard-locale.h"
  33 #include "hash.h"
  34 #include "md5.h"
  35 #include "physmem.h"
  36 #include "posixver.h"
  37 #include "quote.h"
  38 #include "quotearg.h"
  39 #include "randread.h"
  40 #include "readtokens0.h"
  41 #include "stdio--.h"
  42 #include "stdlib--.h"
  43 #include "strnumcmp.h"
  44 #include "xmemcoll.h"
  45 #include "xmemxfrm.h"
  46 #include "xstrtol.h"
  47
  48 #if HAVE_SYS_RESOURCE_H
  49 # include <sys/resource.h>
  50 #endif
  51 #ifndef RLIMIT_DATA
  52 struct rlimit { size_t rlim_cur; };
  53 # define getrlimit(Resource, Rlp) (-1)
  54 #endif
  55
  56 /* The official name of this program (e.g., no `g' prefix).  */
  57 #define PROGRAM_NAME "sort"
  58
  59 #define AUTHORS \
  60   proper_name ("Mike Haertel"), \
  61   proper_name ("Paul Eggert")
  62
  63 #if HAVE_LANGINFO_CODESET
  64 # include <langinfo.h>
  65 #endif
  66
  67 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
  68    present.  */
  69 #ifndef SA_NOCLDSTOP
  70 # define SA_NOCLDSTOP 0
  71 /* No sigprocmask.  Always 'return' zero. */
  72 # define sigprocmask(How, Set, Oset) (0)
  73 # define sigset_t int
  74 # if ! HAVE_SIGINTERRUPT
  75 #  define siginterrupt(sig, flag) /* empty */
  76 # endif
  77 #endif
  78
  79 #ifndef STDC_HEADERS
  80 double strtod ();
  81 #endif
  82
  83 #define UCHAR_LIM (UCHAR_MAX + 1)
  84
  85 #ifndef DEFAULT_TMPDIR
  86 # define DEFAULT_TMPDIR "/tmp"
  87 #endif
  88
  89 /* Exit statuses.  */
  90 enum
  91   {
  92     /* POSIX says to exit with status 1 if invoked with -c and the
  93        input is not properly sorted.  */
  94     SORT_OUT_OF_ORDER = 1,
  95
  96     /* POSIX says any other irregular exit must exit with a status
  97        code greater than 1.  */
  98     SORT_FAILURE = 2
  99   };
 100
 101 enum
 102   {
 103     /* The number of times we should try to fork a compression process
 104        (we retry if the fork call fails).  We don't _need_ to compress
 105        temp files, this is just to reduce disk access, so this number
 106        can be small.  */
 107     MAX_FORK_TRIES_COMPRESS = 2,
 108
 109     /* The number of times we should try to fork a decompression process.
 110        If we can't fork a decompression process, we can't sort, so this
 111        number should be big.  */
 112     MAX_FORK_TRIES_DECOMPRESS = 8
 113   };
 114
 115 /* The representation of the decimal point in the current locale.  */
 116 static int decimal_point;
 117
 118 /* Thousands separator; if -1, then there isn't one.  */
 119 static int thousands_sep;
 120
 121 /* Nonzero if the corresponding locales are hard.  */
 122 static bool hard_LC_COLLATE;
 123 #if HAVE_NL_LANGINFO
 124 static bool hard_LC_TIME;
 125 #endif
 126
 127 #define NONZERO(x) ((x) != 0)
 128
 129 /* The kind of blanks for '-b' to skip in various options. */
 130 enum blanktype { bl_start, bl_end, bl_both };
 131
 132 /* The character marking end of line. Default to \n. */
 133 static char eolchar = '\n';
 134
 135 /* Lines are held in core as counted strings. */
 136 struct line
 137 {
 138   char *text;                   /* Text of the line. */
 139   size_t length;                /* Length including final newline. */
 140   char *keybeg;                 /* Start of first key. */
 141   char *keylim;                 /* Limit of first key. */
 142 };
 143
 144 /* Input buffers. */
 145 struct buffer
 146 {
 147   char *buf;                    /* Dynamically allocated buffer,
 148                                    partitioned into 3 regions:
 149                                    - input data;
 150                                    - unused area;
 151                                    - an array of lines, in reverse order.  */
 152   size_t used;                  /* Number of bytes used for input data.  */
 153   size_t nlines;                /* Number of lines in the line array.  */
 154   size_t alloc;                 /* Number of bytes allocated. */
 155   size_t left;                  /* Number of bytes left from previous reads. */
 156   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 157   bool eof;                     /* An EOF has been read.  */
 158 };
 159
 160 struct keyfield
 161 {
 162   size_t sword;                 /* Zero-origin 'word' to start at. */
 163   size_t schar;                 /* Additional characters to skip. */
 164   size_t eword;                 /* Zero-origin first word after field. */
 165   size_t echar;                 /* Additional characters in field. */
 166   bool const *ignore;           /* Boolean array of characters to ignore. */
 167   char const *translate;        /* Translation applied to characters. */
 168   bool skipsblanks;             /* Skip leading blanks when finding start.  */
 169   bool skipeblanks;             /* Skip leading blanks when finding end.  */
 170   bool numeric;                 /* Flag for numeric comparison.  Handle
 171                                    strings of digits with optional decimal
 172                                    point, but no exponential notation. */
 173   bool random;                  /* Sort by random hash of key.  */
 174   bool general_numeric;         /* Flag for general, numeric comparison.
 175                                    Handle numbers in exponential notation. */
 176   bool month;                   /* Flag for comparison by month name. */
 177   bool reverse;                 /* Reverse the sense of comparison. */
 178   struct keyfield *next;        /* Next keyfield to try. */
 179 };
 180
 181 struct month
 182 {
 183   char const *name;
 184   int val;
 185 };
 186
 187 /* FIXME: None of these tables work with multibyte character sets.
 188    Also, there are many other bugs when handling multibyte characters.
 189    One way to fix this is to rewrite `sort' to use wide characters
 190    internally, but doing this with good performance is a bit
 191    tricky.  */
 192
 193 /* Table of blanks.  */
 194 static bool blanks[UCHAR_LIM];
 195
 196 /* Table of non-printing characters. */
 197 static bool nonprinting[UCHAR_LIM];
 198
 199 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 200 static bool nondictionary[UCHAR_LIM];
 201
 202 /* Translation table folding lower case to upper.  */
 203 static char fold_toupper[UCHAR_LIM];
 204
 205 #define MONTHS_PER_YEAR 12
 206
 207 /* Table mapping month names to integers.
 208    Alphabetic order allows binary search. */
 209 static struct month monthtab[] =
 210 {
 211   {"APR", 4},
 212   {"AUG", 8},
 213   {"DEC", 12},
 214   {"FEB", 2},
 215   {"JAN", 1},
 216   {"JUL", 7},
 217   {"JUN", 6},
 218   {"MAR", 3},
 219   {"MAY", 5},
 220   {"NOV", 11},
 221   {"OCT", 10},
 222   {"SEP", 9}
 223 };
 224
 225 /* During the merge phase, the number of files to merge at once. */
 226 #define NMERGE_DEFAULT 16
 227
 228 /* Minimum size for a merge or check buffer.  */
 229 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 230
 231 /* Minimum sort size; the code might not work with smaller sizes.  */
 232 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
 233
 234 /* Maximum merge buffers we can theoretically support */
 235 #define MAX_NMERGE (SIZE_MAX / MIN_MERGE_BUFFER_SIZE)
 236
 237 /* The number of bytes needed for a merge or check buffer, which can
 238    function relatively efficiently even if it holds only one line.  If
 239    a longer line is seen, this value is increased.  */
 240 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 241
 242 /* The approximate maximum number of bytes of main memory to use, as
 243    specified by the user.  Zero if the user has not specified a size.  */
 244 static size_t sort_size;
 245
 246 /* The guessed size for non-regular files.  */
 247 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 248
 249 /* Array of directory names in which any temporary files are to be created. */
 250 static char const **temp_dirs;
 251
 252 /* Number of temporary directory names used.  */
 253 static size_t temp_dir_count;
 254
 255 /* Number of allocated slots in temp_dirs.  */
 256 static size_t temp_dir_alloc;
 257
 258 /* Flag to reverse the order of all comparisons. */
 259 static bool reverse;
 260
 261 /* Flag for stable sort.  This turns off the last ditch bytewise
 262    comparison of lines, and instead leaves lines in the same order
 263    they were read if all keys compare equal.  */
 264 static bool stable;
 265
 266 /* If TAB has this value, blanks separate fields.  */
 267 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 268
 269 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 270    separated by the empty string between a non-blank character and a blank
 271    character. */
 272 static int tab = TAB_DEFAULT;
 273
 274 /* Flag to remove consecutive duplicate lines from the output.
 275    Only the last of a sequence of equal lines will be output. */
 276 static bool unique;
 277
 278 /* Nonzero if any of the input files are the standard input. */
 279 static bool have_read_stdin;
 280
 281 /* List of key field comparisons to be tried.  */
 282 static struct keyfield *keylist;
 283
 284 /* Program used to (de)compress temp files.  Must accept -d.  */
 285 static char const *compress_program;
 286
 287 /* Maximum number of files to merge in one go.  If more than this
 288    number are present, temp files will be used. */
 289 static unsigned int nmerge = NMERGE_DEFAULT;
 290
 291 static void sortlines_temp (struct line *, size_t, struct line *);
 292
 293 /* Report MESSAGE for FILE, then clean up and exit.
 294    If FILE is null, it represents standard output.  */
 295
 296 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 297 static void
 298 die (char const *message, char const *file)
 299 {
 300   error (0, errno, "%s: %s", message, file ? file : _("standard output"));
 301   exit (SORT_FAILURE);
 302 }
 303
 304 void
 305 usage (int status)
 306 {
 307   if (status != EXIT_SUCCESS)
 308     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 309              program_name);
 310   else
 311     {
 312       printf (_("\
 313 Usage: %s [OPTION]... [FILE]...\n\
 314   or:  %s [OPTION]... --files0-from=F\n\
 315 "),
 316               program_name, program_name);
 317       fputs (_("\
 318 Write sorted concatenation of all FILE(s) to standard output.\n\
 319 \n\
 320 "), stdout);
 321       fputs (_("\
 322 Mandatory arguments to long options are mandatory for short options too.\n\
 323 "), stdout);
 324       fputs (_("\
 325 Ordering options:\n\
 326 \n\
 327 "), stdout);
 328       fputs (_("\
 329   -b, --ignore-leading-blanks  ignore leading blanks\n\
 330   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 331   -f, --ignore-case           fold lower case to upper case characters\n\
 332 "), stdout);
 333       fputs (_("\
 334   -g, --general-numeric-sort  compare according to general numerical value\n\
 335   -i, --ignore-nonprinting    consider only printable characters\n\
 336   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 337   -n, --numeric-sort          compare according to string numerical value\n\
 338   -R, --random-sort           sort by random hash of keys\n\
 339       --random-source=FILE    get random bytes from FILE (default /dev/urandom)\n\
 340       --sort=WORD             sort according to WORD:\n\
 341                                 general-numeric -g, month -M, numeric -n,\n\
 342                                 random -R\n\
 343   -r, --reverse               reverse the result of comparisons\n\
 344 \n\
 345 "), stdout);
 346       fputs (_("\
 347 Other options:\n\
 348 \n\
 349 "), stdout);
 350       fputs (_("\
 351       --batch-size=NMERGE   merge at most NMERGE inputs at once;\n\
 352                             for more use temp files\n\
 353 "), stdout);
 354       fputs (_("\
 355   -c, --check, --check=diagnose-first  check for sorted input; do not sort\n\
 356   -C, --check=quiet, --check=silent  like -c, but do not report first bad line\n\
 357       --compress-program=PROG  compress temporaries with PROG;\n\
 358                               decompress them with PROG -d\n\
 359       --files0-from=F       read input from the files specified by\n\
 360                             NUL-terminated names in file F\n\
 361 "), stdout);
 362       fputs (_("\
 363   -k, --key=POS1[,POS2]     start a key at POS1, end it at POS2 (origin 1)\n\
 364   -m, --merge               merge already sorted files; do not sort\n\
 365 "), stdout);
 366       fputs (_("\
 367   -o, --output=FILE         write result to FILE instead of standard output\n\
 368   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 369   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 370 "), stdout);
 371       printf (_("\
 372   -t, --field-separator=SEP  use SEP instead of non-blank to blank transition\n\
 373   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 374                               multiple options specify multiple directories\n\
 375   -u, --unique              with -c, check for strict ordering;\n\
 376                               without -c, output only the first of an equal run\n\
 377 "), DEFAULT_TMPDIR);
 378       fputs (_("\
 379   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 380 "), stdout);
 381       fputs (HELP_OPTION_DESCRIPTION, stdout);
 382       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 383       fputs (_("\
 384 \n\
 385 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 386 in the field; both are origin 1.  If neither -t nor -b is in effect, characters\n\
 387 in a field are counted from the beginning of the preceding whitespace.  OPTS is\n\
 388 one or more single-letter ordering options, which override global ordering\n\
 389 options for that key.  If no key is given, use the entire line as the key.\n\
 390 \n\
 391 SIZE may be followed by the following multiplicative suffixes:\n\
 392 "), stdout);
 393       fputs (_("\
 394 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 395 \n\
 396 With no FILE, or when FILE is -, read standard input.\n\
 397 \n\
 398 *** WARNING ***\n\
 399 The locale specified by the environment affects sort order.\n\
 400 Set LC_ALL=C to get the traditional sort order that uses\n\
 401 native byte values.\n\
 402 "), stdout );
 403       emit_bug_reporting_address ();
 404     }
 405
 406   exit (status);
 407 }
 408
 409 /* For long options that have no equivalent short option, use a
 410    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 411 enum
 412 {
 413   CHECK_OPTION = CHAR_MAX + 1,
 414   COMPRESS_PROGRAM_OPTION,
 415   FILES0_FROM_OPTION,
 416   NMERGE_OPTION,
 417   RANDOM_SOURCE_OPTION,
 418   SORT_OPTION
 419 };
 420
 421 static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uy:z";
 422
 423 static struct option const long_options[] =
 424 {
 425   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 426   {"check", optional_argument, NULL, CHECK_OPTION},
 427   {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
 428   {"dictionary-order", no_argument, NULL, 'd'},
 429   {"ignore-case", no_argument, NULL, 'f'},
 430   {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
 431   {"general-numeric-sort", no_argument, NULL, 'g'},
 432   {"ignore-nonprinting", no_argument, NULL, 'i'},
 433   {"key", required_argument, NULL, 'k'},
 434   {"merge", no_argument, NULL, 'm'},
 435   {"month-sort", no_argument, NULL, 'M'},
 436   {"numeric-sort", no_argument, NULL, 'n'},
 437   {"random-sort", no_argument, NULL, 'R'},
 438   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
 439   {"sort", required_argument, NULL, SORT_OPTION},
 440   {"output", required_argument, NULL, 'o'},
 441   {"reverse", no_argument, NULL, 'r'},
 442   {"stable", no_argument, NULL, 's'},
 443   {"batch-size", required_argument, NULL, NMERGE_OPTION},
 444   {"buffer-size", required_argument, NULL, 'S'},
 445   {"field-separator", required_argument, NULL, 't'},
 446   {"temporary-directory", required_argument, NULL, 'T'},
 447   {"unique", no_argument, NULL, 'u'},
 448   {"zero-terminated", no_argument, NULL, 'z'},
 449   {GETOPT_HELP_OPTION_DECL},
 450   {GETOPT_VERSION_OPTION_DECL},
 451   {NULL, 0, NULL, 0},
 452 };
 453
 454 static char const *const check_args[] =
 455 {
 456   "quiet", "silent", "diagnose-first", NULL
 457 };
 458 static char const check_types[] =
 459 {
 460   'C', 'C', 'c'
 461 };
 462 ARGMATCH_VERIFY (check_args, check_types);
 463
 464 static char const *const sort_args[] =
 465 {
 466   "general-numeric", "month", "numeric", "random", NULL
 467 };
 468 static char const sort_types[] =
 469 {
 470   'g', 'M', 'n', 'R'
 471 };
 472 ARGMATCH_VERIFY (sort_args, sort_types);
 473
 474 /* The set of signals that are caught.  */
 475 static sigset_t caught_signals;
 476
 477 /* Critical section status.  */
 478 struct cs_status
 479 {
 480   bool valid;
 481   sigset_t sigs;
 482 };
 483
 484 /* Enter a critical section.  */
 485 static struct cs_status
 486 cs_enter (void)
 487 {
 488   struct cs_status status;
 489   status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
 490   return status;
 491 }
 492
 493 /* Leave a critical section.  */
 494 static void
 495 cs_leave (struct cs_status status)
 496 {
 497   if (status.valid)
 498     {
 499       /* Ignore failure when restoring the signal mask. */
 500       sigprocmask (SIG_SETMASK, &status.sigs, NULL);
 501     }
 502 }
 503
 504 /* The list of temporary files. */
 505 struct tempnode
 506 {
 507   struct tempnode *volatile next;
 508   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 509   char name[1];  /* Actual size is 1 + file name length.  */
 510 };
 511 static struct tempnode *volatile temphead;
 512 static struct tempnode *volatile *temptail = &temphead;
 513
 514 struct sortfile
 515 {
 516   char const *name;
 517   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 518 };
 519
 520 /* A table where we store compression process states.  We clean up all
 521    processes in a timely manner so as not to exhaust system resources,
 522    so we store the info on whether the process is still running, or has
 523    been reaped here.  */
 524 static Hash_table *proctab;
 525
 526 enum { INIT_PROCTAB_SIZE = 47 };
 527
 528 enum procstate { ALIVE, ZOMBIE };
 529
 530 /* A proctab entry.  The COUNT field is there in case we fork a new
 531    compression process that has the same PID as an old zombie process
 532    that is still in the table (because the process to decompress the
 533    temp file it was associated with hasn't started yet).  */
 534 struct procnode
 535 {
 536   pid_t pid;
 537   enum procstate state;
 538   size_t count;
 539 };
 540
 541 static size_t
 542 proctab_hasher (const void *entry, size_t tabsize)
 543 {
 544   const struct procnode *node = entry;
 545   return node->pid % tabsize;
 546 }
 547
 548 static bool
 549 proctab_comparator (const void *e1, const void *e2)
 550 {
 551   const struct procnode *n1 = e1, *n2 = e2;
 552   return n1->pid == n2->pid;
 553 }
 554
 555 /* The total number of forked processes (compressors and decompressors)
 556    that have not been reaped yet. */
 557 static size_t nprocs;
 558
 559 /* The number of child processes we'll allow before we try to reap some. */
 560 enum { MAX_PROCS_BEFORE_REAP = 2 };
 561
 562 /* If 0 < PID, wait for the child process with that PID to exit.
 563    If PID is -1, clean up a random child process which has finished and
 564    return the process ID of that child.  If PID is -1 and no processes
 565    have quit yet, return 0 without waiting.  */
 566
 567 static pid_t
 568 reap (pid_t pid)
 569 {
 570   int status;
 571   pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);
 572
 573   if (cpid < 0)
 574     error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
 575            compress_program);
 576   else if (0 < cpid)
 577     {
 578       if (! WIFEXITED (status) || WEXITSTATUS (status))
 579         error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
 580                compress_program);
 581       --nprocs;
 582     }
 583
 584   return cpid;
 585 }
 586
 587 /* Add the PID of a running compression process to proctab, or update
 588    the entry COUNT and STATE fields if it's already there.  This also
 589    creates the table for us the first time it's called.  */
 590
 591 static void
 592 register_proc (pid_t pid)
 593 {
 594   struct procnode test, *node;
 595
 596   if (! proctab)
 597     {
 598       proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
 599                                  proctab_hasher,
 600                                  proctab_comparator,
 601                                  free);
 602       if (! proctab)
 603         xalloc_die ();
 604     }
 605
 606   test.pid = pid;
 607   node = hash_lookup (proctab, &test);
 608   if (node)
 609     {
 610       node->state = ALIVE;
 611       ++node->count;
 612     }
 613   else
 614     {
 615       node = xmalloc (sizeof *node);
 616       node->pid = pid;
 617       node->state = ALIVE;
 618       node->count = 1;
 619       hash_insert (proctab, node);
 620     }
 621 }
 622
 623 /* This is called when we reap a random process.  We don't know
 624    whether we have reaped a compression process or a decompression
 625    process until we look in the table.  If there's an ALIVE entry for
 626    it, then we have reaped a compression process, so change the state
 627    to ZOMBIE.  Otherwise, it's a decompression processes, so ignore it.  */
 628
 629 static void
 630 update_proc (pid_t pid)
 631 {
 632   struct procnode test, *node;
 633
 634   test.pid = pid;
 635   node = hash_lookup (proctab, &test);
 636   if (node)
 637     node->state = ZOMBIE;
 638 }
 639
 640 /* This is for when we need to wait for a compression process to exit.
 641    If it has a ZOMBIE entry in the table then it's already dead and has
 642    been reaped.  Note that if there's an ALIVE entry for it, it still may
 643    already have died and been reaped if a second process was created with
 644    the same PID.  This is probably exceedingly rare, but to be on the safe
 645    side we will have to wait for any compression process with this PID.  */
 646
 647 static void
 648 wait_proc (pid_t pid)
 649 {
 650   struct procnode test, *node;
 651
 652   test.pid = pid;
 653   node = hash_lookup (proctab, &test);
 654   if (node->state == ALIVE)
 655     reap (pid);
 656
 657   node->state = ZOMBIE;
 658   if (! --node->count)
 659     {
 660       hash_delete (proctab, node);
 661       free (node);
 662     }
 663 }
 664
 665 /* Keep reaping finished children as long as there are more to reap.
 666    This doesn't block waiting for any of them, it only reaps those
 667    that are already dead.  */
 668
 669 static void
 670 reap_some (void)
 671 {
 672   pid_t pid;
 673
 674   while (0 < nprocs && (pid = reap (-1)))
 675     update_proc (pid);
 676 }
 677
 678 /* Clean up any remaining temporary files.  */
 679
 680 static void
 681 cleanup (void)
 682 {
 683   struct tempnode const *node;
 684
 685   for (node = temphead; node; node = node->next)
 686     unlink (node->name);
 687   temphead = NULL;
 688 }
 689
 690 /* Cleanup actions to take when exiting.  */
 691
 692 static void
 693 exit_cleanup (void)
 694 {
 695   if (temphead)
 696     {
 697       /* Clean up any remaining temporary files in a critical section so
 698          that a signal handler does not try to clean them too.  */
 699       struct cs_status cs = cs_enter ();
 700       cleanup ();
 701       cs_leave (cs);
 702     }
 703
 704   close_stdout ();
 705 }
 706
 707 /* Create a new temporary file, returning its newly allocated tempnode.
 708    Store into *PFD the file descriptor open for writing.  */
 709
 710 static struct tempnode *
 711 create_temp_file (int *pfd)
 712 {
 713   static char const slashbase[] = "/sortXXXXXX";
 714   static size_t temp_dir_index;
 715   int fd;
 716   int saved_errno;
 717   char const *temp_dir = temp_dirs[temp_dir_index];
 718   size_t len = strlen (temp_dir);
 719   struct tempnode *node =
 720     xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
 721   char *file = node->name;
 722   struct cs_status cs;
 723
 724   memcpy (file, temp_dir, len);
 725   memcpy (file + len, slashbase, sizeof slashbase);
 726   node->next = NULL;
 727   node->pid = 0;
 728   if (++temp_dir_index == temp_dir_count)
 729     temp_dir_index = 0;
 730
 731   /* Create the temporary file in a critical section, to avoid races.  */
 732   cs = cs_enter ();
 733   fd = mkstemp (file);
 734   if (0 <= fd)
 735     {
 736       *temptail = node;
 737       temptail = &node->next;
 738     }
 739   saved_errno = errno;
 740   cs_leave (cs);
 741   errno = saved_errno;
 742
 743   if (fd < 0)
 744     die (_("cannot create temporary file"), file);
 745
 746   *pfd = fd;
 747   return node;
 748 }
 749
 750 /* Return a stream for FILE, opened with mode HOW.  A null FILE means
 751    standard output; HOW should be "w".  When opening for input, "-"
 752    means standard input.  To avoid confusion, do not return file
 753    descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
 754    opening an ordinary FILE.  */
 755
 756 static FILE *
 757 xfopen (const char *file, const char *how)
 758 {
 759   FILE *fp;
 760
 761   if (!file)
 762     fp = stdout;
 763   else if (STREQ (file, "-") && *how == 'r')
 764     {
 765       have_read_stdin = true;
 766       fp = stdin;
 767     }
 768   else
 769     {
 770       fp = fopen (file, how);
 771       if (! fp)
 772         die (_("open failed"), file);
 773     }
 774
 775   return fp;
 776 }
 777
 778 /* Close FP, whose name is FILE, and report any errors.  */
 779
 780 static void
 781 xfclose (FILE *fp, char const *file)
 782 {
 783   switch (fileno (fp))
 784     {
 785     case STDIN_FILENO:
 786       /* Allow reading stdin from tty more than once.  */
 787       if (feof (fp))
 788         clearerr (fp);
 789       break;
 790
 791     case STDOUT_FILENO:
 792       /* Don't close stdout just yet.  close_stdout does that.  */
 793       if (fflush (fp) != 0)
 794         die (_("fflush failed"), file);
 795       break;
 796
 797     default:
 798       if (fclose (fp) != 0)
 799         die (_("close failed"), file);
 800       break;
 801     }
 802 }
 803
 804 static void
 805 dup2_or_die (int oldfd, int newfd)
 806 {
 807   if (dup2 (oldfd, newfd) < 0)
 808     error (SORT_FAILURE, errno, _("dup2 failed"));
 809 }
 810
 811 /* Fork a child process for piping to and do common cleanup.  The
 812    TRIES parameter tells us how many times to try to fork before
 813    giving up.  Return the PID of the child or -1 if fork failed.  */
 814
 815 static pid_t
 816 pipe_fork (int pipefds[2], size_t tries)
 817 {
 818 #if HAVE_WORKING_FORK
 819   struct tempnode *saved_temphead;
 820   int saved_errno;
 821   unsigned int wait_retry = 1;
 822   pid_t pid IF_LINT (= -1);
 823   struct cs_status cs;
 824
 825   if (pipe (pipefds) < 0)
 826     return -1;
 827
 828   while (tries--)
 829     {
 830       /* This is so the child process won't delete our temp files
 831          if it receives a signal before exec-ing.  */
 832       cs = cs_enter ();
 833       saved_temphead = temphead;
 834       temphead = NULL;
 835
 836       pid = fork ();
 837       saved_errno = errno;
 838       if (pid)
 839         temphead = saved_temphead;
 840
 841       cs_leave (cs);
 842       errno = saved_errno;
 843
 844       if (0 <= pid || errno != EAGAIN)
 845         break;
 846       else
 847         {
 848           sleep (wait_retry);
 849           wait_retry *= 2;
 850           reap_some ();
 851         }
 852     }
 853
 854   if (pid < 0)
 855     {
 856       close (pipefds[0]);
 857       close (pipefds[1]);
 858     }
 859   else if (pid == 0)
 860     {
 861       close (STDIN_FILENO);
 862       close (STDOUT_FILENO);
 863     }
 864   else
 865     ++nprocs;
 866
 867   return pid;
 868
 869 #else  /* ! HAVE_WORKING_FORK */
 870   return -1;
 871 #endif
 872 }
 873
 874 /* Create a temporary file and start a compression program to filter output
 875    to that file.  Set *PFP to the file handle and if *PPID is non-NULL,
 876    set it to the PID of the newly-created process.  */
 877
 878 static char *
 879 create_temp (FILE **pfp, pid_t *ppid)
 880 {
 881   int tempfd;
 882   struct tempnode *node = create_temp_file (&tempfd);
 883   char *name = node->name;
 884
 885   if (compress_program)
 886     {
 887       int pipefds[2];
 888
 889       node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
 890       if (0 < node->pid)
 891         {
 892           close (tempfd);
 893           close (pipefds[0]);
 894           tempfd = pipefds[1];
 895
 896           register_proc (node->pid);
 897         }
 898       else if (node->pid == 0)
 899         {
 900           close (pipefds[1]);
 901           dup2_or_die (tempfd, STDOUT_FILENO);
 902           close (tempfd);
 903           dup2_or_die (pipefds[0], STDIN_FILENO);
 904           close (pipefds[0]);
 905
 906           if (execlp (compress_program, compress_program, (char *) NULL) < 0)
 907             error (SORT_FAILURE, errno, _("couldn't execute %s"),
 908                    compress_program);
 909         }
 910       else
 911         node->pid = 0;
 912     }
 913
 914   *pfp = fdopen (tempfd, "w");
 915   if (! *pfp)
 916     die (_("couldn't create temporary file"), name);
 917
 918   if (ppid)
 919     *ppid = node->pid;
 920
 921   return name;
 922 }
 923
 924 /* Open a compressed temp file and start a decompression process through
 925    which to filter the input.  PID must be the valid processes ID of the
 926    process used to compress the file.  */
 927
 928 static FILE *
 929 open_temp (const char *name, pid_t pid)
 930 {
 931   int tempfd, pipefds[2];
 932   pid_t child_pid;
 933   FILE *fp;
 934
 935   wait_proc (pid);
 936
 937   tempfd = open (name, O_RDONLY);
 938   if (tempfd < 0)
 939     die (_("couldn't open temporary file"), name);
 940
 941   child_pid = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
 942   if (0 < child_pid)
 943     {
 944       close (tempfd);
 945       close (pipefds[1]);
 946     }
 947   else if (child_pid == 0)
 948     {
 949       close (pipefds[0]);
 950       dup2_or_die (tempfd, STDIN_FILENO);
 951       close (tempfd);
 952       dup2_or_die (pipefds[1], STDOUT_FILENO);
 953       close (pipefds[1]);
 954
 955       if (execlp (compress_program, compress_program, "-d", (char *) NULL) < 0)
 956         error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
 957                compress_program);
 958     }
 959   else
 960     error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
 961            compress_program);
 962
 963   fp = fdopen (pipefds[0], "r");
 964   if (! fp)
 965     die (_("couldn't create temporary file"), name);
 966
 967   return fp;
 968 }
 969
 970 static void
 971 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
 972 {
 973   if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
 974     die (_("write failed"), output_file);
 975 }
 976
 977 /* Append DIR to the array of temporary directory names.  */
 978 static void
 979 add_temp_dir (char const *dir)
 980 {
 981   if (temp_dir_count == temp_dir_alloc)
 982     temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
 983
 984   temp_dirs[temp_dir_count++] = dir;
 985 }
 986
 987 /* Remove NAME from the list of temporary files.  */
 988
 989 static void
 990 zaptemp (const char *name)
 991 {
 992   struct tempnode *volatile *pnode;
 993   struct tempnode *node;
 994   struct tempnode *next;
 995   int unlink_status;
 996   int unlink_errno = 0;
 997   struct cs_status cs;
 998
 999   for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1000     continue;
1001
1002   /* Unlink the temporary file in a critical section to avoid races.  */
1003   next = node->next;
1004   cs = cs_enter ();
1005   unlink_status = unlink (name);
1006   unlink_errno = errno;
1007   *pnode = next;
1008   cs_leave (cs);
1009
1010   if (unlink_status != 0)
1011     error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1012   if (! next)
1013     temptail = pnode;
1014   free (node);
1015 }
1016
1017 #if HAVE_NL_LANGINFO
1018
1019 static int
1020 struct_month_cmp (const void *m1, const void *m2)
1021 {
1022   struct month const *month1 = m1;
1023   struct month const *month2 = m2;
1024   return strcmp (month1->name, month2->name);
1025 }
1026
1027 #endif
1028
1029 /* Initialize the character class tables. */
1030
1031 static void
1032 inittables (void)
1033 {
1034   size_t i;
1035
1036   for (i = 0; i < UCHAR_LIM; ++i)
1037     {
1038       blanks[i] = !! isblank (i);
1039       nonprinting[i] = ! isprint (i);
1040       nondictionary[i] = ! isalnum (i) && ! isblank (i);
1041       fold_toupper[i] = toupper (i);
1042     }
1043
1044 #if HAVE_NL_LANGINFO
1045   /* If we're not in the "C" locale, read different names for months.  */
1046   if (hard_LC_TIME)
1047     {
1048       for (i = 0; i < MONTHS_PER_YEAR; i++)
1049         {
1050           char const *s;
1051           size_t s_len;
1052           size_t j;
1053           char *name;
1054
1055           s = (char *) nl_langinfo (ABMON_1 + i);
1056           s_len = strlen (s);
1057           monthtab[i].name = name = xmalloc (s_len + 1);
1058           monthtab[i].val = i + 1;
1059
1060           for (j = 0; j < s_len; j++)
1061             name[j] = fold_toupper[to_uchar (s[j])];
1062           name[j] = '\0';
1063         }
1064       qsort ((void *) monthtab, MONTHS_PER_YEAR,
1065              sizeof *monthtab, struct_month_cmp);
1066     }
1067 #endif
1068 }
1069
1070 /* Specify how many inputs may be merged at once.
1071    This may be set on the command-line with the
1072    --batch-size option. */
1073 static void
1074 specify_nmerge (int oi, char c, char const *s)
1075 {
1076   uintmax_t n;
1077   struct rlimit rlimit;
1078   unsigned int max_nmerge = (unsigned int) MAX_NMERGE;
1079   enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1080
1081   /* Try to find out how many file descriptors we'll be able
1082      to open.  We need at least nmerge + 3 (STDIN_FILENO,
1083      STDOUT_FILENO and STDERR_FILENO). */
1084   if (getrlimit (RLIMIT_NOFILE, &rlimit) == 0)
1085     max_nmerge = MIN (max_nmerge, rlimit.rlim_cur - 3);
1086
1087   if (e == LONGINT_OK)
1088     {
1089       nmerge = n;
1090       if (nmerge != n)
1091         e = LONGINT_OVERFLOW;
1092       else
1093         {
1094           if (nmerge < 2)
1095             {
1096               error (0, 0, _("invalid --%s argument %s"),
1097                      long_options[oi].name, quote(s));
1098               error (SORT_FAILURE, 0,
1099                      _("minimum --%s argument is %s"),
1100                      long_options[oi].name, quote("2"));
1101             }
1102           else if (max_nmerge < nmerge)
1103             {
1104               e = LONGINT_OVERFLOW;
1105             }
1106           else
1107             return;
1108         }
1109     }
1110
1111   if (e == LONGINT_OVERFLOW)
1112     {
1113       char max_nmerge_buf[INT_BUFSIZE_BOUND (unsigned int)];
1114       uinttostr (max_nmerge, max_nmerge_buf);
1115       error (0, 0, _("--%s argument %s too large"),
1116              long_options[oi].name, quote(s));
1117       error (SORT_FAILURE, 0,
1118              _("maximum --%s argument with current rlimit is %s"),
1119              long_options[oi].name, quote (max_nmerge_buf));
1120     }
1121   else
1122     xstrtol_fatal (e, oi, c, long_options, s);
1123 }
1124
1125 /* Specify the amount of main memory to use when sorting.  */
1126 static void
1127 specify_sort_size (int oi, char c, char const *s)
1128 {
1129   uintmax_t n;
1130   char *suffix;
1131   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1132
1133   /* The default unit is KiB.  */
1134   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1135     {
1136       if (n <= UINTMAX_MAX / 1024)
1137         n *= 1024;
1138       else
1139         e = LONGINT_OVERFLOW;
1140     }
1141
1142   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
1143   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1144     switch (suffix[0])
1145       {
1146       case 'b':
1147         e = LONGINT_OK;
1148         break;
1149
1150       case '%':
1151         {
1152           double mem = physmem_total () * n / 100;
1153
1154           /* Use "<", not "<=", to avoid problems with rounding.  */
1155           if (mem < UINTMAX_MAX)
1156             {
1157               n = mem;
1158               e = LONGINT_OK;
1159             }
1160           else
1161             e = LONGINT_OVERFLOW;
1162         }
1163         break;
1164       }
1165
1166   if (e == LONGINT_OK)
1167     {
1168       /* If multiple sort sizes are specified, take the maximum, so
1169          that option order does not matter.  */
1170       if (n < sort_size)
1171         return;
1172
1173       sort_size = n;
1174       if (sort_size == n)
1175         {
1176           sort_size = MAX (sort_size, MIN_SORT_SIZE);
1177           return;
1178         }
1179
1180       e = LONGINT_OVERFLOW;
1181     }
1182
1183   xstrtol_fatal (e, oi, c, long_options, s);
1184 }
1185
1186 /* Return the default sort size.  */
1187 static size_t
1188 default_sort_size (void)
1189 {
1190   /* Let MEM be available memory or 1/8 of total memory, whichever
1191      is greater.  */
1192   double avail = physmem_available ();
1193   double total = physmem_total ();
1194   double mem = MAX (avail, total / 8);
1195   struct rlimit rlimit;
1196
1197   /* Let SIZE be MEM, but no more than the maximum object size or
1198      system resource limits.  Avoid the MIN macro here, as it is not
1199      quite right when only one argument is floating point.  Don't
1200      bother to check for values like RLIM_INFINITY since in practice
1201      they are not much less than SIZE_MAX.  */
1202   size_t size = SIZE_MAX;
1203   if (mem < size)
1204     size = mem;
1205   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1206     size = rlimit.rlim_cur;
1207 #ifdef RLIMIT_AS
1208   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1209     size = rlimit.rlim_cur;
1210 #endif
1211
1212   /* Leave a large safety margin for the above limits, as failure can
1213      occur when they are exceeded.  */
1214   size /= 2;
1215
1216 #ifdef RLIMIT_RSS
1217   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1218      Exceeding RSS is not fatal, but can be quite slow.  */
1219   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1220     size = rlimit.rlim_cur / 16 * 15;
1221 #endif
1222
1223   /* Use no less than the minimum.  */
1224   return MAX (size, MIN_SORT_SIZE);
1225 }
1226
1227 /* Return the sort buffer size to use with the input files identified
1228    by FPS and FILES, which are alternate names of the same files.
1229    NFILES gives the number of input files; NFPS may be less.  Assume
1230    that each input line requires LINE_BYTES extra bytes' worth of line
1231    information.  Do not exceed the size bound specified by the user
1232    (or a default size bound, if the user does not specify one).  */
1233
1234 static size_t
1235 sort_buffer_size (FILE *const *fps, size_t nfps,
1236                   char *const *files, size_t nfiles,
1237                   size_t line_bytes)
1238 {
1239   /* A bound on the input size.  If zero, the bound hasn't been
1240      determined yet.  */
1241   static size_t size_bound;
1242
1243   /* In the worst case, each input byte is a newline.  */
1244   size_t worst_case_per_input_byte = line_bytes + 1;
1245
1246   /* Keep enough room for one extra input line and an extra byte.
1247      This extra room might be needed when preparing to read EOF.  */
1248   size_t size = worst_case_per_input_byte + 1;
1249
1250   size_t i;
1251
1252   for (i = 0; i < nfiles; i++)
1253     {
1254       struct stat st;
1255       off_t file_size;
1256       size_t worst_case;
1257
1258       if ((i < nfps ? fstat (fileno (fps[i]), &st)
1259            : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1260            : stat (files[i], &st))
1261           != 0)
1262         die (_("stat failed"), files[i]);
1263
1264       if (S_ISREG (st.st_mode))
1265         file_size = st.st_size;
1266       else
1267         {
1268           /* The file has unknown size.  If the user specified a sort
1269              buffer size, use that; otherwise, guess the size.  */
1270           if (sort_size)
1271             return sort_size;
1272           file_size = INPUT_FILE_SIZE_GUESS;
1273         }
1274
1275       if (! size_bound)
1276         {
1277           size_bound = sort_size;
1278           if (! size_bound)
1279             size_bound = default_sort_size ();
1280         }
1281
1282       /* Add the amount of memory needed to represent the worst case
1283          where the input consists entirely of newlines followed by a
1284          single non-newline.  Check for overflow.  */
1285       worst_case = file_size * worst_case_per_input_byte + 1;
1286       if (file_size != worst_case / worst_case_per_input_byte
1287           || size_bound - size <= worst_case)
1288         return size_bound;
1289       size += worst_case;
1290     }
1291
1292   return size;
1293 }
1294
1295 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
1296    must be at least sizeof (struct line).  Allocate ALLOC bytes
1297    initially.  */
1298
1299 static void
1300 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1301 {
1302   /* Ensure that the line array is properly aligned.  If the desired
1303      size cannot be allocated, repeatedly halve it until allocation
1304      succeeds.  The smaller allocation may hurt overall performance,
1305      but that's better than failing.  */
1306   for (;;)
1307     {
1308       alloc += sizeof (struct line) - alloc % sizeof (struct line);
1309       buf->buf = malloc (alloc);
1310       if (buf->buf)
1311         break;
1312       alloc /= 2;
1313       if (alloc <= line_bytes + 1)
1314         xalloc_die ();
1315     }
1316
1317   buf->line_bytes = line_bytes;
1318   buf->alloc = alloc;
1319   buf->used = buf->left = buf->nlines = 0;
1320   buf->eof = false;
1321 }
1322
1323 /* Return one past the limit of the line array.  */
1324
1325 static inline struct line *
1326 buffer_linelim (struct buffer const *buf)
1327 {
1328   return (struct line *) (buf->buf + buf->alloc);
1329 }
1330
1331 /* Return a pointer to the first character of the field specified
1332    by KEY in LINE. */
1333
1334 static char *
1335 begfield (const struct line *line, const struct keyfield *key)
1336 {
1337   char *ptr = line->text, *lim = ptr + line->length - 1;
1338   size_t sword = key->sword;
1339   size_t schar = key->schar;
1340   size_t remaining_bytes;
1341
1342   /* The leading field separator itself is included in a field when -t
1343      is absent.  */
1344
1345   if (tab != TAB_DEFAULT)
1346     while (ptr < lim && sword--)
1347       {
1348         while (ptr < lim && *ptr != tab)
1349           ++ptr;
1350         if (ptr < lim)
1351           ++ptr;
1352       }
1353   else
1354     while (ptr < lim && sword--)
1355       {
1356         while (ptr < lim && blanks[to_uchar (*ptr)])
1357           ++ptr;
1358         while (ptr < lim && !blanks[to_uchar (*ptr)])
1359           ++ptr;
1360       }
1361
1362   if (key->skipsblanks)
1363     while (ptr < lim && blanks[to_uchar (*ptr)])
1364       ++ptr;
1365
1366   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
1367   remaining_bytes = lim - ptr;
1368   if (schar < remaining_bytes)
1369     ptr += schar;
1370   else
1371     ptr = lim;
1372
1373   return ptr;
1374 }
1375
1376 /* Return the limit of (a pointer to the first character after) the field
1377    in LINE specified by KEY. */
1378
1379 static char *
1380 limfield (const struct line *line, const struct keyfield *key)
1381 {
1382   char *ptr = line->text, *lim = ptr + line->length - 1;
1383   size_t eword = key->eword, echar = key->echar;
1384   size_t remaining_bytes;
1385
1386   /* Move PTR past EWORD fields or to one past the last byte on LINE,
1387      whichever comes first.  If there are more than EWORD fields, leave
1388      PTR pointing at the beginning of the field having zero-based index,
1389      EWORD.  If a delimiter character was specified (via -t), then that
1390      `beginning' is the first character following the delimiting TAB.
1391      Otherwise, leave PTR pointing at the first `blank' character after
1392      the preceding field.  */
1393   if (tab != TAB_DEFAULT)
1394     while (ptr < lim && eword--)
1395       {
1396         while (ptr < lim && *ptr != tab)
1397           ++ptr;
1398         if (ptr < lim && (eword | echar))
1399           ++ptr;
1400       }
1401   else
1402     while (ptr < lim && eword--)
1403       {
1404         while (ptr < lim && blanks[to_uchar (*ptr)])
1405           ++ptr;
1406         while (ptr < lim && !blanks[to_uchar (*ptr)])
1407           ++ptr;
1408       }
1409
1410 #ifdef POSIX_UNSPECIFIED
1411   /* The following block of code makes GNU sort incompatible with
1412      standard Unix sort, so it's ifdef'd out for now.
1413      The POSIX spec isn't clear on how to interpret this.
1414      FIXME: request clarification.
1415
1416      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1417      Date: Thu, 30 May 96 12:20:41 -0400
1418      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1419
1420      [...]I believe I've found another bug in `sort'.
1421
1422      $ cat /tmp/sort.in
1423      a b c 2 d
1424      pq rs 1 t
1425      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1426      a b c 2 d
1427      pq rs 1 t
1428      $ /bin/sort -k1.7,1.7 </tmp/sort.in
1429      pq rs 1 t
1430      a b c 2 d
1431
1432      Unix sort produced the answer I expected: sort on the single character
1433      in column 7.  GNU sort produced different results, because it disagrees
1434      on the interpretation of the key-end spec "M.N".  Unix sort reads this
1435      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1436      "skip M-1 fields, then either N-1 characters or the rest of the current
1437      field, whichever comes first".  This extra clause applies only to
1438      key-ends, not key-starts.
1439      */
1440
1441   /* Make LIM point to the end of (one byte past) the current field.  */
1442   if (tab != TAB_DEFAULT)
1443     {
1444       char *newlim;
1445       newlim = memchr (ptr, tab, lim - ptr);
1446       if (newlim)
1447         lim = newlim;
1448     }
1449   else
1450     {
1451       char *newlim;
1452       newlim = ptr;
1453       while (newlim < lim && blanks[to_uchar (*newlim)])
1454         ++newlim;
1455       while (newlim < lim && !blanks[to_uchar (*newlim)])
1456         ++newlim;
1457       lim = newlim;
1458     }
1459 #endif
1460
1461   /* If we're ignoring leading blanks when computing the End
1462      of the field, don't start counting bytes until after skipping
1463      past any leading blanks. */
1464   if (key->skipeblanks)
1465     while (ptr < lim && blanks[to_uchar (*ptr)])
1466       ++ptr;
1467
1468   /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
1469   remaining_bytes = lim - ptr;
1470   if (echar < remaining_bytes)
1471     ptr += echar;
1472   else
1473     ptr = lim;
1474
1475   return ptr;
1476 }
1477
1478 /* Fill BUF reading from FP, moving buf->left bytes from the end
1479    of buf->buf to the beginning first.  If EOF is reached and the
1480    file wasn't terminated by a newline, supply one.  Set up BUF's line
1481    table too.  FILE is the name of the file corresponding to FP.
1482    Return true if some input was read.  */
1483
1484 static bool
1485 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1486 {
1487   struct keyfield const *key = keylist;
1488   char eol = eolchar;
1489   size_t line_bytes = buf->line_bytes;
1490   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1491
1492   if (buf->eof)
1493     return false;
1494
1495   if (buf->used != buf->left)
1496     {
1497       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1498       buf->used = buf->left;
1499       buf->nlines = 0;
1500     }
1501
1502   for (;;)
1503     {
1504       char *ptr = buf->buf + buf->used;
1505       struct line *linelim = buffer_linelim (buf);
1506       struct line *line = linelim - buf->nlines;
1507       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1508       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1509
1510       while (line_bytes + 1 < avail)
1511         {
1512           /* Read as many bytes as possible, but do not read so many
1513              bytes that there might not be enough room for the
1514              corresponding line array.  The worst case is when the
1515              rest of the input file consists entirely of newlines,
1516              except that the last byte is not a newline.  */
1517           size_t readsize = (avail - 1) / (line_bytes + 1);
1518           size_t bytes_read = fread (ptr, 1, readsize, fp);
1519           char *ptrlim = ptr + bytes_read;
1520           char *p;
1521           avail -= bytes_read;
1522
1523           if (bytes_read != readsize)
1524             {
1525               if (ferror (fp))
1526                 die (_("read failed"), file);
1527               if (feof (fp))
1528                 {
1529                   buf->eof = true;
1530                   if (buf->buf == ptrlim)
1531                     return false;
1532                   if (ptrlim[-1] != eol)
1533                     *ptrlim++ = eol;
1534                 }
1535             }
1536
1537           /* Find and record each line in the just-read input.  */
1538           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1539             {
1540               ptr = p + 1;
1541               line--;
1542               line->text = line_start;
1543               line->length = ptr - line_start;
1544               mergesize = MAX (mergesize, line->length);
1545               avail -= line_bytes;
1546
1547               if (key)
1548                 {
1549                   /* Precompute the position of the first key for
1550                      efficiency.  */
1551                   line->keylim = (key->eword == SIZE_MAX
1552                                   ? p
1553                                   : limfield (line, key));
1554
1555                   if (key->sword != SIZE_MAX)
1556                     line->keybeg = begfield (line, key);
1557                   else
1558                     {
1559                       if (key->skipsblanks)
1560                         while (blanks[to_uchar (*line_start)])
1561                           line_start++;
1562                       line->keybeg = line_start;
1563                     }
1564                 }
1565
1566               line_start = ptr;
1567             }
1568
1569           ptr = ptrlim;
1570           if (buf->eof)
1571             break;
1572         }
1573
1574       buf->used = ptr - buf->buf;
1575       buf->nlines = buffer_linelim (buf) - line;
1576       if (buf->nlines != 0)
1577         {
1578           buf->left = ptr - line_start;
1579           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1580           return true;
1581         }
1582
1583       {
1584         /* The current input line is too long to fit in the buffer.
1585            Double the buffer size and try again, keeping it properly
1586            aligned.  */
1587         size_t line_alloc = buf->alloc / sizeof (struct line);
1588         buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1589         buf->alloc = line_alloc * sizeof (struct line);
1590       }
1591     }
1592 }
1593
1594 /* Compare strings A and B as numbers without explicitly converting them to
1595    machine numbers.  Comparatively slow for short strings, but asymptotically
1596    hideously fast. */
1597
1598 static int
1599 numcompare (const char *a, const char *b)
1600 {
1601   while (blanks[to_uchar (*a)])
1602     a++;
1603   while (blanks[to_uchar (*b)])
1604     b++;
1605
1606   return strnumcmp (a, b, decimal_point, thousands_sep);
1607 }
1608
1609 static int
1610 general_numcompare (const char *sa, const char *sb)
1611 {
1612   /* FIXME: add option to warn about failed conversions.  */
1613   /* FIXME: maybe add option to try expensive FP conversion
1614      only if A and B can't be compared more cheaply/accurately.  */
1615
1616   char *ea;
1617   char *eb;
1618   double a = strtod (sa, &ea);
1619   double b = strtod (sb, &eb);
1620
1621   /* Put conversion errors at the start of the collating sequence.  */
1622   if (sa == ea)
1623     return sb == eb ? 0 : -1;
1624   if (sb == eb)
1625     return 1;
1626
1627   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1628      conversion errors but before numbers; sort them by internal
1629      bit-pattern, for lack of a more portable alternative.  */
1630   return (a < b ? -1
1631           : a > b ? 1
1632           : a == b ? 0
1633           : b == b ? -1
1634           : a == a ? 1
1635           : memcmp ((char *) &a, (char *) &b, sizeof a));
1636 }
1637
1638 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1639    Return 0 if the name in S is not recognized.  */
1640
1641 static int
1642 getmonth (char const *month, size_t len)
1643 {
1644   size_t lo = 0;
1645   size_t hi = MONTHS_PER_YEAR;
1646   char const *monthlim = month + len;
1647
1648   for (;;)
1649     {
1650       if (month == monthlim)
1651         return 0;
1652       if (!blanks[to_uchar (*month)])
1653         break;
1654       ++month;
1655     }
1656
1657   do
1658     {
1659       size_t ix = (lo + hi) / 2;
1660       char const *m = month;
1661       char const *n = monthtab[ix].name;
1662
1663       for (;; m++, n++)
1664         {
1665           if (!*n)
1666             return monthtab[ix].val;
1667           if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1668             {
1669               hi = ix;
1670               break;
1671             }
1672           else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1673             {
1674               lo = ix + 1;
1675               break;
1676             }
1677         }
1678     }
1679   while (lo < hi);
1680
1681   return 0;
1682 }
1683
1684 /* A source of random data.  */
1685 static struct randread_source *randread_source;
1686
1687 /* Return the Ith randomly-generated state.  The caller must invoke
1688    random_state (H) for all H less than I before invoking random_state
1689    (I).  */
1690
1691 static struct md5_ctx
1692 random_state (size_t i)
1693 {
1694   /* An array of states resulting from the random data, and counts of
1695      its used and allocated members.  */
1696   static struct md5_ctx *state;
1697   static size_t used;
1698   static size_t allocated;
1699
1700   struct md5_ctx *s = &state[i];
1701
1702   if (used <= i)
1703     {
1704       unsigned char buf[MD5_DIGEST_SIZE];
1705
1706       used++;
1707
1708       if (allocated <= i)
1709         {
1710           state = X2NREALLOC (state, &allocated);
1711           s = &state[i];
1712         }
1713
1714       randread (randread_source, buf, sizeof buf);
1715       md5_init_ctx (s);
1716       md5_process_bytes (buf, sizeof buf, s);
1717     }
1718
1719   return *s;
1720 }
1721
1722 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1723    with length LENGTHB.  Return negative if less, zero if equal,
1724    positive if greater.  */
1725
1726 static int
1727 cmp_hashes (char const *texta, size_t lena,
1728             char const *textb, size_t lenb)
1729 {
1730   /* Try random hashes until a pair of hashes disagree.  But if the
1731      first pair of random hashes agree, check whether the keys are
1732      identical and if so report no difference.  */
1733   int diff;
1734   size_t i;
1735   for (i = 0; ; i++)
1736     {
1737       uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
1738       struct md5_ctx s[2];
1739       s[0] = s[1] = random_state (i);
1740       md5_process_bytes (texta, lena, &s[0]);  md5_finish_ctx (&s[0], dig[0]);
1741       md5_process_bytes (textb, lenb, &s[1]);  md5_finish_ctx (&s[1], dig[1]);
1742       diff = memcmp (dig[0], dig[1], sizeof dig[0]);
1743       if (diff != 0)
1744         break;
1745       if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
1746         break;
1747     }
1748
1749   return diff;
1750 }
1751
1752 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1753    using one or more random hash functions.  */
1754
1755 static int
1756 compare_random (char *restrict texta, size_t lena,
1757                 char *restrict textb, size_t lenb)
1758 {
1759   int diff;
1760
1761   if (! hard_LC_COLLATE)
1762     diff = cmp_hashes (texta, lena, textb, lenb);
1763   else
1764     {
1765       /* Transform the text into the basis of comparison, so that byte
1766          strings that would otherwise considered to be equal are
1767          considered equal here even if their bytes differ.  */
1768
1769       char *buf = NULL;
1770       char stackbuf[4000];
1771       size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
1772       bool a_fits = tlena <= sizeof stackbuf;
1773       size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
1774                                (a_fits ? sizeof stackbuf - tlena : 0),
1775                                textb, lenb);
1776
1777       if (a_fits && tlena + tlenb <= sizeof stackbuf)
1778         buf = stackbuf;
1779       else
1780         {
1781           /* Adding 1 to the buffer size lets xmemxfrm run a bit
1782              faster by avoiding the need for an extra buffer copy.  */
1783           buf = xmalloc (tlena + tlenb + 1);
1784           xmemxfrm (buf, tlena + 1, texta, lena);
1785           xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
1786         }
1787
1788       diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
1789
1790       if (buf != stackbuf)
1791         free (buf);
1792     }
1793
1794   return diff;
1795 }
1796
1797 /* Compare two lines A and B trying every key in sequence until there
1798    are no more keys or a difference is found. */
1799
1800 static int
1801 keycompare (const struct line *a, const struct line *b)
1802 {
1803   struct keyfield const *key = keylist;
1804
1805   /* For the first iteration only, the key positions have been
1806      precomputed for us. */
1807   char *texta = a->keybeg;
1808   char *textb = b->keybeg;
1809   char *lima = a->keylim;
1810   char *limb = b->keylim;
1811
1812   int diff;
1813
1814   for (;;)
1815     {
1816       char const *translate = key->translate;
1817       bool const *ignore = key->ignore;
1818
1819       /* Find the lengths. */
1820       size_t lena = lima <= texta ? 0 : lima - texta;
1821       size_t lenb = limb <= textb ? 0 : limb - textb;
1822
1823       /* Actually compare the fields. */
1824
1825       if (key->random)
1826         diff = compare_random (texta, lena, textb, lenb);
1827       else if (key->numeric | key->general_numeric)
1828         {
1829           char savea = *lima, saveb = *limb;
1830
1831           *lima = *limb = '\0';
1832           diff = ((key->numeric ? numcompare : general_numcompare)
1833                   (texta, textb));
1834           *lima = savea, *limb = saveb;
1835         }
1836       else if (key->month)
1837         diff = getmonth (texta, lena) - getmonth (textb, lenb);
1838       /* Sorting like this may become slow, so in a simple locale the user
1839          can select a faster sort that is similar to ascii sort.  */
1840       else if (hard_LC_COLLATE)
1841         {
1842           if (ignore || translate)
1843             {
1844               char buf[4000];
1845               size_t size = lena + 1 + lenb + 1;
1846               char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1847               char *copy_b = copy_a + lena + 1;
1848               size_t new_len_a, new_len_b, i;
1849
1850               /* Ignore and/or translate chars before comparing.  */
1851               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1852                 {
1853                   if (i < lena)
1854                     {
1855                       copy_a[new_len_a] = (translate
1856                                            ? translate[to_uchar (texta[i])]
1857                                            : texta[i]);
1858                       if (!ignore || !ignore[to_uchar (texta[i])])
1859                         ++new_len_a;
1860                     }
1861                   if (i < lenb)
1862                     {
1863                       copy_b[new_len_b] = (translate
1864                                            ? translate[to_uchar (textb[i])]
1865                                            : textb [i]);
1866                       if (!ignore || !ignore[to_uchar (textb[i])])
1867                         ++new_len_b;
1868                     }
1869                 }
1870
1871               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1872
1873               if (sizeof buf < size)
1874                 free (copy_a);
1875             }
1876           else if (lena == 0)
1877             diff = - NONZERO (lenb);
1878           else if (lenb == 0)
1879             goto greater;
1880           else
1881             diff = xmemcoll (texta, lena, textb, lenb);
1882         }
1883       else if (ignore)
1884         {
1885 #define CMP_WITH_IGNORE(A, B)                                           \
1886   do                                                                    \
1887     {                                                                   \
1888           for (;;)                                                      \
1889             {                                                           \
1890               while (texta < lima && ignore[to_uchar (*texta)])         \
1891                 ++texta;                                                \
1892               while (textb < limb && ignore[to_uchar (*textb)])         \
1893                 ++textb;                                                \
1894               if (! (texta < lima && textb < limb))                     \
1895                 break;                                                  \
1896               diff = to_uchar (A) - to_uchar (B);                       \
1897               if (diff)                                                 \
1898                 goto not_equal;                                         \
1899               ++texta;                                                  \
1900               ++textb;                                                  \
1901             }                                                           \
1902                                                                         \
1903           diff = (texta < lima) - (textb < limb);                       \
1904     }                                                                   \
1905   while (0)
1906
1907           if (translate)
1908             CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1909                              translate[to_uchar (*textb)]);
1910           else
1911             CMP_WITH_IGNORE (*texta, *textb);
1912         }
1913       else if (lena == 0)
1914         diff = - NONZERO (lenb);
1915       else if (lenb == 0)
1916         goto greater;
1917       else
1918         {
1919           if (translate)
1920             {
1921               while (texta < lima && textb < limb)
1922                 {
1923                   diff = (to_uchar (translate[to_uchar (*texta++)])
1924                           - to_uchar (translate[to_uchar (*textb++)]));
1925                   if (diff)
1926                     goto not_equal;
1927                 }
1928             }
1929           else
1930             {
1931               diff = memcmp (texta, textb, MIN (lena, lenb));
1932               if (diff)
1933                 goto not_equal;
1934             }
1935           diff = lena < lenb ? -1 : lena != lenb;
1936         }
1937
1938       if (diff)
1939         goto not_equal;
1940
1941       key = key->next;
1942       if (! key)
1943         break;
1944
1945       /* Find the beginning and limit of the next field.  */
1946       if (key->eword != SIZE_MAX)
1947         lima = limfield (a, key), limb = limfield (b, key);
1948       else
1949         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1950
1951       if (key->sword != SIZE_MAX)
1952         texta = begfield (a, key), textb = begfield (b, key);
1953       else
1954         {
1955           texta = a->text, textb = b->text;
1956           if (key->skipsblanks)
1957             {
1958               while (texta < lima && blanks[to_uchar (*texta)])
1959                 ++texta;
1960               while (textb < limb && blanks[to_uchar (*textb)])
1961                 ++textb;
1962             }
1963         }
1964     }
1965
1966   return 0;
1967
1968  greater:
1969   diff = 1;
1970  not_equal:
1971   return key->reverse ? -diff : diff;
1972 }
1973
1974 /* Compare two lines A and B, returning negative, zero, or positive
1975    depending on whether A compares less than, equal to, or greater than B. */
1976
1977 static int
1978 compare (const struct line *a, const struct line *b)
1979 {
1980   int diff;
1981   size_t alen, blen;
1982
1983   /* First try to compare on the specified keys (if any).
1984      The only two cases with no key at all are unadorned sort,
1985      and unadorned sort -r. */
1986   if (keylist)
1987     {
1988       diff = keycompare (a, b);
1989       if (diff | unique | stable)
1990         return diff;
1991     }
1992
1993   /* If the keys all compare equal (or no keys were specified)
1994      fall through to the default comparison.  */
1995   alen = a->length - 1, blen = b->length - 1;
1996
1997   if (alen == 0)
1998     diff = - NONZERO (blen);
1999   else if (blen == 0)
2000     diff = 1;
2001   else if (hard_LC_COLLATE)
2002     diff = xmemcoll (a->text, alen, b->text, blen);
2003   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2004     diff = alen < blen ? -1 : alen != blen;
2005
2006   return reverse ? -diff : diff;
2007 }
2008
2009 /* Check that the lines read from FILE_NAME come in order.  Return
2010    true if they are in order.  If CHECKONLY == 'c', also print a
2011    diagnostic (FILE_NAME, line number, contents of line) to stderr if
2012    they are not in order.  */
2013
2014 static bool
2015 check (char const *file_name, char checkonly)
2016 {
2017   FILE *fp = xfopen (file_name, "r");
2018   struct buffer buf;            /* Input buffer. */
2019   struct line temp;             /* Copy of previous line. */
2020   size_t alloc = 0;
2021   uintmax_t line_number = 0;
2022   struct keyfield const *key = keylist;
2023   bool nonunique = ! unique;
2024   bool ordered = true;
2025
2026   initbuf (&buf, sizeof (struct line),
2027            MAX (merge_buffer_size, sort_size));
2028   temp.text = NULL;
2029
2030   while (fillbuf (&buf, fp, file_name))
2031     {
2032       struct line const *line = buffer_linelim (&buf);
2033       struct line const *linebase = line - buf.nlines;
2034
2035       /* Make sure the line saved from the old buffer contents is
2036          less than or equal to the first line of the new buffer. */
2037       if (alloc && nonunique <= compare (&temp, line - 1))
2038         {
2039         found_disorder:
2040           {
2041             if (checkonly == 'c')
2042               {
2043                 struct line const *disorder_line = line - 1;
2044                 uintmax_t disorder_line_number =
2045                   buffer_linelim (&buf) - disorder_line + line_number;
2046                 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
2047                 fprintf (stderr, _("%s: %s:%s: disorder: "),
2048                          program_name, file_name,
2049                          umaxtostr (disorder_line_number, hr_buf));
2050                 write_bytes (disorder_line->text, disorder_line->length,
2051                              stderr, _("standard error"));
2052               }
2053
2054             ordered = false;
2055             break;
2056           }
2057         }
2058
2059       /* Compare each line in the buffer with its successor.  */
2060       while (linebase < --line)
2061         if (nonunique <= compare (line, line - 1))
2062           goto found_disorder;
2063
2064       line_number += buf.nlines;
2065
2066       /* Save the last line of the buffer.  */
2067       if (alloc < line->length)
2068         {
2069           do
2070             {
2071               alloc *= 2;
2072               if (! alloc)
2073                 {
2074                   alloc = line->length;
2075                   break;
2076                 }
2077             }
2078           while (alloc < line->length);
2079
2080           temp.text = xrealloc (temp.text, alloc);
2081         }
2082       memcpy (temp.text, line->text, line->length);
2083       temp.length = line->length;
2084       if (key)
2085         {
2086           temp.keybeg = temp.text + (line->keybeg - line->text);
2087           temp.keylim = temp.text + (line->keylim - line->text);
2088         }
2089     }
2090
2091   xfclose (fp, file_name);
2092   free (buf.buf);
2093   free (temp.text);
2094   return ordered;
2095 }
2096
2097 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
2098    files (all of which are at the start of the FILES array), and
2099    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2100    Close input and output files before returning.
2101    OUTPUT_FILE gives the name of the output file.  If it is NULL,
2102    the output file is standard output.  If OFP is NULL, the output
2103    file has not been opened yet (or written to, if standard output).  */
2104
2105 static void
2106 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2107           FILE *ofp, char const *output_file)
2108 {
2109   FILE **fps = xnmalloc (nmerge, sizeof *fps);
2110                                 /* Input streams for each file.  */
2111   struct buffer *buffer = xnmalloc (nmerge, sizeof *buffer);
2112                                 /* Input buffers for each file. */
2113   struct line saved;            /* Saved line storage for unique check. */
2114   struct line const *savedline = NULL;
2115                                 /* &saved if there is a saved line. */
2116   size_t savealloc = 0;         /* Size allocated for the saved line. */
2117   struct line const **cur = xnmalloc (nmerge, sizeof *cur);
2118                                 /* Current line in each line table. */
2119   struct line const **base = xnmalloc (nmerge, sizeof *base);
2120                                 /* Base of each line table.  */
2121   size_t *ord = xnmalloc (nmerge, sizeof *ord);
2122                                 /* Table representing a permutation of fps,
2123                                    such that cur[ord[0]] is the smallest line
2124                                    and will be next output. */
2125   size_t i;
2126   size_t j;
2127   size_t t;
2128   struct keyfield const *key = keylist;
2129   saved.text = NULL;
2130
2131   /* Read initial lines from each input file. */
2132   for (i = 0; i < nfiles; )
2133     {
2134       fps[i] = (files[i].pid
2135                 ? open_temp (files[i].name, files[i].pid)
2136                 : xfopen (files[i].name, "r"));
2137       initbuf (&buffer[i], sizeof (struct line),
2138                MAX (merge_buffer_size, sort_size / nfiles));
2139       if (fillbuf (&buffer[i], fps[i], files[i].name))
2140         {
2141           struct line const *linelim = buffer_linelim (&buffer[i]);
2142           cur[i] = linelim - 1;
2143           base[i] = linelim - buffer[i].nlines;
2144           i++;
2145         }
2146       else
2147         {
2148           /* fps[i] is empty; eliminate it from future consideration.  */
2149           xfclose (fps[i], files[i].name);
2150           if (i < ntemps)
2151             {
2152               ntemps--;
2153               zaptemp (files[i].name);
2154             }
2155           free (buffer[i].buf);
2156           --nfiles;
2157           for (j = i; j < nfiles; ++j)
2158             files[j] = files[j + 1];
2159         }
2160     }
2161
2162   if (! ofp)
2163     ofp = xfopen (output_file, "w");
2164
2165   /* Set up the ord table according to comparisons among input lines.
2166      Since this only reorders two items if one is strictly greater than
2167      the other, it is stable. */
2168   for (i = 0; i < nfiles; ++i)
2169     ord[i] = i;
2170   for (i = 1; i < nfiles; ++i)
2171     if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2172       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2173
2174   /* Repeatedly output the smallest line until no input remains. */
2175   while (nfiles)
2176     {
2177       struct line const *smallest = cur[ord[0]];
2178
2179       /* If uniquified output is turned on, output only the first of
2180          an identical series of lines. */
2181       if (unique)
2182         {
2183           if (savedline && compare (savedline, smallest))
2184             {
2185               savedline = NULL;
2186               write_bytes (saved.text, saved.length, ofp, output_file);
2187             }
2188           if (!savedline)
2189             {
2190               savedline = &saved;
2191               if (savealloc < smallest->length)
2192                 {
2193                   do
2194                     if (! savealloc)
2195                       {
2196                         savealloc = smallest->length;
2197                         break;
2198                       }
2199                   while ((savealloc *= 2) < smallest->length);
2200
2201                   saved.text = xrealloc (saved.text, savealloc);
2202                 }
2203               saved.length = smallest->length;
2204               memcpy (saved.text, smallest->text, saved.length);
2205               if (key)
2206                 {
2207                   saved.keybeg =
2208                     saved.text + (smallest->keybeg - smallest->text);
2209                   saved.keylim =
2210                     saved.text + (smallest->keylim - smallest->text);
2211                 }
2212             }
2213         }
2214       else
2215         write_bytes (smallest->text, smallest->length, ofp, output_file);
2216
2217       /* Check if we need to read more lines into core. */
2218       if (base[ord[0]] < smallest)
2219         cur[ord[0]] = smallest - 1;
2220       else
2221         {
2222           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2223             {
2224               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2225               cur[ord[0]] = linelim - 1;
2226               base[ord[0]] = linelim - buffer[ord[0]].nlines;
2227             }
2228           else
2229             {
2230               /* We reached EOF on fps[ord[0]].  */
2231               for (i = 1; i < nfiles; ++i)
2232                 if (ord[i] > ord[0])
2233                   --ord[i];
2234               --nfiles;
2235               xfclose (fps[ord[0]], files[ord[0]].name);
2236               if (ord[0] < ntemps)
2237                 {
2238                   ntemps--;
2239                   zaptemp (files[ord[0]].name);
2240                 }
2241               free (buffer[ord[0]].buf);
2242               for (i = ord[0]; i < nfiles; ++i)
2243                 {
2244                   fps[i] = fps[i + 1];
2245                   files[i] = files[i + 1];
2246                   buffer[i] = buffer[i + 1];
2247                   cur[i] = cur[i + 1];
2248                   base[i] = base[i + 1];
2249                 }
2250               for (i = 0; i < nfiles; ++i)
2251                 ord[i] = ord[i + 1];
2252               continue;
2253             }
2254         }
2255
2256       /* The new line just read in may be larger than other lines
2257          already in main memory; push it back in the queue until we
2258          encounter a line larger than it.  Optimize for the common
2259          case where the new line is smallest.  */
2260       {
2261         size_t lo = 1;
2262         size_t hi = nfiles;
2263         size_t probe = lo;
2264         size_t ord0 = ord[0];
2265         size_t count_of_smaller_lines;
2266
2267         while (lo < hi)
2268           {
2269             int cmp = compare (cur[ord0], cur[ord[probe]]);
2270             if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2271               hi = probe;
2272             else
2273               lo = probe + 1;
2274             probe = (lo + hi) / 2;
2275           }
2276
2277         count_of_smaller_lines = lo - 1;
2278         for (j = 0; j < count_of_smaller_lines; j++)
2279           ord[j] = ord[j + 1];
2280         ord[count_of_smaller_lines] = ord0;
2281       }
2282
2283       /* Free up some resources every once in a while.  */
2284       if (MAX_PROCS_BEFORE_REAP < nprocs)
2285         reap_some ();
2286     }
2287
2288   if (unique && savedline)
2289     {
2290       write_bytes (saved.text, saved.length, ofp, output_file);
2291       free (saved.text);
2292     }
2293
2294   xfclose (ofp, output_file);
2295   free(fps);
2296   free(buffer);
2297   free(ord);
2298   free(base);
2299   free(cur);
2300 }
2301
2302 /* Merge into T the two sorted arrays of lines LO (with NLO members)
2303    and HI (with NHI members).  T, LO, and HI point just past their
2304    respective arrays, and the arrays are in reverse order.  NLO and
2305    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
2306
2307 static inline void
2308 mergelines (struct line *t,
2309             struct line const *lo, size_t nlo,
2310             struct line const *hi, size_t nhi)
2311 {
2312   for (;;)
2313     if (compare (lo - 1, hi - 1) <= 0)
2314       {
2315         *--t = *--lo;
2316         if (! --nlo)
2317           {
2318             /* HI - NHI equalled T - (NLO + NHI) when this function
2319                began.  Therefore HI must equal T now, and there is no
2320                need to copy from HI to T.  */
2321             return;
2322           }
2323       }
2324     else
2325       {
2326         *--t = *--hi;
2327         if (! --nhi)
2328           {
2329             do
2330               *--t = *--lo;
2331             while (--nlo);
2332
2333             return;
2334           }
2335       }
2336 }
2337
2338 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
2339    NLINES must be at least 2.
2340    The input and output arrays are in reverse order, and LINES and
2341    TEMP point just past the end of their respective arrays.
2342
2343    Use a recursive divide-and-conquer algorithm, in the style
2344    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
2345    the optimization suggested by exercise 5.2.4-10; this requires room
2346    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
2347    writes that this memory optimization was originally published by
2348    D. A. Bell, Comp J. 1 (1958), 75.  */
2349
2350 static void
2351 sortlines (struct line *lines, size_t nlines, struct line *temp)
2352 {
2353   if (nlines == 2)
2354     {
2355       if (0 < compare (&lines[-1], &lines[-2]))
2356         {
2357           struct line tmp = lines[-1];
2358           lines[-1] = lines[-2];
2359           lines[-2] = tmp;
2360         }
2361     }
2362   else
2363     {
2364       size_t nlo = nlines / 2;
2365       size_t nhi = nlines - nlo;
2366       struct line *lo = lines;
2367       struct line *hi = lines - nlo;
2368       struct line *sorted_lo = temp;
2369
2370       sortlines (hi, nhi, temp);
2371       if (1 < nlo)
2372         sortlines_temp (lo, nlo, sorted_lo);
2373       else
2374         sorted_lo[-1] = lo[-1];
2375
2376       mergelines (lines, sorted_lo, nlo, hi, nhi);
2377     }
2378 }
2379
2380 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
2381    rather than sorting in place.  */
2382
2383 static void
2384 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
2385 {
2386   if (nlines == 2)
2387     {
2388       /* Declare `swap' as int, not bool, to work around a bug
2389          <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
2390          in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
2391       int swap = (0 < compare (&lines[-1], &lines[-2]));
2392       temp[-1] = lines[-1 - swap];
2393       temp[-2] = lines[-2 + swap];
2394     }
2395   else
2396     {
2397       size_t nlo = nlines / 2;
2398       size_t nhi = nlines - nlo;
2399       struct line *lo = lines;
2400       struct line *hi = lines - nlo;
2401       struct line *sorted_hi = temp - nlo;
2402
2403       sortlines_temp (hi, nhi, sorted_hi);
2404       if (1 < nlo)
2405         sortlines (lo, nlo, temp);
2406
2407       mergelines (temp, lo, nlo, sorted_hi, nhi);
2408     }
2409 }
2410
2411 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
2412    the same as OUTFILE.  If found, merge the found instances (and perhaps
2413    some other files) into a temporary file so that it can in turn be
2414    merged into OUTFILE without destroying OUTFILE before it is completely
2415    read.  Return the new value of NFILES, which differs from the old if
2416    some merging occurred.
2417
2418    This test ensures that an otherwise-erroneous use like
2419    "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
2420    It's not clear that POSIX requires this nicety.
2421    Detect common error cases, but don't try to catch obscure cases like
2422    "cat ... FILE ... | sort -m -o FILE"
2423    where traditional "sort" doesn't copy the input and where
2424    people should know that they're getting into trouble anyway.
2425    Catching these obscure cases would slow down performance in
2426    common cases.  */
2427
2428 static size_t
2429 avoid_trashing_input (struct sortfile *files, size_t ntemps,
2430                       size_t nfiles, char const *outfile)
2431 {
2432   size_t i;
2433   bool got_outstat = false;
2434   struct stat outstat;
2435
2436   for (i = ntemps; i < nfiles; i++)
2437     {
2438       bool is_stdin = STREQ (files[i].name, "-");
2439       bool same;
2440       struct stat instat;
2441
2442       if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
2443         same = true;
2444       else
2445         {
2446           if (! got_outstat)
2447             {
2448               if ((outfile
2449                    ? stat (outfile, &outstat)
2450                    : fstat (STDOUT_FILENO, &outstat))
2451                   != 0)
2452                 break;
2453               got_outstat = true;
2454             }
2455
2456           same = (((is_stdin
2457                     ? fstat (STDIN_FILENO, &instat)
2458                     : stat (files[i].name, &instat))
2459                    == 0)
2460                   && SAME_INODE (instat, outstat));
2461         }
2462
2463       if (same)
2464         {
2465           FILE *tftp;
2466           pid_t pid;
2467           char *temp = create_temp (&tftp, &pid);
2468           mergefps (&files[i],0, nfiles - i, tftp, temp);
2469           files[i].name = temp;
2470           files[i].pid = pid;
2471           return i + 1;
2472         }
2473     }
2474
2475   return nfiles;
2476 }
2477
2478 /* Merge the input FILES.  NTEMPS is the number of files at the
2479    start of FILES that are temporary; it is zero at the top level.
2480    NFILES is the total number of files.  Put the output in
2481    OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
2482
2483 static void
2484 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
2485        char const *output_file)
2486 {
2487   while (nmerge < nfiles)
2488     {
2489       /* Number of input files processed so far.  */
2490       size_t in;
2491
2492       /* Number of output files generated so far.  */
2493       size_t out;
2494
2495       /* nfiles % NMERGE; this counts input files that are left over
2496          after all full-sized merges have been done.  */
2497       size_t remainder;
2498
2499       /* Number of easily-available slots at the next loop iteration.  */
2500       size_t cheap_slots;
2501
2502       /* Do as many NMERGE-size merges as possible.  */
2503       for (out = in = 0; out < nfiles / nmerge; out++, in += nmerge)
2504         {
2505           FILE *tfp;
2506           pid_t pid;
2507           char *temp = create_temp (&tfp, &pid);
2508           size_t nt = MIN (ntemps, nmerge);
2509           ntemps -= nt;
2510           mergefps (&files[in], nt, nmerge, tfp, temp);
2511           files[out].name = temp;
2512           files[out].pid = pid;
2513         }
2514
2515       remainder = nfiles - in;
2516       cheap_slots = nmerge - out % nmerge;
2517
2518       if (cheap_slots < remainder)
2519         {
2520           /* So many files remain that they can't all be put into the last
2521              NMERGE-sized output window.  Do one more merge.  Merge as few
2522              files as possible, to avoid needless I/O.  */
2523           size_t nshortmerge = remainder - cheap_slots + 1;
2524           FILE *tfp;
2525           pid_t pid;
2526           char *temp = create_temp (&tfp, &pid);
2527           size_t nt = MIN (ntemps, nshortmerge);
2528           ntemps -= nt;
2529           mergefps (&files[in], nt, nshortmerge, tfp, temp);
2530           files[out].name = temp;
2531           files[out++].pid = pid;
2532           in += nshortmerge;
2533         }
2534
2535       /* Put the remaining input files into the last NMERGE-sized output
2536          window, so they will be merged in the next pass.  */
2537       memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2538       ntemps += out;
2539       nfiles -= in - out;
2540     }
2541
2542   nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2543   mergefps (files, ntemps, nfiles, NULL, output_file);
2544 }
2545
2546 /* Sort NFILES FILES onto OUTPUT_FILE. */
2547
2548 static void
2549 sort (char * const *files, size_t nfiles, char const *output_file)
2550 {
2551   struct buffer buf;
2552   size_t ntemps = 0;
2553   bool output_file_created = false;
2554
2555   buf.alloc = 0;
2556
2557   while (nfiles)
2558     {
2559       char const *temp_output;
2560       char const *file = *files;
2561       FILE *fp = xfopen (file, "r");
2562       FILE *tfp;
2563       size_t bytes_per_line = (2 * sizeof (struct line)
2564                                - sizeof (struct line) / 2);
2565
2566       if (! buf.alloc)
2567         initbuf (&buf, bytes_per_line,
2568                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2569       buf.eof = false;
2570       files++;
2571       nfiles--;
2572
2573       while (fillbuf (&buf, fp, file))
2574         {
2575           struct line *line;
2576           struct line *linebase;
2577
2578           if (buf.eof && nfiles
2579               && (bytes_per_line + 1
2580                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2581             {
2582               /* End of file, but there is more input and buffer room.
2583                  Concatenate the next input file; this is faster in
2584                  the usual case.  */
2585               buf.left = buf.used;
2586               break;
2587             }
2588
2589           line = buffer_linelim (&buf);
2590           linebase = line - buf.nlines;
2591           if (1 < buf.nlines)
2592             sortlines (line, buf.nlines, linebase);
2593           if (buf.eof && !nfiles && !ntemps && !buf.left)
2594             {
2595               xfclose (fp, file);
2596               tfp = xfopen (output_file, "w");
2597               temp_output = output_file;
2598               output_file_created = true;
2599             }
2600           else
2601             {
2602               ++ntemps;
2603               temp_output = create_temp (&tfp, NULL);
2604             }
2605
2606           do
2607             {
2608               line--;
2609               write_bytes (line->text, line->length, tfp, temp_output);
2610               if (unique)
2611                 while (linebase < line && compare (line, line - 1) == 0)
2612                   line--;
2613             }
2614           while (linebase < line);
2615
2616           xfclose (tfp, temp_output);
2617
2618           /* Free up some resources every once in a while.  */
2619           if (MAX_PROCS_BEFORE_REAP < nprocs)
2620             reap_some ();
2621
2622           if (output_file_created)
2623             goto finish;
2624         }
2625       xfclose (fp, file);
2626     }
2627
2628  finish:
2629   free (buf.buf);
2630
2631   if (! output_file_created)
2632     {
2633       size_t i;
2634       struct tempnode *node = temphead;
2635       struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2636       for (i = 0; node; i++)
2637         {
2638           tempfiles[i].name = node->name;
2639           tempfiles[i].pid = node->pid;
2640           node = node->next;
2641         }
2642       merge (tempfiles, ntemps, ntemps, output_file);
2643       free (tempfiles);
2644     }
2645 }
2646
2647 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list.  */
2648
2649 static void
2650 insertkey (struct keyfield *key_arg)
2651 {
2652   struct keyfield **p;
2653   struct keyfield *key = xmemdup (key_arg, sizeof *key);
2654
2655   for (p = &keylist; *p; p = &(*p)->next)
2656     continue;
2657   *p = key;
2658   key->next = NULL;
2659 }
2660
2661 /* Report a bad field specification SPEC, with extra info MSGID.  */
2662
2663 static void badfieldspec (char const *, char const *)
2664      ATTRIBUTE_NORETURN;
2665 static void
2666 badfieldspec (char const *spec, char const *msgid)
2667 {
2668   error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
2669          _(msgid), quote (spec));
2670   abort ();
2671 }
2672
2673 /* Report incompatible options.  */
2674
2675 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
2676 static void
2677 incompatible_options (char const *opts)
2678 {
2679   error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
2680   abort ();
2681 }
2682
2683 /* Check compatibility of ordering options.  */
2684
2685 static void
2686 check_ordering_compatibility (void)
2687 {
2688   struct keyfield const *key;
2689
2690   for (key = keylist; key; key = key->next)
2691     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
2692               + !!key->ignore))
2693         || (key->random && key->translate))
2694       {
2695         char opts[7];
2696         char *p = opts;
2697         if (key->ignore == nondictionary)
2698           *p++ = 'd';
2699         if (key->translate)
2700           *p++ = 'f';
2701         if (key->general_numeric)
2702           *p++ = 'g';
2703         if (key->ignore == nonprinting)
2704           *p++ = 'i';
2705         if (key->month)
2706           *p++ = 'M';
2707         if (key->numeric)
2708           *p++ = 'n';
2709         if (key->random)
2710           *p++ = 'R';
2711         *p = '\0';
2712         incompatible_options (opts);
2713       }
2714 }
2715
2716 /* Parse the leading integer in STRING and store the resulting value
2717    (which must fit into size_t) into *VAL.  Return the address of the
2718    suffix after the integer.  If the value is too large, silently
2719    substitute SIZE_MAX.  If MSGID is NULL, return NULL after
2720    failure; otherwise, report MSGID and exit on failure.  */
2721
2722 static char const *
2723 parse_field_count (char const *string, size_t *val, char const *msgid)
2724 {
2725   char *suffix;
2726   uintmax_t n;
2727
2728   switch (xstrtoumax (string, &suffix, 10, &n, ""))
2729     {
2730     case LONGINT_OK:
2731     case LONGINT_INVALID_SUFFIX_CHAR:
2732       *val = n;
2733       if (*val == n)
2734         break;
2735       /* Fall through.  */
2736     case LONGINT_OVERFLOW:
2737     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2738       *val = SIZE_MAX;
2739       break;
2740
2741     case LONGINT_INVALID:
2742       if (msgid)
2743         error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
2744                _(msgid), quote (string));
2745       return NULL;
2746     }
2747
2748   return suffix;
2749 }
2750
2751 /* Handle interrupts and hangups. */
2752
2753 static void
2754 sighandler (int sig)
2755 {
2756   if (! SA_NOCLDSTOP)
2757     signal (sig, SIG_IGN);
2758
2759   cleanup ();
2760
2761   signal (sig, SIG_DFL);
2762   raise (sig);
2763 }
2764
2765 /* Set the ordering options for KEY specified in S.
2766    Return the address of the first character in S that
2767    is not a valid ordering option.
2768    BLANKTYPE is the kind of blanks that 'b' should skip. */
2769
2770 static char *
2771 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2772 {
2773   while (*s)
2774     {
2775       switch (*s)
2776         {
2777         case 'b':
2778           if (blanktype == bl_start || blanktype == bl_both)
2779             key->skipsblanks = true;
2780           if (blanktype == bl_end || blanktype == bl_both)
2781             key->skipeblanks = true;
2782           break;
2783         case 'd':
2784           key->ignore = nondictionary;
2785           break;
2786         case 'f':
2787           key->translate = fold_toupper;
2788           break;
2789         case 'g':
2790           key->general_numeric = true;
2791           break;
2792         case 'i':
2793           /* Option order should not matter, so don't let -i override
2794              -d.  -d implies -i, but -i does not imply -d.  */
2795           if (! key->ignore)
2796             key->ignore = nonprinting;
2797           break;
2798         case 'M':
2799           key->month = true;
2800           break;
2801         case 'n':
2802           key->numeric = true;
2803           break;
2804         case 'R':
2805           key->random = true;
2806           break;
2807         case 'r':
2808           key->reverse = true;
2809           break;
2810         default:
2811           return (char *) s;
2812         }
2813       ++s;
2814     }
2815   return (char *) s;
2816 }
2817
2818 static struct keyfield *
2819 key_init (struct keyfield *key)
2820 {
2821   memset (key, 0, sizeof *key);
2822   key->eword = SIZE_MAX;
2823   return key;
2824 }
2825
2826 int
2827 main (int argc, char **argv)
2828 {
2829   struct keyfield *key;
2830   struct keyfield key_buf;
2831   struct keyfield gkey;
2832   char const *s;
2833   int c = 0;
2834   char checkonly = 0;
2835   bool mergeonly = false;
2836   char *random_source = NULL;
2837   bool need_random = false;
2838   size_t nfiles = 0;
2839   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2840   bool obsolete_usage = (posix2_version () < 200112);
2841   char **files;
2842   char *files_from = NULL;
2843   struct Tokens tok;
2844   char const *outfile = NULL;
2845
2846   initialize_main (&argc, &argv);
2847   set_program_name (argv[0]);
2848   setlocale (LC_ALL, "");
2849   bindtextdomain (PACKAGE, LOCALEDIR);
2850   textdomain (PACKAGE);
2851
2852   initialize_exit_failure (SORT_FAILURE);
2853
2854   hard_LC_COLLATE = hard_locale (LC_COLLATE);
2855 #if HAVE_NL_LANGINFO
2856   hard_LC_TIME = hard_locale (LC_TIME);
2857 #endif
2858
2859   /* Get locale's representation of the decimal point.  */
2860   {
2861     struct lconv const *locale = localeconv ();
2862
2863     /* If the locale doesn't define a decimal point, or if the decimal
2864        point is multibyte, use the C locale's decimal point.  FIXME:
2865        add support for multibyte decimal points.  */
2866     decimal_point = to_uchar (locale->decimal_point[0]);
2867     if (! decimal_point || locale->decimal_point[1])
2868       decimal_point = '.';
2869
2870     /* FIXME: add support for multibyte thousands separators.  */
2871     thousands_sep = to_uchar (*locale->thousands_sep);
2872     if (! thousands_sep || locale->thousands_sep[1])
2873       thousands_sep = -1;
2874   }
2875
2876   have_read_stdin = false;
2877   inittables ();
2878
2879   {
2880     size_t i;
2881     static int const sig[] =
2882       {
2883         /* The usual suspects.  */
2884         SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
2885 #ifdef SIGPOLL
2886         SIGPOLL,
2887 #endif
2888 #ifdef SIGPROF
2889         SIGPROF,
2890 #endif
2891 #ifdef SIGVTALRM
2892         SIGVTALRM,
2893 #endif
2894 #ifdef SIGXCPU
2895         SIGXCPU,
2896 #endif
2897 #ifdef SIGXFSZ
2898         SIGXFSZ,
2899 #endif
2900       };
2901     enum { nsigs = sizeof sig / sizeof sig[0] };
2902
2903 #if SA_NOCLDSTOP
2904     struct sigaction act;
2905
2906     sigemptyset (&caught_signals);
2907     for (i = 0; i < nsigs; i++)
2908       {
2909         sigaction (sig[i], NULL, &act);
2910         if (act.sa_handler != SIG_IGN)
2911           sigaddset (&caught_signals, sig[i]);
2912       }
2913
2914     act.sa_handler = sighandler;
2915     act.sa_mask = caught_signals;
2916     act.sa_flags = 0;
2917
2918     for (i = 0; i < nsigs; i++)
2919       if (sigismember (&caught_signals, sig[i]))
2920         sigaction (sig[i], &act, NULL);
2921 #else
2922     for (i = 0; i < nsigs; i++)
2923       if (signal (sig[i], SIG_IGN) != SIG_IGN)
2924         {
2925           signal (sig[i], sighandler);
2926           siginterrupt (sig[i], 1);
2927         }
2928 #endif
2929   }
2930
2931   /* The signal mask is known, so it is safe to invoke exit_cleanup.  */
2932   atexit (exit_cleanup);
2933
2934   gkey.sword = gkey.eword = SIZE_MAX;
2935   gkey.ignore = NULL;
2936   gkey.translate = NULL;
2937   gkey.numeric = gkey.general_numeric = gkey.random = false;
2938   gkey.month = gkey.reverse = false;
2939   gkey.skipsblanks = gkey.skipeblanks = false;
2940
2941   files = xnmalloc (argc, sizeof *files);
2942
2943   for (;;)
2944     {
2945       /* Parse an operand as a file after "--" was seen; or if
2946          pedantic and a file was seen, unless the POSIX version
2947          predates 1003.1-2001 and -c was not seen and the operand is
2948          "-o FILE" or "-oFILE".  */
2949       int oi = -1;
2950
2951       if (c == -1
2952           || (posixly_correct && nfiles != 0
2953               && ! (obsolete_usage
2954                     && ! checkonly
2955                     && optind != argc
2956                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
2957                     && (argv[optind][2] || optind + 1 != argc)))
2958           || ((c = getopt_long (argc, argv, short_options,
2959                                 long_options, &oi))
2960               == -1))
2961         {
2962           if (argc <= optind)
2963             break;
2964           files[nfiles++] = argv[optind++];
2965         }
2966       else switch (c)
2967         {
2968         case 1:
2969           key = NULL;
2970           if (optarg[0] == '+')
2971             {
2972               bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
2973                                       && ISDIGIT (argv[optind][1]));
2974               obsolete_usage |= minus_pos_usage & ~posixly_correct;
2975               if (obsolete_usage)
2976                 {
2977                   /* Treat +POS1 [-POS2] as a key if possible; but silently
2978                      treat an operand as a file if it is not a valid +POS1.  */
2979                   key = key_init (&key_buf);
2980                   s = parse_field_count (optarg + 1, &key->sword, NULL);
2981                   if (s && *s == '.')
2982                     s = parse_field_count (s + 1, &key->schar, NULL);
2983                   if (! (key->sword | key->schar))
2984                     key->sword = SIZE_MAX;
2985                   if (! s || *set_ordering (s, key, bl_start))
2986                     key = NULL;
2987                   else
2988                     {
2989                       if (minus_pos_usage)
2990                         {
2991                           char const *optarg1 = argv[optind++];
2992                           s = parse_field_count (optarg1 + 1, &key->eword,
2993                                              N_("invalid number after `-'"));
2994                           if (*s == '.')
2995                             s = parse_field_count (s + 1, &key->echar,
2996                                                N_("invalid number after `.'"));
2997                           if (*set_ordering (s, key, bl_end))
2998                             badfieldspec (optarg1,
2999                                       N_("stray character in field spec"));
3000                         }
3001                       insertkey (key);
3002                     }
3003                 }
3004             }
3005           if (! key)
3006             files[nfiles++] = optarg;
3007           break;
3008
3009         case SORT_OPTION:
3010           c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
3011           /* Fall through. */
3012         case 'b':
3013         case 'd':
3014         case 'f':
3015         case 'g':
3016         case 'i':
3017         case 'M':
3018         case 'n':
3019         case 'r':
3020         case 'R':
3021           {
3022             char str[2];
3023             str[0] = c;
3024             str[1] = '\0';
3025             set_ordering (str, &gkey, bl_both);
3026           }
3027           break;
3028
3029         case CHECK_OPTION:
3030           c = (optarg
3031                ? XARGMATCH ("--check", optarg, check_args, check_types)
3032                : 'c');
3033           /* Fall through.  */
3034         case 'c':
3035         case 'C':
3036           if (checkonly && checkonly != c)
3037             incompatible_options ("cC");
3038           checkonly = c;
3039           break;
3040
3041         case COMPRESS_PROGRAM_OPTION:
3042           if (compress_program && !STREQ (compress_program, optarg))
3043             error (SORT_FAILURE, 0, _("multiple compress programs specified"));
3044           compress_program = optarg;
3045           break;
3046
3047         case FILES0_FROM_OPTION:
3048           files_from = optarg;
3049           break;
3050
3051         case 'k':
3052           key = key_init (&key_buf);
3053
3054           /* Get POS1. */
3055           s = parse_field_count (optarg, &key->sword,
3056                                  N_("invalid number at field start"));
3057           if (! key->sword--)
3058             {
3059               /* Provoke with `sort -k0' */
3060               badfieldspec (optarg, N_("field number is zero"));
3061             }
3062           if (*s == '.')
3063             {
3064               s = parse_field_count (s + 1, &key->schar,
3065                                      N_("invalid number after `.'"));
3066               if (! key->schar--)
3067                 {
3068                   /* Provoke with `sort -k1.0' */
3069                   badfieldspec (optarg, N_("character offset is zero"));
3070                 }
3071             }
3072           if (! (key->sword | key->schar))
3073             key->sword = SIZE_MAX;
3074           s = set_ordering (s, key, bl_start);
3075           if (*s != ',')
3076             {
3077               key->eword = SIZE_MAX;
3078               key->echar = 0;
3079             }
3080           else
3081             {
3082               /* Get POS2. */
3083               s = parse_field_count (s + 1, &key->eword,
3084                                      N_("invalid number after `,'"));
3085               if (! key->eword--)
3086                 {
3087                   /* Provoke with `sort -k1,0' */
3088                   badfieldspec (optarg, N_("field number is zero"));
3089                 }
3090               if (*s == '.')
3091                 s = parse_field_count (s + 1, &key->echar,
3092                                        N_("invalid number after `.'"));
3093               else
3094                 {
3095                   /* `-k 2,3' is equivalent to `+1 -3'.  */
3096                   key->eword++;
3097                 }
3098               s = set_ordering (s, key, bl_end);
3099             }
3100           if (*s)
3101             badfieldspec (optarg, N_("stray character in field spec"));
3102           insertkey (key);
3103           break;
3104
3105         case 'm':
3106           mergeonly = true;
3107           break;
3108
3109         case NMERGE_OPTION:
3110           specify_nmerge (oi, c, optarg);
3111           break;
3112
3113         case 'o':
3114           if (outfile && !STREQ (outfile, optarg))
3115             error (SORT_FAILURE, 0, _("multiple output files specified"));
3116           outfile = optarg;
3117           break;
3118
3119         case RANDOM_SOURCE_OPTION:
3120           if (random_source && !STREQ (random_source, optarg))
3121             error (SORT_FAILURE, 0, _("multiple random sources specified"));
3122           random_source = optarg;
3123           break;
3124
3125         case 's':
3126           stable = true;
3127           break;
3128
3129         case 'S':
3130           specify_sort_size (oi, c, optarg);
3131           break;
3132
3133         case 't':
3134           {
3135             char newtab = optarg[0];
3136             if (! newtab)
3137               error (SORT_FAILURE, 0, _("empty tab"));
3138             if (optarg[1])
3139               {
3140                 if (STREQ (optarg, "\\0"))
3141                   newtab = '\0';
3142                 else
3143                   {
3144                     /* Provoke with `sort -txx'.  Complain about
3145                        "multi-character tab" instead of "multibyte tab", so
3146                        that the diagnostic's wording does not need to be
3147                        changed once multibyte characters are supported.  */
3148                     error (SORT_FAILURE, 0, _("multi-character tab %s"),
3149                            quote (optarg));
3150                   }
3151               }
3152             if (tab != TAB_DEFAULT && tab != newtab)
3153               error (SORT_FAILURE, 0, _("incompatible tabs"));
3154             tab = newtab;
3155           }
3156           break;
3157
3158         case 'T':
3159           add_temp_dir (optarg);
3160           break;
3161
3162         case 'u':
3163           unique = true;
3164           break;
3165
3166         case 'y':
3167           /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
3168              through Solaris 7.  It is also accepted by many non-Solaris
3169              "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
3170              -y is marked as obsolete starting with Solaris 8 (1999), but is
3171              still accepted as of Solaris 10 prerelease (2004).
3172
3173              Solaris 2.5.1 "sort -y 100" reads the input file "100", but
3174              emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
3175              and which in general ignores the argument after "-y" if it
3176              consists entirely of digits (it can even be empty).  */
3177           if (optarg == argv[optind - 1])
3178             {
3179               char const *p;
3180               for (p = optarg; ISDIGIT (*p); p++)
3181                 continue;
3182               optind -= (*p != '\0');
3183             }
3184           break;
3185
3186         case 'z':
3187           eolchar = 0;
3188           break;
3189
3190         case_GETOPT_HELP_CHAR;
3191
3192         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
3193
3194         default:
3195           usage (SORT_FAILURE);
3196         }
3197     }
3198
3199   if (files_from)
3200     {
3201       FILE *stream;
3202
3203       /* When using --files0-from=F, you may not specify any files
3204          on the command-line.  */
3205       if (nfiles)
3206         {
3207           error (0, 0, _("extra operand %s"), quote (files[0]));
3208           fprintf (stderr, "%s\n",
3209                    _("file operands cannot be combined with --files0-from"));
3210           usage (SORT_FAILURE);
3211         }
3212
3213       if (STREQ (files_from, "-"))
3214         stream = stdin;
3215       else
3216         {
3217           stream = fopen (files_from, "r");
3218           if (stream == NULL)
3219             error (SORT_FAILURE, errno, _("cannot open %s for reading"),
3220                    quote (files_from));
3221         }
3222
3223       readtokens0_init (&tok);
3224
3225       if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
3226         error (SORT_FAILURE, 0, _("cannot read file names from %s"),
3227                quote (files_from));
3228
3229       if (tok.n_tok)
3230         {
3231           size_t i;
3232           free (files);
3233           files = tok.tok;
3234           nfiles = tok.n_tok;
3235           for (i = 0; i < nfiles; i++)
3236           {
3237               if (STREQ (files[i], "-"))
3238                 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
3239                                           "no file name of %s allowed"),
3240                        quote (files[i]));
3241               else if (files[i][0] == '\0')
3242                 {
3243                   /* Using the standard `filename:line-number:' prefix here is
3244                      not totally appropriate, since NUL is the separator, not NL,
3245                      but it might be better than nothing.  */
3246                   unsigned long int file_number = i + 1;
3247                   error (SORT_FAILURE, 0,
3248                          _("%s:%lu: invalid zero-length file name"),
3249                          quotearg_colon (files_from), file_number);
3250                 }
3251           }
3252         }
3253       else
3254         error (SORT_FAILURE, 0, _("no input from %s"),
3255                quote (files_from));
3256     }
3257
3258   /* Inheritance of global options to individual keys. */
3259   for (key = keylist; key; key = key->next)
3260     {
3261       if (! (key->ignore || key->translate
3262              || (key->skipsblanks | key->reverse
3263                  | key->skipeblanks | key->month | key->numeric
3264                  | key->general_numeric
3265                  | key->random)))
3266         {
3267           key->ignore = gkey.ignore;
3268           key->translate = gkey.translate;
3269           key->skipsblanks = gkey.skipsblanks;
3270           key->skipeblanks = gkey.skipeblanks;
3271           key->month = gkey.month;
3272           key->numeric = gkey.numeric;
3273           key->general_numeric = gkey.general_numeric;
3274           key->random = gkey.random;
3275           key->reverse = gkey.reverse;
3276         }
3277
3278       need_random |= key->random;
3279     }
3280
3281   if (!keylist && (gkey.ignore || gkey.translate
3282                    || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
3283                        | gkey.numeric | gkey.general_numeric
3284                        | gkey.random)))
3285     {
3286       insertkey (&gkey);
3287       need_random |= gkey.random;
3288     }
3289
3290   check_ordering_compatibility ();
3291
3292   reverse = gkey.reverse;
3293
3294   if (need_random)
3295     {
3296       randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
3297       if (! randread_source)
3298         die (_("open failed"), random_source);
3299     }
3300
3301   if (temp_dir_count == 0)
3302     {
3303       char const *tmp_dir = getenv ("TMPDIR");
3304       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
3305     }
3306
3307   if (nfiles == 0)
3308     {
3309       static char *minus = "-";
3310       nfiles = 1;
3311       free (files);
3312       files = &minus;
3313     }
3314
3315   /* Need to re-check that we meet the minimum requirement for memory
3316      usage with the final value for NMERGE. */
3317   if (0 < sort_size)
3318     sort_size = MAX (sort_size, MIN_SORT_SIZE);
3319
3320   if (checkonly)
3321     {
3322       if (nfiles > 1)
3323         error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
3324                quote (files[1]), checkonly);
3325
3326       if (outfile)
3327         {
3328           static char opts[] = {0, 'o', 0};
3329           opts[0] = checkonly;
3330           incompatible_options (opts);
3331         }
3332
3333       /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
3334          input is not properly sorted.  */
3335       exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
3336     }
3337
3338   if (mergeonly)
3339     {
3340       struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
3341       size_t i;
3342
3343       for (i = 0; i < nfiles; ++i)
3344         sortfiles[i].name = files[i];
3345
3346       merge (sortfiles, 0, nfiles, outfile);
3347       IF_LINT (free (sortfiles));
3348     }
3349   else
3350     sort (files, nfiles, outfile);
3351
3352   if (have_read_stdin && fclose (stdin) == EOF)
3353     die (_("close failed"), "-");
3354
3355   exit (EXIT_SUCCESS);
3356 }