src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 1988, 1991-2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17    Written December 1988 by Mike Haertel.
  18    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  19    or (US mail) as Mike Haertel c/o Free Software Foundation.
  20
  21    Ørn E. Hansen added NLS support in 1997.  */
  22
  23 #include <config.h>
  24
  25 #include <getopt.h>
  26 #include <sys/types.h>
  27 #include <sys/wait.h>
  28 #include <signal.h>
  29 #include "system.h"
  30 #include "argmatch.h"
  31 #include "error.h"
  32 #include "filevercmp.h"
  33 #include "hard-locale.h"
  34 #include "hash.h"
  35 #include "ignore-value.h"
  36 #include "md5.h"
  37 #include "mbswidth.h"
  38 #include "physmem.h"
  39 #include "posixver.h"
  40 #include "quote.h"
  41 #include "quotearg.h"
  42 #include "randread.h"
  43 #include "readtokens0.h"
  44 #include "stdio--.h"
  45 #include "stdlib--.h"
  46 #include "strnumcmp.h"
  47 #include "xmemcoll.h"
  48 #include "xmemxfrm.h"
  49 #include "xnanosleep.h"
  50 #include "xstrtol.h"
  51
  52 #if HAVE_SYS_RESOURCE_H
  53 # include <sys/resource.h>
  54 #endif
  55 #ifndef RLIMIT_DATA
  56 struct rlimit { size_t rlim_cur; };
  57 # define getrlimit(Resource, Rlp) (-1)
  58 #endif
  59
  60 /* The official name of this program (e.g., no `g' prefix).  */
  61 #define PROGRAM_NAME "sort"
  62
  63 #define AUTHORS \
  64   proper_name ("Mike Haertel"), \
  65   proper_name ("Paul Eggert")
  66
  67 #if HAVE_LANGINFO_CODESET
  68 # include <langinfo.h>
  69 #endif
  70
  71 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
  72    present.  */
  73 #ifndef SA_NOCLDSTOP
  74 # define SA_NOCLDSTOP 0
  75 /* No sigprocmask.  Always 'return' zero. */
  76 # define sigprocmask(How, Set, Oset) (0)
  77 # define sigset_t int
  78 # if ! HAVE_SIGINTERRUPT
  79 #  define siginterrupt(sig, flag) /* empty */
  80 # endif
  81 #endif
  82
  83 #if !defined OPEN_MAX && defined NR_OPEN
  84 # define OPEN_MAX NR_OPEN
  85 #endif
  86 #if !defined OPEN_MAX
  87 # define OPEN_MAX 20
  88 #endif
  89
  90 #define UCHAR_LIM (UCHAR_MAX + 1)
  91
  92 #ifndef DEFAULT_TMPDIR
  93 # define DEFAULT_TMPDIR "/tmp"
  94 #endif
  95
  96 /* Exit statuses.  */
  97 enum
  98   {
  99     /* POSIX says to exit with status 1 if invoked with -c and the
 100        input is not properly sorted.  */
 101     SORT_OUT_OF_ORDER = 1,
 102
 103     /* POSIX says any other irregular exit must exit with a status
 104        code greater than 1.  */
 105     SORT_FAILURE = 2
 106   };
 107
 108 enum
 109   {
 110     /* The number of times we should try to fork a compression process
 111        (we retry if the fork call fails).  We don't _need_ to compress
 112        temp files, this is just to reduce disk access, so this number
 113        can be small.  Each retry doubles in duration.  */
 114     MAX_FORK_TRIES_COMPRESS = 4,
 115
 116     /* The number of times we should try to fork a decompression process.
 117        If we can't fork a decompression process, we can't sort, so this
 118        number should be big.  Each retry doubles in duration.  */
 119     MAX_FORK_TRIES_DECOMPRESS = 9
 120   };
 121
 122 /* The representation of the decimal point in the current locale.  */
 123 static int decimal_point;
 124
 125 /* Thousands separator; if -1, then there isn't one.  */
 126 static int thousands_sep;
 127
 128 /* Nonzero if the corresponding locales are hard.  */
 129 static bool hard_LC_COLLATE;
 130 #if HAVE_NL_LANGINFO
 131 static bool hard_LC_TIME;
 132 #endif
 133
 134 #define NONZERO(x) ((x) != 0)
 135
 136 /* The kind of blanks for '-b' to skip in various options. */
 137 enum blanktype { bl_start, bl_end, bl_both };
 138
 139 /* The character marking end of line. Default to \n. */
 140 static char eolchar = '\n';
 141
 142 /* Lines are held in core as counted strings. */
 143 struct line
 144 {
 145   char *text;                   /* Text of the line. */
 146   size_t length;                /* Length including final newline. */
 147   char *keybeg;                 /* Start of first key. */
 148   char *keylim;                 /* Limit of first key. */
 149 };
 150
 151 /* Input buffers. */
 152 struct buffer
 153 {
 154   char *buf;                    /* Dynamically allocated buffer,
 155                                    partitioned into 3 regions:
 156                                    - input data;
 157                                    - unused area;
 158                                    - an array of lines, in reverse order.  */
 159   size_t used;                  /* Number of bytes used for input data.  */
 160   size_t nlines;                /* Number of lines in the line array.  */
 161   size_t alloc;                 /* Number of bytes allocated. */
 162   size_t left;                  /* Number of bytes left from previous reads. */
 163   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 164   bool eof;                     /* An EOF has been read.  */
 165 };
 166
 167 struct keyfield
 168 {
 169   size_t sword;                 /* Zero-origin 'word' to start at. */
 170   size_t schar;                 /* Additional characters to skip. */
 171   size_t eword;                 /* Zero-origin last 'word' of key. */
 172   size_t echar;                 /* Additional characters in field. */
 173   bool const *ignore;           /* Boolean array of characters to ignore. */
 174   char const *translate;        /* Translation applied to characters. */
 175   bool skipsblanks;             /* Skip leading blanks when finding start.  */
 176   bool skipeblanks;             /* Skip leading blanks when finding end.  */
 177   bool numeric;                 /* Flag for numeric comparison.  Handle
 178                                    strings of digits with optional decimal
 179                                    point, but no exponential notation. */
 180   bool random;                  /* Sort by random hash of key.  */
 181   bool general_numeric;         /* Flag for general, numeric comparison.
 182                                    Handle numbers in exponential notation. */
 183   bool human_numeric;           /* Flag for sorting by human readable
 184                                    units with either SI xor IEC prefixes. */
 185   int iec_present;              /* Flag for checking for mixed SI and IEC. */
 186   bool month;                   /* Flag for comparison by month name. */
 187   bool reverse;                 /* Reverse the sense of comparison. */
 188   bool version;                 /* sort by version number */
 189   bool obsolete_used;           /* obsolescent key option format is used. */
 190   struct keyfield *next;        /* Next keyfield to try. */
 191 };
 192
 193 struct month
 194 {
 195   char const *name;
 196   int val;
 197 };
 198
 199 /* FIXME: None of these tables work with multibyte character sets.
 200    Also, there are many other bugs when handling multibyte characters.
 201    One way to fix this is to rewrite `sort' to use wide characters
 202    internally, but doing this with good performance is a bit
 203    tricky.  */
 204
 205 /* Table of blanks.  */
 206 static bool blanks[UCHAR_LIM];
 207
 208 /* Table of non-printing characters. */
 209 static bool nonprinting[UCHAR_LIM];
 210
 211 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 212 static bool nondictionary[UCHAR_LIM];
 213
 214 /* Translation table folding lower case to upper.  */
 215 static unsigned char fold_toupper[UCHAR_LIM];
 216
 217 #define MONTHS_PER_YEAR 12
 218
 219 /* Table mapping month names to integers.
 220    Alphabetic order allows binary search. */
 221 static struct month monthtab[] =
 222 {
 223   {"APR", 4},
 224   {"AUG", 8},
 225   {"DEC", 12},
 226   {"FEB", 2},
 227   {"JAN", 1},
 228   {"JUL", 7},
 229   {"JUN", 6},
 230   {"MAR", 3},
 231   {"MAY", 5},
 232   {"NOV", 11},
 233   {"OCT", 10},
 234   {"SEP", 9}
 235 };
 236
 237 /* During the merge phase, the number of files to merge at once. */
 238 #define NMERGE_DEFAULT 16
 239
 240 /* Minimum size for a merge or check buffer.  */
 241 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 242
 243 /* Minimum sort size; the code might not work with smaller sizes.  */
 244 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
 245
 246 /* The number of bytes needed for a merge or check buffer, which can
 247    function relatively efficiently even if it holds only one line.  If
 248    a longer line is seen, this value is increased.  */
 249 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 250
 251 /* The approximate maximum number of bytes of main memory to use, as
 252    specified by the user.  Zero if the user has not specified a size.  */
 253 static size_t sort_size;
 254
 255 /* The guessed size for non-regular files.  */
 256 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 257
 258 /* Array of directory names in which any temporary files are to be created. */
 259 static char const **temp_dirs;
 260
 261 /* Number of temporary directory names used.  */
 262 static size_t temp_dir_count;
 263
 264 /* Number of allocated slots in temp_dirs.  */
 265 static size_t temp_dir_alloc;
 266
 267 /* Flag to reverse the order of all comparisons. */
 268 static bool reverse;
 269
 270 /* Flag for stable sort.  This turns off the last ditch bytewise
 271    comparison of lines, and instead leaves lines in the same order
 272    they were read if all keys compare equal.  */
 273 static bool stable;
 274
 275 /* If TAB has this value, blanks separate fields.  */
 276 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 277
 278 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 279    separated by the empty string between a non-blank character and a blank
 280    character. */
 281 static int tab = TAB_DEFAULT;
 282
 283 /* Flag to remove consecutive duplicate lines from the output.
 284    Only the last of a sequence of equal lines will be output. */
 285 static bool unique;
 286
 287 /* Nonzero if any of the input files are the standard input. */
 288 static bool have_read_stdin;
 289
 290 /* List of key field comparisons to be tried.  */
 291 static struct keyfield *keylist;
 292
 293 /* Program used to (de)compress temp files.  Must accept -d.  */
 294 static char const *compress_program;
 295
 296 /* Annotate the output with extra info to aid the user.  */
 297 static bool debug;
 298
 299 /* Maximum number of files to merge in one go.  If more than this
 300    number are present, temp files will be used. */
 301 static unsigned int nmerge = NMERGE_DEFAULT;
 302
 303 static void sortlines_temp (struct line *, size_t, struct line *);
 304
 305 /* Report MESSAGE for FILE, then clean up and exit.
 306    If FILE is null, it represents standard output.  */
 307
 308 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 309 static void
 310 die (char const *message, char const *file)
 311 {
 312   error (0, errno, "%s: %s", message, file ? file : _("standard output"));
 313   exit (SORT_FAILURE);
 314 }
 315
 316 void
 317 usage (int status)
 318 {
 319   if (status != EXIT_SUCCESS)
 320     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 321              program_name);
 322   else
 323     {
 324       printf (_("\
 325 Usage: %s [OPTION]... [FILE]...\n\
 326   or:  %s [OPTION]... --files0-from=F\n\
 327 "),
 328               program_name, program_name);
 329       fputs (_("\
 330 Write sorted concatenation of all FILE(s) to standard output.\n\
 331 \n\
 332 "), stdout);
 333       fputs (_("\
 334 Mandatory arguments to long options are mandatory for short options too.\n\
 335 "), stdout);
 336       fputs (_("\
 337 Ordering options:\n\
 338 \n\
 339 "), stdout);
 340       fputs (_("\
 341   -b, --ignore-leading-blanks  ignore leading blanks\n\
 342   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 343   -f, --ignore-case           fold lower case to upper case characters\n\
 344 "), stdout);
 345       fputs (_("\
 346   -g, --general-numeric-sort  compare according to general numerical value\n\
 347   -i, --ignore-nonprinting    consider only printable characters\n\
 348   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 349 "), stdout);
 350       fputs (_("\
 351   -h, --human-numeric-sort    compare human readable numbers (e.g., 2K 1G)\n\
 352 "), stdout);
 353       fputs (_("\
 354   -n, --numeric-sort          compare according to string numerical value\n\
 355   -R, --random-sort           sort by random hash of keys\n\
 356       --random-source=FILE    get random bytes from FILE\n\
 357   -r, --reverse               reverse the result of comparisons\n\
 358 "), stdout);
 359       fputs (_("\
 360       --sort=WORD             sort according to WORD:\n\
 361                                 general-numeric -g, human-numeric -h, month -M,\n\
 362                                 numeric -n, random -R, version -V\n\
 363   -V, --version-sort          natural sort of (version) numbers within text\n\
 364 \n\
 365 "), stdout);
 366       fputs (_("\
 367 Other options:\n\
 368 \n\
 369 "), stdout);
 370       fputs (_("\
 371       --batch-size=NMERGE   merge at most NMERGE inputs at once;\n\
 372                             for more use temp files\n\
 373 "), stdout);
 374       fputs (_("\
 375   -c, --check, --check=diagnose-first  check for sorted input; do not sort\n\
 376   -C, --check=quiet, --check=silent  like -c, but do not report first bad line\n\
 377       --compress-program=PROG  compress temporaries with PROG;\n\
 378                               decompress them with PROG -d\n\
 379 "), stdout);
 380       fputs (_("\
 381       --debug               annotate the part of the line used to sort,\n\
 382                               and warn about questionable usage to stderr\n\
 383       --files0-from=F       read input from the files specified by\n\
 384                             NUL-terminated names in file F;\n\
 385                             If F is - then read names from standard input\n\
 386 "), stdout);
 387       fputs (_("\
 388   -k, --key=POS1[,POS2]     start a key at POS1 (origin 1), end it at POS2\n\
 389                             (default end of line)\n\
 390   -m, --merge               merge already sorted files; do not sort\n\
 391 "), stdout);
 392       fputs (_("\
 393   -o, --output=FILE         write result to FILE instead of standard output\n\
 394   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 395   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 396 "), stdout);
 397       printf (_("\
 398   -t, --field-separator=SEP  use SEP instead of non-blank to blank transition\n\
 399   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 400                               multiple options specify multiple directories\n\
 401   -u, --unique              with -c, check for strict ordering;\n\
 402                               without -c, output only the first of an equal run\n\
 403 "), DEFAULT_TMPDIR);
 404       fputs (_("\
 405   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 406 "), stdout);
 407       fputs (HELP_OPTION_DESCRIPTION, stdout);
 408       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 409       fputs (_("\
 410 \n\
 411 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 412 in the field; both are origin 1.  If neither -t nor -b is in effect, characters\n\
 413 in a field are counted from the beginning of the preceding whitespace.  OPTS is\n\
 414 one or more single-letter ordering options, which override global ordering\n\
 415 options for that key.  If no key is given, use the entire line as the key.\n\
 416 \n\
 417 SIZE may be followed by the following multiplicative suffixes:\n\
 418 "), stdout);
 419       fputs (_("\
 420 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 421 \n\
 422 With no FILE, or when FILE is -, read standard input.\n\
 423 \n\
 424 *** WARNING ***\n\
 425 The locale specified by the environment affects sort order.\n\
 426 Set LC_ALL=C to get the traditional sort order that uses\n\
 427 native byte values.\n\
 428 "), stdout );
 429       emit_ancillary_info ();
 430     }
 431
 432   exit (status);
 433 }
 434
 435 /* For long options that have no equivalent short option, use a
 436    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 437 enum
 438 {
 439   CHECK_OPTION = CHAR_MAX + 1,
 440   COMPRESS_PROGRAM_OPTION,
 441   DEBUG_PROGRAM_OPTION,
 442   FILES0_FROM_OPTION,
 443   NMERGE_OPTION,
 444   RANDOM_SOURCE_OPTION,
 445   SORT_OPTION
 446 };
 447
 448 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
 449
 450 static struct option const long_options[] =
 451 {
 452   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 453   {"check", optional_argument, NULL, CHECK_OPTION},
 454   {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
 455   {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
 456   {"dictionary-order", no_argument, NULL, 'd'},
 457   {"ignore-case", no_argument, NULL, 'f'},
 458   {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
 459   {"general-numeric-sort", no_argument, NULL, 'g'},
 460   {"ignore-nonprinting", no_argument, NULL, 'i'},
 461   {"key", required_argument, NULL, 'k'},
 462   {"merge", no_argument, NULL, 'm'},
 463   {"month-sort", no_argument, NULL, 'M'},
 464   {"numeric-sort", no_argument, NULL, 'n'},
 465   {"human-numeric-sort", no_argument, NULL, 'h'},
 466   {"version-sort", no_argument, NULL, 'V'},
 467   {"random-sort", no_argument, NULL, 'R'},
 468   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
 469   {"sort", required_argument, NULL, SORT_OPTION},
 470   {"output", required_argument, NULL, 'o'},
 471   {"reverse", no_argument, NULL, 'r'},
 472   {"stable", no_argument, NULL, 's'},
 473   {"batch-size", required_argument, NULL, NMERGE_OPTION},
 474   {"buffer-size", required_argument, NULL, 'S'},
 475   {"field-separator", required_argument, NULL, 't'},
 476   {"temporary-directory", required_argument, NULL, 'T'},
 477   {"unique", no_argument, NULL, 'u'},
 478   {"zero-terminated", no_argument, NULL, 'z'},
 479   {GETOPT_HELP_OPTION_DECL},
 480   {GETOPT_VERSION_OPTION_DECL},
 481   {NULL, 0, NULL, 0},
 482 };
 483
 484 #define CHECK_TABLE \
 485   _ct_("quiet",          'C') \
 486   _ct_("silent",         'C') \
 487   _ct_("diagnose-first", 'c')
 488
 489 static char const *const check_args[] =
 490 {
 491 #define _ct_(_s, _c) _s,
 492   CHECK_TABLE NULL
 493 #undef  _ct_
 494 };
 495 static char const check_types[] =
 496 {
 497 #define _ct_(_s, _c) _c,
 498   CHECK_TABLE
 499 #undef  _ct_
 500 };
 501
 502 #define SORT_TABLE \
 503   _st_("general-numeric", 'g') \
 504   _st_("human-numeric",   'h') \
 505   _st_("month",           'M') \
 506   _st_("numeric",         'n') \
 507   _st_("random",          'R') \
 508   _st_("version",         'V')
 509
 510 static char const *const sort_args[] =
 511 {
 512 #define _st_(_s, _c) _s,
 513   SORT_TABLE NULL
 514 #undef  _st_
 515 };
 516 static char const sort_types[] =
 517 {
 518 #define _st_(_s, _c) _c,
 519   SORT_TABLE
 520 #undef  _st_
 521 };
 522
 523 /* The set of signals that are caught.  */
 524 static sigset_t caught_signals;
 525
 526 /* Critical section status.  */
 527 struct cs_status
 528 {
 529   bool valid;
 530   sigset_t sigs;
 531 };
 532
 533 /* Enter a critical section.  */
 534 static struct cs_status
 535 cs_enter (void)
 536 {
 537   struct cs_status status;
 538   status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
 539   return status;
 540 }
 541
 542 /* Leave a critical section.  */
 543 static void
 544 cs_leave (struct cs_status status)
 545 {
 546   if (status.valid)
 547     {
 548       /* Ignore failure when restoring the signal mask. */
 549       sigprocmask (SIG_SETMASK, &status.sigs, NULL);
 550     }
 551 }
 552
 553 /* The list of temporary files. */
 554 struct tempnode
 555 {
 556   struct tempnode *volatile next;
 557   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 558   char name[1];  /* Actual size is 1 + file name length.  */
 559 };
 560 static struct tempnode *volatile temphead;
 561 static struct tempnode *volatile *temptail = &temphead;
 562
 563 struct sortfile
 564 {
 565   char const *name;
 566   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 567 };
 568
 569 /* A table where we store compression process states.  We clean up all
 570    processes in a timely manner so as not to exhaust system resources,
 571    so we store the info on whether the process is still running, or has
 572    been reaped here.  */
 573 static Hash_table *proctab;
 574
 575 enum { INIT_PROCTAB_SIZE = 47 };
 576
 577 enum procstate { ALIVE, ZOMBIE };
 578
 579 /* A proctab entry.  The COUNT field is there in case we fork a new
 580    compression process that has the same PID as an old zombie process
 581    that is still in the table (because the process to decompress the
 582    temp file it was associated with hasn't started yet).  */
 583 struct procnode
 584 {
 585   pid_t pid;
 586   enum procstate state;
 587   size_t count;
 588 };
 589
 590 static size_t
 591 proctab_hasher (const void *entry, size_t tabsize)
 592 {
 593   const struct procnode *node = entry;
 594   return node->pid % tabsize;
 595 }
 596
 597 static bool
 598 proctab_comparator (const void *e1, const void *e2)
 599 {
 600   const struct procnode *n1 = e1, *n2 = e2;
 601   return n1->pid == n2->pid;
 602 }
 603
 604 /* The total number of forked processes (compressors and decompressors)
 605    that have not been reaped yet. */
 606 static size_t nprocs;
 607
 608 /* The number of child processes we'll allow before we try to reap some. */
 609 enum { MAX_PROCS_BEFORE_REAP = 2 };
 610
 611 /* If 0 < PID, wait for the child process with that PID to exit.
 612    If PID is -1, clean up a random child process which has finished and
 613    return the process ID of that child.  If PID is -1 and no processes
 614    have quit yet, return 0 without waiting.  */
 615
 616 static pid_t
 617 reap (pid_t pid)
 618 {
 619   int status;
 620   pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);
 621
 622   if (cpid < 0)
 623     error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
 624            compress_program);
 625   else if (0 < cpid)
 626     {
 627       if (! WIFEXITED (status) || WEXITSTATUS (status))
 628         error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
 629                compress_program);
 630       --nprocs;
 631     }
 632
 633   return cpid;
 634 }
 635
 636 /* Add the PID of a running compression process to proctab, or update
 637    the entry COUNT and STATE fields if it's already there.  This also
 638    creates the table for us the first time it's called.  */
 639
 640 static void
 641 register_proc (pid_t pid)
 642 {
 643   struct procnode test, *node;
 644
 645   if (! proctab)
 646     {
 647       proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
 648                                  proctab_hasher,
 649                                  proctab_comparator,
 650                                  free);
 651       if (! proctab)
 652         xalloc_die ();
 653     }
 654
 655   test.pid = pid;
 656   node = hash_lookup (proctab, &test);
 657   if (node)
 658     {
 659       node->state = ALIVE;
 660       ++node->count;
 661     }
 662   else
 663     {
 664       node = xmalloc (sizeof *node);
 665       node->pid = pid;
 666       node->state = ALIVE;
 667       node->count = 1;
 668       if (hash_insert (proctab, node) == NULL)
 669         xalloc_die ();
 670     }
 671 }
 672
 673 /* This is called when we reap a random process.  We don't know
 674    whether we have reaped a compression process or a decompression
 675    process until we look in the table.  If there's an ALIVE entry for
 676    it, then we have reaped a compression process, so change the state
 677    to ZOMBIE.  Otherwise, it's a decompression processes, so ignore it.  */
 678
 679 static void
 680 update_proc (pid_t pid)
 681 {
 682   struct procnode test, *node;
 683
 684   test.pid = pid;
 685   node = hash_lookup (proctab, &test);
 686   if (node)
 687     node->state = ZOMBIE;
 688 }
 689
 690 /* This is for when we need to wait for a compression process to exit.
 691    If it has a ZOMBIE entry in the table then it's already dead and has
 692    been reaped.  Note that if there's an ALIVE entry for it, it still may
 693    already have died and been reaped if a second process was created with
 694    the same PID.  This is probably exceedingly rare, but to be on the safe
 695    side we will have to wait for any compression process with this PID.  */
 696
 697 static void
 698 wait_proc (pid_t pid)
 699 {
 700   struct procnode test, *node;
 701
 702   test.pid = pid;
 703   node = hash_lookup (proctab, &test);
 704   if (node->state == ALIVE)
 705     reap (pid);
 706
 707   node->state = ZOMBIE;
 708   if (! --node->count)
 709     {
 710       hash_delete (proctab, node);
 711       free (node);
 712     }
 713 }
 714
 715 /* Keep reaping finished children as long as there are more to reap.
 716    This doesn't block waiting for any of them, it only reaps those
 717    that are already dead.  */
 718
 719 static void
 720 reap_some (void)
 721 {
 722   pid_t pid;
 723
 724   while (0 < nprocs && (pid = reap (-1)))
 725     update_proc (pid);
 726 }
 727
 728 /* Clean up any remaining temporary files.  */
 729
 730 static void
 731 cleanup (void)
 732 {
 733   struct tempnode const *node;
 734
 735   for (node = temphead; node; node = node->next)
 736     unlink (node->name);
 737   temphead = NULL;
 738 }
 739
 740 /* Cleanup actions to take when exiting.  */
 741
 742 static void
 743 exit_cleanup (void)
 744 {
 745   if (temphead)
 746     {
 747       /* Clean up any remaining temporary files in a critical section so
 748          that a signal handler does not try to clean them too.  */
 749       struct cs_status cs = cs_enter ();
 750       cleanup ();
 751       cs_leave (cs);
 752     }
 753
 754   close_stdout ();
 755 }
 756
 757 /* Create a new temporary file, returning its newly allocated tempnode.
 758    Store into *PFD the file descriptor open for writing.
 759    If the creation fails, return NULL and store -1 into *PFD if the
 760    failure is due to file descriptor exhaustion and
 761    SURVIVE_FD_EXHAUSTION; otherwise, die.  */
 762
 763 static struct tempnode *
 764 create_temp_file (int *pfd, bool survive_fd_exhaustion)
 765 {
 766   static char const slashbase[] = "/sortXXXXXX";
 767   static size_t temp_dir_index;
 768   int fd;
 769   int saved_errno;
 770   char const *temp_dir = temp_dirs[temp_dir_index];
 771   size_t len = strlen (temp_dir);
 772   struct tempnode *node =
 773     xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
 774   char *file = node->name;
 775   struct cs_status cs;
 776
 777   memcpy (file, temp_dir, len);
 778   memcpy (file + len, slashbase, sizeof slashbase);
 779   node->next = NULL;
 780   node->pid = 0;
 781   if (++temp_dir_index == temp_dir_count)
 782     temp_dir_index = 0;
 783
 784   /* Create the temporary file in a critical section, to avoid races.  */
 785   cs = cs_enter ();
 786   fd = mkstemp (file);
 787   if (0 <= fd)
 788     {
 789       *temptail = node;
 790       temptail = &node->next;
 791     }
 792   saved_errno = errno;
 793   cs_leave (cs);
 794   errno = saved_errno;
 795
 796   if (fd < 0)
 797     {
 798       if (! (survive_fd_exhaustion && errno == EMFILE))
 799         error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
 800                quote (temp_dir));
 801       free (node);
 802       node = NULL;
 803     }
 804
 805   *pfd = fd;
 806   return node;
 807 }
 808
 809 /* Predeclare an access pattern for input files.
 810    Ignore any errors -- this is only advisory.
 811
 812    There are a few hints we could possibly provide,
 813    and after careful testing it was decided that
 814    specifying POSIX_FADV_SEQUENTIAL was not detrimental
 815    to any cases.  On Linux 2.6.31, this option doubles
 816    the size of read ahead performed and thus was seen to
 817    benefit these cases:
 818      Merging
 819      Sorting with a smaller internal buffer
 820      Reading from faster flash devices
 821
 822    In _addition_ one could also specify other hints...
 823
 824    POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
 825    at least uses that to _synchronously_ prepopulate the cache
 826    with the specified range.  While sort does need to
 827    read all of its input before outputting, a synchronous
 828    read of the whole file up front precludes any processing
 829    that sort could do in parallel with the system doing
 830    read ahead of the data. This was seen to have negative effects
 831    in a couple of cases:
 832      Merging
 833      Sorting with a smaller internal buffer
 834    Note this option was seen to shorten the runtime for sort
 835    on a multicore system with lots of RAM and other processes
 836    competing for CPU.  It could be argued that more explicit
 837    scheduling hints with `nice` et. al. are more appropriate
 838    for this situation.
 839
 840    POSIX_FADV_NOREUSE is a possibility as it could lower
 841    the priority of input data in the cache as sort will
 842    only need to process it once.  However its functionality
 843    has changed over Linux kernel versions and as of 2.6.31
 844    it does nothing and thus we can't depend on what it might
 845    do in future.
 846
 847    POSIX_FADV_DONTNEED is not appropriate for user specified
 848    input files, but for temp files we do want to drop the
 849    cache immediately after processing.  This is done implicitly
 850    however when the files are unlinked.  */
 851
 852 static void
 853 fadvise_input (FILE *fp)
 854 {
 855 #if HAVE_POSIX_FADVISE
 856   if (fp)
 857     {
 858       int fd = fileno (fp);
 859       ignore_value (posix_fadvise (fd, 0, 0, POSIX_FADV_SEQUENTIAL));
 860     }
 861 #endif
 862 }
 863
 864 /* Return a stream for FILE, opened with mode HOW.  A null FILE means
 865    standard output; HOW should be "w".  When opening for input, "-"
 866    means standard input.  To avoid confusion, do not return file
 867    descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
 868    opening an ordinary FILE.  Return NULL if unsuccessful.  */
 869
 870 static FILE *
 871 stream_open (const char *file, const char *how)
 872 {
 873   if (!file)
 874     return stdout;
 875   if (*how == 'r')
 876     {
 877       FILE *fp;
 878       if (STREQ (file, "-"))
 879         {
 880           have_read_stdin = true;
 881           fp = stdin;
 882         }
 883       else
 884         fp = fopen (file, how);
 885       fadvise_input (fp);
 886       return fp;
 887     }
 888   return fopen (file, how);
 889 }
 890
 891 /* Same as stream_open, except always return a non-null value; die on
 892    failure.  */
 893
 894 static FILE *
 895 xfopen (const char *file, const char *how)
 896  {
 897   FILE *fp = stream_open (file, how);
 898   if (!fp)
 899     die (_("open failed"), file);
 900   return fp;
 901 }
 902
 903 /* Close FP, whose name is FILE, and report any errors.  */
 904
 905 static void
 906 xfclose (FILE *fp, char const *file)
 907 {
 908   switch (fileno (fp))
 909     {
 910     case STDIN_FILENO:
 911       /* Allow reading stdin from tty more than once.  */
 912       if (feof (fp))
 913         clearerr (fp);
 914       break;
 915
 916     case STDOUT_FILENO:
 917       /* Don't close stdout just yet.  close_stdout does that.  */
 918       if (fflush (fp) != 0)
 919         die (_("fflush failed"), file);
 920       break;
 921
 922     default:
 923       if (fclose (fp) != 0)
 924         die (_("close failed"), file);
 925       break;
 926     }
 927 }
 928
 929 static void
 930 dup2_or_die (int oldfd, int newfd)
 931 {
 932   if (dup2 (oldfd, newfd) < 0)
 933     error (SORT_FAILURE, errno, _("dup2 failed"));
 934 }
 935
 936 /* Fork a child process for piping to and do common cleanup.  The
 937    TRIES parameter tells us how many times to try to fork before
 938    giving up.  Return the PID of the child, or -1 (setting errno)
 939    on failure. */
 940
 941 static pid_t
 942 pipe_fork (int pipefds[2], size_t tries)
 943 {
 944 #if HAVE_WORKING_FORK
 945   struct tempnode *saved_temphead;
 946   int saved_errno;
 947   double wait_retry = 0.25;
 948   pid_t pid IF_LINT (= -1);
 949   struct cs_status cs;
 950
 951   if (pipe (pipefds) < 0)
 952     return -1;
 953
 954   while (tries--)
 955     {
 956       /* This is so the child process won't delete our temp files
 957          if it receives a signal before exec-ing.  */
 958       cs = cs_enter ();
 959       saved_temphead = temphead;
 960       temphead = NULL;
 961
 962       pid = fork ();
 963       saved_errno = errno;
 964       if (pid)
 965         temphead = saved_temphead;
 966
 967       cs_leave (cs);
 968       errno = saved_errno;
 969
 970       if (0 <= pid || errno != EAGAIN)
 971         break;
 972       else
 973         {
 974           xnanosleep (wait_retry);
 975           wait_retry *= 2;
 976           reap_some ();
 977         }
 978     }
 979
 980   if (pid < 0)
 981     {
 982       saved_errno = errno;
 983       close (pipefds[0]);
 984       close (pipefds[1]);
 985       errno = saved_errno;
 986     }
 987   else if (pid == 0)
 988     {
 989       close (STDIN_FILENO);
 990       close (STDOUT_FILENO);
 991     }
 992   else
 993     ++nprocs;
 994
 995   return pid;
 996
 997 #else  /* ! HAVE_WORKING_FORK */
 998   return -1;
 999 #endif
1000 }
1001
1002 /* Create a temporary file and start a compression program to filter output
1003    to that file.  Set *PFP to the file handle and if PPID is non-NULL,
1004    set *PPID to the PID of the newly-created process.  If the creation
1005    fails, return NULL if the failure is due to file descriptor
1006    exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die.  */
1007
1008 static char *
1009 maybe_create_temp (FILE **pfp, pid_t *ppid, bool survive_fd_exhaustion)
1010 {
1011   int tempfd;
1012   struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1013   char *name;
1014
1015   if (! node)
1016     return NULL;
1017
1018   name = node->name;
1019
1020   if (compress_program)
1021     {
1022       int pipefds[2];
1023
1024       node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1025       if (0 < node->pid)
1026         {
1027           close (tempfd);
1028           close (pipefds[0]);
1029           tempfd = pipefds[1];
1030
1031           register_proc (node->pid);
1032         }
1033       else if (node->pid == 0)
1034         {
1035           close (pipefds[1]);
1036           dup2_or_die (tempfd, STDOUT_FILENO);
1037           close (tempfd);
1038           dup2_or_die (pipefds[0], STDIN_FILENO);
1039           close (pipefds[0]);
1040
1041           if (execlp (compress_program, compress_program, (char *) NULL) < 0)
1042             error (SORT_FAILURE, errno, _("couldn't execute %s"),
1043                    compress_program);
1044         }
1045       else
1046         node->pid = 0;
1047     }
1048
1049   *pfp = fdopen (tempfd, "w");
1050   if (! *pfp)
1051     die (_("couldn't create temporary file"), name);
1052
1053   if (ppid)
1054     *ppid = node->pid;
1055
1056   return name;
1057 }
1058
1059 /* Create a temporary file and start a compression program to filter output
1060    to that file.  Set *PFP to the file handle and if *PPID is non-NULL,
1061    set it to the PID of the newly-created process.  Die on failure.  */
1062
1063 static char *
1064 create_temp (FILE **pfp, pid_t *ppid)
1065 {
1066   return maybe_create_temp (pfp, ppid, false);
1067 }
1068
1069 /* Open a compressed temp file and start a decompression process through
1070    which to filter the input.  PID must be the valid processes ID of the
1071    process used to compress the file.  Return NULL (setting errno to
1072    EMFILE) if we ran out of file descriptors, and die on any other
1073    kind of failure.  */
1074
1075 static FILE *
1076 open_temp (const char *name, pid_t pid)
1077 {
1078   int tempfd, pipefds[2];
1079   FILE *fp = NULL;
1080
1081   wait_proc (pid);
1082
1083   tempfd = open (name, O_RDONLY);
1084   if (tempfd < 0)
1085     return NULL;
1086
1087   switch (pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS))
1088     {
1089     case -1:
1090       if (errno != EMFILE)
1091         error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1092                compress_program);
1093       close (tempfd);
1094       errno = EMFILE;
1095       break;
1096
1097     case 0:
1098       close (pipefds[0]);
1099       dup2_or_die (tempfd, STDIN_FILENO);
1100       close (tempfd);
1101       dup2_or_die (pipefds[1], STDOUT_FILENO);
1102       close (pipefds[1]);
1103
1104       execlp (compress_program, compress_program, "-d", (char *) NULL);
1105       error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1106              compress_program);
1107
1108     default:
1109       close (tempfd);
1110       close (pipefds[1]);
1111
1112       fp = fdopen (pipefds[0], "r");
1113       if (! fp)
1114         {
1115           int saved_errno = errno;
1116           close (pipefds[0]);
1117           errno = saved_errno;
1118         }
1119       break;
1120     }
1121
1122   return fp;
1123 }
1124
1125 /* Append DIR to the array of temporary directory names.  */
1126 static void
1127 add_temp_dir (char const *dir)
1128 {
1129   if (temp_dir_count == temp_dir_alloc)
1130     temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1131
1132   temp_dirs[temp_dir_count++] = dir;
1133 }
1134
1135 /* Remove NAME from the list of temporary files.  */
1136
1137 static void
1138 zaptemp (const char *name)
1139 {
1140   struct tempnode *volatile *pnode;
1141   struct tempnode *node;
1142   struct tempnode *next;
1143   int unlink_status;
1144   int unlink_errno = 0;
1145   struct cs_status cs;
1146
1147   for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1148     continue;
1149
1150   /* Unlink the temporary file in a critical section to avoid races.  */
1151   next = node->next;
1152   cs = cs_enter ();
1153   unlink_status = unlink (name);
1154   unlink_errno = errno;
1155   *pnode = next;
1156   cs_leave (cs);
1157
1158   if (unlink_status != 0)
1159     error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1160   if (! next)
1161     temptail = pnode;
1162   free (node);
1163 }
1164
1165 #if HAVE_NL_LANGINFO
1166
1167 static int
1168 struct_month_cmp (const void *m1, const void *m2)
1169 {
1170   struct month const *month1 = m1;
1171   struct month const *month2 = m2;
1172   return strcmp (month1->name, month2->name);
1173 }
1174
1175 #endif
1176
1177 /* Initialize the character class tables. */
1178
1179 static void
1180 inittables (void)
1181 {
1182   size_t i;
1183
1184   for (i = 0; i < UCHAR_LIM; ++i)
1185     {
1186       blanks[i] = !! isblank (i);
1187       nonprinting[i] = ! isprint (i);
1188       nondictionary[i] = ! isalnum (i) && ! isblank (i);
1189       fold_toupper[i] = toupper (i);
1190     }
1191
1192 #if HAVE_NL_LANGINFO
1193   /* If we're not in the "C" locale, read different names for months.  */
1194   if (hard_LC_TIME)
1195     {
1196       for (i = 0; i < MONTHS_PER_YEAR; i++)
1197         {
1198           char const *s;
1199           size_t s_len;
1200           size_t j, k;
1201           char *name;
1202
1203           s = (char *) nl_langinfo (ABMON_1 + i);
1204           s_len = strlen (s);
1205           monthtab[i].name = name = xmalloc (s_len + 1);
1206           monthtab[i].val = i + 1;
1207
1208           for (j = k = 0; j < s_len; j++)
1209             if (! isblank (to_uchar (s[j])))
1210               name[k++] = fold_toupper[to_uchar (s[j])];
1211           name[k] = '\0';
1212         }
1213       qsort ((void *) monthtab, MONTHS_PER_YEAR,
1214              sizeof *monthtab, struct_month_cmp);
1215     }
1216 #endif
1217 }
1218
1219 /* Specify how many inputs may be merged at once.
1220    This may be set on the command-line with the
1221    --batch-size option. */
1222 static void
1223 specify_nmerge (int oi, char c, char const *s)
1224 {
1225   uintmax_t n;
1226   struct rlimit rlimit;
1227   enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1228
1229   /* Try to find out how many file descriptors we'll be able
1230      to open.  We need at least nmerge + 3 (STDIN_FILENO,
1231      STDOUT_FILENO and STDERR_FILENO). */
1232   unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1233                               ? rlimit.rlim_cur
1234                               : OPEN_MAX)
1235                              - 3);
1236
1237   if (e == LONGINT_OK)
1238     {
1239       nmerge = n;
1240       if (nmerge != n)
1241         e = LONGINT_OVERFLOW;
1242       else
1243         {
1244           if (nmerge < 2)
1245             {
1246               error (0, 0, _("invalid --%s argument %s"),
1247                      long_options[oi].name, quote (s));
1248               error (SORT_FAILURE, 0,
1249                      _("minimum --%s argument is %s"),
1250                      long_options[oi].name, quote ("2"));
1251             }
1252           else if (max_nmerge < nmerge)
1253             {
1254               e = LONGINT_OVERFLOW;
1255             }
1256           else
1257             return;
1258         }
1259     }
1260
1261   if (e == LONGINT_OVERFLOW)
1262     {
1263       char max_nmerge_buf[INT_BUFSIZE_BOUND (unsigned int)];
1264       error (0, 0, _("--%s argument %s too large"),
1265              long_options[oi].name, quote (s));
1266       error (SORT_FAILURE, 0,
1267              _("maximum --%s argument with current rlimit is %s"),
1268              long_options[oi].name,
1269              uinttostr (max_nmerge, max_nmerge_buf));
1270     }
1271   else
1272     xstrtol_fatal (e, oi, c, long_options, s);
1273 }
1274
1275 /* Specify the amount of main memory to use when sorting.  */
1276 static void
1277 specify_sort_size (int oi, char c, char const *s)
1278 {
1279   uintmax_t n;
1280   char *suffix;
1281   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1282
1283   /* The default unit is KiB.  */
1284   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1285     {
1286       if (n <= UINTMAX_MAX / 1024)
1287         n *= 1024;
1288       else
1289         e = LONGINT_OVERFLOW;
1290     }
1291
1292   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
1293   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1294     switch (suffix[0])
1295       {
1296       case 'b':
1297         e = LONGINT_OK;
1298         break;
1299
1300       case '%':
1301         {
1302           double mem = physmem_total () * n / 100;
1303
1304           /* Use "<", not "<=", to avoid problems with rounding.  */
1305           if (mem < UINTMAX_MAX)
1306             {
1307               n = mem;
1308               e = LONGINT_OK;
1309             }
1310           else
1311             e = LONGINT_OVERFLOW;
1312         }
1313         break;
1314       }
1315
1316   if (e == LONGINT_OK)
1317     {
1318       /* If multiple sort sizes are specified, take the maximum, so
1319          that option order does not matter.  */
1320       if (n < sort_size)
1321         return;
1322
1323       sort_size = n;
1324       if (sort_size == n)
1325         {
1326           sort_size = MAX (sort_size, MIN_SORT_SIZE);
1327           return;
1328         }
1329
1330       e = LONGINT_OVERFLOW;
1331     }
1332
1333   xstrtol_fatal (e, oi, c, long_options, s);
1334 }
1335
1336 /* Return the default sort size.  */
1337 static size_t
1338 default_sort_size (void)
1339 {
1340   /* Let MEM be available memory or 1/8 of total memory, whichever
1341      is greater.  */
1342   double avail = physmem_available ();
1343   double total = physmem_total ();
1344   double mem = MAX (avail, total / 8);
1345   struct rlimit rlimit;
1346
1347   /* Let SIZE be MEM, but no more than the maximum object size or
1348      system resource limits.  Avoid the MIN macro here, as it is not
1349      quite right when only one argument is floating point.  Don't
1350      bother to check for values like RLIM_INFINITY since in practice
1351      they are not much less than SIZE_MAX.  */
1352   size_t size = SIZE_MAX;
1353   if (mem < size)
1354     size = mem;
1355   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1356     size = rlimit.rlim_cur;
1357 #ifdef RLIMIT_AS
1358   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1359     size = rlimit.rlim_cur;
1360 #endif
1361
1362   /* Leave a large safety margin for the above limits, as failure can
1363      occur when they are exceeded.  */
1364   size /= 2;
1365
1366 #ifdef RLIMIT_RSS
1367   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1368      Exceeding RSS is not fatal, but can be quite slow.  */
1369   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1370     size = rlimit.rlim_cur / 16 * 15;
1371 #endif
1372
1373   /* Use no less than the minimum.  */
1374   return MAX (size, MIN_SORT_SIZE);
1375 }
1376
1377 /* Return the sort buffer size to use with the input files identified
1378    by FPS and FILES, which are alternate names of the same files.
1379    NFILES gives the number of input files; NFPS may be less.  Assume
1380    that each input line requires LINE_BYTES extra bytes' worth of line
1381    information.  Do not exceed the size bound specified by the user
1382    (or a default size bound, if the user does not specify one).  */
1383
1384 static size_t
1385 sort_buffer_size (FILE *const *fps, size_t nfps,
1386                   char *const *files, size_t nfiles,
1387                   size_t line_bytes)
1388 {
1389   /* A bound on the input size.  If zero, the bound hasn't been
1390      determined yet.  */
1391   static size_t size_bound;
1392
1393   /* In the worst case, each input byte is a newline.  */
1394   size_t worst_case_per_input_byte = line_bytes + 1;
1395
1396   /* Keep enough room for one extra input line and an extra byte.
1397      This extra room might be needed when preparing to read EOF.  */
1398   size_t size = worst_case_per_input_byte + 1;
1399
1400   size_t i;
1401
1402   for (i = 0; i < nfiles; i++)
1403     {
1404       struct stat st;
1405       off_t file_size;
1406       size_t worst_case;
1407
1408       if ((i < nfps ? fstat (fileno (fps[i]), &st)
1409            : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1410            : stat (files[i], &st))
1411           != 0)
1412         die (_("stat failed"), files[i]);
1413
1414       if (S_ISREG (st.st_mode))
1415         file_size = st.st_size;
1416       else
1417         {
1418           /* The file has unknown size.  If the user specified a sort
1419              buffer size, use that; otherwise, guess the size.  */
1420           if (sort_size)
1421             return sort_size;
1422           file_size = INPUT_FILE_SIZE_GUESS;
1423         }
1424
1425       if (! size_bound)
1426         {
1427           size_bound = sort_size;
1428           if (! size_bound)
1429             size_bound = default_sort_size ();
1430         }
1431
1432       /* Add the amount of memory needed to represent the worst case
1433          where the input consists entirely of newlines followed by a
1434          single non-newline.  Check for overflow.  */
1435       worst_case = file_size * worst_case_per_input_byte + 1;
1436       if (file_size != worst_case / worst_case_per_input_byte
1437           || size_bound - size <= worst_case)
1438         return size_bound;
1439       size += worst_case;
1440     }
1441
1442   return size;
1443 }
1444
1445 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
1446    must be at least sizeof (struct line).  Allocate ALLOC bytes
1447    initially.  */
1448
1449 static void
1450 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1451 {
1452   /* Ensure that the line array is properly aligned.  If the desired
1453      size cannot be allocated, repeatedly halve it until allocation
1454      succeeds.  The smaller allocation may hurt overall performance,
1455      but that's better than failing.  */
1456   for (;;)
1457     {
1458       alloc += sizeof (struct line) - alloc % sizeof (struct line);
1459       buf->buf = malloc (alloc);
1460       if (buf->buf)
1461         break;
1462       alloc /= 2;
1463       if (alloc <= line_bytes + 1)
1464         xalloc_die ();
1465     }
1466
1467   buf->line_bytes = line_bytes;
1468   buf->alloc = alloc;
1469   buf->used = buf->left = buf->nlines = 0;
1470   buf->eof = false;
1471 }
1472
1473 /* Return one past the limit of the line array.  */
1474
1475 static inline struct line *
1476 buffer_linelim (struct buffer const *buf)
1477 {
1478   return (struct line *) (buf->buf + buf->alloc);
1479 }
1480
1481 /* Return a pointer to the first character of the field specified
1482    by KEY in LINE. */
1483
1484 static char *
1485 begfield (const struct line *line, const struct keyfield *key)
1486 {
1487   char *ptr = line->text, *lim = ptr + line->length - 1;
1488   size_t sword = key->sword;
1489   size_t schar = key->schar;
1490
1491   /* The leading field separator itself is included in a field when -t
1492      is absent.  */
1493
1494   if (tab != TAB_DEFAULT)
1495     while (ptr < lim && sword--)
1496       {
1497         while (ptr < lim && *ptr != tab)
1498           ++ptr;
1499         if (ptr < lim)
1500           ++ptr;
1501       }
1502   else
1503     while (ptr < lim && sword--)
1504       {
1505         while (ptr < lim && blanks[to_uchar (*ptr)])
1506           ++ptr;
1507         while (ptr < lim && !blanks[to_uchar (*ptr)])
1508           ++ptr;
1509       }
1510
1511   /* If we're ignoring leading blanks when computing the Start
1512      of the field, skip past them here.  */
1513   if (key->skipsblanks)
1514     while (ptr < lim && blanks[to_uchar (*ptr)])
1515       ++ptr;
1516
1517   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
1518   ptr = MIN (lim, ptr + schar);
1519
1520   return ptr;
1521 }
1522
1523 /* Return the limit of (a pointer to the first character after) the field
1524    in LINE specified by KEY. */
1525
1526 static char *
1527 limfield (const struct line *line, const struct keyfield *key)
1528 {
1529   char *ptr = line->text, *lim = ptr + line->length - 1;
1530   size_t eword = key->eword, echar = key->echar;
1531
1532   if (echar == 0)
1533     eword++; /* Skip all of end field.  */
1534
1535   /* Move PTR past EWORD fields or to one past the last byte on LINE,
1536      whichever comes first.  If there are more than EWORD fields, leave
1537      PTR pointing at the beginning of the field having zero-based index,
1538      EWORD.  If a delimiter character was specified (via -t), then that
1539      `beginning' is the first character following the delimiting TAB.
1540      Otherwise, leave PTR pointing at the first `blank' character after
1541      the preceding field.  */
1542   if (tab != TAB_DEFAULT)
1543     while (ptr < lim && eword--)
1544       {
1545         while (ptr < lim && *ptr != tab)
1546           ++ptr;
1547         if (ptr < lim && (eword || echar))
1548           ++ptr;
1549       }
1550   else
1551     while (ptr < lim && eword--)
1552       {
1553         while (ptr < lim && blanks[to_uchar (*ptr)])
1554           ++ptr;
1555         while (ptr < lim && !blanks[to_uchar (*ptr)])
1556           ++ptr;
1557       }
1558
1559 #ifdef POSIX_UNSPECIFIED
1560   /* The following block of code makes GNU sort incompatible with
1561      standard Unix sort, so it's ifdef'd out for now.
1562      The POSIX spec isn't clear on how to interpret this.
1563      FIXME: request clarification.
1564
1565      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1566      Date: Thu, 30 May 96 12:20:41 -0400
1567      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1568
1569      [...]I believe I've found another bug in `sort'.
1570
1571      $ cat /tmp/sort.in
1572      a b c 2 d
1573      pq rs 1 t
1574      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1575      a b c 2 d
1576      pq rs 1 t
1577      $ /bin/sort -k1.7,1.7 </tmp/sort.in
1578      pq rs 1 t
1579      a b c 2 d
1580
1581      Unix sort produced the answer I expected: sort on the single character
1582      in column 7.  GNU sort produced different results, because it disagrees
1583      on the interpretation of the key-end spec "M.N".  Unix sort reads this
1584      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1585      "skip M-1 fields, then either N-1 characters or the rest of the current
1586      field, whichever comes first".  This extra clause applies only to
1587      key-ends, not key-starts.
1588      */
1589
1590   /* Make LIM point to the end of (one byte past) the current field.  */
1591   if (tab != TAB_DEFAULT)
1592     {
1593       char *newlim;
1594       newlim = memchr (ptr, tab, lim - ptr);
1595       if (newlim)
1596         lim = newlim;
1597     }
1598   else
1599     {
1600       char *newlim;
1601       newlim = ptr;
1602       while (newlim < lim && blanks[to_uchar (*newlim)])
1603         ++newlim;
1604       while (newlim < lim && !blanks[to_uchar (*newlim)])
1605         ++newlim;
1606       lim = newlim;
1607     }
1608 #endif
1609
1610   if (echar != 0) /* We need to skip over a portion of the end field.  */
1611     {
1612       /* If we're ignoring leading blanks when computing the End
1613          of the field, skip past them here.  */
1614       if (key->skipeblanks)
1615         while (ptr < lim && blanks[to_uchar (*ptr)])
1616           ++ptr;
1617
1618       /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
1619       ptr = MIN (lim, ptr + echar);
1620     }
1621
1622   return ptr;
1623 }
1624
1625 /* Fill BUF reading from FP, moving buf->left bytes from the end
1626    of buf->buf to the beginning first.  If EOF is reached and the
1627    file wasn't terminated by a newline, supply one.  Set up BUF's line
1628    table too.  FILE is the name of the file corresponding to FP.
1629    Return true if some input was read.  */
1630
1631 static bool
1632 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1633 {
1634   struct keyfield const *key = keylist;
1635   char eol = eolchar;
1636   size_t line_bytes = buf->line_bytes;
1637   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1638
1639   if (buf->eof)
1640     return false;
1641
1642   if (buf->used != buf->left)
1643     {
1644       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1645       buf->used = buf->left;
1646       buf->nlines = 0;
1647     }
1648
1649   for (;;)
1650     {
1651       char *ptr = buf->buf + buf->used;
1652       struct line *linelim = buffer_linelim (buf);
1653       struct line *line = linelim - buf->nlines;
1654       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1655       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1656
1657       while (line_bytes + 1 < avail)
1658         {
1659           /* Read as many bytes as possible, but do not read so many
1660              bytes that there might not be enough room for the
1661              corresponding line array.  The worst case is when the
1662              rest of the input file consists entirely of newlines,
1663              except that the last byte is not a newline.  */
1664           size_t readsize = (avail - 1) / (line_bytes + 1);
1665           size_t bytes_read = fread (ptr, 1, readsize, fp);
1666           char *ptrlim = ptr + bytes_read;
1667           char *p;
1668           avail -= bytes_read;
1669
1670           if (bytes_read != readsize)
1671             {
1672               if (ferror (fp))
1673                 die (_("read failed"), file);
1674               if (feof (fp))
1675                 {
1676                   buf->eof = true;
1677                   if (buf->buf == ptrlim)
1678                     return false;
1679                   if (ptrlim[-1] != eol)
1680                     *ptrlim++ = eol;
1681                 }
1682             }
1683
1684           /* Find and record each line in the just-read input.  */
1685           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1686             {
1687               ptr = p + 1;
1688               line--;
1689               line->text = line_start;
1690               line->length = ptr - line_start;
1691               mergesize = MAX (mergesize, line->length);
1692               avail -= line_bytes;
1693
1694               if (key)
1695                 {
1696                   /* Precompute the position of the first key for
1697                      efficiency.  */
1698                   line->keylim = (key->eword == SIZE_MAX
1699                                   ? p
1700                                   : limfield (line, key));
1701
1702                   if (key->sword != SIZE_MAX)
1703                     line->keybeg = begfield (line, key);
1704                   else
1705                     {
1706                       if (key->skipsblanks)
1707                         while (blanks[to_uchar (*line_start)])
1708                           line_start++;
1709                       line->keybeg = line_start;
1710                     }
1711                 }
1712
1713               line_start = ptr;
1714             }
1715
1716           ptr = ptrlim;
1717           if (buf->eof)
1718             break;
1719         }
1720
1721       buf->used = ptr - buf->buf;
1722       buf->nlines = buffer_linelim (buf) - line;
1723       if (buf->nlines != 0)
1724         {
1725           buf->left = ptr - line_start;
1726           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1727           return true;
1728         }
1729
1730       {
1731         /* The current input line is too long to fit in the buffer.
1732            Double the buffer size and try again, keeping it properly
1733            aligned.  */
1734         size_t line_alloc = buf->alloc / sizeof (struct line);
1735         buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1736         buf->alloc = line_alloc * sizeof (struct line);
1737       }
1738     }
1739 }
1740
1741 /* Exit with an error if a mixture of SI and IEC units detected.  */
1742
1743 static bool
1744 check_mixed_SI_IEC (char prefix, struct keyfield *key)
1745 {
1746   int iec_present = prefix == 'i';
1747   if (key)
1748     {
1749       if (key->iec_present != -1 && iec_present != key->iec_present)
1750         error (SORT_FAILURE, 0, _("both SI and IEC prefixes present on units"));
1751       key->iec_present = iec_present;
1752     }
1753   return iec_present;
1754 }
1755
1756 /* Return an integer which represents the order of magnitude of
1757    the unit following the number.  NUMBER can contain thousands separators
1758    or a decimal point, but not have preceeding blanks.
1759    Negative numbers return a negative unit order.  */
1760
1761 static int
1762 find_unit_order (const char *number, struct keyfield *key, char const **endptr)
1763 {
1764   static const char orders [UCHAR_LIM] =
1765     {
1766 #if SOME_DAY_WE_WILL_REQUIRE_C99
1767       ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1768       ['k']=1,
1769 #else
1770       /* Generate the following table with this command:
1771          perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1772          foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1773          |fmt  */
1774       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1775       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1776       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1777       0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1778       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1779       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1780       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1781       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1782       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1783       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1784       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1785 #endif
1786     };
1787
1788   const unsigned char *p = number;
1789
1790   int sign = 1;
1791
1792   if (*p == '-')
1793     {
1794       sign = -1;
1795       p++;
1796     }
1797
1798   /* Scan to end of number.
1799      Decimals or separators not followed by digits stop the scan.
1800      Numbers ending in decimals or separators are thus considered
1801      to be lacking in units.
1802      FIXME: add support for multibyte thousands_sep and decimal_point.  */
1803
1804   while (ISDIGIT (*p))
1805     {
1806       p++;
1807
1808       if (*p == decimal_point && ISDIGIT (*(p + 1)))
1809         p += 2;
1810       else if (*p == thousands_sep && ISDIGIT (*(p + 1)))
1811         p += 2;
1812     }
1813
1814   int order = orders[*p];
1815
1816   /* For valid units check for MiB vs MB etc.  */
1817   if (order)
1818     {
1819       p++;
1820       p += check_mixed_SI_IEC (*p, key);
1821     }
1822
1823   if (endptr)
1824     *endptr = p;
1825
1826   return sign * order;
1827 }
1828
1829 /* Compare numbers ending in units with SI xor IEC prefixes
1830        <none/unknown> < K/k < M < G < T < P < E < Z < Y
1831    Assume that numbers are properly abbreviated.
1832    i.e. input will never have both 6000K and 5M.  */
1833
1834 static int
1835 human_numcompare (const char *a, const char *b, struct keyfield *key,
1836                   char const **ea)
1837 {
1838   while (blanks[to_uchar (*a)])
1839     a++;
1840   while (blanks[to_uchar (*b)])
1841     b++;
1842
1843   int order_a = find_unit_order (a, key, ea);
1844   int order_b = find_unit_order (b, key, NULL);
1845
1846   return (order_a > order_b ? 1
1847           : order_a < order_b ? -1
1848           : strnumcmp (a, b, decimal_point, thousands_sep));
1849 }
1850
1851 /* Compare strings A and B as numbers without explicitly converting them to
1852    machine numbers.  Comparatively slow for short strings, but asymptotically
1853    hideously fast. */
1854
1855 static int
1856 numcompare (const char *a, const char *b, char const **ea)
1857 {
1858   while (blanks[to_uchar (*a)])
1859     a++;
1860   while (blanks[to_uchar (*b)])
1861     b++;
1862
1863   if (debug)
1864     {
1865       /* Approximate strnumcmp extents with find_unit_order.  */
1866       if (find_unit_order (a, NULL, ea))
1867         {
1868           *ea -= 1; /* ignore the order letter */
1869           *ea -= (**ea == 'i'); /* and IEC prefix */
1870         }
1871     }
1872
1873   return strnumcmp (a, b, decimal_point, thousands_sep);
1874 }
1875
1876 static int
1877 general_numcompare (const char *sa, const char *sb, char const **ea)
1878 {
1879   /* FIXME: maybe add option to try expensive FP conversion
1880      only if A and B can't be compared more cheaply/accurately.  */
1881
1882 #if HAVE_C99_STRTOLD /* provided by c-strtold module.  */
1883 # define long_double long double
1884 #else
1885 # define long_double double
1886 # undef strtold
1887 # define strtold strtod
1888 #endif
1889
1890   char *eb;
1891   long_double a = strtold (sa, (char **) ea);
1892   long_double b = strtold (sb, &eb);
1893
1894   /* Put conversion errors at the start of the collating sequence.  */
1895   if (sa == *ea)
1896     return sb == eb ? 0 : -1;
1897   if (sb == eb)
1898     return 1;
1899
1900   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1901      conversion errors but before numbers; sort them by internal
1902      bit-pattern, for lack of a more portable alternative.  */
1903   return (a < b ? -1
1904           : a > b ? 1
1905           : a == b ? 0
1906           : b == b ? -1
1907           : a == a ? 1
1908           : memcmp ((char *) &a, (char *) &b, sizeof a));
1909 }
1910
1911 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1912    Return 0 if the name in S is not recognized.  */
1913
1914 static int
1915 getmonth (char const *month, size_t len, char const **ea)
1916 {
1917   size_t lo = 0;
1918   size_t hi = MONTHS_PER_YEAR;
1919   char const *monthlim = month + len;
1920
1921   for (;;)
1922     {
1923       if (month == monthlim)
1924         return 0;
1925       if (!blanks[to_uchar (*month)])
1926         break;
1927       ++month;
1928     }
1929
1930   do
1931     {
1932       size_t ix = (lo + hi) / 2;
1933       char const *m = month;
1934       char const *n = monthtab[ix].name;
1935
1936       for (;; m++, n++)
1937         {
1938           if (!*n)
1939             {
1940               if (ea)
1941                 *ea = m;
1942               return monthtab[ix].val;
1943             }
1944           if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1945             {
1946               hi = ix;
1947               break;
1948             }
1949           else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1950             {
1951               lo = ix + 1;
1952               break;
1953             }
1954         }
1955     }
1956   while (lo < hi);
1957
1958   return 0;
1959 }
1960
1961 /* A source of random data.  */
1962 static struct randread_source *randread_source;
1963
1964 /* Return the Ith randomly-generated state.  The caller must invoke
1965    random_state (H) for all H less than I before invoking random_state
1966    (I).  */
1967
1968 static struct md5_ctx
1969 random_state (size_t i)
1970 {
1971   /* An array of states resulting from the random data, and counts of
1972      its used and allocated members.  */
1973   static struct md5_ctx *state;
1974   static size_t used;
1975   static size_t allocated;
1976
1977   struct md5_ctx *s = &state[i];
1978
1979   if (used <= i)
1980     {
1981       unsigned char buf[MD5_DIGEST_SIZE];
1982
1983       used++;
1984
1985       if (allocated <= i)
1986         {
1987           state = X2NREALLOC (state, &allocated);
1988           s = &state[i];
1989         }
1990
1991       randread (randread_source, buf, sizeof buf);
1992       md5_init_ctx (s);
1993       md5_process_bytes (buf, sizeof buf, s);
1994     }
1995
1996   return *s;
1997 }
1998
1999 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
2000    with length LENGTHB.  Return negative if less, zero if equal,
2001    positive if greater.  */
2002
2003 static int
2004 cmp_hashes (char const *texta, size_t lena,
2005             char const *textb, size_t lenb)
2006 {
2007   /* Try random hashes until a pair of hashes disagree.  But if the
2008      first pair of random hashes agree, check whether the keys are
2009      identical and if so report no difference.  */
2010   int diff;
2011   size_t i;
2012   for (i = 0; ; i++)
2013     {
2014       uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2015       struct md5_ctx s[2];
2016       s[0] = s[1] = random_state (i);
2017       md5_process_bytes (texta, lena, &s[0]);  md5_finish_ctx (&s[0], dig[0]);
2018       md5_process_bytes (textb, lenb, &s[1]);  md5_finish_ctx (&s[1], dig[1]);
2019       diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2020       if (diff != 0)
2021         break;
2022       if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
2023         break;
2024     }
2025
2026   return diff;
2027 }
2028
2029 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2030    using one or more random hash functions.  */
2031
2032 static int
2033 compare_random (char *restrict texta, size_t lena,
2034                 char *restrict textb, size_t lenb)
2035 {
2036   int diff;
2037
2038   if (! hard_LC_COLLATE)
2039     diff = cmp_hashes (texta, lena, textb, lenb);
2040   else
2041     {
2042       /* Transform the text into the basis of comparison, so that byte
2043          strings that would otherwise considered to be equal are
2044          considered equal here even if their bytes differ.  */
2045
2046       char *buf = NULL;
2047       char stackbuf[4000];
2048       size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
2049       bool a_fits = tlena <= sizeof stackbuf;
2050       size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
2051                                (a_fits ? sizeof stackbuf - tlena : 0),
2052                                textb, lenb);
2053
2054       if (a_fits && tlena + tlenb <= sizeof stackbuf)
2055         buf = stackbuf;
2056       else
2057         {
2058           /* Adding 1 to the buffer size lets xmemxfrm run a bit
2059              faster by avoiding the need for an extra buffer copy.  */
2060           buf = xmalloc (tlena + tlenb + 1);
2061           xmemxfrm (buf, tlena + 1, texta, lena);
2062           xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
2063         }
2064
2065       diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
2066
2067       if (buf != stackbuf)
2068         free (buf);
2069     }
2070
2071   return diff;
2072 }
2073
2074 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2075    using filevercmp. See lib/filevercmp.h for function description. */
2076
2077 static int
2078 compare_version (char *restrict texta, size_t lena,
2079                  char *restrict textb, size_t lenb)
2080 {
2081   int diff;
2082
2083   /* It is necessary to save the character after the end of the field.
2084      "filevercmp" works with NUL terminated strings.  Our blocks of
2085      text are not necessarily terminated with a NUL byte. */
2086   char sv_a = texta[lena];
2087   char sv_b = textb[lenb];
2088
2089   texta[lena] = '\0';
2090   textb[lenb] = '\0';
2091
2092   diff = filevercmp (texta, textb);
2093
2094   texta[lena] = sv_a;
2095   textb[lenb] = sv_b;
2096
2097   return diff;
2098 }
2099
2100 /* For debug mode, count tabs in the passed string
2101    so we can adjust the widths returned by mbswidth.
2102    FIXME: Should we generally be counting non printable chars?  */
2103
2104 static size_t
2105 count_tabs (char const *text, const size_t len)
2106 {
2107   size_t tabs = 0;
2108   size_t tlen = strnlen (text, len);
2109
2110   while (tlen--)
2111     {
2112       if (*text++ == '\t')
2113         tabs++;
2114     }
2115
2116   return tabs;
2117 }
2118
2119 /* For debug mode, "underline" a key at the
2120    specified offset and screen width.  */
2121
2122 static void
2123 mark_key (size_t offset, size_t width)
2124 {
2125   printf ("%*s", (int) offset, "");
2126
2127   if (!width)
2128     printf (_("^ no match for key\n"));
2129   else
2130     {
2131       while (width--)
2132         putchar ('_');
2133       putchar ('\n');
2134     }
2135 }
2136
2137 /* For debug mode, determine the screen offset and width
2138    to highlight for a key, and then output the highlight.  */
2139
2140 static void
2141 debug_key (char const *sline, char const *sfield, char const *efield,
2142            size_t flen, bool skipb)
2143 {
2144   char const *sa = sfield;
2145
2146   if (skipb) /* This key type implicitly skips leading blanks.  */
2147     {
2148       while (sa < efield && blanks[to_uchar (*sa)])
2149         {
2150           sa++;
2151           if (flen)
2152             flen--; /* This assumes TABs same width as SPACEs.  */
2153         }
2154     }
2155
2156   size_t offset = mbsnwidth (sline, sfield - sline, 0) + (sa - sfield);
2157   offset += count_tabs (sline, sfield - sline);
2158
2159   size_t width = mbsnwidth (sa, flen, 0);
2160   width += count_tabs (sa, flen);
2161
2162   mark_key (offset, width);
2163 }
2164
2165 /* Testing if a key is numeric is done in various places.  */
2166
2167 static inline bool
2168 key_numeric (struct keyfield const *key)
2169 {
2170   return key->numeric || key->general_numeric || key->human_numeric;
2171 }
2172
2173 /* Return whether sorting options specified for key.  */
2174
2175 static bool
2176 default_key_compare (struct keyfield const *key)
2177 {
2178   return ! (key->ignore
2179             || key->translate
2180             || key->skipsblanks
2181             || key->skipeblanks
2182             || key_numeric (key)
2183             || key->month
2184             || key->version
2185             || key->random
2186             /* || key->reverse */
2187            );
2188 }
2189
2190 /* Convert a key to the short options used to specify it.  */
2191
2192 static void
2193 key_to_opts (struct keyfield const *key, char *opts)
2194 {
2195   if (key->skipsblanks || key->skipeblanks)
2196     *opts++ = 'b';/* either disables global -b  */
2197   if (key->ignore == nondictionary)
2198     *opts++ = 'd';
2199   if (key->translate)
2200     *opts++ = 'f';
2201   if (key->general_numeric)
2202     *opts++ = 'g';
2203   if (key->human_numeric)
2204     *opts++ = 'h';
2205   if (key->ignore == nonprinting)
2206     *opts++ = 'i';
2207   if (key->month)
2208     *opts++ = 'M';
2209   if (key->numeric)
2210     *opts++ = 'n';
2211   if (key->random)
2212     *opts++ = 'R';
2213   if (key->reverse)
2214     *opts++ = 'r';
2215   if (key->version)
2216     *opts++ = 'V';
2217   *opts = '\0';
2218 }
2219
2220 /* Output data independent key warnings to stderr.  */
2221
2222 static void
2223 key_warnings (struct keyfield const *gkey, bool gkey_only)
2224 {
2225   struct keyfield const *key;
2226   struct keyfield ugkey = *gkey;
2227   unsigned long keynum = 1;
2228
2229   for (key = keylist; key; key = key->next, keynum++)
2230     {
2231       if (key->obsolete_used)
2232         {
2233           /* obsolescent syntax +A.x -B.y is equivalent to:
2234                -k A+1.x+1,B.y   (when y = 0)
2235                -k A+1.x+1,B+1.y (when y > 0)  */
2236           char obuf[INT_BUFSIZE_BOUND (size_t) * 2 + 4]; /* +# -#  */
2237           char nbuf[INT_BUFSIZE_BOUND (size_t) * 2 + 5]; /* -k #,#  */
2238
2239           char *po = obuf;
2240           char *pn = nbuf;
2241
2242           size_t sword = key->sword;
2243           size_t eword = key->eword;
2244           if (sword == SIZE_MAX)
2245             sword++;
2246
2247           po += sprintf (po, "+%" PRIuMAX, (uintmax_t) sword);
2248           pn += sprintf (pn, "-k %" PRIuMAX, (uintmax_t) sword + 1);
2249           if (key->eword != SIZE_MAX)
2250             {
2251               po += sprintf (po, " -%" PRIuMAX, (uintmax_t) eword + 1);
2252               pn += sprintf (pn, ",%" PRIuMAX,
2253                              (uintmax_t) eword + 1 + (key->echar == SIZE_MAX));
2254             }
2255           error (0, 0, _("obsolescent key `%s' used; consider `%s' instead"),
2256                  obuf, nbuf);
2257         }
2258
2259       /* Warn about field specs that will never match.  */
2260       if (key->sword != SIZE_MAX && key->eword < key->sword)
2261         error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2262
2263       /* Warn about significant leading blanks.  */
2264       bool implicit_skip = key_numeric (key) || key->month;
2265       bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
2266                                  && !(key->schar || key->echar);
2267       bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y  */
2268       if (!gkey_only && tab == TAB_DEFAULT && !line_offset
2269           && ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
2270               || (!key->skipsblanks && key->schar)
2271               || (!key->skipeblanks && key->echar)))
2272         error (0, 0, _("leading blanks are significant in key %lu; "
2273                        "consider also specifying `b'"), keynum);
2274
2275       /* Warn about numeric comparisons spanning fields,
2276          as field delimiters could be interpreted as part
2277          of the number (maybe only in other locales).  */
2278       if (!gkey_only && key_numeric (key))
2279         {
2280           size_t sword = key->sword + 1;
2281           size_t eword = key->eword + 1;
2282           if (!sword)
2283             sword++;
2284           if (sword != eword)
2285             error (0, 0, _("key %lu is numeric and spans multiple fields"),
2286                    keynum);
2287         }
2288
2289       /* Flag global options not copied or specified in any key.  */
2290       if (ugkey.ignore && (ugkey.ignore == key->ignore))
2291         ugkey.ignore = NULL;
2292       if (ugkey.translate && (ugkey.translate == key->translate))
2293         ugkey.translate = NULL;
2294       ugkey.skipsblanks &= !key->skipsblanks;
2295       ugkey.skipeblanks &= !key->skipeblanks;
2296       ugkey.month &= !key->month;
2297       ugkey.numeric &= !key->numeric;
2298       ugkey.general_numeric &= !key->general_numeric;
2299       ugkey.human_numeric &= !key->human_numeric;
2300       ugkey.random &= !key->random;
2301       ugkey.version &= !key->version;
2302       ugkey.reverse &= !key->reverse;
2303     }
2304
2305   /* Warn about ignored global options flagged above.
2306      Note if gkey is the only one in the list, all flags are cleared.  */
2307   if (!default_key_compare (&ugkey)
2308       || (ugkey.reverse && (stable || unique) && keylist))
2309     {
2310       bool ugkey_reverse = ugkey.reverse;
2311       if (!(stable || unique))
2312         ugkey.reverse = false;
2313       /* The following is too big, but guaranteed to be "big enough".  */
2314       char opts[sizeof short_options];
2315       key_to_opts (&ugkey, opts);
2316       error (0, 0,
2317              ngettext ("option `-%s' is ignored",
2318                        "options `-%s' are ignored",
2319                        select_plural (strlen (opts))), opts);
2320       ugkey.reverse = ugkey_reverse;
2321     }
2322   if (ugkey.reverse && !(stable || unique) && keylist)
2323     error (0, 0, _("option `-r' only applies to last-resort comparison"));
2324 }
2325
2326 /* Compare two lines A and B trying every key in sequence until there
2327    are no more keys or a difference is found. */
2328
2329 static int
2330 keycompare (const struct line *a, const struct line *b, bool show_debug)
2331 {
2332   struct keyfield *key = keylist;
2333
2334   /* For the first iteration only, the key positions have been
2335      precomputed for us. */
2336   char *texta = a->keybeg;
2337   char *textb = b->keybeg;
2338   char *lima = a->keylim;
2339   char *limb = b->keylim;
2340
2341   int diff;
2342
2343   for (;;)
2344     {
2345       char const *translate = key->translate;
2346       bool const *ignore = key->ignore;
2347       bool skipb = false; /* Whether key type auto skips leading blanks.  */
2348
2349       /* Treat field ends before field starts as empty fields.  */
2350       lima = MAX (texta, lima);
2351       limb = MAX (textb, limb);
2352
2353       /* Find the lengths. */
2354       size_t lena = lima - texta;
2355       size_t lenb = limb - textb;
2356
2357       /* Actually compare the fields. */
2358
2359       if (key->random)
2360         diff = compare_random (texta, lena, textb, lenb);
2361       else if (key_numeric (key))
2362         {
2363           char savea = *lima, saveb = *limb;
2364           char const* ea = lima;
2365
2366           *lima = *limb = '\0';
2367           diff = (key->numeric ? numcompare (texta, textb, &ea)
2368                   : key->general_numeric ? general_numcompare (texta, textb,
2369                                                                &ea)
2370                   : human_numcompare (texta, textb, key, &ea));
2371           if (show_debug)
2372             {
2373               lena = ea - texta;
2374               skipb = true;
2375             }
2376           *lima = savea, *limb = saveb;
2377         }
2378       else if (key->version)
2379         diff = compare_version (texta, lena, textb, lenb);
2380       else if (key->month)
2381         {
2382           char const *ea = lima;
2383
2384           int amon = getmonth (texta, lena, &ea);
2385           diff = amon - getmonth (textb, lenb, NULL);
2386
2387           if (show_debug)
2388             {
2389               lena = amon ? ea - texta : 0;
2390               skipb = true;
2391             }
2392         }
2393       /* Sorting like this may become slow, so in a simple locale the user
2394          can select a faster sort that is similar to ascii sort.  */
2395       else if (hard_LC_COLLATE)
2396         {
2397           /* FIXME: for debug, account for skipped chars, while handling mb chars.
2398              Generally perhaps xmemfrm could be used to determine chars that are
2399              excluded from the collating order?  */
2400           if (ignore || translate)
2401             {
2402               char buf[4000];
2403               size_t size = lena + 1 + lenb + 1;
2404               char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
2405               char *copy_b = copy_a + lena + 1;
2406               size_t new_len_a, new_len_b, i;
2407
2408               /* Ignore and/or translate chars before comparing.  */
2409               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
2410                 {
2411                   if (i < lena)
2412                     {
2413                       copy_a[new_len_a] = (translate
2414                                            ? translate[to_uchar (texta[i])]
2415                                            : texta[i]);
2416                       if (!ignore || !ignore[to_uchar (texta[i])])
2417                         ++new_len_a;
2418                     }
2419                   if (i < lenb)
2420                     {
2421                       copy_b[new_len_b] = (translate
2422                                            ? translate[to_uchar (textb[i])]
2423                                            : textb [i]);
2424                       if (!ignore || !ignore[to_uchar (textb[i])])
2425                         ++new_len_b;
2426                     }
2427                 }
2428
2429               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
2430
2431               if (sizeof buf < size)
2432                 free (copy_a);
2433             }
2434           else if (lena == 0)
2435             diff = - NONZERO (lenb);
2436           else if (lenb == 0)
2437             goto greater;
2438           else
2439             diff = xmemcoll (texta, lena, textb, lenb);
2440         }
2441       else if (ignore)
2442         {
2443           char *savea = texta;
2444
2445 #define CMP_WITH_IGNORE(A, B)                                           \
2446   do                                                                    \
2447     {                                                                   \
2448           for (;;)                                                      \
2449             {                                                           \
2450               while (texta < lima && ignore[to_uchar (*texta)])         \
2451                 ++texta;                                                \
2452               while (textb < limb && ignore[to_uchar (*textb)])         \
2453                 ++textb;                                                \
2454               if (! (texta < lima && textb < limb))                     \
2455                 break;                                                  \
2456               diff = to_uchar (A) - to_uchar (B);                       \
2457               if (diff)                                                 \
2458                 goto not_equal;                                         \
2459               ++texta;                                                  \
2460               ++textb;                                                  \
2461             }                                                           \
2462                                                                         \
2463           diff = (texta < lima) - (textb < limb);                       \
2464     }                                                                   \
2465   while (0)
2466
2467           if (translate)
2468             CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2469                              translate[to_uchar (*textb)]);
2470           else
2471             CMP_WITH_IGNORE (*texta, *textb);
2472
2473           /* We only need to restore this for debug_key
2474              in which case the keys being compared are equal.  */
2475           texta = savea;
2476         }
2477       else if (lena == 0)
2478         diff = - NONZERO (lenb);
2479       else if (lenb == 0)
2480         goto greater;
2481       else
2482         {
2483           if (translate)
2484             {
2485               char *savea = texta;
2486
2487               while (texta < lima && textb < limb)
2488                 {
2489                   diff = (to_uchar (translate[to_uchar (*texta++)])
2490                           - to_uchar (translate[to_uchar (*textb++)]));
2491                   if (diff)
2492                     goto not_equal;
2493                 }
2494
2495               /* We only need to restore this for debug_key
2496                  in which case the keys being compared are equal.  */
2497               texta = savea;
2498             }
2499           else
2500             {
2501               diff = memcmp (texta, textb, MIN (lena, lenb));
2502               if (diff)
2503                 goto not_equal;
2504             }
2505           diff = lena < lenb ? -1 : lena != lenb;
2506         }
2507
2508       if (diff)
2509         goto not_equal;
2510
2511       if (show_debug)
2512         debug_key (a->text, texta, lima, lena, skipb);
2513
2514       key = key->next;
2515       if (! key)
2516         break;
2517
2518       /* Find the beginning and limit of the next field.  */
2519       if (key->eword != SIZE_MAX)
2520         lima = limfield (a, key), limb = limfield (b, key);
2521       else
2522         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2523
2524       if (key->sword != SIZE_MAX)
2525         texta = begfield (a, key), textb = begfield (b, key);
2526       else
2527         {
2528           texta = a->text, textb = b->text;
2529           if (key->skipsblanks)
2530             {
2531               while (texta < lima && blanks[to_uchar (*texta)])
2532                 ++texta;
2533               while (textb < limb && blanks[to_uchar (*textb)])
2534                 ++textb;
2535             }
2536         }
2537     }
2538
2539   return 0;
2540
2541  greater:
2542   diff = 1;
2543  not_equal:
2544   return key->reverse ? -diff : diff;
2545 }
2546
2547 /* Compare two lines A and B, returning negative, zero, or positive
2548    depending on whether A compares less than, equal to, or greater than B. */
2549
2550 static int
2551 compare (const struct line *a, const struct line *b, bool show_debug)
2552 {
2553   int diff;
2554   size_t alen, blen;
2555
2556   /* First try to compare on the specified keys (if any).
2557      The only two cases with no key at all are unadorned sort,
2558      and unadorned sort -r. */
2559   if (keylist)
2560     {
2561       diff = keycompare (a, b, show_debug);
2562       if (diff || unique || stable)
2563         return diff;
2564     }
2565
2566   /* If the keys all compare equal (or no keys were specified)
2567      fall through to the default comparison.  */
2568   alen = a->length - 1, blen = b->length - 1;
2569
2570   if (show_debug)
2571     debug_key (a->text, a->text, a->text + alen, alen, false);
2572
2573   if (alen == 0)
2574     diff = - NONZERO (blen);
2575   else if (blen == 0)
2576     diff = 1;
2577   else if (hard_LC_COLLATE)
2578     diff = xmemcoll (a->text, alen, b->text, blen);
2579   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2580     diff = alen < blen ? -1 : alen != blen;
2581
2582   return reverse ? -diff : diff;
2583 }
2584
2585 static void
2586 write_bytes (struct line const *line, FILE *fp, char const *output_file)
2587 {
2588   char const *buf = line->text;
2589   size_t n_bytes = line->length;
2590
2591   /* Convert TABs to '>' and \0 to \n when -z specified.  */
2592   if (debug && fp == stdout)
2593     {
2594       char const *ebuf = buf + n_bytes;
2595       char const *c = buf;
2596
2597       while (c < ebuf)
2598         {
2599           char wc = *c++;
2600           if (wc == '\t')
2601             wc = '>';
2602           else if (wc == 0 && eolchar == 0)
2603             wc = '\n';
2604           if (fputc (wc, fp) == EOF)
2605             die (_("write failed"), output_file);
2606         }
2607
2608       compare (line, line, true);
2609     }
2610   else
2611     {
2612       if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2613         die (_("write failed"), output_file);
2614     }
2615 }
2616
2617 /* Check that the lines read from FILE_NAME come in order.  Return
2618    true if they are in order.  If CHECKONLY == 'c', also print a
2619    diagnostic (FILE_NAME, line number, contents of line) to stderr if
2620    they are not in order.  */
2621
2622 static bool
2623 check (char const *file_name, char checkonly)
2624 {
2625   FILE *fp = xfopen (file_name, "r");
2626   struct buffer buf;            /* Input buffer. */
2627   struct line temp;             /* Copy of previous line. */
2628   size_t alloc = 0;
2629   uintmax_t line_number = 0;
2630   struct keyfield const *key = keylist;
2631   bool nonunique = ! unique;
2632   bool ordered = true;
2633
2634   initbuf (&buf, sizeof (struct line),
2635            MAX (merge_buffer_size, sort_size));
2636   temp.text = NULL;
2637
2638   while (fillbuf (&buf, fp, file_name))
2639     {
2640       struct line const *line = buffer_linelim (&buf);
2641       struct line const *linebase = line - buf.nlines;
2642
2643       /* Make sure the line saved from the old buffer contents is
2644          less than or equal to the first line of the new buffer. */
2645       if (alloc && nonunique <= compare (&temp, line - 1, false))
2646         {
2647         found_disorder:
2648           {
2649             if (checkonly == 'c')
2650               {
2651                 struct line const *disorder_line = line - 1;
2652                 uintmax_t disorder_line_number =
2653                   buffer_linelim (&buf) - disorder_line + line_number;
2654                 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
2655                 fprintf (stderr, _("%s: %s:%s: disorder: "),
2656                          program_name, file_name,
2657                          umaxtostr (disorder_line_number, hr_buf));
2658                 if (debug)
2659                   fputc ('\n', stderr);
2660                 write_bytes (disorder_line, debug ? stdout : stderr,
2661                              debug ? _("standard out") : _("standard error"));
2662               }
2663
2664             ordered = false;
2665             break;
2666           }
2667         }
2668
2669       /* Compare each line in the buffer with its successor.  */
2670       while (linebase < --line)
2671         if (nonunique <= compare (line, line - 1, false))
2672           goto found_disorder;
2673
2674       line_number += buf.nlines;
2675
2676       /* Save the last line of the buffer.  */
2677       if (alloc < line->length)
2678         {
2679           do
2680             {
2681               alloc *= 2;
2682               if (! alloc)
2683                 {
2684                   alloc = line->length;
2685                   break;
2686                 }
2687             }
2688           while (alloc < line->length);
2689
2690           temp.text = xrealloc (temp.text, alloc);
2691         }
2692       memcpy (temp.text, line->text, line->length);
2693       temp.length = line->length;
2694       if (key)
2695         {
2696           temp.keybeg = temp.text + (line->keybeg - line->text);
2697           temp.keylim = temp.text + (line->keylim - line->text);
2698         }
2699     }
2700
2701   xfclose (fp, file_name);
2702   free (buf.buf);
2703   free (temp.text);
2704   return ordered;
2705 }
2706
2707 /* Open FILES (there are NFILES of them) and store the resulting array
2708    of stream pointers into (*PFPS).  Allocate the array.  Return the
2709    number of successfully opened files, setting errno if this value is
2710    less than NFILES.  */
2711
2712 static size_t
2713 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2714 {
2715   FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2716   int i;
2717
2718   /* Open as many input files as we can.  */
2719   for (i = 0; i < nfiles; i++)
2720     {
2721       fps[i] = (files[i].pid
2722                 ? open_temp (files[i].name, files[i].pid)
2723                 : stream_open (files[i].name, "r"));
2724       if (!fps[i])
2725         break;
2726     }
2727
2728   return i;
2729 }
2730
2731 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
2732    files (all of which are at the start of the FILES array), and
2733    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2734    FPS is the vector of open stream corresponding to the files.
2735    Close input and output streams before returning.
2736    OUTPUT_FILE gives the name of the output file.  If it is NULL,
2737    the output file is standard output.  */
2738
2739 static void
2740 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2741           FILE *ofp, char const *output_file, FILE **fps)
2742 {
2743   struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2744                                 /* Input buffers for each file. */
2745   struct line saved;            /* Saved line storage for unique check. */
2746   struct line const *savedline = NULL;
2747                                 /* &saved if there is a saved line. */
2748   size_t savealloc = 0;         /* Size allocated for the saved line. */
2749   struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2750                                 /* Current line in each line table. */
2751   struct line const **base = xnmalloc (nfiles, sizeof *base);
2752                                 /* Base of each line table.  */
2753   size_t *ord = xnmalloc (nfiles, sizeof *ord);
2754                                 /* Table representing a permutation of fps,
2755                                    such that cur[ord[0]] is the smallest line
2756                                    and will be next output. */
2757   size_t i;
2758   size_t j;
2759   size_t t;
2760   struct keyfield const *key = keylist;
2761   saved.text = NULL;
2762
2763   /* Read initial lines from each input file. */
2764   for (i = 0; i < nfiles; )
2765     {
2766       initbuf (&buffer[i], sizeof (struct line),
2767                MAX (merge_buffer_size, sort_size / nfiles));
2768       if (fillbuf (&buffer[i], fps[i], files[i].name))
2769         {
2770           struct line const *linelim = buffer_linelim (&buffer[i]);
2771           cur[i] = linelim - 1;
2772           base[i] = linelim - buffer[i].nlines;
2773           i++;
2774         }
2775       else
2776         {
2777           /* fps[i] is empty; eliminate it from future consideration.  */
2778           xfclose (fps[i], files[i].name);
2779           if (i < ntemps)
2780             {
2781               ntemps--;
2782               zaptemp (files[i].name);
2783             }
2784           free (buffer[i].buf);
2785           --nfiles;
2786           for (j = i; j < nfiles; ++j)
2787             {
2788               files[j] = files[j + 1];
2789               fps[j] = fps[j + 1];
2790             }
2791         }
2792     }
2793
2794   /* Set up the ord table according to comparisons among input lines.
2795      Since this only reorders two items if one is strictly greater than
2796      the other, it is stable. */
2797   for (i = 0; i < nfiles; ++i)
2798     ord[i] = i;
2799   for (i = 1; i < nfiles; ++i)
2800     if (0 < compare (cur[ord[i - 1]], cur[ord[i]], false))
2801       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2802
2803   /* Repeatedly output the smallest line until no input remains. */
2804   while (nfiles)
2805     {
2806       struct line const *smallest = cur[ord[0]];
2807
2808       /* If uniquified output is turned on, output only the first of
2809          an identical series of lines. */
2810       if (unique)
2811         {
2812           if (savedline && compare (savedline, smallest, false))
2813             {
2814               savedline = NULL;
2815               write_bytes (&saved, ofp, output_file);
2816             }
2817           if (!savedline)
2818             {
2819               savedline = &saved;
2820               if (savealloc < smallest->length)
2821                 {
2822                   do
2823                     if (! savealloc)
2824                       {
2825                         savealloc = smallest->length;
2826                         break;
2827                       }
2828                   while ((savealloc *= 2) < smallest->length);
2829
2830                   saved.text = xrealloc (saved.text, savealloc);
2831                 }
2832               saved.length = smallest->length;
2833               memcpy (saved.text, smallest->text, saved.length);
2834               if (key)
2835                 {
2836                   saved.keybeg =
2837                     saved.text + (smallest->keybeg - smallest->text);
2838                   saved.keylim =
2839                     saved.text + (smallest->keylim - smallest->text);
2840                 }
2841             }
2842         }
2843       else
2844         write_bytes (smallest, ofp, output_file);
2845
2846       /* Check if we need to read more lines into core. */
2847       if (base[ord[0]] < smallest)
2848         cur[ord[0]] = smallest - 1;
2849       else
2850         {
2851           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2852             {
2853               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2854               cur[ord[0]] = linelim - 1;
2855               base[ord[0]] = linelim - buffer[ord[0]].nlines;
2856             }
2857           else
2858             {
2859               /* We reached EOF on fps[ord[0]].  */
2860               for (i = 1; i < nfiles; ++i)
2861                 if (ord[i] > ord[0])
2862                   --ord[i];
2863               --nfiles;
2864               xfclose (fps[ord[0]], files[ord[0]].name);
2865               if (ord[0] < ntemps)
2866                 {
2867                   ntemps--;
2868                   zaptemp (files[ord[0]].name);
2869                 }
2870               free (buffer[ord[0]].buf);
2871               for (i = ord[0]; i < nfiles; ++i)
2872                 {
2873                   fps[i] = fps[i + 1];
2874                   files[i] = files[i + 1];
2875                   buffer[i] = buffer[i + 1];
2876                   cur[i] = cur[i + 1];
2877                   base[i] = base[i + 1];
2878                 }
2879               for (i = 0; i < nfiles; ++i)
2880                 ord[i] = ord[i + 1];
2881               continue;
2882             }
2883         }
2884
2885       /* The new line just read in may be larger than other lines
2886          already in main memory; push it back in the queue until we
2887          encounter a line larger than it.  Optimize for the common
2888          case where the new line is smallest.  */
2889       {
2890         size_t lo = 1;
2891         size_t hi = nfiles;
2892         size_t probe = lo;
2893         size_t ord0 = ord[0];
2894         size_t count_of_smaller_lines;
2895
2896         while (lo < hi)
2897           {
2898             int cmp = compare (cur[ord0], cur[ord[probe]], false);
2899             if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2900               hi = probe;
2901             else
2902               lo = probe + 1;
2903             probe = (lo + hi) / 2;
2904           }
2905
2906         count_of_smaller_lines = lo - 1;
2907         for (j = 0; j < count_of_smaller_lines; j++)
2908           ord[j] = ord[j + 1];
2909         ord[count_of_smaller_lines] = ord0;
2910       }
2911
2912       /* Free up some resources every once in a while.  */
2913       if (MAX_PROCS_BEFORE_REAP < nprocs)
2914         reap_some ();
2915     }
2916
2917   if (unique && savedline)
2918     {
2919       write_bytes (&saved, ofp, output_file);
2920       free (saved.text);
2921     }
2922
2923   xfclose (ofp, output_file);
2924   free (fps);
2925   free (buffer);
2926   free (ord);
2927   free (base);
2928   free (cur);
2929 }
2930
2931 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
2932    files (all of which are at the start of the FILES array), and
2933    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2934    Close input and output files before returning.
2935    OUTPUT_FILE gives the name of the output file.
2936
2937    Return the number of files successfully merged.  This number can be
2938    less than NFILES if we ran low on file descriptors, but in this
2939    case it is never less than 2.  */
2940
2941 static size_t
2942 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
2943             FILE *ofp, char const *output_file)
2944 {
2945   FILE **fps;
2946   size_t nopened = open_input_files (files, nfiles, &fps);
2947   if (nopened < nfiles && nopened < 2)
2948     die (_("open failed"), files[nopened].name);
2949   mergefps (files, ntemps, nopened, ofp, output_file, fps);
2950   return nopened;
2951 }
2952
2953 /* Merge into T the two sorted arrays of lines LO (with NLO members)
2954    and HI (with NHI members).  T, LO, and HI point just past their
2955    respective arrays, and the arrays are in reverse order.  NLO and
2956    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
2957
2958 static inline void
2959 mergelines (struct line *t,
2960             struct line const *lo, size_t nlo,
2961             struct line const *hi, size_t nhi)
2962 {
2963   for (;;)
2964     if (compare (lo - 1, hi - 1, false) <= 0)
2965       {
2966         *--t = *--lo;
2967         if (! --nlo)
2968           {
2969             /* HI - NHI equalled T - (NLO + NHI) when this function
2970                began.  Therefore HI must equal T now, and there is no
2971                need to copy from HI to T.  */
2972             return;
2973           }
2974       }
2975     else
2976       {
2977         *--t = *--hi;
2978         if (! --nhi)
2979           {
2980             do
2981               *--t = *--lo;
2982             while (--nlo);
2983
2984             return;
2985           }
2986       }
2987 }
2988
2989 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
2990    NLINES must be at least 2.
2991    The input and output arrays are in reverse order, and LINES and
2992    TEMP point just past the end of their respective arrays.
2993
2994    Use a recursive divide-and-conquer algorithm, in the style
2995    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
2996    the optimization suggested by exercise 5.2.4-10; this requires room
2997    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
2998    writes that this memory optimization was originally published by
2999    D. A. Bell, Comp J. 1 (1958), 75.  */
3000
3001 static void
3002 sortlines (struct line *lines, size_t nlines, struct line *temp)
3003 {
3004   if (nlines == 2)
3005     {
3006       if (0 < compare (&lines[-1], &lines[-2], false))
3007         {
3008           struct line tmp = lines[-1];
3009           lines[-1] = lines[-2];
3010           lines[-2] = tmp;
3011         }
3012     }
3013   else
3014     {
3015       size_t nlo = nlines / 2;
3016       size_t nhi = nlines - nlo;
3017       struct line *lo = lines;
3018       struct line *hi = lines - nlo;
3019       struct line *sorted_lo = temp;
3020
3021       sortlines (hi, nhi, temp);
3022       if (1 < nlo)
3023         sortlines_temp (lo, nlo, sorted_lo);
3024       else
3025         sorted_lo[-1] = lo[-1];
3026
3027       mergelines (lines, sorted_lo, nlo, hi, nhi);
3028     }
3029 }
3030
3031 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
3032    rather than sorting in place.  */
3033
3034 static void
3035 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
3036 {
3037   if (nlines == 2)
3038     {
3039       /* Declare `swap' as int, not bool, to work around a bug
3040          <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3041          in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
3042       int swap = (0 < compare (&lines[-1], &lines[-2], false));
3043       temp[-1] = lines[-1 - swap];
3044       temp[-2] = lines[-2 + swap];
3045     }
3046   else
3047     {
3048       size_t nlo = nlines / 2;
3049       size_t nhi = nlines - nlo;
3050       struct line *lo = lines;
3051       struct line *hi = lines - nlo;
3052       struct line *sorted_hi = temp - nlo;
3053
3054       sortlines_temp (hi, nhi, sorted_hi);
3055       if (1 < nlo)
3056         sortlines (lo, nlo, temp);
3057
3058       mergelines (temp, lo, nlo, sorted_hi, nhi);
3059     }
3060 }
3061
3062 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
3063    the same as OUTFILE.  If found, merge the found instances (and perhaps
3064    some other files) into a temporary file so that it can in turn be
3065    merged into OUTFILE without destroying OUTFILE before it is completely
3066    read.  Return the new value of NFILES, which differs from the old if
3067    some merging occurred.
3068
3069    This test ensures that an otherwise-erroneous use like
3070    "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3071    It's not clear that POSIX requires this nicety.
3072    Detect common error cases, but don't try to catch obscure cases like
3073    "cat ... FILE ... | sort -m -o FILE"
3074    where traditional "sort" doesn't copy the input and where
3075    people should know that they're getting into trouble anyway.
3076    Catching these obscure cases would slow down performance in
3077    common cases.  */
3078
3079 static size_t
3080 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3081                       size_t nfiles, char const *outfile)
3082 {
3083   size_t i;
3084   bool got_outstat = false;
3085   struct stat outstat;
3086
3087   for (i = ntemps; i < nfiles; i++)
3088     {
3089       bool is_stdin = STREQ (files[i].name, "-");
3090       bool same;
3091       struct stat instat;
3092
3093       if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3094         same = true;
3095       else
3096         {
3097           if (! got_outstat)
3098             {
3099               if ((outfile
3100                    ? stat (outfile, &outstat)
3101                    : fstat (STDOUT_FILENO, &outstat))
3102                   != 0)
3103                 break;
3104               got_outstat = true;
3105             }
3106
3107           same = (((is_stdin
3108                     ? fstat (STDIN_FILENO, &instat)
3109                     : stat (files[i].name, &instat))
3110                    == 0)
3111                   && SAME_INODE (instat, outstat));
3112         }
3113
3114       if (same)
3115         {
3116           FILE *tftp;
3117           pid_t pid;
3118           char *temp = create_temp (&tftp, &pid);
3119           size_t num_merged = 0;
3120           do
3121             {
3122               num_merged += mergefiles (&files[i], 0, nfiles - i, tftp, temp);
3123               files[i].name = temp;
3124               files[i].pid = pid;
3125
3126               if (i + num_merged < nfiles)
3127                 memmove (&files[i + 1], &files[i + num_merged],
3128                          num_merged * sizeof *files);
3129               ntemps += 1;
3130               nfiles -= num_merged - 1;;
3131               i += num_merged;
3132             }
3133           while (i < nfiles);
3134         }
3135     }
3136
3137   return nfiles;
3138 }
3139
3140 /* Merge the input FILES.  NTEMPS is the number of files at the
3141    start of FILES that are temporary; it is zero at the top level.
3142    NFILES is the total number of files.  Put the output in
3143    OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
3144
3145 static void
3146 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3147        char const *output_file)
3148 {
3149   while (nmerge < nfiles)
3150     {
3151       /* Number of input files processed so far.  */
3152       size_t in;
3153
3154       /* Number of output files generated so far.  */
3155       size_t out;
3156
3157       /* nfiles % NMERGE; this counts input files that are left over
3158          after all full-sized merges have been done.  */
3159       size_t remainder;
3160
3161       /* Number of easily-available slots at the next loop iteration.  */
3162       size_t cheap_slots;
3163
3164       /* Do as many NMERGE-size merges as possible. In the case that
3165          nmerge is bogus, increment by the maximum number of file
3166          descriptors allowed.  */
3167       for (out = in = 0; nmerge <= nfiles - in; out++)
3168         {
3169           FILE *tfp;
3170           pid_t pid;
3171           char *temp = create_temp (&tfp, &pid);
3172           size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3173                                           nmerge, tfp, temp);
3174           ntemps -= MIN (ntemps, num_merged);
3175           files[out].name = temp;
3176           files[out].pid = pid;
3177           in += num_merged;
3178         }
3179
3180       remainder = nfiles - in;
3181       cheap_slots = nmerge - out % nmerge;
3182
3183       if (cheap_slots < remainder)
3184         {
3185           /* So many files remain that they can't all be put into the last
3186              NMERGE-sized output window.  Do one more merge.  Merge as few
3187              files as possible, to avoid needless I/O.  */
3188           size_t nshortmerge = remainder - cheap_slots + 1;
3189           FILE *tfp;
3190           pid_t pid;
3191           char *temp = create_temp (&tfp, &pid);
3192           size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3193                                           nshortmerge, tfp, temp);
3194           ntemps -= MIN (ntemps, num_merged);
3195           files[out].name = temp;
3196           files[out++].pid = pid;
3197           in += num_merged;
3198         }
3199
3200       /* Put the remaining input files into the last NMERGE-sized output
3201          window, so they will be merged in the next pass.  */
3202       memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3203       ntemps += out;
3204       nfiles -= in - out;
3205     }
3206
3207   nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
3208
3209   /* We aren't guaranteed that this final mergefiles will work, therefore we
3210      try to merge into the output, and then merge as much as we can into a
3211      temp file if we can't. Repeat.  */
3212
3213   for (;;)
3214     {
3215       /* Merge directly into the output file if possible.  */
3216       FILE **fps;
3217       size_t nopened = open_input_files (files, nfiles, &fps);
3218
3219       if (nopened == nfiles)
3220         {
3221           FILE *ofp = stream_open (output_file, "w");
3222           if (ofp)
3223             {
3224               mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3225               break;
3226             }
3227           if (errno != EMFILE || nopened <= 2)
3228             die (_("open failed"), output_file);
3229         }
3230       else if (nopened <= 2)
3231         die (_("open failed"), files[nopened].name);
3232
3233       /* We ran out of file descriptors.  Close one of the input
3234          files, to gain a file descriptor.  Then create a temporary
3235          file with our spare file descriptor.  Retry if that failed
3236          (e.g., some other process could open a file between the time
3237          we closed and tried to create).  */
3238       FILE *tfp;
3239       pid_t pid;
3240       char *temp;
3241       do
3242         {
3243           nopened--;
3244           xfclose (fps[nopened], files[nopened].name);
3245           temp = maybe_create_temp (&tfp, &pid, ! (nopened <= 2));
3246         }
3247       while (!temp);
3248
3249       /* Merge into the newly allocated temporary.  */
3250       mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp, fps);
3251       ntemps -= MIN (ntemps, nopened);
3252       files[0].name = temp;
3253       files[0].pid = pid;
3254
3255       memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3256       ntemps++;
3257       nfiles -= nopened - 1;
3258     }
3259 }
3260
3261 /* Sort NFILES FILES onto OUTPUT_FILE. */
3262
3263 static void
3264 sort (char * const *files, size_t nfiles, char const *output_file)
3265 {
3266   struct buffer buf;
3267   size_t ntemps = 0;
3268   bool output_file_created = false;
3269
3270   buf.alloc = 0;
3271
3272   while (nfiles)
3273     {
3274       char const *temp_output;
3275       char const *file = *files;
3276       FILE *fp = xfopen (file, "r");
3277       FILE *tfp;
3278       size_t bytes_per_line = (2 * sizeof (struct line)
3279                                - sizeof (struct line) / 2);
3280
3281       if (! buf.alloc)
3282         initbuf (&buf, bytes_per_line,
3283                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3284       buf.eof = false;
3285       files++;
3286       nfiles--;
3287
3288       while (fillbuf (&buf, fp, file))
3289         {
3290           struct line *line;
3291           struct line *linebase;
3292
3293           if (buf.eof && nfiles
3294               && (bytes_per_line + 1
3295                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3296             {
3297               /* End of file, but there is more input and buffer room.
3298                  Concatenate the next input file; this is faster in
3299                  the usual case.  */
3300               buf.left = buf.used;
3301               break;
3302             }
3303
3304           line = buffer_linelim (&buf);
3305           linebase = line - buf.nlines;
3306           if (1 < buf.nlines)
3307             sortlines (line, buf.nlines, linebase);
3308           if (buf.eof && !nfiles && !ntemps && !buf.left)
3309             {
3310               xfclose (fp, file);
3311               tfp = xfopen (output_file, "w");
3312               temp_output = output_file;
3313               output_file_created = true;
3314             }
3315           else
3316             {
3317               ++ntemps;
3318               temp_output = create_temp (&tfp, NULL);
3319             }
3320
3321           do
3322             {
3323               line--;
3324               write_bytes (line, tfp, temp_output);
3325               if (unique)
3326                 while (linebase < line && compare (line, line - 1, false) == 0)
3327                   line--;
3328             }
3329           while (linebase < line);
3330
3331           xfclose (tfp, temp_output);
3332
3333           /* Free up some resources every once in a while.  */
3334           if (MAX_PROCS_BEFORE_REAP < nprocs)
3335             reap_some ();
3336
3337           if (output_file_created)
3338             goto finish;
3339         }
3340       xfclose (fp, file);
3341     }
3342
3343  finish:
3344   free (buf.buf);
3345
3346   if (! output_file_created)
3347     {
3348       size_t i;
3349       struct tempnode *node = temphead;
3350       struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3351       for (i = 0; node; i++)
3352         {
3353           tempfiles[i].name = node->name;
3354           tempfiles[i].pid = node->pid;
3355           node = node->next;
3356         }
3357       merge (tempfiles, ntemps, ntemps, output_file);
3358       free (tempfiles);
3359     }
3360 }
3361
3362 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list.  */
3363
3364 static void
3365 insertkey (struct keyfield *key_arg)
3366 {
3367   struct keyfield **p;
3368   struct keyfield *key = xmemdup (key_arg, sizeof *key);
3369
3370   for (p = &keylist; *p; p = &(*p)->next)
3371     continue;
3372   *p = key;
3373   key->next = NULL;
3374 }
3375
3376 /* Report a bad field specification SPEC, with extra info MSGID.  */
3377
3378 static void badfieldspec (char const *, char const *)
3379      ATTRIBUTE_NORETURN;
3380 static void
3381 badfieldspec (char const *spec, char const *msgid)
3382 {
3383   error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
3384          _(msgid), quote (spec));
3385   abort ();
3386 }
3387
3388 /* Report incompatible options.  */
3389
3390 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
3391 static void
3392 incompatible_options (char const *opts)
3393 {
3394   error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
3395   abort ();
3396 }
3397
3398 /* Check compatibility of ordering options.  */
3399
3400 static void
3401 check_ordering_compatibility (void)
3402 {
3403   struct keyfield *key;
3404
3405   for (key = keylist; key; key = key->next)
3406     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
3407               + key->version + !!key->ignore + key->human_numeric))
3408         || (key->random && key->translate))
3409       {
3410         /* The following is too big, but guaranteed to be "big enough".  */
3411         char opts[sizeof short_options];
3412         /* Clear flags we're not interested in.  */
3413         key->skipsblanks = key->skipeblanks = key->reverse = false;
3414         key_to_opts (key, opts);
3415         incompatible_options (opts);
3416       }
3417 }
3418
3419 /* Parse the leading integer in STRING and store the resulting value
3420    (which must fit into size_t) into *VAL.  Return the address of the
3421    suffix after the integer.  If the value is too large, silently
3422    substitute SIZE_MAX.  If MSGID is NULL, return NULL after
3423    failure; otherwise, report MSGID and exit on failure.  */
3424
3425 static char const *
3426 parse_field_count (char const *string, size_t *val, char const *msgid)
3427 {
3428   char *suffix;
3429   uintmax_t n;
3430
3431   switch (xstrtoumax (string, &suffix, 10, &n, ""))
3432     {
3433     case LONGINT_OK:
3434     case LONGINT_INVALID_SUFFIX_CHAR:
3435       *val = n;
3436       if (*val == n)
3437         break;
3438       /* Fall through.  */
3439     case LONGINT_OVERFLOW:
3440     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
3441       *val = SIZE_MAX;
3442       break;
3443
3444     case LONGINT_INVALID:
3445       if (msgid)
3446         error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
3447                _(msgid), quote (string));
3448       return NULL;
3449     }
3450
3451   return suffix;
3452 }
3453
3454 /* Handle interrupts and hangups. */
3455
3456 static void
3457 sighandler (int sig)
3458 {
3459   if (! SA_NOCLDSTOP)
3460     signal (sig, SIG_IGN);
3461
3462   cleanup ();
3463
3464   signal (sig, SIG_DFL);
3465   raise (sig);
3466 }
3467
3468 /* Set the ordering options for KEY specified in S.
3469    Return the address of the first character in S that
3470    is not a valid ordering option.
3471    BLANKTYPE is the kind of blanks that 'b' should skip. */
3472
3473 static char *
3474 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
3475 {
3476   while (*s)
3477     {
3478       switch (*s)
3479         {
3480         case 'b':
3481           if (blanktype == bl_start || blanktype == bl_both)
3482             key->skipsblanks = true;
3483           if (blanktype == bl_end || blanktype == bl_both)
3484             key->skipeblanks = true;
3485           break;
3486         case 'd':
3487           key->ignore = nondictionary;
3488           break;
3489         case 'f':
3490           key->translate = fold_toupper;
3491           break;
3492         case 'g':
3493           key->general_numeric = true;
3494           break;
3495         case 'h':
3496           key->human_numeric = true;
3497           break;
3498         case 'i':
3499           /* Option order should not matter, so don't let -i override
3500              -d.  -d implies -i, but -i does not imply -d.  */
3501           if (! key->ignore)
3502             key->ignore = nonprinting;
3503           break;
3504         case 'M':
3505           key->month = true;
3506           break;
3507         case 'n':
3508           key->numeric = true;
3509           break;
3510         case 'R':
3511           key->random = true;
3512           break;
3513         case 'r':
3514           key->reverse = true;
3515           break;
3516         case 'V':
3517           key->version = true;
3518           break;
3519         default:
3520           return (char *) s;
3521         }
3522       ++s;
3523     }
3524   return (char *) s;
3525 }
3526
3527 static struct keyfield *
3528 key_init (struct keyfield *key)
3529 {
3530   memset (key, 0, sizeof *key);
3531   key->eword = SIZE_MAX;
3532   key->iec_present = -1;
3533   return key;
3534 }
3535
3536 int
3537 main (int argc, char **argv)
3538 {
3539   struct keyfield *key;
3540   struct keyfield key_buf;
3541   struct keyfield gkey;
3542   bool gkey_only = false;
3543   char const *s;
3544   int c = 0;
3545   char checkonly = 0;
3546   bool mergeonly = false;
3547   char *random_source = NULL;
3548   bool need_random = false;
3549   size_t nfiles = 0;
3550   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
3551   bool obsolete_usage = (posix2_version () < 200112);
3552   char **files;
3553   char *files_from = NULL;
3554   struct Tokens tok;
3555   char const *outfile = NULL;
3556
3557   initialize_main (&argc, &argv);
3558   set_program_name (argv[0]);
3559   setlocale (LC_ALL, "");
3560   bindtextdomain (PACKAGE, LOCALEDIR);
3561   textdomain (PACKAGE);
3562
3563   initialize_exit_failure (SORT_FAILURE);
3564
3565   hard_LC_COLLATE = hard_locale (LC_COLLATE);
3566 #if HAVE_NL_LANGINFO
3567   hard_LC_TIME = hard_locale (LC_TIME);
3568 #endif
3569
3570   /* Get locale's representation of the decimal point.  */
3571   {
3572     struct lconv const *locale = localeconv ();
3573
3574     /* If the locale doesn't define a decimal point, or if the decimal
3575        point is multibyte, use the C locale's decimal point.  FIXME:
3576        add support for multibyte decimal points.  */
3577     decimal_point = to_uchar (locale->decimal_point[0]);
3578     if (! decimal_point || locale->decimal_point[1])
3579       decimal_point = '.';
3580
3581     /* FIXME: add support for multibyte thousands separators.  */
3582     thousands_sep = to_uchar (*locale->thousands_sep);
3583     if (! thousands_sep || locale->thousands_sep[1])
3584       thousands_sep = -1;
3585   }
3586
3587   have_read_stdin = false;
3588   inittables ();
3589
3590   {
3591     size_t i;
3592     static int const sig[] =
3593       {
3594         /* The usual suspects.  */
3595         SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
3596 #ifdef SIGPOLL
3597         SIGPOLL,
3598 #endif
3599 #ifdef SIGPROF
3600         SIGPROF,
3601 #endif
3602 #ifdef SIGVTALRM
3603         SIGVTALRM,
3604 #endif
3605 #ifdef SIGXCPU
3606         SIGXCPU,
3607 #endif
3608 #ifdef SIGXFSZ
3609         SIGXFSZ,
3610 #endif
3611       };
3612     enum { nsigs = ARRAY_CARDINALITY (sig) };
3613
3614 #if SA_NOCLDSTOP
3615     struct sigaction act;
3616
3617     sigemptyset (&caught_signals);
3618     for (i = 0; i < nsigs; i++)
3619       {
3620         sigaction (sig[i], NULL, &act);
3621         if (act.sa_handler != SIG_IGN)
3622           sigaddset (&caught_signals, sig[i]);
3623       }
3624
3625     act.sa_handler = sighandler;
3626     act.sa_mask = caught_signals;
3627     act.sa_flags = 0;
3628
3629     for (i = 0; i < nsigs; i++)
3630       if (sigismember (&caught_signals, sig[i]))
3631         sigaction (sig[i], &act, NULL);
3632 #else
3633     for (i = 0; i < nsigs; i++)
3634       if (signal (sig[i], SIG_IGN) != SIG_IGN)
3635         {
3636           signal (sig[i], sighandler);
3637           siginterrupt (sig[i], 1);
3638         }
3639 #endif
3640   }
3641   signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent.  */
3642
3643   /* The signal mask is known, so it is safe to invoke exit_cleanup.  */
3644   atexit (exit_cleanup);
3645
3646   key_init (&gkey);
3647   gkey.sword = SIZE_MAX;
3648
3649   files = xnmalloc (argc, sizeof *files);
3650
3651   for (;;)
3652     {
3653       /* Parse an operand as a file after "--" was seen; or if
3654          pedantic and a file was seen, unless the POSIX version
3655          predates 1003.1-2001 and -c was not seen and the operand is
3656          "-o FILE" or "-oFILE".  */
3657       int oi = -1;
3658
3659       if (c == -1
3660           || (posixly_correct && nfiles != 0
3661               && ! (obsolete_usage
3662                     && ! checkonly
3663                     && optind != argc
3664                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
3665                     && (argv[optind][2] || optind + 1 != argc)))
3666           || ((c = getopt_long (argc, argv, short_options,
3667                                 long_options, &oi))
3668               == -1))
3669         {
3670           if (argc <= optind)
3671             break;
3672           files[nfiles++] = argv[optind++];
3673         }
3674       else switch (c)
3675         {
3676         case 1:
3677           key = NULL;
3678           if (optarg[0] == '+')
3679             {
3680               bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
3681                                       && ISDIGIT (argv[optind][1]));
3682               obsolete_usage |= minus_pos_usage && !posixly_correct;
3683               if (obsolete_usage)
3684                 {
3685                   /* Treat +POS1 [-POS2] as a key if possible; but silently
3686                      treat an operand as a file if it is not a valid +POS1.  */
3687                   key = key_init (&key_buf);
3688                   s = parse_field_count (optarg + 1, &key->sword, NULL);
3689                   if (s && *s == '.')
3690                     s = parse_field_count (s + 1, &key->schar, NULL);
3691                   if (! (key->sword || key->schar))
3692                     key->sword = SIZE_MAX;
3693                   if (! s || *set_ordering (s, key, bl_start))
3694                     key = NULL;
3695                   else
3696                     {
3697                       if (minus_pos_usage)
3698                         {
3699                           char const *optarg1 = argv[optind++];
3700                           s = parse_field_count (optarg1 + 1, &key->eword,
3701                                              N_("invalid number after `-'"));
3702                           if (*s == '.')
3703                             s = parse_field_count (s + 1, &key->echar,
3704                                                N_("invalid number after `.'"));
3705                           if (!key->echar && key->eword)
3706                             {
3707                               /* obsolescent syntax +A.x -B.y is equivalent to:
3708                                    -k A+1.x+1,B.y   (when y = 0)
3709                                    -k A+1.x+1,B+1.y (when y > 0)
3710                                  So eword is decremented as in the -k case
3711                                  only when the end field (B) is specified and
3712                                  echar (y) is 0.  */
3713                               key->eword--;
3714                             }
3715                           if (*set_ordering (s, key, bl_end))
3716                             badfieldspec (optarg1,
3717                                       N_("stray character in field spec"));
3718                         }
3719                       key->obsolete_used = true;
3720                       insertkey (key);
3721                     }
3722                 }
3723             }
3724           if (! key)
3725             files[nfiles++] = optarg;
3726           break;
3727
3728         case SORT_OPTION:
3729           c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
3730           /* Fall through. */
3731         case 'b':
3732         case 'd':
3733         case 'f':
3734         case 'g':
3735         case 'h':
3736         case 'i':
3737         case 'M':
3738         case 'n':
3739         case 'r':
3740         case 'R':
3741         case 'V':
3742           {
3743             char str[2];
3744             str[0] = c;
3745             str[1] = '\0';
3746             set_ordering (str, &gkey, bl_both);
3747           }
3748           break;
3749
3750         case CHECK_OPTION:
3751           c = (optarg
3752                ? XARGMATCH ("--check", optarg, check_args, check_types)
3753                : 'c');
3754           /* Fall through.  */
3755         case 'c':
3756         case 'C':
3757           if (checkonly && checkonly != c)
3758             incompatible_options ("cC");
3759           checkonly = c;
3760           break;
3761
3762         case COMPRESS_PROGRAM_OPTION:
3763           if (compress_program && !STREQ (compress_program, optarg))
3764             error (SORT_FAILURE, 0, _("multiple compress programs specified"));
3765           compress_program = optarg;
3766           break;
3767
3768         case DEBUG_PROGRAM_OPTION:
3769           debug = true;
3770           break;
3771
3772         case FILES0_FROM_OPTION:
3773           files_from = optarg;
3774           break;
3775
3776         case 'k':
3777           key = key_init (&key_buf);
3778
3779           /* Get POS1. */
3780           s = parse_field_count (optarg, &key->sword,
3781                                  N_("invalid number at field start"));
3782           if (! key->sword--)
3783             {
3784               /* Provoke with `sort -k0' */
3785               badfieldspec (optarg, N_("field number is zero"));
3786             }
3787           if (*s == '.')
3788             {
3789               s = parse_field_count (s + 1, &key->schar,
3790                                      N_("invalid number after `.'"));
3791               if (! key->schar--)
3792                 {
3793                   /* Provoke with `sort -k1.0' */
3794                   badfieldspec (optarg, N_("character offset is zero"));
3795                 }
3796             }
3797           if (! (key->sword || key->schar))
3798             key->sword = SIZE_MAX;
3799           s = set_ordering (s, key, bl_start);
3800           if (*s != ',')
3801             {
3802               key->eword = SIZE_MAX;
3803               key->echar = 0;
3804             }
3805           else
3806             {
3807               /* Get POS2. */
3808               s = parse_field_count (s + 1, &key->eword,
3809                                      N_("invalid number after `,'"));
3810               if (! key->eword--)
3811                 {
3812                   /* Provoke with `sort -k1,0' */
3813                   badfieldspec (optarg, N_("field number is zero"));
3814                 }
3815               if (*s == '.')
3816                 {
3817                   s = parse_field_count (s + 1, &key->echar,
3818                                          N_("invalid number after `.'"));
3819                 }
3820               s = set_ordering (s, key, bl_end);
3821             }
3822           if (*s)
3823             badfieldspec (optarg, N_("stray character in field spec"));
3824           insertkey (key);
3825           break;
3826
3827         case 'm':
3828           mergeonly = true;
3829           break;
3830
3831         case NMERGE_OPTION:
3832           specify_nmerge (oi, c, optarg);
3833           break;
3834
3835         case 'o':
3836           if (outfile && !STREQ (outfile, optarg))
3837             error (SORT_FAILURE, 0, _("multiple output files specified"));
3838           outfile = optarg;
3839           break;
3840
3841         case RANDOM_SOURCE_OPTION:
3842           if (random_source && !STREQ (random_source, optarg))
3843             error (SORT_FAILURE, 0, _("multiple random sources specified"));
3844           random_source = optarg;
3845           break;
3846
3847         case 's':
3848           stable = true;
3849           break;
3850
3851         case 'S':
3852           specify_sort_size (oi, c, optarg);
3853           break;
3854
3855         case 't':
3856           {
3857             char newtab = optarg[0];
3858             if (! newtab)
3859               error (SORT_FAILURE, 0, _("empty tab"));
3860             if (optarg[1])
3861               {
3862                 if (STREQ (optarg, "\\0"))
3863                   newtab = '\0';
3864                 else
3865                   {
3866                     /* Provoke with `sort -txx'.  Complain about
3867                        "multi-character tab" instead of "multibyte tab", so
3868                        that the diagnostic's wording does not need to be
3869                        changed once multibyte characters are supported.  */
3870                     error (SORT_FAILURE, 0, _("multi-character tab %s"),
3871                            quote (optarg));
3872                   }
3873               }
3874             if (tab != TAB_DEFAULT && tab != newtab)
3875               error (SORT_FAILURE, 0, _("incompatible tabs"));
3876             tab = newtab;
3877           }
3878           break;
3879
3880         case 'T':
3881           add_temp_dir (optarg);
3882           break;
3883
3884         case 'u':
3885           unique = true;
3886           break;
3887
3888         case 'y':
3889           /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
3890              through Solaris 7.  It is also accepted by many non-Solaris
3891              "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
3892              -y is marked as obsolete starting with Solaris 8 (1999), but is
3893              still accepted as of Solaris 10 prerelease (2004).
3894
3895              Solaris 2.5.1 "sort -y 100" reads the input file "100", but
3896              emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
3897              and which in general ignores the argument after "-y" if it
3898              consists entirely of digits (it can even be empty).  */
3899           if (optarg == argv[optind - 1])
3900             {
3901               char const *p;
3902               for (p = optarg; ISDIGIT (*p); p++)
3903                 continue;
3904               optind -= (*p != '\0');
3905             }
3906           break;
3907
3908         case 'z':
3909           eolchar = 0;
3910           break;
3911
3912         case_GETOPT_HELP_CHAR;
3913
3914         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
3915
3916         default:
3917           usage (SORT_FAILURE);
3918         }
3919     }
3920
3921   if (files_from)
3922     {
3923       FILE *stream;
3924
3925       /* When using --files0-from=F, you may not specify any files
3926          on the command-line.  */
3927       if (nfiles)
3928         {
3929           error (0, 0, _("extra operand %s"), quote (files[0]));
3930           fprintf (stderr, "%s\n",
3931                    _("file operands cannot be combined with --files0-from"));
3932           usage (SORT_FAILURE);
3933         }
3934
3935       if (STREQ (files_from, "-"))
3936         stream = stdin;
3937       else
3938         {
3939           stream = fopen (files_from, "r");
3940           if (stream == NULL)
3941             error (SORT_FAILURE, errno, _("cannot open %s for reading"),
3942                    quote (files_from));
3943         }
3944
3945       readtokens0_init (&tok);
3946
3947       if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
3948         error (SORT_FAILURE, 0, _("cannot read file names from %s"),
3949                quote (files_from));
3950
3951       if (tok.n_tok)
3952         {
3953           size_t i;
3954           free (files);
3955           files = tok.tok;
3956           nfiles = tok.n_tok;
3957           for (i = 0; i < nfiles; i++)
3958           {
3959               if (STREQ (files[i], "-"))
3960                 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
3961                                           "no file name of %s allowed"),
3962                        quote (files[i]));
3963               else if (files[i][0] == '\0')
3964                 {
3965                   /* Using the standard `filename:line-number:' prefix here is
3966                      not totally appropriate, since NUL is the separator, not NL,
3967                      but it might be better than nothing.  */
3968                   unsigned long int file_number = i + 1;
3969                   error (SORT_FAILURE, 0,
3970                          _("%s:%lu: invalid zero-length file name"),
3971                          quotearg_colon (files_from), file_number);
3972                 }
3973           }
3974         }
3975       else
3976         error (SORT_FAILURE, 0, _("no input from %s"),
3977                quote (files_from));
3978     }
3979
3980   /* Inheritance of global options to individual keys. */
3981   for (key = keylist; key; key = key->next)
3982     {
3983       if (default_key_compare (key) && !key->reverse)
3984         {
3985           key->ignore = gkey.ignore;
3986           key->translate = gkey.translate;
3987           key->skipsblanks = gkey.skipsblanks;
3988           key->skipeblanks = gkey.skipeblanks;
3989           key->month = gkey.month;
3990           key->numeric = gkey.numeric;
3991           key->general_numeric = gkey.general_numeric;
3992           key->human_numeric = gkey.human_numeric;
3993           key->version = gkey.version;
3994           key->random = gkey.random;
3995           key->reverse = gkey.reverse;
3996         }
3997
3998       need_random |= key->random;
3999     }
4000
4001   if (!keylist && !default_key_compare (&gkey))
4002     {
4003       gkey_only = true;
4004       insertkey (&gkey);
4005       need_random |= gkey.random;
4006     }
4007
4008   check_ordering_compatibility ();
4009
4010   /* Disable this combination so that users are less likely
4011      to inadvertantly update a file with debugging enabled.
4012      Also it simplifies the code for handling temp files.  */
4013   if (debug && outfile)
4014     error (SORT_FAILURE, 0, _("options -o and --debug are incompatible"));
4015
4016   if (debug)
4017     {
4018       /* Always output the locale in debug mode, since this
4019          is such a common source of confusion.  */
4020       if (hard_LC_COLLATE)
4021         error (0, 0, _("using %s sorting rules"),
4022                quote (setlocale (LC_COLLATE, NULL)));
4023       else
4024         error (0, 0, _("using simple byte comparison"));
4025       key_warnings (&gkey, gkey_only);
4026     }
4027
4028   reverse = gkey.reverse;
4029
4030   if (need_random)
4031     {
4032       randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
4033       if (! randread_source)
4034         die (_("open failed"), random_source);
4035     }
4036
4037   if (temp_dir_count == 0)
4038     {
4039       char const *tmp_dir = getenv ("TMPDIR");
4040       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4041     }
4042
4043   if (nfiles == 0)
4044     {
4045       static char *minus = (char *) "-";
4046       nfiles = 1;
4047       free (files);
4048       files = &minus;
4049     }
4050
4051   /* Need to re-check that we meet the minimum requirement for memory
4052      usage with the final value for NMERGE. */
4053   if (0 < sort_size)
4054     sort_size = MAX (sort_size, MIN_SORT_SIZE);
4055
4056   if (checkonly)
4057     {
4058       if (nfiles > 1)
4059         error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4060                quote (files[1]), checkonly);
4061
4062       if (outfile)
4063         {
4064           static char opts[] = {0, 'o', 0};
4065           opts[0] = checkonly;
4066           incompatible_options (opts);
4067         }
4068
4069       /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4070          input is not properly sorted.  */
4071       exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4072     }
4073
4074   if (mergeonly)
4075     {
4076       struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4077       size_t i;
4078
4079       for (i = 0; i < nfiles; ++i)
4080         sortfiles[i].name = files[i];
4081
4082       merge (sortfiles, 0, nfiles, outfile);
4083       IF_LINT (free (sortfiles));
4084     }
4085   else
4086     sort (files, nfiles, outfile);
4087
4088   if (have_read_stdin && fclose (stdin) == EOF)
4089     die (_("close failed"), "-");
4090
4091   exit (EXIT_SUCCESS);
4092 }