src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 1988, 1991-2008 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17    Written December 1988 by Mike Haertel.
  18    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  19    or (US mail) as Mike Haertel c/o Free Software Foundation.
  20
  21    Ørn E. Hansen added NLS support in 1997.  */
  22
  23 #include <config.h>
  24
  25 #include <getopt.h>
  26 #include <sys/types.h>
  27 #include <sys/wait.h>
  28 #include <signal.h>
  29 #include "system.h"
  30 #include "argmatch.h"
  31 #include "error.h"
  32 #include "hard-locale.h"
  33 #include "hash.h"
  34 #include "md5.h"
  35 #include "physmem.h"
  36 #include "posixver.h"
  37 #include "quote.h"
  38 #include "quotearg.h"
  39 #include "randread.h"
  40 #include "readtokens0.h"
  41 #include "stdio--.h"
  42 #include "stdlib--.h"
  43 #include "strnumcmp.h"
  44 #include "xmemcoll.h"
  45 #include "xmemxfrm.h"
  46 #include "xstrtol.h"
  47
  48 #if HAVE_SYS_RESOURCE_H
  49 # include <sys/resource.h>
  50 #endif
  51 #ifndef RLIMIT_DATA
  52 struct rlimit { size_t rlim_cur; };
  53 # define getrlimit(Resource, Rlp) (-1)
  54 #endif
  55
  56 /* The official name of this program (e.g., no `g' prefix).  */
  57 #define PROGRAM_NAME "sort"
  58
  59 #define AUTHORS \
  60   proper_name ("Mike Haertel"), \
  61   proper_name ("Paul Eggert")
  62
  63 #if HAVE_LANGINFO_CODESET
  64 # include <langinfo.h>
  65 #endif
  66
  67 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
  68    present.  */
  69 #ifndef SA_NOCLDSTOP
  70 # define SA_NOCLDSTOP 0
  71 /* No sigprocmask.  Always 'return' zero. */
  72 # define sigprocmask(How, Set, Oset) (0)
  73 # define sigset_t int
  74 # if ! HAVE_SIGINTERRUPT
  75 #  define siginterrupt(sig, flag) /* empty */
  76 # endif
  77 #endif
  78
  79 #if !defined OPEN_MAX && defined NR_OPEN
  80 # define OPEN_MAX NR_OPEN
  81 #endif
  82 #if !defined OPEN_MAX
  83 # define OPEN_MAX 20
  84 #endif
  85
  86 #define UCHAR_LIM (UCHAR_MAX + 1)
  87
  88 #ifndef DEFAULT_TMPDIR
  89 # define DEFAULT_TMPDIR "/tmp"
  90 #endif
  91
  92 /* Exit statuses.  */
  93 enum
  94   {
  95     /* POSIX says to exit with status 1 if invoked with -c and the
  96        input is not properly sorted.  */
  97     SORT_OUT_OF_ORDER = 1,
  98
  99     /* POSIX says any other irregular exit must exit with a status
 100        code greater than 1.  */
 101     SORT_FAILURE = 2
 102   };
 103
 104 enum
 105   {
 106     /* The number of times we should try to fork a compression process
 107        (we retry if the fork call fails).  We don't _need_ to compress
 108        temp files, this is just to reduce disk access, so this number
 109        can be small.  */
 110     MAX_FORK_TRIES_COMPRESS = 2,
 111
 112     /* The number of times we should try to fork a decompression process.
 113        If we can't fork a decompression process, we can't sort, so this
 114        number should be big.  */
 115     MAX_FORK_TRIES_DECOMPRESS = 8
 116   };
 117
 118 /* The representation of the decimal point in the current locale.  */
 119 static int decimal_point;
 120
 121 /* Thousands separator; if -1, then there isn't one.  */
 122 static int thousands_sep;
 123
 124 /* Nonzero if the corresponding locales are hard.  */
 125 static bool hard_LC_COLLATE;
 126 #if HAVE_NL_LANGINFO
 127 static bool hard_LC_TIME;
 128 #endif
 129
 130 #define NONZERO(x) ((x) != 0)
 131
 132 /* The kind of blanks for '-b' to skip in various options. */
 133 enum blanktype { bl_start, bl_end, bl_both };
 134
 135 /* The character marking end of line. Default to \n. */
 136 static char eolchar = '\n';
 137
 138 /* Lines are held in core as counted strings. */
 139 struct line
 140 {
 141   char *text;                   /* Text of the line. */
 142   size_t length;                /* Length including final newline. */
 143   char *keybeg;                 /* Start of first key. */
 144   char *keylim;                 /* Limit of first key. */
 145 };
 146
 147 /* Input buffers. */
 148 struct buffer
 149 {
 150   char *buf;                    /* Dynamically allocated buffer,
 151                                    partitioned into 3 regions:
 152                                    - input data;
 153                                    - unused area;
 154                                    - an array of lines, in reverse order.  */
 155   size_t used;                  /* Number of bytes used for input data.  */
 156   size_t nlines;                /* Number of lines in the line array.  */
 157   size_t alloc;                 /* Number of bytes allocated. */
 158   size_t left;                  /* Number of bytes left from previous reads. */
 159   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 160   bool eof;                     /* An EOF has been read.  */
 161 };
 162
 163 struct keyfield
 164 {
 165   size_t sword;                 /* Zero-origin 'word' to start at. */
 166   size_t schar;                 /* Additional characters to skip. */
 167   size_t eword;                 /* Zero-origin first word after field. */
 168   size_t echar;                 /* Additional characters in field. */
 169   bool const *ignore;           /* Boolean array of characters to ignore. */
 170   char const *translate;        /* Translation applied to characters. */
 171   bool skipsblanks;             /* Skip leading blanks when finding start.  */
 172   bool skipeblanks;             /* Skip leading blanks when finding end.  */
 173   bool numeric;                 /* Flag for numeric comparison.  Handle
 174                                    strings of digits with optional decimal
 175                                    point, but no exponential notation. */
 176   bool random;                  /* Sort by random hash of key.  */
 177   bool general_numeric;         /* Flag for general, numeric comparison.
 178                                    Handle numbers in exponential notation. */
 179   bool month;                   /* Flag for comparison by month name. */
 180   bool reverse;                 /* Reverse the sense of comparison. */
 181   bool version;                 /* sort by version number */
 182   struct keyfield *next;        /* Next keyfield to try. */
 183 };
 184
 185 struct month
 186 {
 187   char const *name;
 188   int val;
 189 };
 190
 191 /* FIXME: None of these tables work with multibyte character sets.
 192    Also, there are many other bugs when handling multibyte characters.
 193    One way to fix this is to rewrite `sort' to use wide characters
 194    internally, but doing this with good performance is a bit
 195    tricky.  */
 196
 197 /* Table of blanks.  */
 198 static bool blanks[UCHAR_LIM];
 199
 200 /* Table of non-printing characters. */
 201 static bool nonprinting[UCHAR_LIM];
 202
 203 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 204 static bool nondictionary[UCHAR_LIM];
 205
 206 /* Translation table folding lower case to upper.  */
 207 static char fold_toupper[UCHAR_LIM];
 208
 209 #define MONTHS_PER_YEAR 12
 210
 211 /* Table mapping month names to integers.
 212    Alphabetic order allows binary search. */
 213 static struct month monthtab[] =
 214 {
 215   {"APR", 4},
 216   {"AUG", 8},
 217   {"DEC", 12},
 218   {"FEB", 2},
 219   {"JAN", 1},
 220   {"JUL", 7},
 221   {"JUN", 6},
 222   {"MAR", 3},
 223   {"MAY", 5},
 224   {"NOV", 11},
 225   {"OCT", 10},
 226   {"SEP", 9}
 227 };
 228
 229 /* During the merge phase, the number of files to merge at once. */
 230 #define NMERGE_DEFAULT 16
 231
 232 /* Minimum size for a merge or check buffer.  */
 233 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 234
 235 /* Minimum sort size; the code might not work with smaller sizes.  */
 236 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
 237
 238 /* The number of bytes needed for a merge or check buffer, which can
 239    function relatively efficiently even if it holds only one line.  If
 240    a longer line is seen, this value is increased.  */
 241 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 242
 243 /* The approximate maximum number of bytes of main memory to use, as
 244    specified by the user.  Zero if the user has not specified a size.  */
 245 static size_t sort_size;
 246
 247 /* The guessed size for non-regular files.  */
 248 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 249
 250 /* Array of directory names in which any temporary files are to be created. */
 251 static char const **temp_dirs;
 252
 253 /* Number of temporary directory names used.  */
 254 static size_t temp_dir_count;
 255
 256 /* Number of allocated slots in temp_dirs.  */
 257 static size_t temp_dir_alloc;
 258
 259 /* Flag to reverse the order of all comparisons. */
 260 static bool reverse;
 261
 262 /* Flag for stable sort.  This turns off the last ditch bytewise
 263    comparison of lines, and instead leaves lines in the same order
 264    they were read if all keys compare equal.  */
 265 static bool stable;
 266
 267 /* If TAB has this value, blanks separate fields.  */
 268 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 269
 270 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 271    separated by the empty string between a non-blank character and a blank
 272    character. */
 273 static int tab = TAB_DEFAULT;
 274
 275 /* Flag to remove consecutive duplicate lines from the output.
 276    Only the last of a sequence of equal lines will be output. */
 277 static bool unique;
 278
 279 /* Nonzero if any of the input files are the standard input. */
 280 static bool have_read_stdin;
 281
 282 /* List of key field comparisons to be tried.  */
 283 static struct keyfield *keylist;
 284
 285 /* Program used to (de)compress temp files.  Must accept -d.  */
 286 static char const *compress_program;
 287
 288 /* Maximum number of files to merge in one go.  If more than this
 289    number are present, temp files will be used. */
 290 static unsigned int nmerge = NMERGE_DEFAULT;
 291
 292 static void sortlines_temp (struct line *, size_t, struct line *);
 293
 294 /* Report MESSAGE for FILE, then clean up and exit.
 295    If FILE is null, it represents standard output.  */
 296
 297 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 298 static void
 299 die (char const *message, char const *file)
 300 {
 301   error (0, errno, "%s: %s", message, file ? file : _("standard output"));
 302   exit (SORT_FAILURE);
 303 }
 304
 305 void
 306 usage (int status)
 307 {
 308   if (status != EXIT_SUCCESS)
 309     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 310              program_name);
 311   else
 312     {
 313       printf (_("\
 314 Usage: %s [OPTION]... [FILE]...\n\
 315   or:  %s [OPTION]... --files0-from=F\n\
 316 "),
 317               program_name, program_name);
 318       fputs (_("\
 319 Write sorted concatenation of all FILE(s) to standard output.\n\
 320 \n\
 321 "), stdout);
 322       fputs (_("\
 323 Mandatory arguments to long options are mandatory for short options too.\n\
 324 "), stdout);
 325       fputs (_("\
 326 Ordering options:\n\
 327 \n\
 328 "), stdout);
 329       fputs (_("\
 330   -b, --ignore-leading-blanks  ignore leading blanks\n\
 331   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 332   -f, --ignore-case           fold lower case to upper case characters\n\
 333 "), stdout);
 334       fputs (_("\
 335   -g, --general-numeric-sort  compare according to general numerical value\n\
 336   -i, --ignore-nonprinting    consider only printable characters\n\
 337   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 338   -n, --numeric-sort          compare according to string numerical value\n\
 339   -R, --random-sort           sort by random hash of keys\n\
 340   -V, --version-sort          sort by numeric version (see strverscmp(3C))\n\
 341       --random-source=FILE    get random bytes from FILE (default /dev/urandom)\n\
 342       --sort=WORD             sort according to WORD:\n\
 343                                 general-numeric -g, month -M, numeric -n,\n\
 344                                 random -R, version -V\n\
 345   -r, --reverse               reverse the result of comparisons\n\
 346 \n\
 347 "), stdout);
 348       fputs (_("\
 349 Other options:\n\
 350 \n\
 351 "), stdout);
 352       fputs (_("\
 353       --batch-size=NMERGE   merge at most NMERGE inputs at once;\n\
 354                             for more use temp files\n\
 355 "), stdout);
 356       fputs (_("\
 357   -c, --check, --check=diagnose-first  check for sorted input; do not sort\n\
 358   -C, --check=quiet, --check=silent  like -c, but do not report first bad line\n\
 359       --compress-program=PROG  compress temporaries with PROG;\n\
 360                               decompress them with PROG -d\n\
 361       --files0-from=F       read input from the files specified by\n\
 362                             NUL-terminated names in file F\n\
 363 "), stdout);
 364       fputs (_("\
 365   -k, --key=POS1[,POS2]     start a key at POS1, end it at POS2 (origin 1)\n\
 366   -m, --merge               merge already sorted files; do not sort\n\
 367 "), stdout);
 368       fputs (_("\
 369   -o, --output=FILE         write result to FILE instead of standard output\n\
 370   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 371   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 372 "), stdout);
 373       printf (_("\
 374   -t, --field-separator=SEP  use SEP instead of non-blank to blank transition\n\
 375   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 376                               multiple options specify multiple directories\n\
 377   -u, --unique              with -c, check for strict ordering;\n\
 378                               without -c, output only the first of an equal run\n\
 379 "), DEFAULT_TMPDIR);
 380       fputs (_("\
 381   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 382 "), stdout);
 383       fputs (HELP_OPTION_DESCRIPTION, stdout);
 384       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 385       fputs (_("\
 386 \n\
 387 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 388 in the field; both are origin 1.  If neither -t nor -b is in effect, characters\n\
 389 in a field are counted from the beginning of the preceding whitespace.  OPTS is\n\
 390 one or more single-letter ordering options, which override global ordering\n\
 391 options for that key.  If no key is given, use the entire line as the key.\n\
 392 \n\
 393 SIZE may be followed by the following multiplicative suffixes:\n\
 394 "), stdout);
 395       fputs (_("\
 396 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 397 \n\
 398 With no FILE, or when FILE is -, read standard input.\n\
 399 \n\
 400 *** WARNING ***\n\
 401 The locale specified by the environment affects sort order.\n\
 402 Set LC_ALL=C to get the traditional sort order that uses\n\
 403 native byte values.\n\
 404 "), stdout );
 405       emit_bug_reporting_address ();
 406     }
 407
 408   exit (status);
 409 }
 410
 411 /* For long options that have no equivalent short option, use a
 412    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 413 enum
 414 {
 415   CHECK_OPTION = CHAR_MAX + 1,
 416   COMPRESS_PROGRAM_OPTION,
 417   FILES0_FROM_OPTION,
 418   NMERGE_OPTION,
 419   RANDOM_SOURCE_OPTION,
 420   SORT_OPTION
 421 };
 422
 423 static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z";
 424
 425 static struct option const long_options[] =
 426 {
 427   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 428   {"check", optional_argument, NULL, CHECK_OPTION},
 429   {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
 430   {"dictionary-order", no_argument, NULL, 'd'},
 431   {"ignore-case", no_argument, NULL, 'f'},
 432   {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
 433   {"general-numeric-sort", no_argument, NULL, 'g'},
 434   {"ignore-nonprinting", no_argument, NULL, 'i'},
 435   {"key", required_argument, NULL, 'k'},
 436   {"merge", no_argument, NULL, 'm'},
 437   {"month-sort", no_argument, NULL, 'M'},
 438   {"numeric-sort", no_argument, NULL, 'n'},
 439   {"version-sort", no_argument, NULL, 'V'},
 440   {"random-sort", no_argument, NULL, 'R'},
 441   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
 442   {"sort", required_argument, NULL, SORT_OPTION},
 443   {"output", required_argument, NULL, 'o'},
 444   {"reverse", no_argument, NULL, 'r'},
 445   {"stable", no_argument, NULL, 's'},
 446   {"batch-size", required_argument, NULL, NMERGE_OPTION},
 447   {"buffer-size", required_argument, NULL, 'S'},
 448   {"field-separator", required_argument, NULL, 't'},
 449   {"temporary-directory", required_argument, NULL, 'T'},
 450   {"unique", no_argument, NULL, 'u'},
 451   {"zero-terminated", no_argument, NULL, 'z'},
 452   {GETOPT_HELP_OPTION_DECL},
 453   {GETOPT_VERSION_OPTION_DECL},
 454   {NULL, 0, NULL, 0},
 455 };
 456
 457 #define CHECK_TABLE \
 458   _ct_("quiet",          'C') \
 459   _ct_("silent",         'C') \
 460   _ct_("diagnose-first", 'c')
 461
 462 static char const *const check_args[] =
 463 {
 464 #define _ct_(_s, _c) _s,
 465   CHECK_TABLE NULL
 466 #undef  _ct_
 467 };
 468 static char const check_types[] =
 469 {
 470 #define _ct_(_s, _c) _c,
 471   CHECK_TABLE
 472 #undef  _ct_
 473 };
 474
 475 #define SORT_TABLE \
 476   _st_("general-numeric", 'g') \
 477   _st_("month",           'M') \
 478   _st_("numeric",         'n') \
 479   _st_("random",          'R') \
 480   _st_("version",         'V')
 481
 482 static char const *const sort_args[] =
 483 {
 484 #define _st_(_s, _c) _s,
 485   SORT_TABLE NULL
 486 #undef  _st_
 487 };
 488 static char const sort_types[] =
 489 {
 490 #define _st_(_s, _c) _c,
 491   SORT_TABLE
 492 #undef  _st_
 493 };
 494
 495 /* The set of signals that are caught.  */
 496 static sigset_t caught_signals;
 497
 498 /* Critical section status.  */
 499 struct cs_status
 500 {
 501   bool valid;
 502   sigset_t sigs;
 503 };
 504
 505 /* Enter a critical section.  */
 506 static struct cs_status
 507 cs_enter (void)
 508 {
 509   struct cs_status status;
 510   status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
 511   return status;
 512 }
 513
 514 /* Leave a critical section.  */
 515 static void
 516 cs_leave (struct cs_status status)
 517 {
 518   if (status.valid)
 519     {
 520       /* Ignore failure when restoring the signal mask. */
 521       sigprocmask (SIG_SETMASK, &status.sigs, NULL);
 522     }
 523 }
 524
 525 /* The list of temporary files. */
 526 struct tempnode
 527 {
 528   struct tempnode *volatile next;
 529   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 530   char name[1];  /* Actual size is 1 + file name length.  */
 531 };
 532 static struct tempnode *volatile temphead;
 533 static struct tempnode *volatile *temptail = &temphead;
 534
 535 struct sortfile
 536 {
 537   char const *name;
 538   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 539 };
 540
 541 /* A table where we store compression process states.  We clean up all
 542    processes in a timely manner so as not to exhaust system resources,
 543    so we store the info on whether the process is still running, or has
 544    been reaped here.  */
 545 static Hash_table *proctab;
 546
 547 enum { INIT_PROCTAB_SIZE = 47 };
 548
 549 enum procstate { ALIVE, ZOMBIE };
 550
 551 /* A proctab entry.  The COUNT field is there in case we fork a new
 552    compression process that has the same PID as an old zombie process
 553    that is still in the table (because the process to decompress the
 554    temp file it was associated with hasn't started yet).  */
 555 struct procnode
 556 {
 557   pid_t pid;
 558   enum procstate state;
 559   size_t count;
 560 };
 561
 562 static size_t
 563 proctab_hasher (const void *entry, size_t tabsize)
 564 {
 565   const struct procnode *node = entry;
 566   return node->pid % tabsize;
 567 }
 568
 569 static bool
 570 proctab_comparator (const void *e1, const void *e2)
 571 {
 572   const struct procnode *n1 = e1, *n2 = e2;
 573   return n1->pid == n2->pid;
 574 }
 575
 576 /* The total number of forked processes (compressors and decompressors)
 577    that have not been reaped yet. */
 578 static size_t nprocs;
 579
 580 /* The number of child processes we'll allow before we try to reap some. */
 581 enum { MAX_PROCS_BEFORE_REAP = 2 };
 582
 583 /* If 0 < PID, wait for the child process with that PID to exit.
 584    If PID is -1, clean up a random child process which has finished and
 585    return the process ID of that child.  If PID is -1 and no processes
 586    have quit yet, return 0 without waiting.  */
 587
 588 static pid_t
 589 reap (pid_t pid)
 590 {
 591   int status;
 592   pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);
 593
 594   if (cpid < 0)
 595     error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
 596            compress_program);
 597   else if (0 < cpid)
 598     {
 599       if (! WIFEXITED (status) || WEXITSTATUS (status))
 600         error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
 601                compress_program);
 602       --nprocs;
 603     }
 604
 605   return cpid;
 606 }
 607
 608 /* Add the PID of a running compression process to proctab, or update
 609    the entry COUNT and STATE fields if it's already there.  This also
 610    creates the table for us the first time it's called.  */
 611
 612 static void
 613 register_proc (pid_t pid)
 614 {
 615   struct procnode test, *node;
 616
 617   if (! proctab)
 618     {
 619       proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
 620                                  proctab_hasher,
 621                                  proctab_comparator,
 622                                  free);
 623       if (! proctab)
 624         xalloc_die ();
 625     }
 626
 627   test.pid = pid;
 628   node = hash_lookup (proctab, &test);
 629   if (node)
 630     {
 631       node->state = ALIVE;
 632       ++node->count;
 633     }
 634   else
 635     {
 636       node = xmalloc (sizeof *node);
 637       node->pid = pid;
 638       node->state = ALIVE;
 639       node->count = 1;
 640       hash_insert (proctab, node);
 641     }
 642 }
 643
 644 /* This is called when we reap a random process.  We don't know
 645    whether we have reaped a compression process or a decompression
 646    process until we look in the table.  If there's an ALIVE entry for
 647    it, then we have reaped a compression process, so change the state
 648    to ZOMBIE.  Otherwise, it's a decompression processes, so ignore it.  */
 649
 650 static void
 651 update_proc (pid_t pid)
 652 {
 653   struct procnode test, *node;
 654
 655   test.pid = pid;
 656   node = hash_lookup (proctab, &test);
 657   if (node)
 658     node->state = ZOMBIE;
 659 }
 660
 661 /* This is for when we need to wait for a compression process to exit.
 662    If it has a ZOMBIE entry in the table then it's already dead and has
 663    been reaped.  Note that if there's an ALIVE entry for it, it still may
 664    already have died and been reaped if a second process was created with
 665    the same PID.  This is probably exceedingly rare, but to be on the safe
 666    side we will have to wait for any compression process with this PID.  */
 667
 668 static void
 669 wait_proc (pid_t pid)
 670 {
 671   struct procnode test, *node;
 672
 673   test.pid = pid;
 674   node = hash_lookup (proctab, &test);
 675   if (node->state == ALIVE)
 676     reap (pid);
 677
 678   node->state = ZOMBIE;
 679   if (! --node->count)
 680     {
 681       hash_delete (proctab, node);
 682       free (node);
 683     }
 684 }
 685
 686 /* Keep reaping finished children as long as there are more to reap.
 687    This doesn't block waiting for any of them, it only reaps those
 688    that are already dead.  */
 689
 690 static void
 691 reap_some (void)
 692 {
 693   pid_t pid;
 694
 695   while (0 < nprocs && (pid = reap (-1)))
 696     update_proc (pid);
 697 }
 698
 699 /* Clean up any remaining temporary files.  */
 700
 701 static void
 702 cleanup (void)
 703 {
 704   struct tempnode const *node;
 705
 706   for (node = temphead; node; node = node->next)
 707     unlink (node->name);
 708   temphead = NULL;
 709 }
 710
 711 /* Cleanup actions to take when exiting.  */
 712
 713 static void
 714 exit_cleanup (void)
 715 {
 716   if (temphead)
 717     {
 718       /* Clean up any remaining temporary files in a critical section so
 719          that a signal handler does not try to clean them too.  */
 720       struct cs_status cs = cs_enter ();
 721       cleanup ();
 722       cs_leave (cs);
 723     }
 724
 725   close_stdout ();
 726 }
 727
 728 /* Create a new temporary file, returning its newly allocated tempnode.
 729    Store into *PFD the file descriptor open for writing.  */
 730
 731 static struct tempnode *
 732 create_temp_file (int *pfd)
 733 {
 734   static char const slashbase[] = "/sortXXXXXX";
 735   static size_t temp_dir_index;
 736   int fd;
 737   int saved_errno;
 738   char const *temp_dir = temp_dirs[temp_dir_index];
 739   size_t len = strlen (temp_dir);
 740   struct tempnode *node =
 741     xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
 742   char *file = node->name;
 743   struct cs_status cs;
 744
 745   memcpy (file, temp_dir, len);
 746   memcpy (file + len, slashbase, sizeof slashbase);
 747   node->next = NULL;
 748   node->pid = 0;
 749   if (++temp_dir_index == temp_dir_count)
 750     temp_dir_index = 0;
 751
 752   /* Create the temporary file in a critical section, to avoid races.  */
 753   cs = cs_enter ();
 754   fd = mkstemp (file);
 755   if (0 <= fd)
 756     {
 757       *temptail = node;
 758       temptail = &node->next;
 759     }
 760   saved_errno = errno;
 761   cs_leave (cs);
 762   errno = saved_errno;
 763
 764   if (fd < 0)
 765     error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
 766            quote (temp_dir));
 767
 768   *pfd = fd;
 769   return node;
 770 }
 771
 772 /* Return a stream for FILE, opened with mode HOW.  A null FILE means
 773    standard output; HOW should be "w".  When opening for input, "-"
 774    means standard input.  To avoid confusion, do not return file
 775    descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
 776    opening an ordinary FILE.  */
 777
 778 static FILE *
 779 xfopen (const char *file, const char *how)
 780 {
 781   FILE *fp;
 782
 783   if (!file)
 784     fp = stdout;
 785   else if (STREQ (file, "-") && *how == 'r')
 786     {
 787       have_read_stdin = true;
 788       fp = stdin;
 789     }
 790   else
 791     {
 792       fp = fopen (file, how);
 793       if (! fp)
 794         die (_("open failed"), file);
 795     }
 796
 797   return fp;
 798 }
 799
 800 /* Close FP, whose name is FILE, and report any errors.  */
 801
 802 static void
 803 xfclose (FILE *fp, char const *file)
 804 {
 805   switch (fileno (fp))
 806     {
 807     case STDIN_FILENO:
 808       /* Allow reading stdin from tty more than once.  */
 809       if (feof (fp))
 810         clearerr (fp);
 811       break;
 812
 813     case STDOUT_FILENO:
 814       /* Don't close stdout just yet.  close_stdout does that.  */
 815       if (fflush (fp) != 0)
 816         die (_("fflush failed"), file);
 817       break;
 818
 819     default:
 820       if (fclose (fp) != 0)
 821         die (_("close failed"), file);
 822       break;
 823     }
 824 }
 825
 826 static void
 827 dup2_or_die (int oldfd, int newfd)
 828 {
 829   if (dup2 (oldfd, newfd) < 0)
 830     error (SORT_FAILURE, errno, _("dup2 failed"));
 831 }
 832
 833 /* Fork a child process for piping to and do common cleanup.  The
 834    TRIES parameter tells us how many times to try to fork before
 835    giving up.  Return the PID of the child or -1 if fork failed.  */
 836
 837 static pid_t
 838 pipe_fork (int pipefds[2], size_t tries)
 839 {
 840 #if HAVE_WORKING_FORK
 841   struct tempnode *saved_temphead;
 842   int saved_errno;
 843   unsigned int wait_retry = 1;
 844   pid_t pid IF_LINT (= -1);
 845   struct cs_status cs;
 846
 847   if (pipe (pipefds) < 0)
 848     return -1;
 849
 850   while (tries--)
 851     {
 852       /* This is so the child process won't delete our temp files
 853          if it receives a signal before exec-ing.  */
 854       cs = cs_enter ();
 855       saved_temphead = temphead;
 856       temphead = NULL;
 857
 858       pid = fork ();
 859       saved_errno = errno;
 860       if (pid)
 861         temphead = saved_temphead;
 862
 863       cs_leave (cs);
 864       errno = saved_errno;
 865
 866       if (0 <= pid || errno != EAGAIN)
 867         break;
 868       else
 869         {
 870           sleep (wait_retry);
 871           wait_retry *= 2;
 872           reap_some ();
 873         }
 874     }
 875
 876   if (pid < 0)
 877     {
 878       close (pipefds[0]);
 879       close (pipefds[1]);
 880     }
 881   else if (pid == 0)
 882     {
 883       close (STDIN_FILENO);
 884       close (STDOUT_FILENO);
 885     }
 886   else
 887     ++nprocs;
 888
 889   return pid;
 890
 891 #else  /* ! HAVE_WORKING_FORK */
 892   return -1;
 893 #endif
 894 }
 895
 896 /* Create a temporary file and start a compression program to filter output
 897    to that file.  Set *PFP to the file handle and if *PPID is non-NULL,
 898    set it to the PID of the newly-created process.  */
 899
 900 static char *
 901 create_temp (FILE **pfp, pid_t *ppid)
 902 {
 903   int tempfd;
 904   struct tempnode *node = create_temp_file (&tempfd);
 905   char *name = node->name;
 906
 907   if (compress_program)
 908     {
 909       int pipefds[2];
 910
 911       node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
 912       if (0 < node->pid)
 913         {
 914           close (tempfd);
 915           close (pipefds[0]);
 916           tempfd = pipefds[1];
 917
 918           register_proc (node->pid);
 919         }
 920       else if (node->pid == 0)
 921         {
 922           close (pipefds[1]);
 923           dup2_or_die (tempfd, STDOUT_FILENO);
 924           close (tempfd);
 925           dup2_or_die (pipefds[0], STDIN_FILENO);
 926           close (pipefds[0]);
 927
 928           if (execlp (compress_program, compress_program, (char *) NULL) < 0)
 929             error (SORT_FAILURE, errno, _("couldn't execute %s"),
 930                    compress_program);
 931         }
 932       else
 933         node->pid = 0;
 934     }
 935
 936   *pfp = fdopen (tempfd, "w");
 937   if (! *pfp)
 938     die (_("couldn't create temporary file"), name);
 939
 940   if (ppid)
 941     *ppid = node->pid;
 942
 943   return name;
 944 }
 945
 946 /* Open a compressed temp file and start a decompression process through
 947    which to filter the input.  PID must be the valid processes ID of the
 948    process used to compress the file.  */
 949
 950 static FILE *
 951 open_temp (const char *name, pid_t pid)
 952 {
 953   int tempfd, pipefds[2];
 954   pid_t child_pid;
 955   FILE *fp;
 956
 957   wait_proc (pid);
 958
 959   tempfd = open (name, O_RDONLY);
 960   if (tempfd < 0)
 961     die (_("couldn't open temporary file"), name);
 962
 963   child_pid = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
 964   if (0 < child_pid)
 965     {
 966       close (tempfd);
 967       close (pipefds[1]);
 968     }
 969   else if (child_pid == 0)
 970     {
 971       close (pipefds[0]);
 972       dup2_or_die (tempfd, STDIN_FILENO);
 973       close (tempfd);
 974       dup2_or_die (pipefds[1], STDOUT_FILENO);
 975       close (pipefds[1]);
 976
 977       if (execlp (compress_program, compress_program, "-d", (char *) NULL) < 0)
 978         error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
 979                compress_program);
 980     }
 981   else
 982     error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
 983            compress_program);
 984
 985   fp = fdopen (pipefds[0], "r");
 986   if (! fp)
 987     die (_("couldn't create temporary file"), name);
 988
 989   return fp;
 990 }
 991
 992 static void
 993 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
 994 {
 995   if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
 996     die (_("write failed"), output_file);
 997 }
 998
 999 /* Append DIR to the array of temporary directory names.  */
1000 static void
1001 add_temp_dir (char const *dir)
1002 {
1003   if (temp_dir_count == temp_dir_alloc)
1004     temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1005
1006   temp_dirs[temp_dir_count++] = dir;
1007 }
1008
1009 /* Remove NAME from the list of temporary files.  */
1010
1011 static void
1012 zaptemp (const char *name)
1013 {
1014   struct tempnode *volatile *pnode;
1015   struct tempnode *node;
1016   struct tempnode *next;
1017   int unlink_status;
1018   int unlink_errno = 0;
1019   struct cs_status cs;
1020
1021   for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1022     continue;
1023
1024   /* Unlink the temporary file in a critical section to avoid races.  */
1025   next = node->next;
1026   cs = cs_enter ();
1027   unlink_status = unlink (name);
1028   unlink_errno = errno;
1029   *pnode = next;
1030   cs_leave (cs);
1031
1032   if (unlink_status != 0)
1033     error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1034   if (! next)
1035     temptail = pnode;
1036   free (node);
1037 }
1038
1039 #if HAVE_NL_LANGINFO
1040
1041 static int
1042 struct_month_cmp (const void *m1, const void *m2)
1043 {
1044   struct month const *month1 = m1;
1045   struct month const *month2 = m2;
1046   return strcmp (month1->name, month2->name);
1047 }
1048
1049 #endif
1050
1051 /* Initialize the character class tables. */
1052
1053 static void
1054 inittables (void)
1055 {
1056   size_t i;
1057
1058   for (i = 0; i < UCHAR_LIM; ++i)
1059     {
1060       blanks[i] = !! isblank (i);
1061       nonprinting[i] = ! isprint (i);
1062       nondictionary[i] = ! isalnum (i) && ! isblank (i);
1063       fold_toupper[i] = toupper (i);
1064     }
1065
1066 #if HAVE_NL_LANGINFO
1067   /* If we're not in the "C" locale, read different names for months.  */
1068   if (hard_LC_TIME)
1069     {
1070       for (i = 0; i < MONTHS_PER_YEAR; i++)
1071         {
1072           char const *s;
1073           size_t s_len;
1074           size_t j;
1075           char *name;
1076
1077           s = (char *) nl_langinfo (ABMON_1 + i);
1078           s_len = strlen (s);
1079           monthtab[i].name = name = xmalloc (s_len + 1);
1080           monthtab[i].val = i + 1;
1081
1082           for (j = 0; j < s_len; j++)
1083             name[j] = fold_toupper[to_uchar (s[j])];
1084           name[j] = '\0';
1085         }
1086       qsort ((void *) monthtab, MONTHS_PER_YEAR,
1087              sizeof *monthtab, struct_month_cmp);
1088     }
1089 #endif
1090 }
1091
1092 /* Specify how many inputs may be merged at once.
1093    This may be set on the command-line with the
1094    --batch-size option. */
1095 static void
1096 specify_nmerge (int oi, char c, char const *s)
1097 {
1098   uintmax_t n;
1099   struct rlimit rlimit;
1100   enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1101
1102   /* Try to find out how many file descriptors we'll be able
1103      to open.  We need at least nmerge + 3 (STDIN_FILENO,
1104      STDOUT_FILENO and STDERR_FILENO). */
1105   unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1106                               ? rlimit.rlim_cur
1107                               : OPEN_MAX)
1108                              - 3);
1109
1110   if (e == LONGINT_OK)
1111     {
1112       nmerge = n;
1113       if (nmerge != n)
1114         e = LONGINT_OVERFLOW;
1115       else
1116         {
1117           if (nmerge < 2)
1118             {
1119               error (0, 0, _("invalid --%s argument %s"),
1120                      long_options[oi].name, quote(s));
1121               error (SORT_FAILURE, 0,
1122                      _("minimum --%s argument is %s"),
1123                      long_options[oi].name, quote("2"));
1124             }
1125           else if (max_nmerge < nmerge)
1126             {
1127               e = LONGINT_OVERFLOW;
1128             }
1129           else
1130             return;
1131         }
1132     }
1133
1134   if (e == LONGINT_OVERFLOW)
1135     {
1136       char max_nmerge_buf[INT_BUFSIZE_BOUND (unsigned int)];
1137       error (0, 0, _("--%s argument %s too large"),
1138              long_options[oi].name, quote(s));
1139       error (SORT_FAILURE, 0,
1140              _("maximum --%s argument with current rlimit is %s"),
1141              long_options[oi].name,
1142              uinttostr (max_nmerge, max_nmerge_buf));
1143     }
1144   else
1145     xstrtol_fatal (e, oi, c, long_options, s);
1146 }
1147
1148 /* Specify the amount of main memory to use when sorting.  */
1149 static void
1150 specify_sort_size (int oi, char c, char const *s)
1151 {
1152   uintmax_t n;
1153   char *suffix;
1154   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1155
1156   /* The default unit is KiB.  */
1157   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1158     {
1159       if (n <= UINTMAX_MAX / 1024)
1160         n *= 1024;
1161       else
1162         e = LONGINT_OVERFLOW;
1163     }
1164
1165   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
1166   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1167     switch (suffix[0])
1168       {
1169       case 'b':
1170         e = LONGINT_OK;
1171         break;
1172
1173       case '%':
1174         {
1175           double mem = physmem_total () * n / 100;
1176
1177           /* Use "<", not "<=", to avoid problems with rounding.  */
1178           if (mem < UINTMAX_MAX)
1179             {
1180               n = mem;
1181               e = LONGINT_OK;
1182             }
1183           else
1184             e = LONGINT_OVERFLOW;
1185         }
1186         break;
1187       }
1188
1189   if (e == LONGINT_OK)
1190     {
1191       /* If multiple sort sizes are specified, take the maximum, so
1192          that option order does not matter.  */
1193       if (n < sort_size)
1194         return;
1195
1196       sort_size = n;
1197       if (sort_size == n)
1198         {
1199           sort_size = MAX (sort_size, MIN_SORT_SIZE);
1200           return;
1201         }
1202
1203       e = LONGINT_OVERFLOW;
1204     }
1205
1206   xstrtol_fatal (e, oi, c, long_options, s);
1207 }
1208
1209 /* Return the default sort size.  */
1210 static size_t
1211 default_sort_size (void)
1212 {
1213   /* Let MEM be available memory or 1/8 of total memory, whichever
1214      is greater.  */
1215   double avail = physmem_available ();
1216   double total = physmem_total ();
1217   double mem = MAX (avail, total / 8);
1218   struct rlimit rlimit;
1219
1220   /* Let SIZE be MEM, but no more than the maximum object size or
1221      system resource limits.  Avoid the MIN macro here, as it is not
1222      quite right when only one argument is floating point.  Don't
1223      bother to check for values like RLIM_INFINITY since in practice
1224      they are not much less than SIZE_MAX.  */
1225   size_t size = SIZE_MAX;
1226   if (mem < size)
1227     size = mem;
1228   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1229     size = rlimit.rlim_cur;
1230 #ifdef RLIMIT_AS
1231   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1232     size = rlimit.rlim_cur;
1233 #endif
1234
1235   /* Leave a large safety margin for the above limits, as failure can
1236      occur when they are exceeded.  */
1237   size /= 2;
1238
1239 #ifdef RLIMIT_RSS
1240   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1241      Exceeding RSS is not fatal, but can be quite slow.  */
1242   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1243     size = rlimit.rlim_cur / 16 * 15;
1244 #endif
1245
1246   /* Use no less than the minimum.  */
1247   return MAX (size, MIN_SORT_SIZE);
1248 }
1249
1250 /* Return the sort buffer size to use with the input files identified
1251    by FPS and FILES, which are alternate names of the same files.
1252    NFILES gives the number of input files; NFPS may be less.  Assume
1253    that each input line requires LINE_BYTES extra bytes' worth of line
1254    information.  Do not exceed the size bound specified by the user
1255    (or a default size bound, if the user does not specify one).  */
1256
1257 static size_t
1258 sort_buffer_size (FILE *const *fps, size_t nfps,
1259                   char *const *files, size_t nfiles,
1260                   size_t line_bytes)
1261 {
1262   /* A bound on the input size.  If zero, the bound hasn't been
1263      determined yet.  */
1264   static size_t size_bound;
1265
1266   /* In the worst case, each input byte is a newline.  */
1267   size_t worst_case_per_input_byte = line_bytes + 1;
1268
1269   /* Keep enough room for one extra input line and an extra byte.
1270      This extra room might be needed when preparing to read EOF.  */
1271   size_t size = worst_case_per_input_byte + 1;
1272
1273   size_t i;
1274
1275   for (i = 0; i < nfiles; i++)
1276     {
1277       struct stat st;
1278       off_t file_size;
1279       size_t worst_case;
1280
1281       if ((i < nfps ? fstat (fileno (fps[i]), &st)
1282            : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1283            : stat (files[i], &st))
1284           != 0)
1285         die (_("stat failed"), files[i]);
1286
1287       if (S_ISREG (st.st_mode))
1288         file_size = st.st_size;
1289       else
1290         {
1291           /* The file has unknown size.  If the user specified a sort
1292              buffer size, use that; otherwise, guess the size.  */
1293           if (sort_size)
1294             return sort_size;
1295           file_size = INPUT_FILE_SIZE_GUESS;
1296         }
1297
1298       if (! size_bound)
1299         {
1300           size_bound = sort_size;
1301           if (! size_bound)
1302             size_bound = default_sort_size ();
1303         }
1304
1305       /* Add the amount of memory needed to represent the worst case
1306          where the input consists entirely of newlines followed by a
1307          single non-newline.  Check for overflow.  */
1308       worst_case = file_size * worst_case_per_input_byte + 1;
1309       if (file_size != worst_case / worst_case_per_input_byte
1310           || size_bound - size <= worst_case)
1311         return size_bound;
1312       size += worst_case;
1313     }
1314
1315   return size;
1316 }
1317
1318 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
1319    must be at least sizeof (struct line).  Allocate ALLOC bytes
1320    initially.  */
1321
1322 static void
1323 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1324 {
1325   /* Ensure that the line array is properly aligned.  If the desired
1326      size cannot be allocated, repeatedly halve it until allocation
1327      succeeds.  The smaller allocation may hurt overall performance,
1328      but that's better than failing.  */
1329   for (;;)
1330     {
1331       alloc += sizeof (struct line) - alloc % sizeof (struct line);
1332       buf->buf = malloc (alloc);
1333       if (buf->buf)
1334         break;
1335       alloc /= 2;
1336       if (alloc <= line_bytes + 1)
1337         xalloc_die ();
1338     }
1339
1340   buf->line_bytes = line_bytes;
1341   buf->alloc = alloc;
1342   buf->used = buf->left = buf->nlines = 0;
1343   buf->eof = false;
1344 }
1345
1346 /* Return one past the limit of the line array.  */
1347
1348 static inline struct line *
1349 buffer_linelim (struct buffer const *buf)
1350 {
1351   return (struct line *) (buf->buf + buf->alloc);
1352 }
1353
1354 /* Return a pointer to the first character of the field specified
1355    by KEY in LINE. */
1356
1357 static char *
1358 begfield (const struct line *line, const struct keyfield *key)
1359 {
1360   char *ptr = line->text, *lim = ptr + line->length - 1;
1361   size_t sword = key->sword;
1362   size_t schar = key->schar;
1363   size_t remaining_bytes;
1364
1365   /* The leading field separator itself is included in a field when -t
1366      is absent.  */
1367
1368   if (tab != TAB_DEFAULT)
1369     while (ptr < lim && sword--)
1370       {
1371         while (ptr < lim && *ptr != tab)
1372           ++ptr;
1373         if (ptr < lim)
1374           ++ptr;
1375       }
1376   else
1377     while (ptr < lim && sword--)
1378       {
1379         while (ptr < lim && blanks[to_uchar (*ptr)])
1380           ++ptr;
1381         while (ptr < lim && !blanks[to_uchar (*ptr)])
1382           ++ptr;
1383       }
1384
1385   if (key->skipsblanks)
1386     while (ptr < lim && blanks[to_uchar (*ptr)])
1387       ++ptr;
1388
1389   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
1390   remaining_bytes = lim - ptr;
1391   if (schar < remaining_bytes)
1392     ptr += schar;
1393   else
1394     ptr = lim;
1395
1396   return ptr;
1397 }
1398
1399 /* Return the limit of (a pointer to the first character after) the field
1400    in LINE specified by KEY. */
1401
1402 static char *
1403 limfield (const struct line *line, const struct keyfield *key)
1404 {
1405   char *ptr = line->text, *lim = ptr + line->length - 1;
1406   size_t eword = key->eword, echar = key->echar;
1407   size_t remaining_bytes;
1408
1409   /* Move PTR past EWORD fields or to one past the last byte on LINE,
1410      whichever comes first.  If there are more than EWORD fields, leave
1411      PTR pointing at the beginning of the field having zero-based index,
1412      EWORD.  If a delimiter character was specified (via -t), then that
1413      `beginning' is the first character following the delimiting TAB.
1414      Otherwise, leave PTR pointing at the first `blank' character after
1415      the preceding field.  */
1416   if (tab != TAB_DEFAULT)
1417     while (ptr < lim && eword--)
1418       {
1419         while (ptr < lim && *ptr != tab)
1420           ++ptr;
1421         if (ptr < lim && (eword | echar))
1422           ++ptr;
1423       }
1424   else
1425     while (ptr < lim && eword--)
1426       {
1427         while (ptr < lim && blanks[to_uchar (*ptr)])
1428           ++ptr;
1429         while (ptr < lim && !blanks[to_uchar (*ptr)])
1430           ++ptr;
1431       }
1432
1433 #ifdef POSIX_UNSPECIFIED
1434   /* The following block of code makes GNU sort incompatible with
1435      standard Unix sort, so it's ifdef'd out for now.
1436      The POSIX spec isn't clear on how to interpret this.
1437      FIXME: request clarification.
1438
1439      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1440      Date: Thu, 30 May 96 12:20:41 -0400
1441      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1442
1443      [...]I believe I've found another bug in `sort'.
1444
1445      $ cat /tmp/sort.in
1446      a b c 2 d
1447      pq rs 1 t
1448      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1449      a b c 2 d
1450      pq rs 1 t
1451      $ /bin/sort -k1.7,1.7 </tmp/sort.in
1452      pq rs 1 t
1453      a b c 2 d
1454
1455      Unix sort produced the answer I expected: sort on the single character
1456      in column 7.  GNU sort produced different results, because it disagrees
1457      on the interpretation of the key-end spec "M.N".  Unix sort reads this
1458      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1459      "skip M-1 fields, then either N-1 characters or the rest of the current
1460      field, whichever comes first".  This extra clause applies only to
1461      key-ends, not key-starts.
1462      */
1463
1464   /* Make LIM point to the end of (one byte past) the current field.  */
1465   if (tab != TAB_DEFAULT)
1466     {
1467       char *newlim;
1468       newlim = memchr (ptr, tab, lim - ptr);
1469       if (newlim)
1470         lim = newlim;
1471     }
1472   else
1473     {
1474       char *newlim;
1475       newlim = ptr;
1476       while (newlim < lim && blanks[to_uchar (*newlim)])
1477         ++newlim;
1478       while (newlim < lim && !blanks[to_uchar (*newlim)])
1479         ++newlim;
1480       lim = newlim;
1481     }
1482 #endif
1483
1484   /* If we're ignoring leading blanks when computing the End
1485      of the field, don't start counting bytes until after skipping
1486      past any leading blanks. */
1487   if (key->skipeblanks)
1488     while (ptr < lim && blanks[to_uchar (*ptr)])
1489       ++ptr;
1490
1491   /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
1492   remaining_bytes = lim - ptr;
1493   if (echar < remaining_bytes)
1494     ptr += echar;
1495   else
1496     ptr = lim;
1497
1498   return ptr;
1499 }
1500
1501 /* Fill BUF reading from FP, moving buf->left bytes from the end
1502    of buf->buf to the beginning first.  If EOF is reached and the
1503    file wasn't terminated by a newline, supply one.  Set up BUF's line
1504    table too.  FILE is the name of the file corresponding to FP.
1505    Return true if some input was read.  */
1506
1507 static bool
1508 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1509 {
1510   struct keyfield const *key = keylist;
1511   char eol = eolchar;
1512   size_t line_bytes = buf->line_bytes;
1513   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1514
1515   if (buf->eof)
1516     return false;
1517
1518   if (buf->used != buf->left)
1519     {
1520       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1521       buf->used = buf->left;
1522       buf->nlines = 0;
1523     }
1524
1525   for (;;)
1526     {
1527       char *ptr = buf->buf + buf->used;
1528       struct line *linelim = buffer_linelim (buf);
1529       struct line *line = linelim - buf->nlines;
1530       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1531       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1532
1533       while (line_bytes + 1 < avail)
1534         {
1535           /* Read as many bytes as possible, but do not read so many
1536              bytes that there might not be enough room for the
1537              corresponding line array.  The worst case is when the
1538              rest of the input file consists entirely of newlines,
1539              except that the last byte is not a newline.  */
1540           size_t readsize = (avail - 1) / (line_bytes + 1);
1541           size_t bytes_read = fread (ptr, 1, readsize, fp);
1542           char *ptrlim = ptr + bytes_read;
1543           char *p;
1544           avail -= bytes_read;
1545
1546           if (bytes_read != readsize)
1547             {
1548               if (ferror (fp))
1549                 die (_("read failed"), file);
1550               if (feof (fp))
1551                 {
1552                   buf->eof = true;
1553                   if (buf->buf == ptrlim)
1554                     return false;
1555                   if (ptrlim[-1] != eol)
1556                     *ptrlim++ = eol;
1557                 }
1558             }
1559
1560           /* Find and record each line in the just-read input.  */
1561           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1562             {
1563               ptr = p + 1;
1564               line--;
1565               line->text = line_start;
1566               line->length = ptr - line_start;
1567               mergesize = MAX (mergesize, line->length);
1568               avail -= line_bytes;
1569
1570               if (key)
1571                 {
1572                   /* Precompute the position of the first key for
1573                      efficiency.  */
1574                   line->keylim = (key->eword == SIZE_MAX
1575                                   ? p
1576                                   : limfield (line, key));
1577
1578                   if (key->sword != SIZE_MAX)
1579                     line->keybeg = begfield (line, key);
1580                   else
1581                     {
1582                       if (key->skipsblanks)
1583                         while (blanks[to_uchar (*line_start)])
1584                           line_start++;
1585                       line->keybeg = line_start;
1586                     }
1587                 }
1588
1589               line_start = ptr;
1590             }
1591
1592           ptr = ptrlim;
1593           if (buf->eof)
1594             break;
1595         }
1596
1597       buf->used = ptr - buf->buf;
1598       buf->nlines = buffer_linelim (buf) - line;
1599       if (buf->nlines != 0)
1600         {
1601           buf->left = ptr - line_start;
1602           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1603           return true;
1604         }
1605
1606       {
1607         /* The current input line is too long to fit in the buffer.
1608            Double the buffer size and try again, keeping it properly
1609            aligned.  */
1610         size_t line_alloc = buf->alloc / sizeof (struct line);
1611         buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1612         buf->alloc = line_alloc * sizeof (struct line);
1613       }
1614     }
1615 }
1616
1617 /* Compare strings A and B as numbers without explicitly converting them to
1618    machine numbers.  Comparatively slow for short strings, but asymptotically
1619    hideously fast. */
1620
1621 static int
1622 numcompare (const char *a, const char *b)
1623 {
1624   while (blanks[to_uchar (*a)])
1625     a++;
1626   while (blanks[to_uchar (*b)])
1627     b++;
1628
1629   return strnumcmp (a, b, decimal_point, thousands_sep);
1630 }
1631
1632 static int
1633 general_numcompare (const char *sa, const char *sb)
1634 {
1635   /* FIXME: add option to warn about failed conversions.  */
1636   /* FIXME: maybe add option to try expensive FP conversion
1637      only if A and B can't be compared more cheaply/accurately.  */
1638
1639   char *ea;
1640   char *eb;
1641   double a = strtod (sa, &ea);
1642   double b = strtod (sb, &eb);
1643
1644   /* Put conversion errors at the start of the collating sequence.  */
1645   if (sa == ea)
1646     return sb == eb ? 0 : -1;
1647   if (sb == eb)
1648     return 1;
1649
1650   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1651      conversion errors but before numbers; sort them by internal
1652      bit-pattern, for lack of a more portable alternative.  */
1653   return (a < b ? -1
1654           : a > b ? 1
1655           : a == b ? 0
1656           : b == b ? -1
1657           : a == a ? 1
1658           : memcmp ((char *) &a, (char *) &b, sizeof a));
1659 }
1660
1661 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1662    Return 0 if the name in S is not recognized.  */
1663
1664 static int
1665 getmonth (char const *month, size_t len)
1666 {
1667   size_t lo = 0;
1668   size_t hi = MONTHS_PER_YEAR;
1669   char const *monthlim = month + len;
1670
1671   for (;;)
1672     {
1673       if (month == monthlim)
1674         return 0;
1675       if (!blanks[to_uchar (*month)])
1676         break;
1677       ++month;
1678     }
1679
1680   do
1681     {
1682       size_t ix = (lo + hi) / 2;
1683       char const *m = month;
1684       char const *n = monthtab[ix].name;
1685
1686       for (;; m++, n++)
1687         {
1688           if (!*n)
1689             return monthtab[ix].val;
1690           if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1691             {
1692               hi = ix;
1693               break;
1694             }
1695           else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1696             {
1697               lo = ix + 1;
1698               break;
1699             }
1700         }
1701     }
1702   while (lo < hi);
1703
1704   return 0;
1705 }
1706
1707 /* A source of random data.  */
1708 static struct randread_source *randread_source;
1709
1710 /* Return the Ith randomly-generated state.  The caller must invoke
1711    random_state (H) for all H less than I before invoking random_state
1712    (I).  */
1713
1714 static struct md5_ctx
1715 random_state (size_t i)
1716 {
1717   /* An array of states resulting from the random data, and counts of
1718      its used and allocated members.  */
1719   static struct md5_ctx *state;
1720   static size_t used;
1721   static size_t allocated;
1722
1723   struct md5_ctx *s = &state[i];
1724
1725   if (used <= i)
1726     {
1727       unsigned char buf[MD5_DIGEST_SIZE];
1728
1729       used++;
1730
1731       if (allocated <= i)
1732         {
1733           state = X2NREALLOC (state, &allocated);
1734           s = &state[i];
1735         }
1736
1737       randread (randread_source, buf, sizeof buf);
1738       md5_init_ctx (s);
1739       md5_process_bytes (buf, sizeof buf, s);
1740     }
1741
1742   return *s;
1743 }
1744
1745 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1746    with length LENGTHB.  Return negative if less, zero if equal,
1747    positive if greater.  */
1748
1749 static int
1750 cmp_hashes (char const *texta, size_t lena,
1751             char const *textb, size_t lenb)
1752 {
1753   /* Try random hashes until a pair of hashes disagree.  But if the
1754      first pair of random hashes agree, check whether the keys are
1755      identical and if so report no difference.  */
1756   int diff;
1757   size_t i;
1758   for (i = 0; ; i++)
1759     {
1760       uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
1761       struct md5_ctx s[2];
1762       s[0] = s[1] = random_state (i);
1763       md5_process_bytes (texta, lena, &s[0]);  md5_finish_ctx (&s[0], dig[0]);
1764       md5_process_bytes (textb, lenb, &s[1]);  md5_finish_ctx (&s[1], dig[1]);
1765       diff = memcmp (dig[0], dig[1], sizeof dig[0]);
1766       if (diff != 0)
1767         break;
1768       if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
1769         break;
1770     }
1771
1772   return diff;
1773 }
1774
1775 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1776    using one or more random hash functions.  */
1777
1778 static int
1779 compare_random (char *restrict texta, size_t lena,
1780                 char *restrict textb, size_t lenb)
1781 {
1782   int diff;
1783
1784   if (! hard_LC_COLLATE)
1785     diff = cmp_hashes (texta, lena, textb, lenb);
1786   else
1787     {
1788       /* Transform the text into the basis of comparison, so that byte
1789          strings that would otherwise considered to be equal are
1790          considered equal here even if their bytes differ.  */
1791
1792       char *buf = NULL;
1793       char stackbuf[4000];
1794       size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
1795       bool a_fits = tlena <= sizeof stackbuf;
1796       size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
1797                                (a_fits ? sizeof stackbuf - tlena : 0),
1798                                textb, lenb);
1799
1800       if (a_fits && tlena + tlenb <= sizeof stackbuf)
1801         buf = stackbuf;
1802       else
1803         {
1804           /* Adding 1 to the buffer size lets xmemxfrm run a bit
1805              faster by avoiding the need for an extra buffer copy.  */
1806           buf = xmalloc (tlena + tlenb + 1);
1807           xmemxfrm (buf, tlena + 1, texta, lena);
1808           xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
1809         }
1810
1811       diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
1812
1813       if (buf != stackbuf)
1814         free (buf);
1815     }
1816
1817   return diff;
1818 }
1819
1820 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1821    using strverscmp.  */
1822
1823 static int
1824 compare_version (char *restrict texta, size_t lena,
1825                  char *restrict textb, size_t lenb)
1826 {
1827   int diff;
1828
1829   /* It is necessary to save the character after the end of the field.
1830      "strverscmp" works with NUL terminated strings.  Our blocks of
1831      text are not necessarily terminated with a NUL byte. */
1832   char sv_a = texta[lena];
1833   char sv_b = textb[lenb];
1834
1835   texta[lena] = '\0';
1836   textb[lenb] = '\0';
1837
1838   diff = strverscmp (texta, textb);
1839
1840   texta[lena] = sv_a;
1841   textb[lenb] = sv_b;
1842
1843   return diff;
1844 }
1845
1846 /* Compare two lines A and B trying every key in sequence until there
1847    are no more keys or a difference is found. */
1848
1849 static int
1850 keycompare (const struct line *a, const struct line *b)
1851 {
1852   struct keyfield const *key = keylist;
1853
1854   /* For the first iteration only, the key positions have been
1855      precomputed for us. */
1856   char *texta = a->keybeg;
1857   char *textb = b->keybeg;
1858   char *lima = a->keylim;
1859   char *limb = b->keylim;
1860
1861   int diff;
1862
1863   for (;;)
1864     {
1865       char const *translate = key->translate;
1866       bool const *ignore = key->ignore;
1867
1868       /* Find the lengths. */
1869       size_t lena = lima <= texta ? 0 : lima - texta;
1870       size_t lenb = limb <= textb ? 0 : limb - textb;
1871
1872       /* Actually compare the fields. */
1873
1874       if (key->random)
1875         diff = compare_random (texta, lena, textb, lenb);
1876       else if (key->numeric | key->general_numeric)
1877         {
1878           char savea = *lima, saveb = *limb;
1879
1880           *lima = *limb = '\0';
1881           diff = ((key->numeric ? numcompare : general_numcompare)
1882                   (texta, textb));
1883           *lima = savea, *limb = saveb;
1884         }
1885       else if (key->version)
1886         diff = compare_version (texta, lena, textb, lenb);
1887       else if (key->month)
1888         diff = getmonth (texta, lena) - getmonth (textb, lenb);
1889       /* Sorting like this may become slow, so in a simple locale the user
1890          can select a faster sort that is similar to ascii sort.  */
1891       else if (hard_LC_COLLATE)
1892         {
1893           if (ignore || translate)
1894             {
1895               char buf[4000];
1896               size_t size = lena + 1 + lenb + 1;
1897               char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1898               char *copy_b = copy_a + lena + 1;
1899               size_t new_len_a, new_len_b, i;
1900
1901               /* Ignore and/or translate chars before comparing.  */
1902               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1903                 {
1904                   if (i < lena)
1905                     {
1906                       copy_a[new_len_a] = (translate
1907                                            ? translate[to_uchar (texta[i])]
1908                                            : texta[i]);
1909                       if (!ignore || !ignore[to_uchar (texta[i])])
1910                         ++new_len_a;
1911                     }
1912                   if (i < lenb)
1913                     {
1914                       copy_b[new_len_b] = (translate
1915                                            ? translate[to_uchar (textb[i])]
1916                                            : textb [i]);
1917                       if (!ignore || !ignore[to_uchar (textb[i])])
1918                         ++new_len_b;
1919                     }
1920                 }
1921
1922               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1923
1924               if (sizeof buf < size)
1925                 free (copy_a);
1926             }
1927           else if (lena == 0)
1928             diff = - NONZERO (lenb);
1929           else if (lenb == 0)
1930             goto greater;
1931           else
1932             diff = xmemcoll (texta, lena, textb, lenb);
1933         }
1934       else if (ignore)
1935         {
1936 #define CMP_WITH_IGNORE(A, B)                                           \
1937   do                                                                    \
1938     {                                                                   \
1939           for (;;)                                                      \
1940             {                                                           \
1941               while (texta < lima && ignore[to_uchar (*texta)])         \
1942                 ++texta;                                                \
1943               while (textb < limb && ignore[to_uchar (*textb)])         \
1944                 ++textb;                                                \
1945               if (! (texta < lima && textb < limb))                     \
1946                 break;                                                  \
1947               diff = to_uchar (A) - to_uchar (B);                       \
1948               if (diff)                                                 \
1949                 goto not_equal;                                         \
1950               ++texta;                                                  \
1951               ++textb;                                                  \
1952             }                                                           \
1953                                                                         \
1954           diff = (texta < lima) - (textb < limb);                       \
1955     }                                                                   \
1956   while (0)
1957
1958           if (translate)
1959             CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1960                              translate[to_uchar (*textb)]);
1961           else
1962             CMP_WITH_IGNORE (*texta, *textb);
1963         }
1964       else if (lena == 0)
1965         diff = - NONZERO (lenb);
1966       else if (lenb == 0)
1967         goto greater;
1968       else
1969         {
1970           if (translate)
1971             {
1972               while (texta < lima && textb < limb)
1973                 {
1974                   diff = (to_uchar (translate[to_uchar (*texta++)])
1975                           - to_uchar (translate[to_uchar (*textb++)]));
1976                   if (diff)
1977                     goto not_equal;
1978                 }
1979             }
1980           else
1981             {
1982               diff = memcmp (texta, textb, MIN (lena, lenb));
1983               if (diff)
1984                 goto not_equal;
1985             }
1986           diff = lena < lenb ? -1 : lena != lenb;
1987         }
1988
1989       if (diff)
1990         goto not_equal;
1991
1992       key = key->next;
1993       if (! key)
1994         break;
1995
1996       /* Find the beginning and limit of the next field.  */
1997       if (key->eword != SIZE_MAX)
1998         lima = limfield (a, key), limb = limfield (b, key);
1999       else
2000         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2001
2002       if (key->sword != SIZE_MAX)
2003         texta = begfield (a, key), textb = begfield (b, key);
2004       else
2005         {
2006           texta = a->text, textb = b->text;
2007           if (key->skipsblanks)
2008             {
2009               while (texta < lima && blanks[to_uchar (*texta)])
2010                 ++texta;
2011               while (textb < limb && blanks[to_uchar (*textb)])
2012                 ++textb;
2013             }
2014         }
2015     }
2016
2017   return 0;
2018
2019  greater:
2020   diff = 1;
2021  not_equal:
2022   return key->reverse ? -diff : diff;
2023 }
2024
2025 /* Compare two lines A and B, returning negative, zero, or positive
2026    depending on whether A compares less than, equal to, or greater than B. */
2027
2028 static int
2029 compare (const struct line *a, const struct line *b)
2030 {
2031   int diff;
2032   size_t alen, blen;
2033
2034   /* First try to compare on the specified keys (if any).
2035      The only two cases with no key at all are unadorned sort,
2036      and unadorned sort -r. */
2037   if (keylist)
2038     {
2039       diff = keycompare (a, b);
2040       if (diff | unique | stable)
2041         return diff;
2042     }
2043
2044   /* If the keys all compare equal (or no keys were specified)
2045      fall through to the default comparison.  */
2046   alen = a->length - 1, blen = b->length - 1;
2047
2048   if (alen == 0)
2049     diff = - NONZERO (blen);
2050   else if (blen == 0)
2051     diff = 1;
2052   else if (hard_LC_COLLATE)
2053     diff = xmemcoll (a->text, alen, b->text, blen);
2054   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2055     diff = alen < blen ? -1 : alen != blen;
2056
2057   return reverse ? -diff : diff;
2058 }
2059
2060 /* Check that the lines read from FILE_NAME come in order.  Return
2061    true if they are in order.  If CHECKONLY == 'c', also print a
2062    diagnostic (FILE_NAME, line number, contents of line) to stderr if
2063    they are not in order.  */
2064
2065 static bool
2066 check (char const *file_name, char checkonly)
2067 {
2068   FILE *fp = xfopen (file_name, "r");
2069   struct buffer buf;            /* Input buffer. */
2070   struct line temp;             /* Copy of previous line. */
2071   size_t alloc = 0;
2072   uintmax_t line_number = 0;
2073   struct keyfield const *key = keylist;
2074   bool nonunique = ! unique;
2075   bool ordered = true;
2076
2077   initbuf (&buf, sizeof (struct line),
2078            MAX (merge_buffer_size, sort_size));
2079   temp.text = NULL;
2080
2081   while (fillbuf (&buf, fp, file_name))
2082     {
2083       struct line const *line = buffer_linelim (&buf);
2084       struct line const *linebase = line - buf.nlines;
2085
2086       /* Make sure the line saved from the old buffer contents is
2087          less than or equal to the first line of the new buffer. */
2088       if (alloc && nonunique <= compare (&temp, line - 1))
2089         {
2090         found_disorder:
2091           {
2092             if (checkonly == 'c')
2093               {
2094                 struct line const *disorder_line = line - 1;
2095                 uintmax_t disorder_line_number =
2096                   buffer_linelim (&buf) - disorder_line + line_number;
2097                 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
2098                 fprintf (stderr, _("%s: %s:%s: disorder: "),
2099                          program_name, file_name,
2100                          umaxtostr (disorder_line_number, hr_buf));
2101                 write_bytes (disorder_line->text, disorder_line->length,
2102                              stderr, _("standard error"));
2103               }
2104
2105             ordered = false;
2106             break;
2107           }
2108         }
2109
2110       /* Compare each line in the buffer with its successor.  */
2111       while (linebase < --line)
2112         if (nonunique <= compare (line, line - 1))
2113           goto found_disorder;
2114
2115       line_number += buf.nlines;
2116
2117       /* Save the last line of the buffer.  */
2118       if (alloc < line->length)
2119         {
2120           do
2121             {
2122               alloc *= 2;
2123               if (! alloc)
2124                 {
2125                   alloc = line->length;
2126                   break;
2127                 }
2128             }
2129           while (alloc < line->length);
2130
2131           temp.text = xrealloc (temp.text, alloc);
2132         }
2133       memcpy (temp.text, line->text, line->length);
2134       temp.length = line->length;
2135       if (key)
2136         {
2137           temp.keybeg = temp.text + (line->keybeg - line->text);
2138           temp.keylim = temp.text + (line->keylim - line->text);
2139         }
2140     }
2141
2142   xfclose (fp, file_name);
2143   free (buf.buf);
2144   free (temp.text);
2145   return ordered;
2146 }
2147
2148 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
2149    files (all of which are at the start of the FILES array), and
2150    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2151    Close input and output files before returning.
2152    OUTPUT_FILE gives the name of the output file.  If it is NULL,
2153    the output file is standard output.  If OFP is NULL, the output
2154    file has not been opened yet (or written to, if standard output).  */
2155
2156 static void
2157 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2158           FILE *ofp, char const *output_file)
2159 {
2160   FILE **fps = xnmalloc (nmerge, sizeof *fps);
2161                                 /* Input streams for each file.  */
2162   struct buffer *buffer = xnmalloc (nmerge, sizeof *buffer);
2163                                 /* Input buffers for each file. */
2164   struct line saved;            /* Saved line storage for unique check. */
2165   struct line const *savedline = NULL;
2166                                 /* &saved if there is a saved line. */
2167   size_t savealloc = 0;         /* Size allocated for the saved line. */
2168   struct line const **cur = xnmalloc (nmerge, sizeof *cur);
2169                                 /* Current line in each line table. */
2170   struct line const **base = xnmalloc (nmerge, sizeof *base);
2171                                 /* Base of each line table.  */
2172   size_t *ord = xnmalloc (nmerge, sizeof *ord);
2173                                 /* Table representing a permutation of fps,
2174                                    such that cur[ord[0]] is the smallest line
2175                                    and will be next output. */
2176   size_t i;
2177   size_t j;
2178   size_t t;
2179   struct keyfield const *key = keylist;
2180   saved.text = NULL;
2181
2182   /* Read initial lines from each input file. */
2183   for (i = 0; i < nfiles; )
2184     {
2185       fps[i] = (files[i].pid
2186                 ? open_temp (files[i].name, files[i].pid)
2187                 : xfopen (files[i].name, "r"));
2188       initbuf (&buffer[i], sizeof (struct line),
2189                MAX (merge_buffer_size, sort_size / nfiles));
2190       if (fillbuf (&buffer[i], fps[i], files[i].name))
2191         {
2192           struct line const *linelim = buffer_linelim (&buffer[i]);
2193           cur[i] = linelim - 1;
2194           base[i] = linelim - buffer[i].nlines;
2195           i++;
2196         }
2197       else
2198         {
2199           /* fps[i] is empty; eliminate it from future consideration.  */
2200           xfclose (fps[i], files[i].name);
2201           if (i < ntemps)
2202             {
2203               ntemps--;
2204               zaptemp (files[i].name);
2205             }
2206           free (buffer[i].buf);
2207           --nfiles;
2208           for (j = i; j < nfiles; ++j)
2209             files[j] = files[j + 1];
2210         }
2211     }
2212
2213   if (! ofp)
2214     ofp = xfopen (output_file, "w");
2215
2216   /* Set up the ord table according to comparisons among input lines.
2217      Since this only reorders two items if one is strictly greater than
2218      the other, it is stable. */
2219   for (i = 0; i < nfiles; ++i)
2220     ord[i] = i;
2221   for (i = 1; i < nfiles; ++i)
2222     if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2223       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2224
2225   /* Repeatedly output the smallest line until no input remains. */
2226   while (nfiles)
2227     {
2228       struct line const *smallest = cur[ord[0]];
2229
2230       /* If uniquified output is turned on, output only the first of
2231          an identical series of lines. */
2232       if (unique)
2233         {
2234           if (savedline && compare (savedline, smallest))
2235             {
2236               savedline = NULL;
2237               write_bytes (saved.text, saved.length, ofp, output_file);
2238             }
2239           if (!savedline)
2240             {
2241               savedline = &saved;
2242               if (savealloc < smallest->length)
2243                 {
2244                   do
2245                     if (! savealloc)
2246                       {
2247                         savealloc = smallest->length;
2248                         break;
2249                       }
2250                   while ((savealloc *= 2) < smallest->length);
2251
2252                   saved.text = xrealloc (saved.text, savealloc);
2253                 }
2254               saved.length = smallest->length;
2255               memcpy (saved.text, smallest->text, saved.length);
2256               if (key)
2257                 {
2258                   saved.keybeg =
2259                     saved.text + (smallest->keybeg - smallest->text);
2260                   saved.keylim =
2261                     saved.text + (smallest->keylim - smallest->text);
2262                 }
2263             }
2264         }
2265       else
2266         write_bytes (smallest->text, smallest->length, ofp, output_file);
2267
2268       /* Check if we need to read more lines into core. */
2269       if (base[ord[0]] < smallest)
2270         cur[ord[0]] = smallest - 1;
2271       else
2272         {
2273           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2274             {
2275               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2276               cur[ord[0]] = linelim - 1;
2277               base[ord[0]] = linelim - buffer[ord[0]].nlines;
2278             }
2279           else
2280             {
2281               /* We reached EOF on fps[ord[0]].  */
2282               for (i = 1; i < nfiles; ++i)
2283                 if (ord[i] > ord[0])
2284                   --ord[i];
2285               --nfiles;
2286               xfclose (fps[ord[0]], files[ord[0]].name);
2287               if (ord[0] < ntemps)
2288                 {
2289                   ntemps--;
2290                   zaptemp (files[ord[0]].name);
2291                 }
2292               free (buffer[ord[0]].buf);
2293               for (i = ord[0]; i < nfiles; ++i)
2294                 {
2295                   fps[i] = fps[i + 1];
2296                   files[i] = files[i + 1];
2297                   buffer[i] = buffer[i + 1];
2298                   cur[i] = cur[i + 1];
2299                   base[i] = base[i + 1];
2300                 }
2301               for (i = 0; i < nfiles; ++i)
2302                 ord[i] = ord[i + 1];
2303               continue;
2304             }
2305         }
2306
2307       /* The new line just read in may be larger than other lines
2308          already in main memory; push it back in the queue until we
2309          encounter a line larger than it.  Optimize for the common
2310          case where the new line is smallest.  */
2311       {
2312         size_t lo = 1;
2313         size_t hi = nfiles;
2314         size_t probe = lo;
2315         size_t ord0 = ord[0];
2316         size_t count_of_smaller_lines;
2317
2318         while (lo < hi)
2319           {
2320             int cmp = compare (cur[ord0], cur[ord[probe]]);
2321             if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2322               hi = probe;
2323             else
2324               lo = probe + 1;
2325             probe = (lo + hi) / 2;
2326           }
2327
2328         count_of_smaller_lines = lo - 1;
2329         for (j = 0; j < count_of_smaller_lines; j++)
2330           ord[j] = ord[j + 1];
2331         ord[count_of_smaller_lines] = ord0;
2332       }
2333
2334       /* Free up some resources every once in a while.  */
2335       if (MAX_PROCS_BEFORE_REAP < nprocs)
2336         reap_some ();
2337     }
2338
2339   if (unique && savedline)
2340     {
2341       write_bytes (saved.text, saved.length, ofp, output_file);
2342       free (saved.text);
2343     }
2344
2345   xfclose (ofp, output_file);
2346   free(fps);
2347   free(buffer);
2348   free(ord);
2349   free(base);
2350   free(cur);
2351 }
2352
2353 /* Merge into T the two sorted arrays of lines LO (with NLO members)
2354    and HI (with NHI members).  T, LO, and HI point just past their
2355    respective arrays, and the arrays are in reverse order.  NLO and
2356    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
2357
2358 static inline void
2359 mergelines (struct line *t,
2360             struct line const *lo, size_t nlo,
2361             struct line const *hi, size_t nhi)
2362 {
2363   for (;;)
2364     if (compare (lo - 1, hi - 1) <= 0)
2365       {
2366         *--t = *--lo;
2367         if (! --nlo)
2368           {
2369             /* HI - NHI equalled T - (NLO + NHI) when this function
2370                began.  Therefore HI must equal T now, and there is no
2371                need to copy from HI to T.  */
2372             return;
2373           }
2374       }
2375     else
2376       {
2377         *--t = *--hi;
2378         if (! --nhi)
2379           {
2380             do
2381               *--t = *--lo;
2382             while (--nlo);
2383
2384             return;
2385           }
2386       }
2387 }
2388
2389 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
2390    NLINES must be at least 2.
2391    The input and output arrays are in reverse order, and LINES and
2392    TEMP point just past the end of their respective arrays.
2393
2394    Use a recursive divide-and-conquer algorithm, in the style
2395    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
2396    the optimization suggested by exercise 5.2.4-10; this requires room
2397    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
2398    writes that this memory optimization was originally published by
2399    D. A. Bell, Comp J. 1 (1958), 75.  */
2400
2401 static void
2402 sortlines (struct line *lines, size_t nlines, struct line *temp)
2403 {
2404   if (nlines == 2)
2405     {
2406       if (0 < compare (&lines[-1], &lines[-2]))
2407         {
2408           struct line tmp = lines[-1];
2409           lines[-1] = lines[-2];
2410           lines[-2] = tmp;
2411         }
2412     }
2413   else
2414     {
2415       size_t nlo = nlines / 2;
2416       size_t nhi = nlines - nlo;
2417       struct line *lo = lines;
2418       struct line *hi = lines - nlo;
2419       struct line *sorted_lo = temp;
2420
2421       sortlines (hi, nhi, temp);
2422       if (1 < nlo)
2423         sortlines_temp (lo, nlo, sorted_lo);
2424       else
2425         sorted_lo[-1] = lo[-1];
2426
2427       mergelines (lines, sorted_lo, nlo, hi, nhi);
2428     }
2429 }
2430
2431 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
2432    rather than sorting in place.  */
2433
2434 static void
2435 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
2436 {
2437   if (nlines == 2)
2438     {
2439       /* Declare `swap' as int, not bool, to work around a bug
2440          <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
2441          in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
2442       int swap = (0 < compare (&lines[-1], &lines[-2]));
2443       temp[-1] = lines[-1 - swap];
2444       temp[-2] = lines[-2 + swap];
2445     }
2446   else
2447     {
2448       size_t nlo = nlines / 2;
2449       size_t nhi = nlines - nlo;
2450       struct line *lo = lines;
2451       struct line *hi = lines - nlo;
2452       struct line *sorted_hi = temp - nlo;
2453
2454       sortlines_temp (hi, nhi, sorted_hi);
2455       if (1 < nlo)
2456         sortlines (lo, nlo, temp);
2457
2458       mergelines (temp, lo, nlo, sorted_hi, nhi);
2459     }
2460 }
2461
2462 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
2463    the same as OUTFILE.  If found, merge the found instances (and perhaps
2464    some other files) into a temporary file so that it can in turn be
2465    merged into OUTFILE without destroying OUTFILE before it is completely
2466    read.  Return the new value of NFILES, which differs from the old if
2467    some merging occurred.
2468
2469    This test ensures that an otherwise-erroneous use like
2470    "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
2471    It's not clear that POSIX requires this nicety.
2472    Detect common error cases, but don't try to catch obscure cases like
2473    "cat ... FILE ... | sort -m -o FILE"
2474    where traditional "sort" doesn't copy the input and where
2475    people should know that they're getting into trouble anyway.
2476    Catching these obscure cases would slow down performance in
2477    common cases.  */
2478
2479 static size_t
2480 avoid_trashing_input (struct sortfile *files, size_t ntemps,
2481                       size_t nfiles, char const *outfile)
2482 {
2483   size_t i;
2484   bool got_outstat = false;
2485   struct stat outstat;
2486
2487   for (i = ntemps; i < nfiles; i++)
2488     {
2489       bool is_stdin = STREQ (files[i].name, "-");
2490       bool same;
2491       struct stat instat;
2492
2493       if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
2494         same = true;
2495       else
2496         {
2497           if (! got_outstat)
2498             {
2499               if ((outfile
2500                    ? stat (outfile, &outstat)
2501                    : fstat (STDOUT_FILENO, &outstat))
2502                   != 0)
2503                 break;
2504               got_outstat = true;
2505             }
2506
2507           same = (((is_stdin
2508                     ? fstat (STDIN_FILENO, &instat)
2509                     : stat (files[i].name, &instat))
2510                    == 0)
2511                   && SAME_INODE (instat, outstat));
2512         }
2513
2514       if (same)
2515         {
2516           FILE *tftp;
2517           pid_t pid;
2518           char *temp = create_temp (&tftp, &pid);
2519           mergefps (&files[i],0, nfiles - i, tftp, temp);
2520           files[i].name = temp;
2521           files[i].pid = pid;
2522           return i + 1;
2523         }
2524     }
2525
2526   return nfiles;
2527 }
2528
2529 /* Merge the input FILES.  NTEMPS is the number of files at the
2530    start of FILES that are temporary; it is zero at the top level.
2531    NFILES is the total number of files.  Put the output in
2532    OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
2533
2534 static void
2535 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
2536        char const *output_file)
2537 {
2538   while (nmerge < nfiles)
2539     {
2540       /* Number of input files processed so far.  */
2541       size_t in;
2542
2543       /* Number of output files generated so far.  */
2544       size_t out;
2545
2546       /* nfiles % NMERGE; this counts input files that are left over
2547          after all full-sized merges have been done.  */
2548       size_t remainder;
2549
2550       /* Number of easily-available slots at the next loop iteration.  */
2551       size_t cheap_slots;
2552
2553       /* Do as many NMERGE-size merges as possible.  */
2554       for (out = in = 0; out < nfiles / nmerge; out++, in += nmerge)
2555         {
2556           FILE *tfp;
2557           pid_t pid;
2558           char *temp = create_temp (&tfp, &pid);
2559           size_t nt = MIN (ntemps, nmerge);
2560           ntemps -= nt;
2561           mergefps (&files[in], nt, nmerge, tfp, temp);
2562           files[out].name = temp;
2563           files[out].pid = pid;
2564         }
2565
2566       remainder = nfiles - in;
2567       cheap_slots = nmerge - out % nmerge;
2568
2569       if (cheap_slots < remainder)
2570         {
2571           /* So many files remain that they can't all be put into the last
2572              NMERGE-sized output window.  Do one more merge.  Merge as few
2573              files as possible, to avoid needless I/O.  */
2574           size_t nshortmerge = remainder - cheap_slots + 1;
2575           FILE *tfp;
2576           pid_t pid;
2577           char *temp = create_temp (&tfp, &pid);
2578           size_t nt = MIN (ntemps, nshortmerge);
2579           ntemps -= nt;
2580           mergefps (&files[in], nt, nshortmerge, tfp, temp);
2581           files[out].name = temp;
2582           files[out++].pid = pid;
2583           in += nshortmerge;
2584         }
2585
2586       /* Put the remaining input files into the last NMERGE-sized output
2587          window, so they will be merged in the next pass.  */
2588       memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2589       ntemps += out;
2590       nfiles -= in - out;
2591     }
2592
2593   nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2594   mergefps (files, ntemps, nfiles, NULL, output_file);
2595 }
2596
2597 /* Sort NFILES FILES onto OUTPUT_FILE. */
2598
2599 static void
2600 sort (char * const *files, size_t nfiles, char const *output_file)
2601 {
2602   struct buffer buf;
2603   size_t ntemps = 0;
2604   bool output_file_created = false;
2605
2606   buf.alloc = 0;
2607
2608   while (nfiles)
2609     {
2610       char const *temp_output;
2611       char const *file = *files;
2612       FILE *fp = xfopen (file, "r");
2613       FILE *tfp;
2614       size_t bytes_per_line = (2 * sizeof (struct line)
2615                                - sizeof (struct line) / 2);
2616
2617       if (! buf.alloc)
2618         initbuf (&buf, bytes_per_line,
2619                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2620       buf.eof = false;
2621       files++;
2622       nfiles--;
2623
2624       while (fillbuf (&buf, fp, file))
2625         {
2626           struct line *line;
2627           struct line *linebase;
2628
2629           if (buf.eof && nfiles
2630               && (bytes_per_line + 1
2631                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2632             {
2633               /* End of file, but there is more input and buffer room.
2634                  Concatenate the next input file; this is faster in
2635                  the usual case.  */
2636               buf.left = buf.used;
2637               break;
2638             }
2639
2640           line = buffer_linelim (&buf);
2641           linebase = line - buf.nlines;
2642           if (1 < buf.nlines)
2643             sortlines (line, buf.nlines, linebase);
2644           if (buf.eof && !nfiles && !ntemps && !buf.left)
2645             {
2646               xfclose (fp, file);
2647               tfp = xfopen (output_file, "w");
2648               temp_output = output_file;
2649               output_file_created = true;
2650             }
2651           else
2652             {
2653               ++ntemps;
2654               temp_output = create_temp (&tfp, NULL);
2655             }
2656
2657           do
2658             {
2659               line--;
2660               write_bytes (line->text, line->length, tfp, temp_output);
2661               if (unique)
2662                 while (linebase < line && compare (line, line - 1) == 0)
2663                   line--;
2664             }
2665           while (linebase < line);
2666
2667           xfclose (tfp, temp_output);
2668
2669           /* Free up some resources every once in a while.  */
2670           if (MAX_PROCS_BEFORE_REAP < nprocs)
2671             reap_some ();
2672
2673           if (output_file_created)
2674             goto finish;
2675         }
2676       xfclose (fp, file);
2677     }
2678
2679  finish:
2680   free (buf.buf);
2681
2682   if (! output_file_created)
2683     {
2684       size_t i;
2685       struct tempnode *node = temphead;
2686       struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2687       for (i = 0; node; i++)
2688         {
2689           tempfiles[i].name = node->name;
2690           tempfiles[i].pid = node->pid;
2691           node = node->next;
2692         }
2693       merge (tempfiles, ntemps, ntemps, output_file);
2694       free (tempfiles);
2695     }
2696 }
2697
2698 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list.  */
2699
2700 static void
2701 insertkey (struct keyfield *key_arg)
2702 {
2703   struct keyfield **p;
2704   struct keyfield *key = xmemdup (key_arg, sizeof *key);
2705
2706   for (p = &keylist; *p; p = &(*p)->next)
2707     continue;
2708   *p = key;
2709   key->next = NULL;
2710 }
2711
2712 /* Report a bad field specification SPEC, with extra info MSGID.  */
2713
2714 static void badfieldspec (char const *, char const *)
2715      ATTRIBUTE_NORETURN;
2716 static void
2717 badfieldspec (char const *spec, char const *msgid)
2718 {
2719   error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
2720          _(msgid), quote (spec));
2721   abort ();
2722 }
2723
2724 /* Report incompatible options.  */
2725
2726 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
2727 static void
2728 incompatible_options (char const *opts)
2729 {
2730   error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
2731   abort ();
2732 }
2733
2734 /* Check compatibility of ordering options.  */
2735
2736 static void
2737 check_ordering_compatibility (void)
2738 {
2739   struct keyfield const *key;
2740
2741   for (key = keylist; key; key = key->next)
2742     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
2743               + key->version + !!key->ignore))
2744         || (key->random && key->translate))
2745       {
2746         /* The following is too big, but guaranteed to be "big enough". */
2747         char opts[sizeof short_options];
2748         char *p = opts;
2749         if (key->ignore == nondictionary)
2750           *p++ = 'd';
2751         if (key->translate)
2752           *p++ = 'f';
2753         if (key->general_numeric)
2754           *p++ = 'g';
2755         if (key->ignore == nonprinting)
2756           *p++ = 'i';
2757         if (key->month)
2758           *p++ = 'M';
2759         if (key->numeric)
2760           *p++ = 'n';
2761         if (key->version)
2762           *p++ = 'V';
2763         if (key->random)
2764           *p++ = 'R';
2765         *p = '\0';
2766         incompatible_options (opts);
2767       }
2768 }
2769
2770 /* Parse the leading integer in STRING and store the resulting value
2771    (which must fit into size_t) into *VAL.  Return the address of the
2772    suffix after the integer.  If the value is too large, silently
2773    substitute SIZE_MAX.  If MSGID is NULL, return NULL after
2774    failure; otherwise, report MSGID and exit on failure.  */
2775
2776 static char const *
2777 parse_field_count (char const *string, size_t *val, char const *msgid)
2778 {
2779   char *suffix;
2780   uintmax_t n;
2781
2782   switch (xstrtoumax (string, &suffix, 10, &n, ""))
2783     {
2784     case LONGINT_OK:
2785     case LONGINT_INVALID_SUFFIX_CHAR:
2786       *val = n;
2787       if (*val == n)
2788         break;
2789       /* Fall through.  */
2790     case LONGINT_OVERFLOW:
2791     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2792       *val = SIZE_MAX;
2793       break;
2794
2795     case LONGINT_INVALID:
2796       if (msgid)
2797         error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
2798                _(msgid), quote (string));
2799       return NULL;
2800     }
2801
2802   return suffix;
2803 }
2804
2805 /* Handle interrupts and hangups. */
2806
2807 static void
2808 sighandler (int sig)
2809 {
2810   if (! SA_NOCLDSTOP)
2811     signal (sig, SIG_IGN);
2812
2813   cleanup ();
2814
2815   signal (sig, SIG_DFL);
2816   raise (sig);
2817 }
2818
2819 /* Set the ordering options for KEY specified in S.
2820    Return the address of the first character in S that
2821    is not a valid ordering option.
2822    BLANKTYPE is the kind of blanks that 'b' should skip. */
2823
2824 static char *
2825 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2826 {
2827   while (*s)
2828     {
2829       switch (*s)
2830         {
2831         case 'b':
2832           if (blanktype == bl_start || blanktype == bl_both)
2833             key->skipsblanks = true;
2834           if (blanktype == bl_end || blanktype == bl_both)
2835             key->skipeblanks = true;
2836           break;
2837         case 'd':
2838           key->ignore = nondictionary;
2839           break;
2840         case 'f':
2841           key->translate = fold_toupper;
2842           break;
2843         case 'g':
2844           key->general_numeric = true;
2845           break;
2846         case 'i':
2847           /* Option order should not matter, so don't let -i override
2848              -d.  -d implies -i, but -i does not imply -d.  */
2849           if (! key->ignore)
2850             key->ignore = nonprinting;
2851           break;
2852         case 'M':
2853           key->month = true;
2854           break;
2855         case 'n':
2856           key->numeric = true;
2857           break;
2858         case 'R':
2859           key->random = true;
2860           break;
2861         case 'r':
2862           key->reverse = true;
2863           break;
2864         case 'V':
2865           key->version = true;
2866           break;
2867         default:
2868           return (char *) s;
2869         }
2870       ++s;
2871     }
2872   return (char *) s;
2873 }
2874
2875 static struct keyfield *
2876 key_init (struct keyfield *key)
2877 {
2878   memset (key, 0, sizeof *key);
2879   key->eword = SIZE_MAX;
2880   return key;
2881 }
2882
2883 int
2884 main (int argc, char **argv)
2885 {
2886   struct keyfield *key;
2887   struct keyfield key_buf;
2888   struct keyfield gkey;
2889   char const *s;
2890   int c = 0;
2891   char checkonly = 0;
2892   bool mergeonly = false;
2893   char *random_source = NULL;
2894   bool need_random = false;
2895   size_t nfiles = 0;
2896   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2897   bool obsolete_usage = (posix2_version () < 200112);
2898   char **files;
2899   char *files_from = NULL;
2900   struct Tokens tok;
2901   char const *outfile = NULL;
2902
2903   initialize_main (&argc, &argv);
2904   set_program_name (argv[0]);
2905   setlocale (LC_ALL, "");
2906   bindtextdomain (PACKAGE, LOCALEDIR);
2907   textdomain (PACKAGE);
2908
2909   initialize_exit_failure (SORT_FAILURE);
2910
2911   hard_LC_COLLATE = hard_locale (LC_COLLATE);
2912 #if HAVE_NL_LANGINFO
2913   hard_LC_TIME = hard_locale (LC_TIME);
2914 #endif
2915
2916   /* Get locale's representation of the decimal point.  */
2917   {
2918     struct lconv const *locale = localeconv ();
2919
2920     /* If the locale doesn't define a decimal point, or if the decimal
2921        point is multibyte, use the C locale's decimal point.  FIXME:
2922        add support for multibyte decimal points.  */
2923     decimal_point = to_uchar (locale->decimal_point[0]);
2924     if (! decimal_point || locale->decimal_point[1])
2925       decimal_point = '.';
2926
2927     /* FIXME: add support for multibyte thousands separators.  */
2928     thousands_sep = to_uchar (*locale->thousands_sep);
2929     if (! thousands_sep || locale->thousands_sep[1])
2930       thousands_sep = -1;
2931   }
2932
2933   have_read_stdin = false;
2934   inittables ();
2935
2936   {
2937     size_t i;
2938     static int const sig[] =
2939       {
2940         /* The usual suspects.  */
2941         SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
2942 #ifdef SIGPOLL
2943         SIGPOLL,
2944 #endif
2945 #ifdef SIGPROF
2946         SIGPROF,
2947 #endif
2948 #ifdef SIGVTALRM
2949         SIGVTALRM,
2950 #endif
2951 #ifdef SIGXCPU
2952         SIGXCPU,
2953 #endif
2954 #ifdef SIGXFSZ
2955         SIGXFSZ,
2956 #endif
2957       };
2958     enum { nsigs = sizeof sig / sizeof sig[0] };
2959
2960 #if SA_NOCLDSTOP
2961     struct sigaction act;
2962
2963     sigemptyset (&caught_signals);
2964     for (i = 0; i < nsigs; i++)
2965       {
2966         sigaction (sig[i], NULL, &act);
2967         if (act.sa_handler != SIG_IGN)
2968           sigaddset (&caught_signals, sig[i]);
2969       }
2970
2971     act.sa_handler = sighandler;
2972     act.sa_mask = caught_signals;
2973     act.sa_flags = 0;
2974
2975     for (i = 0; i < nsigs; i++)
2976       if (sigismember (&caught_signals, sig[i]))
2977         sigaction (sig[i], &act, NULL);
2978 #else
2979     for (i = 0; i < nsigs; i++)
2980       if (signal (sig[i], SIG_IGN) != SIG_IGN)
2981         {
2982           signal (sig[i], sighandler);
2983           siginterrupt (sig[i], 1);
2984         }
2985 #endif
2986   }
2987
2988   /* The signal mask is known, so it is safe to invoke exit_cleanup.  */
2989   atexit (exit_cleanup);
2990
2991   gkey.sword = gkey.eword = SIZE_MAX;
2992   gkey.ignore = NULL;
2993   gkey.translate = NULL;
2994   gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false;
2995   gkey.month = gkey.reverse = false;
2996   gkey.skipsblanks = gkey.skipeblanks = false;
2997
2998   files = xnmalloc (argc, sizeof *files);
2999
3000   for (;;)
3001     {
3002       /* Parse an operand as a file after "--" was seen; or if
3003          pedantic and a file was seen, unless the POSIX version
3004          predates 1003.1-2001 and -c was not seen and the operand is
3005          "-o FILE" or "-oFILE".  */
3006       int oi = -1;
3007
3008       if (c == -1
3009           || (posixly_correct && nfiles != 0
3010               && ! (obsolete_usage
3011                     && ! checkonly
3012                     && optind != argc
3013                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
3014                     && (argv[optind][2] || optind + 1 != argc)))
3015           || ((c = getopt_long (argc, argv, short_options,
3016                                 long_options, &oi))
3017               == -1))
3018         {
3019           if (argc <= optind)
3020             break;
3021           files[nfiles++] = argv[optind++];
3022         }
3023       else switch (c)
3024         {
3025         case 1:
3026           key = NULL;
3027           if (optarg[0] == '+')
3028             {
3029               bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
3030                                       && ISDIGIT (argv[optind][1]));
3031               obsolete_usage |= minus_pos_usage & ~posixly_correct;
3032               if (obsolete_usage)
3033                 {
3034                   /* Treat +POS1 [-POS2] as a key if possible; but silently
3035                      treat an operand as a file if it is not a valid +POS1.  */
3036                   key = key_init (&key_buf);
3037                   s = parse_field_count (optarg + 1, &key->sword, NULL);
3038                   if (s && *s == '.')
3039                     s = parse_field_count (s + 1, &key->schar, NULL);
3040                   if (! (key->sword | key->schar))
3041                     key->sword = SIZE_MAX;
3042                   if (! s || *set_ordering (s, key, bl_start))
3043                     key = NULL;
3044                   else
3045                     {
3046                       if (minus_pos_usage)
3047                         {
3048                           char const *optarg1 = argv[optind++];
3049                           s = parse_field_count (optarg1 + 1, &key->eword,
3050                                              N_("invalid number after `-'"));
3051                           if (*s == '.')
3052                             s = parse_field_count (s + 1, &key->echar,
3053                                                N_("invalid number after `.'"));
3054                           if (*set_ordering (s, key, bl_end))
3055                             badfieldspec (optarg1,
3056                                       N_("stray character in field spec"));
3057                         }
3058                       insertkey (key);
3059                     }
3060                 }
3061             }
3062           if (! key)
3063             files[nfiles++] = optarg;
3064           break;
3065
3066         case SORT_OPTION:
3067           c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
3068           /* Fall through. */
3069         case 'b':
3070         case 'd':
3071         case 'f':
3072         case 'g':
3073         case 'i':
3074         case 'M':
3075         case 'n':
3076         case 'r':
3077         case 'R':
3078         case 'V':
3079           {
3080             char str[2];
3081             str[0] = c;
3082             str[1] = '\0';
3083             set_ordering (str, &gkey, bl_both);
3084           }
3085           break;
3086
3087         case CHECK_OPTION:
3088           c = (optarg
3089                ? XARGMATCH ("--check", optarg, check_args, check_types)
3090                : 'c');
3091           /* Fall through.  */
3092         case 'c':
3093         case 'C':
3094           if (checkonly && checkonly != c)
3095             incompatible_options ("cC");
3096           checkonly = c;
3097           break;
3098
3099         case COMPRESS_PROGRAM_OPTION:
3100           if (compress_program && !STREQ (compress_program, optarg))
3101             error (SORT_FAILURE, 0, _("multiple compress programs specified"));
3102           compress_program = optarg;
3103           break;
3104
3105         case FILES0_FROM_OPTION:
3106           files_from = optarg;
3107           break;
3108
3109         case 'k':
3110           key = key_init (&key_buf);
3111
3112           /* Get POS1. */
3113           s = parse_field_count (optarg, &key->sword,
3114                                  N_("invalid number at field start"));
3115           if (! key->sword--)
3116             {
3117               /* Provoke with `sort -k0' */
3118               badfieldspec (optarg, N_("field number is zero"));
3119             }
3120           if (*s == '.')
3121             {
3122               s = parse_field_count (s + 1, &key->schar,
3123                                      N_("invalid number after `.'"));
3124               if (! key->schar--)
3125                 {
3126                   /* Provoke with `sort -k1.0' */
3127                   badfieldspec (optarg, N_("character offset is zero"));
3128                 }
3129             }
3130           if (! (key->sword | key->schar))
3131             key->sword = SIZE_MAX;
3132           s = set_ordering (s, key, bl_start);
3133           if (*s != ',')
3134             {
3135               key->eword = SIZE_MAX;
3136               key->echar = 0;
3137             }
3138           else
3139             {
3140               /* Get POS2. */
3141               s = parse_field_count (s + 1, &key->eword,
3142                                      N_("invalid number after `,'"));
3143               if (! key->eword--)
3144                 {
3145                   /* Provoke with `sort -k1,0' */
3146                   badfieldspec (optarg, N_("field number is zero"));
3147                 }
3148               if (*s == '.')
3149                 s = parse_field_count (s + 1, &key->echar,
3150                                        N_("invalid number after `.'"));
3151               else
3152                 {
3153                   /* `-k 2,3' is equivalent to `+1 -3'.  */
3154                   key->eword++;
3155                 }
3156               s = set_ordering (s, key, bl_end);
3157             }
3158           if (*s)
3159             badfieldspec (optarg, N_("stray character in field spec"));
3160           insertkey (key);
3161           break;
3162
3163         case 'm':
3164           mergeonly = true;
3165           break;
3166
3167         case NMERGE_OPTION:
3168           specify_nmerge (oi, c, optarg);
3169           break;
3170
3171         case 'o':
3172           if (outfile && !STREQ (outfile, optarg))
3173             error (SORT_FAILURE, 0, _("multiple output files specified"));
3174           outfile = optarg;
3175           break;
3176
3177         case RANDOM_SOURCE_OPTION:
3178           if (random_source && !STREQ (random_source, optarg))
3179             error (SORT_FAILURE, 0, _("multiple random sources specified"));
3180           random_source = optarg;
3181           break;
3182
3183         case 's':
3184           stable = true;
3185           break;
3186
3187         case 'S':
3188           specify_sort_size (oi, c, optarg);
3189           break;
3190
3191         case 't':
3192           {
3193             char newtab = optarg[0];
3194             if (! newtab)
3195               error (SORT_FAILURE, 0, _("empty tab"));
3196             if (optarg[1])
3197               {
3198                 if (STREQ (optarg, "\\0"))
3199                   newtab = '\0';
3200                 else
3201                   {
3202                     /* Provoke with `sort -txx'.  Complain about
3203                        "multi-character tab" instead of "multibyte tab", so
3204                        that the diagnostic's wording does not need to be
3205                        changed once multibyte characters are supported.  */
3206                     error (SORT_FAILURE, 0, _("multi-character tab %s"),
3207                            quote (optarg));
3208                   }
3209               }
3210             if (tab != TAB_DEFAULT && tab != newtab)
3211               error (SORT_FAILURE, 0, _("incompatible tabs"));
3212             tab = newtab;
3213           }
3214           break;
3215
3216         case 'T':
3217           add_temp_dir (optarg);
3218           break;
3219
3220         case 'u':
3221           unique = true;
3222           break;
3223
3224         case 'y':
3225           /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
3226              through Solaris 7.  It is also accepted by many non-Solaris
3227              "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
3228              -y is marked as obsolete starting with Solaris 8 (1999), but is
3229              still accepted as of Solaris 10 prerelease (2004).
3230
3231              Solaris 2.5.1 "sort -y 100" reads the input file "100", but
3232              emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
3233              and which in general ignores the argument after "-y" if it
3234              consists entirely of digits (it can even be empty).  */
3235           if (optarg == argv[optind - 1])
3236             {
3237               char const *p;
3238               for (p = optarg; ISDIGIT (*p); p++)
3239                 continue;
3240               optind -= (*p != '\0');
3241             }
3242           break;
3243
3244         case 'z':
3245           eolchar = 0;
3246           break;
3247
3248         case_GETOPT_HELP_CHAR;
3249
3250         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
3251
3252         default:
3253           usage (SORT_FAILURE);
3254         }
3255     }
3256
3257   if (files_from)
3258     {
3259       FILE *stream;
3260
3261       /* When using --files0-from=F, you may not specify any files
3262          on the command-line.  */
3263       if (nfiles)
3264         {
3265           error (0, 0, _("extra operand %s"), quote (files[0]));
3266           fprintf (stderr, "%s\n",
3267                    _("file operands cannot be combined with --files0-from"));
3268           usage (SORT_FAILURE);
3269         }
3270
3271       if (STREQ (files_from, "-"))
3272         stream = stdin;
3273       else
3274         {
3275           stream = fopen (files_from, "r");
3276           if (stream == NULL)
3277             error (SORT_FAILURE, errno, _("cannot open %s for reading"),
3278                    quote (files_from));
3279         }
3280
3281       readtokens0_init (&tok);
3282
3283       if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
3284         error (SORT_FAILURE, 0, _("cannot read file names from %s"),
3285                quote (files_from));
3286
3287       if (tok.n_tok)
3288         {
3289           size_t i;
3290           free (files);
3291           files = tok.tok;
3292           nfiles = tok.n_tok;
3293           for (i = 0; i < nfiles; i++)
3294           {
3295               if (STREQ (files[i], "-"))
3296                 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
3297                                           "no file name of %s allowed"),
3298                        quote (files[i]));
3299               else if (files[i][0] == '\0')
3300                 {
3301                   /* Using the standard `filename:line-number:' prefix here is
3302                      not totally appropriate, since NUL is the separator, not NL,
3303                      but it might be better than nothing.  */
3304                   unsigned long int file_number = i + 1;
3305                   error (SORT_FAILURE, 0,
3306                          _("%s:%lu: invalid zero-length file name"),
3307                          quotearg_colon (files_from), file_number);
3308                 }
3309           }
3310         }
3311       else
3312         error (SORT_FAILURE, 0, _("no input from %s"),
3313                quote (files_from));
3314     }
3315
3316   /* Inheritance of global options to individual keys. */
3317   for (key = keylist; key; key = key->next)
3318     {
3319       if (! (key->ignore
3320              || key->translate
3321              || (key->skipsblanks
3322                  | key->reverse
3323                  | key->skipeblanks
3324                  | key->month
3325                  | key->numeric
3326                  | key->version
3327                  | key->general_numeric
3328                  | key->random)))
3329         {
3330           key->ignore = gkey.ignore;
3331           key->translate = gkey.translate;
3332           key->skipsblanks = gkey.skipsblanks;
3333           key->skipeblanks = gkey.skipeblanks;
3334           key->month = gkey.month;
3335           key->numeric = gkey.numeric;
3336           key->general_numeric = gkey.general_numeric;
3337           key->random = gkey.random;
3338           key->reverse = gkey.reverse;
3339           key->version = gkey.version;
3340         }
3341
3342       need_random |= key->random;
3343     }
3344
3345   if (!keylist && (gkey.ignore
3346                    || gkey.translate
3347                    || (gkey.skipsblanks
3348                        | gkey.skipeblanks
3349                        | gkey.month
3350                        | gkey.numeric
3351                        | gkey.general_numeric
3352                        | gkey.random
3353                        | gkey.version)))
3354     {
3355       insertkey (&gkey);
3356       need_random |= gkey.random;
3357     }
3358
3359   check_ordering_compatibility ();
3360
3361   reverse = gkey.reverse;
3362
3363   if (need_random)
3364     {
3365       randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
3366       if (! randread_source)
3367         die (_("open failed"), random_source);
3368     }
3369
3370   if (temp_dir_count == 0)
3371     {
3372       char const *tmp_dir = getenv ("TMPDIR");
3373       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
3374     }
3375
3376   if (nfiles == 0)
3377     {
3378       static char *minus = "-";
3379       nfiles = 1;
3380       free (files);
3381       files = &minus;
3382     }
3383
3384   /* Need to re-check that we meet the minimum requirement for memory
3385      usage with the final value for NMERGE. */
3386   if (0 < sort_size)
3387     sort_size = MAX (sort_size, MIN_SORT_SIZE);
3388
3389   if (checkonly)
3390     {
3391       if (nfiles > 1)
3392         error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
3393                quote (files[1]), checkonly);
3394
3395       if (outfile)
3396         {
3397           static char opts[] = {0, 'o', 0};
3398           opts[0] = checkonly;
3399           incompatible_options (opts);
3400         }
3401
3402       /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
3403          input is not properly sorted.  */
3404       exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
3405     }
3406
3407   if (mergeonly)
3408     {
3409       struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
3410       size_t i;
3411
3412       for (i = 0; i < nfiles; ++i)
3413         sortfiles[i].name = files[i];
3414
3415       merge (sortfiles, 0, nfiles, outfile);
3416       IF_LINT (free (sortfiles));
3417     }
3418   else
3419     sort (files, nfiles, outfile);
3420
3421   if (have_read_stdin && fclose (stdin) == EOF)
3422     die (_("close failed"), "-");
3423
3424   exit (EXIT_SUCCESS);
3425 }