src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 1988, 1991-2008 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17    Written December 1988 by Mike Haertel.
  18    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  19    or (US mail) as Mike Haertel c/o Free Software Foundation.
  20
  21    Ørn E. Hansen added NLS support in 1997.  */
  22
  23 #include <config.h>
  24
  25 #include <getopt.h>
  26 #include <sys/types.h>
  27 #include <sys/wait.h>
  28 #include <signal.h>
  29 #include "system.h"
  30 #include "argmatch.h"
  31 #include "error.h"
  32 #include "hard-locale.h"
  33 #include "hash.h"
  34 #include "md5.h"
  35 #include "physmem.h"
  36 #include "posixver.h"
  37 #include "quote.h"
  38 #include "quotearg.h"
  39 #include "randread.h"
  40 #include "readtokens0.h"
  41 #include "stdio--.h"
  42 #include "stdlib--.h"
  43 #include "strnumcmp.h"
  44 #include "xmemcoll.h"
  45 #include "xmemxfrm.h"
  46 #include "xstrtol.h"
  47
  48 #if HAVE_SYS_RESOURCE_H
  49 # include <sys/resource.h>
  50 #endif
  51 #ifndef RLIMIT_DATA
  52 struct rlimit { size_t rlim_cur; };
  53 # define getrlimit(Resource, Rlp) (-1)
  54 #endif
  55
  56 /* The official name of this program (e.g., no `g' prefix).  */
  57 #define PROGRAM_NAME "sort"
  58
  59 #define AUTHORS \
  60   proper_name ("Mike Haertel"), \
  61   proper_name ("Paul Eggert")
  62
  63 #if HAVE_LANGINFO_CODESET
  64 # include <langinfo.h>
  65 #endif
  66
  67 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
  68    present.  */
  69 #ifndef SA_NOCLDSTOP
  70 # define SA_NOCLDSTOP 0
  71 /* No sigprocmask.  Always 'return' zero. */
  72 # define sigprocmask(How, Set, Oset) (0)
  73 # define sigset_t int
  74 # if ! HAVE_SIGINTERRUPT
  75 #  define siginterrupt(sig, flag) /* empty */
  76 # endif
  77 #endif
  78
  79 #if !defined OPEN_MAX && defined NR_OPEN
  80 # define OPEN_MAX NR_OPEN
  81 #endif
  82 #if !defined OPEN_MAX
  83 # define OPEN_MAX 20
  84 #endif
  85
  86 #define UCHAR_LIM (UCHAR_MAX + 1)
  87
  88 #ifndef DEFAULT_TMPDIR
  89 # define DEFAULT_TMPDIR "/tmp"
  90 #endif
  91
  92 /* Exit statuses.  */
  93 enum
  94   {
  95     /* POSIX says to exit with status 1 if invoked with -c and the
  96        input is not properly sorted.  */
  97     SORT_OUT_OF_ORDER = 1,
  98
  99     /* POSIX says any other irregular exit must exit with a status
 100        code greater than 1.  */
 101     SORT_FAILURE = 2
 102   };
 103
 104 enum
 105   {
 106     /* The number of times we should try to fork a compression process
 107        (we retry if the fork call fails).  We don't _need_ to compress
 108        temp files, this is just to reduce disk access, so this number
 109        can be small.  */
 110     MAX_FORK_TRIES_COMPRESS = 2,
 111
 112     /* The number of times we should try to fork a decompression process.
 113        If we can't fork a decompression process, we can't sort, so this
 114        number should be big.  */
 115     MAX_FORK_TRIES_DECOMPRESS = 8
 116   };
 117
 118 /* The representation of the decimal point in the current locale.  */
 119 static int decimal_point;
 120
 121 /* Thousands separator; if -1, then there isn't one.  */
 122 static int thousands_sep;
 123
 124 /* Nonzero if the corresponding locales are hard.  */
 125 static bool hard_LC_COLLATE;
 126 #if HAVE_NL_LANGINFO
 127 static bool hard_LC_TIME;
 128 #endif
 129
 130 #define NONZERO(x) ((x) != 0)
 131
 132 /* The kind of blanks for '-b' to skip in various options. */
 133 enum blanktype { bl_start, bl_end, bl_both };
 134
 135 /* The character marking end of line. Default to \n. */
 136 static char eolchar = '\n';
 137
 138 /* Lines are held in core as counted strings. */
 139 struct line
 140 {
 141   char *text;                   /* Text of the line. */
 142   size_t length;                /* Length including final newline. */
 143   char *keybeg;                 /* Start of first key. */
 144   char *keylim;                 /* Limit of first key. */
 145 };
 146
 147 /* Input buffers. */
 148 struct buffer
 149 {
 150   char *buf;                    /* Dynamically allocated buffer,
 151                                    partitioned into 3 regions:
 152                                    - input data;
 153                                    - unused area;
 154                                    - an array of lines, in reverse order.  */
 155   size_t used;                  /* Number of bytes used for input data.  */
 156   size_t nlines;                /* Number of lines in the line array.  */
 157   size_t alloc;                 /* Number of bytes allocated. */
 158   size_t left;                  /* Number of bytes left from previous reads. */
 159   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 160   bool eof;                     /* An EOF has been read.  */
 161 };
 162
 163 struct keyfield
 164 {
 165   size_t sword;                 /* Zero-origin 'word' to start at. */
 166   size_t schar;                 /* Additional characters to skip. */
 167   size_t eword;                 /* Zero-origin first word after field. */
 168   size_t echar;                 /* Additional characters in field. */
 169   bool const *ignore;           /* Boolean array of characters to ignore. */
 170   char const *translate;        /* Translation applied to characters. */
 171   bool skipsblanks;             /* Skip leading blanks when finding start.  */
 172   bool skipeblanks;             /* Skip leading blanks when finding end.  */
 173   bool numeric;                 /* Flag for numeric comparison.  Handle
 174                                    strings of digits with optional decimal
 175                                    point, but no exponential notation. */
 176   bool random;                  /* Sort by random hash of key.  */
 177   bool general_numeric;         /* Flag for general, numeric comparison.
 178                                    Handle numbers in exponential notation. */
 179   bool month;                   /* Flag for comparison by month name. */
 180   bool reverse;                 /* Reverse the sense of comparison. */
 181   bool version;                 /* sort by version number */
 182   struct keyfield *next;        /* Next keyfield to try. */
 183 };
 184
 185 struct month
 186 {
 187   char const *name;
 188   int val;
 189 };
 190
 191 /* FIXME: None of these tables work with multibyte character sets.
 192    Also, there are many other bugs when handling multibyte characters.
 193    One way to fix this is to rewrite `sort' to use wide characters
 194    internally, but doing this with good performance is a bit
 195    tricky.  */
 196
 197 /* Table of blanks.  */
 198 static bool blanks[UCHAR_LIM];
 199
 200 /* Table of non-printing characters. */
 201 static bool nonprinting[UCHAR_LIM];
 202
 203 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 204 static bool nondictionary[UCHAR_LIM];
 205
 206 /* Translation table folding lower case to upper.  */
 207 static char fold_toupper[UCHAR_LIM];
 208
 209 #define MONTHS_PER_YEAR 12
 210
 211 /* Table mapping month names to integers.
 212    Alphabetic order allows binary search. */
 213 static struct month monthtab[] =
 214 {
 215   {"APR", 4},
 216   {"AUG", 8},
 217   {"DEC", 12},
 218   {"FEB", 2},
 219   {"JAN", 1},
 220   {"JUL", 7},
 221   {"JUN", 6},
 222   {"MAR", 3},
 223   {"MAY", 5},
 224   {"NOV", 11},
 225   {"OCT", 10},
 226   {"SEP", 9}
 227 };
 228
 229 /* During the merge phase, the number of files to merge at once. */
 230 #define NMERGE_DEFAULT 16
 231
 232 /* Minimum size for a merge or check buffer.  */
 233 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 234
 235 /* Minimum sort size; the code might not work with smaller sizes.  */
 236 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
 237
 238 /* The number of bytes needed for a merge or check buffer, which can
 239    function relatively efficiently even if it holds only one line.  If
 240    a longer line is seen, this value is increased.  */
 241 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 242
 243 /* The approximate maximum number of bytes of main memory to use, as
 244    specified by the user.  Zero if the user has not specified a size.  */
 245 static size_t sort_size;
 246
 247 /* The guessed size for non-regular files.  */
 248 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 249
 250 /* Array of directory names in which any temporary files are to be created. */
 251 static char const **temp_dirs;
 252
 253 /* Number of temporary directory names used.  */
 254 static size_t temp_dir_count;
 255
 256 /* Number of allocated slots in temp_dirs.  */
 257 static size_t temp_dir_alloc;
 258
 259 /* Flag to reverse the order of all comparisons. */
 260 static bool reverse;
 261
 262 /* Flag for stable sort.  This turns off the last ditch bytewise
 263    comparison of lines, and instead leaves lines in the same order
 264    they were read if all keys compare equal.  */
 265 static bool stable;
 266
 267 /* If TAB has this value, blanks separate fields.  */
 268 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 269
 270 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 271    separated by the empty string between a non-blank character and a blank
 272    character. */
 273 static int tab = TAB_DEFAULT;
 274
 275 /* Flag to remove consecutive duplicate lines from the output.
 276    Only the last of a sequence of equal lines will be output. */
 277 static bool unique;
 278
 279 /* Nonzero if any of the input files are the standard input. */
 280 static bool have_read_stdin;
 281
 282 /* List of key field comparisons to be tried.  */
 283 static struct keyfield *keylist;
 284
 285 /* Program used to (de)compress temp files.  Must accept -d.  */
 286 static char const *compress_program;
 287
 288 /* Maximum number of files to merge in one go.  If more than this
 289    number are present, temp files will be used. */
 290 static unsigned int nmerge = NMERGE_DEFAULT;
 291
 292 static void sortlines_temp (struct line *, size_t, struct line *);
 293
 294 /* Report MESSAGE for FILE, then clean up and exit.
 295    If FILE is null, it represents standard output.  */
 296
 297 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 298 static void
 299 die (char const *message, char const *file)
 300 {
 301   error (0, errno, "%s: %s", message, file ? file : _("standard output"));
 302   exit (SORT_FAILURE);
 303 }
 304
 305 void
 306 usage (int status)
 307 {
 308   if (status != EXIT_SUCCESS)
 309     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 310              program_name);
 311   else
 312     {
 313       printf (_("\
 314 Usage: %s [OPTION]... [FILE]...\n\
 315   or:  %s [OPTION]... --files0-from=F\n\
 316 "),
 317               program_name, program_name);
 318       fputs (_("\
 319 Write sorted concatenation of all FILE(s) to standard output.\n\
 320 \n\
 321 "), stdout);
 322       fputs (_("\
 323 Mandatory arguments to long options are mandatory for short options too.\n\
 324 "), stdout);
 325       fputs (_("\
 326 Ordering options:\n\
 327 \n\
 328 "), stdout);
 329       fputs (_("\
 330   -b, --ignore-leading-blanks  ignore leading blanks\n\
 331   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 332   -f, --ignore-case           fold lower case to upper case characters\n\
 333 "), stdout);
 334       fputs (_("\
 335   -g, --general-numeric-sort  compare according to general numerical value\n\
 336   -i, --ignore-nonprinting    consider only printable characters\n\
 337   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 338   -n, --numeric-sort          compare according to string numerical value\n\
 339   -R, --random-sort           sort by random hash of keys\n\
 340   -V, --version-sort          sort by numeric version (see strverscmp(3C))\n\
 341       --random-source=FILE    get random bytes from FILE (default /dev/urandom)\n\
 342       --sort=WORD             sort according to WORD:\n\
 343                                 general-numeric -g, month -M, numeric -n,\n\
 344                                 random -R, version -V\n\
 345   -r, --reverse               reverse the result of comparisons\n\
 346 \n\
 347 "), stdout);
 348       fputs (_("\
 349 Other options:\n\
 350 \n\
 351 "), stdout);
 352       fputs (_("\
 353       --batch-size=NMERGE   merge at most NMERGE inputs at once;\n\
 354                             for more use temp files\n\
 355 "), stdout);
 356       fputs (_("\
 357   -c, --check, --check=diagnose-first  check for sorted input; do not sort\n\
 358   -C, --check=quiet, --check=silent  like -c, but do not report first bad line\n\
 359       --compress-program=PROG  compress temporaries with PROG;\n\
 360                               decompress them with PROG -d\n\
 361       --files0-from=F       read input from the files specified by\n\
 362                             NUL-terminated names in file F\n\
 363 "), stdout);
 364       fputs (_("\
 365   -k, --key=POS1[,POS2]     start a key at POS1 (origin 1), end it at POS2\n\
 366                             (default end of line)\n\
 367   -m, --merge               merge already sorted files; do not sort\n\
 368 "), stdout);
 369       fputs (_("\
 370   -o, --output=FILE         write result to FILE instead of standard output\n\
 371   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 372   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 373 "), stdout);
 374       printf (_("\
 375   -t, --field-separator=SEP  use SEP instead of non-blank to blank transition\n\
 376   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 377                               multiple options specify multiple directories\n\
 378   -u, --unique              with -c, check for strict ordering;\n\
 379                               without -c, output only the first of an equal run\n\
 380 "), DEFAULT_TMPDIR);
 381       fputs (_("\
 382   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 383 "), stdout);
 384       fputs (HELP_OPTION_DESCRIPTION, stdout);
 385       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 386       fputs (_("\
 387 \n\
 388 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 389 in the field; both are origin 1.  If neither -t nor -b is in effect, characters\n\
 390 in a field are counted from the beginning of the preceding whitespace.  OPTS is\n\
 391 one or more single-letter ordering options, which override global ordering\n\
 392 options for that key.  If no key is given, use the entire line as the key.\n\
 393 \n\
 394 SIZE may be followed by the following multiplicative suffixes:\n\
 395 "), stdout);
 396       fputs (_("\
 397 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 398 \n\
 399 With no FILE, or when FILE is -, read standard input.\n\
 400 \n\
 401 *** WARNING ***\n\
 402 The locale specified by the environment affects sort order.\n\
 403 Set LC_ALL=C to get the traditional sort order that uses\n\
 404 native byte values.\n\
 405 "), stdout );
 406       emit_bug_reporting_address ();
 407     }
 408
 409   exit (status);
 410 }
 411
 412 /* For long options that have no equivalent short option, use a
 413    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 414 enum
 415 {
 416   CHECK_OPTION = CHAR_MAX + 1,
 417   COMPRESS_PROGRAM_OPTION,
 418   FILES0_FROM_OPTION,
 419   NMERGE_OPTION,
 420   RANDOM_SOURCE_OPTION,
 421   SORT_OPTION
 422 };
 423
 424 static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z";
 425
 426 static struct option const long_options[] =
 427 {
 428   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 429   {"check", optional_argument, NULL, CHECK_OPTION},
 430   {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
 431   {"dictionary-order", no_argument, NULL, 'd'},
 432   {"ignore-case", no_argument, NULL, 'f'},
 433   {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
 434   {"general-numeric-sort", no_argument, NULL, 'g'},
 435   {"ignore-nonprinting", no_argument, NULL, 'i'},
 436   {"key", required_argument, NULL, 'k'},
 437   {"merge", no_argument, NULL, 'm'},
 438   {"month-sort", no_argument, NULL, 'M'},
 439   {"numeric-sort", no_argument, NULL, 'n'},
 440   {"version-sort", no_argument, NULL, 'V'},
 441   {"random-sort", no_argument, NULL, 'R'},
 442   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
 443   {"sort", required_argument, NULL, SORT_OPTION},
 444   {"output", required_argument, NULL, 'o'},
 445   {"reverse", no_argument, NULL, 'r'},
 446   {"stable", no_argument, NULL, 's'},
 447   {"batch-size", required_argument, NULL, NMERGE_OPTION},
 448   {"buffer-size", required_argument, NULL, 'S'},
 449   {"field-separator", required_argument, NULL, 't'},
 450   {"temporary-directory", required_argument, NULL, 'T'},
 451   {"unique", no_argument, NULL, 'u'},
 452   {"zero-terminated", no_argument, NULL, 'z'},
 453   {GETOPT_HELP_OPTION_DECL},
 454   {GETOPT_VERSION_OPTION_DECL},
 455   {NULL, 0, NULL, 0},
 456 };
 457
 458 #define CHECK_TABLE \
 459   _ct_("quiet",          'C') \
 460   _ct_("silent",         'C') \
 461   _ct_("diagnose-first", 'c')
 462
 463 static char const *const check_args[] =
 464 {
 465 #define _ct_(_s, _c) _s,
 466   CHECK_TABLE NULL
 467 #undef  _ct_
 468 };
 469 static char const check_types[] =
 470 {
 471 #define _ct_(_s, _c) _c,
 472   CHECK_TABLE
 473 #undef  _ct_
 474 };
 475
 476 #define SORT_TABLE \
 477   _st_("general-numeric", 'g') \
 478   _st_("month",           'M') \
 479   _st_("numeric",         'n') \
 480   _st_("random",          'R') \
 481   _st_("version",         'V')
 482
 483 static char const *const sort_args[] =
 484 {
 485 #define _st_(_s, _c) _s,
 486   SORT_TABLE NULL
 487 #undef  _st_
 488 };
 489 static char const sort_types[] =
 490 {
 491 #define _st_(_s, _c) _c,
 492   SORT_TABLE
 493 #undef  _st_
 494 };
 495
 496 /* The set of signals that are caught.  */
 497 static sigset_t caught_signals;
 498
 499 /* Critical section status.  */
 500 struct cs_status
 501 {
 502   bool valid;
 503   sigset_t sigs;
 504 };
 505
 506 /* Enter a critical section.  */
 507 static struct cs_status
 508 cs_enter (void)
 509 {
 510   struct cs_status status;
 511   status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
 512   return status;
 513 }
 514
 515 /* Leave a critical section.  */
 516 static void
 517 cs_leave (struct cs_status status)
 518 {
 519   if (status.valid)
 520     {
 521       /* Ignore failure when restoring the signal mask. */
 522       sigprocmask (SIG_SETMASK, &status.sigs, NULL);
 523     }
 524 }
 525
 526 /* The list of temporary files. */
 527 struct tempnode
 528 {
 529   struct tempnode *volatile next;
 530   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 531   char name[1];  /* Actual size is 1 + file name length.  */
 532 };
 533 static struct tempnode *volatile temphead;
 534 static struct tempnode *volatile *temptail = &temphead;
 535
 536 struct sortfile
 537 {
 538   char const *name;
 539   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 540 };
 541
 542 /* A table where we store compression process states.  We clean up all
 543    processes in a timely manner so as not to exhaust system resources,
 544    so we store the info on whether the process is still running, or has
 545    been reaped here.  */
 546 static Hash_table *proctab;
 547
 548 enum { INIT_PROCTAB_SIZE = 47 };
 549
 550 enum procstate { ALIVE, ZOMBIE };
 551
 552 /* A proctab entry.  The COUNT field is there in case we fork a new
 553    compression process that has the same PID as an old zombie process
 554    that is still in the table (because the process to decompress the
 555    temp file it was associated with hasn't started yet).  */
 556 struct procnode
 557 {
 558   pid_t pid;
 559   enum procstate state;
 560   size_t count;
 561 };
 562
 563 static size_t
 564 proctab_hasher (const void *entry, size_t tabsize)
 565 {
 566   const struct procnode *node = entry;
 567   return node->pid % tabsize;
 568 }
 569
 570 static bool
 571 proctab_comparator (const void *e1, const void *e2)
 572 {
 573   const struct procnode *n1 = e1, *n2 = e2;
 574   return n1->pid == n2->pid;
 575 }
 576
 577 /* The total number of forked processes (compressors and decompressors)
 578    that have not been reaped yet. */
 579 static size_t nprocs;
 580
 581 /* The number of child processes we'll allow before we try to reap some. */
 582 enum { MAX_PROCS_BEFORE_REAP = 2 };
 583
 584 /* If 0 < PID, wait for the child process with that PID to exit.
 585    If PID is -1, clean up a random child process which has finished and
 586    return the process ID of that child.  If PID is -1 and no processes
 587    have quit yet, return 0 without waiting.  */
 588
 589 static pid_t
 590 reap (pid_t pid)
 591 {
 592   int status;
 593   pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);
 594
 595   if (cpid < 0)
 596     error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
 597            compress_program);
 598   else if (0 < cpid)
 599     {
 600       if (! WIFEXITED (status) || WEXITSTATUS (status))
 601         error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
 602                compress_program);
 603       --nprocs;
 604     }
 605
 606   return cpid;
 607 }
 608
 609 /* Add the PID of a running compression process to proctab, or update
 610    the entry COUNT and STATE fields if it's already there.  This also
 611    creates the table for us the first time it's called.  */
 612
 613 static void
 614 register_proc (pid_t pid)
 615 {
 616   struct procnode test, *node;
 617
 618   if (! proctab)
 619     {
 620       proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
 621                                  proctab_hasher,
 622                                  proctab_comparator,
 623                                  free);
 624       if (! proctab)
 625         xalloc_die ();
 626     }
 627
 628   test.pid = pid;
 629   node = hash_lookup (proctab, &test);
 630   if (node)
 631     {
 632       node->state = ALIVE;
 633       ++node->count;
 634     }
 635   else
 636     {
 637       node = xmalloc (sizeof *node);
 638       node->pid = pid;
 639       node->state = ALIVE;
 640       node->count = 1;
 641       hash_insert (proctab, node);
 642     }
 643 }
 644
 645 /* This is called when we reap a random process.  We don't know
 646    whether we have reaped a compression process or a decompression
 647    process until we look in the table.  If there's an ALIVE entry for
 648    it, then we have reaped a compression process, so change the state
 649    to ZOMBIE.  Otherwise, it's a decompression processes, so ignore it.  */
 650
 651 static void
 652 update_proc (pid_t pid)
 653 {
 654   struct procnode test, *node;
 655
 656   test.pid = pid;
 657   node = hash_lookup (proctab, &test);
 658   if (node)
 659     node->state = ZOMBIE;
 660 }
 661
 662 /* This is for when we need to wait for a compression process to exit.
 663    If it has a ZOMBIE entry in the table then it's already dead and has
 664    been reaped.  Note that if there's an ALIVE entry for it, it still may
 665    already have died and been reaped if a second process was created with
 666    the same PID.  This is probably exceedingly rare, but to be on the safe
 667    side we will have to wait for any compression process with this PID.  */
 668
 669 static void
 670 wait_proc (pid_t pid)
 671 {
 672   struct procnode test, *node;
 673
 674   test.pid = pid;
 675   node = hash_lookup (proctab, &test);
 676   if (node->state == ALIVE)
 677     reap (pid);
 678
 679   node->state = ZOMBIE;
 680   if (! --node->count)
 681     {
 682       hash_delete (proctab, node);
 683       free (node);
 684     }
 685 }
 686
 687 /* Keep reaping finished children as long as there are more to reap.
 688    This doesn't block waiting for any of them, it only reaps those
 689    that are already dead.  */
 690
 691 static void
 692 reap_some (void)
 693 {
 694   pid_t pid;
 695
 696   while (0 < nprocs && (pid = reap (-1)))
 697     update_proc (pid);
 698 }
 699
 700 /* Clean up any remaining temporary files.  */
 701
 702 static void
 703 cleanup (void)
 704 {
 705   struct tempnode const *node;
 706
 707   for (node = temphead; node; node = node->next)
 708     unlink (node->name);
 709   temphead = NULL;
 710 }
 711
 712 /* Cleanup actions to take when exiting.  */
 713
 714 static void
 715 exit_cleanup (void)
 716 {
 717   if (temphead)
 718     {
 719       /* Clean up any remaining temporary files in a critical section so
 720          that a signal handler does not try to clean them too.  */
 721       struct cs_status cs = cs_enter ();
 722       cleanup ();
 723       cs_leave (cs);
 724     }
 725
 726   close_stdout ();
 727 }
 728
 729 /* Create a new temporary file, returning its newly allocated tempnode.
 730    Store into *PFD the file descriptor open for writing.  */
 731
 732 static struct tempnode *
 733 create_temp_file (int *pfd)
 734 {
 735   static char const slashbase[] = "/sortXXXXXX";
 736   static size_t temp_dir_index;
 737   int fd;
 738   int saved_errno;
 739   char const *temp_dir = temp_dirs[temp_dir_index];
 740   size_t len = strlen (temp_dir);
 741   struct tempnode *node =
 742     xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
 743   char *file = node->name;
 744   struct cs_status cs;
 745
 746   memcpy (file, temp_dir, len);
 747   memcpy (file + len, slashbase, sizeof slashbase);
 748   node->next = NULL;
 749   node->pid = 0;
 750   if (++temp_dir_index == temp_dir_count)
 751     temp_dir_index = 0;
 752
 753   /* Create the temporary file in a critical section, to avoid races.  */
 754   cs = cs_enter ();
 755   fd = mkstemp (file);
 756   if (0 <= fd)
 757     {
 758       *temptail = node;
 759       temptail = &node->next;
 760     }
 761   saved_errno = errno;
 762   cs_leave (cs);
 763   errno = saved_errno;
 764
 765   if (fd < 0)
 766     error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
 767            quote (temp_dir));
 768
 769   *pfd = fd;
 770   return node;
 771 }
 772
 773 /* Return a stream for FILE, opened with mode HOW.  A null FILE means
 774    standard output; HOW should be "w".  When opening for input, "-"
 775    means standard input.  To avoid confusion, do not return file
 776    descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
 777    opening an ordinary FILE.  */
 778
 779 static FILE *
 780 xfopen (const char *file, const char *how)
 781 {
 782   FILE *fp;
 783
 784   if (!file)
 785     fp = stdout;
 786   else if (STREQ (file, "-") && *how == 'r')
 787     {
 788       have_read_stdin = true;
 789       fp = stdin;
 790     }
 791   else
 792     {
 793       fp = fopen (file, how);
 794       if (! fp)
 795         die (_("open failed"), file);
 796     }
 797
 798   return fp;
 799 }
 800
 801 /* Close FP, whose name is FILE, and report any errors.  */
 802
 803 static void
 804 xfclose (FILE *fp, char const *file)
 805 {
 806   switch (fileno (fp))
 807     {
 808     case STDIN_FILENO:
 809       /* Allow reading stdin from tty more than once.  */
 810       if (feof (fp))
 811         clearerr (fp);
 812       break;
 813
 814     case STDOUT_FILENO:
 815       /* Don't close stdout just yet.  close_stdout does that.  */
 816       if (fflush (fp) != 0)
 817         die (_("fflush failed"), file);
 818       break;
 819
 820     default:
 821       if (fclose (fp) != 0)
 822         die (_("close failed"), file);
 823       break;
 824     }
 825 }
 826
 827 static void
 828 dup2_or_die (int oldfd, int newfd)
 829 {
 830   if (dup2 (oldfd, newfd) < 0)
 831     error (SORT_FAILURE, errno, _("dup2 failed"));
 832 }
 833
 834 /* Fork a child process for piping to and do common cleanup.  The
 835    TRIES parameter tells us how many times to try to fork before
 836    giving up.  Return the PID of the child or -1 if fork failed.  */
 837
 838 static pid_t
 839 pipe_fork (int pipefds[2], size_t tries)
 840 {
 841 #if HAVE_WORKING_FORK
 842   struct tempnode *saved_temphead;
 843   int saved_errno;
 844   unsigned int wait_retry = 1;
 845   pid_t pid IF_LINT (= -1);
 846   struct cs_status cs;
 847
 848   if (pipe (pipefds) < 0)
 849     return -1;
 850
 851   while (tries--)
 852     {
 853       /* This is so the child process won't delete our temp files
 854          if it receives a signal before exec-ing.  */
 855       cs = cs_enter ();
 856       saved_temphead = temphead;
 857       temphead = NULL;
 858
 859       pid = fork ();
 860       saved_errno = errno;
 861       if (pid)
 862         temphead = saved_temphead;
 863
 864       cs_leave (cs);
 865       errno = saved_errno;
 866
 867       if (0 <= pid || errno != EAGAIN)
 868         break;
 869       else
 870         {
 871           sleep (wait_retry);
 872           wait_retry *= 2;
 873           reap_some ();
 874         }
 875     }
 876
 877   if (pid < 0)
 878     {
 879       close (pipefds[0]);
 880       close (pipefds[1]);
 881     }
 882   else if (pid == 0)
 883     {
 884       close (STDIN_FILENO);
 885       close (STDOUT_FILENO);
 886     }
 887   else
 888     ++nprocs;
 889
 890   return pid;
 891
 892 #else  /* ! HAVE_WORKING_FORK */
 893   return -1;
 894 #endif
 895 }
 896
 897 /* Create a temporary file and start a compression program to filter output
 898    to that file.  Set *PFP to the file handle and if *PPID is non-NULL,
 899    set it to the PID of the newly-created process.  */
 900
 901 static char *
 902 create_temp (FILE **pfp, pid_t *ppid)
 903 {
 904   int tempfd;
 905   struct tempnode *node = create_temp_file (&tempfd);
 906   char *name = node->name;
 907
 908   if (compress_program)
 909     {
 910       int pipefds[2];
 911
 912       node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
 913       if (0 < node->pid)
 914         {
 915           close (tempfd);
 916           close (pipefds[0]);
 917           tempfd = pipefds[1];
 918
 919           register_proc (node->pid);
 920         }
 921       else if (node->pid == 0)
 922         {
 923           close (pipefds[1]);
 924           dup2_or_die (tempfd, STDOUT_FILENO);
 925           close (tempfd);
 926           dup2_or_die (pipefds[0], STDIN_FILENO);
 927           close (pipefds[0]);
 928
 929           if (execlp (compress_program, compress_program, (char *) NULL) < 0)
 930             error (SORT_FAILURE, errno, _("couldn't execute %s"),
 931                    compress_program);
 932         }
 933       else
 934         node->pid = 0;
 935     }
 936
 937   *pfp = fdopen (tempfd, "w");
 938   if (! *pfp)
 939     die (_("couldn't create temporary file"), name);
 940
 941   if (ppid)
 942     *ppid = node->pid;
 943
 944   return name;
 945 }
 946
 947 /* Open a compressed temp file and start a decompression process through
 948    which to filter the input.  PID must be the valid processes ID of the
 949    process used to compress the file.  */
 950
 951 static FILE *
 952 open_temp (const char *name, pid_t pid)
 953 {
 954   int tempfd, pipefds[2];
 955   pid_t child_pid;
 956   FILE *fp;
 957
 958   wait_proc (pid);
 959
 960   tempfd = open (name, O_RDONLY);
 961   if (tempfd < 0)
 962     die (_("couldn't open temporary file"), name);
 963
 964   child_pid = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
 965   if (0 < child_pid)
 966     {
 967       close (tempfd);
 968       close (pipefds[1]);
 969     }
 970   else if (child_pid == 0)
 971     {
 972       close (pipefds[0]);
 973       dup2_or_die (tempfd, STDIN_FILENO);
 974       close (tempfd);
 975       dup2_or_die (pipefds[1], STDOUT_FILENO);
 976       close (pipefds[1]);
 977
 978       if (execlp (compress_program, compress_program, "-d", (char *) NULL) < 0)
 979         error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
 980                compress_program);
 981     }
 982   else
 983     error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
 984            compress_program);
 985
 986   fp = fdopen (pipefds[0], "r");
 987   if (! fp)
 988     die (_("couldn't create temporary file"), name);
 989
 990   return fp;
 991 }
 992
 993 static void
 994 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
 995 {
 996   if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
 997     die (_("write failed"), output_file);
 998 }
 999
1000 /* Append DIR to the array of temporary directory names.  */
1001 static void
1002 add_temp_dir (char const *dir)
1003 {
1004   if (temp_dir_count == temp_dir_alloc)
1005     temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1006
1007   temp_dirs[temp_dir_count++] = dir;
1008 }
1009
1010 /* Remove NAME from the list of temporary files.  */
1011
1012 static void
1013 zaptemp (const char *name)
1014 {
1015   struct tempnode *volatile *pnode;
1016   struct tempnode *node;
1017   struct tempnode *next;
1018   int unlink_status;
1019   int unlink_errno = 0;
1020   struct cs_status cs;
1021
1022   for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1023     continue;
1024
1025   /* Unlink the temporary file in a critical section to avoid races.  */
1026   next = node->next;
1027   cs = cs_enter ();
1028   unlink_status = unlink (name);
1029   unlink_errno = errno;
1030   *pnode = next;
1031   cs_leave (cs);
1032
1033   if (unlink_status != 0)
1034     error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1035   if (! next)
1036     temptail = pnode;
1037   free (node);
1038 }
1039
1040 #if HAVE_NL_LANGINFO
1041
1042 static int
1043 struct_month_cmp (const void *m1, const void *m2)
1044 {
1045   struct month const *month1 = m1;
1046   struct month const *month2 = m2;
1047   return strcmp (month1->name, month2->name);
1048 }
1049
1050 #endif
1051
1052 /* Initialize the character class tables. */
1053
1054 static void
1055 inittables (void)
1056 {
1057   size_t i;
1058
1059   for (i = 0; i < UCHAR_LIM; ++i)
1060     {
1061       blanks[i] = !! isblank (i);
1062       nonprinting[i] = ! isprint (i);
1063       nondictionary[i] = ! isalnum (i) && ! isblank (i);
1064       fold_toupper[i] = toupper (i);
1065     }
1066
1067 #if HAVE_NL_LANGINFO
1068   /* If we're not in the "C" locale, read different names for months.  */
1069   if (hard_LC_TIME)
1070     {
1071       for (i = 0; i < MONTHS_PER_YEAR; i++)
1072         {
1073           char const *s;
1074           size_t s_len;
1075           size_t j;
1076           char *name;
1077
1078           s = (char *) nl_langinfo (ABMON_1 + i);
1079           s_len = strlen (s);
1080           monthtab[i].name = name = xmalloc (s_len + 1);
1081           monthtab[i].val = i + 1;
1082
1083           for (j = 0; j < s_len; j++)
1084             name[j] = fold_toupper[to_uchar (s[j])];
1085           name[j] = '\0';
1086         }
1087       qsort ((void *) monthtab, MONTHS_PER_YEAR,
1088              sizeof *monthtab, struct_month_cmp);
1089     }
1090 #endif
1091 }
1092
1093 /* Specify how many inputs may be merged at once.
1094    This may be set on the command-line with the
1095    --batch-size option. */
1096 static void
1097 specify_nmerge (int oi, char c, char const *s)
1098 {
1099   uintmax_t n;
1100   struct rlimit rlimit;
1101   enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1102
1103   /* Try to find out how many file descriptors we'll be able
1104      to open.  We need at least nmerge + 3 (STDIN_FILENO,
1105      STDOUT_FILENO and STDERR_FILENO). */
1106   unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1107                               ? rlimit.rlim_cur
1108                               : OPEN_MAX)
1109                              - 3);
1110
1111   if (e == LONGINT_OK)
1112     {
1113       nmerge = n;
1114       if (nmerge != n)
1115         e = LONGINT_OVERFLOW;
1116       else
1117         {
1118           if (nmerge < 2)
1119             {
1120               error (0, 0, _("invalid --%s argument %s"),
1121                      long_options[oi].name, quote(s));
1122               error (SORT_FAILURE, 0,
1123                      _("minimum --%s argument is %s"),
1124                      long_options[oi].name, quote("2"));
1125             }
1126           else if (max_nmerge < nmerge)
1127             {
1128               e = LONGINT_OVERFLOW;
1129             }
1130           else
1131             return;
1132         }
1133     }
1134
1135   if (e == LONGINT_OVERFLOW)
1136     {
1137       char max_nmerge_buf[INT_BUFSIZE_BOUND (unsigned int)];
1138       error (0, 0, _("--%s argument %s too large"),
1139              long_options[oi].name, quote(s));
1140       error (SORT_FAILURE, 0,
1141              _("maximum --%s argument with current rlimit is %s"),
1142              long_options[oi].name,
1143              uinttostr (max_nmerge, max_nmerge_buf));
1144     }
1145   else
1146     xstrtol_fatal (e, oi, c, long_options, s);
1147 }
1148
1149 /* Specify the amount of main memory to use when sorting.  */
1150 static void
1151 specify_sort_size (int oi, char c, char const *s)
1152 {
1153   uintmax_t n;
1154   char *suffix;
1155   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1156
1157   /* The default unit is KiB.  */
1158   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1159     {
1160       if (n <= UINTMAX_MAX / 1024)
1161         n *= 1024;
1162       else
1163         e = LONGINT_OVERFLOW;
1164     }
1165
1166   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
1167   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1168     switch (suffix[0])
1169       {
1170       case 'b':
1171         e = LONGINT_OK;
1172         break;
1173
1174       case '%':
1175         {
1176           double mem = physmem_total () * n / 100;
1177
1178           /* Use "<", not "<=", to avoid problems with rounding.  */
1179           if (mem < UINTMAX_MAX)
1180             {
1181               n = mem;
1182               e = LONGINT_OK;
1183             }
1184           else
1185             e = LONGINT_OVERFLOW;
1186         }
1187         break;
1188       }
1189
1190   if (e == LONGINT_OK)
1191     {
1192       /* If multiple sort sizes are specified, take the maximum, so
1193          that option order does not matter.  */
1194       if (n < sort_size)
1195         return;
1196
1197       sort_size = n;
1198       if (sort_size == n)
1199         {
1200           sort_size = MAX (sort_size, MIN_SORT_SIZE);
1201           return;
1202         }
1203
1204       e = LONGINT_OVERFLOW;
1205     }
1206
1207   xstrtol_fatal (e, oi, c, long_options, s);
1208 }
1209
1210 /* Return the default sort size.  */
1211 static size_t
1212 default_sort_size (void)
1213 {
1214   /* Let MEM be available memory or 1/8 of total memory, whichever
1215      is greater.  */
1216   double avail = physmem_available ();
1217   double total = physmem_total ();
1218   double mem = MAX (avail, total / 8);
1219   struct rlimit rlimit;
1220
1221   /* Let SIZE be MEM, but no more than the maximum object size or
1222      system resource limits.  Avoid the MIN macro here, as it is not
1223      quite right when only one argument is floating point.  Don't
1224      bother to check for values like RLIM_INFINITY since in practice
1225      they are not much less than SIZE_MAX.  */
1226   size_t size = SIZE_MAX;
1227   if (mem < size)
1228     size = mem;
1229   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1230     size = rlimit.rlim_cur;
1231 #ifdef RLIMIT_AS
1232   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1233     size = rlimit.rlim_cur;
1234 #endif
1235
1236   /* Leave a large safety margin for the above limits, as failure can
1237      occur when they are exceeded.  */
1238   size /= 2;
1239
1240 #ifdef RLIMIT_RSS
1241   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1242      Exceeding RSS is not fatal, but can be quite slow.  */
1243   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1244     size = rlimit.rlim_cur / 16 * 15;
1245 #endif
1246
1247   /* Use no less than the minimum.  */
1248   return MAX (size, MIN_SORT_SIZE);
1249 }
1250
1251 /* Return the sort buffer size to use with the input files identified
1252    by FPS and FILES, which are alternate names of the same files.
1253    NFILES gives the number of input files; NFPS may be less.  Assume
1254    that each input line requires LINE_BYTES extra bytes' worth of line
1255    information.  Do not exceed the size bound specified by the user
1256    (or a default size bound, if the user does not specify one).  */
1257
1258 static size_t
1259 sort_buffer_size (FILE *const *fps, size_t nfps,
1260                   char *const *files, size_t nfiles,
1261                   size_t line_bytes)
1262 {
1263   /* A bound on the input size.  If zero, the bound hasn't been
1264      determined yet.  */
1265   static size_t size_bound;
1266
1267   /* In the worst case, each input byte is a newline.  */
1268   size_t worst_case_per_input_byte = line_bytes + 1;
1269
1270   /* Keep enough room for one extra input line and an extra byte.
1271      This extra room might be needed when preparing to read EOF.  */
1272   size_t size = worst_case_per_input_byte + 1;
1273
1274   size_t i;
1275
1276   for (i = 0; i < nfiles; i++)
1277     {
1278       struct stat st;
1279       off_t file_size;
1280       size_t worst_case;
1281
1282       if ((i < nfps ? fstat (fileno (fps[i]), &st)
1283            : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1284            : stat (files[i], &st))
1285           != 0)
1286         die (_("stat failed"), files[i]);
1287
1288       if (S_ISREG (st.st_mode))
1289         file_size = st.st_size;
1290       else
1291         {
1292           /* The file has unknown size.  If the user specified a sort
1293              buffer size, use that; otherwise, guess the size.  */
1294           if (sort_size)
1295             return sort_size;
1296           file_size = INPUT_FILE_SIZE_GUESS;
1297         }
1298
1299       if (! size_bound)
1300         {
1301           size_bound = sort_size;
1302           if (! size_bound)
1303             size_bound = default_sort_size ();
1304         }
1305
1306       /* Add the amount of memory needed to represent the worst case
1307          where the input consists entirely of newlines followed by a
1308          single non-newline.  Check for overflow.  */
1309       worst_case = file_size * worst_case_per_input_byte + 1;
1310       if (file_size != worst_case / worst_case_per_input_byte
1311           || size_bound - size <= worst_case)
1312         return size_bound;
1313       size += worst_case;
1314     }
1315
1316   return size;
1317 }
1318
1319 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
1320    must be at least sizeof (struct line).  Allocate ALLOC bytes
1321    initially.  */
1322
1323 static void
1324 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1325 {
1326   /* Ensure that the line array is properly aligned.  If the desired
1327      size cannot be allocated, repeatedly halve it until allocation
1328      succeeds.  The smaller allocation may hurt overall performance,
1329      but that's better than failing.  */
1330   for (;;)
1331     {
1332       alloc += sizeof (struct line) - alloc % sizeof (struct line);
1333       buf->buf = malloc (alloc);
1334       if (buf->buf)
1335         break;
1336       alloc /= 2;
1337       if (alloc <= line_bytes + 1)
1338         xalloc_die ();
1339     }
1340
1341   buf->line_bytes = line_bytes;
1342   buf->alloc = alloc;
1343   buf->used = buf->left = buf->nlines = 0;
1344   buf->eof = false;
1345 }
1346
1347 /* Return one past the limit of the line array.  */
1348
1349 static inline struct line *
1350 buffer_linelim (struct buffer const *buf)
1351 {
1352   return (struct line *) (buf->buf + buf->alloc);
1353 }
1354
1355 /* Return a pointer to the first character of the field specified
1356    by KEY in LINE. */
1357
1358 static char *
1359 begfield (const struct line *line, const struct keyfield *key)
1360 {
1361   char *ptr = line->text, *lim = ptr + line->length - 1;
1362   size_t sword = key->sword;
1363   size_t schar = key->schar;
1364   size_t remaining_bytes;
1365
1366   /* The leading field separator itself is included in a field when -t
1367      is absent.  */
1368
1369   if (tab != TAB_DEFAULT)
1370     while (ptr < lim && sword--)
1371       {
1372         while (ptr < lim && *ptr != tab)
1373           ++ptr;
1374         if (ptr < lim)
1375           ++ptr;
1376       }
1377   else
1378     while (ptr < lim && sword--)
1379       {
1380         while (ptr < lim && blanks[to_uchar (*ptr)])
1381           ++ptr;
1382         while (ptr < lim && !blanks[to_uchar (*ptr)])
1383           ++ptr;
1384       }
1385
1386   if (key->skipsblanks)
1387     while (ptr < lim && blanks[to_uchar (*ptr)])
1388       ++ptr;
1389
1390   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
1391   remaining_bytes = lim - ptr;
1392   if (schar < remaining_bytes)
1393     ptr += schar;
1394   else
1395     ptr = lim;
1396
1397   return ptr;
1398 }
1399
1400 /* Return the limit of (a pointer to the first character after) the field
1401    in LINE specified by KEY. */
1402
1403 static char *
1404 limfield (const struct line *line, const struct keyfield *key)
1405 {
1406   char *ptr = line->text, *lim = ptr + line->length - 1;
1407   size_t eword = key->eword, echar = key->echar;
1408   size_t remaining_bytes;
1409
1410   /* Move PTR past EWORD fields or to one past the last byte on LINE,
1411      whichever comes first.  If there are more than EWORD fields, leave
1412      PTR pointing at the beginning of the field having zero-based index,
1413      EWORD.  If a delimiter character was specified (via -t), then that
1414      `beginning' is the first character following the delimiting TAB.
1415      Otherwise, leave PTR pointing at the first `blank' character after
1416      the preceding field.  */
1417   if (tab != TAB_DEFAULT)
1418     while (ptr < lim && eword--)
1419       {
1420         while (ptr < lim && *ptr != tab)
1421           ++ptr;
1422         if (ptr < lim && (eword | echar))
1423           ++ptr;
1424       }
1425   else
1426     while (ptr < lim && eword--)
1427       {
1428         while (ptr < lim && blanks[to_uchar (*ptr)])
1429           ++ptr;
1430         while (ptr < lim && !blanks[to_uchar (*ptr)])
1431           ++ptr;
1432       }
1433
1434 #ifdef POSIX_UNSPECIFIED
1435   /* The following block of code makes GNU sort incompatible with
1436      standard Unix sort, so it's ifdef'd out for now.
1437      The POSIX spec isn't clear on how to interpret this.
1438      FIXME: request clarification.
1439
1440      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1441      Date: Thu, 30 May 96 12:20:41 -0400
1442      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1443
1444      [...]I believe I've found another bug in `sort'.
1445
1446      $ cat /tmp/sort.in
1447      a b c 2 d
1448      pq rs 1 t
1449      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1450      a b c 2 d
1451      pq rs 1 t
1452      $ /bin/sort -k1.7,1.7 </tmp/sort.in
1453      pq rs 1 t
1454      a b c 2 d
1455
1456      Unix sort produced the answer I expected: sort on the single character
1457      in column 7.  GNU sort produced different results, because it disagrees
1458      on the interpretation of the key-end spec "M.N".  Unix sort reads this
1459      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1460      "skip M-1 fields, then either N-1 characters or the rest of the current
1461      field, whichever comes first".  This extra clause applies only to
1462      key-ends, not key-starts.
1463      */
1464
1465   /* Make LIM point to the end of (one byte past) the current field.  */
1466   if (tab != TAB_DEFAULT)
1467     {
1468       char *newlim;
1469       newlim = memchr (ptr, tab, lim - ptr);
1470       if (newlim)
1471         lim = newlim;
1472     }
1473   else
1474     {
1475       char *newlim;
1476       newlim = ptr;
1477       while (newlim < lim && blanks[to_uchar (*newlim)])
1478         ++newlim;
1479       while (newlim < lim && !blanks[to_uchar (*newlim)])
1480         ++newlim;
1481       lim = newlim;
1482     }
1483 #endif
1484
1485   /* If we're ignoring leading blanks when computing the End
1486      of the field, don't start counting bytes until after skipping
1487      past any leading blanks. */
1488   if (key->skipeblanks)
1489     while (ptr < lim && blanks[to_uchar (*ptr)])
1490       ++ptr;
1491
1492   /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
1493   remaining_bytes = lim - ptr;
1494   if (echar < remaining_bytes)
1495     ptr += echar;
1496   else
1497     ptr = lim;
1498
1499   return ptr;
1500 }
1501
1502 /* Fill BUF reading from FP, moving buf->left bytes from the end
1503    of buf->buf to the beginning first.  If EOF is reached and the
1504    file wasn't terminated by a newline, supply one.  Set up BUF's line
1505    table too.  FILE is the name of the file corresponding to FP.
1506    Return true if some input was read.  */
1507
1508 static bool
1509 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1510 {
1511   struct keyfield const *key = keylist;
1512   char eol = eolchar;
1513   size_t line_bytes = buf->line_bytes;
1514   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1515
1516   if (buf->eof)
1517     return false;
1518
1519   if (buf->used != buf->left)
1520     {
1521       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1522       buf->used = buf->left;
1523       buf->nlines = 0;
1524     }
1525
1526   for (;;)
1527     {
1528       char *ptr = buf->buf + buf->used;
1529       struct line *linelim = buffer_linelim (buf);
1530       struct line *line = linelim - buf->nlines;
1531       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1532       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1533
1534       while (line_bytes + 1 < avail)
1535         {
1536           /* Read as many bytes as possible, but do not read so many
1537              bytes that there might not be enough room for the
1538              corresponding line array.  The worst case is when the
1539              rest of the input file consists entirely of newlines,
1540              except that the last byte is not a newline.  */
1541           size_t readsize = (avail - 1) / (line_bytes + 1);
1542           size_t bytes_read = fread (ptr, 1, readsize, fp);
1543           char *ptrlim = ptr + bytes_read;
1544           char *p;
1545           avail -= bytes_read;
1546
1547           if (bytes_read != readsize)
1548             {
1549               if (ferror (fp))
1550                 die (_("read failed"), file);
1551               if (feof (fp))
1552                 {
1553                   buf->eof = true;
1554                   if (buf->buf == ptrlim)
1555                     return false;
1556                   if (ptrlim[-1] != eol)
1557                     *ptrlim++ = eol;
1558                 }
1559             }
1560
1561           /* Find and record each line in the just-read input.  */
1562           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1563             {
1564               ptr = p + 1;
1565               line--;
1566               line->text = line_start;
1567               line->length = ptr - line_start;
1568               mergesize = MAX (mergesize, line->length);
1569               avail -= line_bytes;
1570
1571               if (key)
1572                 {
1573                   /* Precompute the position of the first key for
1574                      efficiency.  */
1575                   line->keylim = (key->eword == SIZE_MAX
1576                                   ? p
1577                                   : limfield (line, key));
1578
1579                   if (key->sword != SIZE_MAX)
1580                     line->keybeg = begfield (line, key);
1581                   else
1582                     {
1583                       if (key->skipsblanks)
1584                         while (blanks[to_uchar (*line_start)])
1585                           line_start++;
1586                       line->keybeg = line_start;
1587                     }
1588                 }
1589
1590               line_start = ptr;
1591             }
1592
1593           ptr = ptrlim;
1594           if (buf->eof)
1595             break;
1596         }
1597
1598       buf->used = ptr - buf->buf;
1599       buf->nlines = buffer_linelim (buf) - line;
1600       if (buf->nlines != 0)
1601         {
1602           buf->left = ptr - line_start;
1603           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1604           return true;
1605         }
1606
1607       {
1608         /* The current input line is too long to fit in the buffer.
1609            Double the buffer size and try again, keeping it properly
1610            aligned.  */
1611         size_t line_alloc = buf->alloc / sizeof (struct line);
1612         buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1613         buf->alloc = line_alloc * sizeof (struct line);
1614       }
1615     }
1616 }
1617
1618 /* Compare strings A and B as numbers without explicitly converting them to
1619    machine numbers.  Comparatively slow for short strings, but asymptotically
1620    hideously fast. */
1621
1622 static int
1623 numcompare (const char *a, const char *b)
1624 {
1625   while (blanks[to_uchar (*a)])
1626     a++;
1627   while (blanks[to_uchar (*b)])
1628     b++;
1629
1630   return strnumcmp (a, b, decimal_point, thousands_sep);
1631 }
1632
1633 static int
1634 general_numcompare (const char *sa, const char *sb)
1635 {
1636   /* FIXME: add option to warn about failed conversions.  */
1637   /* FIXME: maybe add option to try expensive FP conversion
1638      only if A and B can't be compared more cheaply/accurately.  */
1639
1640   char *ea;
1641   char *eb;
1642   double a = strtod (sa, &ea);
1643   double b = strtod (sb, &eb);
1644
1645   /* Put conversion errors at the start of the collating sequence.  */
1646   if (sa == ea)
1647     return sb == eb ? 0 : -1;
1648   if (sb == eb)
1649     return 1;
1650
1651   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1652      conversion errors but before numbers; sort them by internal
1653      bit-pattern, for lack of a more portable alternative.  */
1654   return (a < b ? -1
1655           : a > b ? 1
1656           : a == b ? 0
1657           : b == b ? -1
1658           : a == a ? 1
1659           : memcmp ((char *) &a, (char *) &b, sizeof a));
1660 }
1661
1662 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1663    Return 0 if the name in S is not recognized.  */
1664
1665 static int
1666 getmonth (char const *month, size_t len)
1667 {
1668   size_t lo = 0;
1669   size_t hi = MONTHS_PER_YEAR;
1670   char const *monthlim = month + len;
1671
1672   for (;;)
1673     {
1674       if (month == monthlim)
1675         return 0;
1676       if (!blanks[to_uchar (*month)])
1677         break;
1678       ++month;
1679     }
1680
1681   do
1682     {
1683       size_t ix = (lo + hi) / 2;
1684       char const *m = month;
1685       char const *n = monthtab[ix].name;
1686
1687       for (;; m++, n++)
1688         {
1689           if (!*n)
1690             return monthtab[ix].val;
1691           if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1692             {
1693               hi = ix;
1694               break;
1695             }
1696           else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1697             {
1698               lo = ix + 1;
1699               break;
1700             }
1701         }
1702     }
1703   while (lo < hi);
1704
1705   return 0;
1706 }
1707
1708 /* A source of random data.  */
1709 static struct randread_source *randread_source;
1710
1711 /* Return the Ith randomly-generated state.  The caller must invoke
1712    random_state (H) for all H less than I before invoking random_state
1713    (I).  */
1714
1715 static struct md5_ctx
1716 random_state (size_t i)
1717 {
1718   /* An array of states resulting from the random data, and counts of
1719      its used and allocated members.  */
1720   static struct md5_ctx *state;
1721   static size_t used;
1722   static size_t allocated;
1723
1724   struct md5_ctx *s = &state[i];
1725
1726   if (used <= i)
1727     {
1728       unsigned char buf[MD5_DIGEST_SIZE];
1729
1730       used++;
1731
1732       if (allocated <= i)
1733         {
1734           state = X2NREALLOC (state, &allocated);
1735           s = &state[i];
1736         }
1737
1738       randread (randread_source, buf, sizeof buf);
1739       md5_init_ctx (s);
1740       md5_process_bytes (buf, sizeof buf, s);
1741     }
1742
1743   return *s;
1744 }
1745
1746 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1747    with length LENGTHB.  Return negative if less, zero if equal,
1748    positive if greater.  */
1749
1750 static int
1751 cmp_hashes (char const *texta, size_t lena,
1752             char const *textb, size_t lenb)
1753 {
1754   /* Try random hashes until a pair of hashes disagree.  But if the
1755      first pair of random hashes agree, check whether the keys are
1756      identical and if so report no difference.  */
1757   int diff;
1758   size_t i;
1759   for (i = 0; ; i++)
1760     {
1761       uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
1762       struct md5_ctx s[2];
1763       s[0] = s[1] = random_state (i);
1764       md5_process_bytes (texta, lena, &s[0]);  md5_finish_ctx (&s[0], dig[0]);
1765       md5_process_bytes (textb, lenb, &s[1]);  md5_finish_ctx (&s[1], dig[1]);
1766       diff = memcmp (dig[0], dig[1], sizeof dig[0]);
1767       if (diff != 0)
1768         break;
1769       if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
1770         break;
1771     }
1772
1773   return diff;
1774 }
1775
1776 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1777    using one or more random hash functions.  */
1778
1779 static int
1780 compare_random (char *restrict texta, size_t lena,
1781                 char *restrict textb, size_t lenb)
1782 {
1783   int diff;
1784
1785   if (! hard_LC_COLLATE)
1786     diff = cmp_hashes (texta, lena, textb, lenb);
1787   else
1788     {
1789       /* Transform the text into the basis of comparison, so that byte
1790          strings that would otherwise considered to be equal are
1791          considered equal here even if their bytes differ.  */
1792
1793       char *buf = NULL;
1794       char stackbuf[4000];
1795       size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
1796       bool a_fits = tlena <= sizeof stackbuf;
1797       size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
1798                                (a_fits ? sizeof stackbuf - tlena : 0),
1799                                textb, lenb);
1800
1801       if (a_fits && tlena + tlenb <= sizeof stackbuf)
1802         buf = stackbuf;
1803       else
1804         {
1805           /* Adding 1 to the buffer size lets xmemxfrm run a bit
1806              faster by avoiding the need for an extra buffer copy.  */
1807           buf = xmalloc (tlena + tlenb + 1);
1808           xmemxfrm (buf, tlena + 1, texta, lena);
1809           xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
1810         }
1811
1812       diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
1813
1814       if (buf != stackbuf)
1815         free (buf);
1816     }
1817
1818   return diff;
1819 }
1820
1821 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1822    using strverscmp.  */
1823
1824 static int
1825 compare_version (char *restrict texta, size_t lena,
1826                  char *restrict textb, size_t lenb)
1827 {
1828   int diff;
1829
1830   /* It is necessary to save the character after the end of the field.
1831      "strverscmp" works with NUL terminated strings.  Our blocks of
1832      text are not necessarily terminated with a NUL byte. */
1833   char sv_a = texta[lena];
1834   char sv_b = textb[lenb];
1835
1836   texta[lena] = '\0';
1837   textb[lenb] = '\0';
1838
1839   diff = strverscmp (texta, textb);
1840
1841   texta[lena] = sv_a;
1842   textb[lenb] = sv_b;
1843
1844   return diff;
1845 }
1846
1847 /* Compare two lines A and B trying every key in sequence until there
1848    are no more keys or a difference is found. */
1849
1850 static int
1851 keycompare (const struct line *a, const struct line *b)
1852 {
1853   struct keyfield const *key = keylist;
1854
1855   /* For the first iteration only, the key positions have been
1856      precomputed for us. */
1857   char *texta = a->keybeg;
1858   char *textb = b->keybeg;
1859   char *lima = a->keylim;
1860   char *limb = b->keylim;
1861
1862   int diff;
1863
1864   for (;;)
1865     {
1866       char const *translate = key->translate;
1867       bool const *ignore = key->ignore;
1868
1869       /* Find the lengths. */
1870       size_t lena = lima <= texta ? 0 : lima - texta;
1871       size_t lenb = limb <= textb ? 0 : limb - textb;
1872
1873       /* Actually compare the fields. */
1874
1875       if (key->random)
1876         diff = compare_random (texta, lena, textb, lenb);
1877       else if (key->numeric | key->general_numeric)
1878         {
1879           char savea = *lima, saveb = *limb;
1880
1881           *lima = *limb = '\0';
1882           diff = ((key->numeric ? numcompare : general_numcompare)
1883                   (texta, textb));
1884           *lima = savea, *limb = saveb;
1885         }
1886       else if (key->version)
1887         diff = compare_version (texta, lena, textb, lenb);
1888       else if (key->month)
1889         diff = getmonth (texta, lena) - getmonth (textb, lenb);
1890       /* Sorting like this may become slow, so in a simple locale the user
1891          can select a faster sort that is similar to ascii sort.  */
1892       else if (hard_LC_COLLATE)
1893         {
1894           if (ignore || translate)
1895             {
1896               char buf[4000];
1897               size_t size = lena + 1 + lenb + 1;
1898               char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1899               char *copy_b = copy_a + lena + 1;
1900               size_t new_len_a, new_len_b, i;
1901
1902               /* Ignore and/or translate chars before comparing.  */
1903               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1904                 {
1905                   if (i < lena)
1906                     {
1907                       copy_a[new_len_a] = (translate
1908                                            ? translate[to_uchar (texta[i])]
1909                                            : texta[i]);
1910                       if (!ignore || !ignore[to_uchar (texta[i])])
1911                         ++new_len_a;
1912                     }
1913                   if (i < lenb)
1914                     {
1915                       copy_b[new_len_b] = (translate
1916                                            ? translate[to_uchar (textb[i])]
1917                                            : textb [i]);
1918                       if (!ignore || !ignore[to_uchar (textb[i])])
1919                         ++new_len_b;
1920                     }
1921                 }
1922
1923               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1924
1925               if (sizeof buf < size)
1926                 free (copy_a);
1927             }
1928           else if (lena == 0)
1929             diff = - NONZERO (lenb);
1930           else if (lenb == 0)
1931             goto greater;
1932           else
1933             diff = xmemcoll (texta, lena, textb, lenb);
1934         }
1935       else if (ignore)
1936         {
1937 #define CMP_WITH_IGNORE(A, B)                                           \
1938   do                                                                    \
1939     {                                                                   \
1940           for (;;)                                                      \
1941             {                                                           \
1942               while (texta < lima && ignore[to_uchar (*texta)])         \
1943                 ++texta;                                                \
1944               while (textb < limb && ignore[to_uchar (*textb)])         \
1945                 ++textb;                                                \
1946               if (! (texta < lima && textb < limb))                     \
1947                 break;                                                  \
1948               diff = to_uchar (A) - to_uchar (B);                       \
1949               if (diff)                                                 \
1950                 goto not_equal;                                         \
1951               ++texta;                                                  \
1952               ++textb;                                                  \
1953             }                                                           \
1954                                                                         \
1955           diff = (texta < lima) - (textb < limb);                       \
1956     }                                                                   \
1957   while (0)
1958
1959           if (translate)
1960             CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1961                              translate[to_uchar (*textb)]);
1962           else
1963             CMP_WITH_IGNORE (*texta, *textb);
1964         }
1965       else if (lena == 0)
1966         diff = - NONZERO (lenb);
1967       else if (lenb == 0)
1968         goto greater;
1969       else
1970         {
1971           if (translate)
1972             {
1973               while (texta < lima && textb < limb)
1974                 {
1975                   diff = (to_uchar (translate[to_uchar (*texta++)])
1976                           - to_uchar (translate[to_uchar (*textb++)]));
1977                   if (diff)
1978                     goto not_equal;
1979                 }
1980             }
1981           else
1982             {
1983               diff = memcmp (texta, textb, MIN (lena, lenb));
1984               if (diff)
1985                 goto not_equal;
1986             }
1987           diff = lena < lenb ? -1 : lena != lenb;
1988         }
1989
1990       if (diff)
1991         goto not_equal;
1992
1993       key = key->next;
1994       if (! key)
1995         break;
1996
1997       /* Find the beginning and limit of the next field.  */
1998       if (key->eword != SIZE_MAX)
1999         lima = limfield (a, key), limb = limfield (b, key);
2000       else
2001         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2002
2003       if (key->sword != SIZE_MAX)
2004         texta = begfield (a, key), textb = begfield (b, key);
2005       else
2006         {
2007           texta = a->text, textb = b->text;
2008           if (key->skipsblanks)
2009             {
2010               while (texta < lima && blanks[to_uchar (*texta)])
2011                 ++texta;
2012               while (textb < limb && blanks[to_uchar (*textb)])
2013                 ++textb;
2014             }
2015         }
2016     }
2017
2018   return 0;
2019
2020  greater:
2021   diff = 1;
2022  not_equal:
2023   return key->reverse ? -diff : diff;
2024 }
2025
2026 /* Compare two lines A and B, returning negative, zero, or positive
2027    depending on whether A compares less than, equal to, or greater than B. */
2028
2029 static int
2030 compare (const struct line *a, const struct line *b)
2031 {
2032   int diff;
2033   size_t alen, blen;
2034
2035   /* First try to compare on the specified keys (if any).
2036      The only two cases with no key at all are unadorned sort,
2037      and unadorned sort -r. */
2038   if (keylist)
2039     {
2040       diff = keycompare (a, b);
2041       if (diff | unique | stable)
2042         return diff;
2043     }
2044
2045   /* If the keys all compare equal (or no keys were specified)
2046      fall through to the default comparison.  */
2047   alen = a->length - 1, blen = b->length - 1;
2048
2049   if (alen == 0)
2050     diff = - NONZERO (blen);
2051   else if (blen == 0)
2052     diff = 1;
2053   else if (hard_LC_COLLATE)
2054     diff = xmemcoll (a->text, alen, b->text, blen);
2055   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2056     diff = alen < blen ? -1 : alen != blen;
2057
2058   return reverse ? -diff : diff;
2059 }
2060
2061 /* Check that the lines read from FILE_NAME come in order.  Return
2062    true if they are in order.  If CHECKONLY == 'c', also print a
2063    diagnostic (FILE_NAME, line number, contents of line) to stderr if
2064    they are not in order.  */
2065
2066 static bool
2067 check (char const *file_name, char checkonly)
2068 {
2069   FILE *fp = xfopen (file_name, "r");
2070   struct buffer buf;            /* Input buffer. */
2071   struct line temp;             /* Copy of previous line. */
2072   size_t alloc = 0;
2073   uintmax_t line_number = 0;
2074   struct keyfield const *key = keylist;
2075   bool nonunique = ! unique;
2076   bool ordered = true;
2077
2078   initbuf (&buf, sizeof (struct line),
2079            MAX (merge_buffer_size, sort_size));
2080   temp.text = NULL;
2081
2082   while (fillbuf (&buf, fp, file_name))
2083     {
2084       struct line const *line = buffer_linelim (&buf);
2085       struct line const *linebase = line - buf.nlines;
2086
2087       /* Make sure the line saved from the old buffer contents is
2088          less than or equal to the first line of the new buffer. */
2089       if (alloc && nonunique <= compare (&temp, line - 1))
2090         {
2091         found_disorder:
2092           {
2093             if (checkonly == 'c')
2094               {
2095                 struct line const *disorder_line = line - 1;
2096                 uintmax_t disorder_line_number =
2097                   buffer_linelim (&buf) - disorder_line + line_number;
2098                 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
2099                 fprintf (stderr, _("%s: %s:%s: disorder: "),
2100                          program_name, file_name,
2101                          umaxtostr (disorder_line_number, hr_buf));
2102                 write_bytes (disorder_line->text, disorder_line->length,
2103                              stderr, _("standard error"));
2104               }
2105
2106             ordered = false;
2107             break;
2108           }
2109         }
2110
2111       /* Compare each line in the buffer with its successor.  */
2112       while (linebase < --line)
2113         if (nonunique <= compare (line, line - 1))
2114           goto found_disorder;
2115
2116       line_number += buf.nlines;
2117
2118       /* Save the last line of the buffer.  */
2119       if (alloc < line->length)
2120         {
2121           do
2122             {
2123               alloc *= 2;
2124               if (! alloc)
2125                 {
2126                   alloc = line->length;
2127                   break;
2128                 }
2129             }
2130           while (alloc < line->length);
2131
2132           temp.text = xrealloc (temp.text, alloc);
2133         }
2134       memcpy (temp.text, line->text, line->length);
2135       temp.length = line->length;
2136       if (key)
2137         {
2138           temp.keybeg = temp.text + (line->keybeg - line->text);
2139           temp.keylim = temp.text + (line->keylim - line->text);
2140         }
2141     }
2142
2143   xfclose (fp, file_name);
2144   free (buf.buf);
2145   free (temp.text);
2146   return ordered;
2147 }
2148
2149 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
2150    files (all of which are at the start of the FILES array), and
2151    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2152    Close input and output files before returning.
2153    OUTPUT_FILE gives the name of the output file.  If it is NULL,
2154    the output file is standard output.  If OFP is NULL, the output
2155    file has not been opened yet (or written to, if standard output).  */
2156
2157 static void
2158 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2159           FILE *ofp, char const *output_file)
2160 {
2161   FILE **fps = xnmalloc (nmerge, sizeof *fps);
2162                                 /* Input streams for each file.  */
2163   struct buffer *buffer = xnmalloc (nmerge, sizeof *buffer);
2164                                 /* Input buffers for each file. */
2165   struct line saved;            /* Saved line storage for unique check. */
2166   struct line const *savedline = NULL;
2167                                 /* &saved if there is a saved line. */
2168   size_t savealloc = 0;         /* Size allocated for the saved line. */
2169   struct line const **cur = xnmalloc (nmerge, sizeof *cur);
2170                                 /* Current line in each line table. */
2171   struct line const **base = xnmalloc (nmerge, sizeof *base);
2172                                 /* Base of each line table.  */
2173   size_t *ord = xnmalloc (nmerge, sizeof *ord);
2174                                 /* Table representing a permutation of fps,
2175                                    such that cur[ord[0]] is the smallest line
2176                                    and will be next output. */
2177   size_t i;
2178   size_t j;
2179   size_t t;
2180   struct keyfield const *key = keylist;
2181   saved.text = NULL;
2182
2183   /* Read initial lines from each input file. */
2184   for (i = 0; i < nfiles; )
2185     {
2186       fps[i] = (files[i].pid
2187                 ? open_temp (files[i].name, files[i].pid)
2188                 : xfopen (files[i].name, "r"));
2189       initbuf (&buffer[i], sizeof (struct line),
2190                MAX (merge_buffer_size, sort_size / nfiles));
2191       if (fillbuf (&buffer[i], fps[i], files[i].name))
2192         {
2193           struct line const *linelim = buffer_linelim (&buffer[i]);
2194           cur[i] = linelim - 1;
2195           base[i] = linelim - buffer[i].nlines;
2196           i++;
2197         }
2198       else
2199         {
2200           /* fps[i] is empty; eliminate it from future consideration.  */
2201           xfclose (fps[i], files[i].name);
2202           if (i < ntemps)
2203             {
2204               ntemps--;
2205               zaptemp (files[i].name);
2206             }
2207           free (buffer[i].buf);
2208           --nfiles;
2209           for (j = i; j < nfiles; ++j)
2210             files[j] = files[j + 1];
2211         }
2212     }
2213
2214   if (! ofp)
2215     ofp = xfopen (output_file, "w");
2216
2217   /* Set up the ord table according to comparisons among input lines.
2218      Since this only reorders two items if one is strictly greater than
2219      the other, it is stable. */
2220   for (i = 0; i < nfiles; ++i)
2221     ord[i] = i;
2222   for (i = 1; i < nfiles; ++i)
2223     if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2224       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2225
2226   /* Repeatedly output the smallest line until no input remains. */
2227   while (nfiles)
2228     {
2229       struct line const *smallest = cur[ord[0]];
2230
2231       /* If uniquified output is turned on, output only the first of
2232          an identical series of lines. */
2233       if (unique)
2234         {
2235           if (savedline && compare (savedline, smallest))
2236             {
2237               savedline = NULL;
2238               write_bytes (saved.text, saved.length, ofp, output_file);
2239             }
2240           if (!savedline)
2241             {
2242               savedline = &saved;
2243               if (savealloc < smallest->length)
2244                 {
2245                   do
2246                     if (! savealloc)
2247                       {
2248                         savealloc = smallest->length;
2249                         break;
2250                       }
2251                   while ((savealloc *= 2) < smallest->length);
2252
2253                   saved.text = xrealloc (saved.text, savealloc);
2254                 }
2255               saved.length = smallest->length;
2256               memcpy (saved.text, smallest->text, saved.length);
2257               if (key)
2258                 {
2259                   saved.keybeg =
2260                     saved.text + (smallest->keybeg - smallest->text);
2261                   saved.keylim =
2262                     saved.text + (smallest->keylim - smallest->text);
2263                 }
2264             }
2265         }
2266       else
2267         write_bytes (smallest->text, smallest->length, ofp, output_file);
2268
2269       /* Check if we need to read more lines into core. */
2270       if (base[ord[0]] < smallest)
2271         cur[ord[0]] = smallest - 1;
2272       else
2273         {
2274           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2275             {
2276               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2277               cur[ord[0]] = linelim - 1;
2278               base[ord[0]] = linelim - buffer[ord[0]].nlines;
2279             }
2280           else
2281             {
2282               /* We reached EOF on fps[ord[0]].  */
2283               for (i = 1; i < nfiles; ++i)
2284                 if (ord[i] > ord[0])
2285                   --ord[i];
2286               --nfiles;
2287               xfclose (fps[ord[0]], files[ord[0]].name);
2288               if (ord[0] < ntemps)
2289                 {
2290                   ntemps--;
2291                   zaptemp (files[ord[0]].name);
2292                 }
2293               free (buffer[ord[0]].buf);
2294               for (i = ord[0]; i < nfiles; ++i)
2295                 {
2296                   fps[i] = fps[i + 1];
2297                   files[i] = files[i + 1];
2298                   buffer[i] = buffer[i + 1];
2299                   cur[i] = cur[i + 1];
2300                   base[i] = base[i + 1];
2301                 }
2302               for (i = 0; i < nfiles; ++i)
2303                 ord[i] = ord[i + 1];
2304               continue;
2305             }
2306         }
2307
2308       /* The new line just read in may be larger than other lines
2309          already in main memory; push it back in the queue until we
2310          encounter a line larger than it.  Optimize for the common
2311          case where the new line is smallest.  */
2312       {
2313         size_t lo = 1;
2314         size_t hi = nfiles;
2315         size_t probe = lo;
2316         size_t ord0 = ord[0];
2317         size_t count_of_smaller_lines;
2318
2319         while (lo < hi)
2320           {
2321             int cmp = compare (cur[ord0], cur[ord[probe]]);
2322             if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2323               hi = probe;
2324             else
2325               lo = probe + 1;
2326             probe = (lo + hi) / 2;
2327           }
2328
2329         count_of_smaller_lines = lo - 1;
2330         for (j = 0; j < count_of_smaller_lines; j++)
2331           ord[j] = ord[j + 1];
2332         ord[count_of_smaller_lines] = ord0;
2333       }
2334
2335       /* Free up some resources every once in a while.  */
2336       if (MAX_PROCS_BEFORE_REAP < nprocs)
2337         reap_some ();
2338     }
2339
2340   if (unique && savedline)
2341     {
2342       write_bytes (saved.text, saved.length, ofp, output_file);
2343       free (saved.text);
2344     }
2345
2346   xfclose (ofp, output_file);
2347   free(fps);
2348   free(buffer);
2349   free(ord);
2350   free(base);
2351   free(cur);
2352 }
2353
2354 /* Merge into T the two sorted arrays of lines LO (with NLO members)
2355    and HI (with NHI members).  T, LO, and HI point just past their
2356    respective arrays, and the arrays are in reverse order.  NLO and
2357    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
2358
2359 static inline void
2360 mergelines (struct line *t,
2361             struct line const *lo, size_t nlo,
2362             struct line const *hi, size_t nhi)
2363 {
2364   for (;;)
2365     if (compare (lo - 1, hi - 1) <= 0)
2366       {
2367         *--t = *--lo;
2368         if (! --nlo)
2369           {
2370             /* HI - NHI equalled T - (NLO + NHI) when this function
2371                began.  Therefore HI must equal T now, and there is no
2372                need to copy from HI to T.  */
2373             return;
2374           }
2375       }
2376     else
2377       {
2378         *--t = *--hi;
2379         if (! --nhi)
2380           {
2381             do
2382               *--t = *--lo;
2383             while (--nlo);
2384
2385             return;
2386           }
2387       }
2388 }
2389
2390 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
2391    NLINES must be at least 2.
2392    The input and output arrays are in reverse order, and LINES and
2393    TEMP point just past the end of their respective arrays.
2394
2395    Use a recursive divide-and-conquer algorithm, in the style
2396    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
2397    the optimization suggested by exercise 5.2.4-10; this requires room
2398    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
2399    writes that this memory optimization was originally published by
2400    D. A. Bell, Comp J. 1 (1958), 75.  */
2401
2402 static void
2403 sortlines (struct line *lines, size_t nlines, struct line *temp)
2404 {
2405   if (nlines == 2)
2406     {
2407       if (0 < compare (&lines[-1], &lines[-2]))
2408         {
2409           struct line tmp = lines[-1];
2410           lines[-1] = lines[-2];
2411           lines[-2] = tmp;
2412         }
2413     }
2414   else
2415     {
2416       size_t nlo = nlines / 2;
2417       size_t nhi = nlines - nlo;
2418       struct line *lo = lines;
2419       struct line *hi = lines - nlo;
2420       struct line *sorted_lo = temp;
2421
2422       sortlines (hi, nhi, temp);
2423       if (1 < nlo)
2424         sortlines_temp (lo, nlo, sorted_lo);
2425       else
2426         sorted_lo[-1] = lo[-1];
2427
2428       mergelines (lines, sorted_lo, nlo, hi, nhi);
2429     }
2430 }
2431
2432 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
2433    rather than sorting in place.  */
2434
2435 static void
2436 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
2437 {
2438   if (nlines == 2)
2439     {
2440       /* Declare `swap' as int, not bool, to work around a bug
2441          <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
2442          in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
2443       int swap = (0 < compare (&lines[-1], &lines[-2]));
2444       temp[-1] = lines[-1 - swap];
2445       temp[-2] = lines[-2 + swap];
2446     }
2447   else
2448     {
2449       size_t nlo = nlines / 2;
2450       size_t nhi = nlines - nlo;
2451       struct line *lo = lines;
2452       struct line *hi = lines - nlo;
2453       struct line *sorted_hi = temp - nlo;
2454
2455       sortlines_temp (hi, nhi, sorted_hi);
2456       if (1 < nlo)
2457         sortlines (lo, nlo, temp);
2458
2459       mergelines (temp, lo, nlo, sorted_hi, nhi);
2460     }
2461 }
2462
2463 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
2464    the same as OUTFILE.  If found, merge the found instances (and perhaps
2465    some other files) into a temporary file so that it can in turn be
2466    merged into OUTFILE without destroying OUTFILE before it is completely
2467    read.  Return the new value of NFILES, which differs from the old if
2468    some merging occurred.
2469
2470    This test ensures that an otherwise-erroneous use like
2471    "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
2472    It's not clear that POSIX requires this nicety.
2473    Detect common error cases, but don't try to catch obscure cases like
2474    "cat ... FILE ... | sort -m -o FILE"
2475    where traditional "sort" doesn't copy the input and where
2476    people should know that they're getting into trouble anyway.
2477    Catching these obscure cases would slow down performance in
2478    common cases.  */
2479
2480 static size_t
2481 avoid_trashing_input (struct sortfile *files, size_t ntemps,
2482                       size_t nfiles, char const *outfile)
2483 {
2484   size_t i;
2485   bool got_outstat = false;
2486   struct stat outstat;
2487
2488   for (i = ntemps; i < nfiles; i++)
2489     {
2490       bool is_stdin = STREQ (files[i].name, "-");
2491       bool same;
2492       struct stat instat;
2493
2494       if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
2495         same = true;
2496       else
2497         {
2498           if (! got_outstat)
2499             {
2500               if ((outfile
2501                    ? stat (outfile, &outstat)
2502                    : fstat (STDOUT_FILENO, &outstat))
2503                   != 0)
2504                 break;
2505               got_outstat = true;
2506             }
2507
2508           same = (((is_stdin
2509                     ? fstat (STDIN_FILENO, &instat)
2510                     : stat (files[i].name, &instat))
2511                    == 0)
2512                   && SAME_INODE (instat, outstat));
2513         }
2514
2515       if (same)
2516         {
2517           FILE *tftp;
2518           pid_t pid;
2519           char *temp = create_temp (&tftp, &pid);
2520           mergefps (&files[i],0, nfiles - i, tftp, temp);
2521           files[i].name = temp;
2522           files[i].pid = pid;
2523           return i + 1;
2524         }
2525     }
2526
2527   return nfiles;
2528 }
2529
2530 /* Merge the input FILES.  NTEMPS is the number of files at the
2531    start of FILES that are temporary; it is zero at the top level.
2532    NFILES is the total number of files.  Put the output in
2533    OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
2534
2535 static void
2536 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
2537        char const *output_file)
2538 {
2539   while (nmerge < nfiles)
2540     {
2541       /* Number of input files processed so far.  */
2542       size_t in;
2543
2544       /* Number of output files generated so far.  */
2545       size_t out;
2546
2547       /* nfiles % NMERGE; this counts input files that are left over
2548          after all full-sized merges have been done.  */
2549       size_t remainder;
2550
2551       /* Number of easily-available slots at the next loop iteration.  */
2552       size_t cheap_slots;
2553
2554       /* Do as many NMERGE-size merges as possible.  */
2555       for (out = in = 0; out < nfiles / nmerge; out++, in += nmerge)
2556         {
2557           FILE *tfp;
2558           pid_t pid;
2559           char *temp = create_temp (&tfp, &pid);
2560           size_t nt = MIN (ntemps, nmerge);
2561           ntemps -= nt;
2562           mergefps (&files[in], nt, nmerge, tfp, temp);
2563           files[out].name = temp;
2564           files[out].pid = pid;
2565         }
2566
2567       remainder = nfiles - in;
2568       cheap_slots = nmerge - out % nmerge;
2569
2570       if (cheap_slots < remainder)
2571         {
2572           /* So many files remain that they can't all be put into the last
2573              NMERGE-sized output window.  Do one more merge.  Merge as few
2574              files as possible, to avoid needless I/O.  */
2575           size_t nshortmerge = remainder - cheap_slots + 1;
2576           FILE *tfp;
2577           pid_t pid;
2578           char *temp = create_temp (&tfp, &pid);
2579           size_t nt = MIN (ntemps, nshortmerge);
2580           ntemps -= nt;
2581           mergefps (&files[in], nt, nshortmerge, tfp, temp);
2582           files[out].name = temp;
2583           files[out++].pid = pid;
2584           in += nshortmerge;
2585         }
2586
2587       /* Put the remaining input files into the last NMERGE-sized output
2588          window, so they will be merged in the next pass.  */
2589       memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2590       ntemps += out;
2591       nfiles -= in - out;
2592     }
2593
2594   nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2595   mergefps (files, ntemps, nfiles, NULL, output_file);
2596 }
2597
2598 /* Sort NFILES FILES onto OUTPUT_FILE. */
2599
2600 static void
2601 sort (char * const *files, size_t nfiles, char const *output_file)
2602 {
2603   struct buffer buf;
2604   size_t ntemps = 0;
2605   bool output_file_created = false;
2606
2607   buf.alloc = 0;
2608
2609   while (nfiles)
2610     {
2611       char const *temp_output;
2612       char const *file = *files;
2613       FILE *fp = xfopen (file, "r");
2614       FILE *tfp;
2615       size_t bytes_per_line = (2 * sizeof (struct line)
2616                                - sizeof (struct line) / 2);
2617
2618       if (! buf.alloc)
2619         initbuf (&buf, bytes_per_line,
2620                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2621       buf.eof = false;
2622       files++;
2623       nfiles--;
2624
2625       while (fillbuf (&buf, fp, file))
2626         {
2627           struct line *line;
2628           struct line *linebase;
2629
2630           if (buf.eof && nfiles
2631               && (bytes_per_line + 1
2632                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2633             {
2634               /* End of file, but there is more input and buffer room.
2635                  Concatenate the next input file; this is faster in
2636                  the usual case.  */
2637               buf.left = buf.used;
2638               break;
2639             }
2640
2641           line = buffer_linelim (&buf);
2642           linebase = line - buf.nlines;
2643           if (1 < buf.nlines)
2644             sortlines (line, buf.nlines, linebase);
2645           if (buf.eof && !nfiles && !ntemps && !buf.left)
2646             {
2647               xfclose (fp, file);
2648               tfp = xfopen (output_file, "w");
2649               temp_output = output_file;
2650               output_file_created = true;
2651             }
2652           else
2653             {
2654               ++ntemps;
2655               temp_output = create_temp (&tfp, NULL);
2656             }
2657
2658           do
2659             {
2660               line--;
2661               write_bytes (line->text, line->length, tfp, temp_output);
2662               if (unique)
2663                 while (linebase < line && compare (line, line - 1) == 0)
2664                   line--;
2665             }
2666           while (linebase < line);
2667
2668           xfclose (tfp, temp_output);
2669
2670           /* Free up some resources every once in a while.  */
2671           if (MAX_PROCS_BEFORE_REAP < nprocs)
2672             reap_some ();
2673
2674           if (output_file_created)
2675             goto finish;
2676         }
2677       xfclose (fp, file);
2678     }
2679
2680  finish:
2681   free (buf.buf);
2682
2683   if (! output_file_created)
2684     {
2685       size_t i;
2686       struct tempnode *node = temphead;
2687       struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2688       for (i = 0; node; i++)
2689         {
2690           tempfiles[i].name = node->name;
2691           tempfiles[i].pid = node->pid;
2692           node = node->next;
2693         }
2694       merge (tempfiles, ntemps, ntemps, output_file);
2695       free (tempfiles);
2696     }
2697 }
2698
2699 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list.  */
2700
2701 static void
2702 insertkey (struct keyfield *key_arg)
2703 {
2704   struct keyfield **p;
2705   struct keyfield *key = xmemdup (key_arg, sizeof *key);
2706
2707   for (p = &keylist; *p; p = &(*p)->next)
2708     continue;
2709   *p = key;
2710   key->next = NULL;
2711 }
2712
2713 /* Report a bad field specification SPEC, with extra info MSGID.  */
2714
2715 static void badfieldspec (char const *, char const *)
2716      ATTRIBUTE_NORETURN;
2717 static void
2718 badfieldspec (char const *spec, char const *msgid)
2719 {
2720   error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
2721          _(msgid), quote (spec));
2722   abort ();
2723 }
2724
2725 /* Report incompatible options.  */
2726
2727 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
2728 static void
2729 incompatible_options (char const *opts)
2730 {
2731   error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
2732   abort ();
2733 }
2734
2735 /* Check compatibility of ordering options.  */
2736
2737 static void
2738 check_ordering_compatibility (void)
2739 {
2740   struct keyfield const *key;
2741
2742   for (key = keylist; key; key = key->next)
2743     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
2744               + key->version + !!key->ignore))
2745         || (key->random && key->translate))
2746       {
2747         /* The following is too big, but guaranteed to be "big enough". */
2748         char opts[sizeof short_options];
2749         char *p = opts;
2750         if (key->ignore == nondictionary)
2751           *p++ = 'd';
2752         if (key->translate)
2753           *p++ = 'f';
2754         if (key->general_numeric)
2755           *p++ = 'g';
2756         if (key->ignore == nonprinting)
2757           *p++ = 'i';
2758         if (key->month)
2759           *p++ = 'M';
2760         if (key->numeric)
2761           *p++ = 'n';
2762         if (key->version)
2763           *p++ = 'V';
2764         if (key->random)
2765           *p++ = 'R';
2766         *p = '\0';
2767         incompatible_options (opts);
2768       }
2769 }
2770
2771 /* Parse the leading integer in STRING and store the resulting value
2772    (which must fit into size_t) into *VAL.  Return the address of the
2773    suffix after the integer.  If the value is too large, silently
2774    substitute SIZE_MAX.  If MSGID is NULL, return NULL after
2775    failure; otherwise, report MSGID and exit on failure.  */
2776
2777 static char const *
2778 parse_field_count (char const *string, size_t *val, char const *msgid)
2779 {
2780   char *suffix;
2781   uintmax_t n;
2782
2783   switch (xstrtoumax (string, &suffix, 10, &n, ""))
2784     {
2785     case LONGINT_OK:
2786     case LONGINT_INVALID_SUFFIX_CHAR:
2787       *val = n;
2788       if (*val == n)
2789         break;
2790       /* Fall through.  */
2791     case LONGINT_OVERFLOW:
2792     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2793       *val = SIZE_MAX;
2794       break;
2795
2796     case LONGINT_INVALID:
2797       if (msgid)
2798         error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
2799                _(msgid), quote (string));
2800       return NULL;
2801     }
2802
2803   return suffix;
2804 }
2805
2806 /* Handle interrupts and hangups. */
2807
2808 static void
2809 sighandler (int sig)
2810 {
2811   if (! SA_NOCLDSTOP)
2812     signal (sig, SIG_IGN);
2813
2814   cleanup ();
2815
2816   signal (sig, SIG_DFL);
2817   raise (sig);
2818 }
2819
2820 /* Set the ordering options for KEY specified in S.
2821    Return the address of the first character in S that
2822    is not a valid ordering option.
2823    BLANKTYPE is the kind of blanks that 'b' should skip. */
2824
2825 static char *
2826 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2827 {
2828   while (*s)
2829     {
2830       switch (*s)
2831         {
2832         case 'b':
2833           if (blanktype == bl_start || blanktype == bl_both)
2834             key->skipsblanks = true;
2835           if (blanktype == bl_end || blanktype == bl_both)
2836             key->skipeblanks = true;
2837           break;
2838         case 'd':
2839           key->ignore = nondictionary;
2840           break;
2841         case 'f':
2842           key->translate = fold_toupper;
2843           break;
2844         case 'g':
2845           key->general_numeric = true;
2846           break;
2847         case 'i':
2848           /* Option order should not matter, so don't let -i override
2849              -d.  -d implies -i, but -i does not imply -d.  */
2850           if (! key->ignore)
2851             key->ignore = nonprinting;
2852           break;
2853         case 'M':
2854           key->month = true;
2855           break;
2856         case 'n':
2857           key->numeric = true;
2858           break;
2859         case 'R':
2860           key->random = true;
2861           break;
2862         case 'r':
2863           key->reverse = true;
2864           break;
2865         case 'V':
2866           key->version = true;
2867           break;
2868         default:
2869           return (char *) s;
2870         }
2871       ++s;
2872     }
2873   return (char *) s;
2874 }
2875
2876 static struct keyfield *
2877 key_init (struct keyfield *key)
2878 {
2879   memset (key, 0, sizeof *key);
2880   key->eword = SIZE_MAX;
2881   return key;
2882 }
2883
2884 int
2885 main (int argc, char **argv)
2886 {
2887   struct keyfield *key;
2888   struct keyfield key_buf;
2889   struct keyfield gkey;
2890   char const *s;
2891   int c = 0;
2892   char checkonly = 0;
2893   bool mergeonly = false;
2894   char *random_source = NULL;
2895   bool need_random = false;
2896   size_t nfiles = 0;
2897   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2898   bool obsolete_usage = (posix2_version () < 200112);
2899   char **files;
2900   char *files_from = NULL;
2901   struct Tokens tok;
2902   char const *outfile = NULL;
2903
2904   initialize_main (&argc, &argv);
2905   set_program_name (argv[0]);
2906   setlocale (LC_ALL, "");
2907   bindtextdomain (PACKAGE, LOCALEDIR);
2908   textdomain (PACKAGE);
2909
2910   initialize_exit_failure (SORT_FAILURE);
2911
2912   hard_LC_COLLATE = hard_locale (LC_COLLATE);
2913 #if HAVE_NL_LANGINFO
2914   hard_LC_TIME = hard_locale (LC_TIME);
2915 #endif
2916
2917   /* Get locale's representation of the decimal point.  */
2918   {
2919     struct lconv const *locale = localeconv ();
2920
2921     /* If the locale doesn't define a decimal point, or if the decimal
2922        point is multibyte, use the C locale's decimal point.  FIXME:
2923        add support for multibyte decimal points.  */
2924     decimal_point = to_uchar (locale->decimal_point[0]);
2925     if (! decimal_point || locale->decimal_point[1])
2926       decimal_point = '.';
2927
2928     /* FIXME: add support for multibyte thousands separators.  */
2929     thousands_sep = to_uchar (*locale->thousands_sep);
2930     if (! thousands_sep || locale->thousands_sep[1])
2931       thousands_sep = -1;
2932   }
2933
2934   have_read_stdin = false;
2935   inittables ();
2936
2937   {
2938     size_t i;
2939     static int const sig[] =
2940       {
2941         /* The usual suspects.  */
2942         SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
2943 #ifdef SIGPOLL
2944         SIGPOLL,
2945 #endif
2946 #ifdef SIGPROF
2947         SIGPROF,
2948 #endif
2949 #ifdef SIGVTALRM
2950         SIGVTALRM,
2951 #endif
2952 #ifdef SIGXCPU
2953         SIGXCPU,
2954 #endif
2955 #ifdef SIGXFSZ
2956         SIGXFSZ,
2957 #endif
2958       };
2959     enum { nsigs = sizeof sig / sizeof sig[0] };
2960
2961 #if SA_NOCLDSTOP
2962     struct sigaction act;
2963
2964     sigemptyset (&caught_signals);
2965     for (i = 0; i < nsigs; i++)
2966       {
2967         sigaction (sig[i], NULL, &act);
2968         if (act.sa_handler != SIG_IGN)
2969           sigaddset (&caught_signals, sig[i]);
2970       }
2971
2972     act.sa_handler = sighandler;
2973     act.sa_mask = caught_signals;
2974     act.sa_flags = 0;
2975
2976     for (i = 0; i < nsigs; i++)
2977       if (sigismember (&caught_signals, sig[i]))
2978         sigaction (sig[i], &act, NULL);
2979 #else
2980     for (i = 0; i < nsigs; i++)
2981       if (signal (sig[i], SIG_IGN) != SIG_IGN)
2982         {
2983           signal (sig[i], sighandler);
2984           siginterrupt (sig[i], 1);
2985         }
2986 #endif
2987   }
2988
2989   /* The signal mask is known, so it is safe to invoke exit_cleanup.  */
2990   atexit (exit_cleanup);
2991
2992   gkey.sword = gkey.eword = SIZE_MAX;
2993   gkey.ignore = NULL;
2994   gkey.translate = NULL;
2995   gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false;
2996   gkey.month = gkey.reverse = false;
2997   gkey.skipsblanks = gkey.skipeblanks = false;
2998
2999   files = xnmalloc (argc, sizeof *files);
3000
3001   for (;;)
3002     {
3003       /* Parse an operand as a file after "--" was seen; or if
3004          pedantic and a file was seen, unless the POSIX version
3005          predates 1003.1-2001 and -c was not seen and the operand is
3006          "-o FILE" or "-oFILE".  */
3007       int oi = -1;
3008
3009       if (c == -1
3010           || (posixly_correct && nfiles != 0
3011               && ! (obsolete_usage
3012                     && ! checkonly
3013                     && optind != argc
3014                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
3015                     && (argv[optind][2] || optind + 1 != argc)))
3016           || ((c = getopt_long (argc, argv, short_options,
3017                                 long_options, &oi))
3018               == -1))
3019         {
3020           if (argc <= optind)
3021             break;
3022           files[nfiles++] = argv[optind++];
3023         }
3024       else switch (c)
3025         {
3026         case 1:
3027           key = NULL;
3028           if (optarg[0] == '+')
3029             {
3030               bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
3031                                       && ISDIGIT (argv[optind][1]));
3032               obsolete_usage |= minus_pos_usage & ~posixly_correct;
3033               if (obsolete_usage)
3034                 {
3035                   /* Treat +POS1 [-POS2] as a key if possible; but silently
3036                      treat an operand as a file if it is not a valid +POS1.  */
3037                   key = key_init (&key_buf);
3038                   s = parse_field_count (optarg + 1, &key->sword, NULL);
3039                   if (s && *s == '.')
3040                     s = parse_field_count (s + 1, &key->schar, NULL);
3041                   if (! (key->sword | key->schar))
3042                     key->sword = SIZE_MAX;
3043                   if (! s || *set_ordering (s, key, bl_start))
3044                     key = NULL;
3045                   else
3046                     {
3047                       if (minus_pos_usage)
3048                         {
3049                           char const *optarg1 = argv[optind++];
3050                           s = parse_field_count (optarg1 + 1, &key->eword,
3051                                              N_("invalid number after `-'"));
3052                           if (*s == '.')
3053                             s = parse_field_count (s + 1, &key->echar,
3054                                                N_("invalid number after `.'"));
3055                           if (*set_ordering (s, key, bl_end))
3056                             badfieldspec (optarg1,
3057                                       N_("stray character in field spec"));
3058                         }
3059                       insertkey (key);
3060                     }
3061                 }
3062             }
3063           if (! key)
3064             files[nfiles++] = optarg;
3065           break;
3066
3067         case SORT_OPTION:
3068           c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
3069           /* Fall through. */
3070         case 'b':
3071         case 'd':
3072         case 'f':
3073         case 'g':
3074         case 'i':
3075         case 'M':
3076         case 'n':
3077         case 'r':
3078         case 'R':
3079         case 'V':
3080           {
3081             char str[2];
3082             str[0] = c;
3083             str[1] = '\0';
3084             set_ordering (str, &gkey, bl_both);
3085           }
3086           break;
3087
3088         case CHECK_OPTION:
3089           c = (optarg
3090                ? XARGMATCH ("--check", optarg, check_args, check_types)
3091                : 'c');
3092           /* Fall through.  */
3093         case 'c':
3094         case 'C':
3095           if (checkonly && checkonly != c)
3096             incompatible_options ("cC");
3097           checkonly = c;
3098           break;
3099
3100         case COMPRESS_PROGRAM_OPTION:
3101           if (compress_program && !STREQ (compress_program, optarg))
3102             error (SORT_FAILURE, 0, _("multiple compress programs specified"));
3103           compress_program = optarg;
3104           break;
3105
3106         case FILES0_FROM_OPTION:
3107           files_from = optarg;
3108           break;
3109
3110         case 'k':
3111           key = key_init (&key_buf);
3112
3113           /* Get POS1. */
3114           s = parse_field_count (optarg, &key->sword,
3115                                  N_("invalid number at field start"));
3116           if (! key->sword--)
3117             {
3118               /* Provoke with `sort -k0' */
3119               badfieldspec (optarg, N_("field number is zero"));
3120             }
3121           if (*s == '.')
3122             {
3123               s = parse_field_count (s + 1, &key->schar,
3124                                      N_("invalid number after `.'"));
3125               if (! key->schar--)
3126                 {
3127                   /* Provoke with `sort -k1.0' */
3128                   badfieldspec (optarg, N_("character offset is zero"));
3129                 }
3130             }
3131           if (! (key->sword | key->schar))
3132             key->sword = SIZE_MAX;
3133           s = set_ordering (s, key, bl_start);
3134           if (*s != ',')
3135             {
3136               key->eword = SIZE_MAX;
3137               key->echar = 0;
3138             }
3139           else
3140             {
3141               /* Get POS2. */
3142               s = parse_field_count (s + 1, &key->eword,
3143                                      N_("invalid number after `,'"));
3144               if (! key->eword--)
3145                 {
3146                   /* Provoke with `sort -k1,0' */
3147                   badfieldspec (optarg, N_("field number is zero"));
3148                 }
3149               if (*s == '.')
3150                 s = parse_field_count (s + 1, &key->echar,
3151                                        N_("invalid number after `.'"));
3152               else
3153                 {
3154                   /* `-k 2,3' is equivalent to `+1 -3'.  */
3155                   key->eword++;
3156                 }
3157               s = set_ordering (s, key, bl_end);
3158             }
3159           if (*s)
3160             badfieldspec (optarg, N_("stray character in field spec"));
3161           insertkey (key);
3162           break;
3163
3164         case 'm':
3165           mergeonly = true;
3166           break;
3167
3168         case NMERGE_OPTION:
3169           specify_nmerge (oi, c, optarg);
3170           break;
3171
3172         case 'o':
3173           if (outfile && !STREQ (outfile, optarg))
3174             error (SORT_FAILURE, 0, _("multiple output files specified"));
3175           outfile = optarg;
3176           break;
3177
3178         case RANDOM_SOURCE_OPTION:
3179           if (random_source && !STREQ (random_source, optarg))
3180             error (SORT_FAILURE, 0, _("multiple random sources specified"));
3181           random_source = optarg;
3182           break;
3183
3184         case 's':
3185           stable = true;
3186           break;
3187
3188         case 'S':
3189           specify_sort_size (oi, c, optarg);
3190           break;
3191
3192         case 't':
3193           {
3194             char newtab = optarg[0];
3195             if (! newtab)
3196               error (SORT_FAILURE, 0, _("empty tab"));
3197             if (optarg[1])
3198               {
3199                 if (STREQ (optarg, "\\0"))
3200                   newtab = '\0';
3201                 else
3202                   {
3203                     /* Provoke with `sort -txx'.  Complain about
3204                        "multi-character tab" instead of "multibyte tab", so
3205                        that the diagnostic's wording does not need to be
3206                        changed once multibyte characters are supported.  */
3207                     error (SORT_FAILURE, 0, _("multi-character tab %s"),
3208                            quote (optarg));
3209                   }
3210               }
3211             if (tab != TAB_DEFAULT && tab != newtab)
3212               error (SORT_FAILURE, 0, _("incompatible tabs"));
3213             tab = newtab;
3214           }
3215           break;
3216
3217         case 'T':
3218           add_temp_dir (optarg);
3219           break;
3220
3221         case 'u':
3222           unique = true;
3223           break;
3224
3225         case 'y':
3226           /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
3227              through Solaris 7.  It is also accepted by many non-Solaris
3228              "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
3229              -y is marked as obsolete starting with Solaris 8 (1999), but is
3230              still accepted as of Solaris 10 prerelease (2004).
3231
3232              Solaris 2.5.1 "sort -y 100" reads the input file "100", but
3233              emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
3234              and which in general ignores the argument after "-y" if it
3235              consists entirely of digits (it can even be empty).  */
3236           if (optarg == argv[optind - 1])
3237             {
3238               char const *p;
3239               for (p = optarg; ISDIGIT (*p); p++)
3240                 continue;
3241               optind -= (*p != '\0');
3242             }
3243           break;
3244
3245         case 'z':
3246           eolchar = 0;
3247           break;
3248
3249         case_GETOPT_HELP_CHAR;
3250
3251         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
3252
3253         default:
3254           usage (SORT_FAILURE);
3255         }
3256     }
3257
3258   if (files_from)
3259     {
3260       FILE *stream;
3261
3262       /* When using --files0-from=F, you may not specify any files
3263          on the command-line.  */
3264       if (nfiles)
3265         {
3266           error (0, 0, _("extra operand %s"), quote (files[0]));
3267           fprintf (stderr, "%s\n",
3268                    _("file operands cannot be combined with --files0-from"));
3269           usage (SORT_FAILURE);
3270         }
3271
3272       if (STREQ (files_from, "-"))
3273         stream = stdin;
3274       else
3275         {
3276           stream = fopen (files_from, "r");
3277           if (stream == NULL)
3278             error (SORT_FAILURE, errno, _("cannot open %s for reading"),
3279                    quote (files_from));
3280         }
3281
3282       readtokens0_init (&tok);
3283
3284       if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
3285         error (SORT_FAILURE, 0, _("cannot read file names from %s"),
3286                quote (files_from));
3287
3288       if (tok.n_tok)
3289         {
3290           size_t i;
3291           free (files);
3292           files = tok.tok;
3293           nfiles = tok.n_tok;
3294           for (i = 0; i < nfiles; i++)
3295           {
3296               if (STREQ (files[i], "-"))
3297                 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
3298                                           "no file name of %s allowed"),
3299                        quote (files[i]));
3300               else if (files[i][0] == '\0')
3301                 {
3302                   /* Using the standard `filename:line-number:' prefix here is
3303                      not totally appropriate, since NUL is the separator, not NL,
3304                      but it might be better than nothing.  */
3305                   unsigned long int file_number = i + 1;
3306                   error (SORT_FAILURE, 0,
3307                          _("%s:%lu: invalid zero-length file name"),
3308                          quotearg_colon (files_from), file_number);
3309                 }
3310           }
3311         }
3312       else
3313         error (SORT_FAILURE, 0, _("no input from %s"),
3314                quote (files_from));
3315     }
3316
3317   /* Inheritance of global options to individual keys. */
3318   for (key = keylist; key; key = key->next)
3319     {
3320       if (! (key->ignore
3321              || key->translate
3322              || (key->skipsblanks
3323                  | key->reverse
3324                  | key->skipeblanks
3325                  | key->month
3326                  | key->numeric
3327                  | key->version
3328                  | key->general_numeric
3329                  | key->random)))
3330         {
3331           key->ignore = gkey.ignore;
3332           key->translate = gkey.translate;
3333           key->skipsblanks = gkey.skipsblanks;
3334           key->skipeblanks = gkey.skipeblanks;
3335           key->month = gkey.month;
3336           key->numeric = gkey.numeric;
3337           key->general_numeric = gkey.general_numeric;
3338           key->random = gkey.random;
3339           key->reverse = gkey.reverse;
3340           key->version = gkey.version;
3341         }
3342
3343       need_random |= key->random;
3344     }
3345
3346   if (!keylist && (gkey.ignore
3347                    || gkey.translate
3348                    || (gkey.skipsblanks
3349                        | gkey.skipeblanks
3350                        | gkey.month
3351                        | gkey.numeric
3352                        | gkey.general_numeric
3353                        | gkey.random
3354                        | gkey.version)))
3355     {
3356       insertkey (&gkey);
3357       need_random |= gkey.random;
3358     }
3359
3360   check_ordering_compatibility ();
3361
3362   reverse = gkey.reverse;
3363
3364   if (need_random)
3365     {
3366       randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
3367       if (! randread_source)
3368         die (_("open failed"), random_source);
3369     }
3370
3371   if (temp_dir_count == 0)
3372     {
3373       char const *tmp_dir = getenv ("TMPDIR");
3374       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
3375     }
3376
3377   if (nfiles == 0)
3378     {
3379       static char *minus = "-";
3380       nfiles = 1;
3381       free (files);
3382       files = &minus;
3383     }
3384
3385   /* Need to re-check that we meet the minimum requirement for memory
3386      usage with the final value for NMERGE. */
3387   if (0 < sort_size)
3388     sort_size = MAX (sort_size, MIN_SORT_SIZE);
3389
3390   if (checkonly)
3391     {
3392       if (nfiles > 1)
3393         error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
3394                quote (files[1]), checkonly);
3395
3396       if (outfile)
3397         {
3398           static char opts[] = {0, 'o', 0};
3399           opts[0] = checkonly;
3400           incompatible_options (opts);
3401         }
3402
3403       /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
3404          input is not properly sorted.  */
3405       exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
3406     }
3407
3408   if (mergeonly)
3409     {
3410       struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
3411       size_t i;
3412
3413       for (i = 0; i < nfiles; ++i)
3414         sortfiles[i].name = files[i];
3415
3416       merge (sortfiles, 0, nfiles, outfile);
3417       IF_LINT (free (sortfiles));
3418     }
3419   else
3420     sort (files, nfiles, outfile);
3421
3422   if (have_read_stdin && fclose (stdin) == EOF)
3423     die (_("close failed"), "-");
3424
3425   exit (EXIT_SUCCESS);
3426 }