src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 1988, 1991-2008 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17    Written December 1988 by Mike Haertel.
  18    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  19    or (US mail) as Mike Haertel c/o Free Software Foundation.
  20
  21    Ørn E. Hansen added NLS support in 1997.  */
  22
  23 #include <config.h>
  24
  25 #include <getopt.h>
  26 #include <sys/types.h>
  27 #include <sys/wait.h>
  28 #include <signal.h>
  29 #include "system.h"
  30 #include "argmatch.h"
  31 #include "error.h"
  32 #include "hard-locale.h"
  33 #include "hash.h"
  34 #include "md5.h"
  35 #include "physmem.h"
  36 #include "posixver.h"
  37 #include "quote.h"
  38 #include "quotearg.h"
  39 #include "randread.h"
  40 #include "readtokens0.h"
  41 #include "stdio--.h"
  42 #include "stdlib--.h"
  43 #include "strnumcmp.h"
  44 #include "xmemcoll.h"
  45 #include "xmemxfrm.h"
  46 #include "xstrtol.h"
  47
  48 #if HAVE_SYS_RESOURCE_H
  49 # include <sys/resource.h>
  50 #endif
  51 #ifndef RLIMIT_DATA
  52 struct rlimit { size_t rlim_cur; };
  53 # define getrlimit(Resource, Rlp) (-1)
  54 #endif
  55
  56 /* The official name of this program (e.g., no `g' prefix).  */
  57 #define PROGRAM_NAME "sort"
  58
  59 #define AUTHORS \
  60   proper_name ("Mike Haertel"), \
  61   proper_name ("Paul Eggert")
  62
  63 #if HAVE_LANGINFO_CODESET
  64 # include <langinfo.h>
  65 #endif
  66
  67 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
  68    present.  */
  69 #ifndef SA_NOCLDSTOP
  70 # define SA_NOCLDSTOP 0
  71 /* No sigprocmask.  Always 'return' zero. */
  72 # define sigprocmask(How, Set, Oset) (0)
  73 # define sigset_t int
  74 # if ! HAVE_SIGINTERRUPT
  75 #  define siginterrupt(sig, flag) /* empty */
  76 # endif
  77 #endif
  78
  79 #if !defined OPEN_MAX && defined NR_OPEN
  80 # define OPEN_MAX NR_OPEN
  81 #endif
  82 #if !defined OPEN_MAX
  83 # define OPEN_MAX 20
  84 #endif
  85
  86 #define UCHAR_LIM (UCHAR_MAX + 1)
  87
  88 #ifndef DEFAULT_TMPDIR
  89 # define DEFAULT_TMPDIR "/tmp"
  90 #endif
  91
  92 /* Exit statuses.  */
  93 enum
  94   {
  95     /* POSIX says to exit with status 1 if invoked with -c and the
  96        input is not properly sorted.  */
  97     SORT_OUT_OF_ORDER = 1,
  98
  99     /* POSIX says any other irregular exit must exit with a status
 100        code greater than 1.  */
 101     SORT_FAILURE = 2
 102   };
 103
 104 enum
 105   {
 106     /* The number of times we should try to fork a compression process
 107        (we retry if the fork call fails).  We don't _need_ to compress
 108        temp files, this is just to reduce disk access, so this number
 109        can be small.  */
 110     MAX_FORK_TRIES_COMPRESS = 2,
 111
 112     /* The number of times we should try to fork a decompression process.
 113        If we can't fork a decompression process, we can't sort, so this
 114        number should be big.  */
 115     MAX_FORK_TRIES_DECOMPRESS = 8
 116   };
 117
 118 /* The representation of the decimal point in the current locale.  */
 119 static int decimal_point;
 120
 121 /* Thousands separator; if -1, then there isn't one.  */
 122 static int thousands_sep;
 123
 124 /* Nonzero if the corresponding locales are hard.  */
 125 static bool hard_LC_COLLATE;
 126 #if HAVE_NL_LANGINFO
 127 static bool hard_LC_TIME;
 128 #endif
 129
 130 #define NONZERO(x) ((x) != 0)
 131
 132 /* The kind of blanks for '-b' to skip in various options. */
 133 enum blanktype { bl_start, bl_end, bl_both };
 134
 135 /* The character marking end of line. Default to \n. */
 136 static char eolchar = '\n';
 137
 138 /* Lines are held in core as counted strings. */
 139 struct line
 140 {
 141   char *text;                   /* Text of the line. */
 142   size_t length;                /* Length including final newline. */
 143   char *keybeg;                 /* Start of first key. */
 144   char *keylim;                 /* Limit of first key. */
 145 };
 146
 147 /* Input buffers. */
 148 struct buffer
 149 {
 150   char *buf;                    /* Dynamically allocated buffer,
 151                                    partitioned into 3 regions:
 152                                    - input data;
 153                                    - unused area;
 154                                    - an array of lines, in reverse order.  */
 155   size_t used;                  /* Number of bytes used for input data.  */
 156   size_t nlines;                /* Number of lines in the line array.  */
 157   size_t alloc;                 /* Number of bytes allocated. */
 158   size_t left;                  /* Number of bytes left from previous reads. */
 159   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 160   bool eof;                     /* An EOF has been read.  */
 161 };
 162
 163 struct keyfield
 164 {
 165   size_t sword;                 /* Zero-origin 'word' to start at. */
 166   size_t schar;                 /* Additional characters to skip. */
 167   size_t eword;                 /* Zero-origin first word after field. */
 168   size_t echar;                 /* Additional characters in field. */
 169   bool const *ignore;           /* Boolean array of characters to ignore. */
 170   char const *translate;        /* Translation applied to characters. */
 171   bool skipsblanks;             /* Skip leading blanks when finding start.  */
 172   bool skipeblanks;             /* Skip leading blanks when finding end.  */
 173   bool numeric;                 /* Flag for numeric comparison.  Handle
 174                                    strings of digits with optional decimal
 175                                    point, but no exponential notation. */
 176   bool random;                  /* Sort by random hash of key.  */
 177   bool general_numeric;         /* Flag for general, numeric comparison.
 178                                    Handle numbers in exponential notation. */
 179   bool month;                   /* Flag for comparison by month name. */
 180   bool reverse;                 /* Reverse the sense of comparison. */
 181   bool version;                 /* sort by version number */
 182   struct keyfield *next;        /* Next keyfield to try. */
 183 };
 184
 185 struct month
 186 {
 187   char const *name;
 188   int val;
 189 };
 190
 191 /* FIXME: None of these tables work with multibyte character sets.
 192    Also, there are many other bugs when handling multibyte characters.
 193    One way to fix this is to rewrite `sort' to use wide characters
 194    internally, but doing this with good performance is a bit
 195    tricky.  */
 196
 197 /* Table of blanks.  */
 198 static bool blanks[UCHAR_LIM];
 199
 200 /* Table of non-printing characters. */
 201 static bool nonprinting[UCHAR_LIM];
 202
 203 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 204 static bool nondictionary[UCHAR_LIM];
 205
 206 /* Translation table folding lower case to upper.  */
 207 static char fold_toupper[UCHAR_LIM];
 208
 209 #define MONTHS_PER_YEAR 12
 210
 211 /* Table mapping month names to integers.
 212    Alphabetic order allows binary search. */
 213 static struct month monthtab[] =
 214 {
 215   {"APR", 4},
 216   {"AUG", 8},
 217   {"DEC", 12},
 218   {"FEB", 2},
 219   {"JAN", 1},
 220   {"JUL", 7},
 221   {"JUN", 6},
 222   {"MAR", 3},
 223   {"MAY", 5},
 224   {"NOV", 11},
 225   {"OCT", 10},
 226   {"SEP", 9}
 227 };
 228
 229 /* During the merge phase, the number of files to merge at once. */
 230 #define NMERGE_DEFAULT 16
 231
 232 /* Minimum size for a merge or check buffer.  */
 233 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 234
 235 /* Minimum sort size; the code might not work with smaller sizes.  */
 236 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
 237
 238 /* The number of bytes needed for a merge or check buffer, which can
 239    function relatively efficiently even if it holds only one line.  If
 240    a longer line is seen, this value is increased.  */
 241 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 242
 243 /* The approximate maximum number of bytes of main memory to use, as
 244    specified by the user.  Zero if the user has not specified a size.  */
 245 static size_t sort_size;
 246
 247 /* The guessed size for non-regular files.  */
 248 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 249
 250 /* Array of directory names in which any temporary files are to be created. */
 251 static char const **temp_dirs;
 252
 253 /* Number of temporary directory names used.  */
 254 static size_t temp_dir_count;
 255
 256 /* Number of allocated slots in temp_dirs.  */
 257 static size_t temp_dir_alloc;
 258
 259 /* Flag to reverse the order of all comparisons. */
 260 static bool reverse;
 261
 262 /* Flag for stable sort.  This turns off the last ditch bytewise
 263    comparison of lines, and instead leaves lines in the same order
 264    they were read if all keys compare equal.  */
 265 static bool stable;
 266
 267 /* If TAB has this value, blanks separate fields.  */
 268 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 269
 270 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 271    separated by the empty string between a non-blank character and a blank
 272    character. */
 273 static int tab = TAB_DEFAULT;
 274
 275 /* Flag to remove consecutive duplicate lines from the output.
 276    Only the last of a sequence of equal lines will be output. */
 277 static bool unique;
 278
 279 /* Nonzero if any of the input files are the standard input. */
 280 static bool have_read_stdin;
 281
 282 /* List of key field comparisons to be tried.  */
 283 static struct keyfield *keylist;
 284
 285 /* Program used to (de)compress temp files.  Must accept -d.  */
 286 static char const *compress_program;
 287
 288 /* Maximum number of files to merge in one go.  If more than this
 289    number are present, temp files will be used. */
 290 static unsigned int nmerge = NMERGE_DEFAULT;
 291
 292 static void sortlines_temp (struct line *, size_t, struct line *);
 293
 294 /* Report MESSAGE for FILE, then clean up and exit.
 295    If FILE is null, it represents standard output.  */
 296
 297 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 298 static void
 299 die (char const *message, char const *file)
 300 {
 301   error (0, errno, "%s: %s", message, file ? file : _("standard output"));
 302   exit (SORT_FAILURE);
 303 }
 304
 305 void
 306 usage (int status)
 307 {
 308   if (status != EXIT_SUCCESS)
 309     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 310              program_name);
 311   else
 312     {
 313       printf (_("\
 314 Usage: %s [OPTION]... [FILE]...\n\
 315   or:  %s [OPTION]... --files0-from=F\n\
 316 "),
 317               program_name, program_name);
 318       fputs (_("\
 319 Write sorted concatenation of all FILE(s) to standard output.\n\
 320 \n\
 321 "), stdout);
 322       fputs (_("\
 323 Mandatory arguments to long options are mandatory for short options too.\n\
 324 "), stdout);
 325       fputs (_("\
 326 Ordering options:\n\
 327 \n\
 328 "), stdout);
 329       fputs (_("\
 330   -b, --ignore-leading-blanks  ignore leading blanks\n\
 331   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 332   -f, --ignore-case           fold lower case to upper case characters\n\
 333 "), stdout);
 334       fputs (_("\
 335   -g, --general-numeric-sort  compare according to general numerical value\n\
 336   -i, --ignore-nonprinting    consider only printable characters\n\
 337   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 338   -n, --numeric-sort          compare according to string numerical value\n\
 339   -R, --random-sort           sort by random hash of keys\n\
 340   -V, --version-sort          sort by numeric version (see strverscmp(3C))\n\
 341       --random-source=FILE    get random bytes from FILE (default /dev/urandom)\n\
 342       --sort=WORD             sort according to WORD:\n\
 343                                 general-numeric -g, month -M, numeric -n,\n\
 344                                 random -R, version -V\n\
 345   -r, --reverse               reverse the result of comparisons\n\
 346 \n\
 347 "), stdout);
 348       fputs (_("\
 349 Other options:\n\
 350 \n\
 351 "), stdout);
 352       fputs (_("\
 353       --batch-size=NMERGE   merge at most NMERGE inputs at once;\n\
 354                             for more use temp files\n\
 355 "), stdout);
 356       fputs (_("\
 357   -c, --check, --check=diagnose-first  check for sorted input; do not sort\n\
 358   -C, --check=quiet, --check=silent  like -c, but do not report first bad line\n\
 359       --compress-program=PROG  compress temporaries with PROG;\n\
 360                               decompress them with PROG -d\n\
 361       --files0-from=F       read input from the files specified by\n\
 362                             NUL-terminated names in file F\n\
 363 "), stdout);
 364       fputs (_("\
 365   -k, --key=POS1[,POS2]     start a key at POS1, end it at POS2 (origin 1)\n\
 366   -m, --merge               merge already sorted files; do not sort\n\
 367 "), stdout);
 368       fputs (_("\
 369   -o, --output=FILE         write result to FILE instead of standard output\n\
 370   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 371   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 372 "), stdout);
 373       printf (_("\
 374   -t, --field-separator=SEP  use SEP instead of non-blank to blank transition\n\
 375   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 376                               multiple options specify multiple directories\n\
 377   -u, --unique              with -c, check for strict ordering;\n\
 378                               without -c, output only the first of an equal run\n\
 379 "), DEFAULT_TMPDIR);
 380       fputs (_("\
 381   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 382 "), stdout);
 383       fputs (HELP_OPTION_DESCRIPTION, stdout);
 384       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 385       fputs (_("\
 386 \n\
 387 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 388 in the field; both are origin 1.  If neither -t nor -b is in effect, characters\n\
 389 in a field are counted from the beginning of the preceding whitespace.  OPTS is\n\
 390 one or more single-letter ordering options, which override global ordering\n\
 391 options for that key.  If no key is given, use the entire line as the key.\n\
 392 \n\
 393 SIZE may be followed by the following multiplicative suffixes:\n\
 394 "), stdout);
 395       fputs (_("\
 396 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 397 \n\
 398 With no FILE, or when FILE is -, read standard input.\n\
 399 \n\
 400 *** WARNING ***\n\
 401 The locale specified by the environment affects sort order.\n\
 402 Set LC_ALL=C to get the traditional sort order that uses\n\
 403 native byte values.\n\
 404 "), stdout );
 405       emit_bug_reporting_address ();
 406     }
 407
 408   exit (status);
 409 }
 410
 411 /* For long options that have no equivalent short option, use a
 412    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 413 enum
 414 {
 415   CHECK_OPTION = CHAR_MAX + 1,
 416   COMPRESS_PROGRAM_OPTION,
 417   FILES0_FROM_OPTION,
 418   NMERGE_OPTION,
 419   RANDOM_SOURCE_OPTION,
 420   SORT_OPTION
 421 };
 422
 423 static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z";
 424
 425 static struct option const long_options[] =
 426 {
 427   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 428   {"check", optional_argument, NULL, CHECK_OPTION},
 429   {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
 430   {"dictionary-order", no_argument, NULL, 'd'},
 431   {"ignore-case", no_argument, NULL, 'f'},
 432   {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
 433   {"general-numeric-sort", no_argument, NULL, 'g'},
 434   {"ignore-nonprinting", no_argument, NULL, 'i'},
 435   {"key", required_argument, NULL, 'k'},
 436   {"merge", no_argument, NULL, 'm'},
 437   {"month-sort", no_argument, NULL, 'M'},
 438   {"numeric-sort", no_argument, NULL, 'n'},
 439   {"version-sort", no_argument, NULL, 'V'},
 440   {"random-sort", no_argument, NULL, 'R'},
 441   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
 442   {"sort", required_argument, NULL, SORT_OPTION},
 443   {"output", required_argument, NULL, 'o'},
 444   {"reverse", no_argument, NULL, 'r'},
 445   {"stable", no_argument, NULL, 's'},
 446   {"batch-size", required_argument, NULL, NMERGE_OPTION},
 447   {"buffer-size", required_argument, NULL, 'S'},
 448   {"field-separator", required_argument, NULL, 't'},
 449   {"temporary-directory", required_argument, NULL, 'T'},
 450   {"unique", no_argument, NULL, 'u'},
 451   {"zero-terminated", no_argument, NULL, 'z'},
 452   {GETOPT_HELP_OPTION_DECL},
 453   {GETOPT_VERSION_OPTION_DECL},
 454   {NULL, 0, NULL, 0},
 455 };
 456
 457 #define CHECK_TABLE \
 458   _ct_("quiet",          'C') \
 459   _ct_("silent",         'C') \
 460   _ct_("diagnose-first", 'c')
 461
 462 static char const *const check_args[] =
 463 {
 464 #define _ct_(_s, _c) _s,
 465   CHECK_TABLE NULL
 466 #undef  _ct_
 467 };
 468 static char const check_types[] =
 469 {
 470 #define _ct_(_s, _c) _c,
 471   CHECK_TABLE
 472 #undef  _ct_
 473 };
 474
 475 #define SORT_TABLE \
 476   _st_("general-numeric", 'g') \
 477   _st_("month",           'M') \
 478   _st_("numeric",         'n') \
 479   _st_("random",          'R') \
 480   _st_("version",         'V')
 481
 482 static char const *const sort_args[] =
 483 {
 484 #define _st_(_s, _c) _s,
 485   SORT_TABLE NULL
 486 #undef  _st_
 487 };
 488 static char const sort_types[] =
 489 {
 490 #define _st_(_s, _c) _c,
 491   SORT_TABLE
 492 #undef  _st_
 493 };
 494
 495 /* The set of signals that are caught.  */
 496 static sigset_t caught_signals;
 497
 498 /* Critical section status.  */
 499 struct cs_status
 500 {
 501   bool valid;
 502   sigset_t sigs;
 503 };
 504
 505 /* Enter a critical section.  */
 506 static struct cs_status
 507 cs_enter (void)
 508 {
 509   struct cs_status status;
 510   status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
 511   return status;
 512 }
 513
 514 /* Leave a critical section.  */
 515 static void
 516 cs_leave (struct cs_status status)
 517 {
 518   if (status.valid)
 519     {
 520       /* Ignore failure when restoring the signal mask. */
 521       sigprocmask (SIG_SETMASK, &status.sigs, NULL);
 522     }
 523 }
 524
 525 /* The list of temporary files. */
 526 struct tempnode
 527 {
 528   struct tempnode *volatile next;
 529   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 530   char name[1];  /* Actual size is 1 + file name length.  */
 531 };
 532 static struct tempnode *volatile temphead;
 533 static struct tempnode *volatile *temptail = &temphead;
 534
 535 struct sortfile
 536 {
 537   char const *name;
 538   pid_t pid;     /* If compressed, the pid of compressor, else zero */
 539 };
 540
 541 /* A table where we store compression process states.  We clean up all
 542    processes in a timely manner so as not to exhaust system resources,
 543    so we store the info on whether the process is still running, or has
 544    been reaped here.  */
 545 static Hash_table *proctab;
 546
 547 enum { INIT_PROCTAB_SIZE = 47 };
 548
 549 enum procstate { ALIVE, ZOMBIE };
 550
 551 /* A proctab entry.  The COUNT field is there in case we fork a new
 552    compression process that has the same PID as an old zombie process
 553    that is still in the table (because the process to decompress the
 554    temp file it was associated with hasn't started yet).  */
 555 struct procnode
 556 {
 557   pid_t pid;
 558   enum procstate state;
 559   size_t count;
 560 };
 561
 562 static size_t
 563 proctab_hasher (const void *entry, size_t tabsize)
 564 {
 565   const struct procnode *node = entry;
 566   return node->pid % tabsize;
 567 }
 568
 569 static bool
 570 proctab_comparator (const void *e1, const void *e2)
 571 {
 572   const struct procnode *n1 = e1, *n2 = e2;
 573   return n1->pid == n2->pid;
 574 }
 575
 576 /* The total number of forked processes (compressors and decompressors)
 577    that have not been reaped yet. */
 578 static size_t nprocs;
 579
 580 /* The number of child processes we'll allow before we try to reap some. */
 581 enum { MAX_PROCS_BEFORE_REAP = 2 };
 582
 583 /* If 0 < PID, wait for the child process with that PID to exit.
 584    If PID is -1, clean up a random child process which has finished and
 585    return the process ID of that child.  If PID is -1 and no processes
 586    have quit yet, return 0 without waiting.  */
 587
 588 static pid_t
 589 reap (pid_t pid)
 590 {
 591   int status;
 592   pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);
 593
 594   if (cpid < 0)
 595     error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
 596            compress_program);
 597   else if (0 < cpid)
 598     {
 599       if (! WIFEXITED (status) || WEXITSTATUS (status))
 600         error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
 601                compress_program);
 602       --nprocs;
 603     }
 604
 605   return cpid;
 606 }
 607
 608 /* Add the PID of a running compression process to proctab, or update
 609    the entry COUNT and STATE fields if it's already there.  This also
 610    creates the table for us the first time it's called.  */
 611
 612 static void
 613 register_proc (pid_t pid)
 614 {
 615   struct procnode test, *node;
 616
 617   if (! proctab)
 618     {
 619       proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
 620                                  proctab_hasher,
 621                                  proctab_comparator,
 622                                  free);
 623       if (! proctab)
 624         xalloc_die ();
 625     }
 626
 627   test.pid = pid;
 628   node = hash_lookup (proctab, &test);
 629   if (node)
 630     {
 631       node->state = ALIVE;
 632       ++node->count;
 633     }
 634   else
 635     {
 636       node = xmalloc (sizeof *node);
 637       node->pid = pid;
 638       node->state = ALIVE;
 639       node->count = 1;
 640       hash_insert (proctab, node);
 641     }
 642 }
 643
 644 /* This is called when we reap a random process.  We don't know
 645    whether we have reaped a compression process or a decompression
 646    process until we look in the table.  If there's an ALIVE entry for
 647    it, then we have reaped a compression process, so change the state
 648    to ZOMBIE.  Otherwise, it's a decompression processes, so ignore it.  */
 649
 650 static void
 651 update_proc (pid_t pid)
 652 {
 653   struct procnode test, *node;
 654
 655   test.pid = pid;
 656   node = hash_lookup (proctab, &test);
 657   if (node)
 658     node->state = ZOMBIE;
 659 }
 660
 661 /* This is for when we need to wait for a compression process to exit.
 662    If it has a ZOMBIE entry in the table then it's already dead and has
 663    been reaped.  Note that if there's an ALIVE entry for it, it still may
 664    already have died and been reaped if a second process was created with
 665    the same PID.  This is probably exceedingly rare, but to be on the safe
 666    side we will have to wait for any compression process with this PID.  */
 667
 668 static void
 669 wait_proc (pid_t pid)
 670 {
 671   struct procnode test, *node;
 672
 673   test.pid = pid;
 674   node = hash_lookup (proctab, &test);
 675   if (node->state == ALIVE)
 676     reap (pid);
 677
 678   node->state = ZOMBIE;
 679   if (! --node->count)
 680     {
 681       hash_delete (proctab, node);
 682       free (node);
 683     }
 684 }
 685
 686 /* Keep reaping finished children as long as there are more to reap.
 687    This doesn't block waiting for any of them, it only reaps those
 688    that are already dead.  */
 689
 690 static void
 691 reap_some (void)
 692 {
 693   pid_t pid;
 694
 695   while (0 < nprocs && (pid = reap (-1)))
 696     update_proc (pid);
 697 }
 698
 699 /* Clean up any remaining temporary files.  */
 700
 701 static void
 702 cleanup (void)
 703 {
 704   struct tempnode const *node;
 705
 706   for (node = temphead; node; node = node->next)
 707     unlink (node->name);
 708   temphead = NULL;
 709 }
 710
 711 /* Cleanup actions to take when exiting.  */
 712
 713 static void
 714 exit_cleanup (void)
 715 {
 716   if (temphead)
 717     {
 718       /* Clean up any remaining temporary files in a critical section so
 719          that a signal handler does not try to clean them too.  */
 720       struct cs_status cs = cs_enter ();
 721       cleanup ();
 722       cs_leave (cs);
 723     }
 724
 725   close_stdout ();
 726 }
 727
 728 /* Create a new temporary file, returning its newly allocated tempnode.
 729    Store into *PFD the file descriptor open for writing.  */
 730
 731 static struct tempnode *
 732 create_temp_file (int *pfd)
 733 {
 734   static char const slashbase[] = "/sortXXXXXX";
 735   static size_t temp_dir_index;
 736   int fd;
 737   int saved_errno;
 738   char const *temp_dir = temp_dirs[temp_dir_index];
 739   size_t len = strlen (temp_dir);
 740   struct tempnode *node =
 741     xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
 742   char *file = node->name;
 743   struct cs_status cs;
 744
 745   memcpy (file, temp_dir, len);
 746   memcpy (file + len, slashbase, sizeof slashbase);
 747   node->next = NULL;
 748   node->pid = 0;
 749   if (++temp_dir_index == temp_dir_count)
 750     temp_dir_index = 0;
 751
 752   /* Create the temporary file in a critical section, to avoid races.  */
 753   cs = cs_enter ();
 754   fd = mkstemp (file);
 755   if (0 <= fd)
 756     {
 757       *temptail = node;
 758       temptail = &node->next;
 759     }
 760   saved_errno = errno;
 761   cs_leave (cs);
 762   errno = saved_errno;
 763
 764   if (fd < 0)
 765     error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
 766            quote (temp_dir));
 767
 768   *pfd = fd;
 769   return node;
 770 }
 771
 772 /* Return a stream for FILE, opened with mode HOW.  A null FILE means
 773    standard output; HOW should be "w".  When opening for input, "-"
 774    means standard input.  To avoid confusion, do not return file
 775    descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
 776    opening an ordinary FILE.  */
 777
 778 static FILE *
 779 xfopen (const char *file, const char *how)
 780 {
 781   FILE *fp;
 782
 783   if (!file)
 784     fp = stdout;
 785   else if (STREQ (file, "-") && *how == 'r')
 786     {
 787       have_read_stdin = true;
 788       fp = stdin;
 789     }
 790   else
 791     {
 792       fp = fopen (file, how);
 793       if (! fp)
 794         die (_("open failed"), file);
 795     }
 796
 797   return fp;
 798 }
 799
 800 /* Close FP, whose name is FILE, and report any errors.  */
 801
 802 static void
 803 xfclose (FILE *fp, char const *file)
 804 {
 805   switch (fileno (fp))
 806     {
 807     case STDIN_FILENO:
 808       /* Allow reading stdin from tty more than once.  */
 809       if (feof (fp))
 810         clearerr (fp);
 811       break;
 812
 813     case STDOUT_FILENO:
 814       /* Don't close stdout just yet.  close_stdout does that.  */
 815       if (fflush (fp) != 0)
 816         die (_("fflush failed"), file);
 817       break;
 818
 819     default:
 820       if (fclose (fp) != 0)
 821         die (_("close failed"), file);
 822       break;
 823     }
 824 }
 825
 826 static void
 827 dup2_or_die (int oldfd, int newfd)
 828 {
 829   if (dup2 (oldfd, newfd) < 0)
 830     error (SORT_FAILURE, errno, _("dup2 failed"));
 831 }
 832
 833 /* Fork a child process for piping to and do common cleanup.  The
 834    TRIES parameter tells us how many times to try to fork before
 835    giving up.  Return the PID of the child or -1 if fork failed.  */
 836
 837 static pid_t
 838 pipe_fork (int pipefds[2], size_t tries)
 839 {
 840 #if HAVE_WORKING_FORK
 841   struct tempnode *saved_temphead;
 842   int saved_errno;
 843   unsigned int wait_retry = 1;
 844   pid_t pid IF_LINT (= -1);
 845   struct cs_status cs;
 846
 847   if (pipe (pipefds) < 0)
 848     return -1;
 849
 850   while (tries--)
 851     {
 852       /* This is so the child process won't delete our temp files
 853          if it receives a signal before exec-ing.  */
 854       cs = cs_enter ();
 855       saved_temphead = temphead;
 856       temphead = NULL;
 857
 858       pid = fork ();
 859       saved_errno = errno;
 860       if (pid)
 861         temphead = saved_temphead;
 862
 863       cs_leave (cs);
 864       errno = saved_errno;
 865
 866       if (0 <= pid || errno != EAGAIN)
 867         break;
 868       else
 869         {
 870           sleep (wait_retry);
 871           wait_retry *= 2;
 872           reap_some ();
 873         }
 874     }
 875
 876   if (pid < 0)
 877     {
 878       close (pipefds[0]);
 879       close (pipefds[1]);
 880     }
 881   else if (pid == 0)
 882     {
 883       close (STDIN_FILENO);
 884       close (STDOUT_FILENO);
 885     }
 886   else
 887     ++nprocs;
 888
 889   return pid;
 890
 891 #else  /* ! HAVE_WORKING_FORK */
 892   return -1;
 893 #endif
 894 }
 895
 896 /* Create a temporary file and start a compression program to filter output
 897    to that file.  Set *PFP to the file handle and if *PPID is non-NULL,
 898    set it to the PID of the newly-created process.  */
 899
 900 static char *
 901 create_temp (FILE **pfp, pid_t *ppid)
 902 {
 903   int tempfd;
 904   struct tempnode *node = create_temp_file (&tempfd);
 905   char *name = node->name;
 906
 907   if (compress_program)
 908     {
 909       int pipefds[2];
 910
 911       node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
 912       if (0 < node->pid)
 913         {
 914           close (tempfd);
 915           close (pipefds[0]);
 916           tempfd = pipefds[1];
 917
 918           register_proc (node->pid);
 919         }
 920       else if (node->pid == 0)
 921         {
 922           close (pipefds[1]);
 923           dup2_or_die (tempfd, STDOUT_FILENO);
 924           close (tempfd);
 925           dup2_or_die (pipefds[0], STDIN_FILENO);
 926           close (pipefds[0]);
 927
 928           if (execlp (compress_program, compress_program, (char *) NULL) < 0)
 929             error (SORT_FAILURE, errno, _("couldn't execute %s"),
 930                    compress_program);
 931         }
 932       else
 933         node->pid = 0;
 934     }
 935
 936   *pfp = fdopen (tempfd, "w");
 937   if (! *pfp)
 938     die (_("couldn't create temporary file"), name);
 939
 940   if (ppid)
 941     *ppid = node->pid;
 942
 943   return name;
 944 }
 945
 946 /* Open a compressed temp file and start a decompression process through
 947    which to filter the input.  PID must be the valid processes ID of the
 948    process used to compress the file.  */
 949
 950 static FILE *
 951 open_temp (const char *name, pid_t pid)
 952 {
 953   int tempfd, pipefds[2];
 954   pid_t child_pid;
 955   FILE *fp;
 956
 957   wait_proc (pid);
 958
 959   tempfd = open (name, O_RDONLY);
 960   if (tempfd < 0)
 961     die (_("couldn't open temporary file"), name);
 962
 963   child_pid = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
 964   if (0 < child_pid)
 965     {
 966       close (tempfd);
 967       close (pipefds[1]);
 968     }
 969   else if (child_pid == 0)
 970     {
 971       close (pipefds[0]);
 972       dup2_or_die (tempfd, STDIN_FILENO);
 973       close (tempfd);
 974       dup2_or_die (pipefds[1], STDOUT_FILENO);
 975       close (pipefds[1]);
 976
 977       if (execlp (compress_program, compress_program, "-d", (char *) NULL) < 0)
 978         error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
 979                compress_program);
 980     }
 981   else
 982     error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
 983            compress_program);
 984
 985   fp = fdopen (pipefds[0], "r");
 986   if (! fp)
 987     die (_("couldn't create temporary file"), name);
 988
 989   return fp;
 990 }
 991
 992 static void
 993 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
 994 {
 995   if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
 996     die (_("write failed"), output_file);
 997 }
 998
 999 /* Append DIR to the array of temporary directory names.  */
1000 static void
1001 add_temp_dir (char const *dir)
1002 {
1003   if (temp_dir_count == temp_dir_alloc)
1004     temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1005
1006   temp_dirs[temp_dir_count++] = dir;
1007 }
1008
1009 /* Remove NAME from the list of temporary files.  */
1010
1011 static void
1012 zaptemp (const char *name)
1013 {
1014   struct tempnode *volatile *pnode;
1015   struct tempnode *node;
1016   struct tempnode *next;
1017   int unlink_status;
1018   int unlink_errno = 0;
1019   struct cs_status cs;
1020
1021   for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1022     continue;
1023
1024   /* Unlink the temporary file in a critical section to avoid races.  */
1025   next = node->next;
1026   cs = cs_enter ();
1027   unlink_status = unlink (name);
1028   unlink_errno = errno;
1029   *pnode = next;
1030   cs_leave (cs);
1031
1032   if (unlink_status != 0)
1033     error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1034   if (! next)
1035     temptail = pnode;
1036   free (node);
1037 }
1038
1039 #if HAVE_NL_LANGINFO
1040
1041 static int
1042 struct_month_cmp (const void *m1, const void *m2)
1043 {
1044   struct month const *month1 = m1;
1045   struct month const *month2 = m2;
1046   return strcmp (month1->name, month2->name);
1047 }
1048
1049 #endif
1050
1051 /* Initialize the character class tables. */
1052
1053 static void
1054 inittables (void)
1055 {
1056   size_t i;
1057
1058   for (i = 0; i < UCHAR_LIM; ++i)
1059     {
1060       blanks[i] = !! isblank (i);
1061       nonprinting[i] = ! isprint (i);
1062       nondictionary[i] = ! isalnum (i) && ! isblank (i);
1063       fold_toupper[i] = toupper (i);
1064     }
1065
1066 #if HAVE_NL_LANGINFO
1067   /* If we're not in the "C" locale, read different names for months.  */
1068   if (hard_LC_TIME)
1069     {
1070       for (i = 0; i < MONTHS_PER_YEAR; i++)
1071         {
1072           char const *s;
1073           size_t s_len;
1074           size_t j;
1075           char *name;
1076
1077           s = (char *) nl_langinfo (ABMON_1 + i);
1078           s_len = strlen (s);
1079           monthtab[i].name = name = xmalloc (s_len + 1);
1080           monthtab[i].val = i + 1;
1081
1082           for (j = 0; j < s_len; j++)
1083             name[j] = fold_toupper[to_uchar (s[j])];
1084           name[j] = '\0';
1085         }
1086       qsort ((void *) monthtab, MONTHS_PER_YEAR,
1087              sizeof *monthtab, struct_month_cmp);
1088     }
1089 #endif
1090 }
1091
1092 /* Specify how many inputs may be merged at once.
1093    This may be set on the command-line with the
1094    --batch-size option. */
1095 static void
1096 specify_nmerge (int oi, char c, char const *s)
1097 {
1098   uintmax_t n;
1099   struct rlimit rlimit;
1100   enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1101
1102   /* Try to find out how many file descriptors we'll be able
1103      to open.  We need at least nmerge + 3 (STDIN_FILENO,
1104      STDOUT_FILENO and STDERR_FILENO). */
1105   unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1106                               ? rlimit.rlim_cur
1107                               : OPEN_MAX)
1108                              - 3);
1109
1110   if (e == LONGINT_OK)
1111     {
1112       nmerge = n;
1113       if (nmerge != n)
1114         e = LONGINT_OVERFLOW;
1115       else
1116         {
1117           if (nmerge < 2)
1118             {
1119               error (0, 0, _("invalid --%s argument %s"),
1120                      long_options[oi].name, quote(s));
1121               error (SORT_FAILURE, 0,
1122                      _("minimum --%s argument is %s"),
1123                      long_options[oi].name, quote("2"));
1124             }
1125           else if (max_nmerge < nmerge)
1126             {
1127               e = LONGINT_OVERFLOW;
1128             }
1129           else
1130             return;
1131         }
1132     }
1133
1134   if (e == LONGINT_OVERFLOW)
1135     {
1136       char max_nmerge_buf[INT_BUFSIZE_BOUND (unsigned int)];
1137       error (0, 0, _("--%s argument %s too large"),
1138              long_options[oi].name, quote(s));
1139       error (SORT_FAILURE, 0,
1140              _("maximum --%s argument with current rlimit is %s"),
1141              long_options[oi].name,
1142              uinttostr (max_nmerge, max_nmerge_buf));
1143     }
1144   else
1145     xstrtol_fatal (e, oi, c, long_options, s);
1146 }
1147
1148 /* Specify the amount of main memory to use when sorting.  */
1149 static void
1150 specify_sort_size (int oi, char c, char const *s)
1151 {
1152   uintmax_t n;
1153   char *suffix;
1154   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1155
1156   /* The default unit is KiB.  */
1157   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1158     {
1159       if (n <= UINTMAX_MAX / 1024)
1160         n *= 1024;
1161       else
1162         e = LONGINT_OVERFLOW;
1163     }
1164
1165   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
1166   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1167     switch (suffix[0])
1168       {
1169       case 'b':
1170         e = LONGINT_OK;
1171         break;
1172
1173       case '%':
1174         {
1175           double mem = physmem_total () * n / 100;
1176
1177           /* Use "<", not "<=", to avoid problems with rounding.  */
1178           if (mem < UINTMAX_MAX)
1179             {
1180               n = mem;
1181               e = LONGINT_OK;
1182             }
1183           else
1184             e = LONGINT_OVERFLOW;
1185         }
1186         break;
1187       }
1188
1189   if (e == LONGINT_OK)
1190     {
1191       /* If multiple sort sizes are specified, take the maximum, so
1192          that option order does not matter.  */
1193       if (n < sort_size)
1194         return;
1195
1196       sort_size = n;
1197       if (sort_size == n)
1198         {
1199           sort_size = MAX (sort_size, MIN_SORT_SIZE);
1200           return;
1201         }
1202
1203       e = LONGINT_OVERFLOW;
1204     }
1205
1206   xstrtol_fatal (e, oi, c, long_options, s);
1207 }
1208
1209 /* Return the default sort size.  */
1210 static size_t
1211 default_sort_size (void)
1212 {
1213   /* Let MEM be available memory or 1/8 of total memory, whichever
1214      is greater.  */
1215   double avail = physmem_available ();
1216   double total = physmem_total ();
1217   double mem = MAX (avail, total / 8);
1218   struct rlimit rlimit;
1219
1220   /* Let SIZE be MEM, but no more than the maximum object size or
1221      system resource limits.  Avoid the MIN macro here, as it is not
1222      quite right when only one argument is floating point.  Don't
1223      bother to check for values like RLIM_INFINITY since in practice
1224      they are not much less than SIZE_MAX.  */
1225   size_t size = SIZE_MAX;
1226   if (mem < size)
1227     size = mem;
1228   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1229     size = rlimit.rlim_cur;
1230 #ifdef RLIMIT_AS
1231   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1232     size = rlimit.rlim_cur;
1233 #endif
1234
1235   /* Leave a large safety margin for the above limits, as failure can
1236      occur when they are exceeded.  */
1237   size /= 2;
1238
1239 #ifdef RLIMIT_RSS
1240   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1241      Exceeding RSS is not fatal, but can be quite slow.  */
1242   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1243     size = rlimit.rlim_cur / 16 * 15;
1244 #endif
1245
1246   /* Use no less than the minimum.  */
1247   return MAX (size, MIN_SORT_SIZE);
1248 }
1249
1250 /* Return the sort buffer size to use with the input files identified
1251    by FPS and FILES, which are alternate names of the same files.
1252    NFILES gives the number of input files; NFPS may be less.  Assume
1253    that each input line requires LINE_BYTES extra bytes' worth of line
1254    information.  Do not exceed the size bound specified by the user
1255    (or a default size bound, if the user does not specify one).  */
1256
1257 static size_t
1258 sort_buffer_size (FILE *const *fps, size_t nfps,
1259                   char *const *files, size_t nfiles,
1260                   size_t line_bytes)
1261 {
1262   /* A bound on the input size.  If zero, the bound hasn't been
1263      determined yet.  */
1264   static size_t size_bound;
1265
1266   /* In the worst case, each input byte is a newline.  */
1267   size_t worst_case_per_input_byte = line_bytes + 1;
1268
1269   /* Keep enough room for one extra input line and an extra byte.
1270      This extra room might be needed when preparing to read EOF.  */
1271   size_t size = worst_case_per_input_byte + 1;
1272
1273   size_t i;
1274
1275   for (i = 0; i < nfiles; i++)
1276     {
1277       struct stat st;
1278       off_t file_size;
1279       size_t worst_case;
1280
1281       if ((i < nfps ? fstat (fileno (fps[i]), &st)
1282            : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1283            : stat (files[i], &st))
1284           != 0)
1285         die (_("stat failed"), files[i]);
1286
1287       if (S_ISREG (st.st_mode))
1288         file_size = st.st_size;
1289       else
1290         {
1291           /* The file has unknown size.  If the user specified a sort
1292              buffer size, use that; otherwise, guess the size.  */
1293           if (sort_size)
1294             return sort_size;
1295           file_size = INPUT_FILE_SIZE_GUESS;
1296         }
1297
1298       if (! size_bound)
1299         {
1300           size_bound = sort_size;
1301           if (! size_bound)
1302             size_bound = default_sort_size ();
1303         }
1304
1305       /* Add the amount of memory needed to represent the worst case
1306          where the input consists entirely of newlines followed by a
1307          single non-newline.  Check for overflow.  */
1308       worst_case = file_size * worst_case_per_input_byte + 1;
1309       if (file_size != worst_case / worst_case_per_input_byte
1310           || size_bound - size <= worst_case)
1311         return size_bound;
1312       size += worst_case;
1313     }
1314
1315   return size;
1316 }
1317
1318 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
1319    must be at least sizeof (struct line).  Allocate ALLOC bytes
1320    initially.  */
1321
1322 static void
1323 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1324 {
1325   /* Ensure that the line array is properly aligned.  If the desired
1326      size cannot be allocated, repeatedly halve it until allocation
1327      succeeds.  The smaller allocation may hurt overall performance,
1328      but that's better than failing.  */
1329   for (;;)
1330     {
1331       alloc += sizeof (struct line) - alloc % sizeof (struct line);
1332       buf->buf = malloc (alloc);
1333       if (buf->buf)
1334         break;
1335       alloc /= 2;
1336       if (alloc <= line_bytes + 1)
1337         xalloc_die ();
1338     }
1339
1340   buf->line_bytes = line_bytes;
1341   buf->alloc = alloc;
1342   buf->used = buf->left = buf->nlines = 0;
1343   buf->eof = false;
1344 }
1345
1346 /* Return one past the limit of the line array.  */
1347
1348 static inline struct line *
1349 buffer_linelim (struct buffer const *buf)
1350 {
1351   return (struct line *) (buf->buf + buf->alloc);
1352 }
1353
1354 /* Return a pointer to the first character of the field specified
1355    by KEY in LINE. */
1356
1357 static char *
1358 begfield (const struct line *line, const struct keyfield *key)
1359 {
1360   char *ptr = line->text, *lim = ptr + line->length - 1;
1361   size_t sword = key->sword;
1362   size_t schar = key->schar;
1363   size_t remaining_bytes;
1364
1365   /* The leading field separator itself is included in a field when -t
1366      is absent.  */
1367
1368   if (tab != TAB_DEFAULT)
1369     while (ptr < lim && sword--)
1370       {
1371         while (ptr < lim && *ptr != tab)
1372           ++ptr;
1373         if (ptr < lim)
1374           ++ptr;
1375       }
1376   else
1377     while (ptr < lim && sword--)
1378       {
1379         while (ptr < lim && blanks[to_uchar (*ptr)])
1380           ++ptr;
1381         while (ptr < lim && !blanks[to_uchar (*ptr)])
1382           ++ptr;
1383       }
1384
1385   if (key->skipsblanks)
1386     while (ptr < lim && blanks[to_uchar (*ptr)])
1387       ++ptr;
1388
1389   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
1390   remaining_bytes = lim - ptr;
1391   if (schar < remaining_bytes)
1392     ptr += schar;
1393   else
1394     ptr = lim;
1395
1396   return ptr;
1397 }
1398
1399 /* Return the limit of (a pointer to the first character after) the field
1400    in LINE specified by KEY. */
1401
1402 static char *
1403 limfield (const struct line *line, const struct keyfield *key)
1404 {
1405   char *ptr = line->text, *lim = ptr + line->length - 1;
1406   size_t eword = key->eword, echar = key->echar;
1407   size_t remaining_bytes;
1408
1409   /* Move PTR past EWORD fields or to one past the last byte on LINE,
1410      whichever comes first.  If there are more than EWORD fields, leave
1411      PTR pointing at the beginning of the field having zero-based index,
1412      EWORD.  If a delimiter character was specified (via -t), then that
1413      `beginning' is the first character following the delimiting TAB.
1414      Otherwise, leave PTR pointing at the first `blank' character after
1415      the preceding field.  */
1416   if (tab != TAB_DEFAULT)
1417     while (ptr < lim && eword--)
1418       {
1419         while (ptr < lim && *ptr != tab)
1420           ++ptr;
1421         if (ptr < lim && (eword | echar))
1422           ++ptr;
1423       }
1424   else
1425     while (ptr < lim && eword--)
1426       {
1427         while (ptr < lim && blanks[to_uchar (*ptr)])
1428           ++ptr;
1429         while (ptr < lim && !blanks[to_uchar (*ptr)])
1430           ++ptr;
1431       }
1432
1433 #ifdef POSIX_UNSPECIFIED
1434   /* The following block of code makes GNU sort incompatible with
1435      standard Unix sort, so it's ifdef'd out for now.
1436      The POSIX spec isn't clear on how to interpret this.
1437      FIXME: request clarification.
1438
1439      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1440      Date: Thu, 30 May 96 12:20:41 -0400
1441      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1442
1443      [...]I believe I've found another bug in `sort'.
1444
1445      $ cat /tmp/sort.in
1446      a b c 2 d
1447      pq rs 1 t
1448      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1449      a b c 2 d
1450      pq rs 1 t
1451      $ /bin/sort -k1.7,1.7 </tmp/sort.in
1452      pq rs 1 t
1453      a b c 2 d
1454
1455      Unix sort produced the answer I expected: sort on the single character
1456      in column 7.  GNU sort produced different results, because it disagrees
1457      on the interpretation of the key-end spec "M.N".  Unix sort reads this
1458      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1459      "skip M-1 fields, then either N-1 characters or the rest of the current
1460      field, whichever comes first".  This extra clause applies only to
1461      key-ends, not key-starts.
1462      */
1463
1464   /* Make LIM point to the end of (one byte past) the current field.  */
1465   if (tab != TAB_DEFAULT)
1466     {
1467       char *newlim;
1468       newlim = memchr (ptr, tab, lim - ptr);
1469       if (newlim)
1470         lim = newlim;
1471     }
1472   else
1473     {
1474       char *newlim;
1475       newlim = ptr;
1476       while (newlim < lim && blanks[to_uchar (*newlim)])
1477         ++newlim;
1478       while (newlim < lim && !blanks[to_uchar (*newlim)])
1479         ++newlim;
1480       lim = newlim;
1481     }
1482 #endif
1483
1484   /* If we're ignoring leading blanks when computing the End
1485      of the field, don't start counting bytes until after skipping
1486      past any leading blanks. */
1487   if (key->skipeblanks)
1488     while (ptr < lim && blanks[to_uchar (*ptr)])
1489       ++ptr;
1490
1491   /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
1492   remaining_bytes = lim - ptr;
1493   if (echar < remaining_bytes)
1494     ptr += echar;
1495   else
1496     ptr = lim;
1497
1498   return ptr;
1499 }
1500
1501 /* Fill BUF reading from FP, moving buf->left bytes from the end
1502    of buf->buf to the beginning first.  If EOF is reached and the
1503    file wasn't terminated by a newline, supply one.  Set up BUF's line
1504    table too.  FILE is the name of the file corresponding to FP.
1505    Return true if some input was read.  */
1506
1507 static bool
1508 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1509 {
1510   struct keyfield const *key = keylist;
1511   char eol = eolchar;
1512   size_t line_bytes = buf->line_bytes;
1513   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1514
1515   if (buf->eof)
1516     return false;
1517
1518   if (buf->used != buf->left)
1519     {
1520       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1521       buf->used = buf->left;
1522       buf->nlines = 0;
1523     }
1524
1525   for (;;)
1526     {
1527       char *ptr = buf->buf + buf->used;
1528       struct line *linelim = buffer_linelim (buf);
1529       struct line *line = linelim - buf->nlines;
1530       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1531       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1532
1533       while (line_bytes + 1 < avail)
1534         {
1535           /* Read as many bytes as possible, but do not read so many
1536              bytes that there might not be enough room for the
1537              corresponding line array.  The worst case is when the
1538              rest of the input file consists entirely of newlines,
1539              except that the last byte is not a newline.  */
1540           size_t readsize = (avail - 1) / (line_bytes + 1);
1541           size_t bytes_read = fread (ptr, 1, readsize, fp);
1542           char *ptrlim = ptr + bytes_read;
1543           char *p;
1544           avail -= bytes_read;
1545
1546           if (bytes_read != readsize)
1547             {
1548               if (ferror (fp))
1549                 die (_("read failed"), file);
1550               if (feof (fp))
1551                 {
1552                   buf->eof = true;
1553                   if (buf->buf == ptrlim)
1554                     return false;
1555                   if (ptrlim[-1] != eol)
1556                     *ptrlim++ = eol;
1557                 }
1558             }
1559
1560           /* Find and record each line in the just-read input.  */
1561           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1562             {
1563               ptr = p + 1;
1564               line--;
1565               line->text = line_start;
1566               line->length = ptr - line_start;
1567               mergesize = MAX (mergesize, line->length);
1568               avail -= line_bytes;
1569
1570               if (key)
1571                 {
1572                   /* Precompute the position of the first key for
1573                      efficiency.  */
1574                   line->keylim = (key->eword == SIZE_MAX
1575                                   ? p
1576                                   : limfield (line, key));
1577
1578                   if (key->sword != SIZE_MAX)
1579                     line->keybeg = begfield (line, key);
1580                   else
1581                     {
1582                       if (key->skipsblanks)
1583                         while (blanks[to_uchar (*line_start)])
1584                           line_start++;
1585                       line->keybeg = line_start;
1586                     }
1587                 }
1588
1589               line_start = ptr;
1590             }
1591
1592           ptr = ptrlim;
1593           if (buf->eof)
1594             break;
1595         }
1596
1597       buf->used = ptr - buf->buf;
1598       buf->nlines = buffer_linelim (buf) - line;
1599       if (buf->nlines != 0)
1600         {
1601           buf->left = ptr - line_start;
1602           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1603           return true;
1604         }
1605
1606       {
1607         /* The current input line is too long to fit in the buffer.
1608            Double the buffer size and try again, keeping it properly
1609            aligned.  */
1610         size_t line_alloc = buf->alloc / sizeof (struct line);
1611         buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1612         buf->alloc = line_alloc * sizeof (struct line);
1613       }
1614     }
1615 }
1616
1617 /* Compare strings A and B as numbers without explicitly converting them to
1618    machine numbers.  Comparatively slow for short strings, but asymptotically
1619    hideously fast. */
1620
1621 static int
1622 numcompare (const char *a, const char *b)
1623 {
1624   while (blanks[to_uchar (*a)])
1625     a++;
1626   while (blanks[to_uchar (*b)])
1627     b++;
1628
1629   return strnumcmp (a, b, decimal_point, thousands_sep);
1630 }
1631
1632 static int
1633 general_numcompare (const char *sa, const char *sb)
1634 {
1635   /* FIXME: add option to warn about failed conversions.  */
1636   /* FIXME: maybe add option to try expensive FP conversion
1637      only if A and B can't be compared more cheaply/accurately.  */
1638
1639   char *ea;
1640   char *eb;
1641   double a = strtod (sa, &ea);
1642   double b = strtod (sb, &eb);
1643
1644   /* Put conversion errors at the start of the collating sequence.  */
1645   if (sa == ea)
1646     return sb == eb ? 0 : -1;
1647   if (sb == eb)
1648     return 1;
1649
1650   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1651      conversion errors but before numbers; sort them by internal
1652      bit-pattern, for lack of a more portable alternative.  */
1653   return (a < b ? -1
1654           : a > b ? 1
1655           : a == b ? 0
1656           : b == b ? -1
1657           : a == a ? 1
1658           : memcmp ((char *) &a, (char *) &b, sizeof a));
1659 }
1660
1661 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1662    Return 0 if the name in S is not recognized.  */
1663
1664 static int
1665 getmonth (char const *month, size_t len)
1666 {
1667   size_t lo = 0;
1668   size_t hi = MONTHS_PER_YEAR;
1669   char const *monthlim = month + len;
1670
1671   for (;;)
1672     {
1673       if (month == monthlim)
1674         return 0;
1675       if (!blanks[to_uchar (*month)])
1676         break;
1677       ++month;
1678     }
1679
1680   do
1681     {
1682       size_t ix = (lo + hi) / 2;
1683       char const *m = month;
1684       char const *n = monthtab[ix].name;
1685
1686       for (;; m++, n++)
1687         {
1688           if (!*n)
1689             return monthtab[ix].val;
1690           if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1691             {
1692               hi = ix;
1693               break;
1694             }
1695           else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1696             {
1697               lo = ix + 1;
1698               break;
1699             }
1700         }
1701     }
1702   while (lo < hi);
1703
1704   return 0;
1705 }
1706
1707 /* A source of random data.  */
1708 static struct randread_source *randread_source;
1709
1710 /* Return the Ith randomly-generated state.  The caller must invoke
1711    random_state (H) for all H less than I before invoking random_state
1712    (I).  */
1713
1714 static struct md5_ctx
1715 random_state (size_t i)
1716 {
1717   /* An array of states resulting from the random data, and counts of
1718      its used and allocated members.  */
1719   static struct md5_ctx *state;
1720   static size_t used;
1721   static size_t allocated;
1722
1723   struct md5_ctx *s = &state[i];
1724
1725   if (used <= i)
1726     {
1727       unsigned char buf[MD5_DIGEST_SIZE];
1728
1729       used++;
1730
1731       if (allocated <= i)
1732         {
1733           state = X2NREALLOC (state, &allocated);
1734           s = &state[i];
1735         }
1736
1737       randread (randread_source, buf, sizeof buf);
1738       md5_init_ctx (s);
1739       md5_process_bytes (buf, sizeof buf, s);
1740     }
1741
1742   return *s;
1743 }
1744
1745 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1746    with length LENGTHB.  Return negative if less, zero if equal,
1747    positive if greater.  */
1748
1749 static int
1750 cmp_hashes (char const *texta, size_t lena,
1751             char const *textb, size_t lenb)
1752 {
1753   /* Try random hashes until a pair of hashes disagree.  But if the
1754      first pair of random hashes agree, check whether the keys are
1755      identical and if so report no difference.  */
1756   int diff;
1757   size_t i;
1758   for (i = 0; ; i++)
1759     {
1760       uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
1761       struct md5_ctx s[2];
1762       s[0] = s[1] = random_state (i);
1763       md5_process_bytes (texta, lena, &s[0]);  md5_finish_ctx (&s[0], dig[0]);
1764       md5_process_bytes (textb, lenb, &s[1]);  md5_finish_ctx (&s[1], dig[1]);
1765       diff = memcmp (dig[0], dig[1], sizeof dig[0]);
1766       if (diff != 0)
1767         break;
1768       if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
1769         break;
1770     }
1771
1772   return diff;
1773 }
1774
1775 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1776    using one or more random hash functions.  */
1777
1778 static int
1779 compare_random (char *restrict texta, size_t lena,
1780                 char *restrict textb, size_t lenb)
1781 {
1782   int diff;
1783
1784   if (! hard_LC_COLLATE)
1785     diff = cmp_hashes (texta, lena, textb, lenb);
1786   else
1787     {
1788       /* Transform the text into the basis of comparison, so that byte
1789          strings that would otherwise considered to be equal are
1790          considered equal here even if their bytes differ.  */
1791
1792       char *buf = NULL;
1793       char stackbuf[4000];
1794       size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
1795       bool a_fits = tlena <= sizeof stackbuf;
1796       size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
1797                                (a_fits ? sizeof stackbuf - tlena : 0),
1798                                textb, lenb);
1799
1800       if (a_fits && tlena + tlenb <= sizeof stackbuf)
1801         buf = stackbuf;
1802       else
1803         {
1804           /* Adding 1 to the buffer size lets xmemxfrm run a bit
1805              faster by avoiding the need for an extra buffer copy.  */
1806           buf = xmalloc (tlena + tlenb + 1);
1807           xmemxfrm (buf, tlena + 1, texta, lena);
1808           xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
1809         }
1810
1811       diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
1812
1813       if (buf != stackbuf)
1814         free (buf);
1815     }
1816
1817   return diff;
1818 }
1819
1820 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1821    using strverscmp.  */
1822
1823 static int
1824 compare_version (char *restrict texta, size_t lena,
1825                 char *restrict textb, size_t lenb)
1826 {
1827   int diff;
1828
1829   /*
1830    *  It is necessary to save the character after the end of the field.
1831    *  "strverscmp" works with NUL terminated strings.  Our blocks of
1832    *  text are not necessarily terminated with a NUL byte.
1833    */
1834   char sv_a = texta[lena];
1835   char sv_b = textb[lenb];
1836
1837   texta[lena] = textb[lenb] = '\0';
1838   diff = strverscmp (texta, textb);
1839
1840   texta[lena] = sv_a;
1841   textb[lenb] = sv_b;
1842
1843   return diff;
1844 }
1845
1846 /* Compare two lines A and B trying every key in sequence until there
1847    are no more keys or a difference is found. */
1848
1849 static int
1850 keycompare (const struct line *a, const struct line *b)
1851 {
1852   struct keyfield const *key = keylist;
1853
1854   /* For the first iteration only, the key positions have been
1855      precomputed for us. */
1856   char *texta = a->keybeg;
1857   char *textb = b->keybeg;
1858   char *lima = a->keylim;
1859   char *limb = b->keylim;
1860
1861   int diff;
1862
1863   for (;;)
1864     {
1865       char const *translate = key->translate;
1866       bool const *ignore = key->ignore;
1867
1868       /* Find the lengths. */
1869       size_t lena = lima <= texta ? 0 : lima - texta;
1870       size_t lenb = limb <= textb ? 0 : limb - textb;
1871
1872       /* Actually compare the fields. */
1873
1874       if (key->random)
1875         diff = compare_random (texta, lena, textb, lenb);
1876       else if (key->numeric | key->general_numeric)
1877         {
1878           char savea = *lima, saveb = *limb;
1879
1880           *lima = *limb = '\0';
1881           diff = ((key->numeric ? numcompare : general_numcompare)
1882                   (texta, textb));
1883           *lima = savea, *limb = saveb;
1884         }
1885
1886       else if (key->version)
1887         diff = compare_version (texta, lena, textb, lenb);
1888
1889       else if (key->month)
1890         diff = getmonth (texta, lena) - getmonth (textb, lenb);
1891       /* Sorting like this may become slow, so in a simple locale the user
1892          can select a faster sort that is similar to ascii sort.  */
1893       else if (hard_LC_COLLATE)
1894         {
1895           if (ignore || translate)
1896             {
1897               char buf[4000];
1898               size_t size = lena + 1 + lenb + 1;
1899               char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1900               char *copy_b = copy_a + lena + 1;
1901               size_t new_len_a, new_len_b, i;
1902
1903               /* Ignore and/or translate chars before comparing.  */
1904               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1905                 {
1906                   if (i < lena)
1907                     {
1908                       copy_a[new_len_a] = (translate
1909                                            ? translate[to_uchar (texta[i])]
1910                                            : texta[i]);
1911                       if (!ignore || !ignore[to_uchar (texta[i])])
1912                         ++new_len_a;
1913                     }
1914                   if (i < lenb)
1915                     {
1916                       copy_b[new_len_b] = (translate
1917                                            ? translate[to_uchar (textb[i])]
1918                                            : textb [i]);
1919                       if (!ignore || !ignore[to_uchar (textb[i])])
1920                         ++new_len_b;
1921                     }
1922                 }
1923
1924               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1925
1926               if (sizeof buf < size)
1927                 free (copy_a);
1928             }
1929           else if (lena == 0)
1930             diff = - NONZERO (lenb);
1931           else if (lenb == 0)
1932             goto greater;
1933           else
1934             diff = xmemcoll (texta, lena, textb, lenb);
1935         }
1936       else if (ignore)
1937         {
1938 #define CMP_WITH_IGNORE(A, B)                                           \
1939   do                                                                    \
1940     {                                                                   \
1941           for (;;)                                                      \
1942             {                                                           \
1943               while (texta < lima && ignore[to_uchar (*texta)])         \
1944                 ++texta;                                                \
1945               while (textb < limb && ignore[to_uchar (*textb)])         \
1946                 ++textb;                                                \
1947               if (! (texta < lima && textb < limb))                     \
1948                 break;                                                  \
1949               diff = to_uchar (A) - to_uchar (B);                       \
1950               if (diff)                                                 \
1951                 goto not_equal;                                         \
1952               ++texta;                                                  \
1953               ++textb;                                                  \
1954             }                                                           \
1955                                                                         \
1956           diff = (texta < lima) - (textb < limb);                       \
1957     }                                                                   \
1958   while (0)
1959
1960           if (translate)
1961             CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1962                              translate[to_uchar (*textb)]);
1963           else
1964             CMP_WITH_IGNORE (*texta, *textb);
1965         }
1966       else if (lena == 0)
1967         diff = - NONZERO (lenb);
1968       else if (lenb == 0)
1969         goto greater;
1970       else
1971         {
1972           if (translate)
1973             {
1974               while (texta < lima && textb < limb)
1975                 {
1976                   diff = (to_uchar (translate[to_uchar (*texta++)])
1977                           - to_uchar (translate[to_uchar (*textb++)]));
1978                   if (diff)
1979                     goto not_equal;
1980                 }
1981             }
1982           else
1983             {
1984               diff = memcmp (texta, textb, MIN (lena, lenb));
1985               if (diff)
1986                 goto not_equal;
1987             }
1988           diff = lena < lenb ? -1 : lena != lenb;
1989         }
1990
1991       if (diff)
1992         goto not_equal;
1993
1994       key = key->next;
1995       if (! key)
1996         break;
1997
1998       /* Find the beginning and limit of the next field.  */
1999       if (key->eword != SIZE_MAX)
2000         lima = limfield (a, key), limb = limfield (b, key);
2001       else
2002         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2003
2004       if (key->sword != SIZE_MAX)
2005         texta = begfield (a, key), textb = begfield (b, key);
2006       else
2007         {
2008           texta = a->text, textb = b->text;
2009           if (key->skipsblanks)
2010             {
2011               while (texta < lima && blanks[to_uchar (*texta)])
2012                 ++texta;
2013               while (textb < limb && blanks[to_uchar (*textb)])
2014                 ++textb;
2015             }
2016         }
2017     }
2018
2019   return 0;
2020
2021  greater:
2022   diff = 1;
2023  not_equal:
2024   return key->reverse ? -diff : diff;
2025 }
2026
2027 /* Compare two lines A and B, returning negative, zero, or positive
2028    depending on whether A compares less than, equal to, or greater than B. */
2029
2030 static int
2031 compare (const struct line *a, const struct line *b)
2032 {
2033   int diff;
2034   size_t alen, blen;
2035
2036   /* First try to compare on the specified keys (if any).
2037      The only two cases with no key at all are unadorned sort,
2038      and unadorned sort -r. */
2039   if (keylist)
2040     {
2041       diff = keycompare (a, b);
2042       if (diff | unique | stable)
2043         return diff;
2044     }
2045
2046   /* If the keys all compare equal (or no keys were specified)
2047      fall through to the default comparison.  */
2048   alen = a->length - 1, blen = b->length - 1;
2049
2050   if (alen == 0)
2051     diff = - NONZERO (blen);
2052   else if (blen == 0)
2053     diff = 1;
2054   else if (hard_LC_COLLATE)
2055     diff = xmemcoll (a->text, alen, b->text, blen);
2056   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2057     diff = alen < blen ? -1 : alen != blen;
2058
2059   return reverse ? -diff : diff;
2060 }
2061
2062 /* Check that the lines read from FILE_NAME come in order.  Return
2063    true if they are in order.  If CHECKONLY == 'c', also print a
2064    diagnostic (FILE_NAME, line number, contents of line) to stderr if
2065    they are not in order.  */
2066
2067 static bool
2068 check (char const *file_name, char checkonly)
2069 {
2070   FILE *fp = xfopen (file_name, "r");
2071   struct buffer buf;            /* Input buffer. */
2072   struct line temp;             /* Copy of previous line. */
2073   size_t alloc = 0;
2074   uintmax_t line_number = 0;
2075   struct keyfield const *key = keylist;
2076   bool nonunique = ! unique;
2077   bool ordered = true;
2078
2079   initbuf (&buf, sizeof (struct line),
2080            MAX (merge_buffer_size, sort_size));
2081   temp.text = NULL;
2082
2083   while (fillbuf (&buf, fp, file_name))
2084     {
2085       struct line const *line = buffer_linelim (&buf);
2086       struct line const *linebase = line - buf.nlines;
2087
2088       /* Make sure the line saved from the old buffer contents is
2089          less than or equal to the first line of the new buffer. */
2090       if (alloc && nonunique <= compare (&temp, line - 1))
2091         {
2092         found_disorder:
2093           {
2094             if (checkonly == 'c')
2095               {
2096                 struct line const *disorder_line = line - 1;
2097                 uintmax_t disorder_line_number =
2098                   buffer_linelim (&buf) - disorder_line + line_number;
2099                 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
2100                 fprintf (stderr, _("%s: %s:%s: disorder: "),
2101                          program_name, file_name,
2102                          umaxtostr (disorder_line_number, hr_buf));
2103                 write_bytes (disorder_line->text, disorder_line->length,
2104                              stderr, _("standard error"));
2105               }
2106
2107             ordered = false;
2108             break;
2109           }
2110         }
2111
2112       /* Compare each line in the buffer with its successor.  */
2113       while (linebase < --line)
2114         if (nonunique <= compare (line, line - 1))
2115           goto found_disorder;
2116
2117       line_number += buf.nlines;
2118
2119       /* Save the last line of the buffer.  */
2120       if (alloc < line->length)
2121         {
2122           do
2123             {
2124               alloc *= 2;
2125               if (! alloc)
2126                 {
2127                   alloc = line->length;
2128                   break;
2129                 }
2130             }
2131           while (alloc < line->length);
2132
2133           temp.text = xrealloc (temp.text, alloc);
2134         }
2135       memcpy (temp.text, line->text, line->length);
2136       temp.length = line->length;
2137       if (key)
2138         {
2139           temp.keybeg = temp.text + (line->keybeg - line->text);
2140           temp.keylim = temp.text + (line->keylim - line->text);
2141         }
2142     }
2143
2144   xfclose (fp, file_name);
2145   free (buf.buf);
2146   free (temp.text);
2147   return ordered;
2148 }
2149
2150 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
2151    files (all of which are at the start of the FILES array), and
2152    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2153    Close input and output files before returning.
2154    OUTPUT_FILE gives the name of the output file.  If it is NULL,
2155    the output file is standard output.  If OFP is NULL, the output
2156    file has not been opened yet (or written to, if standard output).  */
2157
2158 static void
2159 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2160           FILE *ofp, char const *output_file)
2161 {
2162   FILE **fps = xnmalloc (nmerge, sizeof *fps);
2163                                 /* Input streams for each file.  */
2164   struct buffer *buffer = xnmalloc (nmerge, sizeof *buffer);
2165                                 /* Input buffers for each file. */
2166   struct line saved;            /* Saved line storage for unique check. */
2167   struct line const *savedline = NULL;
2168                                 /* &saved if there is a saved line. */
2169   size_t savealloc = 0;         /* Size allocated for the saved line. */
2170   struct line const **cur = xnmalloc (nmerge, sizeof *cur);
2171                                 /* Current line in each line table. */
2172   struct line const **base = xnmalloc (nmerge, sizeof *base);
2173                                 /* Base of each line table.  */
2174   size_t *ord = xnmalloc (nmerge, sizeof *ord);
2175                                 /* Table representing a permutation of fps,
2176                                    such that cur[ord[0]] is the smallest line
2177                                    and will be next output. */
2178   size_t i;
2179   size_t j;
2180   size_t t;
2181   struct keyfield const *key = keylist;
2182   saved.text = NULL;
2183
2184   /* Read initial lines from each input file. */
2185   for (i = 0; i < nfiles; )
2186     {
2187       fps[i] = (files[i].pid
2188                 ? open_temp (files[i].name, files[i].pid)
2189                 : xfopen (files[i].name, "r"));
2190       initbuf (&buffer[i], sizeof (struct line),
2191                MAX (merge_buffer_size, sort_size / nfiles));
2192       if (fillbuf (&buffer[i], fps[i], files[i].name))
2193         {
2194           struct line const *linelim = buffer_linelim (&buffer[i]);
2195           cur[i] = linelim - 1;
2196           base[i] = linelim - buffer[i].nlines;
2197           i++;
2198         }
2199       else
2200         {
2201           /* fps[i] is empty; eliminate it from future consideration.  */
2202           xfclose (fps[i], files[i].name);
2203           if (i < ntemps)
2204             {
2205               ntemps--;
2206               zaptemp (files[i].name);
2207             }
2208           free (buffer[i].buf);
2209           --nfiles;
2210           for (j = i; j < nfiles; ++j)
2211             files[j] = files[j + 1];
2212         }
2213     }
2214
2215   if (! ofp)
2216     ofp = xfopen (output_file, "w");
2217
2218   /* Set up the ord table according to comparisons among input lines.
2219      Since this only reorders two items if one is strictly greater than
2220      the other, it is stable. */
2221   for (i = 0; i < nfiles; ++i)
2222     ord[i] = i;
2223   for (i = 1; i < nfiles; ++i)
2224     if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2225       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2226
2227   /* Repeatedly output the smallest line until no input remains. */
2228   while (nfiles)
2229     {
2230       struct line const *smallest = cur[ord[0]];
2231
2232       /* If uniquified output is turned on, output only the first of
2233          an identical series of lines. */
2234       if (unique)
2235         {
2236           if (savedline && compare (savedline, smallest))
2237             {
2238               savedline = NULL;
2239               write_bytes (saved.text, saved.length, ofp, output_file);
2240             }
2241           if (!savedline)
2242             {
2243               savedline = &saved;
2244               if (savealloc < smallest->length)
2245                 {
2246                   do
2247                     if (! savealloc)
2248                       {
2249                         savealloc = smallest->length;
2250                         break;
2251                       }
2252                   while ((savealloc *= 2) < smallest->length);
2253
2254                   saved.text = xrealloc (saved.text, savealloc);
2255                 }
2256               saved.length = smallest->length;
2257               memcpy (saved.text, smallest->text, saved.length);
2258               if (key)
2259                 {
2260                   saved.keybeg =
2261                     saved.text + (smallest->keybeg - smallest->text);
2262                   saved.keylim =
2263                     saved.text + (smallest->keylim - smallest->text);
2264                 }
2265             }
2266         }
2267       else
2268         write_bytes (smallest->text, smallest->length, ofp, output_file);
2269
2270       /* Check if we need to read more lines into core. */
2271       if (base[ord[0]] < smallest)
2272         cur[ord[0]] = smallest - 1;
2273       else
2274         {
2275           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2276             {
2277               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2278               cur[ord[0]] = linelim - 1;
2279               base[ord[0]] = linelim - buffer[ord[0]].nlines;
2280             }
2281           else
2282             {
2283               /* We reached EOF on fps[ord[0]].  */
2284               for (i = 1; i < nfiles; ++i)
2285                 if (ord[i] > ord[0])
2286                   --ord[i];
2287               --nfiles;
2288               xfclose (fps[ord[0]], files[ord[0]].name);
2289               if (ord[0] < ntemps)
2290                 {
2291                   ntemps--;
2292                   zaptemp (files[ord[0]].name);
2293                 }
2294               free (buffer[ord[0]].buf);
2295               for (i = ord[0]; i < nfiles; ++i)
2296                 {
2297                   fps[i] = fps[i + 1];
2298                   files[i] = files[i + 1];
2299                   buffer[i] = buffer[i + 1];
2300                   cur[i] = cur[i + 1];
2301                   base[i] = base[i + 1];
2302                 }
2303               for (i = 0; i < nfiles; ++i)
2304                 ord[i] = ord[i + 1];
2305               continue;
2306             }
2307         }
2308
2309       /* The new line just read in may be larger than other lines
2310          already in main memory; push it back in the queue until we
2311          encounter a line larger than it.  Optimize for the common
2312          case where the new line is smallest.  */
2313       {
2314         size_t lo = 1;
2315         size_t hi = nfiles;
2316         size_t probe = lo;
2317         size_t ord0 = ord[0];
2318         size_t count_of_smaller_lines;
2319
2320         while (lo < hi)
2321           {
2322             int cmp = compare (cur[ord0], cur[ord[probe]]);
2323             if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2324               hi = probe;
2325             else
2326               lo = probe + 1;
2327             probe = (lo + hi) / 2;
2328           }
2329
2330         count_of_smaller_lines = lo - 1;
2331         for (j = 0; j < count_of_smaller_lines; j++)
2332           ord[j] = ord[j + 1];
2333         ord[count_of_smaller_lines] = ord0;
2334       }
2335
2336       /* Free up some resources every once in a while.  */
2337       if (MAX_PROCS_BEFORE_REAP < nprocs)
2338         reap_some ();
2339     }
2340
2341   if (unique && savedline)
2342     {
2343       write_bytes (saved.text, saved.length, ofp, output_file);
2344       free (saved.text);
2345     }
2346
2347   xfclose (ofp, output_file);
2348   free(fps);
2349   free(buffer);
2350   free(ord);
2351   free(base);
2352   free(cur);
2353 }
2354
2355 /* Merge into T the two sorted arrays of lines LO (with NLO members)
2356    and HI (with NHI members).  T, LO, and HI point just past their
2357    respective arrays, and the arrays are in reverse order.  NLO and
2358    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
2359
2360 static inline void
2361 mergelines (struct line *t,
2362             struct line const *lo, size_t nlo,
2363             struct line const *hi, size_t nhi)
2364 {
2365   for (;;)
2366     if (compare (lo - 1, hi - 1) <= 0)
2367       {
2368         *--t = *--lo;
2369         if (! --nlo)
2370           {
2371             /* HI - NHI equalled T - (NLO + NHI) when this function
2372                began.  Therefore HI must equal T now, and there is no
2373                need to copy from HI to T.  */
2374             return;
2375           }
2376       }
2377     else
2378       {
2379         *--t = *--hi;
2380         if (! --nhi)
2381           {
2382             do
2383               *--t = *--lo;
2384             while (--nlo);
2385
2386             return;
2387           }
2388       }
2389 }
2390
2391 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
2392    NLINES must be at least 2.
2393    The input and output arrays are in reverse order, and LINES and
2394    TEMP point just past the end of their respective arrays.
2395
2396    Use a recursive divide-and-conquer algorithm, in the style
2397    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
2398    the optimization suggested by exercise 5.2.4-10; this requires room
2399    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
2400    writes that this memory optimization was originally published by
2401    D. A. Bell, Comp J. 1 (1958), 75.  */
2402
2403 static void
2404 sortlines (struct line *lines, size_t nlines, struct line *temp)
2405 {
2406   if (nlines == 2)
2407     {
2408       if (0 < compare (&lines[-1], &lines[-2]))
2409         {
2410           struct line tmp = lines[-1];
2411           lines[-1] = lines[-2];
2412           lines[-2] = tmp;
2413         }
2414     }
2415   else
2416     {
2417       size_t nlo = nlines / 2;
2418       size_t nhi = nlines - nlo;
2419       struct line *lo = lines;
2420       struct line *hi = lines - nlo;
2421       struct line *sorted_lo = temp;
2422
2423       sortlines (hi, nhi, temp);
2424       if (1 < nlo)
2425         sortlines_temp (lo, nlo, sorted_lo);
2426       else
2427         sorted_lo[-1] = lo[-1];
2428
2429       mergelines (lines, sorted_lo, nlo, hi, nhi);
2430     }
2431 }
2432
2433 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
2434    rather than sorting in place.  */
2435
2436 static void
2437 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
2438 {
2439   if (nlines == 2)
2440     {
2441       /* Declare `swap' as int, not bool, to work around a bug
2442          <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
2443          in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
2444       int swap = (0 < compare (&lines[-1], &lines[-2]));
2445       temp[-1] = lines[-1 - swap];
2446       temp[-2] = lines[-2 + swap];
2447     }
2448   else
2449     {
2450       size_t nlo = nlines / 2;
2451       size_t nhi = nlines - nlo;
2452       struct line *lo = lines;
2453       struct line *hi = lines - nlo;
2454       struct line *sorted_hi = temp - nlo;
2455
2456       sortlines_temp (hi, nhi, sorted_hi);
2457       if (1 < nlo)
2458         sortlines (lo, nlo, temp);
2459
2460       mergelines (temp, lo, nlo, sorted_hi, nhi);
2461     }
2462 }
2463
2464 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
2465    the same as OUTFILE.  If found, merge the found instances (and perhaps
2466    some other files) into a temporary file so that it can in turn be
2467    merged into OUTFILE without destroying OUTFILE before it is completely
2468    read.  Return the new value of NFILES, which differs from the old if
2469    some merging occurred.
2470
2471    This test ensures that an otherwise-erroneous use like
2472    "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
2473    It's not clear that POSIX requires this nicety.
2474    Detect common error cases, but don't try to catch obscure cases like
2475    "cat ... FILE ... | sort -m -o FILE"
2476    where traditional "sort" doesn't copy the input and where
2477    people should know that they're getting into trouble anyway.
2478    Catching these obscure cases would slow down performance in
2479    common cases.  */
2480
2481 static size_t
2482 avoid_trashing_input (struct sortfile *files, size_t ntemps,
2483                       size_t nfiles, char const *outfile)
2484 {
2485   size_t i;
2486   bool got_outstat = false;
2487   struct stat outstat;
2488
2489   for (i = ntemps; i < nfiles; i++)
2490     {
2491       bool is_stdin = STREQ (files[i].name, "-");
2492       bool same;
2493       struct stat instat;
2494
2495       if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
2496         same = true;
2497       else
2498         {
2499           if (! got_outstat)
2500             {
2501               if ((outfile
2502                    ? stat (outfile, &outstat)
2503                    : fstat (STDOUT_FILENO, &outstat))
2504                   != 0)
2505                 break;
2506               got_outstat = true;
2507             }
2508
2509           same = (((is_stdin
2510                     ? fstat (STDIN_FILENO, &instat)
2511                     : stat (files[i].name, &instat))
2512                    == 0)
2513                   && SAME_INODE (instat, outstat));
2514         }
2515
2516       if (same)
2517         {
2518           FILE *tftp;
2519           pid_t pid;
2520           char *temp = create_temp (&tftp, &pid);
2521           mergefps (&files[i],0, nfiles - i, tftp, temp);
2522           files[i].name = temp;
2523           files[i].pid = pid;
2524           return i + 1;
2525         }
2526     }
2527
2528   return nfiles;
2529 }
2530
2531 /* Merge the input FILES.  NTEMPS is the number of files at the
2532    start of FILES that are temporary; it is zero at the top level.
2533    NFILES is the total number of files.  Put the output in
2534    OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
2535
2536 static void
2537 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
2538        char const *output_file)
2539 {
2540   while (nmerge < nfiles)
2541     {
2542       /* Number of input files processed so far.  */
2543       size_t in;
2544
2545       /* Number of output files generated so far.  */
2546       size_t out;
2547
2548       /* nfiles % NMERGE; this counts input files that are left over
2549          after all full-sized merges have been done.  */
2550       size_t remainder;
2551
2552       /* Number of easily-available slots at the next loop iteration.  */
2553       size_t cheap_slots;
2554
2555       /* Do as many NMERGE-size merges as possible.  */
2556       for (out = in = 0; out < nfiles / nmerge; out++, in += nmerge)
2557         {
2558           FILE *tfp;
2559           pid_t pid;
2560           char *temp = create_temp (&tfp, &pid);
2561           size_t nt = MIN (ntemps, nmerge);
2562           ntemps -= nt;
2563           mergefps (&files[in], nt, nmerge, tfp, temp);
2564           files[out].name = temp;
2565           files[out].pid = pid;
2566         }
2567
2568       remainder = nfiles - in;
2569       cheap_slots = nmerge - out % nmerge;
2570
2571       if (cheap_slots < remainder)
2572         {
2573           /* So many files remain that they can't all be put into the last
2574              NMERGE-sized output window.  Do one more merge.  Merge as few
2575              files as possible, to avoid needless I/O.  */
2576           size_t nshortmerge = remainder - cheap_slots + 1;
2577           FILE *tfp;
2578           pid_t pid;
2579           char *temp = create_temp (&tfp, &pid);
2580           size_t nt = MIN (ntemps, nshortmerge);
2581           ntemps -= nt;
2582           mergefps (&files[in], nt, nshortmerge, tfp, temp);
2583           files[out].name = temp;
2584           files[out++].pid = pid;
2585           in += nshortmerge;
2586         }
2587
2588       /* Put the remaining input files into the last NMERGE-sized output
2589          window, so they will be merged in the next pass.  */
2590       memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2591       ntemps += out;
2592       nfiles -= in - out;
2593     }
2594
2595   nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2596   mergefps (files, ntemps, nfiles, NULL, output_file);
2597 }
2598
2599 /* Sort NFILES FILES onto OUTPUT_FILE. */
2600
2601 static void
2602 sort (char * const *files, size_t nfiles, char const *output_file)
2603 {
2604   struct buffer buf;
2605   size_t ntemps = 0;
2606   bool output_file_created = false;
2607
2608   buf.alloc = 0;
2609
2610   while (nfiles)
2611     {
2612       char const *temp_output;
2613       char const *file = *files;
2614       FILE *fp = xfopen (file, "r");
2615       FILE *tfp;
2616       size_t bytes_per_line = (2 * sizeof (struct line)
2617                                - sizeof (struct line) / 2);
2618
2619       if (! buf.alloc)
2620         initbuf (&buf, bytes_per_line,
2621                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2622       buf.eof = false;
2623       files++;
2624       nfiles--;
2625
2626       while (fillbuf (&buf, fp, file))
2627         {
2628           struct line *line;
2629           struct line *linebase;
2630
2631           if (buf.eof && nfiles
2632               && (bytes_per_line + 1
2633                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2634             {
2635               /* End of file, but there is more input and buffer room.
2636                  Concatenate the next input file; this is faster in
2637                  the usual case.  */
2638               buf.left = buf.used;
2639               break;
2640             }
2641
2642           line = buffer_linelim (&buf);
2643           linebase = line - buf.nlines;
2644           if (1 < buf.nlines)
2645             sortlines (line, buf.nlines, linebase);
2646           if (buf.eof && !nfiles && !ntemps && !buf.left)
2647             {
2648               xfclose (fp, file);
2649               tfp = xfopen (output_file, "w");
2650               temp_output = output_file;
2651               output_file_created = true;
2652             }
2653           else
2654             {
2655               ++ntemps;
2656               temp_output = create_temp (&tfp, NULL);
2657             }
2658
2659           do
2660             {
2661               line--;
2662               write_bytes (line->text, line->length, tfp, temp_output);
2663               if (unique)
2664                 while (linebase < line && compare (line, line - 1) == 0)
2665                   line--;
2666             }
2667           while (linebase < line);
2668
2669           xfclose (tfp, temp_output);
2670
2671           /* Free up some resources every once in a while.  */
2672           if (MAX_PROCS_BEFORE_REAP < nprocs)
2673             reap_some ();
2674
2675           if (output_file_created)
2676             goto finish;
2677         }
2678       xfclose (fp, file);
2679     }
2680
2681  finish:
2682   free (buf.buf);
2683
2684   if (! output_file_created)
2685     {
2686       size_t i;
2687       struct tempnode *node = temphead;
2688       struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2689       for (i = 0; node; i++)
2690         {
2691           tempfiles[i].name = node->name;
2692           tempfiles[i].pid = node->pid;
2693           node = node->next;
2694         }
2695       merge (tempfiles, ntemps, ntemps, output_file);
2696       free (tempfiles);
2697     }
2698 }
2699
2700 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list.  */
2701
2702 static void
2703 insertkey (struct keyfield *key_arg)
2704 {
2705   struct keyfield **p;
2706   struct keyfield *key = xmemdup (key_arg, sizeof *key);
2707
2708   for (p = &keylist; *p; p = &(*p)->next)
2709     continue;
2710   *p = key;
2711   key->next = NULL;
2712 }
2713
2714 /* Report a bad field specification SPEC, with extra info MSGID.  */
2715
2716 static void badfieldspec (char const *, char const *)
2717      ATTRIBUTE_NORETURN;
2718 static void
2719 badfieldspec (char const *spec, char const *msgid)
2720 {
2721   error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
2722          _(msgid), quote (spec));
2723   abort ();
2724 }
2725
2726 /* Report incompatible options.  */
2727
2728 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
2729 static void
2730 incompatible_options (char const *opts)
2731 {
2732   error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
2733   abort ();
2734 }
2735
2736 /* Check compatibility of ordering options.  */
2737
2738 static void
2739 check_ordering_compatibility (void)
2740 {
2741   struct keyfield const *key;
2742
2743   for (key = keylist; key; key = key->next)
2744     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
2745               + key->version + !!key->ignore))
2746         || (key->random && key->translate))
2747       {
2748         /* The following is too big, but guaranteed to be "big enough". */
2749         char opts[sizeof short_options];
2750         char *p = opts;
2751         if (key->ignore == nondictionary)
2752           *p++ = 'd';
2753         if (key->translate)
2754           *p++ = 'f';
2755         if (key->general_numeric)
2756           *p++ = 'g';
2757         if (key->ignore == nonprinting)
2758           *p++ = 'i';
2759         if (key->month)
2760           *p++ = 'M';
2761         if (key->numeric)
2762           *p++ = 'n';
2763         if (key->version)
2764           *p++ = 'V';
2765         if (key->random)
2766           *p++ = 'R';
2767         *p = '\0';
2768         incompatible_options (opts);
2769       }
2770 }
2771
2772 /* Parse the leading integer in STRING and store the resulting value
2773    (which must fit into size_t) into *VAL.  Return the address of the
2774    suffix after the integer.  If the value is too large, silently
2775    substitute SIZE_MAX.  If MSGID is NULL, return NULL after
2776    failure; otherwise, report MSGID and exit on failure.  */
2777
2778 static char const *
2779 parse_field_count (char const *string, size_t *val, char const *msgid)
2780 {
2781   char *suffix;
2782   uintmax_t n;
2783
2784   switch (xstrtoumax (string, &suffix, 10, &n, ""))
2785     {
2786     case LONGINT_OK:
2787     case LONGINT_INVALID_SUFFIX_CHAR:
2788       *val = n;
2789       if (*val == n)
2790         break;
2791       /* Fall through.  */
2792     case LONGINT_OVERFLOW:
2793     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2794       *val = SIZE_MAX;
2795       break;
2796
2797     case LONGINT_INVALID:
2798       if (msgid)
2799         error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
2800                _(msgid), quote (string));
2801       return NULL;
2802     }
2803
2804   return suffix;
2805 }
2806
2807 /* Handle interrupts and hangups. */
2808
2809 static void
2810 sighandler (int sig)
2811 {
2812   if (! SA_NOCLDSTOP)
2813     signal (sig, SIG_IGN);
2814
2815   cleanup ();
2816
2817   signal (sig, SIG_DFL);
2818   raise (sig);
2819 }
2820
2821 /* Set the ordering options for KEY specified in S.
2822    Return the address of the first character in S that
2823    is not a valid ordering option.
2824    BLANKTYPE is the kind of blanks that 'b' should skip. */
2825
2826 static char *
2827 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2828 {
2829   while (*s)
2830     {
2831       switch (*s)
2832         {
2833         case 'b':
2834           if (blanktype == bl_start || blanktype == bl_both)
2835             key->skipsblanks = true;
2836           if (blanktype == bl_end || blanktype == bl_both)
2837             key->skipeblanks = true;
2838           break;
2839         case 'd':
2840           key->ignore = nondictionary;
2841           break;
2842         case 'f':
2843           key->translate = fold_toupper;
2844           break;
2845         case 'g':
2846           key->general_numeric = true;
2847           break;
2848         case 'i':
2849           /* Option order should not matter, so don't let -i override
2850              -d.  -d implies -i, but -i does not imply -d.  */
2851           if (! key->ignore)
2852             key->ignore = nonprinting;
2853           break;
2854         case 'M':
2855           key->month = true;
2856           break;
2857         case 'n':
2858           key->numeric = true;
2859           break;
2860         case 'R':
2861           key->random = true;
2862           break;
2863         case 'r':
2864           key->reverse = true;
2865           break;
2866         case 'V':
2867           key->version = true;
2868           break;
2869         default:
2870           return (char *) s;
2871         }
2872       ++s;
2873     }
2874   return (char *) s;
2875 }
2876
2877 static struct keyfield *
2878 key_init (struct keyfield *key)
2879 {
2880   memset (key, 0, sizeof *key);
2881   key->eword = SIZE_MAX;
2882   return key;
2883 }
2884
2885 int
2886 main (int argc, char **argv)
2887 {
2888   struct keyfield *key;
2889   struct keyfield key_buf;
2890   struct keyfield gkey;
2891   char const *s;
2892   int c = 0;
2893   char checkonly = 0;
2894   bool mergeonly = false;
2895   char *random_source = NULL;
2896   bool need_random = false;
2897   size_t nfiles = 0;
2898   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2899   bool obsolete_usage = (posix2_version () < 200112);
2900   char **files;
2901   char *files_from = NULL;
2902   struct Tokens tok;
2903   char const *outfile = NULL;
2904
2905   initialize_main (&argc, &argv);
2906   set_program_name (argv[0]);
2907   setlocale (LC_ALL, "");
2908   bindtextdomain (PACKAGE, LOCALEDIR);
2909   textdomain (PACKAGE);
2910
2911   initialize_exit_failure (SORT_FAILURE);
2912
2913   hard_LC_COLLATE = hard_locale (LC_COLLATE);
2914 #if HAVE_NL_LANGINFO
2915   hard_LC_TIME = hard_locale (LC_TIME);
2916 #endif
2917
2918   /* Get locale's representation of the decimal point.  */
2919   {
2920     struct lconv const *locale = localeconv ();
2921
2922     /* If the locale doesn't define a decimal point, or if the decimal
2923        point is multibyte, use the C locale's decimal point.  FIXME:
2924        add support for multibyte decimal points.  */
2925     decimal_point = to_uchar (locale->decimal_point[0]);
2926     if (! decimal_point || locale->decimal_point[1])
2927       decimal_point = '.';
2928
2929     /* FIXME: add support for multibyte thousands separators.  */
2930     thousands_sep = to_uchar (*locale->thousands_sep);
2931     if (! thousands_sep || locale->thousands_sep[1])
2932       thousands_sep = -1;
2933   }
2934
2935   have_read_stdin = false;
2936   inittables ();
2937
2938   {
2939     size_t i;
2940     static int const sig[] =
2941       {
2942         /* The usual suspects.  */
2943         SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
2944 #ifdef SIGPOLL
2945         SIGPOLL,
2946 #endif
2947 #ifdef SIGPROF
2948         SIGPROF,
2949 #endif
2950 #ifdef SIGVTALRM
2951         SIGVTALRM,
2952 #endif
2953 #ifdef SIGXCPU
2954         SIGXCPU,
2955 #endif
2956 #ifdef SIGXFSZ
2957         SIGXFSZ,
2958 #endif
2959       };
2960     enum { nsigs = sizeof sig / sizeof sig[0] };
2961
2962 #if SA_NOCLDSTOP
2963     struct sigaction act;
2964
2965     sigemptyset (&caught_signals);
2966     for (i = 0; i < nsigs; i++)
2967       {
2968         sigaction (sig[i], NULL, &act);
2969         if (act.sa_handler != SIG_IGN)
2970           sigaddset (&caught_signals, sig[i]);
2971       }
2972
2973     act.sa_handler = sighandler;
2974     act.sa_mask = caught_signals;
2975     act.sa_flags = 0;
2976
2977     for (i = 0; i < nsigs; i++)
2978       if (sigismember (&caught_signals, sig[i]))
2979         sigaction (sig[i], &act, NULL);
2980 #else
2981     for (i = 0; i < nsigs; i++)
2982       if (signal (sig[i], SIG_IGN) != SIG_IGN)
2983         {
2984           signal (sig[i], sighandler);
2985           siginterrupt (sig[i], 1);
2986         }
2987 #endif
2988   }
2989
2990   /* The signal mask is known, so it is safe to invoke exit_cleanup.  */
2991   atexit (exit_cleanup);
2992
2993   gkey.sword = gkey.eword = SIZE_MAX;
2994   gkey.ignore = NULL;
2995   gkey.translate = NULL;
2996   gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false;
2997   gkey.month = gkey.reverse = false;
2998   gkey.skipsblanks = gkey.skipeblanks = false;
2999
3000   files = xnmalloc (argc, sizeof *files);
3001
3002   for (;;)
3003     {
3004       /* Parse an operand as a file after "--" was seen; or if
3005          pedantic and a file was seen, unless the POSIX version
3006          predates 1003.1-2001 and -c was not seen and the operand is
3007          "-o FILE" or "-oFILE".  */
3008       int oi = -1;
3009
3010       if (c == -1
3011           || (posixly_correct && nfiles != 0
3012               && ! (obsolete_usage
3013                     && ! checkonly
3014                     && optind != argc
3015                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
3016                     && (argv[optind][2] || optind + 1 != argc)))
3017           || ((c = getopt_long (argc, argv, short_options,
3018                                 long_options, &oi))
3019               == -1))
3020         {
3021           if (argc <= optind)
3022             break;
3023           files[nfiles++] = argv[optind++];
3024         }
3025       else switch (c)
3026         {
3027         case 1:
3028           key = NULL;
3029           if (optarg[0] == '+')
3030             {
3031               bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
3032                                       && ISDIGIT (argv[optind][1]));
3033               obsolete_usage |= minus_pos_usage & ~posixly_correct;
3034               if (obsolete_usage)
3035                 {
3036                   /* Treat +POS1 [-POS2] as a key if possible; but silently
3037                      treat an operand as a file if it is not a valid +POS1.  */
3038                   key = key_init (&key_buf);
3039                   s = parse_field_count (optarg + 1, &key->sword, NULL);
3040                   if (s && *s == '.')
3041                     s = parse_field_count (s + 1, &key->schar, NULL);
3042                   if (! (key->sword | key->schar))
3043                     key->sword = SIZE_MAX;
3044                   if (! s || *set_ordering (s, key, bl_start))
3045                     key = NULL;
3046                   else
3047                     {
3048                       if (minus_pos_usage)
3049                         {
3050                           char const *optarg1 = argv[optind++];
3051                           s = parse_field_count (optarg1 + 1, &key->eword,
3052                                              N_("invalid number after `-'"));
3053                           if (*s == '.')
3054                             s = parse_field_count (s + 1, &key->echar,
3055                                                N_("invalid number after `.'"));
3056                           if (*set_ordering (s, key, bl_end))
3057                             badfieldspec (optarg1,
3058                                       N_("stray character in field spec"));
3059                         }
3060                       insertkey (key);
3061                     }
3062                 }
3063             }
3064           if (! key)
3065             files[nfiles++] = optarg;
3066           break;
3067
3068         case SORT_OPTION:
3069           c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
3070           /* Fall through. */
3071         case 'b':
3072         case 'd':
3073         case 'f':
3074         case 'g':
3075         case 'i':
3076         case 'M':
3077         case 'n':
3078         case 'r':
3079         case 'R':
3080         case 'V':
3081           {
3082             char str[2];
3083             str[0] = c;
3084             str[1] = '\0';
3085             set_ordering (str, &gkey, bl_both);
3086           }
3087           break;
3088
3089         case CHECK_OPTION:
3090           c = (optarg
3091                ? XARGMATCH ("--check", optarg, check_args, check_types)
3092                : 'c');
3093           /* Fall through.  */
3094         case 'c':
3095         case 'C':
3096           if (checkonly && checkonly != c)
3097             incompatible_options ("cC");
3098           checkonly = c;
3099           break;
3100
3101         case COMPRESS_PROGRAM_OPTION:
3102           if (compress_program && !STREQ (compress_program, optarg))
3103             error (SORT_FAILURE, 0, _("multiple compress programs specified"));
3104           compress_program = optarg;
3105           break;
3106
3107         case FILES0_FROM_OPTION:
3108           files_from = optarg;
3109           break;
3110
3111         case 'k':
3112           key = key_init (&key_buf);
3113
3114           /* Get POS1. */
3115           s = parse_field_count (optarg, &key->sword,
3116                                  N_("invalid number at field start"));
3117           if (! key->sword--)
3118             {
3119               /* Provoke with `sort -k0' */
3120               badfieldspec (optarg, N_("field number is zero"));
3121             }
3122           if (*s == '.')
3123             {
3124               s = parse_field_count (s + 1, &key->schar,
3125                                      N_("invalid number after `.'"));
3126               if (! key->schar--)
3127                 {
3128                   /* Provoke with `sort -k1.0' */
3129                   badfieldspec (optarg, N_("character offset is zero"));
3130                 }
3131             }
3132           if (! (key->sword | key->schar))
3133             key->sword = SIZE_MAX;
3134           s = set_ordering (s, key, bl_start);
3135           if (*s != ',')
3136             {
3137               key->eword = SIZE_MAX;
3138               key->echar = 0;
3139             }
3140           else
3141             {
3142               /* Get POS2. */
3143               s = parse_field_count (s + 1, &key->eword,
3144                                      N_("invalid number after `,'"));
3145               if (! key->eword--)
3146                 {
3147                   /* Provoke with `sort -k1,0' */
3148                   badfieldspec (optarg, N_("field number is zero"));
3149                 }
3150               if (*s == '.')
3151                 s = parse_field_count (s + 1, &key->echar,
3152                                        N_("invalid number after `.'"));
3153               else
3154                 {
3155                   /* `-k 2,3' is equivalent to `+1 -3'.  */
3156                   key->eword++;
3157                 }
3158               s = set_ordering (s, key, bl_end);
3159             }
3160           if (*s)
3161             badfieldspec (optarg, N_("stray character in field spec"));
3162           insertkey (key);
3163           break;
3164
3165         case 'm':
3166           mergeonly = true;
3167           break;
3168
3169         case NMERGE_OPTION:
3170           specify_nmerge (oi, c, optarg);
3171           break;
3172
3173         case 'o':
3174           if (outfile && !STREQ (outfile, optarg))
3175             error (SORT_FAILURE, 0, _("multiple output files specified"));
3176           outfile = optarg;
3177           break;
3178
3179         case RANDOM_SOURCE_OPTION:
3180           if (random_source && !STREQ (random_source, optarg))
3181             error (SORT_FAILURE, 0, _("multiple random sources specified"));
3182           random_source = optarg;
3183           break;
3184
3185         case 's':
3186           stable = true;
3187           break;
3188
3189         case 'S':
3190           specify_sort_size (oi, c, optarg);
3191           break;
3192
3193         case 't':
3194           {
3195             char newtab = optarg[0];
3196             if (! newtab)
3197               error (SORT_FAILURE, 0, _("empty tab"));
3198             if (optarg[1])
3199               {
3200                 if (STREQ (optarg, "\\0"))
3201                   newtab = '\0';
3202                 else
3203                   {
3204                     /* Provoke with `sort -txx'.  Complain about
3205                        "multi-character tab" instead of "multibyte tab", so
3206                        that the diagnostic's wording does not need to be
3207                        changed once multibyte characters are supported.  */
3208                     error (SORT_FAILURE, 0, _("multi-character tab %s"),
3209                            quote (optarg));
3210                   }
3211               }
3212             if (tab != TAB_DEFAULT && tab != newtab)
3213               error (SORT_FAILURE, 0, _("incompatible tabs"));
3214             tab = newtab;
3215           }
3216           break;
3217
3218         case 'T':
3219           add_temp_dir (optarg);
3220           break;
3221
3222         case 'u':
3223           unique = true;
3224           break;
3225
3226         case 'y':
3227           /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
3228              through Solaris 7.  It is also accepted by many non-Solaris
3229              "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
3230              -y is marked as obsolete starting with Solaris 8 (1999), but is
3231              still accepted as of Solaris 10 prerelease (2004).
3232
3233              Solaris 2.5.1 "sort -y 100" reads the input file "100", but
3234              emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
3235              and which in general ignores the argument after "-y" if it
3236              consists entirely of digits (it can even be empty).  */
3237           if (optarg == argv[optind - 1])
3238             {
3239               char const *p;
3240               for (p = optarg; ISDIGIT (*p); p++)
3241                 continue;
3242               optind -= (*p != '\0');
3243             }
3244           break;
3245
3246         case 'z':
3247           eolchar = 0;
3248           break;
3249
3250         case_GETOPT_HELP_CHAR;
3251
3252         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
3253
3254         default:
3255           usage (SORT_FAILURE);
3256         }
3257     }
3258
3259   if (files_from)
3260     {
3261       FILE *stream;
3262
3263       /* When using --files0-from=F, you may not specify any files
3264          on the command-line.  */
3265       if (nfiles)
3266         {
3267           error (0, 0, _("extra operand %s"), quote (files[0]));
3268           fprintf (stderr, "%s\n",
3269                    _("file operands cannot be combined with --files0-from"));
3270           usage (SORT_FAILURE);
3271         }
3272
3273       if (STREQ (files_from, "-"))
3274         stream = stdin;
3275       else
3276         {
3277           stream = fopen (files_from, "r");
3278           if (stream == NULL)
3279             error (SORT_FAILURE, errno, _("cannot open %s for reading"),
3280                    quote (files_from));
3281         }
3282
3283       readtokens0_init (&tok);
3284
3285       if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
3286         error (SORT_FAILURE, 0, _("cannot read file names from %s"),
3287                quote (files_from));
3288
3289       if (tok.n_tok)
3290         {
3291           size_t i;
3292           free (files);
3293           files = tok.tok;
3294           nfiles = tok.n_tok;
3295           for (i = 0; i < nfiles; i++)
3296           {
3297               if (STREQ (files[i], "-"))
3298                 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
3299                                           "no file name of %s allowed"),
3300                        quote (files[i]));
3301               else if (files[i][0] == '\0')
3302                 {
3303                   /* Using the standard `filename:line-number:' prefix here is
3304                      not totally appropriate, since NUL is the separator, not NL,
3305                      but it might be better than nothing.  */
3306                   unsigned long int file_number = i + 1;
3307                   error (SORT_FAILURE, 0,
3308                          _("%s:%lu: invalid zero-length file name"),
3309                          quotearg_colon (files_from), file_number);
3310                 }
3311           }
3312         }
3313       else
3314         error (SORT_FAILURE, 0, _("no input from %s"),
3315                quote (files_from));
3316     }
3317
3318   /* Inheritance of global options to individual keys. */
3319   for (key = keylist; key; key = key->next)
3320     {
3321       if (! (key->ignore
3322              || key->translate
3323              || (key->skipsblanks
3324                  | key->reverse
3325                  | key->skipeblanks
3326                  | key->month
3327                  | key->numeric
3328                  | key->version
3329                  | key->general_numeric
3330                  | key->random)))
3331         {
3332           key->ignore = gkey.ignore;
3333           key->translate = gkey.translate;
3334           key->skipsblanks = gkey.skipsblanks;
3335           key->skipeblanks = gkey.skipeblanks;
3336           key->month = gkey.month;
3337           key->numeric = gkey.numeric;
3338           key->general_numeric = gkey.general_numeric;
3339           key->random = gkey.random;
3340           key->reverse = gkey.reverse;
3341           key->version = gkey.version;
3342         }
3343
3344       need_random |= key->random;
3345     }
3346
3347   if (!keylist && (gkey.ignore
3348                    || gkey.translate
3349                    || (gkey.skipsblanks
3350                        | gkey.skipeblanks
3351                        | gkey.month
3352                        | gkey.numeric
3353                        | gkey.general_numeric
3354                        | gkey.random
3355                        | gkey.version)))
3356     {
3357       insertkey (&gkey);
3358       need_random |= gkey.random;
3359     }
3360
3361   check_ordering_compatibility ();
3362
3363   reverse = gkey.reverse;
3364
3365   if (need_random)
3366     {
3367       randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
3368       if (! randread_source)
3369         die (_("open failed"), random_source);
3370     }
3371
3372   if (temp_dir_count == 0)
3373     {
3374       char const *tmp_dir = getenv ("TMPDIR");
3375       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
3376     }
3377
3378   if (nfiles == 0)
3379     {
3380       static char *minus = "-";
3381       nfiles = 1;
3382       free (files);
3383       files = &minus;
3384     }
3385
3386   /* Need to re-check that we meet the minimum requirement for memory
3387      usage with the final value for NMERGE. */
3388   if (0 < sort_size)
3389     sort_size = MAX (sort_size, MIN_SORT_SIZE);
3390
3391   if (checkonly)
3392     {
3393       if (nfiles > 1)
3394         error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
3395                quote (files[1]), checkonly);
3396
3397       if (outfile)
3398         {
3399           static char opts[] = {0, 'o', 0};
3400           opts[0] = checkonly;
3401           incompatible_options (opts);
3402         }
3403
3404       /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
3405          input is not properly sorted.  */
3406       exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
3407     }
3408
3409   if (mergeonly)
3410     {
3411       struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
3412       size_t i;
3413
3414       for (i = 0; i < nfiles; ++i)
3415         sortfiles[i].name = files[i];
3416
3417       merge (sortfiles, 0, nfiles, outfile);
3418       IF_LINT (free (sortfiles));
3419     }
3420   else
3421     sort (files, nfiles, outfile);
3422
3423   if (have_read_stdin && fclose (stdin) == EOF)
3424     die (_("close failed"), "-");
3425
3426   exit (EXIT_SUCCESS);
3427 }