src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 1988, 1991-2006 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  17
  18    Written December 1988 by Mike Haertel.
  19    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  20    or (US mail) as Mike Haertel c/o Free Software Foundation.
  21
  22    Ørn E. Hansen added NLS support in 1997.  */
  23
  24 #include <config.h>
  25
  26 #include <getopt.h>
  27 #include <sys/types.h>
  28 #include <signal.h>
  29 #include "system.h"
  30 #include "error.h"
  31 #include "hard-locale.h"
  32 #include "inttostr.h"
  33 #include "md5.h"
  34 #include "physmem.h"
  35 #include "posixver.h"
  36 #include "quote.h"
  37 #include "randread.h"
  38 #include "stdio--.h"
  39 #include "stdlib--.h"
  40 #include "strnumcmp.h"
  41 #include "xmemcoll.h"
  42 #include "xmemxfrm.h"
  43 #include "xstrtol.h"
  44
  45 #if HAVE_SYS_RESOURCE_H
  46 # include <sys/resource.h>
  47 #endif
  48 #ifndef RLIMIT_DATA
  49 struct rlimit { size_t rlim_cur; };
  50 # define getrlimit(Resource, Rlp) (-1)
  51 #endif
  52
  53 /* The official name of this program (e.g., no `g' prefix).  */
  54 #define PROGRAM_NAME "sort"
  55
  56 #define AUTHORS "Mike Haertel", "Paul Eggert"
  57
  58 #if HAVE_LANGINFO_CODESET
  59 # include <langinfo.h>
  60 #endif
  61
  62 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
  63    present.  */
  64 #ifndef SA_NOCLDSTOP
  65 # define SA_NOCLDSTOP 0
  66 # define sigprocmask(How, Set, Oset) /* empty */
  67 # define sigset_t int
  68 # if ! HAVE_SIGINTERRUPT
  69 #  define siginterrupt(sig, flag) /* empty */
  70 # endif
  71 #endif
  72
  73 #ifndef STDC_HEADERS
  74 double strtod ();
  75 #endif
  76
  77 #define UCHAR_LIM (UCHAR_MAX + 1)
  78
  79 #ifndef DEFAULT_TMPDIR
  80 # define DEFAULT_TMPDIR "/tmp"
  81 #endif
  82
  83 /* Exit statuses.  */
  84 enum
  85   {
  86     /* POSIX says to exit with status 1 if invoked with -c and the
  87        input is not properly sorted.  */
  88     SORT_OUT_OF_ORDER = 1,
  89
  90     /* POSIX says any other irregular exit must exit with a status
  91        code greater than 1.  */
  92     SORT_FAILURE = 2
  93   };
  94
  95 /* The representation of the decimal point in the current locale.  */
  96 static int decimal_point;
  97
  98 /* Thousands separator; if -1, then there isn't one.  */
  99 static int thousands_sep;
 100
 101 /* Nonzero if the corresponding locales are hard.  */
 102 static bool hard_LC_COLLATE;
 103 #if HAVE_NL_LANGINFO
 104 static bool hard_LC_TIME;
 105 #endif
 106
 107 #define NONZERO(x) ((x) != 0)
 108
 109 /* The kind of blanks for '-b' to skip in various options. */
 110 enum blanktype { bl_start, bl_end, bl_both };
 111
 112 /* The character marking end of line. Default to \n. */
 113 static char eolchar = '\n';
 114
 115 /* Lines are held in core as counted strings. */
 116 struct line
 117 {
 118   char *text;                   /* Text of the line. */
 119   size_t length;                /* Length including final newline. */
 120   char *keybeg;                 /* Start of first key. */
 121   char *keylim;                 /* Limit of first key. */
 122 };
 123
 124 /* Input buffers. */
 125 struct buffer
 126 {
 127   char *buf;                    /* Dynamically allocated buffer,
 128                                    partitioned into 3 regions:
 129                                    - input data;
 130                                    - unused area;
 131                                    - an array of lines, in reverse order.  */
 132   size_t used;                  /* Number of bytes used for input data.  */
 133   size_t nlines;                /* Number of lines in the line array.  */
 134   size_t alloc;                 /* Number of bytes allocated. */
 135   size_t left;                  /* Number of bytes left from previous reads. */
 136   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 137   bool eof;                     /* An EOF has been read.  */
 138 };
 139
 140 struct keyfield
 141 {
 142   size_t sword;                 /* Zero-origin 'word' to start at. */
 143   size_t schar;                 /* Additional characters to skip. */
 144   size_t eword;                 /* Zero-origin first word after field. */
 145   size_t echar;                 /* Additional characters in field. */
 146   bool const *ignore;           /* Boolean array of characters to ignore. */
 147   char const *translate;        /* Translation applied to characters. */
 148   bool skipsblanks;             /* Skip leading blanks when finding start.  */
 149   bool skipeblanks;             /* Skip leading blanks when finding end.  */
 150   bool numeric;                 /* Flag for numeric comparison.  Handle
 151                                    strings of digits with optional decimal
 152                                    point, but no exponential notation. */
 153   bool random;                  /* Sort by random hash of key.  */
 154   bool general_numeric;         /* Flag for general, numeric comparison.
 155                                    Handle numbers in exponential notation. */
 156   bool month;                   /* Flag for comparison by month name. */
 157   bool reverse;                 /* Reverse the sense of comparison. */
 158   struct keyfield *next;        /* Next keyfield to try. */
 159 };
 160
 161 struct month
 162 {
 163   char const *name;
 164   int val;
 165 };
 166
 167 /* The name this program was run with. */
 168 char *program_name;
 169
 170 /* FIXME: None of these tables work with multibyte character sets.
 171    Also, there are many other bugs when handling multibyte characters.
 172    One way to fix this is to rewrite `sort' to use wide characters
 173    internally, but doing this with good performance is a bit
 174    tricky.  */
 175
 176 /* Table of blanks.  */
 177 static bool blanks[UCHAR_LIM];
 178
 179 /* Table of non-printing characters. */
 180 static bool nonprinting[UCHAR_LIM];
 181
 182 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 183 static bool nondictionary[UCHAR_LIM];
 184
 185 /* Translation table folding lower case to upper.  */
 186 static char fold_toupper[UCHAR_LIM];
 187
 188 #define MONTHS_PER_YEAR 12
 189
 190 /* Table mapping month names to integers.
 191    Alphabetic order allows binary search. */
 192 static struct month monthtab[] =
 193 {
 194   {"APR", 4},
 195   {"AUG", 8},
 196   {"DEC", 12},
 197   {"FEB", 2},
 198   {"JAN", 1},
 199   {"JUL", 7},
 200   {"JUN", 6},
 201   {"MAR", 3},
 202   {"MAY", 5},
 203   {"NOV", 11},
 204   {"OCT", 10},
 205   {"SEP", 9}
 206 };
 207
 208 /* During the merge phase, the number of files to merge at once. */
 209 #define NMERGE 16
 210
 211 /* Minimum size for a merge or check buffer.  */
 212 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 213
 214 /* Minimum sort size; the code might not work with smaller sizes.  */
 215 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
 216
 217 /* The number of bytes needed for a merge or check buffer, which can
 218    function relatively efficiently even if it holds only one line.  If
 219    a longer line is seen, this value is increased.  */
 220 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 221
 222 /* The approximate maximum number of bytes of main memory to use, as
 223    specified by the user.  Zero if the user has not specified a size.  */
 224 static size_t sort_size;
 225
 226 /* The guessed size for non-regular files.  */
 227 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 228
 229 /* Array of directory names in which any temporary files are to be created. */
 230 static char const **temp_dirs;
 231
 232 /* Number of temporary directory names used.  */
 233 static size_t temp_dir_count;
 234
 235 /* Number of allocated slots in temp_dirs.  */
 236 static size_t temp_dir_alloc;
 237
 238 /* Flag to reverse the order of all comparisons. */
 239 static bool reverse;
 240
 241 /* Flag for stable sort.  This turns off the last ditch bytewise
 242    comparison of lines, and instead leaves lines in the same order
 243    they were read if all keys compare equal.  */
 244 static bool stable;
 245
 246 /* If TAB has this value, blanks separate fields.  */
 247 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 248
 249 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 250    separated by the empty string between a non-blank character and a blank
 251    character. */
 252 static int tab = TAB_DEFAULT;
 253
 254 /* Flag to remove consecutive duplicate lines from the output.
 255    Only the last of a sequence of equal lines will be output. */
 256 static bool unique;
 257
 258 /* Nonzero if any of the input files are the standard input. */
 259 static bool have_read_stdin;
 260
 261 /* List of key field comparisons to be tried.  */
 262 static struct keyfield *keylist;
 263
 264 static void sortlines_temp (struct line *, size_t, struct line *);
 265
 266 /* Report MESSAGE for FILE, then clean up and exit.
 267    If FILE is null, it represents standard output.  */
 268
 269 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 270 static void
 271 die (char const *message, char const *file)
 272 {
 273   error (0, errno, "%s: %s", message, file ? file : _("standard output"));
 274   exit (SORT_FAILURE);
 275 }
 276
 277 void
 278 usage (int status)
 279 {
 280   if (status != EXIT_SUCCESS)
 281     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 282              program_name);
 283   else
 284     {
 285       printf (_("\
 286 Usage: %s [OPTION]... [FILE]...\n\
 287 "),
 288               program_name);
 289       fputs (_("\
 290 Write sorted concatenation of all FILE(s) to standard output.\n\
 291 \n\
 292 "), stdout);
 293       fputs (_("\
 294 Mandatory arguments to long options are mandatory for short options too.\n\
 295 "), stdout);
 296       fputs (_("\
 297 Ordering options:\n\
 298 \n\
 299 "), stdout);
 300       fputs (_("\
 301   -b, --ignore-leading-blanks  ignore leading blanks\n\
 302   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 303   -f, --ignore-case           fold lower case to upper case characters\n\
 304 "), stdout);
 305       fputs (_("\
 306   -g, --general-numeric-sort  compare according to general numerical value\n\
 307   -i, --ignore-nonprinting    consider only printable characters\n\
 308   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 309   -n, --numeric-sort          compare according to string numerical value\n\
 310   -R, --random-sort           sort by random hash of keys\n\
 311       --random-source=FILE    get random bytes from FILE (default /dev/urandom)\n\
 312   -r, --reverse               reverse the result of comparisons\n\
 313 \n\
 314 "), stdout);
 315       fputs (_("\
 316 Other options:\n\
 317 \n\
 318   -c, --check               check whether input is sorted; do not sort\n\
 319   -k, --key=POS1[,POS2]     start a key at POS1, end it at POS2 (origin 1)\n\
 320   -m, --merge               merge already sorted files; do not sort\n\
 321   -o, --output=FILE         write result to FILE instead of standard output\n\
 322   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 323   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 324 "), stdout);
 325       printf (_("\
 326   -t, --field-separator=SEP  use SEP instead of non-blank to blank transition\n\
 327   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 328                               multiple options specify multiple directories\n\
 329   -u, --unique              with -c, check for strict ordering;\n\
 330                               without -c, output only the first of an equal run\n\
 331 "), DEFAULT_TMPDIR);
 332       fputs (_("\
 333   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 334 "), stdout);
 335       fputs (HELP_OPTION_DESCRIPTION, stdout);
 336       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 337       fputs (_("\
 338 \n\
 339 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 340 in the field; both are origin 1.  If neither -t nor -b is in effect, characters\n\
 341 in a field are counted from the beginning of the preceding whitespace.  OPTS is\n\
 342 one or more single-letter ordering options, which override global ordering\n\
 343 options for that key.  If no key is given, use the entire line as the key.\n\
 344 \n\
 345 SIZE may be followed by the following multiplicative suffixes:\n\
 346 "), stdout);
 347       fputs (_("\
 348 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 349 \n\
 350 With no FILE, or when FILE is -, read standard input.\n\
 351 \n\
 352 *** WARNING ***\n\
 353 The locale specified by the environment affects sort order.\n\
 354 Set LC_ALL=C to get the traditional sort order that uses\n\
 355 native byte values.\n\
 356 "), stdout );
 357       printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 358     }
 359
 360   exit (status);
 361 }
 362
 363 /* For long options that have no equivalent short option, use a
 364    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
 365 enum
 366 {
 367   RANDOM_SOURCE_OPTION = CHAR_MAX + 1
 368 };
 369
 370 static char const short_options[] = "-bcdfgik:mMno:rRsS:t:T:uy:z";
 371
 372 static struct option const long_options[] =
 373 {
 374   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 375   {"check", no_argument, NULL, 'c'},
 376   {"dictionary-order", no_argument, NULL, 'd'},
 377   {"ignore-case", no_argument, NULL, 'f'},
 378   {"general-numeric-sort", no_argument, NULL, 'g'},
 379   {"ignore-nonprinting", no_argument, NULL, 'i'},
 380   {"key", required_argument, NULL, 'k'},
 381   {"merge", no_argument, NULL, 'm'},
 382   {"month-sort", no_argument, NULL, 'M'},
 383   {"numeric-sort", no_argument, NULL, 'n'},
 384   {"random-sort", no_argument, NULL, 'R'},
 385   {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
 386   {"output", required_argument, NULL, 'o'},
 387   {"reverse", no_argument, NULL, 'r'},
 388   {"stable", no_argument, NULL, 's'},
 389   {"buffer-size", required_argument, NULL, 'S'},
 390   {"field-separator", required_argument, NULL, 't'},
 391   {"temporary-directory", required_argument, NULL, 'T'},
 392   {"unique", no_argument, NULL, 'u'},
 393   {"zero-terminated", no_argument, NULL, 'z'},
 394   {GETOPT_HELP_OPTION_DECL},
 395   {GETOPT_VERSION_OPTION_DECL},
 396   {NULL, 0, NULL, 0},
 397 };
 398
 399 /* The set of signals that are caught.  */
 400 static sigset_t caught_signals;
 401
 402 /* The list of temporary files. */
 403 struct tempnode
 404 {
 405   struct tempnode *volatile next;
 406   char name[1];  /* Actual size is 1 + file name length.  */
 407 };
 408 static struct tempnode *volatile temphead;
 409 static struct tempnode *volatile *temptail = &temphead;
 410
 411 /* Clean up any remaining temporary files.  */
 412
 413 static void
 414 cleanup (void)
 415 {
 416   struct tempnode const *node;
 417
 418   for (node = temphead; node; node = node->next)
 419     unlink (node->name);
 420   temphead = NULL;
 421 }
 422
 423 /* Cleanup actions to take when exiting.  */
 424
 425 static void
 426 exit_cleanup (void)
 427 {
 428   if (temphead)
 429     {
 430       /* Clean up any remaining temporary files in a critical section so
 431          that a signal handler does not try to clean them too.  */
 432       sigset_t oldset;
 433       sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 434       cleanup ();
 435       sigprocmask (SIG_SETMASK, &oldset, NULL);
 436     }
 437
 438   close_stdout ();
 439 }
 440
 441 /* Create a new temporary file, returning its newly allocated name.
 442    Store into *PFP a stream open for writing.  */
 443
 444 static char *
 445 create_temp_file (FILE **pfp)
 446 {
 447   static char const slashbase[] = "/sortXXXXXX";
 448   static size_t temp_dir_index;
 449   sigset_t oldset;
 450   int fd;
 451   int saved_errno;
 452   char const *temp_dir = temp_dirs[temp_dir_index];
 453   size_t len = strlen (temp_dir);
 454   struct tempnode *node =
 455     xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
 456   char *file = node->name;
 457
 458   memcpy (file, temp_dir, len);
 459   memcpy (file + len, slashbase, sizeof slashbase);
 460   node->next = NULL;
 461   if (++temp_dir_index == temp_dir_count)
 462     temp_dir_index = 0;
 463
 464   /* Create the temporary file in a critical section, to avoid races.  */
 465   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 466   fd = mkstemp (file);
 467   if (0 <= fd)
 468     {
 469       *temptail = node;
 470       temptail = &node->next;
 471     }
 472   saved_errno = errno;
 473   sigprocmask (SIG_SETMASK, &oldset, NULL);
 474   errno = saved_errno;
 475
 476   if (fd < 0 || (*pfp = fdopen (fd, "w")) == NULL)
 477     die (_("cannot create temporary file"), file);
 478
 479   return file;
 480 }
 481
 482 /* Return a stream for FILE, opened with mode HOW.  A null FILE means
 483    standard output; HOW should be "w".  When opening for input, "-"
 484    means standard input.  To avoid confusion, do not return file
 485    descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
 486    opening an ordinary FILE.  */
 487
 488 static FILE *
 489 xfopen (const char *file, const char *how)
 490 {
 491   FILE *fp;
 492
 493   if (!file)
 494     fp = stdout;
 495   else if (STREQ (file, "-") && *how == 'r')
 496     {
 497       have_read_stdin = true;
 498       fp = stdin;
 499     }
 500   else
 501     {
 502       fp = fopen (file, how);
 503       if (! fp)
 504         die (_("open failed"), file);
 505     }
 506
 507   return fp;
 508 }
 509
 510 /* Close FP, whose name is FILE, and report any errors.  */
 511
 512 static void
 513 xfclose (FILE *fp, char const *file)
 514 {
 515   switch (fileno (fp))
 516     {
 517     case STDIN_FILENO:
 518       /* Allow reading stdin from tty more than once.  */
 519       if (feof (fp))
 520         clearerr (fp);
 521       break;
 522
 523     case STDOUT_FILENO:
 524       /* Don't close stdout just yet.  close_stdout does that.  */
 525       if (fflush (fp) != 0)
 526         die (_("fflush failed"), file);
 527       break;
 528
 529     default:
 530       if (fclose (fp) != 0)
 531         die (_("close failed"), file);
 532       break;
 533     }
 534 }
 535
 536 static void
 537 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
 538 {
 539   if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
 540     die (_("write failed"), output_file);
 541 }
 542
 543 /* Append DIR to the array of temporary directory names.  */
 544 static void
 545 add_temp_dir (char const *dir)
 546 {
 547   if (temp_dir_count == temp_dir_alloc)
 548     temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
 549
 550   temp_dirs[temp_dir_count++] = dir;
 551 }
 552
 553 /* Remove NAME from the list of temporary files.  */
 554
 555 static void
 556 zaptemp (const char *name)
 557 {
 558   struct tempnode *volatile *pnode;
 559   struct tempnode *node;
 560   struct tempnode *next;
 561   sigset_t oldset;
 562   int unlink_status;
 563   int unlink_errno = 0;
 564
 565   for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
 566     continue;
 567
 568   /* Unlink the temporary file in a critical section to avoid races.  */
 569   next = node->next;
 570   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 571   unlink_status = unlink (name);
 572   unlink_errno = errno;
 573   *pnode = next;
 574   sigprocmask (SIG_SETMASK, &oldset, NULL);
 575
 576   if (unlink_status != 0)
 577     error (0, unlink_errno, _("warning: cannot remove: %s"), name);
 578   if (! next)
 579     temptail = pnode;
 580   free (node);
 581 }
 582
 583 #if HAVE_NL_LANGINFO
 584
 585 static int
 586 struct_month_cmp (const void *m1, const void *m2)
 587 {
 588   struct month const *month1 = m1;
 589   struct month const *month2 = m2;
 590   return strcmp (month1->name, month2->name);
 591 }
 592
 593 #endif
 594
 595 /* Initialize the character class tables. */
 596
 597 static void
 598 inittables (void)
 599 {
 600   size_t i;
 601
 602   for (i = 0; i < UCHAR_LIM; ++i)
 603     {
 604       blanks[i] = !! isblank (i);
 605       nonprinting[i] = ! isprint (i);
 606       nondictionary[i] = ! isalnum (i) && ! isblank (i);
 607       fold_toupper[i] = toupper (i);
 608     }
 609
 610 #if HAVE_NL_LANGINFO
 611   /* If we're not in the "C" locale, read different names for months.  */
 612   if (hard_LC_TIME)
 613     {
 614       for (i = 0; i < MONTHS_PER_YEAR; i++)
 615         {
 616           char const *s;
 617           size_t s_len;
 618           size_t j;
 619           char *name;
 620
 621           s = (char *) nl_langinfo (ABMON_1 + i);
 622           s_len = strlen (s);
 623           monthtab[i].name = name = xmalloc (s_len + 1);
 624           monthtab[i].val = i + 1;
 625
 626           for (j = 0; j < s_len; j++)
 627             name[j] = fold_toupper[to_uchar (s[j])];
 628           name[j] = '\0';
 629         }
 630       qsort ((void *) monthtab, MONTHS_PER_YEAR,
 631              sizeof *monthtab, struct_month_cmp);
 632     }
 633 #endif
 634 }
 635
 636 /* Specify the amount of main memory to use when sorting.  */
 637 static void
 638 specify_sort_size (char const *s)
 639 {
 640   uintmax_t n;
 641   char *suffix;
 642   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
 643
 644   /* The default unit is KiB.  */
 645   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
 646     {
 647       if (n <= UINTMAX_MAX / 1024)
 648         n *= 1024;
 649       else
 650         e = LONGINT_OVERFLOW;
 651     }
 652
 653   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
 654   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
 655     switch (suffix[0])
 656       {
 657       case 'b':
 658         e = LONGINT_OK;
 659         break;
 660
 661       case '%':
 662         {
 663           double mem = physmem_total () * n / 100;
 664
 665           /* Use "<", not "<=", to avoid problems with rounding.  */
 666           if (mem < UINTMAX_MAX)
 667             {
 668               n = mem;
 669               e = LONGINT_OK;
 670             }
 671           else
 672             e = LONGINT_OVERFLOW;
 673         }
 674         break;
 675       }
 676
 677   if (e == LONGINT_OK)
 678     {
 679       /* If multiple sort sizes are specified, take the maximum, so
 680          that option order does not matter.  */
 681       if (n < sort_size)
 682         return;
 683
 684       sort_size = n;
 685       if (sort_size == n)
 686         {
 687           sort_size = MAX (sort_size, MIN_SORT_SIZE);
 688           return;
 689         }
 690
 691       e = LONGINT_OVERFLOW;
 692     }
 693
 694   STRTOL_FATAL_ERROR (s, _("sort size"), e);
 695 }
 696
 697 /* Return the default sort size.  */
 698 static size_t
 699 default_sort_size (void)
 700 {
 701   /* Let MEM be available memory or 1/8 of total memory, whichever
 702      is greater.  */
 703   double avail = physmem_available ();
 704   double total = physmem_total ();
 705   double mem = MAX (avail, total / 8);
 706   struct rlimit rlimit;
 707
 708   /* Let SIZE be MEM, but no more than the maximum object size or
 709      system resource limits.  Avoid the MIN macro here, as it is not
 710      quite right when only one argument is floating point.  Don't
 711      bother to check for values like RLIM_INFINITY since in practice
 712      they are not much less than SIZE_MAX.  */
 713   size_t size = SIZE_MAX;
 714   if (mem < size)
 715     size = mem;
 716   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
 717     size = rlimit.rlim_cur;
 718 #ifdef RLIMIT_AS
 719   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
 720     size = rlimit.rlim_cur;
 721 #endif
 722
 723   /* Leave a large safety margin for the above limits, as failure can
 724      occur when they are exceeded.  */
 725   size /= 2;
 726
 727 #ifdef RLIMIT_RSS
 728   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
 729      Exceeding RSS is not fatal, but can be quite slow.  */
 730   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
 731     size = rlimit.rlim_cur / 16 * 15;
 732 #endif
 733
 734   /* Use no less than the minimum.  */
 735   return MAX (size, MIN_SORT_SIZE);
 736 }
 737
 738 /* Return the sort buffer size to use with the input files identified
 739    by FPS and FILES, which are alternate names of the same files.
 740    NFILES gives the number of input files; NFPS may be less.  Assume
 741    that each input line requires LINE_BYTES extra bytes' worth of line
 742    information.  Do not exceed the size bound specified by the user
 743    (or a default size bound, if the user does not specify one).  */
 744
 745 static size_t
 746 sort_buffer_size (FILE *const *fps, size_t nfps,
 747                   char *const *files, size_t nfiles,
 748                   size_t line_bytes)
 749 {
 750   /* A bound on the input size.  If zero, the bound hasn't been
 751      determined yet.  */
 752   static size_t size_bound;
 753
 754   /* In the worst case, each input byte is a newline.  */
 755   size_t worst_case_per_input_byte = line_bytes + 1;
 756
 757   /* Keep enough room for one extra input line and an extra byte.
 758      This extra room might be needed when preparing to read EOF.  */
 759   size_t size = worst_case_per_input_byte + 1;
 760
 761   size_t i;
 762
 763   for (i = 0; i < nfiles; i++)
 764     {
 765       struct stat st;
 766       off_t file_size;
 767       size_t worst_case;
 768
 769       if ((i < nfps ? fstat (fileno (fps[i]), &st)
 770            : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
 771            : stat (files[i], &st))
 772           != 0)
 773         die (_("stat failed"), files[i]);
 774
 775       if (S_ISREG (st.st_mode))
 776         file_size = st.st_size;
 777       else
 778         {
 779           /* The file has unknown size.  If the user specified a sort
 780              buffer size, use that; otherwise, guess the size.  */
 781           if (sort_size)
 782             return sort_size;
 783           file_size = INPUT_FILE_SIZE_GUESS;
 784         }
 785
 786       if (! size_bound)
 787         {
 788           size_bound = sort_size;
 789           if (! size_bound)
 790             size_bound = default_sort_size ();
 791         }
 792
 793       /* Add the amount of memory needed to represent the worst case
 794          where the input consists entirely of newlines followed by a
 795          single non-newline.  Check for overflow.  */
 796       worst_case = file_size * worst_case_per_input_byte + 1;
 797       if (file_size != worst_case / worst_case_per_input_byte
 798           || size_bound - size <= worst_case)
 799         return size_bound;
 800       size += worst_case;
 801     }
 802
 803   return size;
 804 }
 805
 806 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
 807    must be at least sizeof (struct line).  Allocate ALLOC bytes
 808    initially.  */
 809
 810 static void
 811 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
 812 {
 813   /* Ensure that the line array is properly aligned.  If the desired
 814      size cannot be allocated, repeatedly halve it until allocation
 815      succeeds.  The smaller allocation may hurt overall performance,
 816      but that's better than failing.  */
 817   for (;;)
 818     {
 819       alloc += sizeof (struct line) - alloc % sizeof (struct line);
 820       buf->buf = malloc (alloc);
 821       if (buf->buf)
 822         break;
 823       alloc /= 2;
 824       if (alloc <= line_bytes + 1)
 825         xalloc_die ();
 826     }
 827
 828   buf->line_bytes = line_bytes;
 829   buf->alloc = alloc;
 830   buf->used = buf->left = buf->nlines = 0;
 831   buf->eof = false;
 832 }
 833
 834 /* Return one past the limit of the line array.  */
 835
 836 static inline struct line *
 837 buffer_linelim (struct buffer const *buf)
 838 {
 839   return (struct line *) (buf->buf + buf->alloc);
 840 }
 841
 842 /* Return a pointer to the first character of the field specified
 843    by KEY in LINE. */
 844
 845 static char *
 846 begfield (const struct line *line, const struct keyfield *key)
 847 {
 848   char *ptr = line->text, *lim = ptr + line->length - 1;
 849   size_t sword = key->sword;
 850   size_t schar = key->schar;
 851   size_t remaining_bytes;
 852
 853   /* The leading field separator itself is included in a field when -t
 854      is absent.  */
 855
 856   if (tab != TAB_DEFAULT)
 857     while (ptr < lim && sword--)
 858       {
 859         while (ptr < lim && *ptr != tab)
 860           ++ptr;
 861         if (ptr < lim)
 862           ++ptr;
 863       }
 864   else
 865     while (ptr < lim && sword--)
 866       {
 867         while (ptr < lim && blanks[to_uchar (*ptr)])
 868           ++ptr;
 869         while (ptr < lim && !blanks[to_uchar (*ptr)])
 870           ++ptr;
 871       }
 872
 873   if (key->skipsblanks)
 874     while (ptr < lim && blanks[to_uchar (*ptr)])
 875       ++ptr;
 876
 877   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
 878   remaining_bytes = lim - ptr;
 879   if (schar < remaining_bytes)
 880     ptr += schar;
 881   else
 882     ptr = lim;
 883
 884   return ptr;
 885 }
 886
 887 /* Return the limit of (a pointer to the first character after) the field
 888    in LINE specified by KEY. */
 889
 890 static char *
 891 limfield (const struct line *line, const struct keyfield *key)
 892 {
 893   char *ptr = line->text, *lim = ptr + line->length - 1;
 894   size_t eword = key->eword, echar = key->echar;
 895   size_t remaining_bytes;
 896
 897   /* Move PTR past EWORD fields or to one past the last byte on LINE,
 898      whichever comes first.  If there are more than EWORD fields, leave
 899      PTR pointing at the beginning of the field having zero-based index,
 900      EWORD.  If a delimiter character was specified (via -t), then that
 901      `beginning' is the first character following the delimiting TAB.
 902      Otherwise, leave PTR pointing at the first `blank' character after
 903      the preceding field.  */
 904   if (tab != TAB_DEFAULT)
 905     while (ptr < lim && eword--)
 906       {
 907         while (ptr < lim && *ptr != tab)
 908           ++ptr;
 909         if (ptr < lim && (eword | echar))
 910           ++ptr;
 911       }
 912   else
 913     while (ptr < lim && eword--)
 914       {
 915         while (ptr < lim && blanks[to_uchar (*ptr)])
 916           ++ptr;
 917         while (ptr < lim && !blanks[to_uchar (*ptr)])
 918           ++ptr;
 919       }
 920
 921 #ifdef POSIX_UNSPECIFIED
 922   /* The following block of code makes GNU sort incompatible with
 923      standard Unix sort, so it's ifdef'd out for now.
 924      The POSIX spec isn't clear on how to interpret this.
 925      FIXME: request clarification.
 926
 927      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
 928      Date: Thu, 30 May 96 12:20:41 -0400
 929      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
 930
 931      [...]I believe I've found another bug in `sort'.
 932
 933      $ cat /tmp/sort.in
 934      a b c 2 d
 935      pq rs 1 t
 936      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
 937      a b c 2 d
 938      pq rs 1 t
 939      $ /bin/sort -k1.7,1.7 </tmp/sort.in
 940      pq rs 1 t
 941      a b c 2 d
 942
 943      Unix sort produced the answer I expected: sort on the single character
 944      in column 7.  GNU sort produced different results, because it disagrees
 945      on the interpretation of the key-end spec "M.N".  Unix sort reads this
 946      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
 947      "skip M-1 fields, then either N-1 characters or the rest of the current
 948      field, whichever comes first".  This extra clause applies only to
 949      key-ends, not key-starts.
 950      */
 951
 952   /* Make LIM point to the end of (one byte past) the current field.  */
 953   if (tab != TAB_DEFAULT)
 954     {
 955       char *newlim;
 956       newlim = memchr (ptr, tab, lim - ptr);
 957       if (newlim)
 958         lim = newlim;
 959     }
 960   else
 961     {
 962       char *newlim;
 963       newlim = ptr;
 964       while (newlim < lim && blanks[to_uchar (*newlim)])
 965         ++newlim;
 966       while (newlim < lim && !blanks[to_uchar (*newlim)])
 967         ++newlim;
 968       lim = newlim;
 969     }
 970 #endif
 971
 972   /* If we're ignoring leading blanks when computing the End
 973      of the field, don't start counting bytes until after skipping
 974      past any leading blanks. */
 975   if (key->skipeblanks)
 976     while (ptr < lim && blanks[to_uchar (*ptr)])
 977       ++ptr;
 978
 979   /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
 980   remaining_bytes = lim - ptr;
 981   if (echar < remaining_bytes)
 982     ptr += echar;
 983   else
 984     ptr = lim;
 985
 986   return ptr;
 987 }
 988
 989 /* Fill BUF reading from FP, moving buf->left bytes from the end
 990    of buf->buf to the beginning first.  If EOF is reached and the
 991    file wasn't terminated by a newline, supply one.  Set up BUF's line
 992    table too.  FILE is the name of the file corresponding to FP.
 993    Return true if some input was read.  */
 994
 995 static bool
 996 fillbuf (struct buffer *buf, FILE *fp, char const *file)
 997 {
 998   struct keyfield const *key = keylist;
 999   char eol = eolchar;
1000   size_t line_bytes = buf->line_bytes;
1001   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1002
1003   if (buf->eof)
1004     return false;
1005
1006   if (buf->used != buf->left)
1007     {
1008       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1009       buf->used = buf->left;
1010       buf->nlines = 0;
1011     }
1012
1013   for (;;)
1014     {
1015       char *ptr = buf->buf + buf->used;
1016       struct line *linelim = buffer_linelim (buf);
1017       struct line *line = linelim - buf->nlines;
1018       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1019       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1020
1021       while (line_bytes + 1 < avail)
1022         {
1023           /* Read as many bytes as possible, but do not read so many
1024              bytes that there might not be enough room for the
1025              corresponding line array.  The worst case is when the
1026              rest of the input file consists entirely of newlines,
1027              except that the last byte is not a newline.  */
1028           size_t readsize = (avail - 1) / (line_bytes + 1);
1029           size_t bytes_read = fread (ptr, 1, readsize, fp);
1030           char *ptrlim = ptr + bytes_read;
1031           char *p;
1032           avail -= bytes_read;
1033
1034           if (bytes_read != readsize)
1035             {
1036               if (ferror (fp))
1037                 die (_("read failed"), file);
1038               if (feof (fp))
1039                 {
1040                   buf->eof = true;
1041                   if (buf->buf == ptrlim)
1042                     return false;
1043                   if (ptrlim[-1] != eol)
1044                     *ptrlim++ = eol;
1045                 }
1046             }
1047
1048           /* Find and record each line in the just-read input.  */
1049           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1050             {
1051               ptr = p + 1;
1052               line--;
1053               line->text = line_start;
1054               line->length = ptr - line_start;
1055               mergesize = MAX (mergesize, line->length);
1056               avail -= line_bytes;
1057
1058               if (key)
1059                 {
1060                   /* Precompute the position of the first key for
1061                      efficiency.  */
1062                   line->keylim = (key->eword == SIZE_MAX
1063                                   ? p
1064                                   : limfield (line, key));
1065
1066                   if (key->sword != SIZE_MAX)
1067                     line->keybeg = begfield (line, key);
1068                   else
1069                     {
1070                       if (key->skipsblanks)
1071                         while (blanks[to_uchar (*line_start)])
1072                           line_start++;
1073                       line->keybeg = line_start;
1074                     }
1075                 }
1076
1077               line_start = ptr;
1078             }
1079
1080           ptr = ptrlim;
1081           if (buf->eof)
1082             break;
1083         }
1084
1085       buf->used = ptr - buf->buf;
1086       buf->nlines = buffer_linelim (buf) - line;
1087       if (buf->nlines != 0)
1088         {
1089           buf->left = ptr - line_start;
1090           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1091           return true;
1092         }
1093
1094       /* The current input line is too long to fit in the buffer.
1095          Double the buffer size and try again.  */
1096       buf->buf = X2REALLOC (buf->buf, &buf->alloc);
1097     }
1098 }
1099
1100 /* Compare strings A and B as numbers without explicitly converting them to
1101    machine numbers.  Comparatively slow for short strings, but asymptotically
1102    hideously fast. */
1103
1104 static int
1105 numcompare (const char *a, const char *b)
1106 {
1107   while (blanks[to_uchar (*a)])
1108     a++;
1109   while (blanks[to_uchar (*b)])
1110     b++;
1111
1112   return strnumcmp (a, b, decimal_point, thousands_sep);
1113 }
1114
1115 static int
1116 general_numcompare (const char *sa, const char *sb)
1117 {
1118   /* FIXME: add option to warn about failed conversions.  */
1119   /* FIXME: maybe add option to try expensive FP conversion
1120      only if A and B can't be compared more cheaply/accurately.  */
1121
1122   char *ea;
1123   char *eb;
1124   double a = strtod (sa, &ea);
1125   double b = strtod (sb, &eb);
1126
1127   /* Put conversion errors at the start of the collating sequence.  */
1128   if (sa == ea)
1129     return sb == eb ? 0 : -1;
1130   if (sb == eb)
1131     return 1;
1132
1133   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1134      conversion errors but before numbers; sort them by internal
1135      bit-pattern, for lack of a more portable alternative.  */
1136   return (a < b ? -1
1137           : a > b ? 1
1138           : a == b ? 0
1139           : b == b ? -1
1140           : a == a ? 1
1141           : memcmp ((char *) &a, (char *) &b, sizeof a));
1142 }
1143
1144 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1145    Return 0 if the name in S is not recognized.  */
1146
1147 static int
1148 getmonth (char const *month, size_t len)
1149 {
1150   size_t lo = 0;
1151   size_t hi = MONTHS_PER_YEAR;
1152   char const *monthlim = month + len;
1153
1154   for (;;)
1155     {
1156       if (month == monthlim)
1157         return 0;
1158       if (!blanks[to_uchar (*month)])
1159         break;
1160       ++month;
1161     }
1162
1163   do
1164     {
1165       size_t ix = (lo + hi) / 2;
1166       char const *m = month;
1167       char const *n = monthtab[ix].name;
1168
1169       for (;; m++, n++)
1170         {
1171           if (!*n)
1172             return monthtab[ix].val;
1173           if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1174             {
1175               hi = ix;
1176               break;
1177             }
1178           else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1179             {
1180               lo = ix + 1;
1181               break;
1182             }
1183         }
1184     }
1185   while (lo < hi);
1186
1187   return 0;
1188 }
1189
1190 /* A source of random data.  */
1191 static struct randread_source *randread_source;
1192
1193 /* Return the Ith randomly-generated state.  The caller must invoke
1194    random_state (H) for all H less than I before invoking random_state
1195    (I).  */
1196
1197 static struct md5_ctx
1198 random_state (size_t i)
1199 {
1200   /* An array of states resulting from the random data, and counts of
1201      its used and allocated members.  */
1202   static struct md5_ctx *state;
1203   static size_t used;
1204   static size_t allocated;
1205
1206   struct md5_ctx *s = &state[i];
1207
1208   if (used <= i)
1209     {
1210       unsigned char buf[MD5_DIGEST_SIZE];
1211
1212       used++;
1213
1214       if (allocated <= i)
1215         {
1216           state = X2NREALLOC (state, &allocated);
1217           s = &state[i];
1218         }
1219
1220       randread (randread_source, buf, sizeof buf);
1221       md5_init_ctx (s);
1222       md5_process_bytes (buf, sizeof buf, s);
1223     }
1224
1225   return *s;
1226 }
1227
1228 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1229    with length LENGTHB.  Return negative if less, zero if equal,
1230    positive if greater.  */
1231
1232 static int
1233 cmp_hashes (char const *texta, size_t lena,
1234             char const *textb, size_t lenb)
1235 {
1236   /* Try random hashes until a pair of hashes disagree.  But if the
1237      first pair of random hashes agree, check whether the keys are
1238      identical and if so report no difference.  */
1239   int diff;
1240   size_t i;
1241   for (i = 0; ; i++)
1242     {
1243       uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
1244       struct md5_ctx s[2];
1245       s[0] = s[1] = random_state (i);
1246       md5_process_bytes (texta, lena, &s[0]);  md5_finish_ctx (&s[0], dig[0]);
1247       md5_process_bytes (textb, lenb, &s[1]);  md5_finish_ctx (&s[1], dig[1]);
1248       diff = memcmp (dig[0], dig[1], sizeof dig[0]);
1249       if (diff != 0)
1250         break;
1251       if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
1252         break;
1253     }
1254
1255   return diff;
1256 }
1257
1258 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1259    using one or more random hash functions.  */
1260
1261 static int
1262 compare_random (char *restrict texta, size_t lena,
1263                 char *restrict textb, size_t lenb)
1264 {
1265   int diff;
1266
1267   if (! hard_LC_COLLATE)
1268     diff = cmp_hashes (texta, lena, textb, lenb);
1269   else
1270     {
1271       /* Transform the text into the basis of comparison, so that byte
1272          strings that would otherwise considered to be equal are
1273          considered equal here even if their bytes differ.  */
1274
1275       char *buf = NULL;
1276       char stackbuf[4000];
1277       size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
1278       bool a_fits = tlena <= sizeof stackbuf;
1279       size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
1280                                (a_fits ? sizeof stackbuf - tlena : 0),
1281                                textb, lenb);
1282
1283       if (a_fits && tlena + tlenb <= sizeof stackbuf)
1284         buf = stackbuf;
1285       else
1286         {
1287           /* Adding 1 to the buffer size lets xmemxfrm run a bit
1288              faster by avoiding the need for an extra buffer copy.  */
1289           buf = xmalloc (tlena + tlenb + 1);
1290           xmemxfrm (buf, tlena + 1, texta, lena);
1291           xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
1292         }
1293
1294       diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
1295
1296       if (buf != stackbuf)
1297         free (buf);
1298     }
1299
1300   return diff;
1301 }
1302
1303 /* Compare two lines A and B trying every key in sequence until there
1304    are no more keys or a difference is found. */
1305
1306 static int
1307 keycompare (const struct line *a, const struct line *b)
1308 {
1309   struct keyfield const *key = keylist;
1310
1311   /* For the first iteration only, the key positions have been
1312      precomputed for us. */
1313   char *texta = a->keybeg;
1314   char *textb = b->keybeg;
1315   char *lima = a->keylim;
1316   char *limb = b->keylim;
1317
1318   int diff;
1319
1320   for (;;)
1321     {
1322       char const *translate = key->translate;
1323       bool const *ignore = key->ignore;
1324
1325       /* Find the lengths. */
1326       size_t lena = lima <= texta ? 0 : lima - texta;
1327       size_t lenb = limb <= textb ? 0 : limb - textb;
1328
1329       /* Actually compare the fields. */
1330
1331       if (key->random)
1332         diff = compare_random (texta, lena, textb, lenb);
1333       else if (key->numeric | key->general_numeric)
1334         {
1335           char savea = *lima, saveb = *limb;
1336
1337           *lima = *limb = '\0';
1338           diff = ((key->numeric ? numcompare : general_numcompare)
1339                   (texta, textb));
1340           *lima = savea, *limb = saveb;
1341         }
1342       else if (key->month)
1343         diff = getmonth (texta, lena) - getmonth (textb, lenb);
1344       /* Sorting like this may become slow, so in a simple locale the user
1345          can select a faster sort that is similar to ascii sort.  */
1346       else if (hard_LC_COLLATE)
1347         {
1348           if (ignore || translate)
1349             {
1350               char buf[4000];
1351               size_t size = lena + 1 + lenb + 1;
1352               char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1353               char *copy_b = copy_a + lena + 1;
1354               size_t new_len_a, new_len_b, i;
1355
1356               /* Ignore and/or translate chars before comparing.  */
1357               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1358                 {
1359                   if (i < lena)
1360                     {
1361                       copy_a[new_len_a] = (translate
1362                                            ? translate[to_uchar (texta[i])]
1363                                            : texta[i]);
1364                       if (!ignore || !ignore[to_uchar (texta[i])])
1365                         ++new_len_a;
1366                     }
1367                   if (i < lenb)
1368                     {
1369                       copy_b[new_len_b] = (translate
1370                                            ? translate[to_uchar (textb[i])]
1371                                            : textb [i]);
1372                       if (!ignore || !ignore[to_uchar (textb[i])])
1373                         ++new_len_b;
1374                     }
1375                 }
1376
1377               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1378
1379               if (sizeof buf < size)
1380                 free (copy_a);
1381             }
1382           else if (lena == 0)
1383             diff = - NONZERO (lenb);
1384           else if (lenb == 0)
1385             goto greater;
1386           else
1387             diff = xmemcoll (texta, lena, textb, lenb);
1388         }
1389       else if (ignore)
1390         {
1391 #define CMP_WITH_IGNORE(A, B)                                           \
1392   do                                                                    \
1393     {                                                                   \
1394           for (;;)                                                      \
1395             {                                                           \
1396               while (texta < lima && ignore[to_uchar (*texta)])         \
1397                 ++texta;                                                \
1398               while (textb < limb && ignore[to_uchar (*textb)])         \
1399                 ++textb;                                                \
1400               if (! (texta < lima && textb < limb))                     \
1401                 break;                                                  \
1402               diff = to_uchar (A) - to_uchar (B);                       \
1403               if (diff)                                                 \
1404                 goto not_equal;                                         \
1405               ++texta;                                                  \
1406               ++textb;                                                  \
1407             }                                                           \
1408                                                                         \
1409           diff = (texta < lima) - (textb < limb);                       \
1410     }                                                                   \
1411   while (0)
1412
1413           if (translate)
1414             CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1415                              translate[to_uchar (*textb)]);
1416           else
1417             CMP_WITH_IGNORE (*texta, *textb);
1418         }
1419       else if (lena == 0)
1420         diff = - NONZERO (lenb);
1421       else if (lenb == 0)
1422         goto greater;
1423       else
1424         {
1425           if (translate)
1426             {
1427               while (texta < lima && textb < limb)
1428                 {
1429                   diff = (to_uchar (translate[to_uchar (*texta++)])
1430                           - to_uchar (translate[to_uchar (*textb++)]));
1431                   if (diff)
1432                     goto not_equal;
1433                 }
1434             }
1435           else
1436             {
1437               diff = memcmp (texta, textb, MIN (lena, lenb));
1438               if (diff)
1439                 goto not_equal;
1440             }
1441           diff = lena < lenb ? -1 : lena != lenb;
1442         }
1443
1444       if (diff)
1445         goto not_equal;
1446
1447       key = key->next;
1448       if (! key)
1449         break;
1450
1451       /* Find the beginning and limit of the next field.  */
1452       if (key->eword != SIZE_MAX)
1453         lima = limfield (a, key), limb = limfield (b, key);
1454       else
1455         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1456
1457       if (key->sword != SIZE_MAX)
1458         texta = begfield (a, key), textb = begfield (b, key);
1459       else
1460         {
1461           texta = a->text, textb = b->text;
1462           if (key->skipsblanks)
1463             {
1464               while (texta < lima && blanks[to_uchar (*texta)])
1465                 ++texta;
1466               while (textb < limb && blanks[to_uchar (*textb)])
1467                 ++textb;
1468             }
1469         }
1470     }
1471
1472   return 0;
1473
1474  greater:
1475   diff = 1;
1476  not_equal:
1477   return key->reverse ? -diff : diff;
1478 }
1479
1480 /* Compare two lines A and B, returning negative, zero, or positive
1481    depending on whether A compares less than, equal to, or greater than B. */
1482
1483 static int
1484 compare (const struct line *a, const struct line *b)
1485 {
1486   int diff;
1487   size_t alen, blen;
1488
1489   /* First try to compare on the specified keys (if any).
1490      The only two cases with no key at all are unadorned sort,
1491      and unadorned sort -r. */
1492   if (keylist)
1493     {
1494       diff = keycompare (a, b);
1495       if (diff | unique | stable)
1496         return diff;
1497     }
1498
1499   /* If the keys all compare equal (or no keys were specified)
1500      fall through to the default comparison.  */
1501   alen = a->length - 1, blen = b->length - 1;
1502
1503   if (alen == 0)
1504     diff = - NONZERO (blen);
1505   else if (blen == 0)
1506     diff = 1;
1507   else if (hard_LC_COLLATE)
1508     diff = xmemcoll (a->text, alen, b->text, blen);
1509   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1510     diff = alen < blen ? -1 : alen != blen;
1511
1512   return reverse ? -diff : diff;
1513 }
1514
1515 /* Check that the lines read from FILE_NAME come in order.  Print a
1516    diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1517    false if they are not in order.  Otherwise, print no diagnostic
1518    and return true.  */
1519
1520 static bool
1521 check (char const *file_name)
1522 {
1523   FILE *fp = xfopen (file_name, "r");
1524   struct buffer buf;            /* Input buffer. */
1525   struct line temp;             /* Copy of previous line. */
1526   size_t alloc = 0;
1527   uintmax_t line_number = 0;
1528   struct keyfield const *key = keylist;
1529   bool nonunique = ! unique;
1530   bool ordered = true;
1531
1532   initbuf (&buf, sizeof (struct line),
1533            MAX (merge_buffer_size, sort_size));
1534   temp.text = NULL;
1535
1536   while (fillbuf (&buf, fp, file_name))
1537     {
1538       struct line const *line = buffer_linelim (&buf);
1539       struct line const *linebase = line - buf.nlines;
1540
1541       /* Make sure the line saved from the old buffer contents is
1542          less than or equal to the first line of the new buffer. */
1543       if (alloc && nonunique <= compare (&temp, line - 1))
1544         {
1545         found_disorder:
1546           {
1547             struct line const *disorder_line = line - 1;
1548             uintmax_t disorder_line_number =
1549               buffer_linelim (&buf) - disorder_line + line_number;
1550             char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1551             fprintf (stderr, _("%s: %s:%s: disorder: "),
1552                      program_name, file_name,
1553                      umaxtostr (disorder_line_number, hr_buf));
1554             write_bytes (disorder_line->text, disorder_line->length, stderr,
1555                          _("standard error"));
1556             ordered = false;
1557             break;
1558           }
1559         }
1560
1561       /* Compare each line in the buffer with its successor.  */
1562       while (linebase < --line)
1563         if (nonunique <= compare (line, line - 1))
1564           goto found_disorder;
1565
1566       line_number += buf.nlines;
1567
1568       /* Save the last line of the buffer.  */
1569       if (alloc < line->length)
1570         {
1571           do
1572             {
1573               alloc *= 2;
1574               if (! alloc)
1575                 {
1576                   alloc = line->length;
1577                   break;
1578                 }
1579             }
1580           while (alloc < line->length);
1581
1582           temp.text = xrealloc (temp.text, alloc);
1583         }
1584       memcpy (temp.text, line->text, line->length);
1585       temp.length = line->length;
1586       if (key)
1587         {
1588           temp.keybeg = temp.text + (line->keybeg - line->text);
1589           temp.keylim = temp.text + (line->keylim - line->text);
1590         }
1591     }
1592
1593   xfclose (fp, file_name);
1594   free (buf.buf);
1595   free (temp.text);
1596   return ordered;
1597 }
1598
1599 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
1600    files (all of which are at the start of the FILES array), and
1601    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
1602    Close input and output files before returning.
1603    OUTPUT_FILE gives the name of the output file.  If it is NULL,
1604    the output file is standard output.  If OFP is NULL, the output
1605    file has not been opened yet (or written to, if standard output).  */
1606
1607 static void
1608 mergefps (char **files, size_t ntemps, size_t nfiles,
1609           FILE *ofp, char const *output_file)
1610 {
1611   FILE *fps[NMERGE];            /* Input streams for each file.  */
1612   struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1613   struct line saved;            /* Saved line storage for unique check. */
1614   struct line const *savedline = NULL;
1615                                 /* &saved if there is a saved line. */
1616   size_t savealloc = 0;         /* Size allocated for the saved line. */
1617   struct line const *cur[NMERGE]; /* Current line in each line table. */
1618   struct line const *base[NMERGE]; /* Base of each line table.  */
1619   size_t ord[NMERGE];           /* Table representing a permutation of fps,
1620                                    such that cur[ord[0]] is the smallest line
1621                                    and will be next output. */
1622   size_t i;
1623   size_t j;
1624   size_t t;
1625   struct keyfield const *key = keylist;
1626   saved.text = NULL;
1627
1628   /* Read initial lines from each input file. */
1629   for (i = 0; i < nfiles; )
1630     {
1631       fps[i] = xfopen (files[i], "r");
1632       initbuf (&buffer[i], sizeof (struct line),
1633                MAX (merge_buffer_size, sort_size / nfiles));
1634       if (fillbuf (&buffer[i], fps[i], files[i]))
1635         {
1636           struct line const *linelim = buffer_linelim (&buffer[i]);
1637           cur[i] = linelim - 1;
1638           base[i] = linelim - buffer[i].nlines;
1639           i++;
1640         }
1641       else
1642         {
1643           /* fps[i] is empty; eliminate it from future consideration.  */
1644           xfclose (fps[i], files[i]);
1645           if (i < ntemps)
1646             {
1647               ntemps--;
1648               zaptemp (files[i]);
1649             }
1650           free (buffer[i].buf);
1651           --nfiles;
1652           for (j = i; j < nfiles; ++j)
1653             files[j] = files[j + 1];
1654         }
1655     }
1656
1657   if (! ofp)
1658     ofp = xfopen (output_file, "w");
1659
1660   /* Set up the ord table according to comparisons among input lines.
1661      Since this only reorders two items if one is strictly greater than
1662      the other, it is stable. */
1663   for (i = 0; i < nfiles; ++i)
1664     ord[i] = i;
1665   for (i = 1; i < nfiles; ++i)
1666     if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
1667       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1668
1669   /* Repeatedly output the smallest line until no input remains. */
1670   while (nfiles)
1671     {
1672       struct line const *smallest = cur[ord[0]];
1673
1674       /* If uniquified output is turned on, output only the first of
1675          an identical series of lines. */
1676       if (unique)
1677         {
1678           if (savedline && compare (savedline, smallest))
1679             {
1680               savedline = NULL;
1681               write_bytes (saved.text, saved.length, ofp, output_file);
1682             }
1683           if (!savedline)
1684             {
1685               savedline = &saved;
1686               if (savealloc < smallest->length)
1687                 {
1688                   do
1689                     if (! savealloc)
1690                       {
1691                         savealloc = smallest->length;
1692                         break;
1693                       }
1694                   while ((savealloc *= 2) < smallest->length);
1695
1696                   saved.text = xrealloc (saved.text, savealloc);
1697                 }
1698               saved.length = smallest->length;
1699               memcpy (saved.text, smallest->text, saved.length);
1700               if (key)
1701                 {
1702                   saved.keybeg =
1703                     saved.text + (smallest->keybeg - smallest->text);
1704                   saved.keylim =
1705                     saved.text + (smallest->keylim - smallest->text);
1706                 }
1707             }
1708         }
1709       else
1710         write_bytes (smallest->text, smallest->length, ofp, output_file);
1711
1712       /* Check if we need to read more lines into core. */
1713       if (base[ord[0]] < smallest)
1714         cur[ord[0]] = smallest - 1;
1715       else
1716         {
1717           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]]))
1718             {
1719               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
1720               cur[ord[0]] = linelim - 1;
1721               base[ord[0]] = linelim - buffer[ord[0]].nlines;
1722             }
1723           else
1724             {
1725               /* We reached EOF on fps[ord[0]].  */
1726               for (i = 1; i < nfiles; ++i)
1727                 if (ord[i] > ord[0])
1728                   --ord[i];
1729               --nfiles;
1730               xfclose (fps[ord[0]], files[ord[0]]);
1731               if (ord[0] < ntemps)
1732                 {
1733                   ntemps--;
1734                   zaptemp (files[ord[0]]);
1735                 }
1736               free (buffer[ord[0]].buf);
1737               for (i = ord[0]; i < nfiles; ++i)
1738                 {
1739                   fps[i] = fps[i + 1];
1740                   files[i] = files[i + 1];
1741                   buffer[i] = buffer[i + 1];
1742                   cur[i] = cur[i + 1];
1743                   base[i] = base[i + 1];
1744                 }
1745               for (i = 0; i < nfiles; ++i)
1746                 ord[i] = ord[i + 1];
1747               continue;
1748             }
1749         }
1750
1751       /* The new line just read in may be larger than other lines
1752          already in main memory; push it back in the queue until we
1753          encounter a line larger than it.  Optimize for the common
1754          case where the new line is smallest.  */
1755       {
1756         size_t lo = 1;
1757         size_t hi = nfiles;
1758         size_t probe = lo;
1759         size_t ord0 = ord[0];
1760         size_t count_of_smaller_lines;
1761
1762         while (lo < hi)
1763           {
1764             int cmp = compare (cur[ord0], cur[ord[probe]]);
1765             if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
1766               hi = probe;
1767             else
1768               lo = probe + 1;
1769             probe = (lo + hi) / 2;
1770           }
1771
1772         count_of_smaller_lines = lo - 1;
1773         for (j = 0; j < count_of_smaller_lines; j++)
1774           ord[j] = ord[j + 1];
1775         ord[count_of_smaller_lines] = ord0;
1776       }
1777     }
1778
1779   if (unique && savedline)
1780     {
1781       write_bytes (saved.text, saved.length, ofp, output_file);
1782       free (saved.text);
1783     }
1784
1785   xfclose (ofp, output_file);
1786 }
1787
1788 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1789    and HI (with NHI members).  T, LO, and HI point just past their
1790    respective arrays, and the arrays are in reverse order.  NLO and
1791    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
1792
1793 static inline void
1794 mergelines (struct line *t,
1795             struct line const *lo, size_t nlo,
1796             struct line const *hi, size_t nhi)
1797 {
1798   for (;;)
1799     if (compare (lo - 1, hi - 1) <= 0)
1800       {
1801         *--t = *--lo;
1802         if (! --nlo)
1803           {
1804             /* HI - NHI equalled T - (NLO + NHI) when this function
1805                began.  Therefore HI must equal T now, and there is no
1806                need to copy from HI to T.  */
1807             return;
1808           }
1809       }
1810     else
1811       {
1812         *--t = *--hi;
1813         if (! --nhi)
1814           {
1815             do
1816               *--t = *--lo;
1817             while (--nlo);
1818
1819             return;
1820           }
1821       }
1822 }
1823
1824 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1825    NLINES must be at least 2.
1826    The input and output arrays are in reverse order, and LINES and
1827    TEMP point just past the end of their respective arrays.
1828
1829    Use a recursive divide-and-conquer algorithm, in the style
1830    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
1831    the optimization suggested by exercise 5.2.4-10; this requires room
1832    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
1833    writes that this memory optimization was originally published by
1834    D. A. Bell, Comp J. 1 (1958), 75.  */
1835
1836 static void
1837 sortlines (struct line *lines, size_t nlines, struct line *temp)
1838 {
1839   if (nlines == 2)
1840     {
1841       if (0 < compare (&lines[-1], &lines[-2]))
1842         {
1843           struct line tmp = lines[-1];
1844           lines[-1] = lines[-2];
1845           lines[-2] = tmp;
1846         }
1847     }
1848   else
1849     {
1850       size_t nlo = nlines / 2;
1851       size_t nhi = nlines - nlo;
1852       struct line *lo = lines;
1853       struct line *hi = lines - nlo;
1854       struct line *sorted_lo = temp;
1855
1856       sortlines (hi, nhi, temp);
1857       if (1 < nlo)
1858         sortlines_temp (lo, nlo, sorted_lo);
1859       else
1860         sorted_lo[-1] = lo[-1];
1861
1862       mergelines (lines, sorted_lo, nlo, hi, nhi);
1863     }
1864 }
1865
1866 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1867    rather than sorting in place.  */
1868
1869 static void
1870 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
1871 {
1872   if (nlines == 2)
1873     {
1874       /* Declare `swap' as int, not bool, to work around a bug
1875          <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
1876          in the IBM xlc 6.0.0.0 compiler in 64-bit mode.  */
1877       int swap = (0 < compare (&lines[-1], &lines[-2]));
1878       temp[-1] = lines[-1 - swap];
1879       temp[-2] = lines[-2 + swap];
1880     }
1881   else
1882     {
1883       size_t nlo = nlines / 2;
1884       size_t nhi = nlines - nlo;
1885       struct line *lo = lines;
1886       struct line *hi = lines - nlo;
1887       struct line *sorted_hi = temp - nlo;
1888
1889       sortlines_temp (hi, nhi, sorted_hi);
1890       if (1 < nlo)
1891         sortlines (lo, nlo, temp);
1892
1893       mergelines (temp, lo, nlo, sorted_hi, nhi);
1894     }
1895 }
1896
1897 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
1898    the same as OUTFILE.  If found, merge the found instances (and perhaps
1899    some other files) into a temporary file so that it can in turn be
1900    merged into OUTFILE without destroying OUTFILE before it is completely
1901    read.  Return the new value of NFILES, which differs from the old if
1902    some merging occurred.
1903
1904    This test ensures that an otherwise-erroneous use like
1905    "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
1906    It's not clear that POSIX requires this nicety.
1907    Detect common error cases, but don't try to catch obscure cases like
1908    "cat ... FILE ... | sort -m -o FILE"
1909    where traditional "sort" doesn't copy the input and where
1910    people should know that they're getting into trouble anyway.
1911    Catching these obscure cases would slow down performance in
1912    common cases.  */
1913
1914 static size_t
1915 avoid_trashing_input (char **files, size_t ntemps, size_t nfiles,
1916                       char const *outfile)
1917 {
1918   size_t i;
1919   bool got_outstat = false;
1920   struct stat outstat;
1921
1922   for (i = ntemps; i < nfiles; i++)
1923     {
1924       bool is_stdin = STREQ (files[i], "-");
1925       bool same;
1926       struct stat instat;
1927
1928       if (outfile && STREQ (outfile, files[i]) && !is_stdin)
1929         same = true;
1930       else
1931         {
1932           if (! got_outstat)
1933             {
1934               if ((outfile
1935                    ? stat (outfile, &outstat)
1936                    : fstat (STDOUT_FILENO, &outstat))
1937                   != 0)
1938                 break;
1939               got_outstat = true;
1940             }
1941
1942           same = (((is_stdin
1943                     ? fstat (STDIN_FILENO, &instat)
1944                     : stat (files[i], &instat))
1945                    == 0)
1946                   && SAME_INODE (instat, outstat));
1947         }
1948
1949       if (same)
1950         {
1951           FILE *tftp;
1952           char *temp = create_temp_file (&tftp);
1953           mergefps (&files[i], 0, nfiles - i, tftp, temp);
1954           files[i] = temp;
1955           return i + 1;
1956         }
1957     }
1958
1959   return nfiles;
1960 }
1961
1962 /* Merge the input FILES.  NTEMPS is the number of files at the
1963    start of FILES that are temporary; it is zero at the top level.
1964    NFILES is the total number of files.  Put the output in
1965    OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
1966
1967 static void
1968 merge (char **files, size_t ntemps, size_t nfiles, char const *output_file)
1969 {
1970   while (NMERGE < nfiles)
1971     {
1972       /* Number of input files processed so far.  */
1973       size_t in;
1974
1975       /* Number of output files generated so far.  */
1976       size_t out;
1977
1978       /* nfiles % NMERGE; this counts input files that are left over
1979          after all full-sized merges have been done.  */
1980       size_t remainder;
1981
1982       /* Number of easily-available slots at the next loop iteration.  */
1983       size_t cheap_slots;
1984
1985       /* Do as many NMERGE-size merges as possible.  */
1986       for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
1987         {
1988           FILE *tfp;
1989           char *temp = create_temp_file (&tfp);
1990           size_t nt = MIN (ntemps, NMERGE);
1991           ntemps -= nt;
1992           mergefps (&files[in], nt, NMERGE, tfp, temp);
1993           files[out] = temp;
1994         }
1995
1996       remainder = nfiles - in;
1997       cheap_slots = NMERGE - out % NMERGE;
1998
1999       if (cheap_slots < remainder)
2000         {
2001           /* So many files remain that they can't all be put into the last
2002              NMERGE-sized output window.  Do one more merge.  Merge as few
2003              files as possible, to avoid needless I/O.  */
2004           size_t nshortmerge = remainder - cheap_slots + 1;
2005           FILE *tfp;
2006           char *temp = create_temp_file (&tfp);
2007           size_t nt = MIN (ntemps, nshortmerge);
2008           ntemps -= nt;
2009           mergefps (&files[in], nt, nshortmerge, tfp, temp);
2010           files[out++] = temp;
2011           in += nshortmerge;
2012         }
2013
2014       /* Put the remaining input files into the last NMERGE-sized output
2015          window, so they will be merged in the next pass.  */
2016       memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2017       ntemps += out;
2018       nfiles -= in - out;
2019     }
2020
2021   nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2022   mergefps (files, ntemps, nfiles, NULL, output_file);
2023 }
2024
2025 /* Sort NFILES FILES onto OUTPUT_FILE. */
2026
2027 static void
2028 sort (char * const *files, size_t nfiles, char const *output_file)
2029 {
2030   struct buffer buf;
2031   size_t ntemps = 0;
2032   bool output_file_created = false;
2033
2034   buf.alloc = 0;
2035
2036   while (nfiles)
2037     {
2038       char const *temp_output;
2039       char const *file = *files;
2040       FILE *fp = xfopen (file, "r");
2041       FILE *tfp;
2042       size_t bytes_per_line = (2 * sizeof (struct line)
2043                                - sizeof (struct line) / 2);
2044
2045       if (! buf.alloc)
2046         initbuf (&buf, bytes_per_line,
2047                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2048       buf.eof = false;
2049       files++;
2050       nfiles--;
2051
2052       while (fillbuf (&buf, fp, file))
2053         {
2054           struct line *line;
2055           struct line *linebase;
2056
2057           if (buf.eof && nfiles
2058               && (bytes_per_line + 1
2059                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2060             {
2061               /* End of file, but there is more input and buffer room.
2062                  Concatenate the next input file; this is faster in
2063                  the usual case.  */
2064               buf.left = buf.used;
2065               break;
2066             }
2067
2068           line = buffer_linelim (&buf);
2069           linebase = line - buf.nlines;
2070           if (1 < buf.nlines)
2071             sortlines (line, buf.nlines, linebase);
2072           if (buf.eof && !nfiles && !ntemps && !buf.left)
2073             {
2074               xfclose (fp, file);
2075               tfp = xfopen (output_file, "w");
2076               temp_output = output_file;
2077               output_file_created = true;
2078             }
2079           else
2080             {
2081               ++ntemps;
2082               temp_output = create_temp_file (&tfp);
2083             }
2084
2085           do
2086             {
2087               line--;
2088               write_bytes (line->text, line->length, tfp, temp_output);
2089               if (unique)
2090                 while (linebase < line && compare (line, line - 1) == 0)
2091                   line--;
2092             }
2093           while (linebase < line);
2094
2095           xfclose (tfp, temp_output);
2096
2097           if (output_file_created)
2098             goto finish;
2099         }
2100       xfclose (fp, file);
2101     }
2102
2103  finish:
2104   free (buf.buf);
2105
2106   if (! output_file_created)
2107     {
2108       size_t i;
2109       struct tempnode *node = temphead;
2110       char **tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2111       for (i = 0; node; i++)
2112         {
2113           tempfiles[i] = node->name;
2114           node = node->next;
2115         }
2116       merge (tempfiles, ntemps, ntemps, output_file);
2117       free (tempfiles);
2118     }
2119 }
2120
2121 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list.  */
2122
2123 static void
2124 insertkey (struct keyfield *key_arg)
2125 {
2126   struct keyfield **p;
2127   struct keyfield *key = xmemdup (key_arg, sizeof *key);
2128
2129   for (p = &keylist; *p; p = &(*p)->next)
2130     continue;
2131   *p = key;
2132   key->next = NULL;
2133 }
2134
2135 /* Report a bad field specification SPEC, with extra info MSGID.  */
2136
2137 static void badfieldspec (char const *, char const *)
2138      ATTRIBUTE_NORETURN;
2139 static void
2140 badfieldspec (char const *spec, char const *msgid)
2141 {
2142   error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
2143          _(msgid), quote (spec));
2144   abort ();
2145 }
2146
2147 /* Report incompatible options.  */
2148
2149 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
2150 static void
2151 incompatible_options (char const *opts)
2152 {
2153   error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
2154   abort ();
2155 }
2156
2157 /* Check compatibility of ordering options.  */
2158
2159 static void
2160 check_ordering_compatibility (void)
2161 {
2162   struct keyfield const *key;
2163
2164   for (key = keylist; key; key = key->next)
2165     if ((1 < (key->random + key->numeric + key->general_numeric + key->month
2166               + !!key->ignore))
2167         || (key->random && key->translate))
2168       {
2169         char opts[7];
2170         char *p = opts;
2171         if (key->ignore == nondictionary)
2172           *p++ = 'd';
2173         if (key->translate)
2174           *p++ = 'f';
2175         if (key->general_numeric)
2176           *p++ = 'g';
2177         if (key->ignore == nonprinting)
2178           *p++ = 'i';
2179         if (key->month)
2180           *p++ = 'M';
2181         if (key->numeric)
2182           *p++ = 'n';
2183         if (key->random)
2184           *p++ = 'R';
2185         *p = '\0';
2186         incompatible_options (opts);
2187       }
2188 }
2189
2190 /* Parse the leading integer in STRING and store the resulting value
2191    (which must fit into size_t) into *VAL.  Return the address of the
2192    suffix after the integer.  If the value is too large, silently
2193    substitute SIZE_MAX.  If MSGID is NULL, return NULL after
2194    failure; otherwise, report MSGID and exit on failure.  */
2195
2196 static char const *
2197 parse_field_count (char const *string, size_t *val, char const *msgid)
2198 {
2199   char *suffix;
2200   uintmax_t n;
2201
2202   switch (xstrtoumax (string, &suffix, 10, &n, ""))
2203     {
2204     case LONGINT_OK:
2205     case LONGINT_INVALID_SUFFIX_CHAR:
2206       *val = n;
2207       if (*val == n)
2208         break;
2209       /* Fall through.  */
2210     case LONGINT_OVERFLOW:
2211     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2212       *val = SIZE_MAX;
2213       break;
2214
2215     case LONGINT_INVALID:
2216       if (msgid)
2217         error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
2218                _(msgid), quote (string));
2219       return NULL;
2220     }
2221
2222   return suffix;
2223 }
2224
2225 /* Handle interrupts and hangups. */
2226
2227 static void
2228 sighandler (int sig)
2229 {
2230   if (! SA_NOCLDSTOP)
2231     signal (sig, SIG_IGN);
2232
2233   cleanup ();
2234
2235   signal (sig, SIG_DFL);
2236   raise (sig);
2237 }
2238
2239 /* Set the ordering options for KEY specified in S.
2240    Return the address of the first character in S that
2241    is not a valid ordering option.
2242    BLANKTYPE is the kind of blanks that 'b' should skip. */
2243
2244 static char *
2245 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2246 {
2247   while (*s)
2248     {
2249       switch (*s)
2250         {
2251         case 'b':
2252           if (blanktype == bl_start || blanktype == bl_both)
2253             key->skipsblanks = true;
2254           if (blanktype == bl_end || blanktype == bl_both)
2255             key->skipeblanks = true;
2256           break;
2257         case 'd':
2258           key->ignore = nondictionary;
2259           break;
2260         case 'f':
2261           key->translate = fold_toupper;
2262           break;
2263         case 'g':
2264           key->general_numeric = true;
2265           break;
2266         case 'i':
2267           /* Option order should not matter, so don't let -i override
2268              -d.  -d implies -i, but -i does not imply -d.  */
2269           if (! key->ignore)
2270             key->ignore = nonprinting;
2271           break;
2272         case 'M':
2273           key->month = true;
2274           break;
2275         case 'n':
2276           key->numeric = true;
2277           break;
2278         case 'R':
2279           key->random = true;
2280           break;
2281         case 'r':
2282           key->reverse = true;
2283           break;
2284         default:
2285           return (char *) s;
2286         }
2287       ++s;
2288     }
2289   return (char *) s;
2290 }
2291
2292 static struct keyfield *
2293 key_init (struct keyfield *key)
2294 {
2295   memset (key, 0, sizeof *key);
2296   key->eword = SIZE_MAX;
2297   return key;
2298 }
2299
2300 int
2301 main (int argc, char **argv)
2302 {
2303   struct keyfield *key;
2304   struct keyfield key_buf;
2305   struct keyfield gkey;
2306   char const *s;
2307   int c = 0;
2308   bool checkonly = false;
2309   bool mergeonly = false;
2310   char *random_source = NULL;
2311   bool need_random = false;
2312   size_t nfiles = 0;
2313   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2314   bool obsolete_usage = (posix2_version () < 200112);
2315   char **files;
2316   char const *outfile = NULL;
2317
2318   initialize_main (&argc, &argv);
2319   program_name = argv[0];
2320   setlocale (LC_ALL, "");
2321   bindtextdomain (PACKAGE, LOCALEDIR);
2322   textdomain (PACKAGE);
2323
2324   initialize_exit_failure (SORT_FAILURE);
2325
2326   hard_LC_COLLATE = hard_locale (LC_COLLATE);
2327 #if HAVE_NL_LANGINFO
2328   hard_LC_TIME = hard_locale (LC_TIME);
2329 #endif
2330
2331   /* Get locale's representation of the decimal point.  */
2332   {
2333     struct lconv const *locale = localeconv ();
2334
2335     /* If the locale doesn't define a decimal point, or if the decimal
2336        point is multibyte, use the C locale's decimal point.  FIXME:
2337        add support for multibyte decimal points.  */
2338     decimal_point = to_uchar (locale->decimal_point[0]);
2339     if (! decimal_point || locale->decimal_point[1])
2340       decimal_point = '.';
2341
2342     /* FIXME: add support for multibyte thousands separators.  */
2343     thousands_sep = to_uchar (*locale->thousands_sep);
2344     if (! thousands_sep || locale->thousands_sep[1])
2345       thousands_sep = -1;
2346   }
2347
2348   have_read_stdin = false;
2349   inittables ();
2350
2351   {
2352     size_t i;
2353     static int const sig[] =
2354       {
2355         /* The usual suspects.  */
2356         SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
2357 #ifdef SIGPOLL
2358         SIGPOLL,
2359 #endif
2360 #ifdef SIGPROF
2361         SIGPROF,
2362 #endif
2363 #ifdef SIGVTALRM
2364         SIGVTALRM,
2365 #endif
2366 #ifdef SIGXCPU
2367         SIGXCPU,
2368 #endif
2369 #ifdef SIGXFSZ
2370         SIGXFSZ,
2371 #endif
2372       };
2373     enum { nsigs = sizeof sig / sizeof sig[0] };
2374
2375 #if SA_NOCLDSTOP
2376     struct sigaction act;
2377
2378     sigemptyset (&caught_signals);
2379     for (i = 0; i < nsigs; i++)
2380       {
2381         sigaction (sig[i], NULL, &act);
2382         if (act.sa_handler != SIG_IGN)
2383           sigaddset (&caught_signals, sig[i]);
2384       }
2385
2386     act.sa_handler = sighandler;
2387     act.sa_mask = caught_signals;
2388     act.sa_flags = 0;
2389
2390     for (i = 0; i < nsigs; i++)
2391       if (sigismember (&caught_signals, sig[i]))
2392         sigaction (sig[i], &act, NULL);
2393 #else
2394     for (i = 0; i < nsigs; i++)
2395       if (signal (sig[i], SIG_IGN) != SIG_IGN)
2396         {
2397           signal (sig[i], sighandler);
2398           siginterrupt (sig[i], 1);
2399         }
2400 #endif
2401   }
2402
2403   /* The signal mask is known, so it is safe to invoke exit_cleanup.  */
2404   atexit (exit_cleanup);
2405
2406   gkey.sword = gkey.eword = SIZE_MAX;
2407   gkey.ignore = NULL;
2408   gkey.translate = NULL;
2409   gkey.numeric = gkey.general_numeric = gkey.random = false;
2410   gkey.month = gkey.reverse = false;
2411   gkey.skipsblanks = gkey.skipeblanks = false;
2412
2413   files = xnmalloc (argc, sizeof *files);
2414
2415   for (;;)
2416     {
2417       /* Parse an operand as a file after "--" was seen; or if
2418          pedantic and a file was seen, unless the POSIX version
2419          predates 1003.1-2001 and -c was not seen and the operand is
2420          "-o FILE" or "-oFILE".  */
2421
2422       if (c == -1
2423           || (posixly_correct && nfiles != 0
2424               && ! (obsolete_usage
2425                     && ! checkonly
2426                     && optind != argc
2427                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
2428                     && (argv[optind][2] || optind + 1 != argc)))
2429           || ((c = getopt_long (argc, argv, short_options,
2430                                 long_options, NULL))
2431               == -1))
2432         {
2433           if (argc <= optind)
2434             break;
2435           files[nfiles++] = argv[optind++];
2436         }
2437       else switch (c)
2438         {
2439         case 1:
2440           key = NULL;
2441           if (optarg[0] == '+')
2442             {
2443               bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
2444                                       && ISDIGIT (argv[optind][1]));
2445               obsolete_usage |= minus_pos_usage & ~posixly_correct;
2446               if (obsolete_usage)
2447                 {
2448                   /* Treat +POS1 [-POS2] as a key if possible; but silently
2449                      treat an operand as a file if it is not a valid +POS1.  */
2450                   key = key_init (&key_buf);
2451                   s = parse_field_count (optarg + 1, &key->sword, NULL);
2452                   if (s && *s == '.')
2453                     s = parse_field_count (s + 1, &key->schar, NULL);
2454                   if (! (key->sword | key->schar))
2455                     key->sword = SIZE_MAX;
2456                   if (! s || *set_ordering (s, key, bl_start))
2457                     {
2458                       free (key);
2459                       key = NULL;
2460                     }
2461                   else
2462                     {
2463                       if (minus_pos_usage)
2464                         {
2465                           char const *optarg1 = argv[optind++];
2466                           s = parse_field_count (optarg1 + 1, &key->eword,
2467                                              N_("invalid number after `-'"));
2468                           if (*s == '.')
2469                             s = parse_field_count (s + 1, &key->echar,
2470                                                N_("invalid number after `.'"));
2471                           if (*set_ordering (s, key, bl_end))
2472                             badfieldspec (optarg1,
2473                                       N_("stray character in field spec"));
2474                         }
2475                       insertkey (key);
2476                     }
2477                 }
2478             }
2479           if (! key)
2480             files[nfiles++] = optarg;
2481           break;
2482
2483         case 'b':
2484         case 'd':
2485         case 'f':
2486         case 'g':
2487         case 'i':
2488         case 'M':
2489         case 'n':
2490         case 'r':
2491         case 'R':
2492           {
2493             char str[2];
2494             str[0] = c;
2495             str[1] = '\0';
2496             set_ordering (str, &gkey, bl_both);
2497           }
2498           break;
2499
2500         case 'c':
2501           checkonly = true;
2502           break;
2503
2504         case 'k':
2505           key = key_init (&key_buf);
2506
2507           /* Get POS1. */
2508           s = parse_field_count (optarg, &key->sword,
2509                                  N_("invalid number at field start"));
2510           if (! key->sword--)
2511             {
2512               /* Provoke with `sort -k0' */
2513               badfieldspec (optarg, N_("field number is zero"));
2514             }
2515           if (*s == '.')
2516             {
2517               s = parse_field_count (s + 1, &key->schar,
2518                                      N_("invalid number after `.'"));
2519               if (! key->schar--)
2520                 {
2521                   /* Provoke with `sort -k1.0' */
2522                   badfieldspec (optarg, N_("character offset is zero"));
2523                 }
2524             }
2525           if (! (key->sword | key->schar))
2526             key->sword = SIZE_MAX;
2527           s = set_ordering (s, key, bl_start);
2528           if (*s != ',')
2529             {
2530               key->eword = SIZE_MAX;
2531               key->echar = 0;
2532             }
2533           else
2534             {
2535               /* Get POS2. */
2536               s = parse_field_count (s + 1, &key->eword,
2537                                      N_("invalid number after `,'"));
2538               if (! key->eword--)
2539                 {
2540                   /* Provoke with `sort -k1,0' */
2541                   badfieldspec (optarg, N_("field number is zero"));
2542                 }
2543               if (*s == '.')
2544                 s = parse_field_count (s + 1, &key->echar,
2545                                        N_("invalid number after `.'"));
2546               else
2547                 {
2548                   /* `-k 2,3' is equivalent to `+1 -3'.  */
2549                   key->eword++;
2550                 }
2551               s = set_ordering (s, key, bl_end);
2552             }
2553           if (*s)
2554             badfieldspec (optarg, N_("stray character in field spec"));
2555           insertkey (key);
2556           break;
2557
2558         case 'm':
2559           mergeonly = true;
2560           break;
2561
2562         case 'o':
2563           if (outfile && !STREQ (outfile, optarg))
2564             error (SORT_FAILURE, 0, _("multiple output files specified"));
2565           outfile = optarg;
2566           break;
2567
2568         case RANDOM_SOURCE_OPTION:
2569           if (random_source && !STREQ (random_source, optarg))
2570             error (SORT_FAILURE, 0, _("multiple random sources specified"));
2571           random_source = optarg;
2572           break;
2573
2574         case 's':
2575           stable = true;
2576           break;
2577
2578         case 'S':
2579           specify_sort_size (optarg);
2580           break;
2581
2582         case 't':
2583           {
2584             char newtab = optarg[0];
2585             if (! newtab)
2586               error (SORT_FAILURE, 0, _("empty tab"));
2587             if (optarg[1])
2588               {
2589                 if (STREQ (optarg, "\\0"))
2590                   newtab = '\0';
2591                 else
2592                   {
2593                     /* Provoke with `sort -txx'.  Complain about
2594                        "multi-character tab" instead of "multibyte tab", so
2595                        that the diagnostic's wording does not need to be
2596                        changed once multibyte characters are supported.  */
2597                     error (SORT_FAILURE, 0, _("multi-character tab %s"),
2598                            quote (optarg));
2599                   }
2600               }
2601             if (tab != TAB_DEFAULT && tab != newtab)
2602               error (SORT_FAILURE, 0, _("incompatible tabs"));
2603             tab = newtab;
2604           }
2605           break;
2606
2607         case 'T':
2608           add_temp_dir (optarg);
2609           break;
2610
2611         case 'u':
2612           unique = true;
2613           break;
2614
2615         case 'y':
2616           /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
2617              through Solaris 7.  It is also accepted by many non-Solaris
2618              "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
2619              -y is marked as obsolete starting with Solaris 8 (1999), but is
2620              still accepted as of Solaris 10 prerelease (2004).
2621
2622              Solaris 2.5.1 "sort -y 100" reads the input file "100", but
2623              emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
2624              and which in general ignores the argument after "-y" if it
2625              consists entirely of digits (it can even be empty).  */
2626           if (optarg == argv[optind - 1])
2627             {
2628               char const *p;
2629               for (p = optarg; ISDIGIT (*p); p++)
2630                 continue;
2631               optind -= (*p != '\0');
2632             }
2633           break;
2634
2635         case 'z':
2636           eolchar = 0;
2637           break;
2638
2639         case_GETOPT_HELP_CHAR;
2640
2641         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2642
2643         default:
2644           usage (SORT_FAILURE);
2645         }
2646     }
2647
2648   /* Inheritance of global options to individual keys. */
2649   for (key = keylist; key; key = key->next)
2650     {
2651       if (! (key->ignore || key->translate
2652              || (key->skipsblanks | key->reverse
2653                  | key->skipeblanks | key->month | key->numeric
2654                  | key->general_numeric
2655                  | key->random)))
2656         {
2657           key->ignore = gkey.ignore;
2658           key->translate = gkey.translate;
2659           key->skipsblanks = gkey.skipsblanks;
2660           key->skipeblanks = gkey.skipeblanks;
2661           key->month = gkey.month;
2662           key->numeric = gkey.numeric;
2663           key->general_numeric = gkey.general_numeric;
2664           key->random = gkey.random;
2665           key->reverse = gkey.reverse;
2666         }
2667
2668       need_random |= key->random;
2669     }
2670
2671   if (!keylist && (gkey.ignore || gkey.translate
2672                    || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
2673                        | gkey.numeric | gkey.general_numeric
2674                        | gkey.random)))
2675     {
2676       insertkey (&gkey);
2677       need_random |= gkey.random;
2678     }
2679
2680   check_ordering_compatibility ();
2681
2682   reverse = gkey.reverse;
2683
2684   if (need_random)
2685     {
2686       randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
2687       if (! randread_source)
2688         die (_("open failed"), random_source);
2689     }
2690
2691   if (temp_dir_count == 0)
2692     {
2693       char const *tmp_dir = getenv ("TMPDIR");
2694       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
2695     }
2696
2697   if (nfiles == 0)
2698     {
2699       static char *minus = "-";
2700       nfiles = 1;
2701       free (files);
2702       files = &minus;
2703     }
2704
2705   if (checkonly)
2706     {
2707       if (nfiles > 1)
2708         error (SORT_FAILURE, 0, _("extra operand %s not allowed with -c"),
2709                quote (files[1]));
2710
2711       if (outfile)
2712         incompatible_options ("co");
2713
2714       /* POSIX requires that sort return 1 IFF invoked with -c and the
2715          input is not properly sorted.  */
2716       exit (check (files[0]) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
2717     }
2718
2719   if (mergeonly)
2720     merge (files, 0, nfiles, outfile);
2721   else
2722     sort (files, nfiles, outfile);
2723
2724   if (have_read_stdin && fclose (stdin) == EOF)
2725     die (_("close failed"), "-");
2726
2727   exit (EXIT_SUCCESS);
2728 }