src/sort.c

   1 /* sort - sort lines of text (with all kinds of options).
   2    Copyright (C) 1988, 1991-2005 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17
  18    Written December 1988 by Mike Haertel.
  19    The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
  20    or (US mail) as Mike Haertel c/o Free Software Foundation.
  21
  22    Ørn E. Hansen added NLS support in 1997.  */
  23
  24 #include <config.h>
  25
  26 #include <getopt.h>
  27 #include <sys/types.h>
  28 #include <signal.h>
  29 #include <stdio.h>
  30 #include "system.h"
  31 #include "error.h"
  32 #include "hard-locale.h"
  33 #include "inttostr.h"
  34 #include "physmem.h"
  35 #include "posixver.h"
  36 #include "quote.h"
  37 #include "stdio-safer.h"
  38 #include "xmemcoll.h"
  39 #include "xstrtol.h"
  40
  41 #if HAVE_SYS_RESOURCE_H
  42 # include <sys/resource.h>
  43 #endif
  44 #ifndef RLIMIT_DATA
  45 struct rlimit { size_t rlim_cur; };
  46 # define getrlimit(Resource, Rlp) (-1)
  47 #endif
  48
  49 /* The official name of this program (e.g., no `g' prefix).  */
  50 #define PROGRAM_NAME "sort"
  51
  52 #define AUTHORS "Mike Haertel", "Paul Eggert"
  53
  54 #if HAVE_LANGINFO_CODESET
  55 # include <langinfo.h>
  56 #endif
  57
  58 #ifndef SA_NOCLDSTOP
  59 # define sigprocmask(How, Set, Oset) /* empty */
  60 # define sigset_t int
  61 #endif
  62
  63 #ifndef STDC_HEADERS
  64 double strtod ();
  65 #endif
  66
  67 #define UCHAR_LIM (UCHAR_MAX + 1)
  68
  69 #ifndef DEFAULT_TMPDIR
  70 # define DEFAULT_TMPDIR "/tmp"
  71 #endif
  72
  73 /* Exit statuses.  */
  74 enum
  75   {
  76     /* POSIX says to exit with status 1 if invoked with -c and the
  77        input is not properly sorted.  */
  78     SORT_OUT_OF_ORDER = 1,
  79
  80     /* POSIX says any other irregular exit must exit with a status
  81        code greater than 1.  */
  82     SORT_FAILURE = 2
  83   };
  84
  85 #define NEGATION_SIGN   '-'
  86 #define NUMERIC_ZERO    '0'
  87
  88 /* The representation of the decimal point in the current locale.  */
  89 static char decimal_point;
  90
  91 /* Thousands separator; if CHAR_MAX + 1, then there isn't one.  */
  92 static int thousands_sep;
  93
  94 /* Nonzero if the corresponding locales are hard.  */
  95 static bool hard_LC_COLLATE;
  96 #if HAVE_NL_LANGINFO
  97 static bool hard_LC_TIME;
  98 #endif
  99
 100 #define NONZERO(x) ((x) != 0)
 101
 102 /* The kind of blanks for '-b' to skip in various options. */
 103 enum blanktype { bl_start, bl_end, bl_both };
 104
 105 /* The character marking end of line. Default to \n. */
 106 static char eolchar = '\n';
 107
 108 /* Lines are held in core as counted strings. */
 109 struct line
 110 {
 111   char *text;                   /* Text of the line. */
 112   size_t length;                /* Length including final newline. */
 113   char *keybeg;                 /* Start of first key. */
 114   char *keylim;                 /* Limit of first key. */
 115 };
 116
 117 /* Input buffers. */
 118 struct buffer
 119 {
 120   char *buf;                    /* Dynamically allocated buffer,
 121                                    partitioned into 3 regions:
 122                                    - input data;
 123                                    - unused area;
 124                                    - an array of lines, in reverse order.  */
 125   size_t used;                  /* Number of bytes used for input data.  */
 126   size_t nlines;                /* Number of lines in the line array.  */
 127   size_t alloc;                 /* Number of bytes allocated. */
 128   size_t left;                  /* Number of bytes left from previous reads. */
 129   size_t line_bytes;            /* Number of bytes to reserve for each line. */
 130   bool eof;                     /* An EOF has been read.  */
 131 };
 132
 133 struct keyfield
 134 {
 135   size_t sword;                 /* Zero-origin 'word' to start at. */
 136   size_t schar;                 /* Additional characters to skip. */
 137   size_t eword;                 /* Zero-origin first word after field. */
 138   size_t echar;                 /* Additional characters in field. */
 139   bool const *ignore;           /* Boolean array of characters to ignore. */
 140   char const *translate;        /* Translation applied to characters. */
 141   bool skipsblanks;             /* Skip leading blanks when finding start.  */
 142   bool skipeblanks;             /* Skip leading blanks when finding end.  */
 143   bool numeric;                 /* Flag for numeric comparison.  Handle
 144                                    strings of digits with optional decimal
 145                                    point, but no exponential notation. */
 146   bool general_numeric;         /* Flag for general, numeric comparison.
 147                                    Handle numbers in exponential notation. */
 148   bool month;                   /* Flag for comparison by month name. */
 149   bool reverse;                 /* Reverse the sense of comparison. */
 150   struct keyfield *next;        /* Next keyfield to try. */
 151 };
 152
 153 struct month
 154 {
 155   char const *name;
 156   int val;
 157 };
 158
 159 /* The name this program was run with. */
 160 char *program_name;
 161
 162 /* FIXME: None of these tables work with multibyte character sets.
 163    Also, there are many other bugs when handling multibyte characters.
 164    One way to fix this is to rewrite `sort' to use wide characters
 165    internally, but doing this with good performance is a bit
 166    tricky.  */
 167
 168 /* Table of blanks.  */
 169 static bool blanks[UCHAR_LIM];
 170
 171 /* Table of non-printing characters. */
 172 static bool nonprinting[UCHAR_LIM];
 173
 174 /* Table of non-dictionary characters (not letters, digits, or blanks). */
 175 static bool nondictionary[UCHAR_LIM];
 176
 177 /* Translation table folding lower case to upper.  */
 178 static char fold_toupper[UCHAR_LIM];
 179
 180 #define MONTHS_PER_YEAR 12
 181
 182 /* Table mapping month names to integers.
 183    Alphabetic order allows binary search. */
 184 static struct month monthtab[] =
 185 {
 186   {"APR", 4},
 187   {"AUG", 8},
 188   {"DEC", 12},
 189   {"FEB", 2},
 190   {"JAN", 1},
 191   {"JUL", 7},
 192   {"JUN", 6},
 193   {"MAR", 3},
 194   {"MAY", 5},
 195   {"NOV", 11},
 196   {"OCT", 10},
 197   {"SEP", 9}
 198 };
 199
 200 /* During the merge phase, the number of files to merge at once. */
 201 #define NMERGE 16
 202
 203 /* Minimum size for a merge or check buffer.  */
 204 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
 205
 206 /* Minimum sort size; the code might not work with smaller sizes.  */
 207 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
 208
 209 /* The number of bytes needed for a merge or check buffer, which can
 210    function relatively efficiently even if it holds only one line.  If
 211    a longer line is seen, this value is increased.  */
 212 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
 213
 214 /* The approximate maximum number of bytes of main memory to use, as
 215    specified by the user.  Zero if the user has not specified a size.  */
 216 static size_t sort_size;
 217
 218 /* The guessed size for non-regular files.  */
 219 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
 220
 221 /* Array of directory names in which any temporary files are to be created. */
 222 static char const **temp_dirs;
 223
 224 /* Number of temporary directory names used.  */
 225 static size_t temp_dir_count;
 226
 227 /* Number of allocated slots in temp_dirs.  */
 228 static size_t temp_dir_alloc;
 229
 230 /* Flag to reverse the order of all comparisons. */
 231 static bool reverse;
 232
 233 /* Flag for stable sort.  This turns off the last ditch bytewise
 234    comparison of lines, and instead leaves lines in the same order
 235    they were read if all keys compare equal.  */
 236 static bool stable;
 237
 238 /* If TAB has this value, blanks separate fields.  */
 239 enum { TAB_DEFAULT = CHAR_MAX + 1 };
 240
 241 /* Tab character separating fields.  If TAB_DEFAULT, then fields are
 242    separated by the empty string between a non-blank character and a blank
 243    character. */
 244 static int tab = TAB_DEFAULT;
 245
 246 /* Flag to remove consecutive duplicate lines from the output.
 247    Only the last of a sequence of equal lines will be output. */
 248 static bool unique;
 249
 250 /* Nonzero if any of the input files are the standard input. */
 251 static bool have_read_stdin;
 252
 253 /* List of key field comparisons to be tried.  */
 254 static struct keyfield *keylist;
 255
 256 static void sortlines_temp (struct line *, size_t, struct line *);
 257
 258 /* Report MESSAGE for FILE, then clean up and exit.
 259    If FILE is null, it represents standard output.  */
 260
 261 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
 262 static void
 263 die (char const *message, char const *file)
 264 {
 265   error (0, errno, "%s: %s", message, file ? file : _("standard output"));
 266   exit (SORT_FAILURE);
 267 }
 268
 269 void
 270 usage (int status)
 271 {
 272   if (status != EXIT_SUCCESS)
 273     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 274              program_name);
 275   else
 276     {
 277       printf (_("\
 278 Usage: %s [OPTION]... [FILE]...\n\
 279 "),
 280               program_name);
 281       fputs (_("\
 282 Write sorted concatenation of all FILE(s) to standard output.\n\
 283 \n\
 284 Ordering options:\n\
 285 \n\
 286 "), stdout);
 287       fputs (_("\
 288 Mandatory arguments to long options are mandatory for short options too.\n\
 289 "), stdout);
 290       fputs (_("\
 291   -b, --ignore-leading-blanks ignore leading blanks\n\
 292   -d, --dictionary-order      consider only blanks and alphanumeric characters\n\
 293   -f, --ignore-case           fold lower case to upper case characters\n\
 294 "), stdout);
 295       fputs (_("\
 296   -g, --general-numeric-sort  compare according to general numerical value\n\
 297   -i, --ignore-nonprinting    consider only printable characters\n\
 298   -M, --month-sort            compare (unknown) < `JAN' < ... < `DEC'\n\
 299   -n, --numeric-sort          compare according to string numerical value\n\
 300   -r, --reverse               reverse the result of comparisons\n\
 301 \n\
 302 "), stdout);
 303       fputs (_("\
 304 Other options:\n\
 305 \n\
 306   -c, --check               check whether input is sorted; do not sort\n\
 307   -k, --key=POS1[,POS2]     start a key at POS1, end it at POS 2 (origin 1)\n\
 308   -m, --merge               merge already sorted files; do not sort\n\
 309   -o, --output=FILE         write result to FILE instead of standard output\n\
 310   -s, --stable              stabilize sort by disabling last-resort comparison\n\
 311   -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
 312 "), stdout);
 313       printf (_("\
 314   -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
 315   -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
 316                               multiple options specify multiple directories\n\
 317   -u, --unique              with -c, check for strict ordering;\n\
 318                               without -c, output only the first of an equal run\n\
 319 "), DEFAULT_TMPDIR);
 320       fputs (_("\
 321   -z, --zero-terminated     end lines with 0 byte, not newline\n\
 322 "), stdout);
 323       fputs (HELP_OPTION_DESCRIPTION, stdout);
 324       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 325       fputs (_("\
 326 \n\
 327 POS is F[.C][OPTS], where F is the field number and C the character position\n\
 328 in the field.  OPTS is one or more single-letter ordering options, which\n\
 329 override global ordering options for that key.  If no key is given, use the\n\
 330 entire line as the key.\n\
 331 \n\
 332 SIZE may be followed by the following multiplicative suffixes:\n\
 333 "), stdout);
 334       fputs (_("\
 335 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
 336 \n\
 337 With no FILE, or when FILE is -, read standard input.\n\
 338 \n\
 339 *** WARNING ***\n\
 340 The locale specified by the environment affects sort order.\n\
 341 Set LC_ALL=C to get the traditional sort order that uses\n\
 342 native byte values.\n\
 343 "), stdout );
 344       printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 345     }
 346
 347   exit (status);
 348 }
 349
 350 #define COMMON_SHORT_OPTIONS "-bcdfgik:mMno:rsS:t:T:uz"
 351
 352 static struct option const long_options[] =
 353 {
 354   {"ignore-leading-blanks", no_argument, NULL, 'b'},
 355   {"check", no_argument, NULL, 'c'},
 356   {"dictionary-order", no_argument, NULL, 'd'},
 357   {"ignore-case", no_argument, NULL, 'f'},
 358   {"general-numeric-sort", no_argument, NULL, 'g'},
 359   {"ignore-nonprinting", no_argument, NULL, 'i'},
 360   {"key", required_argument, NULL, 'k'},
 361   {"merge", no_argument, NULL, 'm'},
 362   {"month-sort", no_argument, NULL, 'M'},
 363   {"numeric-sort", no_argument, NULL, 'n'},
 364   {"output", required_argument, NULL, 'o'},
 365   {"reverse", no_argument, NULL, 'r'},
 366   {"stable", no_argument, NULL, 's'},
 367   {"buffer-size", required_argument, NULL, 'S'},
 368   {"field-separator", required_argument, NULL, 't'},
 369   {"temporary-directory", required_argument, NULL, 'T'},
 370   {"unique", no_argument, NULL, 'u'},
 371   {"zero-terminated", no_argument, NULL, 'z'},
 372   {GETOPT_HELP_OPTION_DECL},
 373   {GETOPT_VERSION_OPTION_DECL},
 374   {NULL, 0, NULL, 0},
 375 };
 376
 377 /* The set of signals that are caught.  */
 378 static sigset_t caught_signals;
 379
 380 /* The list of temporary files. */
 381 struct tempnode
 382 {
 383   struct tempnode *volatile next;
 384   char name[1];  /* Actual size is 1 + file name length.  */
 385 };
 386 static struct tempnode *volatile temphead;
 387 static struct tempnode *volatile *temptail = &temphead;
 388
 389 /* Clean up any remaining temporary files.  */
 390
 391 static void
 392 cleanup (void)
 393 {
 394   struct tempnode const *node;
 395
 396   for (node = temphead; node; node = node->next)
 397     unlink (node->name);
 398 }
 399
 400 /* Create a new temporary file, returning its newly allocated name.
 401    Store into *PFP a stream open for writing.  */
 402
 403 static char *
 404 create_temp_file (FILE **pfp)
 405 {
 406   static char const slashbase[] = "/sortXXXXXX";
 407   static size_t temp_dir_index;
 408   sigset_t oldset;
 409   int fd;
 410   int saved_errno;
 411   char const *temp_dir = temp_dirs[temp_dir_index];
 412   size_t len = strlen (temp_dir);
 413   struct tempnode *node =
 414     xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
 415   char *file = node->name;
 416
 417   memcpy (file, temp_dir, len);
 418   memcpy (file + len, slashbase, sizeof slashbase);
 419   node->next = NULL;
 420   if (++temp_dir_index == temp_dir_count)
 421     temp_dir_index = 0;
 422
 423   /* Create the temporary file in a critical section, to avoid races.  */
 424   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 425   fd = mkstemp (file);
 426   if (0 <= fd)
 427     {
 428       *temptail = node;
 429       temptail = &node->next;
 430     }
 431   saved_errno = errno;
 432   sigprocmask (SIG_SETMASK, &oldset, NULL);
 433   errno = saved_errno;
 434
 435   if (fd < 0 || (*pfp = fdopen (fd, "w")) == NULL)
 436     die (_("cannot create temporary file"), file);
 437
 438   return file;
 439 }
 440
 441 /* Return a stream for FILE, opened with mode HOW.  A null FILE means
 442    standard output; HOW should be "w".  When opening for input, "-"
 443    means standard input.  To avoid confusion, do not return file
 444    descriptors 0, 1, or 2.  */
 445
 446 static FILE *
 447 xfopen (const char *file, const char *how)
 448 {
 449   FILE *fp;
 450
 451   if (!file)
 452     fp = stdout;
 453   else if (STREQ (file, "-") && *how == 'r')
 454     {
 455       have_read_stdin = true;
 456       fp = stdin;
 457     }
 458   else
 459     {
 460       if ((fp = fopen_safer (file, how)) == NULL)
 461         die (_("open failed"), file);
 462     }
 463
 464   return fp;
 465 }
 466
 467 /* Close FP, whose name is FILE, and report any errors.  */
 468
 469 static void
 470 xfclose (FILE *fp, char const *file)
 471 {
 472   if (fp == stdin)
 473     {
 474       /* Allow reading stdin from tty more than once. */
 475       if (feof (fp))
 476         clearerr (fp);
 477     }
 478   else if (fp == stdout)
 479     {
 480       /* Don't close stdout just yet.  close_stdout does that.  */
 481       if (fflush (fp) != 0)
 482         die (_("fflush failed"), file);
 483     }
 484   else
 485     {
 486       if (fclose (fp) != 0)
 487         die (_("close failed"), file);
 488     }
 489 }
 490
 491 static void
 492 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
 493 {
 494   if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
 495     die (_("write failed"), output_file);
 496 }
 497
 498 /* Append DIR to the array of temporary directory names.  */
 499 static void
 500 add_temp_dir (char const *dir)
 501 {
 502   if (temp_dir_count == temp_dir_alloc)
 503     temp_dirs = x2nrealloc (temp_dirs, &temp_dir_alloc, sizeof *temp_dirs);
 504
 505   temp_dirs[temp_dir_count++] = dir;
 506 }
 507
 508 /* Remove NAME from the list of temporary files.  */
 509
 510 static void
 511 zaptemp (const char *name)
 512 {
 513   struct tempnode *volatile *pnode;
 514   struct tempnode *node;
 515   struct tempnode *next;
 516   sigset_t oldset;
 517   int unlink_status;
 518   int unlink_errno = 0;
 519
 520   for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
 521     continue;
 522
 523   /* Unlink the temporary file in a critical section to avoid races.  */
 524   next = node->next;
 525   sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
 526   unlink_status = unlink (name);
 527   unlink_errno = errno;
 528   *pnode = next;
 529   sigprocmask (SIG_SETMASK, &oldset, NULL);
 530
 531   if (unlink_status != 0)
 532     error (0, unlink_errno, _("warning: cannot remove: %s"), name);
 533   if (! next)
 534     temptail = pnode;
 535   free (node);
 536 }
 537
 538 #if HAVE_NL_LANGINFO
 539
 540 static int
 541 struct_month_cmp (const void *m1, const void *m2)
 542 {
 543   struct month const *month1 = m1;
 544   struct month const *month2 = m2;
 545   return strcmp (month1->name, month2->name);
 546 }
 547
 548 #endif
 549
 550 /* Initialize the character class tables. */
 551
 552 static void
 553 inittables (void)
 554 {
 555   size_t i;
 556
 557   for (i = 0; i < UCHAR_LIM; ++i)
 558     {
 559       blanks[i] = !!ISBLANK (i);
 560       nonprinting[i] = !ISPRINT (i);
 561       nondictionary[i] = !ISALNUM (i) && !ISBLANK (i);
 562       fold_toupper[i] = (ISLOWER (i) ? toupper (i) : i);
 563     }
 564
 565 #if HAVE_NL_LANGINFO
 566   /* If we're not in the "C" locale, read different names for months.  */
 567   if (hard_LC_TIME)
 568     {
 569       for (i = 0; i < MONTHS_PER_YEAR; i++)
 570         {
 571           char const *s;
 572           size_t s_len;
 573           size_t j;
 574           char *name;
 575
 576           s = (char *) nl_langinfo (ABMON_1 + i);
 577           s_len = strlen (s);
 578           monthtab[i].name = name = xmalloc (s_len + 1);
 579           monthtab[i].val = i + 1;
 580
 581           for (j = 0; j < s_len; j++)
 582             name[j] = fold_toupper[to_uchar (s[j])];
 583           name[j] = '\0';
 584         }
 585       qsort ((void *) monthtab, MONTHS_PER_YEAR,
 586              sizeof *monthtab, struct_month_cmp);
 587     }
 588 #endif
 589 }
 590
 591 /* Specify the amount of main memory to use when sorting.  */
 592 static void
 593 specify_sort_size (char const *s)
 594 {
 595   uintmax_t n;
 596   char *suffix;
 597   enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
 598
 599   /* The default unit is KiB.  */
 600   if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
 601     {
 602       if (n <= UINTMAX_MAX / 1024)
 603         n *= 1024;
 604       else
 605         e = LONGINT_OVERFLOW;
 606     }
 607
 608   /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
 609   if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
 610     switch (suffix[0])
 611       {
 612       case 'b':
 613         e = LONGINT_OK;
 614         break;
 615
 616       case '%':
 617         {
 618           double mem = physmem_total () * n / 100;
 619
 620           /* Use "<", not "<=", to avoid problems with rounding.  */
 621           if (mem < UINTMAX_MAX)
 622             {
 623               n = mem;
 624               e = LONGINT_OK;
 625             }
 626           else
 627             e = LONGINT_OVERFLOW;
 628         }
 629         break;
 630       }
 631
 632   if (e == LONGINT_OK)
 633     {
 634       /* If multiple sort sizes are specified, take the maximum, so
 635          that option order does not matter.  */
 636       if (n < sort_size)
 637         return;
 638
 639       sort_size = n;
 640       if (sort_size == n)
 641         {
 642           sort_size = MAX (sort_size, MIN_SORT_SIZE);
 643           return;
 644         }
 645
 646       e = LONGINT_OVERFLOW;
 647     }
 648
 649   STRTOL_FATAL_ERROR (s, _("sort size"), e);
 650 }
 651
 652 /* Return the default sort size.  */
 653 static size_t
 654 default_sort_size (void)
 655 {
 656   /* Let MEM be available memory or 1/8 of total memory, whichever
 657      is greater.  */
 658   double avail = physmem_available ();
 659   double total = physmem_total ();
 660   double mem = MAX (avail, total / 8);
 661   struct rlimit rlimit;
 662
 663   /* Let SIZE be MEM, but no more than the maximum object size or
 664      system resource limits.  Avoid the MIN macro here, as it is not
 665      quite right when only one argument is floating point.  Don't
 666      bother to check for values like RLIM_INFINITY since in practice
 667      they are not much less than SIZE_MAX.  */
 668   size_t size = SIZE_MAX;
 669   if (mem < size)
 670     size = mem;
 671   if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
 672     size = rlimit.rlim_cur;
 673 #ifdef RLIMIT_AS
 674   if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
 675     size = rlimit.rlim_cur;
 676 #endif
 677
 678   /* Leave a large safety margin for the above limits, as failure can
 679      occur when they are exceeded.  */
 680   size /= 2;
 681
 682 #ifdef RLIMIT_RSS
 683   /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
 684      Exceeding RSS is not fatal, but can be quite slow.  */
 685   if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
 686     size = rlimit.rlim_cur / 16 * 15;
 687 #endif
 688
 689   /* Use no less than the minimum.  */
 690   return MAX (size, MIN_SORT_SIZE);
 691 }
 692
 693 /* Return the sort buffer size to use with the input files identified
 694    by FPS and FILES, which are alternate paths to the same files.
 695    NFILES gives the number of input files; NFPS may be less.  Assume
 696    that each input line requires LINE_BYTES extra bytes' worth of line
 697    information.  Do not exceed a bound on the size: if the bound is
 698    not specified by the user, use a default.  */
 699
 700 static size_t
 701 sort_buffer_size (FILE *const *fps, size_t nfps,
 702                   char *const *files, size_t nfiles,
 703                   size_t line_bytes)
 704 {
 705   /* A bound on the input size.  If zero, the bound hasn't been
 706      determined yet.  */
 707   static size_t size_bound;
 708
 709   /* In the worst case, each input byte is a newline.  */
 710   size_t worst_case_per_input_byte = line_bytes + 1;
 711
 712   /* Keep enough room for one extra input line and an extra byte.
 713      This extra room might be needed when preparing to read EOF.  */
 714   size_t size = worst_case_per_input_byte + 1;
 715
 716   size_t i;
 717
 718   for (i = 0; i < nfiles; i++)
 719     {
 720       struct stat st;
 721       off_t file_size;
 722       size_t worst_case;
 723
 724       if ((i < nfps ? fstat (fileno (fps[i]), &st)
 725            : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
 726            : stat (files[i], &st))
 727           != 0)
 728         die (_("stat failed"), files[i]);
 729
 730       if (S_ISREG (st.st_mode))
 731         file_size = st.st_size;
 732       else
 733         {
 734           /* The file has unknown size.  If the user specified a sort
 735              buffer size, use that; otherwise, guess the size.  */
 736           if (sort_size)
 737             return sort_size;
 738           file_size = INPUT_FILE_SIZE_GUESS;
 739         }
 740
 741       if (! size_bound)
 742         {
 743           size_bound = sort_size;
 744           if (! size_bound)
 745             size_bound = default_sort_size ();
 746         }
 747
 748       /* Add the amount of memory needed to represent the worst case
 749          where the input consists entirely of newlines followed by a
 750          single non-newline.  Check for overflow.  */
 751       worst_case = file_size * worst_case_per_input_byte + 1;
 752       if (file_size != worst_case / worst_case_per_input_byte
 753           || size_bound - size <= worst_case)
 754         return size_bound;
 755       size += worst_case;
 756     }
 757
 758   return size;
 759 }
 760
 761 /* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
 762    must be at least sizeof (struct line).  Allocate ALLOC bytes
 763    initially.  */
 764
 765 static void
 766 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
 767 {
 768   /* Ensure that the line array is properly aligned.  If the desired
 769      size cannot be allocated, repeatedly halve it until allocation
 770      succeeds.  The smaller allocation may hurt overall performance,
 771      but that's better than failing.  */
 772   for (;;)
 773     {
 774       alloc += sizeof (struct line) - alloc % sizeof (struct line);
 775       buf->buf = malloc (alloc);
 776       if (buf->buf)
 777         break;
 778       alloc /= 2;
 779       if (alloc <= line_bytes + 1)
 780         xalloc_die ();
 781     }
 782
 783   buf->line_bytes = line_bytes;
 784   buf->alloc = alloc;
 785   buf->used = buf->left = buf->nlines = 0;
 786   buf->eof = false;
 787 }
 788
 789 /* Return one past the limit of the line array.  */
 790
 791 static inline struct line *
 792 buffer_linelim (struct buffer const *buf)
 793 {
 794   return (struct line *) (buf->buf + buf->alloc);
 795 }
 796
 797 /* Return a pointer to the first character of the field specified
 798    by KEY in LINE. */
 799
 800 static char *
 801 begfield (const struct line *line, const struct keyfield *key)
 802 {
 803   char *ptr = line->text, *lim = ptr + line->length - 1;
 804   size_t sword = key->sword;
 805   size_t schar = key->schar;
 806   size_t remaining_bytes;
 807
 808   /* The leading field separator itself is included in a field when -t
 809      is absent.  */
 810
 811   if (tab != TAB_DEFAULT)
 812     while (ptr < lim && sword--)
 813       {
 814         while (ptr < lim && *ptr != tab)
 815           ++ptr;
 816         if (ptr < lim)
 817           ++ptr;
 818       }
 819   else
 820     while (ptr < lim && sword--)
 821       {
 822         while (ptr < lim && blanks[to_uchar (*ptr)])
 823           ++ptr;
 824         while (ptr < lim && !blanks[to_uchar (*ptr)])
 825           ++ptr;
 826       }
 827
 828   if (key->skipsblanks)
 829     while (ptr < lim && blanks[to_uchar (*ptr)])
 830       ++ptr;
 831
 832   /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
 833   remaining_bytes = lim - ptr;
 834   if (schar < remaining_bytes)
 835     ptr += schar;
 836   else
 837     ptr = lim;
 838
 839   return ptr;
 840 }
 841
 842 /* Return the limit of (a pointer to the first character after) the field
 843    in LINE specified by KEY. */
 844
 845 static char *
 846 limfield (const struct line *line, const struct keyfield *key)
 847 {
 848   char *ptr = line->text, *lim = ptr + line->length - 1;
 849   size_t eword = key->eword, echar = key->echar;
 850   size_t remaining_bytes;
 851
 852   /* Move PTR past EWORD fields or to one past the last byte on LINE,
 853      whichever comes first.  If there are more than EWORD fields, leave
 854      PTR pointing at the beginning of the field having zero-based index,
 855      EWORD.  If a delimiter character was specified (via -t), then that
 856      `beginning' is the first character following the delimiting TAB.
 857      Otherwise, leave PTR pointing at the first `blank' character after
 858      the preceding field.  */
 859   if (tab != TAB_DEFAULT)
 860     while (ptr < lim && eword--)
 861       {
 862         while (ptr < lim && *ptr != tab)
 863           ++ptr;
 864         if (ptr < lim && (eword | echar))
 865           ++ptr;
 866       }
 867   else
 868     while (ptr < lim && eword--)
 869       {
 870         while (ptr < lim && blanks[to_uchar (*ptr)])
 871           ++ptr;
 872         while (ptr < lim && !blanks[to_uchar (*ptr)])
 873           ++ptr;
 874       }
 875
 876 #ifdef POSIX_UNSPECIFIED
 877   /* The following block of code makes GNU sort incompatible with
 878      standard Unix sort, so it's ifdef'd out for now.
 879      The POSIX spec isn't clear on how to interpret this.
 880      FIXME: request clarification.
 881
 882      From: kwzh@gnu.ai.mit.edu (Karl Heuer)
 883      Date: Thu, 30 May 96 12:20:41 -0400
 884      [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
 885
 886      [...]I believe I've found another bug in `sort'.
 887
 888      $ cat /tmp/sort.in
 889      a b c 2 d
 890      pq rs 1 t
 891      $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
 892      a b c 2 d
 893      pq rs 1 t
 894      $ /bin/sort -k1.7,1.7 </tmp/sort.in
 895      pq rs 1 t
 896      a b c 2 d
 897
 898      Unix sort produced the answer I expected: sort on the single character
 899      in column 7.  GNU sort produced different results, because it disagrees
 900      on the interpretation of the key-end spec "M.N".  Unix sort reads this
 901      as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
 902      "skip M-1 fields, then either N-1 characters or the rest of the current
 903      field, whichever comes first".  This extra clause applies only to
 904      key-ends, not key-starts.
 905      */
 906
 907   /* Make LIM point to the end of (one byte past) the current field.  */
 908   if (tab != TAB_DEFAULT)
 909     {
 910       char *newlim;
 911       newlim = memchr (ptr, tab, lim - ptr);
 912       if (newlim)
 913         lim = newlim;
 914     }
 915   else
 916     {
 917       char *newlim;
 918       newlim = ptr;
 919       while (newlim < lim && blanks[to_uchar (*newlim)])
 920         ++newlim;
 921       while (newlim < lim && !blanks[to_uchar (*newlim)])
 922         ++newlim;
 923       lim = newlim;
 924     }
 925 #endif
 926
 927   /* If we're ignoring leading blanks when computing the End
 928      of the field, don't start counting bytes until after skipping
 929      past any leading blanks. */
 930   if (key->skipeblanks)
 931     while (ptr < lim && blanks[to_uchar (*ptr)])
 932       ++ptr;
 933
 934   /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
 935   remaining_bytes = lim - ptr;
 936   if (echar < remaining_bytes)
 937     ptr += echar;
 938   else
 939     ptr = lim;
 940
 941   return ptr;
 942 }
 943
 944 /* Fill BUF reading from FP, moving buf->left bytes from the end
 945    of buf->buf to the beginning first.  If EOF is reached and the
 946    file wasn't terminated by a newline, supply one.  Set up BUF's line
 947    table too.  FILE is the name of the file corresponding to FP.
 948    Return true if some input was read.  */
 949
 950 static bool
 951 fillbuf (struct buffer *buf, FILE *fp, char const *file)
 952 {
 953   struct keyfield const *key = keylist;
 954   char eol = eolchar;
 955   size_t line_bytes = buf->line_bytes;
 956   size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
 957
 958   if (buf->eof)
 959     return false;
 960
 961   if (buf->used != buf->left)
 962     {
 963       memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
 964       buf->used = buf->left;
 965       buf->nlines = 0;
 966     }
 967
 968   for (;;)
 969     {
 970       char *ptr = buf->buf + buf->used;
 971       struct line *linelim = buffer_linelim (buf);
 972       struct line *line = linelim - buf->nlines;
 973       size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
 974       char *line_start = buf->nlines ? line->text + line->length : buf->buf;
 975
 976       while (line_bytes + 1 < avail)
 977         {
 978           /* Read as many bytes as possible, but do not read so many
 979              bytes that there might not be enough room for the
 980              corresponding line array.  The worst case is when the
 981              rest of the input file consists entirely of newlines,
 982              except that the last byte is not a newline.  */
 983           size_t readsize = (avail - 1) / (line_bytes + 1);
 984           size_t bytes_read = fread (ptr, 1, readsize, fp);
 985           char *ptrlim = ptr + bytes_read;
 986           char *p;
 987           avail -= bytes_read;
 988
 989           if (bytes_read != readsize)
 990             {
 991               if (ferror (fp))
 992                 die (_("read failed"), file);
 993               if (feof (fp))
 994                 {
 995                   buf->eof = true;
 996                   if (buf->buf == ptrlim)
 997                     return false;
 998                   if (ptrlim[-1] != eol)
 999                     *ptrlim++ = eol;
1000                 }
1001             }
1002
1003           /* Find and record each line in the just-read input.  */
1004           while ((p = memchr (ptr, eol, ptrlim - ptr)))
1005             {
1006               ptr = p + 1;
1007               line--;
1008               line->text = line_start;
1009               line->length = ptr - line_start;
1010               mergesize = MAX (mergesize, line->length);
1011               avail -= line_bytes;
1012
1013               if (key)
1014                 {
1015                   /* Precompute the position of the first key for
1016                      efficiency.  */
1017                   line->keylim = (key->eword == SIZE_MAX
1018                                   ? p
1019                                   : limfield (line, key));
1020
1021                   if (key->sword != SIZE_MAX)
1022                     line->keybeg = begfield (line, key);
1023                   else
1024                     {
1025                       if (key->skipsblanks)
1026                         while (blanks[to_uchar (*line_start)])
1027                           line_start++;
1028                       line->keybeg = line_start;
1029                     }
1030                 }
1031
1032               line_start = ptr;
1033             }
1034
1035           ptr = ptrlim;
1036           if (buf->eof)
1037             break;
1038         }
1039
1040       buf->used = ptr - buf->buf;
1041       buf->nlines = buffer_linelim (buf) - line;
1042       if (buf->nlines != 0)
1043         {
1044           buf->left = ptr - line_start;
1045           merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1046           return true;
1047         }
1048
1049       /* The current input line is too long to fit in the buffer.
1050          Double the buffer size and try again.  */
1051       buf->buf = x2nrealloc (buf->buf, &buf->alloc, sizeof *(buf->buf));
1052     }
1053 }
1054
1055 /* Compare strings A and B containing decimal fractions < 1.  Each string
1056    should begin with a decimal point followed immediately by the digits
1057    of the fraction.  Strings not of this form are considered to be zero. */
1058
1059 /* The goal here, is to take two numbers a and b... compare these
1060    in parallel.  Instead of converting each, and then comparing the
1061    outcome.  Most likely stopping the comparison before the conversion
1062    is complete.  The algorithm used, in the old sort:
1063
1064    Algorithm: fraccompare
1065    Action   : compare two decimal fractions
1066    accepts  : char *a, char *b
1067    returns  : -1 if a<b, 0 if a=b, 1 if a>b.
1068    implement:
1069
1070    if *a == decimal_point AND *b == decimal_point
1071      find first character different in a and b.
1072      if both are digits, return the difference *a - *b.
1073      if *a is a digit
1074        skip past zeros
1075        if digit return 1, else 0
1076      if *b is a digit
1077        skip past zeros
1078        if digit return -1, else 0
1079    if *a is a decimal_point
1080      skip past decimal_point and zeros
1081      if digit return 1, else 0
1082    if *b is a decimal_point
1083      skip past decimal_point and zeros
1084      if digit return -1, else 0
1085    return 0 */
1086
1087 static int
1088 fraccompare (const char *a, const char *b)
1089 {
1090   if (*a == decimal_point && *b == decimal_point)
1091     {
1092       while (*++a == *++b)
1093         if (! ISDIGIT (*a))
1094           return 0;
1095       if (ISDIGIT (*a) && ISDIGIT (*b))
1096         return *a - *b;
1097       if (ISDIGIT (*a))
1098         goto a_trailing_nonzero;
1099       if (ISDIGIT (*b))
1100         goto b_trailing_nonzero;
1101       return 0;
1102     }
1103   else if (*a++ == decimal_point)
1104     {
1105     a_trailing_nonzero:
1106       while (*a == NUMERIC_ZERO)
1107         a++;
1108       return ISDIGIT (*a);
1109     }
1110   else if (*b++ == decimal_point)
1111     {
1112     b_trailing_nonzero:
1113       while (*b == NUMERIC_ZERO)
1114         b++;
1115       return - ISDIGIT (*b);
1116     }
1117   return 0;
1118 }
1119
1120 /* Compare strings A and B as numbers without explicitly converting them to
1121    machine numbers.  Comparatively slow for short strings, but asymptotically
1122    hideously fast. */
1123
1124 static int
1125 numcompare (const char *a, const char *b)
1126 {
1127   char tmpa;
1128   char tmpb;
1129   int tmp;
1130   size_t log_a;
1131   size_t log_b;
1132
1133   while (blanks[to_uchar (tmpa = *a)])
1134     a++;
1135   while (blanks[to_uchar (tmpb = *b)])
1136     b++;
1137
1138   if (tmpa == NEGATION_SIGN)
1139     {
1140       do
1141         tmpa = *++a;
1142       while (tmpa == NUMERIC_ZERO || tmpa == thousands_sep);
1143       if (tmpb != NEGATION_SIGN)
1144         {
1145           if (tmpa == decimal_point)
1146             do
1147               tmpa = *++a;
1148             while (tmpa == NUMERIC_ZERO);
1149           if (ISDIGIT (tmpa))
1150             return -1;
1151           while (tmpb == NUMERIC_ZERO || tmpb == thousands_sep)
1152             tmpb = *++b;
1153           if (tmpb == decimal_point)
1154             do
1155               tmpb = *++b;
1156             while (tmpb == NUMERIC_ZERO);
1157           return - ISDIGIT (tmpb);
1158         }
1159       do
1160         tmpb = *++b;
1161       while (tmpb == NUMERIC_ZERO || tmpb == thousands_sep);
1162
1163       while (tmpa == tmpb && ISDIGIT (tmpa))
1164         {
1165           do
1166             tmpa = *++a;
1167           while (tmpa == thousands_sep);
1168           do
1169             tmpb = *++b;
1170           while (tmpb == thousands_sep);
1171         }
1172
1173       if ((tmpa == decimal_point && !ISDIGIT (tmpb))
1174           || (tmpb == decimal_point && !ISDIGIT (tmpa)))
1175         return fraccompare (b, a);
1176
1177       tmp = tmpb - tmpa;
1178
1179       for (log_a = 0; ISDIGIT (tmpa); ++log_a)
1180         do
1181           tmpa = *++a;
1182         while (tmpa == thousands_sep);
1183
1184       for (log_b = 0; ISDIGIT (tmpb); ++log_b)
1185         do
1186           tmpb = *++b;
1187         while (tmpb == thousands_sep);
1188
1189       if (log_a != log_b)
1190         return log_a < log_b ? 1 : -1;
1191
1192       if (!log_a)
1193         return 0;
1194
1195       return tmp;
1196     }
1197   else if (tmpb == NEGATION_SIGN)
1198     {
1199       do
1200         tmpb = *++b;
1201       while (tmpb == NUMERIC_ZERO || tmpb == thousands_sep);
1202       if (tmpb == decimal_point)
1203         do
1204           tmpb = *++b;
1205         while (tmpb == NUMERIC_ZERO);
1206       if (ISDIGIT (tmpb))
1207         return 1;
1208       while (tmpa == NUMERIC_ZERO || tmpa == thousands_sep)
1209         tmpa = *++a;
1210       if (tmpa == decimal_point)
1211         do
1212           tmpa = *++a;
1213         while (tmpa == NUMERIC_ZERO);
1214       return ISDIGIT (tmpa);
1215     }
1216   else
1217     {
1218       while (tmpa == NUMERIC_ZERO || tmpa == thousands_sep)
1219         tmpa = *++a;
1220       while (tmpb == NUMERIC_ZERO || tmpb == thousands_sep)
1221         tmpb = *++b;
1222
1223       while (tmpa == tmpb && ISDIGIT (tmpa))
1224         {
1225           do
1226             tmpa = *++a;
1227           while (tmpa == thousands_sep);
1228           do
1229             tmpb = *++b;
1230           while (tmpb == thousands_sep);
1231         }
1232
1233       if ((tmpa == decimal_point && !ISDIGIT (tmpb))
1234           || (tmpb == decimal_point && !ISDIGIT (tmpa)))
1235         return fraccompare (a, b);
1236
1237       tmp = tmpa - tmpb;
1238
1239       for (log_a = 0; ISDIGIT (tmpa); ++log_a)
1240         do
1241           tmpa = *++a;
1242         while (tmpa == thousands_sep);
1243
1244       for (log_b = 0; ISDIGIT (tmpb); ++log_b)
1245         do
1246           tmpb = *++b;
1247         while (tmpb == thousands_sep);
1248
1249       if (log_a != log_b)
1250         return log_a < log_b ? -1 : 1;
1251
1252       if (!log_a)
1253         return 0;
1254
1255       return tmp;
1256     }
1257 }
1258
1259 static int
1260 general_numcompare (const char *sa, const char *sb)
1261 {
1262   /* FIXME: add option to warn about failed conversions.  */
1263   /* FIXME: maybe add option to try expensive FP conversion
1264      only if A and B can't be compared more cheaply/accurately.  */
1265
1266   char *ea;
1267   char *eb;
1268   double a = strtod (sa, &ea);
1269   double b = strtod (sb, &eb);
1270
1271   /* Put conversion errors at the start of the collating sequence.  */
1272   if (sa == ea)
1273     return sb == eb ? 0 : -1;
1274   if (sb == eb)
1275     return 1;
1276
1277   /* Sort numbers in the usual way, where -0 == +0.  Put NaNs after
1278      conversion errors but before numbers; sort them by internal
1279      bit-pattern, for lack of a more portable alternative.  */
1280   return (a < b ? -1
1281           : a > b ? 1
1282           : a == b ? 0
1283           : b == b ? -1
1284           : a == a ? 1
1285           : memcmp ((char *) &a, (char *) &b, sizeof a));
1286 }
1287
1288 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1289    Return 0 if the name in S is not recognized.  */
1290
1291 static int
1292 getmonth (char const *month, size_t len)
1293 {
1294   size_t lo = 0;
1295   size_t hi = MONTHS_PER_YEAR;
1296   char const *monthlim = month + len;
1297
1298   for (;;)
1299     {
1300       if (month == monthlim)
1301         return 0;
1302       if (!blanks[to_uchar (*month)])
1303         break;
1304       ++month;
1305     }
1306
1307   do
1308     {
1309       size_t ix = (lo + hi) / 2;
1310       char const *m = month;
1311       char const *n = monthtab[ix].name;
1312
1313       for (;; m++, n++)
1314         {
1315           if (!*n)
1316             return monthtab[ix].val;
1317           if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1318             {
1319               hi = ix;
1320               break;
1321             }
1322           else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1323             {
1324               lo = ix + 1;
1325               break;
1326             }
1327         }
1328     }
1329   while (lo < hi);
1330
1331   return 0;
1332 }
1333
1334 /* Compare two lines A and B trying every key in sequence until there
1335    are no more keys or a difference is found. */
1336
1337 static int
1338 keycompare (const struct line *a, const struct line *b)
1339 {
1340   struct keyfield const *key = keylist;
1341
1342   /* For the first iteration only, the key positions have been
1343      precomputed for us. */
1344   char *texta = a->keybeg;
1345   char *textb = b->keybeg;
1346   char *lima = a->keylim;
1347   char *limb = b->keylim;
1348
1349   int diff;
1350
1351   for (;;)
1352     {
1353       char const *translate = key->translate;
1354       bool const *ignore = key->ignore;
1355
1356       /* Find the lengths. */
1357       size_t lena = lima <= texta ? 0 : lima - texta;
1358       size_t lenb = limb <= textb ? 0 : limb - textb;
1359
1360       /* Actually compare the fields. */
1361       if (key->numeric | key->general_numeric)
1362         {
1363           char savea = *lima, saveb = *limb;
1364
1365           *lima = *limb = '\0';
1366           diff = ((key->numeric ? numcompare : general_numcompare)
1367                   (texta, textb));
1368           *lima = savea, *limb = saveb;
1369         }
1370       else if (key->month)
1371         diff = getmonth (texta, lena) - getmonth (textb, lenb);
1372       /* Sorting like this may become slow, so in a simple locale the user
1373          can select a faster sort that is similar to ascii sort.  */
1374       else if (hard_LC_COLLATE)
1375         {
1376           if (ignore || translate)
1377             {
1378               char buf[4000];
1379               size_t size = lena + 1 + lenb + 1;
1380               char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1381               char *copy_b = copy_a + lena + 1;
1382               size_t new_len_a, new_len_b, i;
1383
1384               /* Ignore and/or translate chars before comparing.  */
1385               for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1386                 {
1387                   if (i < lena)
1388                     {
1389                       copy_a[new_len_a] = (translate
1390                                            ? translate[to_uchar (texta[i])]
1391                                            : texta[i]);
1392                       if (!ignore || !ignore[to_uchar (texta[i])])
1393                         ++new_len_a;
1394                     }
1395                   if (i < lenb)
1396                     {
1397                       copy_b[new_len_b] = (translate
1398                                            ? translate[to_uchar (textb[i])]
1399                                            : textb [i]);
1400                       if (!ignore || !ignore[to_uchar (textb[i])])
1401                         ++new_len_b;
1402                     }
1403                 }
1404
1405               diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1406
1407               if (sizeof buf < size)
1408                 free (copy_a);
1409             }
1410           else if (lena == 0)
1411             diff = - NONZERO (lenb);
1412           else if (lenb == 0)
1413             goto greater;
1414           else
1415             diff = xmemcoll (texta, lena, textb, lenb);
1416         }
1417       else if (ignore)
1418         {
1419 #define CMP_WITH_IGNORE(A, B)                                           \
1420   do                                                                    \
1421     {                                                                   \
1422           for (;;)                                                      \
1423             {                                                           \
1424               while (texta < lima && ignore[to_uchar (*texta)])         \
1425                 ++texta;                                                \
1426               while (textb < limb && ignore[to_uchar (*textb)])         \
1427                 ++textb;                                                \
1428               if (! (texta < lima && textb < limb))                     \
1429                 break;                                                  \
1430               diff = to_uchar (A) - to_uchar (B);                       \
1431               if (diff)                                                 \
1432                 goto not_equal;                                         \
1433               ++texta;                                                  \
1434               ++textb;                                                  \
1435             }                                                           \
1436                                                                         \
1437           diff = (texta < lima) - (textb < limb);                       \
1438     }                                                                   \
1439   while (0)
1440
1441           if (translate)
1442             CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1443                              translate[to_uchar (*textb)]);
1444           else
1445             CMP_WITH_IGNORE (*texta, *textb);
1446         }
1447       else if (lena == 0)
1448         diff = - NONZERO (lenb);
1449       else if (lenb == 0)
1450         goto greater;
1451       else
1452         {
1453           if (translate)
1454             {
1455               while (texta < lima && textb < limb)
1456                 {
1457                   diff = (to_uchar (translate[to_uchar (*texta++)])
1458                           - to_uchar (translate[to_uchar (*textb++)]));
1459                   if (diff)
1460                     goto not_equal;
1461                 }
1462             }
1463           else
1464             {
1465               diff = memcmp (texta, textb, MIN (lena, lenb));
1466               if (diff)
1467                 goto not_equal;
1468             }
1469           diff = lena < lenb ? -1 : lena != lenb;
1470         }
1471
1472       if (diff)
1473         goto not_equal;
1474
1475       key = key->next;
1476       if (! key)
1477         break;
1478
1479       /* Find the beginning and limit of the next field.  */
1480       if (key->eword != SIZE_MAX)
1481         lima = limfield (a, key), limb = limfield (b, key);
1482       else
1483         lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1484
1485       if (key->sword != SIZE_MAX)
1486         texta = begfield (a, key), textb = begfield (b, key);
1487       else
1488         {
1489           texta = a->text, textb = b->text;
1490           if (key->skipsblanks)
1491             {
1492               while (texta < lima && blanks[to_uchar (*texta)])
1493                 ++texta;
1494               while (textb < limb && blanks[to_uchar (*textb)])
1495                 ++textb;
1496             }
1497         }
1498     }
1499
1500   return 0;
1501
1502  greater:
1503   diff = 1;
1504  not_equal:
1505   return key->reverse ? -diff : diff;
1506 }
1507
1508 /* Compare two lines A and B, returning negative, zero, or positive
1509    depending on whether A compares less than, equal to, or greater than B. */
1510
1511 static int
1512 compare (const struct line *a, const struct line *b)
1513 {
1514   int diff;
1515   size_t alen, blen;
1516
1517   /* First try to compare on the specified keys (if any).
1518      The only two cases with no key at all are unadorned sort,
1519      and unadorned sort -r. */
1520   if (keylist)
1521     {
1522       diff = keycompare (a, b);
1523       if (diff | unique | stable)
1524         return diff;
1525     }
1526
1527   /* If the keys all compare equal (or no keys were specified)
1528      fall through to the default comparison.  */
1529   alen = a->length - 1, blen = b->length - 1;
1530
1531   if (alen == 0)
1532     diff = - NONZERO (blen);
1533   else if (blen == 0)
1534     diff = 1;
1535   else if (hard_LC_COLLATE)
1536     diff = xmemcoll (a->text, alen, b->text, blen);
1537   else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1538     diff = alen < blen ? -1 : alen != blen;
1539
1540   return reverse ? -diff : diff;
1541 }
1542
1543 /* Check that the lines read from FILE_NAME come in order.  Print a
1544    diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1545    false if they are not in order.  Otherwise, print no diagnostic
1546    and return true.  */
1547
1548 static bool
1549 check (char const *file_name)
1550 {
1551   FILE *fp = xfopen (file_name, "r");
1552   struct buffer buf;            /* Input buffer. */
1553   struct line temp;             /* Copy of previous line. */
1554   size_t alloc = 0;
1555   uintmax_t line_number = 0;
1556   struct keyfield const *key = keylist;
1557   bool nonunique = ! unique;
1558   bool ordered = true;
1559
1560   initbuf (&buf, sizeof (struct line),
1561            MAX (merge_buffer_size, sort_size));
1562   temp.text = NULL;
1563
1564   while (fillbuf (&buf, fp, file_name))
1565     {
1566       struct line const *line = buffer_linelim (&buf);
1567       struct line const *linebase = line - buf.nlines;
1568
1569       /* Make sure the line saved from the old buffer contents is
1570          less than or equal to the first line of the new buffer. */
1571       if (alloc && nonunique <= compare (&temp, line - 1))
1572         {
1573         found_disorder:
1574           {
1575             struct line const *disorder_line = line - 1;
1576             uintmax_t disorder_line_number =
1577               buffer_linelim (&buf) - disorder_line + line_number;
1578             char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1579             fprintf (stderr, _("%s: %s:%s: disorder: "),
1580                      program_name, file_name,
1581                      umaxtostr (disorder_line_number, hr_buf));
1582             write_bytes (disorder_line->text, disorder_line->length, stderr,
1583                          _("standard error"));
1584             ordered = false;
1585             break;
1586           }
1587         }
1588
1589       /* Compare each line in the buffer with its successor.  */
1590       while (linebase < --line)
1591         if (nonunique <= compare (line, line - 1))
1592           goto found_disorder;
1593
1594       line_number += buf.nlines;
1595
1596       /* Save the last line of the buffer.  */
1597       if (alloc < line->length)
1598         {
1599           do
1600             {
1601               alloc *= 2;
1602               if (! alloc)
1603                 {
1604                   alloc = line->length;
1605                   break;
1606                 }
1607             }
1608           while (alloc < line->length);
1609
1610           temp.text = xrealloc (temp.text, alloc);
1611         }
1612       memcpy (temp.text, line->text, line->length);
1613       temp.length = line->length;
1614       if (key)
1615         {
1616           temp.keybeg = temp.text + (line->keybeg - line->text);
1617           temp.keylim = temp.text + (line->keylim - line->text);
1618         }
1619     }
1620
1621   xfclose (fp, file_name);
1622   free (buf.buf);
1623   if (temp.text)
1624     free (temp.text);
1625   return ordered;
1626 }
1627
1628 /* Merge lines from FILES onto OFP.  NTEMPS is the number of temporary
1629    files (all of which are at the start of the FILES array), and
1630    NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
1631    Close input and output files before returning.
1632    OUTPUT_FILE gives the name of the output file.  If it is NULL,
1633    the output file is standard output.  If OFP is NULL, the output
1634    file has not been opened yet (or written to, if standard output).  */
1635
1636 static void
1637 mergefps (char **files, size_t ntemps, size_t nfiles,
1638           FILE *ofp, char const *output_file)
1639 {
1640   FILE *fps[NMERGE];            /* Input streams for each file.  */
1641   struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1642   struct line saved;            /* Saved line storage for unique check. */
1643   struct line const *savedline = NULL;
1644                                 /* &saved if there is a saved line. */
1645   size_t savealloc = 0;         /* Size allocated for the saved line. */
1646   struct line const *cur[NMERGE]; /* Current line in each line table. */
1647   struct line const *base[NMERGE]; /* Base of each line table.  */
1648   size_t ord[NMERGE];           /* Table representing a permutation of fps,
1649                                    such that cur[ord[0]] is the smallest line
1650                                    and will be next output. */
1651   size_t i;
1652   size_t j;
1653   size_t t;
1654   struct keyfield const *key = keylist;
1655   saved.text = NULL;
1656
1657   /* Read initial lines from each input file. */
1658   for (i = 0; i < nfiles; )
1659     {
1660       fps[i] = xfopen (files[i], "r");
1661       initbuf (&buffer[i], sizeof (struct line),
1662                MAX (merge_buffer_size, sort_size / nfiles));
1663       if (fillbuf (&buffer[i], fps[i], files[i]))
1664         {
1665           struct line const *linelim = buffer_linelim (&buffer[i]);
1666           cur[i] = linelim - 1;
1667           base[i] = linelim - buffer[i].nlines;
1668           i++;
1669         }
1670       else
1671         {
1672           /* fps[i] is empty; eliminate it from future consideration.  */
1673           xfclose (fps[i], files[i]);
1674           if (i < ntemps)
1675             {
1676               ntemps--;
1677               zaptemp (files[i]);
1678             }
1679           free (buffer[i].buf);
1680           --nfiles;
1681           for (j = i; j < nfiles; ++j)
1682             files[j] = files[j + 1];
1683         }
1684     }
1685
1686   if (! ofp)
1687     ofp = xfopen (output_file, "w");
1688
1689   /* Set up the ord table according to comparisons among input lines.
1690      Since this only reorders two items if one is strictly greater than
1691      the other, it is stable. */
1692   for (i = 0; i < nfiles; ++i)
1693     ord[i] = i;
1694   for (i = 1; i < nfiles; ++i)
1695     if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
1696       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1697
1698   /* Repeatedly output the smallest line until no input remains. */
1699   while (nfiles)
1700     {
1701       struct line const *smallest = cur[ord[0]];
1702
1703       /* If uniquified output is turned on, output only the first of
1704          an identical series of lines. */
1705       if (unique)
1706         {
1707           if (savedline && compare (savedline, smallest))
1708             {
1709               savedline = NULL;
1710               write_bytes (saved.text, saved.length, ofp, output_file);
1711             }
1712           if (!savedline)
1713             {
1714               savedline = &saved;
1715               if (savealloc < smallest->length)
1716                 {
1717                   do
1718                     if (! savealloc)
1719                       {
1720                         savealloc = smallest->length;
1721                         break;
1722                       }
1723                   while ((savealloc *= 2) < smallest->length);
1724
1725                   saved.text = xrealloc (saved.text, savealloc);
1726                 }
1727               saved.length = smallest->length;
1728               memcpy (saved.text, smallest->text, saved.length);
1729               if (key)
1730                 {
1731                   saved.keybeg =
1732                     saved.text + (smallest->keybeg - smallest->text);
1733                   saved.keylim =
1734                     saved.text + (smallest->keylim - smallest->text);
1735                 }
1736             }
1737         }
1738       else
1739         write_bytes (smallest->text, smallest->length, ofp, output_file);
1740
1741       /* Check if we need to read more lines into core. */
1742       if (base[ord[0]] < smallest)
1743         cur[ord[0]] = smallest - 1;
1744       else
1745         {
1746           if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]]))
1747             {
1748               struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
1749               cur[ord[0]] = linelim - 1;
1750               base[ord[0]] = linelim - buffer[ord[0]].nlines;
1751             }
1752           else
1753             {
1754               /* We reached EOF on fps[ord[0]].  */
1755               for (i = 1; i < nfiles; ++i)
1756                 if (ord[i] > ord[0])
1757                   --ord[i];
1758               --nfiles;
1759               xfclose (fps[ord[0]], files[ord[0]]);
1760               if (ord[0] < ntemps)
1761                 {
1762                   ntemps--;
1763                   zaptemp (files[ord[0]]);
1764                 }
1765               free (buffer[ord[0]].buf);
1766               for (i = ord[0]; i < nfiles; ++i)
1767                 {
1768                   fps[i] = fps[i + 1];
1769                   files[i] = files[i + 1];
1770                   buffer[i] = buffer[i + 1];
1771                   cur[i] = cur[i + 1];
1772                   base[i] = base[i + 1];
1773                 }
1774               for (i = 0; i < nfiles; ++i)
1775                 ord[i] = ord[i + 1];
1776               continue;
1777             }
1778         }
1779
1780       /* The new line just read in may be larger than other lines
1781          already in main memory; push it back in the queue until we
1782          encounter a line larger than it.  Optimize for the common
1783          case where the new line is smallest.  */
1784       {
1785         size_t lo = 1;
1786         size_t hi = nfiles;
1787         size_t probe = lo;
1788         size_t ord0 = ord[0];
1789         size_t count_of_smaller_lines;
1790
1791         while (lo < hi)
1792           {
1793             int cmp = compare (cur[ord0], cur[ord[probe]]);
1794             if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
1795               hi = probe;
1796             else
1797               lo = probe + 1;
1798             probe = (lo + hi) / 2;
1799           }
1800
1801         count_of_smaller_lines = lo - 1;
1802         for (j = 0; j < count_of_smaller_lines; j++)
1803           ord[j] = ord[j + 1];
1804         ord[count_of_smaller_lines] = ord0;
1805       }
1806     }
1807
1808   if (unique && savedline)
1809     {
1810       write_bytes (saved.text, saved.length, ofp, output_file);
1811       free (saved.text);
1812     }
1813
1814   xfclose (ofp, output_file);
1815 }
1816
1817 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1818    and HI (with NHI members).  T, LO, and HI point just past their
1819    respective arrays, and the arrays are in reverse order.  NLO and
1820    NHI must be positive, and HI - NHI must equal T - (NLO + NHI).  */
1821
1822 static inline void
1823 mergelines (struct line *t,
1824             struct line const *lo, size_t nlo,
1825             struct line const *hi, size_t nhi)
1826 {
1827   for (;;)
1828     if (compare (lo - 1, hi - 1) <= 0)
1829       {
1830         *--t = *--lo;
1831         if (! --nlo)
1832           {
1833             /* HI - NHI equalled T - (NLO + NHI) when this function
1834                began.  Therefore HI must equal T now, and there is no
1835                need to copy from HI to T.  */
1836             return;
1837           }
1838       }
1839     else
1840       {
1841         *--t = *--hi;
1842         if (! --nhi)
1843           {
1844             do
1845               *--t = *--lo;
1846             while (--nlo);
1847
1848             return;
1849           }
1850       }
1851 }
1852
1853 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1854    NLINES must be at least 2.
1855    The input and output arrays are in reverse order, and LINES and
1856    TEMP point just past the end of their respective arrays.
1857
1858    Use a recursive divide-and-conquer algorithm, in the style
1859    suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23.  Use
1860    the optimization suggested by exercise 5.2.4-10; this requires room
1861    for only 1.5*N lines, rather than the usual 2*N lines.  Knuth
1862    writes that this memory optimization was originally published by
1863    D. A. Bell, Comp J. 1 (1958), 75.  */
1864
1865 static void
1866 sortlines (struct line *lines, size_t nlines, struct line *temp)
1867 {
1868   if (nlines == 2)
1869     {
1870       if (0 < compare (&lines[-1], &lines[-2]))
1871         {
1872           struct line tmp = lines[-1];
1873           lines[-1] = lines[-2];
1874           lines[-2] = tmp;
1875         }
1876     }
1877   else
1878     {
1879       size_t nlo = nlines / 2;
1880       size_t nhi = nlines - nlo;
1881       struct line *lo = lines;
1882       struct line *hi = lines - nlo;
1883       struct line *sorted_lo = temp;
1884
1885       sortlines (hi, nhi, temp);
1886       if (1 < nlo)
1887         sortlines_temp (lo, nlo, sorted_lo);
1888       else
1889         sorted_lo[-1] = lo[-1];
1890
1891       mergelines (lines, sorted_lo, nlo, hi, nhi);
1892     }
1893 }
1894
1895 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1896    rather than sorting in place.  */
1897
1898 static void
1899 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
1900 {
1901   if (nlines == 2)
1902     {
1903       bool swap = (0 < compare (&lines[-1], &lines[-2]));
1904       temp[-1] = lines[-1 - swap];
1905       temp[-2] = lines[-2 + swap];
1906     }
1907   else
1908     {
1909       size_t nlo = nlines / 2;
1910       size_t nhi = nlines - nlo;
1911       struct line *lo = lines;
1912       struct line *hi = lines - nlo;
1913       struct line *sorted_hi = temp - nlo;
1914
1915       sortlines_temp (hi, nhi, sorted_hi);
1916       if (1 < nlo)
1917         sortlines (lo, nlo, temp);
1918
1919       mergelines (temp, lo, nlo, sorted_hi, nhi);
1920     }
1921 }
1922
1923 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
1924    the same as OUTFILE.  If found, merge the found instances (and perhaps
1925    some other files) into a temporary file so that it can in turn be
1926    merged into OUTFILE without destroying OUTFILE before it is completely
1927    read.  Return the new value of NFILES, which differs from the old if
1928    some merging occurred.
1929
1930    This test ensures that an otherwise-erroneous use like
1931    "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
1932    It's not clear that POSIX requires this nicety.
1933    Detect common error cases, but don't try to catch obscure cases like
1934    "cat ... FILE ... | sort -m -o FILE"
1935    where traditional "sort" doesn't copy the input and where
1936    people should know that they're getting into trouble anyway.
1937    Catching these obscure cases would slow down performance in
1938    common cases.  */
1939
1940 static size_t
1941 avoid_trashing_input (char **files, size_t ntemps, size_t nfiles,
1942                       char const *outfile)
1943 {
1944   size_t i;
1945   bool got_outstat = false;
1946   struct stat outstat;
1947
1948   for (i = ntemps; i < nfiles; i++)
1949     {
1950       bool standard_input = STREQ (files[i], "-");
1951       bool same;
1952       struct stat instat;
1953
1954       if (outfile && STREQ (outfile, files[i]) && ! standard_input)
1955         same = true;
1956       else
1957         {
1958           if (! got_outstat)
1959             {
1960               if ((outfile
1961                    ? stat (outfile, &outstat)
1962                    : fstat (STDOUT_FILENO, &outstat))
1963                   != 0)
1964                 break;
1965               got_outstat = true;
1966             }
1967
1968           same = (((standard_input
1969                     ? fstat (STDIN_FILENO, &instat)
1970                     : stat (files[i], &instat))
1971                    == 0)
1972                   && SAME_INODE (instat, outstat));
1973         }
1974
1975       if (same)
1976         {
1977           FILE *tftp;
1978           char *temp = create_temp_file (&tftp);
1979           mergefps (&files[i], 0, nfiles - i, tftp, temp);
1980           files[i] = temp;
1981           return i + 1;
1982         }
1983     }
1984
1985   return nfiles;
1986 }
1987
1988 /* Merge the input FILES.  NTEMPS is the number of files at the
1989    start of FILES that are temporary; it is zero at the top level.
1990    NFILES is the total number of files.  Put the output in
1991    OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */
1992
1993 static void
1994 merge (char **files, size_t ntemps, size_t nfiles, char const *output_file)
1995 {
1996   while (NMERGE < nfiles)
1997     {
1998       /* Number of input files processed so far.  */
1999       size_t in;
2000
2001       /* Number of output files generated so far.  */
2002       size_t out;
2003
2004       /* nfiles % NMERGE; this counts input files that are left over
2005          after all full-sized merges have been done.  */
2006       size_t remainder;
2007
2008       /* Number of easily-available slots at the next loop iteration.  */
2009       size_t cheap_slots;
2010
2011       /* Do as many NMERGE-size merges as possible.  */
2012       for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
2013         {
2014           FILE *tfp;
2015           char *temp = create_temp_file (&tfp);
2016           size_t nt = MIN (ntemps, NMERGE);
2017           ntemps -= nt;
2018           mergefps (&files[in], nt, NMERGE, tfp, temp);
2019           files[out] = temp;
2020         }
2021
2022       remainder = nfiles - in;
2023       cheap_slots = NMERGE - out % NMERGE;
2024
2025       if (cheap_slots < remainder)
2026         {
2027           /* So many files remain that they can't all be put into the last
2028              NMERGE-sized output window.  Do one more merge.  Merge as few
2029              files as possible, to avoid needless I/O.  */
2030           size_t nshortmerge = remainder - cheap_slots + 1;
2031           FILE *tfp;
2032           char *temp = create_temp_file (&tfp);
2033           size_t nt = MIN (ntemps, nshortmerge);
2034           ntemps -= nt;
2035           mergefps (&files[in], nt, nshortmerge, tfp, temp);
2036           files[out++] = temp;
2037           in += nshortmerge;
2038         }
2039
2040       /* Put the remaining input files into the last NMERGE-sized output
2041          window, so they will be merged in the next pass.  */
2042       memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2043       ntemps += out;
2044       nfiles -= in - out;
2045     }
2046
2047   nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2048   mergefps (files, ntemps, nfiles, NULL, output_file);
2049 }
2050
2051 /* Sort NFILES FILES onto OUTPUT_FILE. */
2052
2053 static void
2054 sort (char * const *files, size_t nfiles, char const *output_file)
2055 {
2056   struct buffer buf;
2057   size_t ntemps = 0;
2058   bool output_file_created = false;
2059
2060   buf.alloc = 0;
2061
2062   while (nfiles)
2063     {
2064       char const *temp_output;
2065       char const *file = *files;
2066       FILE *fp = xfopen (file, "r");
2067       FILE *tfp;
2068       size_t bytes_per_line = (2 * sizeof (struct line)
2069                                - sizeof (struct line) / 2);
2070
2071       if (! buf.alloc)
2072         initbuf (&buf, bytes_per_line,
2073                  sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2074       buf.eof = false;
2075       files++;
2076       nfiles--;
2077
2078       while (fillbuf (&buf, fp, file))
2079         {
2080           struct line *line;
2081           struct line *linebase;
2082
2083           if (buf.eof && nfiles
2084               && (bytes_per_line + 1
2085                   < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2086             {
2087               /* End of file, but there is more input and buffer room.
2088                  Concatenate the next input file; this is faster in
2089                  the usual case.  */
2090               buf.left = buf.used;
2091               break;
2092             }
2093
2094           line = buffer_linelim (&buf);
2095           linebase = line - buf.nlines;
2096           if (1 < buf.nlines)
2097             sortlines (line, buf.nlines, linebase);
2098           if (buf.eof && !nfiles && !ntemps && !buf.left)
2099             {
2100               xfclose (fp, file);
2101               tfp = xfopen (output_file, "w");
2102               temp_output = output_file;
2103               output_file_created = true;
2104             }
2105           else
2106             {
2107               ++ntemps;
2108               temp_output = create_temp_file (&tfp);
2109             }
2110
2111           do
2112             {
2113               line--;
2114               write_bytes (line->text, line->length, tfp, temp_output);
2115               if (unique)
2116                 while (linebase < line && compare (line, line - 1) == 0)
2117                   line--;
2118             }
2119           while (linebase < line);
2120
2121           xfclose (tfp, temp_output);
2122
2123           if (output_file_created)
2124             goto finish;
2125         }
2126       xfclose (fp, file);
2127     }
2128
2129  finish:
2130   free (buf.buf);
2131
2132   if (! output_file_created)
2133     {
2134       size_t i;
2135       struct tempnode *node = temphead;
2136       char **tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2137       for (i = 0; node; i++)
2138         {
2139           tempfiles[i] = node->name;
2140           node = node->next;
2141         }
2142       merge (tempfiles, ntemps, ntemps, output_file);
2143       free (tempfiles);
2144     }
2145 }
2146
2147 /* Insert key KEY at the end of the key list.  */
2148
2149 static void
2150 insertkey (struct keyfield *key)
2151 {
2152   struct keyfield **p;
2153
2154   for (p = &keylist; *p; p = &(*p)->next)
2155     continue;
2156   *p = key;
2157   key->next = NULL;
2158 }
2159
2160 /* Report a bad field specification SPEC, with extra info MSGID.  */
2161
2162 static void badfieldspec (char const *, char const *)
2163      ATTRIBUTE_NORETURN;
2164 static void
2165 badfieldspec (char const *spec, char const *msgid)
2166 {
2167   error (SORT_FAILURE, 0, _("%s: invalid field specification `%s'"),
2168          _(msgid), spec);
2169   abort ();
2170 }
2171
2172 /* Parse the leading integer in STRING and store the resulting value
2173    (which must fit into size_t) into *VAL.  Return the address of the
2174    suffix after the integer.  If MSGID is NULL, return NULL after
2175    failure; otherwise, report MSGID and exit on failure.  */
2176
2177 static char const *
2178 parse_field_count (char const *string, size_t *val, char const *msgid)
2179 {
2180   char *suffix;
2181   uintmax_t n;
2182
2183   switch (xstrtoumax (string, &suffix, 10, &n, ""))
2184     {
2185     case LONGINT_OK:
2186     case LONGINT_INVALID_SUFFIX_CHAR:
2187       *val = n;
2188       if (*val == n)
2189         break;
2190       /* Fall through.  */
2191     case LONGINT_OVERFLOW:
2192     case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2193       if (msgid)
2194         error (SORT_FAILURE, 0, _("%s: count `%.*s' too large"),
2195                _(msgid), (int) (suffix - string), string);
2196       return NULL;
2197
2198     case LONGINT_INVALID:
2199       if (msgid)
2200         error (SORT_FAILURE, 0, _("%s: invalid count at start of `%s'"),
2201                _(msgid), string);
2202       return NULL;
2203     }
2204
2205   return suffix;
2206 }
2207
2208 /* Handle interrupts and hangups. */
2209
2210 static void
2211 sighandler (int sig)
2212 {
2213 #ifndef SA_NOCLDSTOP
2214   signal (sig, SIG_IGN);
2215 #endif
2216
2217   cleanup ();
2218
2219   signal (sig, SIG_DFL);
2220   raise (sig);
2221 }
2222
2223 /* Set the ordering options for KEY specified in S.
2224    Return the address of the first character in S that
2225    is not a valid ordering option.
2226    BLANKTYPE is the kind of blanks that 'b' should skip. */
2227
2228 static char *
2229 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2230 {
2231   while (*s)
2232     {
2233       switch (*s)
2234         {
2235         case 'b':
2236           if (blanktype == bl_start || blanktype == bl_both)
2237             key->skipsblanks = true;
2238           if (blanktype == bl_end || blanktype == bl_both)
2239             key->skipeblanks = true;
2240           break;
2241         case 'd':
2242           key->ignore = nondictionary;
2243           break;
2244         case 'f':
2245           key->translate = fold_toupper;
2246           break;
2247         case 'g':
2248           key->general_numeric = true;
2249           break;
2250         case 'i':
2251           /* Option order should not matter, so don't let -i override
2252              -d.  -d implies -i, but -i does not imply -d.  */
2253           if (! key->ignore)
2254             key->ignore = nonprinting;
2255           break;
2256         case 'M':
2257           key->month = true;
2258           break;
2259         case 'n':
2260           key->numeric = true;
2261           break;
2262         case 'r':
2263           key->reverse = true;
2264           break;
2265         default:
2266           return (char *) s;
2267         }
2268       ++s;
2269     }
2270   return (char *) s;
2271 }
2272
2273 static struct keyfield *
2274 new_key (void)
2275 {
2276   struct keyfield *key = xzalloc (sizeof *key);
2277   key->eword = SIZE_MAX;
2278   return key;
2279 }
2280
2281 int
2282 main (int argc, char **argv)
2283 {
2284   struct keyfield *key;
2285   struct keyfield gkey;
2286   char const *s;
2287   int c = 0;
2288   bool checkonly = false;
2289   bool mergeonly = false;
2290   size_t nfiles = 0;
2291   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2292   bool obsolete_usage = (posix2_version () < 200112);
2293   char const *short_options = (obsolete_usage
2294                                ? COMMON_SHORT_OPTIONS "y::"
2295                                : COMMON_SHORT_OPTIONS "y:");
2296   char *minus = "-", **files;
2297   char const *outfile = NULL;
2298
2299   initialize_main (&argc, &argv);
2300   program_name = argv[0];
2301   setlocale (LC_ALL, "");
2302   bindtextdomain (PACKAGE, LOCALEDIR);
2303   textdomain (PACKAGE);
2304
2305   atexit (cleanup);
2306
2307   initialize_exit_failure (SORT_FAILURE);
2308   atexit (close_stdout);
2309
2310   hard_LC_COLLATE = hard_locale (LC_COLLATE);
2311 #if HAVE_NL_LANGINFO
2312   hard_LC_TIME = hard_locale (LC_TIME);
2313 #endif
2314
2315   /* Get locale's representation of the decimal point.  */
2316   {
2317     struct lconv const *locale = localeconv ();
2318
2319     /* If the locale doesn't define a decimal point, or if the decimal
2320        point is multibyte, use the C locale's decimal point.  FIXME:
2321        add support for multibyte decimal points.  */
2322     decimal_point = locale->decimal_point[0];
2323     if (! decimal_point || locale->decimal_point[1])
2324       decimal_point = '.';
2325
2326     /* FIXME: add support for multibyte thousands separators.  */
2327     thousands_sep = *locale->thousands_sep;
2328     if (! thousands_sep || locale->thousands_sep[1])
2329       thousands_sep = CHAR_MAX + 1;
2330   }
2331
2332   have_read_stdin = false;
2333   inittables ();
2334
2335   {
2336     size_t i;
2337     static int const sig[] = { SIGHUP, SIGINT, SIGPIPE, SIGTERM };
2338     enum { nsigs = sizeof sig / sizeof sig[0] };
2339
2340 #ifdef SA_NOCLDSTOP
2341     struct sigaction act;
2342
2343     sigemptyset (&caught_signals);
2344     for (i = 0; i < nsigs; i++)
2345       {
2346         sigaction (sig[i], NULL, &act);
2347         if (act.sa_handler != SIG_IGN)
2348           sigaddset (&caught_signals, sig[i]);
2349       }
2350
2351     act.sa_handler = sighandler;
2352     act.sa_mask = caught_signals;
2353     act.sa_flags = 0;
2354
2355     for (i = 0; i < nsigs; i++)
2356       if (sigismember (&caught_signals, sig[i]))
2357         sigaction (sig[i], &act, NULL);
2358 #else
2359     for (i = 0; i < nsigs; i++)
2360       if (signal (sig[i], SIG_IGN) != SIG_IGN)
2361         signal (sig[i], sighandler);
2362 #endif
2363   }
2364
2365   gkey.sword = gkey.eword = SIZE_MAX;
2366   gkey.ignore = NULL;
2367   gkey.translate = NULL;
2368   gkey.numeric = gkey.general_numeric = gkey.month = gkey.reverse = false;
2369   gkey.skipsblanks = gkey.skipeblanks = false;
2370
2371   files = xnmalloc (argc, sizeof *files);
2372
2373   for (;;)
2374     {
2375       /* Parse an operand as a file after "--" was seen; or if
2376          pedantic and a file was seen, unless the POSIX version
2377          predates 1003.1-2001 and -c was not seen and the operand is
2378          "-o FILE" or "-oFILE".  */
2379
2380       if (c == -1
2381           || (posixly_correct && nfiles != 0
2382               && ! (obsolete_usage
2383                     && ! checkonly
2384                     && optind != argc
2385                     && argv[optind][0] == '-' && argv[optind][1] == 'o'
2386                     && (argv[optind][2] || optind + 1 != argc)))
2387           || ((c = getopt_long (argc, argv, short_options,
2388                                 long_options, NULL))
2389               == -1))
2390         {
2391           if (argc <= optind)
2392             break;
2393           files[nfiles++] = argv[optind++];
2394         }
2395       else switch (c)
2396         {
2397         case 1:
2398           key = NULL;
2399           if (obsolete_usage && optarg[0] == '+')
2400             {
2401               /* Treat +POS1 [-POS2] as a key if possible; but silently
2402                  treat an operand as a file if it is not a valid +POS1.  */
2403               key = new_key ();
2404               s = parse_field_count (optarg + 1, &key->sword, NULL);
2405               if (s && *s == '.')
2406                 s = parse_field_count (s + 1, &key->schar, NULL);
2407               if (! (key->sword | key->schar))
2408                 key->sword = SIZE_MAX;
2409               if (! s || *set_ordering (s, key, bl_start))
2410                 {
2411                   free (key);
2412                   key = NULL;
2413                 }
2414               else
2415                 {
2416                   if (optind != argc && argv[optind][0] == '-'
2417                       && ISDIGIT (argv[optind][1]))
2418                     {
2419                       char const *optarg1 = argv[optind++];
2420                       s = parse_field_count (optarg1 + 1, &key->eword,
2421                                              N_("invalid number after `-'"));
2422                       if (*s == '.')
2423                         s = parse_field_count (s + 1, &key->echar,
2424                                                N_("invalid number after `.'"));
2425                       if (*set_ordering (s, key, bl_end))
2426                         badfieldspec (optarg1,
2427                                       N_("stray character in field spec"));
2428                     }
2429                   insertkey (key);
2430                 }
2431             }
2432           if (! key)
2433             files[nfiles++] = optarg;
2434           break;
2435
2436         case 'b':
2437         case 'd':
2438         case 'f':
2439         case 'g':
2440         case 'i':
2441         case 'M':
2442         case 'n':
2443         case 'r':
2444           {
2445             char str[2];
2446             str[0] = c;
2447             str[1] = '\0';
2448             set_ordering (str, &gkey, bl_both);
2449           }
2450           break;
2451
2452         case 'c':
2453           checkonly = true;
2454           break;
2455
2456         case 'k':
2457           key = new_key ();
2458
2459           /* Get POS1. */
2460           s = parse_field_count (optarg, &key->sword,
2461                                  N_("invalid number at field start"));
2462           if (! key->sword--)
2463             {
2464               /* Provoke with `sort -k0' */
2465               badfieldspec (optarg, N_("field number is zero"));
2466             }
2467           if (*s == '.')
2468             {
2469               s = parse_field_count (s + 1, &key->schar,
2470                                      N_("invalid number after `.'"));
2471               if (! key->schar--)
2472                 {
2473                   /* Provoke with `sort -k1.0' */
2474                   badfieldspec (optarg, N_("character offset is zero"));
2475                 }
2476             }
2477           if (! (key->sword | key->schar))
2478             key->sword = SIZE_MAX;
2479           s = set_ordering (s, key, bl_start);
2480           if (*s != ',')
2481             {
2482               key->eword = SIZE_MAX;
2483               key->echar = 0;
2484             }
2485           else
2486             {
2487               /* Get POS2. */
2488               s = parse_field_count (s + 1, &key->eword,
2489                                      N_("invalid number after `,'"));
2490               if (! key->eword--)
2491                 {
2492                   /* Provoke with `sort -k1,0' */
2493                   badfieldspec (optarg, N_("field number is zero"));
2494                 }
2495               if (*s == '.')
2496                 s = parse_field_count (s + 1, &key->echar,
2497                                        N_("invalid number after `.'"));
2498               else
2499                 {
2500                   /* `-k 2,3' is equivalent to `+1 -3'.  */
2501                   key->eword++;
2502                 }
2503               s = set_ordering (s, key, bl_end);
2504             }
2505           if (*s)
2506             badfieldspec (optarg, N_("stray character in field spec"));
2507           insertkey (key);
2508           break;
2509
2510         case 'm':
2511           mergeonly = true;
2512           break;
2513
2514         case 'o':
2515           if (outfile && !STREQ (outfile, optarg))
2516             error (SORT_FAILURE, 0, _("multiple output files specified"));
2517           outfile = optarg;
2518           break;
2519
2520         case 's':
2521           stable = true;
2522           break;
2523
2524         case 'S':
2525           specify_sort_size (optarg);
2526           break;
2527
2528         case 't':
2529           {
2530             char newtab = optarg[0];
2531             if (! newtab)
2532               error (SORT_FAILURE, 0, _("empty tab"));
2533             if (optarg[1])
2534               {
2535                 if (STREQ (optarg, "\\0"))
2536                   newtab = '\0';
2537                 else
2538                   {
2539                     /* Provoke with `sort -txx'.  Complain about
2540                        "multi-character tab" instead of "multibyte tab", so
2541                        that the diagnostic's wording does not need to be
2542                        changed once multibyte characters are supported.  */
2543                     error (SORT_FAILURE, 0, _("multi-character tab `%s'"),
2544                            optarg);
2545                   }
2546               }
2547             if (tab != TAB_DEFAULT && tab != newtab)
2548               error (SORT_FAILURE, 0, _("incompatible tabs"));
2549             tab = newtab;
2550           }
2551           break;
2552
2553         case 'T':
2554           add_temp_dir (optarg);
2555           break;
2556
2557         case 'u':
2558           unique = true;
2559           break;
2560
2561         case 'y':
2562           /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
2563              through Solaris 7.  It is also accepted by many non-Solaris
2564              "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
2565              -y is marked as obsolete starting with Solaris 8 (1999), but is
2566              still accepted as of Solaris 10 prerelease (2004).
2567
2568              Solaris 2.5.1 "sort -y 100" reads the input file "100", but
2569              emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
2570              and which in general ignores the argument after "-y" if it
2571              consists entirely of digits (it can even be empty).  */
2572           if (!optarg && optind != argc)
2573             {
2574               char const *p;
2575               for (p = argv[optind]; ISDIGIT (*p); p++)
2576                 continue;
2577               optind += !*p;
2578             }
2579           break;
2580
2581         case 'z':
2582           eolchar = 0;
2583           break;
2584
2585         case_GETOPT_HELP_CHAR;
2586
2587         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2588
2589         default:
2590           usage (SORT_FAILURE);
2591         }
2592     }
2593
2594   /* Inheritance of global options to individual keys. */
2595   for (key = keylist; key; key = key->next)
2596     if (! (key->ignore || key->translate
2597            || (key->skipsblanks | key->reverse
2598                | key->skipeblanks | key->month | key->numeric
2599                | key->general_numeric)))
2600       {
2601         key->ignore = gkey.ignore;
2602         key->translate = gkey.translate;
2603         key->skipsblanks = gkey.skipsblanks;
2604         key->skipeblanks = gkey.skipeblanks;
2605         key->month = gkey.month;
2606         key->numeric = gkey.numeric;
2607         key->general_numeric = gkey.general_numeric;
2608         key->reverse = gkey.reverse;
2609       }
2610
2611   if (!keylist && (gkey.ignore || gkey.translate
2612                    || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
2613                        | gkey.numeric | gkey.general_numeric)))
2614     insertkey (&gkey);
2615   reverse = gkey.reverse;
2616
2617   if (temp_dir_count == 0)
2618     {
2619       char const *tmp_dir = getenv ("TMPDIR");
2620       add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
2621     }
2622
2623   if (nfiles == 0)
2624     {
2625       nfiles = 1;
2626       files = &minus;
2627     }
2628
2629   if (checkonly)
2630     {
2631       if (nfiles > 1)
2632         {
2633           error (0, 0, _("extra operand %s not allowed with -c"),
2634                  quote (files[1]));
2635           usage (SORT_FAILURE);
2636         }
2637
2638       /* POSIX requires that sort return 1 IFF invoked with -c and the
2639          input is not properly sorted.  */
2640       exit (check (files[0]) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
2641     }
2642
2643   if (mergeonly)
2644     merge (files, 0, nfiles, outfile);
2645   else
2646     sort (files, nfiles, outfile);
2647
2648   if (have_read_stdin && fclose (stdin) == EOF)
2649     die (_("close failed"), "-");
2650
2651   exit (EXIT_SUCCESS);
2652 }