1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 88, 1991-2004 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written December 1988 by Mike Haertel.
19 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
20 or (US mail) as Mike Haertel c/o Free Software Foundation.
22 Ørn E. Hansen added NLS support in 1997. */
27 #include <sys/types.h>
32 #include "hard-locale.h"
37 #include "stdio-safer.h"
41 #if HAVE_SYS_RESOURCE_H
42 # include <sys/resource.h>
45 struct rlimit { size_t rlim_cur; };
46 # define getrlimit(Resource, Rlp) (-1)
49 /* The official name of this program (e.g., no `g' prefix). */
50 #define PROGRAM_NAME "sort"
52 #define AUTHORS "Mike Haertel", "Paul Eggert"
54 #if HAVE_LANGINFO_CODESET
55 # include <langinfo.h>
59 # define sigprocmask(How, Set, Oset) /* empty */
67 #define UCHAR_LIM (UCHAR_MAX + 1)
69 #ifndef DEFAULT_TMPDIR
70 # define DEFAULT_TMPDIR "/tmp"
76 /* POSIX says to exit with status 1 if invoked with -c and the
77 input is not properly sorted. */
78 SORT_OUT_OF_ORDER = 1,
80 /* POSIX says any other irregular exit must exit with a status
81 code greater than 1. */
85 #define C_DECIMAL_POINT '.'
86 #define NEGATION_SIGN '-'
87 #define NUMERIC_ZERO '0'
91 static char decimal_point;
92 static int th_sep; /* if CHAR_MAX + 1, then there is no thousands separator */
94 /* Nonzero if the corresponding locales are hard. */
95 static bool hard_LC_COLLATE;
97 static bool hard_LC_TIME;
100 # define IS_THOUSANDS_SEP(x) ((x) == th_sep)
104 # define decimal_point C_DECIMAL_POINT
105 # define IS_THOUSANDS_SEP(x) false
109 #define NONZERO(x) (x != 0)
111 /* The kind of blanks for '-b' to skip in various options. */
112 enum blanktype { bl_start, bl_end, bl_both };
114 /* The character marking end of line. Default to \n. */
115 static char eolchar = '\n';
117 /* Lines are held in core as counted strings. */
120 char *text; /* Text of the line. */
121 size_t length; /* Length including final newline. */
122 char *keybeg; /* Start of first key. */
123 char *keylim; /* Limit of first key. */
129 char *buf; /* Dynamically allocated buffer,
130 partitioned into 3 regions:
133 - an array of lines, in reverse order. */
134 size_t used; /* Number of bytes used for input data. */
135 size_t nlines; /* Number of lines in the line array. */
136 size_t alloc; /* Number of bytes allocated. */
137 size_t left; /* Number of bytes left from previous reads. */
138 size_t line_bytes; /* Number of bytes to reserve for each line. */
139 bool eof; /* An EOF has been read. */
144 size_t sword; /* Zero-origin 'word' to start at. */
145 size_t schar; /* Additional characters to skip. */
146 size_t eword; /* Zero-origin first word after field. */
147 size_t echar; /* Additional characters in field. */
148 bool const *ignore; /* Boolean array of characters to ignore. */
149 char const *translate; /* Translation applied to characters. */
150 bool skipsblanks; /* Skip leading blanks when finding start. */
151 bool skipeblanks; /* Skip leading blanks when finding end. */
152 bool numeric; /* Flag for numeric comparison. Handle
153 strings of digits with optional decimal
154 point, but no exponential notation. */
155 bool general_numeric; /* Flag for general, numeric comparison.
156 Handle numbers in exponential notation. */
157 bool month; /* Flag for comparison by month name. */
158 bool reverse; /* Reverse the sense of comparison. */
159 struct keyfield *next; /* Next keyfield to try. */
168 /* The name this program was run with. */
171 /* FIXME: None of these tables work with multibyte character sets.
172 Also, there are many other bugs when handling multibyte characters.
173 One way to fix this is to rewrite `sort' to use wide characters
174 internally, but doing this with good performance is a bit
177 /* Table of blanks. */
178 static bool blanks[UCHAR_LIM];
180 /* Table of non-printing characters. */
181 static bool nonprinting[UCHAR_LIM];
183 /* Table of non-dictionary characters (not letters, digits, or blanks). */
184 static bool nondictionary[UCHAR_LIM];
186 /* Translation table folding lower case to upper. */
187 static char fold_toupper[UCHAR_LIM];
189 #define MONTHS_PER_YEAR 12
191 /* Table mapping month names to integers.
192 Alphabetic order allows binary search. */
193 static struct month monthtab[] =
209 /* During the merge phase, the number of files to merge at once. */
212 /* Minimum size for a merge or check buffer. */
213 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
215 /* Minimum sort size; the code might not work with smaller sizes. */
216 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
218 /* The number of bytes needed for a merge or check buffer, which can
219 function relatively efficiently even if it holds only one line. If
220 a longer line is seen, this value is increased. */
221 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
223 /* The approximate maximum number of bytes of main memory to use, as
224 specified by the user. Zero if the user has not specified a size. */
225 static size_t sort_size;
227 /* The guessed size for non-regular files. */
228 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
230 /* Array of directory names in which any temporary files are to be created. */
231 static char const **temp_dirs;
233 /* Number of temporary directory names used. */
234 static size_t temp_dir_count;
236 /* Number of allocated slots in temp_dirs. */
237 static size_t temp_dir_alloc;
239 /* Flag to reverse the order of all comparisons. */
242 /* Flag for stable sort. This turns off the last ditch bytewise
243 comparison of lines, and instead leaves lines in the same order
244 they were read if all keys compare equal. */
247 /* If TAB has this value, blanks separate fields. */
248 enum { TAB_DEFAULT = CHAR_MAX + 1 };
250 /* Tab character separating fields. If TAB_DEFAULT, then fields are
251 separated by the empty string between a non-blank character and a blank
253 static int tab = TAB_DEFAULT;
255 /* Flag to remove consecutive duplicate lines from the output.
256 Only the last of a sequence of equal lines will be output. */
259 /* Nonzero if any of the input files are the standard input. */
260 static bool have_read_stdin;
262 /* List of key field comparisons to be tried. */
263 static struct keyfield *keylist;
265 static void sortlines_temp (struct line *, size_t, struct line *);
267 /* Report MESSAGE for FILE, then clean up and exit.
268 If FILE is null, it represents standard output. */
270 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
272 die (char const *message, char const *file)
274 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
281 if (status != EXIT_SUCCESS)
282 fprintf (stderr, _("Try `%s --help' for more information.\n"),
287 Usage: %s [OPTION]... [FILE]...\n\
291 Write sorted concatenation of all FILE(s) to standard output.\n\
297 Mandatory arguments to long options are mandatory for short options too.\n\
300 -b, --ignore-leading-blanks ignore leading blanks\n\
301 -d, --dictionary-order consider only blanks and alphanumeric characters\n\
302 -f, --ignore-case fold lower case to upper case characters\n\
305 -g, --general-numeric-sort compare according to general numerical value\n\
306 -i, --ignore-nonprinting consider only printable characters\n\
307 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
308 -n, --numeric-sort compare according to string numerical value\n\
309 -r, --reverse reverse the result of comparisons\n\
315 -c, --check check whether input is sorted; do not sort\n\
316 -k, --key=POS1[,POS2] start a key at POS1, end it at POS 2 (origin 1)\n\
317 -m, --merge merge already sorted files; do not sort\n\
318 -o, --output=FILE write result to FILE instead of standard output\n\
319 -s, --stable stabilize sort by disabling last-resort comparison\n\
320 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
323 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
324 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
325 multiple options specify multiple directories\n\
326 -u, --unique with -c, check for strict ordering;\n\
327 without -c, output only the first of an equal run\n\
330 -z, --zero-terminated end lines with 0 byte, not newline\n\
332 fputs (HELP_OPTION_DESCRIPTION, stdout);
333 fputs (VERSION_OPTION_DESCRIPTION, stdout);
336 POS is F[.C][OPTS], where F is the field number and C the character position\n\
337 in the field. OPTS is one or more single-letter ordering options, which\n\
338 override global ordering options for that key. If no key is given, use the\n\
339 entire line as the key.\n\
341 SIZE may be followed by the following multiplicative suffixes:\n\
344 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
346 With no FILE, or when FILE is -, read standard input.\n\
349 The locale specified by the environment affects sort order.\n\
350 Set LC_ALL=C to get the traditional sort order that uses\n\
351 native byte values.\n\
353 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
359 #define COMMON_SHORT_OPTIONS "-bcdfgik:mMno:rsS:t:T:uz"
361 static struct option const long_options[] =
363 {"ignore-leading-blanks", no_argument, NULL, 'b'},
364 {"check", no_argument, NULL, 'c'},
365 {"dictionary-order", no_argument, NULL, 'd'},
366 {"ignore-case", no_argument, NULL, 'f'},
367 {"general-numeric-sort", no_argument, NULL, 'g'},
368 {"ignore-nonprinting", no_argument, NULL, 'i'},
369 {"key", required_argument, NULL, 'k'},
370 {"merge", no_argument, NULL, 'm'},
371 {"month-sort", no_argument, NULL, 'M'},
372 {"numeric-sort", no_argument, NULL, 'n'},
373 {"output", required_argument, NULL, 'o'},
374 {"reverse", no_argument, NULL, 'r'},
375 {"stable", no_argument, NULL, 's'},
376 {"buffer-size", required_argument, NULL, 'S'},
377 {"field-separator", required_argument, NULL, 't'},
378 {"temporary-directory", required_argument, NULL, 'T'},
379 {"unique", no_argument, NULL, 'u'},
380 {"zero-terminated", no_argument, NULL, 'z'},
381 {GETOPT_HELP_OPTION_DECL},
382 {GETOPT_VERSION_OPTION_DECL},
386 /* The set of signals that are caught. */
387 static sigset_t caught_signals;
389 /* The list of temporary files. */
392 struct tempnode *volatile next;
393 char name[1]; /* Actual size is 1 + file name length. */
395 static struct tempnode *volatile temphead;
396 static struct tempnode *volatile *temptail = &temphead;
398 /* Clean up any remaining temporary files. */
403 struct tempnode const *node;
405 for (node = temphead; node; node = node->next)
409 /* Create a new temporary file, returning its newly allocated name.
410 Store into *PFP a stream open for writing. */
413 create_temp_file (FILE **pfp)
415 static char const slashbase[] = "/sortXXXXXX";
416 static size_t temp_dir_index;
420 char const *temp_dir = temp_dirs[temp_dir_index];
421 size_t len = strlen (temp_dir);
422 struct tempnode *node =
423 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
424 char *file = node->name;
426 memcpy (file, temp_dir, len);
427 memcpy (file + len, slashbase, sizeof slashbase);
429 if (++temp_dir_index == temp_dir_count)
432 /* Create the temporary file in a critical section, to avoid races. */
433 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
438 temptail = &node->next;
441 sigprocmask (SIG_SETMASK, &oldset, NULL);
444 if (fd < 0 || (*pfp = fdopen (fd, "w")) == NULL)
445 die (_("cannot create temporary file"), file);
450 /* Return a stream for FILE, opened with mode HOW. A null FILE means
451 standard output; HOW should be "w". When opening for input, "-"
452 means standard input. To avoid confusion, do not return file
453 descriptors 0, 1, or 2. */
456 xfopen (const char *file, const char *how)
462 else if (STREQ (file, "-") && *how == 'r')
464 have_read_stdin = true;
469 if ((fp = fopen_safer (file, how)) == NULL)
470 die (_("open failed"), file);
476 /* Close FP, whose name is FILE, and report any errors. */
479 xfclose (FILE *fp, char const *file)
483 /* Allow reading stdin from tty more than once. */
487 else if (fp == stdout)
489 /* Don't close stdout just yet. close_stdout does that. */
490 if (fflush (fp) != 0)
491 die (_("fflush failed"), file);
495 if (fclose (fp) != 0)
496 die (_("close failed"), file);
501 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
503 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
504 die (_("write failed"), output_file);
507 /* Append DIR to the array of temporary directory names. */
509 add_temp_dir (char const *dir)
511 if (temp_dir_count == temp_dir_alloc)
512 temp_dirs = x2nrealloc (temp_dirs, &temp_dir_alloc, sizeof *temp_dirs);
514 temp_dirs[temp_dir_count++] = dir;
517 /* Search through the list of temporary files for NAME;
518 remove it if it is found on the list. */
521 zaptemp (const char *name)
523 struct tempnode *volatile *pnode;
524 struct tempnode *node;
527 for (pnode = &temphead; (node = *pnode); pnode = &node->next)
528 if (node->name == name)
530 /* Unlink the temporary file in a critical section, to avoid races. */
531 struct tempnode *t = node->next;
532 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
535 sigprocmask (SIG_SETMASK, &oldset, NULL);
546 struct_month_cmp (const void *m1, const void *m2)
548 struct month const *month1 = m1;
549 struct month const *month2 = m2;
550 return strcmp (month1->name, month2->name);
555 /* Initialize the character class tables. */
562 for (i = 0; i < UCHAR_LIM; ++i)
564 blanks[i] = !!ISBLANK (i);
565 nonprinting[i] = !ISPRINT (i);
566 nondictionary[i] = !ISALNUM (i) && !ISBLANK (i);
567 fold_toupper[i] = (ISLOWER (i) ? toupper (i) : i);
571 /* If we're not in the "C" locale, read different names for months. */
574 for (i = 0; i < MONTHS_PER_YEAR; i++)
581 s = (char *) nl_langinfo (ABMON_1 + i);
583 monthtab[i].name = name = xmalloc (s_len + 1);
584 monthtab[i].val = i + 1;
586 for (j = 0; j < s_len; j++)
587 name[j] = fold_toupper[to_uchar (s[j])];
590 qsort ((void *) monthtab, MONTHS_PER_YEAR,
591 sizeof *monthtab, struct_month_cmp);
596 /* Specify the amount of main memory to use when sorting. */
598 specify_sort_size (char const *s)
602 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
604 /* The default unit is KiB. */
605 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
607 if (n <= UINTMAX_MAX / 1024)
610 e = LONGINT_OVERFLOW;
613 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
614 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
623 double mem = physmem_total () * n / 100;
625 /* Use "<", not "<=", to avoid problems with rounding. */
626 if (mem < UINTMAX_MAX)
632 e = LONGINT_OVERFLOW;
639 /* If multiple sort sizes are specified, take the maximum, so
640 that option order does not matter. */
647 sort_size = MAX (sort_size, MIN_SORT_SIZE);
651 e = LONGINT_OVERFLOW;
654 STRTOL_FATAL_ERROR (s, _("sort size"), e);
657 /* Return the default sort size. */
659 default_sort_size (void)
661 /* Let MEM be available memory or 1/8 of total memory, whichever
663 double avail = physmem_available ();
664 double total = physmem_total ();
665 double mem = MAX (avail, total / 8);
666 struct rlimit rlimit;
668 /* Let SIZE be MEM, but no more than the maximum object size or
669 system resource limits. Avoid the MIN macro here, as it is not
670 quite right when only one argument is floating point. Don't
671 bother to check for values like RLIM_INFINITY since in practice
672 they are not much less than SIZE_MAX. */
673 size_t size = SIZE_MAX;
676 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
677 size = rlimit.rlim_cur;
679 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
680 size = rlimit.rlim_cur;
683 /* Leave a large safety margin for the above limits, as failure can
684 occur when they are exceeded. */
688 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
689 Exceeding RSS is not fatal, but can be quite slow. */
690 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
691 size = rlimit.rlim_cur / 16 * 15;
694 /* Use no less than the minimum. */
695 return MAX (size, MIN_SORT_SIZE);
698 /* Return the sort buffer size to use with the input files identified
699 by FPS and FILES, which are alternate paths to the same files.
700 NFILES gives the number of input files; NFPS may be less. Assume
701 that each input line requires LINE_BYTES extra bytes' worth of line
702 information. Do not exceed a bound on the size: if the bound is
703 not specified by the user, use a default. */
706 sort_buffer_size (FILE *const *fps, size_t nfps,
707 char *const *files, size_t nfiles,
710 /* A bound on the input size. If zero, the bound hasn't been
712 static size_t size_bound;
714 /* In the worst case, each input byte is a newline. */
715 size_t worst_case_per_input_byte = line_bytes + 1;
717 /* Keep enough room for one extra input line and an extra byte.
718 This extra room might be needed when preparing to read EOF. */
719 size_t size = worst_case_per_input_byte + 1;
723 for (i = 0; i < nfiles; i++)
729 if ((i < nfps ? fstat (fileno (fps[i]), &st)
730 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
731 : stat (files[i], &st))
733 die (_("stat failed"), files[i]);
735 if (S_ISREG (st.st_mode))
736 file_size = st.st_size;
739 /* The file has unknown size. If the user specified a sort
740 buffer size, use that; otherwise, guess the size. */
743 file_size = INPUT_FILE_SIZE_GUESS;
748 size_bound = sort_size;
750 size_bound = default_sort_size ();
753 /* Add the amount of memory needed to represent the worst case
754 where the input consists entirely of newlines followed by a
755 single non-newline. Check for overflow. */
756 worst_case = file_size * worst_case_per_input_byte + 1;
757 if (file_size != worst_case / worst_case_per_input_byte
758 || size_bound - size <= worst_case)
766 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
767 must be at least sizeof (struct line). Allocate ALLOC bytes
771 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
773 /* Ensure that the line array is properly aligned. If the desired
774 size cannot be allocated, repeatedly halve it until allocation
775 succeeds. The smaller allocation may hurt overall performance,
776 but that's better than failing. */
779 alloc += sizeof (struct line) - alloc % sizeof (struct line);
780 buf->buf = malloc (alloc);
784 if (alloc <= line_bytes + 1)
788 buf->line_bytes = line_bytes;
790 buf->used = buf->left = buf->nlines = 0;
794 /* Return one past the limit of the line array. */
796 static inline struct line *
797 buffer_linelim (struct buffer const *buf)
799 return (struct line *) (buf->buf + buf->alloc);
802 /* Return a pointer to the first character of the field specified
806 begfield (const struct line *line, const struct keyfield *key)
808 register char *ptr = line->text, *lim = ptr + line->length - 1;
809 register size_t sword = key->sword;
810 register size_t schar = key->schar;
811 register size_t remaining_bytes;
813 /* The leading field separator itself is included in a field when -t
816 if (tab != TAB_DEFAULT)
817 while (ptr < lim && sword--)
819 while (ptr < lim && *ptr != tab)
825 while (ptr < lim && sword--)
827 while (ptr < lim && blanks[to_uchar (*ptr)])
829 while (ptr < lim && !blanks[to_uchar (*ptr)])
833 if (key->skipsblanks)
834 while (ptr < lim && blanks[to_uchar (*ptr)])
837 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
838 remaining_bytes = lim - ptr;
839 if (schar < remaining_bytes)
847 /* Return the limit of (a pointer to the first character after) the field
848 in LINE specified by KEY. */
851 limfield (const struct line *line, const struct keyfield *key)
853 register char *ptr = line->text, *lim = ptr + line->length - 1;
854 register size_t eword = key->eword, echar = key->echar;
855 register size_t remaining_bytes;
857 /* Move PTR past EWORD fields or to one past the last byte on LINE,
858 whichever comes first. If there are more than EWORD fields, leave
859 PTR pointing at the beginning of the field having zero-based index,
860 EWORD. If a delimiter character was specified (via -t), then that
861 `beginning' is the first character following the delimiting TAB.
862 Otherwise, leave PTR pointing at the first `blank' character after
863 the preceding field. */
864 if (tab != TAB_DEFAULT)
865 while (ptr < lim && eword--)
867 while (ptr < lim && *ptr != tab)
869 if (ptr < lim && (eword | echar))
873 while (ptr < lim && eword--)
875 while (ptr < lim && blanks[to_uchar (*ptr)])
877 while (ptr < lim && !blanks[to_uchar (*ptr)])
881 #ifdef POSIX_UNSPECIFIED
882 /* The following block of code makes GNU sort incompatible with
883 standard Unix sort, so it's ifdef'd out for now.
884 The POSIX spec isn't clear on how to interpret this.
885 FIXME: request clarification.
887 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
888 Date: Thu, 30 May 96 12:20:41 -0400
889 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
891 [...]I believe I've found another bug in `sort'.
896 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
899 $ /bin/sort -k1.7,1.7 </tmp/sort.in
903 Unix sort produced the answer I expected: sort on the single character
904 in column 7. GNU sort produced different results, because it disagrees
905 on the interpretation of the key-end spec "M.N". Unix sort reads this
906 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
907 "skip M-1 fields, then either N-1 characters or the rest of the current
908 field, whichever comes first". This extra clause applies only to
909 key-ends, not key-starts.
912 /* Make LIM point to the end of (one byte past) the current field. */
913 if (tab != TAB_DEFAULT)
916 newlim = memchr (ptr, tab, lim - ptr);
924 while (newlim < lim && blanks[to_uchar (*newlim)])
926 while (newlim < lim && !blanks[to_uchar (*newlim)])
932 /* If we're ignoring leading blanks when computing the End
933 of the field, don't start counting bytes until after skipping
934 past any leading blanks. */
935 if (key->skipeblanks)
936 while (ptr < lim && blanks[to_uchar (*ptr)])
939 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
940 remaining_bytes = lim - ptr;
941 if (echar < remaining_bytes)
949 /* Fill BUF reading from FP, moving buf->left bytes from the end
950 of buf->buf to the beginning first. If EOF is reached and the
951 file wasn't terminated by a newline, supply one. Set up BUF's line
952 table too. FILE is the name of the file corresponding to FP.
953 Return true if some input was read. */
956 fillbuf (struct buffer *buf, register FILE *fp, char const *file)
958 struct keyfield const *key = keylist;
960 size_t line_bytes = buf->line_bytes;
961 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
966 if (buf->used != buf->left)
968 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
969 buf->used = buf->left;
975 char *ptr = buf->buf + buf->used;
976 struct line *linelim = buffer_linelim (buf);
977 struct line *line = linelim - buf->nlines;
978 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
979 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
981 while (line_bytes + 1 < avail)
983 /* Read as many bytes as possible, but do not read so many
984 bytes that there might not be enough room for the
985 corresponding line array. The worst case is when the
986 rest of the input file consists entirely of newlines,
987 except that the last byte is not a newline. */
988 size_t readsize = (avail - 1) / (line_bytes + 1);
989 size_t bytes_read = fread (ptr, 1, readsize, fp);
990 char *ptrlim = ptr + bytes_read;
994 if (bytes_read != readsize)
997 die (_("read failed"), file);
1001 if (buf->buf == ptrlim)
1003 if (ptrlim[-1] != eol)
1008 /* Find and record each line in the just-read input. */
1009 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1013 line->text = line_start;
1014 line->length = ptr - line_start;
1015 mergesize = MAX (mergesize, line->length);
1016 avail -= line_bytes;
1020 /* Precompute the position of the first key for
1022 line->keylim = (key->eword == SIZE_MAX
1024 : limfield (line, key));
1026 if (key->sword != SIZE_MAX)
1027 line->keybeg = begfield (line, key);
1030 if (key->skipsblanks)
1031 while (blanks[to_uchar (*line_start)])
1033 line->keybeg = line_start;
1045 buf->used = ptr - buf->buf;
1046 buf->nlines = buffer_linelim (buf) - line;
1047 if (buf->nlines != 0)
1049 buf->left = ptr - line_start;
1050 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1054 /* The current input line is too long to fit in the buffer.
1055 Double the buffer size and try again. */
1056 buf->buf = x2nrealloc (buf->buf, &buf->alloc, sizeof *(buf->buf));
1060 /* Compare strings A and B containing decimal fractions < 1. Each string
1061 should begin with a decimal point followed immediately by the digits
1062 of the fraction. Strings not of this form are considered to be zero. */
1064 /* The goal here, is to take two numbers a and b... compare these
1065 in parallel. Instead of converting each, and then comparing the
1066 outcome. Most likely stopping the comparison before the conversion
1067 is complete. The algorithm used, in the old sort:
1069 Algorithm: fraccompare
1070 Action : compare two decimal fractions
1071 accepts : char *a, char *b
1072 returns : -1 if a<b, 0 if a=b, 1 if a>b.
1075 if *a == decimal_point AND *b == decimal_point
1076 find first character different in a and b.
1077 if both are digits, return the difference *a - *b.
1080 if digit return 1, else 0
1083 if digit return -1, else 0
1084 if *a is a decimal_point
1085 skip past decimal_point and zeros
1086 if digit return 1, else 0
1087 if *b is a decimal_point
1088 skip past decimal_point and zeros
1089 if digit return -1, else 0
1093 fraccompare (register const char *a, register const char *b)
1095 if (*a == decimal_point && *b == decimal_point)
1097 while (*++a == *++b)
1100 if (ISDIGIT (*a) && ISDIGIT (*b))
1103 goto a_trailing_nonzero;
1105 goto b_trailing_nonzero;
1108 else if (*a++ == decimal_point)
1111 while (*a == NUMERIC_ZERO)
1113 return ISDIGIT (*a);
1115 else if (*b++ == decimal_point)
1118 while (*b == NUMERIC_ZERO)
1120 return - ISDIGIT (*b);
1125 /* Compare strings A and B as numbers without explicitly converting them to
1126 machine numbers. Comparatively slow for short strings, but asymptotically
1130 numcompare (register const char *a, register const char *b)
1141 while (blanks[to_uchar (tmpa)])
1143 while (blanks[to_uchar (tmpb)])
1146 if (tmpa == NEGATION_SIGN)
1150 while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa));
1151 if (tmpb != NEGATION_SIGN)
1153 if (tmpa == decimal_point)
1156 while (tmpa == NUMERIC_ZERO);
1159 while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb))
1161 if (tmpb == decimal_point)
1164 while (tmpb == NUMERIC_ZERO);
1171 while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb));
1173 while (tmpa == tmpb && ISDIGIT (tmpa))
1177 while (IS_THOUSANDS_SEP (tmpa));
1180 while (IS_THOUSANDS_SEP (tmpb));
1183 if ((tmpa == decimal_point && !ISDIGIT (tmpb))
1184 || (tmpb == decimal_point && !ISDIGIT (tmpa)))
1185 return -fraccompare (a, b);
1189 for (log_a = 0; ISDIGIT (tmpa); ++log_a)
1192 while (IS_THOUSANDS_SEP (tmpa));
1194 for (log_b = 0; ISDIGIT (tmpb); ++log_b)
1197 while (IS_THOUSANDS_SEP (tmpb));
1200 return log_a < log_b ? 1 : -1;
1207 else if (tmpb == NEGATION_SIGN)
1211 while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb));
1212 if (tmpb == decimal_point)
1215 while (tmpb == NUMERIC_ZERO);
1218 while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa))
1220 if (tmpa == decimal_point)
1223 while (tmpa == NUMERIC_ZERO);
1230 while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa))
1232 while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb))
1235 while (tmpa == tmpb && ISDIGIT (tmpa))
1239 while (IS_THOUSANDS_SEP (tmpa));
1242 while (IS_THOUSANDS_SEP (tmpb));
1245 if ((tmpa == decimal_point && !ISDIGIT (tmpb))
1246 || (tmpb == decimal_point && !ISDIGIT (tmpa)))
1247 return fraccompare (a, b);
1251 for (log_a = 0; ISDIGIT (tmpa); ++log_a)
1254 while (IS_THOUSANDS_SEP (tmpa));
1256 for (log_b = 0; ISDIGIT (tmpb); ++log_b)
1259 while (IS_THOUSANDS_SEP (tmpb));
1262 return log_a < log_b ? -1 : 1;
1272 general_numcompare (const char *sa, const char *sb)
1274 /* FIXME: add option to warn about failed conversions. */
1275 /* FIXME: maybe add option to try expensive FP conversion
1276 only if A and B can't be compared more cheaply/accurately. */
1280 double a = strtod (sa, &ea);
1281 double b = strtod (sb, &eb);
1283 /* Put conversion errors at the start of the collating sequence. */
1285 return sb == eb ? 0 : -1;
1289 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1290 conversion errors but before numbers; sort them by internal
1291 bit-pattern, for lack of a more portable alternative. */
1297 : memcmp ((char *) &a, (char *) &b, sizeof a));
1300 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1301 Return 0 if the name in S is not recognized. */
1304 getmonth (char const *month, size_t len)
1307 size_t hi = MONTHS_PER_YEAR;
1308 char const *monthlim = month + len;
1312 if (month == monthlim)
1314 if (!blanks[to_uchar (*month)])
1321 size_t ix = (lo + hi) / 2;
1322 char const *m = month;
1323 char const *n = monthtab[ix].name;
1328 return monthtab[ix].val;
1329 if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1334 else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1346 /* Compare two lines A and B trying every key in sequence until there
1347 are no more keys or a difference is found. */
1350 keycompare (const struct line *a, const struct line *b)
1352 struct keyfield const *key = keylist;
1354 /* For the first iteration only, the key positions have been
1355 precomputed for us. */
1356 register char *texta = a->keybeg;
1357 register char *textb = b->keybeg;
1358 register char *lima = a->keylim;
1359 register char *limb = b->keylim;
1365 register char const *translate = key->translate;
1366 register bool const *ignore = key->ignore;
1368 /* Find the lengths. */
1369 size_t lena = lima <= texta ? 0 : lima - texta;
1370 size_t lenb = limb <= textb ? 0 : limb - textb;
1372 /* Actually compare the fields. */
1373 if (key->numeric | key->general_numeric)
1375 char savea = *lima, saveb = *limb;
1377 *lima = *limb = '\0';
1378 diff = ((key->numeric ? numcompare : general_numcompare)
1380 *lima = savea, *limb = saveb;
1382 else if (key->month)
1383 diff = getmonth (texta, lena) - getmonth (textb, lenb);
1384 /* Sorting like this may become slow, so in a simple locale the user
1385 can select a faster sort that is similar to ascii sort. */
1386 else if (HAVE_SETLOCALE && hard_LC_COLLATE)
1388 if (ignore || translate)
1391 size_t size = lena + 1 + lenb + 1;
1392 char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1393 char *copy_b = copy_a + lena + 1;
1394 size_t new_len_a, new_len_b, i;
1396 /* Ignore and/or translate chars before comparing. */
1397 for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1401 copy_a[new_len_a] = (translate
1402 ? translate[to_uchar (texta[i])]
1404 if (!ignore || !ignore[to_uchar (texta[i])])
1409 copy_b[new_len_b] = (translate
1410 ? translate[to_uchar (textb[i])]
1412 if (!ignore || !ignore[to_uchar (textb[i])])
1417 diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1419 if (sizeof buf < size)
1423 diff = - NONZERO (lenb);
1427 diff = xmemcoll (texta, lena, textb, lenb);
1431 #define CMP_WITH_IGNORE(A, B) \
1436 while (texta < lima && ignore[to_uchar (*texta)]) \
1438 while (textb < limb && ignore[to_uchar (*textb)]) \
1440 if (! (texta < lima && textb < limb)) \
1442 diff = to_uchar (A) - to_uchar (B); \
1449 diff = (texta < lima) - (textb < limb); \
1454 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1455 translate[to_uchar (*textb)]);
1457 CMP_WITH_IGNORE (*texta, *textb);
1460 diff = - NONZERO (lenb);
1467 while (texta < lima && textb < limb)
1469 diff = (to_uchar (translate[to_uchar (*texta++)])
1470 - to_uchar (translate[to_uchar (*textb++)]));
1477 diff = memcmp (texta, textb, MIN (lena, lenb));
1481 diff = lena < lenb ? -1 : lena != lenb;
1491 /* Find the beginning and limit of the next field. */
1492 if (key->eword != SIZE_MAX)
1493 lima = limfield (a, key), limb = limfield (b, key);
1495 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1497 if (key->sword != SIZE_MAX)
1498 texta = begfield (a, key), textb = begfield (b, key);
1501 texta = a->text, textb = b->text;
1502 if (key->skipsblanks)
1504 while (texta < lima && blanks[to_uchar (*texta)])
1506 while (textb < limb && blanks[to_uchar (*textb)])
1517 return key->reverse ? -diff : diff;
1520 /* Compare two lines A and B, returning negative, zero, or positive
1521 depending on whether A compares less than, equal to, or greater than B. */
1524 compare (register const struct line *a, register const struct line *b)
1529 /* First try to compare on the specified keys (if any).
1530 The only two cases with no key at all are unadorned sort,
1531 and unadorned sort -r. */
1534 diff = keycompare (a, b);
1535 if (diff | unique | stable)
1539 /* If the keys all compare equal (or no keys were specified)
1540 fall through to the default comparison. */
1541 alen = a->length - 1, blen = b->length - 1;
1544 diff = - NONZERO (blen);
1547 else if (HAVE_SETLOCALE && hard_LC_COLLATE)
1548 diff = xmemcoll (a->text, alen, b->text, blen);
1549 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1550 diff = alen < blen ? -1 : alen != blen;
1552 return reverse ? -diff : diff;
1555 /* Check that the lines read from FILE_NAME come in order. Print a
1556 diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1557 false if they are not in order. Otherwise, print no diagnostic
1561 check (char const *file_name)
1563 FILE *fp = xfopen (file_name, "r");
1564 struct buffer buf; /* Input buffer. */
1565 struct line temp; /* Copy of previous line. */
1567 uintmax_t line_number = 0;
1568 struct keyfield const *key = keylist;
1569 bool nonunique = ! unique;
1570 bool ordered = true;
1572 initbuf (&buf, sizeof (struct line),
1573 MAX (merge_buffer_size, sort_size));
1576 while (fillbuf (&buf, fp, file_name))
1578 struct line const *line = buffer_linelim (&buf);
1579 struct line const *linebase = line - buf.nlines;
1581 /* Make sure the line saved from the old buffer contents is
1582 less than or equal to the first line of the new buffer. */
1583 if (alloc && nonunique <= compare (&temp, line - 1))
1587 struct line const *disorder_line = line - 1;
1588 uintmax_t disorder_line_number =
1589 buffer_linelim (&buf) - disorder_line + line_number;
1590 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1591 fprintf (stderr, _("%s: %s:%s: disorder: "),
1592 program_name, file_name,
1593 umaxtostr (disorder_line_number, hr_buf));
1594 write_bytes (disorder_line->text, disorder_line->length, stderr,
1595 _("standard error"));
1601 /* Compare each line in the buffer with its successor. */
1602 while (linebase < --line)
1603 if (nonunique <= compare (line, line - 1))
1604 goto found_disorder;
1606 line_number += buf.nlines;
1608 /* Save the last line of the buffer. */
1609 if (alloc < line->length)
1616 alloc = line->length;
1620 while (alloc < line->length);
1622 temp.text = xrealloc (temp.text, alloc);
1624 memcpy (temp.text, line->text, line->length);
1625 temp.length = line->length;
1628 temp.keybeg = temp.text + (line->keybeg - line->text);
1629 temp.keylim = temp.text + (line->keylim - line->text);
1633 xfclose (fp, file_name);
1640 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
1641 files (all of which are at the start of the FILES array), and
1642 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
1643 Close input and output files before returning.
1644 OUTPUT_FILE gives the name of the output file. If it is NULL,
1645 the output file is standard output. If OFP is NULL, the output
1646 file has not been opened yet (or written to, if standard output). */
1649 mergefps (char **files, size_t ntemps, size_t nfiles,
1650 FILE *ofp, char const *output_file)
1652 FILE *fps[NMERGE]; /* Input streams for each file. */
1653 struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1654 struct line saved; /* Saved line storage for unique check. */
1655 struct line const *savedline = NULL;
1656 /* &saved if there is a saved line. */
1657 size_t savealloc = 0; /* Size allocated for the saved line. */
1658 struct line const *cur[NMERGE]; /* Current line in each line table. */
1659 struct line const *base[NMERGE]; /* Base of each line table. */
1660 size_t ord[NMERGE]; /* Table representing a permutation of fps,
1661 such that cur[ord[0]] is the smallest line
1662 and will be next output. */
1666 struct keyfield const *key = keylist;
1669 /* Read initial lines from each input file. */
1670 for (i = 0; i < nfiles; )
1672 fps[i] = xfopen (files[i], "r");
1673 initbuf (&buffer[i], sizeof (struct line),
1674 MAX (merge_buffer_size, sort_size / nfiles));
1675 if (fillbuf (&buffer[i], fps[i], files[i]))
1677 struct line const *linelim = buffer_linelim (&buffer[i]);
1678 cur[i] = linelim - 1;
1679 base[i] = linelim - buffer[i].nlines;
1684 /* fps[i] is empty; eliminate it from future consideration. */
1685 xfclose (fps[i], files[i]);
1691 free (buffer[i].buf);
1693 for (j = i; j < nfiles; ++j)
1694 files[j] = files[j + 1];
1699 ofp = xfopen (output_file, "w");
1701 /* Set up the ord table according to comparisons among input lines.
1702 Since this only reorders two items if one is strictly greater than
1703 the other, it is stable. */
1704 for (i = 0; i < nfiles; ++i)
1706 for (i = 1; i < nfiles; ++i)
1707 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
1708 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1710 /* Repeatedly output the smallest line until no input remains. */
1713 struct line const *smallest = cur[ord[0]];
1715 /* If uniquified output is turned on, output only the first of
1716 an identical series of lines. */
1719 if (savedline && compare (savedline, smallest))
1722 write_bytes (saved.text, saved.length, ofp, output_file);
1727 if (savealloc < smallest->length)
1732 savealloc = smallest->length;
1735 while ((savealloc *= 2) < smallest->length);
1737 saved.text = xrealloc (saved.text, savealloc);
1739 saved.length = smallest->length;
1740 memcpy (saved.text, smallest->text, saved.length);
1744 saved.text + (smallest->keybeg - smallest->text);
1746 saved.text + (smallest->keylim - smallest->text);
1751 write_bytes (smallest->text, smallest->length, ofp, output_file);
1753 /* Check if we need to read more lines into core. */
1754 if (base[ord[0]] < smallest)
1755 cur[ord[0]] = smallest - 1;
1758 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]]))
1760 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
1761 cur[ord[0]] = linelim - 1;
1762 base[ord[0]] = linelim - buffer[ord[0]].nlines;
1766 /* We reached EOF on fps[ord[0]]. */
1767 for (i = 1; i < nfiles; ++i)
1768 if (ord[i] > ord[0])
1771 xfclose (fps[ord[0]], files[ord[0]]);
1772 if (ord[0] < ntemps)
1775 zaptemp (files[ord[0]]);
1777 free (buffer[ord[0]].buf);
1778 for (i = ord[0]; i < nfiles; ++i)
1780 fps[i] = fps[i + 1];
1781 files[i] = files[i + 1];
1782 buffer[i] = buffer[i + 1];
1783 cur[i] = cur[i + 1];
1784 base[i] = base[i + 1];
1786 for (i = 0; i < nfiles; ++i)
1787 ord[i] = ord[i + 1];
1792 /* The new line just read in may be larger than other lines
1793 already in core; push it back in the queue until we encounter
1794 a line larger than it. */
1795 for (i = 1; i < nfiles; ++i)
1797 int cmp = compare (cur[ord[0]], cur[ord[i]]);
1798 if (cmp < 0 || (cmp == 0 && ord[0] < ord[i]))
1802 for (j = 1; j < i; ++j)
1803 ord[j - 1] = ord[j];
1807 if (unique && savedline)
1809 write_bytes (saved.text, saved.length, ofp, output_file);
1813 xfclose (ofp, output_file);
1816 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1817 and HI (with NHI members). T, LO, and HI point just past their
1818 respective arrays, and the arrays are in reverse order. NLO and
1819 NHI must be positive, and HI - NHI must equal T - (NLO + NHI). */
1822 mergelines (struct line *t,
1823 struct line const *lo, size_t nlo,
1824 struct line const *hi, size_t nhi)
1827 if (compare (lo - 1, hi - 1) <= 0)
1832 /* HI - NHI equalled T - (NLO + NHI) when this function
1833 began. Therefore HI must equal T now, and there is no
1834 need to copy from HI to T. */
1852 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1853 NLINES must be at least 2.
1854 The input and output arrays are in reverse order, and LINES and
1855 TEMP point just past the end of their respective arrays.
1857 Use a recursive divide-and-conquer algorithm, in the style
1858 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
1859 the optimization suggested by exercise 5.2.4-10; this requires room
1860 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
1861 writes that this memory optimization was originally published by
1862 D. A. Bell, Comp J. 1 (1958), 75. */
1865 sortlines (struct line *lines, size_t nlines, struct line *temp)
1869 if (0 < compare (&lines[-1], &lines[-2]))
1871 struct line tmp = lines[-1];
1872 lines[-1] = lines[-2];
1878 size_t nlo = nlines / 2;
1879 size_t nhi = nlines - nlo;
1880 struct line *lo = lines;
1881 struct line *hi = lines - nlo;
1882 struct line *sorted_lo = temp;
1884 sortlines (hi, nhi, temp);
1886 sortlines_temp (lo, nlo, sorted_lo);
1888 sorted_lo[-1] = lo[-1];
1890 mergelines (lines, sorted_lo, nlo, hi, nhi);
1894 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1895 rather than sorting in place. */
1898 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
1902 bool swap = (0 < compare (&lines[-1], &lines[-2]));
1903 temp[-1] = lines[-1 - swap];
1904 temp[-2] = lines[-2 + swap];
1908 size_t nlo = nlines / 2;
1909 size_t nhi = nlines - nlo;
1910 struct line *lo = lines;
1911 struct line *hi = lines - nlo;
1912 struct line *sorted_hi = temp - nlo;
1914 sortlines_temp (hi, nhi, sorted_hi);
1916 sortlines (lo, nlo, temp);
1918 mergelines (temp, lo, nlo, sorted_hi, nhi);
1922 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
1923 the same as OUTFILE. If found, merge the found instances (and perhaps
1924 some other files) into a temporary file so that it can in turn be
1925 merged into OUTFILE without destroying OUTFILE before it is completely
1926 read. Return the new value of NFILES, which differs from the old if
1927 some merging occurred.
1929 This test ensures that an otherwise-erroneous use like
1930 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
1931 It's not clear that POSIX requires this nicety.
1932 Detect common error cases, but don't try to catch obscure cases like
1933 "cat ... FILE ... | sort -m -o FILE"
1934 where traditional "sort" doesn't copy the input and where
1935 people should know that they're getting into trouble anyway.
1936 Catching these obscure cases would slow down performance in
1940 avoid_trashing_input (char **files, size_t ntemps, size_t nfiles,
1941 char const *outfile)
1944 bool got_outstat = false;
1945 struct stat outstat;
1947 for (i = ntemps; i < nfiles; i++)
1949 bool standard_input = STREQ (files[i], "-");
1953 if (outfile && STREQ (outfile, files[i]) && ! standard_input)
1960 ? stat (outfile, &outstat)
1961 : fstat (STDOUT_FILENO, &outstat))
1967 same = (((standard_input
1968 ? fstat (STDIN_FILENO, &instat)
1969 : stat (files[i], &instat))
1971 && SAME_INODE (instat, outstat));
1977 char *temp = create_temp_file (&tftp);
1978 mergefps (&files[i], 0, nfiles - i, tftp, temp);
1987 /* Merge the input FILES. NTEMPS is the number of files at the
1988 start of FILES that are temporary; it is zero at the top level.
1989 NFILES is the total number of files. Put the output in
1990 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
1993 merge (char **files, size_t ntemps, size_t nfiles, char const *output_file)
1995 while (NMERGE < nfiles)
1997 /* Number of input files processed so far. */
2000 /* Number of output files generated so far. */
2003 /* nfiles % NMERGE; this counts input files that are left over
2004 after all full-sized merges have been done. */
2007 /* Number of easily-available slots at the next loop iteration. */
2010 /* Do as many NMERGE-size merges as possible. */
2011 for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
2014 char *temp = create_temp_file (&tfp);
2015 size_t nt = MIN (ntemps, NMERGE);
2017 mergefps (&files[in], nt, NMERGE, tfp, temp);
2021 remainder = nfiles - in;
2022 cheap_slots = NMERGE - out % NMERGE;
2024 if (cheap_slots < remainder)
2026 /* So many files remain that they can't all be put into the last
2027 NMERGE-sized output window. Do one more merge. Merge as few
2028 files as possible, to avoid needless I/O. */
2029 size_t nshortmerge = remainder - cheap_slots + 1;
2031 char *temp = create_temp_file (&tfp);
2032 size_t nt = MIN (ntemps, nshortmerge);
2034 mergefps (&files[in], nt, nshortmerge, tfp, temp);
2035 files[out++] = temp;
2039 /* Put the remaining input files into the last NMERGE-sized output
2040 window, so they will be merged in the next pass. */
2041 memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2046 nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2047 mergefps (files, ntemps, nfiles, NULL, output_file);
2050 /* Sort NFILES FILES onto OUTPUT_FILE. */
2053 sort (char * const *files, size_t nfiles, char const *output_file)
2057 bool output_file_created = false;
2063 char const *temp_output;
2064 char const *file = *files;
2065 FILE *fp = xfopen (file, "r");
2067 size_t bytes_per_line = (2 * sizeof (struct line)
2068 - sizeof (struct line) / 2);
2071 initbuf (&buf, bytes_per_line,
2072 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2077 while (fillbuf (&buf, fp, file))
2080 struct line *linebase;
2082 if (buf.eof && nfiles
2083 && (bytes_per_line + 1
2084 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2086 /* End of file, but there is more input and buffer room.
2087 Concatenate the next input file; this is faster in
2089 buf.left = buf.used;
2093 line = buffer_linelim (&buf);
2094 linebase = line - buf.nlines;
2096 sortlines (line, buf.nlines, linebase);
2097 if (buf.eof && !nfiles && !ntemps && !buf.left)
2100 tfp = xfopen (output_file, "w");
2101 temp_output = output_file;
2102 output_file_created = true;
2107 temp_output = create_temp_file (&tfp);
2113 write_bytes (line->text, line->length, tfp, temp_output);
2115 while (linebase < line && compare (line, line - 1) == 0)
2118 while (linebase < line);
2120 xfclose (tfp, temp_output);
2122 if (output_file_created)
2131 if (! output_file_created)
2134 struct tempnode *node = temphead;
2135 char **tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2136 for (i = 0; node; i++)
2138 tempfiles[i] = node->name;
2141 merge (tempfiles, ntemps, ntemps, output_file);
2146 /* Insert key KEY at the end of the key list. */
2149 insertkey (struct keyfield *key)
2151 struct keyfield **p;
2153 for (p = &keylist; *p; p = &(*p)->next)
2159 /* Report a bad field specification SPEC, with extra info MSGID. */
2161 static void badfieldspec (char const *, char const *)
2164 badfieldspec (char const *spec, char const *msgid)
2166 error (SORT_FAILURE, 0, _("%s: invalid field specification `%s'"),
2171 /* Parse the leading integer in STRING and store the resulting value
2172 (which must fit into size_t) into *VAL. Return the address of the
2173 suffix after the integer. If MSGID is NULL, return NULL after
2174 failure; otherwise, report MSGID and exit on failure. */
2177 parse_field_count (char const *string, size_t *val, char const *msgid)
2182 switch (xstrtoumax (string, &suffix, 10, &n, ""))
2185 case LONGINT_INVALID_SUFFIX_CHAR:
2190 case LONGINT_OVERFLOW:
2191 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2193 error (SORT_FAILURE, 0, _("%s: count `%.*s' too large"),
2194 _(msgid), (int) (suffix - string), string);
2197 case LONGINT_INVALID:
2199 error (SORT_FAILURE, 0, _("%s: invalid count at start of `%s'"),
2207 /* Handle interrupts and hangups. */
2210 sighandler (int sig)
2212 #ifndef SA_NOCLDSTOP
2213 signal (sig, SIG_IGN);
2218 signal (sig, SIG_DFL);
2222 /* Set the ordering options for KEY specified in S.
2223 Return the address of the first character in S that
2224 is not a valid ordering option.
2225 BLANKTYPE is the kind of blanks that 'b' should skip. */
2228 set_ordering (register const char *s, struct keyfield *key,
2229 enum blanktype blanktype)
2236 if (blanktype == bl_start || blanktype == bl_both)
2237 key->skipsblanks = true;
2238 if (blanktype == bl_end || blanktype == bl_both)
2239 key->skipeblanks = true;
2242 key->ignore = nondictionary;
2245 key->translate = fold_toupper;
2248 key->general_numeric = true;
2251 /* Option order should not matter, so don't let -i override
2252 -d. -d implies -i, but -i does not imply -d. */
2254 key->ignore = nonprinting;
2260 key->numeric = true;
2263 key->reverse = true;
2273 static struct keyfield *
2276 struct keyfield *key = xzalloc (sizeof *key);
2277 key->eword = SIZE_MAX;
2282 main (int argc, char **argv)
2284 struct keyfield *key;
2285 struct keyfield gkey;
2288 bool checkonly = false;
2289 bool mergeonly = false;
2291 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2292 bool obsolete_usage = (posix2_version () < 200112);
2293 char const *short_options = (obsolete_usage
2294 ? COMMON_SHORT_OPTIONS "y::"
2295 : COMMON_SHORT_OPTIONS "y:");
2296 char *minus = "-", **files;
2297 char const *outfile = NULL;
2299 initialize_main (&argc, &argv);
2300 program_name = argv[0];
2301 setlocale (LC_ALL, "");
2302 bindtextdomain (PACKAGE, LOCALEDIR);
2303 textdomain (PACKAGE);
2307 initialize_exit_failure (SORT_FAILURE);
2308 atexit (close_stdout);
2310 hard_LC_COLLATE = hard_locale (LC_COLLATE);
2311 #if HAVE_NL_LANGINFO
2312 hard_LC_TIME = hard_locale (LC_TIME);
2316 /* Let's get locale's representation of the decimal point */
2318 struct lconv const *lconvp = localeconv ();
2320 /* If the locale doesn't define a decimal point, or if the decimal
2321 point is multibyte, use the C decimal point. We don't support
2322 multibyte decimal points yet. */
2323 decimal_point = *lconvp->decimal_point;
2324 if (! decimal_point || lconvp->decimal_point[1])
2325 decimal_point = C_DECIMAL_POINT;
2327 /* We don't support multibyte thousands separators yet. */
2328 th_sep = *lconvp->thousands_sep;
2329 if (! th_sep || lconvp->thousands_sep[1])
2330 th_sep = CHAR_MAX + 1;
2334 have_read_stdin = false;
2339 static int const sig[] = { SIGHUP, SIGINT, SIGPIPE, SIGTERM };
2340 enum { nsigs = sizeof sig / sizeof sig[0] };
2343 struct sigaction act;
2345 sigemptyset (&caught_signals);
2346 for (i = 0; i < nsigs; i++)
2348 sigaction (sig[i], NULL, &act);
2349 if (act.sa_handler != SIG_IGN)
2350 sigaddset (&caught_signals, sig[i]);
2353 act.sa_handler = sighandler;
2354 act.sa_mask = caught_signals;
2357 for (i = 0; i < nsigs; i++)
2358 if (sigismember (&caught_signals, sig[i]))
2359 sigaction (sig[i], &act, NULL);
2361 for (i = 0; i < nsigs; i++)
2362 if (signal (sig[i], SIG_IGN) != SIG_IGN)
2363 signal (sig[i], sighandler);
2367 gkey.sword = gkey.eword = SIZE_MAX;
2369 gkey.translate = NULL;
2370 gkey.numeric = gkey.general_numeric = gkey.month = gkey.reverse = false;
2371 gkey.skipsblanks = gkey.skipeblanks = false;
2373 files = xnmalloc (argc, sizeof *files);
2377 /* Parse an operand as a file after "--" was seen; or if
2378 pedantic and a file was seen, unless the POSIX version
2379 predates 1003.1-2001 and -c was not seen and the operand is
2380 "-o FILE" or "-oFILE". */
2383 || (posixly_correct && nfiles != 0
2384 && ! (obsolete_usage
2387 && argv[optind][0] == '-' && argv[optind][1] == 'o'
2388 && (argv[optind][2] || optind + 1 != argc)))
2389 || ((c = getopt_long (argc, argv, short_options,
2390 long_options, NULL))
2395 files[nfiles++] = argv[optind++];
2401 if (obsolete_usage && optarg[0] == '+')
2403 /* Treat +POS1 [-POS2] as a key if possible; but silently
2404 treat an operand as a file if it is not a valid +POS1. */
2406 s = parse_field_count (optarg + 1, &key->sword, NULL);
2408 s = parse_field_count (s + 1, &key->schar, NULL);
2409 if (! (key->sword | key->schar))
2410 key->sword = SIZE_MAX;
2411 if (! s || *set_ordering (s, key, bl_start))
2418 if (optind != argc && argv[optind][0] == '-'
2419 && ISDIGIT (argv[optind][1]))
2421 char const *optarg1 = argv[optind++];
2422 s = parse_field_count (optarg1 + 1, &key->eword,
2423 N_("invalid number after `-'"));
2425 s = parse_field_count (s + 1, &key->echar,
2426 N_("invalid number after `.'"));
2427 if (*set_ordering (s, key, bl_end))
2428 badfieldspec (optarg1,
2429 N_("stray character in field spec"));
2435 files[nfiles++] = optarg;
2450 set_ordering (str, &gkey, bl_both);
2462 s = parse_field_count (optarg, &key->sword,
2463 N_("invalid number at field start"));
2466 /* Provoke with `sort -k0' */
2467 badfieldspec (optarg, N_("field number is zero"));
2471 s = parse_field_count (s + 1, &key->schar,
2472 N_("invalid number after `.'"));
2475 /* Provoke with `sort -k1.0' */
2476 badfieldspec (optarg, N_("character offset is zero"));
2479 if (! (key->sword | key->schar))
2480 key->sword = SIZE_MAX;
2481 s = set_ordering (s, key, bl_start);
2484 key->eword = SIZE_MAX;
2490 s = parse_field_count (s + 1, &key->eword,
2491 N_("invalid number after `,'"));
2494 /* Provoke with `sort -k1,0' */
2495 badfieldspec (optarg, N_("field number is zero"));
2498 s = parse_field_count (s + 1, &key->echar,
2499 N_("invalid number after `.'"));
2502 /* `-k 2,3' is equivalent to `+1 -3'. */
2505 s = set_ordering (s, key, bl_end);
2508 badfieldspec (optarg, N_("stray character in field spec"));
2517 if (outfile && !STREQ (outfile, optarg))
2518 error (SORT_FAILURE, 0, _("multiple output files specified"));
2527 specify_sort_size (optarg);
2532 char newtab = optarg[0];
2534 error (SORT_FAILURE, 0, _("empty tab"));
2537 if (STREQ (optarg, "\\0"))
2541 /* Provoke with `sort -txx'. Complain about
2542 "multi-character tab" instead of "multibyte tab", so
2543 that the diagnostic's wording does not need to be
2544 changed once multibyte characters are supported. */
2545 error (SORT_FAILURE, 0, _("multi-character tab `%s'"),
2549 if (tab != TAB_DEFAULT && tab != newtab)
2550 error (SORT_FAILURE, 0, _("incompatible tabs"));
2556 add_temp_dir (optarg);
2564 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
2565 through Solaris 7. It is also accepted by many non-Solaris
2566 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
2567 -y is marked as obsolete starting with Solaris 8 (1999), but is
2568 still accepted as of Solaris 10 prerelease (2004).
2570 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
2571 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
2572 and which in general ignores the argument after "-y" if it
2573 consists entirely of digits (it can even be empty). */
2574 if (!optarg && optind != argc)
2577 for (p = argv[optind]; ISDIGIT (*p); p++)
2587 case_GETOPT_HELP_CHAR;
2589 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2592 usage (SORT_FAILURE);
2596 /* Inheritance of global options to individual keys. */
2597 for (key = keylist; key; key = key->next)
2598 if (! (key->ignore || key->translate
2599 || (key->skipsblanks | key->reverse
2600 | key->skipeblanks | key->month | key->numeric
2601 | key->general_numeric)))
2603 key->ignore = gkey.ignore;
2604 key->translate = gkey.translate;
2605 key->skipsblanks = gkey.skipsblanks;
2606 key->skipeblanks = gkey.skipeblanks;
2607 key->month = gkey.month;
2608 key->numeric = gkey.numeric;
2609 key->general_numeric = gkey.general_numeric;
2610 key->reverse = gkey.reverse;
2613 if (!keylist && (gkey.ignore || gkey.translate
2614 || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
2615 | gkey.numeric | gkey.general_numeric)))
2617 reverse = gkey.reverse;
2619 if (temp_dir_count == 0)
2621 char const *tmp_dir = getenv ("TMPDIR");
2622 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
2635 error (0, 0, _("extra operand %s not allowed with -c"),
2637 usage (SORT_FAILURE);
2640 /* POSIX requires that sort return 1 IFF invoked with -c and the
2641 input is not properly sorted. */
2642 exit (check (files[0]) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
2646 merge (files, 0, nfiles, outfile);
2648 sort (files, nfiles, outfile);
2650 if (have_read_stdin && fclose (stdin) == EOF)
2651 die (_("close failed"), "-");
2653 exit (EXIT_SUCCESS);