1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988, 1991-2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 Written December 1988 by Mike Haertel.
19 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
20 or (US mail) as Mike Haertel c/o Free Software Foundation.
22 Ørn E. Hansen added NLS support in 1997. */
27 #include <sys/types.h>
31 #include "hard-locale.h"
40 #include "strnumcmp.h"
45 #if HAVE_SYS_RESOURCE_H
46 # include <sys/resource.h>
49 struct rlimit { size_t rlim_cur; };
50 # define getrlimit(Resource, Rlp) (-1)
53 /* The official name of this program (e.g., no `g' prefix). */
54 #define PROGRAM_NAME "sort"
56 #define AUTHORS "Mike Haertel", "Paul Eggert"
58 #if HAVE_LANGINFO_CODESET
59 # include <langinfo.h>
62 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
65 # define SA_NOCLDSTOP 0
66 # define sigprocmask(How, Set, Oset) /* empty */
68 # if ! HAVE_SIGINTERRUPT
69 # define siginterrupt(sig, flag) /* empty */
77 #define UCHAR_LIM (UCHAR_MAX + 1)
79 #ifndef DEFAULT_TMPDIR
80 # define DEFAULT_TMPDIR "/tmp"
86 /* POSIX says to exit with status 1 if invoked with -c and the
87 input is not properly sorted. */
88 SORT_OUT_OF_ORDER = 1,
90 /* POSIX says any other irregular exit must exit with a status
91 code greater than 1. */
95 /* The representation of the decimal point in the current locale. */
96 static int decimal_point;
98 /* Thousands separator; if -1, then there isn't one. */
99 static int thousands_sep;
101 /* Nonzero if the corresponding locales are hard. */
102 static bool hard_LC_COLLATE;
104 static bool hard_LC_TIME;
107 #define NONZERO(x) ((x) != 0)
109 /* The kind of blanks for '-b' to skip in various options. */
110 enum blanktype { bl_start, bl_end, bl_both };
112 /* The character marking end of line. Default to \n. */
113 static char eolchar = '\n';
115 /* Lines are held in core as counted strings. */
118 char *text; /* Text of the line. */
119 size_t length; /* Length including final newline. */
120 char *keybeg; /* Start of first key. */
121 char *keylim; /* Limit of first key. */
127 char *buf; /* Dynamically allocated buffer,
128 partitioned into 3 regions:
131 - an array of lines, in reverse order. */
132 size_t used; /* Number of bytes used for input data. */
133 size_t nlines; /* Number of lines in the line array. */
134 size_t alloc; /* Number of bytes allocated. */
135 size_t left; /* Number of bytes left from previous reads. */
136 size_t line_bytes; /* Number of bytes to reserve for each line. */
137 bool eof; /* An EOF has been read. */
142 size_t sword; /* Zero-origin 'word' to start at. */
143 size_t schar; /* Additional characters to skip. */
144 size_t eword; /* Zero-origin first word after field. */
145 size_t echar; /* Additional characters in field. */
146 bool const *ignore; /* Boolean array of characters to ignore. */
147 char const *translate; /* Translation applied to characters. */
148 bool skipsblanks; /* Skip leading blanks when finding start. */
149 bool skipeblanks; /* Skip leading blanks when finding end. */
150 bool numeric; /* Flag for numeric comparison. Handle
151 strings of digits with optional decimal
152 point, but no exponential notation. */
153 bool random; /* Sort by random hash of key. */
154 bool general_numeric; /* Flag for general, numeric comparison.
155 Handle numbers in exponential notation. */
156 bool month; /* Flag for comparison by month name. */
157 bool reverse; /* Reverse the sense of comparison. */
158 struct keyfield *next; /* Next keyfield to try. */
167 /* The name this program was run with. */
170 /* FIXME: None of these tables work with multibyte character sets.
171 Also, there are many other bugs when handling multibyte characters.
172 One way to fix this is to rewrite `sort' to use wide characters
173 internally, but doing this with good performance is a bit
176 /* Table of blanks. */
177 static bool blanks[UCHAR_LIM];
179 /* Table of non-printing characters. */
180 static bool nonprinting[UCHAR_LIM];
182 /* Table of non-dictionary characters (not letters, digits, or blanks). */
183 static bool nondictionary[UCHAR_LIM];
185 /* Translation table folding lower case to upper. */
186 static char fold_toupper[UCHAR_LIM];
188 #define MONTHS_PER_YEAR 12
190 /* Table mapping month names to integers.
191 Alphabetic order allows binary search. */
192 static struct month monthtab[] =
208 /* During the merge phase, the number of files to merge at once. */
211 /* Minimum size for a merge or check buffer. */
212 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
214 /* Minimum sort size; the code might not work with smaller sizes. */
215 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
217 /* The number of bytes needed for a merge or check buffer, which can
218 function relatively efficiently even if it holds only one line. If
219 a longer line is seen, this value is increased. */
220 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
222 /* The approximate maximum number of bytes of main memory to use, as
223 specified by the user. Zero if the user has not specified a size. */
224 static size_t sort_size;
226 /* The guessed size for non-regular files. */
227 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
229 /* Array of directory names in which any temporary files are to be created. */
230 static char const **temp_dirs;
232 /* Number of temporary directory names used. */
233 static size_t temp_dir_count;
235 /* Number of allocated slots in temp_dirs. */
236 static size_t temp_dir_alloc;
238 /* Flag to reverse the order of all comparisons. */
241 /* Flag for stable sort. This turns off the last ditch bytewise
242 comparison of lines, and instead leaves lines in the same order
243 they were read if all keys compare equal. */
246 /* If TAB has this value, blanks separate fields. */
247 enum { TAB_DEFAULT = CHAR_MAX + 1 };
249 /* Tab character separating fields. If TAB_DEFAULT, then fields are
250 separated by the empty string between a non-blank character and a blank
252 static int tab = TAB_DEFAULT;
254 /* Flag to remove consecutive duplicate lines from the output.
255 Only the last of a sequence of equal lines will be output. */
258 /* Nonzero if any of the input files are the standard input. */
259 static bool have_read_stdin;
261 /* List of key field comparisons to be tried. */
262 static struct keyfield *keylist;
264 static void sortlines_temp (struct line *, size_t, struct line *);
266 /* Report MESSAGE for FILE, then clean up and exit.
267 If FILE is null, it represents standard output. */
269 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
271 die (char const *message, char const *file)
273 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
280 if (status != EXIT_SUCCESS)
281 fprintf (stderr, _("Try `%s --help' for more information.\n"),
286 Usage: %s [OPTION]... [FILE]...\n\
290 Write sorted concatenation of all FILE(s) to standard output.\n\
294 Mandatory arguments to long options are mandatory for short options too.\n\
301 -b, --ignore-leading-blanks ignore leading blanks\n\
302 -d, --dictionary-order consider only blanks and alphanumeric characters\n\
303 -f, --ignore-case fold lower case to upper case characters\n\
306 -g, --general-numeric-sort compare according to general numerical value\n\
307 -i, --ignore-nonprinting consider only printable characters\n\
308 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
309 -n, --numeric-sort compare according to string numerical value\n\
310 -R, --random-sort sort by random hash of keys\n\
311 --random-source=FILE get random bytes from FILE (default /dev/urandom)\n\
312 -r, --reverse reverse the result of comparisons\n\
318 -c, --check check whether input is sorted; do not sort\n\
319 -k, --key=POS1[,POS2] start a key at POS1, end it at POS2 (origin 1)\n\
320 -m, --merge merge already sorted files; do not sort\n\
321 -o, --output=FILE write result to FILE instead of standard output\n\
322 -s, --stable stabilize sort by disabling last-resort comparison\n\
323 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
326 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
327 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
328 multiple options specify multiple directories\n\
329 -u, --unique with -c, check for strict ordering;\n\
330 without -c, output only the first of an equal run\n\
333 -z, --zero-terminated end lines with 0 byte, not newline\n\
335 fputs (HELP_OPTION_DESCRIPTION, stdout);
336 fputs (VERSION_OPTION_DESCRIPTION, stdout);
339 POS is F[.C][OPTS], where F is the field number and C the character position\n\
340 in the field; both are origin 1. If neither -t nor -b is in effect, characters\n\
341 in a field are counted from the beginning of the preceding whitespace. OPTS is\n\
342 one or more single-letter ordering options, which override global ordering\n\
343 options for that key. If no key is given, use the entire line as the key.\n\
345 SIZE may be followed by the following multiplicative suffixes:\n\
348 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
350 With no FILE, or when FILE is -, read standard input.\n\
353 The locale specified by the environment affects sort order.\n\
354 Set LC_ALL=C to get the traditional sort order that uses\n\
355 native byte values.\n\
357 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
363 /* For long options that have no equivalent short option, use a
364 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
367 RANDOM_SOURCE_OPTION = CHAR_MAX + 1
370 static char const short_options[] = "-bcdfgik:mMno:rRsS:t:T:uy:z";
372 static struct option const long_options[] =
374 {"ignore-leading-blanks", no_argument, NULL, 'b'},
375 {"check", no_argument, NULL, 'c'},
376 {"dictionary-order", no_argument, NULL, 'd'},
377 {"ignore-case", no_argument, NULL, 'f'},
378 {"general-numeric-sort", no_argument, NULL, 'g'},
379 {"ignore-nonprinting", no_argument, NULL, 'i'},
380 {"key", required_argument, NULL, 'k'},
381 {"merge", no_argument, NULL, 'm'},
382 {"month-sort", no_argument, NULL, 'M'},
383 {"numeric-sort", no_argument, NULL, 'n'},
384 {"random-sort", no_argument, NULL, 'R'},
385 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
386 {"output", required_argument, NULL, 'o'},
387 {"reverse", no_argument, NULL, 'r'},
388 {"stable", no_argument, NULL, 's'},
389 {"buffer-size", required_argument, NULL, 'S'},
390 {"field-separator", required_argument, NULL, 't'},
391 {"temporary-directory", required_argument, NULL, 'T'},
392 {"unique", no_argument, NULL, 'u'},
393 {"zero-terminated", no_argument, NULL, 'z'},
394 {GETOPT_HELP_OPTION_DECL},
395 {GETOPT_VERSION_OPTION_DECL},
399 /* The set of signals that are caught. */
400 static sigset_t caught_signals;
402 /* The list of temporary files. */
405 struct tempnode *volatile next;
406 char name[1]; /* Actual size is 1 + file name length. */
408 static struct tempnode *volatile temphead;
409 static struct tempnode *volatile *temptail = &temphead;
411 /* Clean up any remaining temporary files. */
416 struct tempnode const *node;
418 for (node = temphead; node; node = node->next)
423 /* Cleanup actions to take when exiting. */
430 /* Clean up any remaining temporary files in a critical section so
431 that a signal handler does not try to clean them too. */
433 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
435 sigprocmask (SIG_SETMASK, &oldset, NULL);
441 /* Create a new temporary file, returning its newly allocated name.
442 Store into *PFP a stream open for writing. */
445 create_temp_file (FILE **pfp)
447 static char const slashbase[] = "/sortXXXXXX";
448 static size_t temp_dir_index;
452 char const *temp_dir = temp_dirs[temp_dir_index];
453 size_t len = strlen (temp_dir);
454 struct tempnode *node =
455 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
456 char *file = node->name;
458 memcpy (file, temp_dir, len);
459 memcpy (file + len, slashbase, sizeof slashbase);
461 if (++temp_dir_index == temp_dir_count)
464 /* Create the temporary file in a critical section, to avoid races. */
465 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
470 temptail = &node->next;
473 sigprocmask (SIG_SETMASK, &oldset, NULL);
476 if (fd < 0 || (*pfp = fdopen (fd, "w")) == NULL)
477 die (_("cannot create temporary file"), file);
482 /* Return a stream for FILE, opened with mode HOW. A null FILE means
483 standard output; HOW should be "w". When opening for input, "-"
484 means standard input. To avoid confusion, do not return file
485 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
486 opening an ordinary FILE. */
489 xfopen (const char *file, const char *how)
495 else if (STREQ (file, "-") && *how == 'r')
497 have_read_stdin = true;
502 fp = fopen (file, how);
504 die (_("open failed"), file);
510 /* Close FP, whose name is FILE, and report any errors. */
513 xfclose (FILE *fp, char const *file)
518 /* Allow reading stdin from tty more than once. */
524 /* Don't close stdout just yet. close_stdout does that. */
525 if (fflush (fp) != 0)
526 die (_("fflush failed"), file);
530 if (fclose (fp) != 0)
531 die (_("close failed"), file);
537 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
539 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
540 die (_("write failed"), output_file);
543 /* Append DIR to the array of temporary directory names. */
545 add_temp_dir (char const *dir)
547 if (temp_dir_count == temp_dir_alloc)
548 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
550 temp_dirs[temp_dir_count++] = dir;
553 /* Remove NAME from the list of temporary files. */
556 zaptemp (const char *name)
558 struct tempnode *volatile *pnode;
559 struct tempnode *node;
560 struct tempnode *next;
563 int unlink_errno = 0;
565 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
568 /* Unlink the temporary file in a critical section to avoid races. */
570 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
571 unlink_status = unlink (name);
572 unlink_errno = errno;
574 sigprocmask (SIG_SETMASK, &oldset, NULL);
576 if (unlink_status != 0)
577 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
586 struct_month_cmp (const void *m1, const void *m2)
588 struct month const *month1 = m1;
589 struct month const *month2 = m2;
590 return strcmp (month1->name, month2->name);
595 /* Initialize the character class tables. */
602 for (i = 0; i < UCHAR_LIM; ++i)
604 blanks[i] = !! isblank (i);
605 nonprinting[i] = ! isprint (i);
606 nondictionary[i] = ! isalnum (i) && ! isblank (i);
607 fold_toupper[i] = toupper (i);
611 /* If we're not in the "C" locale, read different names for months. */
614 for (i = 0; i < MONTHS_PER_YEAR; i++)
621 s = (char *) nl_langinfo (ABMON_1 + i);
623 monthtab[i].name = name = xmalloc (s_len + 1);
624 monthtab[i].val = i + 1;
626 for (j = 0; j < s_len; j++)
627 name[j] = fold_toupper[to_uchar (s[j])];
630 qsort ((void *) monthtab, MONTHS_PER_YEAR,
631 sizeof *monthtab, struct_month_cmp);
636 /* Specify the amount of main memory to use when sorting. */
638 specify_sort_size (char const *s)
642 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
644 /* The default unit is KiB. */
645 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
647 if (n <= UINTMAX_MAX / 1024)
650 e = LONGINT_OVERFLOW;
653 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
654 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
663 double mem = physmem_total () * n / 100;
665 /* Use "<", not "<=", to avoid problems with rounding. */
666 if (mem < UINTMAX_MAX)
672 e = LONGINT_OVERFLOW;
679 /* If multiple sort sizes are specified, take the maximum, so
680 that option order does not matter. */
687 sort_size = MAX (sort_size, MIN_SORT_SIZE);
691 e = LONGINT_OVERFLOW;
694 STRTOL_FATAL_ERROR (s, _("sort size"), e);
697 /* Return the default sort size. */
699 default_sort_size (void)
701 /* Let MEM be available memory or 1/8 of total memory, whichever
703 double avail = physmem_available ();
704 double total = physmem_total ();
705 double mem = MAX (avail, total / 8);
706 struct rlimit rlimit;
708 /* Let SIZE be MEM, but no more than the maximum object size or
709 system resource limits. Avoid the MIN macro here, as it is not
710 quite right when only one argument is floating point. Don't
711 bother to check for values like RLIM_INFINITY since in practice
712 they are not much less than SIZE_MAX. */
713 size_t size = SIZE_MAX;
716 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
717 size = rlimit.rlim_cur;
719 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
720 size = rlimit.rlim_cur;
723 /* Leave a large safety margin for the above limits, as failure can
724 occur when they are exceeded. */
728 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
729 Exceeding RSS is not fatal, but can be quite slow. */
730 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
731 size = rlimit.rlim_cur / 16 * 15;
734 /* Use no less than the minimum. */
735 return MAX (size, MIN_SORT_SIZE);
738 /* Return the sort buffer size to use with the input files identified
739 by FPS and FILES, which are alternate names of the same files.
740 NFILES gives the number of input files; NFPS may be less. Assume
741 that each input line requires LINE_BYTES extra bytes' worth of line
742 information. Do not exceed the size bound specified by the user
743 (or a default size bound, if the user does not specify one). */
746 sort_buffer_size (FILE *const *fps, size_t nfps,
747 char *const *files, size_t nfiles,
750 /* A bound on the input size. If zero, the bound hasn't been
752 static size_t size_bound;
754 /* In the worst case, each input byte is a newline. */
755 size_t worst_case_per_input_byte = line_bytes + 1;
757 /* Keep enough room for one extra input line and an extra byte.
758 This extra room might be needed when preparing to read EOF. */
759 size_t size = worst_case_per_input_byte + 1;
763 for (i = 0; i < nfiles; i++)
769 if ((i < nfps ? fstat (fileno (fps[i]), &st)
770 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
771 : stat (files[i], &st))
773 die (_("stat failed"), files[i]);
775 if (S_ISREG (st.st_mode))
776 file_size = st.st_size;
779 /* The file has unknown size. If the user specified a sort
780 buffer size, use that; otherwise, guess the size. */
783 file_size = INPUT_FILE_SIZE_GUESS;
788 size_bound = sort_size;
790 size_bound = default_sort_size ();
793 /* Add the amount of memory needed to represent the worst case
794 where the input consists entirely of newlines followed by a
795 single non-newline. Check for overflow. */
796 worst_case = file_size * worst_case_per_input_byte + 1;
797 if (file_size != worst_case / worst_case_per_input_byte
798 || size_bound - size <= worst_case)
806 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
807 must be at least sizeof (struct line). Allocate ALLOC bytes
811 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
813 /* Ensure that the line array is properly aligned. If the desired
814 size cannot be allocated, repeatedly halve it until allocation
815 succeeds. The smaller allocation may hurt overall performance,
816 but that's better than failing. */
819 alloc += sizeof (struct line) - alloc % sizeof (struct line);
820 buf->buf = malloc (alloc);
824 if (alloc <= line_bytes + 1)
828 buf->line_bytes = line_bytes;
830 buf->used = buf->left = buf->nlines = 0;
834 /* Return one past the limit of the line array. */
836 static inline struct line *
837 buffer_linelim (struct buffer const *buf)
839 return (struct line *) (buf->buf + buf->alloc);
842 /* Return a pointer to the first character of the field specified
846 begfield (const struct line *line, const struct keyfield *key)
848 char *ptr = line->text, *lim = ptr + line->length - 1;
849 size_t sword = key->sword;
850 size_t schar = key->schar;
851 size_t remaining_bytes;
853 /* The leading field separator itself is included in a field when -t
856 if (tab != TAB_DEFAULT)
857 while (ptr < lim && sword--)
859 while (ptr < lim && *ptr != tab)
865 while (ptr < lim && sword--)
867 while (ptr < lim && blanks[to_uchar (*ptr)])
869 while (ptr < lim && !blanks[to_uchar (*ptr)])
873 if (key->skipsblanks)
874 while (ptr < lim && blanks[to_uchar (*ptr)])
877 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
878 remaining_bytes = lim - ptr;
879 if (schar < remaining_bytes)
887 /* Return the limit of (a pointer to the first character after) the field
888 in LINE specified by KEY. */
891 limfield (const struct line *line, const struct keyfield *key)
893 char *ptr = line->text, *lim = ptr + line->length - 1;
894 size_t eword = key->eword, echar = key->echar;
895 size_t remaining_bytes;
897 /* Move PTR past EWORD fields or to one past the last byte on LINE,
898 whichever comes first. If there are more than EWORD fields, leave
899 PTR pointing at the beginning of the field having zero-based index,
900 EWORD. If a delimiter character was specified (via -t), then that
901 `beginning' is the first character following the delimiting TAB.
902 Otherwise, leave PTR pointing at the first `blank' character after
903 the preceding field. */
904 if (tab != TAB_DEFAULT)
905 while (ptr < lim && eword--)
907 while (ptr < lim && *ptr != tab)
909 if (ptr < lim && (eword | echar))
913 while (ptr < lim && eword--)
915 while (ptr < lim && blanks[to_uchar (*ptr)])
917 while (ptr < lim && !blanks[to_uchar (*ptr)])
921 #ifdef POSIX_UNSPECIFIED
922 /* The following block of code makes GNU sort incompatible with
923 standard Unix sort, so it's ifdef'd out for now.
924 The POSIX spec isn't clear on how to interpret this.
925 FIXME: request clarification.
927 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
928 Date: Thu, 30 May 96 12:20:41 -0400
929 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
931 [...]I believe I've found another bug in `sort'.
936 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
939 $ /bin/sort -k1.7,1.7 </tmp/sort.in
943 Unix sort produced the answer I expected: sort on the single character
944 in column 7. GNU sort produced different results, because it disagrees
945 on the interpretation of the key-end spec "M.N". Unix sort reads this
946 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
947 "skip M-1 fields, then either N-1 characters or the rest of the current
948 field, whichever comes first". This extra clause applies only to
949 key-ends, not key-starts.
952 /* Make LIM point to the end of (one byte past) the current field. */
953 if (tab != TAB_DEFAULT)
956 newlim = memchr (ptr, tab, lim - ptr);
964 while (newlim < lim && blanks[to_uchar (*newlim)])
966 while (newlim < lim && !blanks[to_uchar (*newlim)])
972 /* If we're ignoring leading blanks when computing the End
973 of the field, don't start counting bytes until after skipping
974 past any leading blanks. */
975 if (key->skipeblanks)
976 while (ptr < lim && blanks[to_uchar (*ptr)])
979 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
980 remaining_bytes = lim - ptr;
981 if (echar < remaining_bytes)
989 /* Fill BUF reading from FP, moving buf->left bytes from the end
990 of buf->buf to the beginning first. If EOF is reached and the
991 file wasn't terminated by a newline, supply one. Set up BUF's line
992 table too. FILE is the name of the file corresponding to FP.
993 Return true if some input was read. */
996 fillbuf (struct buffer *buf, FILE *fp, char const *file)
998 struct keyfield const *key = keylist;
1000 size_t line_bytes = buf->line_bytes;
1001 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1006 if (buf->used != buf->left)
1008 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1009 buf->used = buf->left;
1015 char *ptr = buf->buf + buf->used;
1016 struct line *linelim = buffer_linelim (buf);
1017 struct line *line = linelim - buf->nlines;
1018 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1019 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1021 while (line_bytes + 1 < avail)
1023 /* Read as many bytes as possible, but do not read so many
1024 bytes that there might not be enough room for the
1025 corresponding line array. The worst case is when the
1026 rest of the input file consists entirely of newlines,
1027 except that the last byte is not a newline. */
1028 size_t readsize = (avail - 1) / (line_bytes + 1);
1029 size_t bytes_read = fread (ptr, 1, readsize, fp);
1030 char *ptrlim = ptr + bytes_read;
1032 avail -= bytes_read;
1034 if (bytes_read != readsize)
1037 die (_("read failed"), file);
1041 if (buf->buf == ptrlim)
1043 if (ptrlim[-1] != eol)
1048 /* Find and record each line in the just-read input. */
1049 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1053 line->text = line_start;
1054 line->length = ptr - line_start;
1055 mergesize = MAX (mergesize, line->length);
1056 avail -= line_bytes;
1060 /* Precompute the position of the first key for
1062 line->keylim = (key->eword == SIZE_MAX
1064 : limfield (line, key));
1066 if (key->sword != SIZE_MAX)
1067 line->keybeg = begfield (line, key);
1070 if (key->skipsblanks)
1071 while (blanks[to_uchar (*line_start)])
1073 line->keybeg = line_start;
1085 buf->used = ptr - buf->buf;
1086 buf->nlines = buffer_linelim (buf) - line;
1087 if (buf->nlines != 0)
1089 buf->left = ptr - line_start;
1090 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1094 /* The current input line is too long to fit in the buffer.
1095 Double the buffer size and try again. */
1096 buf->buf = X2REALLOC (buf->buf, &buf->alloc);
1100 /* Compare strings A and B as numbers without explicitly converting them to
1101 machine numbers. Comparatively slow for short strings, but asymptotically
1105 numcompare (const char *a, const char *b)
1107 while (blanks[to_uchar (*a)])
1109 while (blanks[to_uchar (*b)])
1112 return strnumcmp (a, b, decimal_point, thousands_sep);
1116 general_numcompare (const char *sa, const char *sb)
1118 /* FIXME: add option to warn about failed conversions. */
1119 /* FIXME: maybe add option to try expensive FP conversion
1120 only if A and B can't be compared more cheaply/accurately. */
1124 double a = strtod (sa, &ea);
1125 double b = strtod (sb, &eb);
1127 /* Put conversion errors at the start of the collating sequence. */
1129 return sb == eb ? 0 : -1;
1133 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1134 conversion errors but before numbers; sort them by internal
1135 bit-pattern, for lack of a more portable alternative. */
1141 : memcmp ((char *) &a, (char *) &b, sizeof a));
1144 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1145 Return 0 if the name in S is not recognized. */
1148 getmonth (char const *month, size_t len)
1151 size_t hi = MONTHS_PER_YEAR;
1152 char const *monthlim = month + len;
1156 if (month == monthlim)
1158 if (!blanks[to_uchar (*month)])
1165 size_t ix = (lo + hi) / 2;
1166 char const *m = month;
1167 char const *n = monthtab[ix].name;
1172 return monthtab[ix].val;
1173 if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1178 else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1190 /* A source of random data. */
1191 static struct randread_source *randread_source;
1193 /* Return the Ith randomly-generated state. The caller must invoke
1194 random_state (H) for all H less than I before invoking random_state
1197 static struct md5_ctx
1198 random_state (size_t i)
1200 /* An array of states resulting from the random data, and counts of
1201 its used and allocated members. */
1202 static struct md5_ctx *state;
1204 static size_t allocated;
1206 struct md5_ctx *s = &state[i];
1210 unsigned char buf[MD5_DIGEST_SIZE];
1216 state = X2NREALLOC (state, &allocated);
1220 randread (randread_source, buf, sizeof buf);
1222 md5_process_bytes (buf, sizeof buf, s);
1228 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1229 with length LENGTHB. Return negative if less, zero if equal,
1230 positive if greater. */
1233 cmp_hashes (char const *texta, size_t lena,
1234 char const *textb, size_t lenb)
1236 /* Try random hashes until a pair of hashes disagree. But if the
1237 first pair of random hashes agree, check whether the keys are
1238 identical and if so report no difference. */
1243 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
1244 struct md5_ctx s[2];
1245 s[0] = s[1] = random_state (i);
1246 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
1247 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
1248 diff = memcmp (dig[0], dig[1], sizeof dig[0]);
1251 if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
1258 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1259 using one or more random hash functions. */
1262 compare_random (char *restrict texta, size_t lena,
1263 char *restrict textb, size_t lenb)
1267 if (! hard_LC_COLLATE)
1268 diff = cmp_hashes (texta, lena, textb, lenb);
1271 /* Transform the text into the basis of comparison, so that byte
1272 strings that would otherwise considered to be equal are
1273 considered equal here even if their bytes differ. */
1276 char stackbuf[4000];
1277 size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
1278 bool a_fits = tlena <= sizeof stackbuf;
1279 size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
1280 (a_fits ? sizeof stackbuf - tlena : 0),
1283 if (a_fits && tlena + tlenb <= sizeof stackbuf)
1287 /* Adding 1 to the buffer size lets xmemxfrm run a bit
1288 faster by avoiding the need for an extra buffer copy. */
1289 buf = xmalloc (tlena + tlenb + 1);
1290 xmemxfrm (buf, tlena + 1, texta, lena);
1291 xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
1294 diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
1296 if (buf != stackbuf)
1303 /* Compare two lines A and B trying every key in sequence until there
1304 are no more keys or a difference is found. */
1307 keycompare (const struct line *a, const struct line *b)
1309 struct keyfield const *key = keylist;
1311 /* For the first iteration only, the key positions have been
1312 precomputed for us. */
1313 char *texta = a->keybeg;
1314 char *textb = b->keybeg;
1315 char *lima = a->keylim;
1316 char *limb = b->keylim;
1322 char const *translate = key->translate;
1323 bool const *ignore = key->ignore;
1325 /* Find the lengths. */
1326 size_t lena = lima <= texta ? 0 : lima - texta;
1327 size_t lenb = limb <= textb ? 0 : limb - textb;
1329 /* Actually compare the fields. */
1332 diff = compare_random (texta, lena, textb, lenb);
1333 else if (key->numeric | key->general_numeric)
1335 char savea = *lima, saveb = *limb;
1337 *lima = *limb = '\0';
1338 diff = ((key->numeric ? numcompare : general_numcompare)
1340 *lima = savea, *limb = saveb;
1342 else if (key->month)
1343 diff = getmonth (texta, lena) - getmonth (textb, lenb);
1344 /* Sorting like this may become slow, so in a simple locale the user
1345 can select a faster sort that is similar to ascii sort. */
1346 else if (hard_LC_COLLATE)
1348 if (ignore || translate)
1351 size_t size = lena + 1 + lenb + 1;
1352 char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1353 char *copy_b = copy_a + lena + 1;
1354 size_t new_len_a, new_len_b, i;
1356 /* Ignore and/or translate chars before comparing. */
1357 for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1361 copy_a[new_len_a] = (translate
1362 ? translate[to_uchar (texta[i])]
1364 if (!ignore || !ignore[to_uchar (texta[i])])
1369 copy_b[new_len_b] = (translate
1370 ? translate[to_uchar (textb[i])]
1372 if (!ignore || !ignore[to_uchar (textb[i])])
1377 diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1379 if (sizeof buf < size)
1383 diff = - NONZERO (lenb);
1387 diff = xmemcoll (texta, lena, textb, lenb);
1391 #define CMP_WITH_IGNORE(A, B) \
1396 while (texta < lima && ignore[to_uchar (*texta)]) \
1398 while (textb < limb && ignore[to_uchar (*textb)]) \
1400 if (! (texta < lima && textb < limb)) \
1402 diff = to_uchar (A) - to_uchar (B); \
1409 diff = (texta < lima) - (textb < limb); \
1414 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1415 translate[to_uchar (*textb)]);
1417 CMP_WITH_IGNORE (*texta, *textb);
1420 diff = - NONZERO (lenb);
1427 while (texta < lima && textb < limb)
1429 diff = (to_uchar (translate[to_uchar (*texta++)])
1430 - to_uchar (translate[to_uchar (*textb++)]));
1437 diff = memcmp (texta, textb, MIN (lena, lenb));
1441 diff = lena < lenb ? -1 : lena != lenb;
1451 /* Find the beginning and limit of the next field. */
1452 if (key->eword != SIZE_MAX)
1453 lima = limfield (a, key), limb = limfield (b, key);
1455 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1457 if (key->sword != SIZE_MAX)
1458 texta = begfield (a, key), textb = begfield (b, key);
1461 texta = a->text, textb = b->text;
1462 if (key->skipsblanks)
1464 while (texta < lima && blanks[to_uchar (*texta)])
1466 while (textb < limb && blanks[to_uchar (*textb)])
1477 return key->reverse ? -diff : diff;
1480 /* Compare two lines A and B, returning negative, zero, or positive
1481 depending on whether A compares less than, equal to, or greater than B. */
1484 compare (const struct line *a, const struct line *b)
1489 /* First try to compare on the specified keys (if any).
1490 The only two cases with no key at all are unadorned sort,
1491 and unadorned sort -r. */
1494 diff = keycompare (a, b);
1495 if (diff | unique | stable)
1499 /* If the keys all compare equal (or no keys were specified)
1500 fall through to the default comparison. */
1501 alen = a->length - 1, blen = b->length - 1;
1504 diff = - NONZERO (blen);
1507 else if (hard_LC_COLLATE)
1508 diff = xmemcoll (a->text, alen, b->text, blen);
1509 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1510 diff = alen < blen ? -1 : alen != blen;
1512 return reverse ? -diff : diff;
1515 /* Check that the lines read from FILE_NAME come in order. Print a
1516 diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1517 false if they are not in order. Otherwise, print no diagnostic
1521 check (char const *file_name)
1523 FILE *fp = xfopen (file_name, "r");
1524 struct buffer buf; /* Input buffer. */
1525 struct line temp; /* Copy of previous line. */
1527 uintmax_t line_number = 0;
1528 struct keyfield const *key = keylist;
1529 bool nonunique = ! unique;
1530 bool ordered = true;
1532 initbuf (&buf, sizeof (struct line),
1533 MAX (merge_buffer_size, sort_size));
1536 while (fillbuf (&buf, fp, file_name))
1538 struct line const *line = buffer_linelim (&buf);
1539 struct line const *linebase = line - buf.nlines;
1541 /* Make sure the line saved from the old buffer contents is
1542 less than or equal to the first line of the new buffer. */
1543 if (alloc && nonunique <= compare (&temp, line - 1))
1547 struct line const *disorder_line = line - 1;
1548 uintmax_t disorder_line_number =
1549 buffer_linelim (&buf) - disorder_line + line_number;
1550 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1551 fprintf (stderr, _("%s: %s:%s: disorder: "),
1552 program_name, file_name,
1553 umaxtostr (disorder_line_number, hr_buf));
1554 write_bytes (disorder_line->text, disorder_line->length, stderr,
1555 _("standard error"));
1561 /* Compare each line in the buffer with its successor. */
1562 while (linebase < --line)
1563 if (nonunique <= compare (line, line - 1))
1564 goto found_disorder;
1566 line_number += buf.nlines;
1568 /* Save the last line of the buffer. */
1569 if (alloc < line->length)
1576 alloc = line->length;
1580 while (alloc < line->length);
1582 temp.text = xrealloc (temp.text, alloc);
1584 memcpy (temp.text, line->text, line->length);
1585 temp.length = line->length;
1588 temp.keybeg = temp.text + (line->keybeg - line->text);
1589 temp.keylim = temp.text + (line->keylim - line->text);
1593 xfclose (fp, file_name);
1599 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
1600 files (all of which are at the start of the FILES array), and
1601 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
1602 Close input and output files before returning.
1603 OUTPUT_FILE gives the name of the output file. If it is NULL,
1604 the output file is standard output. If OFP is NULL, the output
1605 file has not been opened yet (or written to, if standard output). */
1608 mergefps (char **files, size_t ntemps, size_t nfiles,
1609 FILE *ofp, char const *output_file)
1611 FILE *fps[NMERGE]; /* Input streams for each file. */
1612 struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1613 struct line saved; /* Saved line storage for unique check. */
1614 struct line const *savedline = NULL;
1615 /* &saved if there is a saved line. */
1616 size_t savealloc = 0; /* Size allocated for the saved line. */
1617 struct line const *cur[NMERGE]; /* Current line in each line table. */
1618 struct line const *base[NMERGE]; /* Base of each line table. */
1619 size_t ord[NMERGE]; /* Table representing a permutation of fps,
1620 such that cur[ord[0]] is the smallest line
1621 and will be next output. */
1625 struct keyfield const *key = keylist;
1628 /* Read initial lines from each input file. */
1629 for (i = 0; i < nfiles; )
1631 fps[i] = xfopen (files[i], "r");
1632 initbuf (&buffer[i], sizeof (struct line),
1633 MAX (merge_buffer_size, sort_size / nfiles));
1634 if (fillbuf (&buffer[i], fps[i], files[i]))
1636 struct line const *linelim = buffer_linelim (&buffer[i]);
1637 cur[i] = linelim - 1;
1638 base[i] = linelim - buffer[i].nlines;
1643 /* fps[i] is empty; eliminate it from future consideration. */
1644 xfclose (fps[i], files[i]);
1650 free (buffer[i].buf);
1652 for (j = i; j < nfiles; ++j)
1653 files[j] = files[j + 1];
1658 ofp = xfopen (output_file, "w");
1660 /* Set up the ord table according to comparisons among input lines.
1661 Since this only reorders two items if one is strictly greater than
1662 the other, it is stable. */
1663 for (i = 0; i < nfiles; ++i)
1665 for (i = 1; i < nfiles; ++i)
1666 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
1667 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1669 /* Repeatedly output the smallest line until no input remains. */
1672 struct line const *smallest = cur[ord[0]];
1674 /* If uniquified output is turned on, output only the first of
1675 an identical series of lines. */
1678 if (savedline && compare (savedline, smallest))
1681 write_bytes (saved.text, saved.length, ofp, output_file);
1686 if (savealloc < smallest->length)
1691 savealloc = smallest->length;
1694 while ((savealloc *= 2) < smallest->length);
1696 saved.text = xrealloc (saved.text, savealloc);
1698 saved.length = smallest->length;
1699 memcpy (saved.text, smallest->text, saved.length);
1703 saved.text + (smallest->keybeg - smallest->text);
1705 saved.text + (smallest->keylim - smallest->text);
1710 write_bytes (smallest->text, smallest->length, ofp, output_file);
1712 /* Check if we need to read more lines into core. */
1713 if (base[ord[0]] < smallest)
1714 cur[ord[0]] = smallest - 1;
1717 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]]))
1719 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
1720 cur[ord[0]] = linelim - 1;
1721 base[ord[0]] = linelim - buffer[ord[0]].nlines;
1725 /* We reached EOF on fps[ord[0]]. */
1726 for (i = 1; i < nfiles; ++i)
1727 if (ord[i] > ord[0])
1730 xfclose (fps[ord[0]], files[ord[0]]);
1731 if (ord[0] < ntemps)
1734 zaptemp (files[ord[0]]);
1736 free (buffer[ord[0]].buf);
1737 for (i = ord[0]; i < nfiles; ++i)
1739 fps[i] = fps[i + 1];
1740 files[i] = files[i + 1];
1741 buffer[i] = buffer[i + 1];
1742 cur[i] = cur[i + 1];
1743 base[i] = base[i + 1];
1745 for (i = 0; i < nfiles; ++i)
1746 ord[i] = ord[i + 1];
1751 /* The new line just read in may be larger than other lines
1752 already in main memory; push it back in the queue until we
1753 encounter a line larger than it. Optimize for the common
1754 case where the new line is smallest. */
1759 size_t ord0 = ord[0];
1760 size_t count_of_smaller_lines;
1764 int cmp = compare (cur[ord0], cur[ord[probe]]);
1765 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
1769 probe = (lo + hi) / 2;
1772 count_of_smaller_lines = lo - 1;
1773 for (j = 0; j < count_of_smaller_lines; j++)
1774 ord[j] = ord[j + 1];
1775 ord[count_of_smaller_lines] = ord0;
1779 if (unique && savedline)
1781 write_bytes (saved.text, saved.length, ofp, output_file);
1785 xfclose (ofp, output_file);
1788 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1789 and HI (with NHI members). T, LO, and HI point just past their
1790 respective arrays, and the arrays are in reverse order. NLO and
1791 NHI must be positive, and HI - NHI must equal T - (NLO + NHI). */
1794 mergelines (struct line *t,
1795 struct line const *lo, size_t nlo,
1796 struct line const *hi, size_t nhi)
1799 if (compare (lo - 1, hi - 1) <= 0)
1804 /* HI - NHI equalled T - (NLO + NHI) when this function
1805 began. Therefore HI must equal T now, and there is no
1806 need to copy from HI to T. */
1824 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1825 NLINES must be at least 2.
1826 The input and output arrays are in reverse order, and LINES and
1827 TEMP point just past the end of their respective arrays.
1829 Use a recursive divide-and-conquer algorithm, in the style
1830 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
1831 the optimization suggested by exercise 5.2.4-10; this requires room
1832 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
1833 writes that this memory optimization was originally published by
1834 D. A. Bell, Comp J. 1 (1958), 75. */
1837 sortlines (struct line *lines, size_t nlines, struct line *temp)
1841 if (0 < compare (&lines[-1], &lines[-2]))
1843 struct line tmp = lines[-1];
1844 lines[-1] = lines[-2];
1850 size_t nlo = nlines / 2;
1851 size_t nhi = nlines - nlo;
1852 struct line *lo = lines;
1853 struct line *hi = lines - nlo;
1854 struct line *sorted_lo = temp;
1856 sortlines (hi, nhi, temp);
1858 sortlines_temp (lo, nlo, sorted_lo);
1860 sorted_lo[-1] = lo[-1];
1862 mergelines (lines, sorted_lo, nlo, hi, nhi);
1866 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1867 rather than sorting in place. */
1870 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
1874 /* Declare `swap' as int, not bool, to work around a bug
1875 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
1876 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
1877 int swap = (0 < compare (&lines[-1], &lines[-2]));
1878 temp[-1] = lines[-1 - swap];
1879 temp[-2] = lines[-2 + swap];
1883 size_t nlo = nlines / 2;
1884 size_t nhi = nlines - nlo;
1885 struct line *lo = lines;
1886 struct line *hi = lines - nlo;
1887 struct line *sorted_hi = temp - nlo;
1889 sortlines_temp (hi, nhi, sorted_hi);
1891 sortlines (lo, nlo, temp);
1893 mergelines (temp, lo, nlo, sorted_hi, nhi);
1897 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
1898 the same as OUTFILE. If found, merge the found instances (and perhaps
1899 some other files) into a temporary file so that it can in turn be
1900 merged into OUTFILE without destroying OUTFILE before it is completely
1901 read. Return the new value of NFILES, which differs from the old if
1902 some merging occurred.
1904 This test ensures that an otherwise-erroneous use like
1905 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
1906 It's not clear that POSIX requires this nicety.
1907 Detect common error cases, but don't try to catch obscure cases like
1908 "cat ... FILE ... | sort -m -o FILE"
1909 where traditional "sort" doesn't copy the input and where
1910 people should know that they're getting into trouble anyway.
1911 Catching these obscure cases would slow down performance in
1915 avoid_trashing_input (char **files, size_t ntemps, size_t nfiles,
1916 char const *outfile)
1919 bool got_outstat = false;
1920 struct stat outstat;
1922 for (i = ntemps; i < nfiles; i++)
1924 bool is_stdin = STREQ (files[i], "-");
1928 if (outfile && STREQ (outfile, files[i]) && !is_stdin)
1935 ? stat (outfile, &outstat)
1936 : fstat (STDOUT_FILENO, &outstat))
1943 ? fstat (STDIN_FILENO, &instat)
1944 : stat (files[i], &instat))
1946 && SAME_INODE (instat, outstat));
1952 char *temp = create_temp_file (&tftp);
1953 mergefps (&files[i], 0, nfiles - i, tftp, temp);
1962 /* Merge the input FILES. NTEMPS is the number of files at the
1963 start of FILES that are temporary; it is zero at the top level.
1964 NFILES is the total number of files. Put the output in
1965 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
1968 merge (char **files, size_t ntemps, size_t nfiles, char const *output_file)
1970 while (NMERGE < nfiles)
1972 /* Number of input files processed so far. */
1975 /* Number of output files generated so far. */
1978 /* nfiles % NMERGE; this counts input files that are left over
1979 after all full-sized merges have been done. */
1982 /* Number of easily-available slots at the next loop iteration. */
1985 /* Do as many NMERGE-size merges as possible. */
1986 for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
1989 char *temp = create_temp_file (&tfp);
1990 size_t nt = MIN (ntemps, NMERGE);
1992 mergefps (&files[in], nt, NMERGE, tfp, temp);
1996 remainder = nfiles - in;
1997 cheap_slots = NMERGE - out % NMERGE;
1999 if (cheap_slots < remainder)
2001 /* So many files remain that they can't all be put into the last
2002 NMERGE-sized output window. Do one more merge. Merge as few
2003 files as possible, to avoid needless I/O. */
2004 size_t nshortmerge = remainder - cheap_slots + 1;
2006 char *temp = create_temp_file (&tfp);
2007 size_t nt = MIN (ntemps, nshortmerge);
2009 mergefps (&files[in], nt, nshortmerge, tfp, temp);
2010 files[out++] = temp;
2014 /* Put the remaining input files into the last NMERGE-sized output
2015 window, so they will be merged in the next pass. */
2016 memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2021 nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2022 mergefps (files, ntemps, nfiles, NULL, output_file);
2025 /* Sort NFILES FILES onto OUTPUT_FILE. */
2028 sort (char * const *files, size_t nfiles, char const *output_file)
2032 bool output_file_created = false;
2038 char const *temp_output;
2039 char const *file = *files;
2040 FILE *fp = xfopen (file, "r");
2042 size_t bytes_per_line = (2 * sizeof (struct line)
2043 - sizeof (struct line) / 2);
2046 initbuf (&buf, bytes_per_line,
2047 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2052 while (fillbuf (&buf, fp, file))
2055 struct line *linebase;
2057 if (buf.eof && nfiles
2058 && (bytes_per_line + 1
2059 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2061 /* End of file, but there is more input and buffer room.
2062 Concatenate the next input file; this is faster in
2064 buf.left = buf.used;
2068 line = buffer_linelim (&buf);
2069 linebase = line - buf.nlines;
2071 sortlines (line, buf.nlines, linebase);
2072 if (buf.eof && !nfiles && !ntemps && !buf.left)
2075 tfp = xfopen (output_file, "w");
2076 temp_output = output_file;
2077 output_file_created = true;
2082 temp_output = create_temp_file (&tfp);
2088 write_bytes (line->text, line->length, tfp, temp_output);
2090 while (linebase < line && compare (line, line - 1) == 0)
2093 while (linebase < line);
2095 xfclose (tfp, temp_output);
2097 if (output_file_created)
2106 if (! output_file_created)
2109 struct tempnode *node = temphead;
2110 char **tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2111 for (i = 0; node; i++)
2113 tempfiles[i] = node->name;
2116 merge (tempfiles, ntemps, ntemps, output_file);
2121 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
2124 insertkey (struct keyfield *key_arg)
2126 struct keyfield **p;
2127 struct keyfield *key = xmemdup (key_arg, sizeof *key);
2129 for (p = &keylist; *p; p = &(*p)->next)
2135 /* Report a bad field specification SPEC, with extra info MSGID. */
2137 static void badfieldspec (char const *, char const *)
2140 badfieldspec (char const *spec, char const *msgid)
2142 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
2143 _(msgid), quote (spec));
2147 /* Report incompatible options. */
2149 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
2151 incompatible_options (char const *opts)
2153 error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
2157 /* Check compatibility of ordering options. */
2160 check_ordering_compatibility (void)
2162 struct keyfield const *key;
2164 for (key = keylist; key; key = key->next)
2165 if ((1 < (key->random + key->numeric + key->general_numeric + key->month
2167 || (key->random && key->translate))
2171 if (key->ignore == nondictionary)
2175 if (key->general_numeric)
2177 if (key->ignore == nonprinting)
2186 incompatible_options (opts);
2190 /* Parse the leading integer in STRING and store the resulting value
2191 (which must fit into size_t) into *VAL. Return the address of the
2192 suffix after the integer. If the value is too large, silently
2193 substitute SIZE_MAX. If MSGID is NULL, return NULL after
2194 failure; otherwise, report MSGID and exit on failure. */
2197 parse_field_count (char const *string, size_t *val, char const *msgid)
2202 switch (xstrtoumax (string, &suffix, 10, &n, ""))
2205 case LONGINT_INVALID_SUFFIX_CHAR:
2210 case LONGINT_OVERFLOW:
2211 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2215 case LONGINT_INVALID:
2217 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
2218 _(msgid), quote (string));
2225 /* Handle interrupts and hangups. */
2228 sighandler (int sig)
2231 signal (sig, SIG_IGN);
2235 signal (sig, SIG_DFL);
2239 /* Set the ordering options for KEY specified in S.
2240 Return the address of the first character in S that
2241 is not a valid ordering option.
2242 BLANKTYPE is the kind of blanks that 'b' should skip. */
2245 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2252 if (blanktype == bl_start || blanktype == bl_both)
2253 key->skipsblanks = true;
2254 if (blanktype == bl_end || blanktype == bl_both)
2255 key->skipeblanks = true;
2258 key->ignore = nondictionary;
2261 key->translate = fold_toupper;
2264 key->general_numeric = true;
2267 /* Option order should not matter, so don't let -i override
2268 -d. -d implies -i, but -i does not imply -d. */
2270 key->ignore = nonprinting;
2276 key->numeric = true;
2282 key->reverse = true;
2292 static struct keyfield *
2293 key_init (struct keyfield *key)
2295 memset (key, 0, sizeof *key);
2296 key->eword = SIZE_MAX;
2301 main (int argc, char **argv)
2303 struct keyfield *key;
2304 struct keyfield key_buf;
2305 struct keyfield gkey;
2308 bool checkonly = false;
2309 bool mergeonly = false;
2310 char *random_source = NULL;
2311 bool need_random = false;
2313 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2314 bool obsolete_usage = (posix2_version () < 200112);
2316 char const *outfile = NULL;
2318 initialize_main (&argc, &argv);
2319 program_name = argv[0];
2320 setlocale (LC_ALL, "");
2321 bindtextdomain (PACKAGE, LOCALEDIR);
2322 textdomain (PACKAGE);
2324 initialize_exit_failure (SORT_FAILURE);
2326 hard_LC_COLLATE = hard_locale (LC_COLLATE);
2327 #if HAVE_NL_LANGINFO
2328 hard_LC_TIME = hard_locale (LC_TIME);
2331 /* Get locale's representation of the decimal point. */
2333 struct lconv const *locale = localeconv ();
2335 /* If the locale doesn't define a decimal point, or if the decimal
2336 point is multibyte, use the C locale's decimal point. FIXME:
2337 add support for multibyte decimal points. */
2338 decimal_point = to_uchar (locale->decimal_point[0]);
2339 if (! decimal_point || locale->decimal_point[1])
2340 decimal_point = '.';
2342 /* FIXME: add support for multibyte thousands separators. */
2343 thousands_sep = to_uchar (*locale->thousands_sep);
2344 if (! thousands_sep || locale->thousands_sep[1])
2348 have_read_stdin = false;
2353 static int const sig[] =
2355 /* The usual suspects. */
2356 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
2373 enum { nsigs = sizeof sig / sizeof sig[0] };
2376 struct sigaction act;
2378 sigemptyset (&caught_signals);
2379 for (i = 0; i < nsigs; i++)
2381 sigaction (sig[i], NULL, &act);
2382 if (act.sa_handler != SIG_IGN)
2383 sigaddset (&caught_signals, sig[i]);
2386 act.sa_handler = sighandler;
2387 act.sa_mask = caught_signals;
2390 for (i = 0; i < nsigs; i++)
2391 if (sigismember (&caught_signals, sig[i]))
2392 sigaction (sig[i], &act, NULL);
2394 for (i = 0; i < nsigs; i++)
2395 if (signal (sig[i], SIG_IGN) != SIG_IGN)
2397 signal (sig[i], sighandler);
2398 siginterrupt (sig[i], 1);
2403 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
2404 atexit (exit_cleanup);
2406 gkey.sword = gkey.eword = SIZE_MAX;
2408 gkey.translate = NULL;
2409 gkey.numeric = gkey.general_numeric = gkey.random = false;
2410 gkey.month = gkey.reverse = false;
2411 gkey.skipsblanks = gkey.skipeblanks = false;
2413 files = xnmalloc (argc, sizeof *files);
2417 /* Parse an operand as a file after "--" was seen; or if
2418 pedantic and a file was seen, unless the POSIX version
2419 predates 1003.1-2001 and -c was not seen and the operand is
2420 "-o FILE" or "-oFILE". */
2423 || (posixly_correct && nfiles != 0
2424 && ! (obsolete_usage
2427 && argv[optind][0] == '-' && argv[optind][1] == 'o'
2428 && (argv[optind][2] || optind + 1 != argc)))
2429 || ((c = getopt_long (argc, argv, short_options,
2430 long_options, NULL))
2435 files[nfiles++] = argv[optind++];
2441 if (optarg[0] == '+')
2443 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
2444 && ISDIGIT (argv[optind][1]));
2445 obsolete_usage |= minus_pos_usage & ~posixly_correct;
2448 /* Treat +POS1 [-POS2] as a key if possible; but silently
2449 treat an operand as a file if it is not a valid +POS1. */
2450 key = key_init (&key_buf);
2451 s = parse_field_count (optarg + 1, &key->sword, NULL);
2453 s = parse_field_count (s + 1, &key->schar, NULL);
2454 if (! (key->sword | key->schar))
2455 key->sword = SIZE_MAX;
2456 if (! s || *set_ordering (s, key, bl_start))
2463 if (minus_pos_usage)
2465 char const *optarg1 = argv[optind++];
2466 s = parse_field_count (optarg1 + 1, &key->eword,
2467 N_("invalid number after `-'"));
2469 s = parse_field_count (s + 1, &key->echar,
2470 N_("invalid number after `.'"));
2471 if (*set_ordering (s, key, bl_end))
2472 badfieldspec (optarg1,
2473 N_("stray character in field spec"));
2480 files[nfiles++] = optarg;
2496 set_ordering (str, &gkey, bl_both);
2505 key = key_init (&key_buf);
2508 s = parse_field_count (optarg, &key->sword,
2509 N_("invalid number at field start"));
2512 /* Provoke with `sort -k0' */
2513 badfieldspec (optarg, N_("field number is zero"));
2517 s = parse_field_count (s + 1, &key->schar,
2518 N_("invalid number after `.'"));
2521 /* Provoke with `sort -k1.0' */
2522 badfieldspec (optarg, N_("character offset is zero"));
2525 if (! (key->sword | key->schar))
2526 key->sword = SIZE_MAX;
2527 s = set_ordering (s, key, bl_start);
2530 key->eword = SIZE_MAX;
2536 s = parse_field_count (s + 1, &key->eword,
2537 N_("invalid number after `,'"));
2540 /* Provoke with `sort -k1,0' */
2541 badfieldspec (optarg, N_("field number is zero"));
2544 s = parse_field_count (s + 1, &key->echar,
2545 N_("invalid number after `.'"));
2548 /* `-k 2,3' is equivalent to `+1 -3'. */
2551 s = set_ordering (s, key, bl_end);
2554 badfieldspec (optarg, N_("stray character in field spec"));
2563 if (outfile && !STREQ (outfile, optarg))
2564 error (SORT_FAILURE, 0, _("multiple output files specified"));
2568 case RANDOM_SOURCE_OPTION:
2569 if (random_source && !STREQ (random_source, optarg))
2570 error (SORT_FAILURE, 0, _("multiple random sources specified"));
2571 random_source = optarg;
2579 specify_sort_size (optarg);
2584 char newtab = optarg[0];
2586 error (SORT_FAILURE, 0, _("empty tab"));
2589 if (STREQ (optarg, "\\0"))
2593 /* Provoke with `sort -txx'. Complain about
2594 "multi-character tab" instead of "multibyte tab", so
2595 that the diagnostic's wording does not need to be
2596 changed once multibyte characters are supported. */
2597 error (SORT_FAILURE, 0, _("multi-character tab %s"),
2601 if (tab != TAB_DEFAULT && tab != newtab)
2602 error (SORT_FAILURE, 0, _("incompatible tabs"));
2608 add_temp_dir (optarg);
2616 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
2617 through Solaris 7. It is also accepted by many non-Solaris
2618 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
2619 -y is marked as obsolete starting with Solaris 8 (1999), but is
2620 still accepted as of Solaris 10 prerelease (2004).
2622 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
2623 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
2624 and which in general ignores the argument after "-y" if it
2625 consists entirely of digits (it can even be empty). */
2626 if (optarg == argv[optind - 1])
2629 for (p = optarg; ISDIGIT (*p); p++)
2631 optind -= (*p != '\0');
2639 case_GETOPT_HELP_CHAR;
2641 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2644 usage (SORT_FAILURE);
2648 /* Inheritance of global options to individual keys. */
2649 for (key = keylist; key; key = key->next)
2651 if (! (key->ignore || key->translate
2652 || (key->skipsblanks | key->reverse
2653 | key->skipeblanks | key->month | key->numeric
2654 | key->general_numeric
2657 key->ignore = gkey.ignore;
2658 key->translate = gkey.translate;
2659 key->skipsblanks = gkey.skipsblanks;
2660 key->skipeblanks = gkey.skipeblanks;
2661 key->month = gkey.month;
2662 key->numeric = gkey.numeric;
2663 key->general_numeric = gkey.general_numeric;
2664 key->random = gkey.random;
2665 key->reverse = gkey.reverse;
2668 need_random |= key->random;
2671 if (!keylist && (gkey.ignore || gkey.translate
2672 || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
2673 | gkey.numeric | gkey.general_numeric
2677 need_random |= gkey.random;
2680 check_ordering_compatibility ();
2682 reverse = gkey.reverse;
2686 randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
2687 if (! randread_source)
2688 die (_("open failed"), random_source);
2691 if (temp_dir_count == 0)
2693 char const *tmp_dir = getenv ("TMPDIR");
2694 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
2699 static char *minus = "-";
2708 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -c"),
2712 incompatible_options ("co");
2714 /* POSIX requires that sort return 1 IFF invoked with -c and the
2715 input is not properly sorted. */
2716 exit (check (files[0]) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
2720 merge (files, 0, nfiles, outfile);
2722 sort (files, nfiles, outfile);
2724 if (have_read_stdin && fclose (stdin) == EOF)
2725 die (_("close failed"), "-");
2727 exit (EXIT_SUCCESS);