1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988, 1991-2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
27 #include <sys/types.h>
34 #include "filevercmp.h"
35 #include "hard-locale.h"
38 #include "ignore-value.h"
47 #include "readtokens0.h"
50 #include "strnumcmp.h"
52 #include "xnanosleep.h"
55 #if HAVE_SYS_RESOURCE_H
56 # include <sys/resource.h>
59 struct rlimit { size_t rlim_cur; };
60 # define getrlimit(Resource, Rlp) (-1)
63 /* The official name of this program (e.g., no `g' prefix). */
64 #define PROGRAM_NAME "sort"
67 proper_name ("Mike Haertel"), \
68 proper_name ("Paul Eggert")
70 #if HAVE_LANGINFO_CODESET
71 # include <langinfo.h>
74 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
77 # define SA_NOCLDSTOP 0
78 /* No sigprocmask. Always 'return' zero. */
79 # define sigprocmask(How, Set, Oset) (0)
81 # if ! HAVE_SIGINTERRUPT
82 # define siginterrupt(sig, flag) /* empty */
86 #if !defined OPEN_MAX && defined NR_OPEN
87 # define OPEN_MAX NR_OPEN
93 #define UCHAR_LIM (UCHAR_MAX + 1)
96 # define long_double long double
98 # define long_double double
100 # define strtold strtod
103 #ifndef DEFAULT_TMPDIR
104 # define DEFAULT_TMPDIR "/tmp"
107 /* Maximum number of lines to merge every time a NODE is taken from
108 the merge queue. Node is at LEVEL in the binary merge tree,
109 and is responsible for merging TOTAL lines. */
110 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
112 /* Heuristic value for the number of lines for which it is worth creating
113 a subthread, during an internal merge sort. I.e., it is a small number
114 of "average" lines for which sorting via two threads is faster than
115 sorting via one on an "average" system. On an dual-core 2.0 GHz i686
116 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
117 lines of gensort -a output is sorted slightly faster with --parallel=2
118 than with --parallel=1. By contrast, using --parallel=1 is about 10%
119 faster than using --parallel=2 with a 64K-line input. */
120 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
121 verify (4 <= SUBTHREAD_LINES_HEURISTIC);
123 /* The number of threads after which there are
124 diminishing performance gains. */
125 enum { DEFAULT_MAX_THREADS = 8 };
130 /* POSIX says to exit with status 1 if invoked with -c and the
131 input is not properly sorted. */
132 SORT_OUT_OF_ORDER = 1,
134 /* POSIX says any other irregular exit must exit with a status
135 code greater than 1. */
141 /* The number of times we should try to fork a compression process
142 (we retry if the fork call fails). We don't _need_ to compress
143 temp files, this is just to reduce disk access, so this number
144 can be small. Each retry doubles in duration. */
145 MAX_FORK_TRIES_COMPRESS = 4,
147 /* The number of times we should try to fork a decompression process.
148 If we can't fork a decompression process, we can't sort, so this
149 number should be big. Each retry doubles in duration. */
150 MAX_FORK_TRIES_DECOMPRESS = 9
155 /* Level of the end-of-merge node, one level above the root. */
158 /* Level of the root node in merge tree. */
162 /* The representation of the decimal point in the current locale. */
163 static int decimal_point;
165 /* Thousands separator; if -1, then there isn't one. */
166 static int thousands_sep;
168 /* Nonzero if the corresponding locales are hard. */
169 static bool hard_LC_COLLATE;
171 static bool hard_LC_TIME;
174 #define NONZERO(x) ((x) != 0)
176 /* The kind of blanks for '-b' to skip in various options. */
177 enum blanktype { bl_start, bl_end, bl_both };
179 /* The character marking end of line. Default to \n. */
180 static char eolchar = '\n';
182 /* Lines are held in core as counted strings. */
185 char *text; /* Text of the line. */
186 size_t length; /* Length including final newline. */
187 char *keybeg; /* Start of first key. */
188 char *keylim; /* Limit of first key. */
194 char *buf; /* Dynamically allocated buffer,
195 partitioned into 3 regions:
198 - an array of lines, in reverse order. */
199 size_t used; /* Number of bytes used for input data. */
200 size_t nlines; /* Number of lines in the line array. */
201 size_t alloc; /* Number of bytes allocated. */
202 size_t left; /* Number of bytes left from previous reads. */
203 size_t line_bytes; /* Number of bytes to reserve for each line. */
204 bool eof; /* An EOF has been read. */
210 size_t sword; /* Zero-origin 'word' to start at. */
211 size_t schar; /* Additional characters to skip. */
212 size_t eword; /* Zero-origin last 'word' of key. */
213 size_t echar; /* Additional characters in field. */
214 bool const *ignore; /* Boolean array of characters to ignore. */
215 char const *translate; /* Translation applied to characters. */
216 bool skipsblanks; /* Skip leading blanks when finding start. */
217 bool skipeblanks; /* Skip leading blanks when finding end. */
218 bool numeric; /* Flag for numeric comparison. Handle
219 strings of digits with optional decimal
220 point, but no exponential notation. */
221 bool random; /* Sort by random hash of key. */
222 bool general_numeric; /* Flag for general, numeric comparison.
223 Handle numbers in exponential notation. */
224 bool human_numeric; /* Flag for sorting by human readable
225 units with either SI xor IEC prefixes. */
226 bool month; /* Flag for comparison by month name. */
227 bool reverse; /* Reverse the sense of comparison. */
228 bool version; /* sort by version number */
229 bool obsolete_used; /* obsolescent key option format is used. */
230 struct keyfield *next; /* Next keyfield to try. */
239 /* Binary merge tree node. */
242 struct line *lo; /* Lines to merge from LO child node. */
243 struct line *hi; /* Lines to merge from HI child ndoe. */
244 struct line *end_lo; /* End of available lines from LO. */
245 struct line *end_hi; /* End of available lines from HI. */
246 struct line **dest; /* Pointer to destination of merge. */
247 size_t nlo; /* Total Lines remaining from LO. */
248 size_t nhi; /* Total lines remaining from HI. */
249 struct merge_node *parent; /* Parent node. */
250 struct merge_node *lo_child; /* LO child node. */
251 struct merge_node *hi_child; /* HI child node. */
252 unsigned int level; /* Level in merge tree. */
253 bool queued; /* Node is already in heap. */
254 pthread_mutex_t lock; /* Lock for node operations. */
257 /* Priority queue of merge nodes. */
258 struct merge_node_queue
260 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
261 pthread_mutex_t mutex; /* Lock for queue operations. */
262 pthread_cond_t cond; /* Conditional wait for empty queue to populate
266 /* FIXME: None of these tables work with multibyte character sets.
267 Also, there are many other bugs when handling multibyte characters.
268 One way to fix this is to rewrite `sort' to use wide characters
269 internally, but doing this with good performance is a bit
272 /* Table of blanks. */
273 static bool blanks[UCHAR_LIM];
275 /* Table of non-printing characters. */
276 static bool nonprinting[UCHAR_LIM];
278 /* Table of non-dictionary characters (not letters, digits, or blanks). */
279 static bool nondictionary[UCHAR_LIM];
281 /* Translation table folding lower case to upper. */
282 static char fold_toupper[UCHAR_LIM];
284 #define MONTHS_PER_YEAR 12
286 /* Table mapping month names to integers.
287 Alphabetic order allows binary search. */
288 static struct month monthtab[] =
304 /* During the merge phase, the number of files to merge at once. */
305 #define NMERGE_DEFAULT 16
307 /* Minimum size for a merge or check buffer. */
308 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
310 /* Minimum sort size; the code might not work with smaller sizes. */
311 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
313 /* The number of bytes needed for a merge or check buffer, which can
314 function relatively efficiently even if it holds only one line. If
315 a longer line is seen, this value is increased. */
316 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
318 /* The approximate maximum number of bytes of main memory to use, as
319 specified by the user. Zero if the user has not specified a size. */
320 static size_t sort_size;
322 /* The guessed size for non-regular files. */
323 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
325 /* Array of directory names in which any temporary files are to be created. */
326 static char const **temp_dirs;
328 /* Number of temporary directory names used. */
329 static size_t temp_dir_count;
331 /* Number of allocated slots in temp_dirs. */
332 static size_t temp_dir_alloc;
334 /* Flag to reverse the order of all comparisons. */
337 /* Flag for stable sort. This turns off the last ditch bytewise
338 comparison of lines, and instead leaves lines in the same order
339 they were read if all keys compare equal. */
342 /* If TAB has this value, blanks separate fields. */
343 enum { TAB_DEFAULT = CHAR_MAX + 1 };
345 /* Tab character separating fields. If TAB_DEFAULT, then fields are
346 separated by the empty string between a non-blank character and a blank
348 static int tab = TAB_DEFAULT;
350 /* Flag to remove consecutive duplicate lines from the output.
351 Only the last of a sequence of equal lines will be output. */
354 /* Nonzero if any of the input files are the standard input. */
355 static bool have_read_stdin;
357 /* List of key field comparisons to be tried. */
358 static struct keyfield *keylist;
360 /* Program used to (de)compress temp files. Must accept -d. */
361 static char const *compress_program;
363 /* Annotate the output with extra info to aid the user. */
366 /* Maximum number of files to merge in one go. If more than this
367 number are present, temp files will be used. */
368 static unsigned int nmerge = NMERGE_DEFAULT;
370 /* Report MESSAGE for FILE, then clean up and exit.
371 If FILE is null, it represents standard output. */
373 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
375 die (char const *message, char const *file)
377 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
384 if (status != EXIT_SUCCESS)
385 fprintf (stderr, _("Try `%s --help' for more information.\n"),
390 Usage: %s [OPTION]... [FILE]...\n\
391 or: %s [OPTION]... --files0-from=F\n\
393 program_name, program_name);
395 Write sorted concatenation of all FILE(s) to standard output.\n\
399 Mandatory arguments to long options are mandatory for short options too.\n\
406 -b, --ignore-leading-blanks ignore leading blanks\n\
407 -d, --dictionary-order consider only blanks and alphanumeric characters\
409 -f, --ignore-case fold lower case to upper case characters\n\
412 -g, --general-numeric-sort compare according to general numerical value\n\
413 -i, --ignore-nonprinting consider only printable characters\n\
414 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
417 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
420 -n, --numeric-sort compare according to string numerical value\n\
421 -R, --random-sort sort by random hash of keys\n\
422 --random-source=FILE get random bytes from FILE\n\
423 -r, --reverse reverse the result of comparisons\n\
426 --sort=WORD sort according to WORD:\n\
427 general-numeric -g, human-numeric -h, month -M,\
429 numeric -n, random -R, version -V\n\
430 -V, --version-sort natural sort of (version) numbers within text\n\
438 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
439 for more use temp files\n\
442 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
443 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
445 --compress-program=PROG compress temporaries with PROG;\n\
446 decompress them with PROG -d\n\
449 --debug annotate the part of the line used to sort,\n\
450 and warn about questionable usage to stderr\n\
451 --files0-from=F read input from the files specified by\n\
452 NUL-terminated names in file F;\n\
453 If F is - then read names from standard input\n\
456 -k, --key=POS1[,POS2] start a key at POS1 (origin 1), end it at POS2\n\
457 (default end of line). See POS syntax below\n\
458 -m, --merge merge already sorted files; do not sort\n\
461 -o, --output=FILE write result to FILE instead of standard output\n\
462 -s, --stable stabilize sort by disabling last-resort comparison\
464 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
467 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
468 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
469 multiple options specify multiple directories\n\
470 --parallel=N change the number of sorts run concurrently to N\n\
471 -u, --unique with -c, check for strict ordering;\n\
472 without -c, output only the first of an equal run\
476 -z, --zero-terminated end lines with 0 byte, not newline\n\
478 fputs (HELP_OPTION_DESCRIPTION, stdout);
479 fputs (VERSION_OPTION_DESCRIPTION, stdout);
482 POS is F[.C][OPTS], where F is the field number and C the character position\n\
483 in the field; both are origin 1. If neither -t nor -b is in effect, characters\
485 in a field are counted from the beginning of the preceding whitespace. OPTS is\
487 one or more single-letter ordering options, which override global ordering\n\
488 options for that key. If no key is given, use the entire line as the key.\n\
490 SIZE may be followed by the following multiplicative suffixes:\n\
493 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
495 With no FILE, or when FILE is -, read standard input.\n\
498 The locale specified by the environment affects sort order.\n\
499 Set LC_ALL=C to get the traditional sort order that uses\n\
500 native byte values.\n\
502 emit_ancillary_info ();
508 /* For long options that have no equivalent short option, use a
509 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
512 CHECK_OPTION = CHAR_MAX + 1,
513 COMPRESS_PROGRAM_OPTION,
514 DEBUG_PROGRAM_OPTION,
517 RANDOM_SOURCE_OPTION,
522 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
524 static struct option const long_options[] =
526 {"ignore-leading-blanks", no_argument, NULL, 'b'},
527 {"check", optional_argument, NULL, CHECK_OPTION},
528 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
529 {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
530 {"dictionary-order", no_argument, NULL, 'd'},
531 {"ignore-case", no_argument, NULL, 'f'},
532 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
533 {"general-numeric-sort", no_argument, NULL, 'g'},
534 {"ignore-nonprinting", no_argument, NULL, 'i'},
535 {"key", required_argument, NULL, 'k'},
536 {"merge", no_argument, NULL, 'm'},
537 {"month-sort", no_argument, NULL, 'M'},
538 {"numeric-sort", no_argument, NULL, 'n'},
539 {"human-numeric-sort", no_argument, NULL, 'h'},
540 {"version-sort", no_argument, NULL, 'V'},
541 {"random-sort", no_argument, NULL, 'R'},
542 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
543 {"sort", required_argument, NULL, SORT_OPTION},
544 {"output", required_argument, NULL, 'o'},
545 {"reverse", no_argument, NULL, 'r'},
546 {"stable", no_argument, NULL, 's'},
547 {"batch-size", required_argument, NULL, NMERGE_OPTION},
548 {"buffer-size", required_argument, NULL, 'S'},
549 {"field-separator", required_argument, NULL, 't'},
550 {"temporary-directory", required_argument, NULL, 'T'},
551 {"unique", no_argument, NULL, 'u'},
552 {"zero-terminated", no_argument, NULL, 'z'},
553 {"parallel", required_argument, NULL, PARALLEL_OPTION},
554 {GETOPT_HELP_OPTION_DECL},
555 {GETOPT_VERSION_OPTION_DECL},
559 #define CHECK_TABLE \
561 _ct_("silent", 'C') \
562 _ct_("diagnose-first", 'c')
564 static char const *const check_args[] =
566 #define _ct_(_s, _c) _s,
570 static char const check_types[] =
572 #define _ct_(_s, _c) _c,
578 _st_("general-numeric", 'g') \
579 _st_("human-numeric", 'h') \
581 _st_("numeric", 'n') \
582 _st_("random", 'R') \
585 static char const *const sort_args[] =
587 #define _st_(_s, _c) _s,
591 static char const sort_types[] =
593 #define _st_(_s, _c) _c,
598 /* The set of signals that are caught. */
599 static sigset_t caught_signals;
601 /* Critical section status. */
608 /* Enter a critical section. */
609 static struct cs_status
612 struct cs_status status;
613 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
617 /* Leave a critical section. */
619 cs_leave (struct cs_status status)
623 /* Ignore failure when restoring the signal mask. */
624 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
628 /* Possible states for a temp file. If compressed, the file's status
629 is unreaped or reaped, depending on whether 'sort' has waited for
630 the subprocess to finish. */
631 enum { UNCOMPRESSED, UNREAPED, REAPED };
633 /* The list of temporary files. */
636 struct tempnode *volatile next;
637 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
639 char name[1]; /* Actual size is 1 + file name length. */
641 static struct tempnode *volatile temphead;
642 static struct tempnode *volatile *temptail = &temphead;
644 /* A file to be sorted. */
647 /* The file's name. */
650 /* Nonnull if this is a temporary file, in which case NAME == TEMP->name. */
651 struct tempnode *temp;
654 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
655 static Hash_table *proctab;
657 enum { INIT_PROCTAB_SIZE = 47 };
660 proctab_hasher (void const *entry, size_t tabsize)
662 struct tempnode const *node = entry;
663 return node->pid % tabsize;
667 proctab_comparator (void const *e1, void const *e2)
669 struct tempnode const *n1 = e1;
670 struct tempnode const *n2 = e2;
671 return n1->pid == n2->pid;
674 /* The number of unreaped child processes. */
677 static bool delete_proc (pid_t);
679 /* If PID is positive, wait for the child process with that PID to
680 exit, and assume that PID has already been removed from the process
681 table. If PID is 0 or -1, clean up some child that has exited (by
682 waiting for it, and removing it from the proc table) and return the
683 child's process ID. However, if PID is 0 and no children have
684 exited, return 0 without waiting. */
690 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
693 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
695 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
697 if (! WIFEXITED (status) || WEXITSTATUS (status))
698 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
706 /* TEMP represents a new process; add it to the process table. Create
707 the process table the first time it's called. */
710 register_proc (struct tempnode *temp)
714 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
722 temp->state = UNREAPED;
724 if (! hash_insert (proctab, temp))
728 /* If PID is in the process table, remove it and return true.
729 Otherwise, return false. */
732 delete_proc (pid_t pid)
734 struct tempnode test;
737 struct tempnode *node = hash_delete (proctab, &test);
740 node->state = REAPED;
744 /* Remove PID from the process table, and wait for it to exit if it
748 wait_proc (pid_t pid)
750 if (delete_proc (pid))
754 /* Reap any exited children. Do not block; reap only those that have
760 while (0 < nprocs && reap (0))
764 /* Reap at least one exited child, waiting if necessary. */
773 /* Reap all children, waiting if necessary. */
782 /* Clean up any remaining temporary files. */
787 struct tempnode const *node;
789 for (node = temphead; node; node = node->next)
794 /* Cleanup actions to take when exiting. */
801 /* Clean up any remaining temporary files in a critical section so
802 that a signal handler does not try to clean them too. */
803 struct cs_status cs = cs_enter ();
811 /* Create a new temporary file, returning its newly allocated tempnode.
812 Store into *PFD the file descriptor open for writing.
813 If the creation fails, return NULL and store -1 into *PFD if the
814 failure is due to file descriptor exhaustion and
815 SURVIVE_FD_EXHAUSTION; otherwise, die. */
817 static struct tempnode *
818 create_temp_file (int *pfd, bool survive_fd_exhaustion)
820 static char const slashbase[] = "/sortXXXXXX";
821 static size_t temp_dir_index;
824 char const *temp_dir = temp_dirs[temp_dir_index];
825 size_t len = strlen (temp_dir);
826 struct tempnode *node =
827 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
828 char *file = node->name;
831 memcpy (file, temp_dir, len);
832 memcpy (file + len, slashbase, sizeof slashbase);
834 if (++temp_dir_index == temp_dir_count)
837 /* Create the temporary file in a critical section, to avoid races. */
843 temptail = &node->next;
851 if (! (survive_fd_exhaustion && errno == EMFILE))
852 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
862 /* Return a stream for FILE, opened with mode HOW. A null FILE means
863 standard output; HOW should be "w". When opening for input, "-"
864 means standard input. To avoid confusion, do not return file
865 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
866 opening an ordinary FILE. Return NULL if unsuccessful.
868 fadvise() is used to specify an access pattern for input files.
869 There are a few hints we could possibly provide,
870 and after careful testing it was decided that
871 specifying POSIX_FADV_SEQUENTIAL was not detrimental
872 to any cases. On Linux 2.6.31, this option doubles
873 the size of read ahead performed and thus was seen to
876 Sorting with a smaller internal buffer
877 Reading from faster flash devices
879 In _addition_ one could also specify other hints...
881 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
882 at least uses that to _synchronously_ prepopulate the cache
883 with the specified range. While sort does need to
884 read all of its input before outputting, a synchronous
885 read of the whole file up front precludes any processing
886 that sort could do in parallel with the system doing
887 read ahead of the data. This was seen to have negative effects
888 in a couple of cases:
890 Sorting with a smaller internal buffer
891 Note this option was seen to shorten the runtime for sort
892 on a multicore system with lots of RAM and other processes
893 competing for CPU. It could be argued that more explicit
894 scheduling hints with `nice` et. al. are more appropriate
897 POSIX_FADV_NOREUSE is a possibility as it could lower
898 the priority of input data in the cache as sort will
899 only need to process it once. However its functionality
900 has changed over Linux kernel versions and as of 2.6.31
901 it does nothing and thus we can't depend on what it might
904 POSIX_FADV_DONTNEED is not appropriate for user specified
905 input files, but for temp files we do want to drop the
906 cache immediately after processing. This is done implicitly
907 however when the files are unlinked. */
910 stream_open (char const *file, char const *how)
917 if (STREQ (file, "-"))
919 have_read_stdin = true;
923 fp = fopen (file, how);
924 fadvise (fp, FADVISE_SEQUENTIAL);
927 return fopen (file, how);
930 /* Same as stream_open, except always return a non-null value; die on
934 xfopen (char const *file, char const *how)
936 FILE *fp = stream_open (file, how);
938 die (_("open failed"), file);
942 /* Close FP, whose name is FILE, and report any errors. */
945 xfclose (FILE *fp, char const *file)
950 /* Allow reading stdin from tty more than once. */
956 /* Don't close stdout just yet. close_stdout does that. */
957 if (fflush (fp) != 0)
958 die (_("fflush failed"), file);
962 if (fclose (fp) != 0)
963 die (_("close failed"), file);
969 dup2_or_die (int oldfd, int newfd)
971 if (dup2 (oldfd, newfd) < 0)
972 error (SORT_FAILURE, errno, _("dup2 failed"));
975 /* Fork a child process for piping to and do common cleanup. The
976 TRIES parameter tells us how many times to try to fork before
977 giving up. Return the PID of the child, or -1 (setting errno)
981 pipe_fork (int pipefds[2], size_t tries)
983 #if HAVE_WORKING_FORK
984 struct tempnode *saved_temphead;
986 double wait_retry = 0.25;
987 pid_t pid IF_LINT ( = -1);
990 if (pipe (pipefds) < 0)
993 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
994 uncontrolled subprocess generation can hurt performance significantly.
995 Allow at most NMERGE + 2 subprocesses, on the theory that there
996 may be some useful parallelism by letting compression for the
997 previous merge finish (1 subprocess) in parallel with the current
998 merge (NMERGE + 1 subprocesses). */
1000 if (nmerge + 1 < nprocs)
1005 /* This is so the child process won't delete our temp files
1006 if it receives a signal before exec-ing. */
1008 saved_temphead = temphead;
1012 saved_errno = errno;
1014 temphead = saved_temphead;
1017 errno = saved_errno;
1019 if (0 <= pid || errno != EAGAIN)
1023 xnanosleep (wait_retry);
1031 saved_errno = errno;
1034 errno = saved_errno;
1038 close (STDIN_FILENO);
1039 close (STDOUT_FILENO);
1046 #else /* ! HAVE_WORKING_FORK */
1051 /* Create a temporary file and, if asked for, start a compressor
1052 to that file. Set *PFP to the file handle and return
1053 the address of the new temp node. If the creation
1054 fails, return NULL if the failure is due to file descriptor
1055 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1057 static struct tempnode *
1058 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1061 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1065 node->state = UNCOMPRESSED;
1067 if (compress_program)
1071 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1076 tempfd = pipefds[1];
1078 register_proc (node);
1080 else if (node->pid == 0)
1083 dup2_or_die (tempfd, STDOUT_FILENO);
1085 dup2_or_die (pipefds[0], STDIN_FILENO);
1088 if (execlp (compress_program, compress_program, (char *) NULL) < 0)
1089 error (SORT_FAILURE, errno, _("couldn't execute %s"),
1094 *pfp = fdopen (tempfd, "w");
1096 die (_("couldn't create temporary file"), node->name);
1101 /* Create a temporary file and, if asked for, start a compressor
1102 to that file. Set *PFP to the file handle and return the address
1103 of the new temp node. Die on failure. */
1105 static struct tempnode *
1106 create_temp (FILE **pfp)
1108 return maybe_create_temp (pfp, false);
1111 /* Open a compressed temp file and start a decompression process through
1112 which to filter the input. Return NULL (setting errno to
1113 EMFILE) if we ran out of file descriptors, and die on any other
1117 open_temp (struct tempnode *temp)
1119 int tempfd, pipefds[2];
1122 if (temp->state == UNREAPED)
1123 wait_proc (temp->pid);
1125 tempfd = open (temp->name, O_RDONLY);
1129 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1134 if (errno != EMFILE)
1135 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1143 dup2_or_die (tempfd, STDIN_FILENO);
1145 dup2_or_die (pipefds[1], STDOUT_FILENO);
1148 execlp (compress_program, compress_program, "-d", (char *) NULL);
1149 error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1154 register_proc (temp);
1158 fp = fdopen (pipefds[0], "r");
1161 int saved_errno = errno;
1163 errno = saved_errno;
1171 /* Append DIR to the array of temporary directory names. */
1173 add_temp_dir (char const *dir)
1175 if (temp_dir_count == temp_dir_alloc)
1176 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1178 temp_dirs[temp_dir_count++] = dir;
1181 /* Remove NAME from the list of temporary files. */
1184 zaptemp (char const *name)
1186 struct tempnode *volatile *pnode;
1187 struct tempnode *node;
1188 struct tempnode *next;
1190 int unlink_errno = 0;
1191 struct cs_status cs;
1193 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1196 if (node->state == UNREAPED)
1197 wait_proc (node->pid);
1199 /* Unlink the temporary file in a critical section to avoid races. */
1202 unlink_status = unlink (name);
1203 unlink_errno = errno;
1207 if (unlink_status != 0)
1208 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1214 #if HAVE_NL_LANGINFO
1217 struct_month_cmp (void const *m1, void const *m2)
1219 struct month const *month1 = m1;
1220 struct month const *month2 = m2;
1221 return strcmp (month1->name, month2->name);
1226 /* Initialize the character class tables. */
1233 for (i = 0; i < UCHAR_LIM; ++i)
1235 blanks[i] = !! isblank (i);
1236 nonprinting[i] = ! isprint (i);
1237 nondictionary[i] = ! isalnum (i) && ! isblank (i);
1238 fold_toupper[i] = toupper (i);
1241 #if HAVE_NL_LANGINFO
1242 /* If we're not in the "C" locale, read different names for months. */
1245 for (i = 0; i < MONTHS_PER_YEAR; i++)
1252 s = nl_langinfo (ABMON_1 + i);
1254 monthtab[i].name = name = xmalloc (s_len + 1);
1255 monthtab[i].val = i + 1;
1257 for (j = k = 0; j < s_len; j++)
1258 if (! isblank (to_uchar (s[j])))
1259 name[k++] = fold_toupper[to_uchar (s[j])];
1262 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1267 /* Specify how many inputs may be merged at once.
1268 This may be set on the command-line with the
1269 --batch-size option. */
1271 specify_nmerge (int oi, char c, char const *s)
1274 struct rlimit rlimit;
1275 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1277 /* Try to find out how many file descriptors we'll be able
1278 to open. We need at least nmerge + 3 (STDIN_FILENO,
1279 STDOUT_FILENO and STDERR_FILENO). */
1280 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1285 if (e == LONGINT_OK)
1289 e = LONGINT_OVERFLOW;
1294 error (0, 0, _("invalid --%s argument %s"),
1295 long_options[oi].name, quote (s));
1296 error (SORT_FAILURE, 0,
1297 _("minimum --%s argument is %s"),
1298 long_options[oi].name, quote ("2"));
1300 else if (max_nmerge < nmerge)
1302 e = LONGINT_OVERFLOW;
1309 if (e == LONGINT_OVERFLOW)
1311 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1312 error (0, 0, _("--%s argument %s too large"),
1313 long_options[oi].name, quote (s));
1314 error (SORT_FAILURE, 0,
1315 _("maximum --%s argument with current rlimit is %s"),
1316 long_options[oi].name,
1317 uinttostr (max_nmerge, max_nmerge_buf));
1320 xstrtol_fatal (e, oi, c, long_options, s);
1323 /* Specify the amount of main memory to use when sorting. */
1325 specify_sort_size (int oi, char c, char const *s)
1329 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1331 /* The default unit is KiB. */
1332 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1334 if (n <= UINTMAX_MAX / 1024)
1337 e = LONGINT_OVERFLOW;
1340 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1341 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1350 double mem = physmem_total () * n / 100;
1352 /* Use "<", not "<=", to avoid problems with rounding. */
1353 if (mem < UINTMAX_MAX)
1359 e = LONGINT_OVERFLOW;
1364 if (e == LONGINT_OK)
1366 /* If multiple sort sizes are specified, take the maximum, so
1367 that option order does not matter. */
1374 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1378 e = LONGINT_OVERFLOW;
1381 xstrtol_fatal (e, oi, c, long_options, s);
1384 /* Specify the number of threads to spawn during internal sort. */
1386 specify_nthreads (int oi, char c, char const *s)
1388 unsigned long int nthreads;
1389 enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1390 if (e == LONGINT_OVERFLOW)
1392 if (e != LONGINT_OK)
1393 xstrtol_fatal (e, oi, c, long_options, s);
1394 if (SIZE_MAX < nthreads)
1395 nthreads = SIZE_MAX;
1397 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1402 /* Return the default sort size. */
1404 default_sort_size (void)
1406 /* Let MEM be available memory or 1/8 of total memory, whichever
1408 double avail = physmem_available ();
1409 double total = physmem_total ();
1410 double mem = MAX (avail, total / 8);
1411 struct rlimit rlimit;
1413 /* Let SIZE be MEM, but no more than the maximum object size or
1414 system resource limits. Avoid the MIN macro here, as it is not
1415 quite right when only one argument is floating point. Don't
1416 bother to check for values like RLIM_INFINITY since in practice
1417 they are not much less than SIZE_MAX. */
1418 size_t size = SIZE_MAX;
1421 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1422 size = rlimit.rlim_cur;
1424 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1425 size = rlimit.rlim_cur;
1428 /* Leave a large safety margin for the above limits, as failure can
1429 occur when they are exceeded. */
1433 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1434 Exceeding RSS is not fatal, but can be quite slow. */
1435 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1436 size = rlimit.rlim_cur / 16 * 15;
1439 /* Use no less than the minimum. */
1440 return MAX (size, MIN_SORT_SIZE);
1443 /* Return the sort buffer size to use with the input files identified
1444 by FPS and FILES, which are alternate names of the same files.
1445 NFILES gives the number of input files; NFPS may be less. Assume
1446 that each input line requires LINE_BYTES extra bytes' worth of line
1447 information. Do not exceed the size bound specified by the user
1448 (or a default size bound, if the user does not specify one). */
1451 sort_buffer_size (FILE *const *fps, size_t nfps,
1452 char *const *files, size_t nfiles,
1455 /* A bound on the input size. If zero, the bound hasn't been
1457 static size_t size_bound;
1459 /* In the worst case, each input byte is a newline. */
1460 size_t worst_case_per_input_byte = line_bytes + 1;
1462 /* Keep enough room for one extra input line and an extra byte.
1463 This extra room might be needed when preparing to read EOF. */
1464 size_t size = worst_case_per_input_byte + 1;
1468 for (i = 0; i < nfiles; i++)
1474 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1475 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1476 : stat (files[i], &st))
1478 die (_("stat failed"), files[i]);
1480 if (S_ISREG (st.st_mode))
1481 file_size = st.st_size;
1484 /* The file has unknown size. If the user specified a sort
1485 buffer size, use that; otherwise, guess the size. */
1488 file_size = INPUT_FILE_SIZE_GUESS;
1493 size_bound = sort_size;
1495 size_bound = default_sort_size ();
1498 /* Add the amount of memory needed to represent the worst case
1499 where the input consists entirely of newlines followed by a
1500 single non-newline. Check for overflow. */
1501 worst_case = file_size * worst_case_per_input_byte + 1;
1502 if (file_size != worst_case / worst_case_per_input_byte
1503 || size_bound - size <= worst_case)
1511 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1512 must be at least sizeof (struct line). Allocate ALLOC bytes
1516 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1518 /* Ensure that the line array is properly aligned. If the desired
1519 size cannot be allocated, repeatedly halve it until allocation
1520 succeeds. The smaller allocation may hurt overall performance,
1521 but that's better than failing. */
1524 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1525 buf->buf = malloc (alloc);
1529 if (alloc <= line_bytes + 1)
1533 buf->line_bytes = line_bytes;
1535 buf->used = buf->left = buf->nlines = 0;
1539 /* Return one past the limit of the line array. */
1541 static inline struct line *
1542 buffer_linelim (struct buffer const *buf)
1544 return (struct line *) (buf->buf + buf->alloc);
1547 /* Return a pointer to the first character of the field specified
1551 begfield (struct line const *line, struct keyfield const *key)
1553 char *ptr = line->text, *lim = ptr + line->length - 1;
1554 size_t sword = key->sword;
1555 size_t schar = key->schar;
1557 /* The leading field separator itself is included in a field when -t
1560 if (tab != TAB_DEFAULT)
1561 while (ptr < lim && sword--)
1563 while (ptr < lim && *ptr != tab)
1569 while (ptr < lim && sword--)
1571 while (ptr < lim && blanks[to_uchar (*ptr)])
1573 while (ptr < lim && !blanks[to_uchar (*ptr)])
1577 /* If we're ignoring leading blanks when computing the Start
1578 of the field, skip past them here. */
1579 if (key->skipsblanks)
1580 while (ptr < lim && blanks[to_uchar (*ptr)])
1583 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1584 ptr = MIN (lim, ptr + schar);
1589 /* Return the limit of (a pointer to the first character after) the field
1590 in LINE specified by KEY. */
1593 limfield (struct line const *line, struct keyfield const *key)
1595 char *ptr = line->text, *lim = ptr + line->length - 1;
1596 size_t eword = key->eword, echar = key->echar;
1599 eword++; /* Skip all of end field. */
1601 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1602 whichever comes first. If there are more than EWORD fields, leave
1603 PTR pointing at the beginning of the field having zero-based index,
1604 EWORD. If a delimiter character was specified (via -t), then that
1605 `beginning' is the first character following the delimiting TAB.
1606 Otherwise, leave PTR pointing at the first `blank' character after
1607 the preceding field. */
1608 if (tab != TAB_DEFAULT)
1609 while (ptr < lim && eword--)
1611 while (ptr < lim && *ptr != tab)
1613 if (ptr < lim && (eword || echar))
1617 while (ptr < lim && eword--)
1619 while (ptr < lim && blanks[to_uchar (*ptr)])
1621 while (ptr < lim && !blanks[to_uchar (*ptr)])
1625 #ifdef POSIX_UNSPECIFIED
1626 /* The following block of code makes GNU sort incompatible with
1627 standard Unix sort, so it's ifdef'd out for now.
1628 The POSIX spec isn't clear on how to interpret this.
1629 FIXME: request clarification.
1631 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1632 Date: Thu, 30 May 96 12:20:41 -0400
1633 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1635 [...]I believe I've found another bug in `sort'.
1640 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1643 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1647 Unix sort produced the answer I expected: sort on the single character
1648 in column 7. GNU sort produced different results, because it disagrees
1649 on the interpretation of the key-end spec "M.N". Unix sort reads this
1650 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1651 "skip M-1 fields, then either N-1 characters or the rest of the current
1652 field, whichever comes first". This extra clause applies only to
1653 key-ends, not key-starts.
1656 /* Make LIM point to the end of (one byte past) the current field. */
1657 if (tab != TAB_DEFAULT)
1660 newlim = memchr (ptr, tab, lim - ptr);
1668 while (newlim < lim && blanks[to_uchar (*newlim)])
1670 while (newlim < lim && !blanks[to_uchar (*newlim)])
1676 if (echar != 0) /* We need to skip over a portion of the end field. */
1678 /* If we're ignoring leading blanks when computing the End
1679 of the field, skip past them here. */
1680 if (key->skipeblanks)
1681 while (ptr < lim && blanks[to_uchar (*ptr)])
1684 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1685 ptr = MIN (lim, ptr + echar);
1691 /* Fill BUF reading from FP, moving buf->left bytes from the end
1692 of buf->buf to the beginning first. If EOF is reached and the
1693 file wasn't terminated by a newline, supply one. Set up BUF's line
1694 table too. FILE is the name of the file corresponding to FP.
1695 Return true if some input was read. */
1698 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1700 struct keyfield const *key = keylist;
1702 size_t line_bytes = buf->line_bytes;
1703 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1708 if (buf->used != buf->left)
1710 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1711 buf->used = buf->left;
1717 char *ptr = buf->buf + buf->used;
1718 struct line *linelim = buffer_linelim (buf);
1719 struct line *line = linelim - buf->nlines;
1720 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1721 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1723 while (line_bytes + 1 < avail)
1725 /* Read as many bytes as possible, but do not read so many
1726 bytes that there might not be enough room for the
1727 corresponding line array. The worst case is when the
1728 rest of the input file consists entirely of newlines,
1729 except that the last byte is not a newline. */
1730 size_t readsize = (avail - 1) / (line_bytes + 1);
1731 size_t bytes_read = fread (ptr, 1, readsize, fp);
1732 char *ptrlim = ptr + bytes_read;
1734 avail -= bytes_read;
1736 if (bytes_read != readsize)
1739 die (_("read failed"), file);
1743 if (buf->buf == ptrlim)
1745 if (line_start != ptrlim && ptrlim[-1] != eol)
1750 /* Find and record each line in the just-read input. */
1751 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1753 /* Delimit the line with NUL. This eliminates the need to
1754 temporarily replace the last byte with NUL when calling
1755 xmemcoll(), which increases performance. */
1759 line->text = line_start;
1760 line->length = ptr - line_start;
1761 mergesize = MAX (mergesize, line->length);
1762 avail -= line_bytes;
1766 /* Precompute the position of the first key for
1768 line->keylim = (key->eword == SIZE_MAX
1770 : limfield (line, key));
1772 if (key->sword != SIZE_MAX)
1773 line->keybeg = begfield (line, key);
1776 if (key->skipsblanks)
1777 while (blanks[to_uchar (*line_start)])
1779 line->keybeg = line_start;
1791 buf->used = ptr - buf->buf;
1792 buf->nlines = buffer_linelim (buf) - line;
1793 if (buf->nlines != 0)
1795 buf->left = ptr - line_start;
1796 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1801 /* The current input line is too long to fit in the buffer.
1802 Double the buffer size and try again, keeping it properly
1804 size_t line_alloc = buf->alloc / sizeof (struct line);
1805 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1806 buf->alloc = line_alloc * sizeof (struct line);
1811 /* Table that maps characters to order-of-magnitude values. */
1812 static char const unit_order[UCHAR_LIM] =
1814 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1815 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1816 /* This initializer syntax works on all C99 hosts. For now, use
1817 it only on non-ASCII hosts, to ease the pain of porting to
1818 pre-C99 ASCII hosts. */
1819 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1822 /* Generate the following table with this command:
1823 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1824 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1826 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1827 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1828 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1829 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1830 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1831 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1832 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1833 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1834 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1835 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1836 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1840 /* Return an integer that represents the order of magnitude of the
1841 unit following the number. The number may contain thousands
1842 separators and a decimal point, but it may not contain leading blanks.
1843 Negative numbers get negative orders; zero numbers have a zero order. */
1846 find_unit_order (char const *number)
1848 bool minus_sign = (*number == '-');
1849 char const *p = number + minus_sign;
1853 /* Scan to end of number.
1854 Decimals or separators not followed by digits stop the scan.
1855 Numbers ending in decimals or separators are thus considered
1856 to be lacking in units.
1857 FIXME: add support for multibyte thousands_sep and decimal_point. */
1861 while (ISDIGIT (ch = *p++))
1862 nonzero |= ch - '0';
1864 while (ch == thousands_sep);
1866 if (ch == decimal_point)
1867 while (ISDIGIT (ch = *p++))
1868 nonzero |= ch - '0';
1872 int order = unit_order[ch];
1873 return (minus_sign ? -order : order);
1879 /* Compare numbers A and B ending in units with SI or IEC prefixes
1880 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1883 human_numcompare (char const *a, char const *b)
1885 while (blanks[to_uchar (*a)])
1887 while (blanks[to_uchar (*b)])
1890 int diff = find_unit_order (a) - find_unit_order (b);
1891 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1894 /* Compare strings A and B as numbers without explicitly converting them to
1895 machine numbers. Comparatively slow for short strings, but asymptotically
1899 numcompare (char const *a, char const *b)
1901 while (blanks[to_uchar (*a)])
1903 while (blanks[to_uchar (*b)])
1906 return strnumcmp (a, b, decimal_point, thousands_sep);
1910 general_numcompare (char const *sa, char const *sb)
1912 /* FIXME: maybe add option to try expensive FP conversion
1913 only if A and B can't be compared more cheaply/accurately. */
1917 long_double a = strtold (sa, &ea);
1918 long_double b = strtold (sb, &eb);
1920 /* Put conversion errors at the start of the collating sequence. */
1922 return sb == eb ? 0 : -1;
1926 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1927 conversion errors but before numbers; sort them by internal
1928 bit-pattern, for lack of a more portable alternative. */
1934 : memcmp (&a, &b, sizeof a));
1937 /* Return an integer in 1..12 of the month name MONTH.
1938 Return 0 if the name in S is not recognized. */
1941 getmonth (char const *month, char **ea)
1944 size_t hi = MONTHS_PER_YEAR;
1946 while (blanks[to_uchar (*month)])
1951 size_t ix = (lo + hi) / 2;
1952 char const *m = month;
1953 char const *n = monthtab[ix].name;
1961 return monthtab[ix].val;
1963 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
1968 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
1980 /* A randomly chosen MD5 state, used for random comparison. */
1981 static struct md5_ctx random_md5_state;
1983 /* Initialize the randomly chosen MD5 state. */
1986 random_md5_state_init (char const *random_source)
1988 unsigned char buf[MD5_DIGEST_SIZE];
1989 struct randread_source *r = randread_new (random_source, sizeof buf);
1991 die (_("open failed"), random_source);
1992 randread (r, buf, sizeof buf);
1993 if (randread_free (r) != 0)
1994 die (_("close failed"), random_source);
1995 md5_init_ctx (&random_md5_state);
1996 md5_process_bytes (buf, sizeof buf, &random_md5_state);
1999 /* This is like strxfrm, except it reports any error and exits. */
2002 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2005 size_t translated_size = strxfrm (dest, src, destsize);
2009 error (0, errno, _("string transformation failed"));
2010 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2011 error (SORT_FAILURE, 0,
2012 _("the untransformed string was %s"),
2013 quotearg_n_style (0, locale_quoting_style, src));
2016 return translated_size;
2019 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2020 using one or more random hash functions. TEXTA[LENA] and
2021 TEXTB[LENB] must be zero. */
2024 compare_random (char *restrict texta, size_t lena,
2025 char *restrict textb, size_t lenb)
2027 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2028 data. This is used to break ties if there is an checksum
2029 collision, and this is good enough given the astronomically low
2030 probability of a collision. */
2033 char stackbuf[4000];
2034 char *buf = stackbuf;
2035 size_t bufsize = sizeof stackbuf;
2036 void *allocated = NULL;
2037 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2038 struct md5_ctx s[2];
2039 s[0] = s[1] = random_md5_state;
2041 if (hard_LC_COLLATE)
2043 char const *lima = texta + lena;
2044 char const *limb = textb + lenb;
2048 /* Transform the text into the basis of comparison, so that byte
2049 strings that would otherwise considered to be equal are
2050 considered equal here even if their bytes differ.
2052 Each time through this loop, transform one
2053 null-terminated string's worth from TEXTA or from TEXTB
2054 or both. That way, there's no need to store the
2055 transformation of the whole line, if it contains many
2056 null-terminated strings. */
2058 /* Store the transformed data into a big-enough buffer. */
2060 /* A 3X size guess avoids the overhead of calling strxfrm
2061 twice on typical implementations. Don't worry about
2062 size_t overflow, as the guess need not be correct. */
2063 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2064 if (bufsize < guess_bufsize)
2066 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2068 buf = allocated = malloc (bufsize);
2072 bufsize = sizeof stackbuf;
2077 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2078 bool a_fits = sizea <= bufsize;
2081 ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
2082 (a_fits ? bufsize - sizea : 0))
2086 if (! (a_fits && sizea + sizeb <= bufsize))
2088 bufsize = sizea + sizeb;
2089 if (bufsize < SIZE_MAX / 3)
2090 bufsize = bufsize * 3 / 2;
2092 buf = allocated = xmalloc (bufsize);
2094 strxfrm (buf, texta, sizea);
2096 strxfrm (buf + sizea, textb, sizeb);
2099 /* Advance past NULs to the next part of each input string,
2100 exiting the loop if both strings are exhausted. When
2101 exiting the loop, prepare to finish off the tiebreaker
2102 comparison properly. */
2104 texta += strlen (texta) + 1;
2106 textb += strlen (textb) + 1;
2107 if (! (texta < lima || textb < limb))
2109 lena = sizea; texta = buf;
2110 lenb = sizeb; textb = buf + sizea;
2114 /* Accumulate the transformed data in the corresponding
2116 md5_process_bytes (buf, sizea, &s[0]);
2117 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2119 /* Update the tiebreaker comparison of the transformed data. */
2122 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2124 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2129 /* Compute and compare the checksums. */
2130 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2131 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2132 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2134 /* Fall back on the tiebreaker if the checksums collide. */
2139 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2141 xfrm_diff = (lena > lenb) - (lena < lenb);
2152 /* Return the printable width of the block of memory starting at
2153 TEXT and ending just before LIM, counting each tab as one byte.
2154 FIXME: Should we generally be counting non printable chars? */
2157 debug_width (char const *text, char const *lim)
2159 size_t width = mbsnwidth (text, lim - text, 0);
2161 width += (*text++ == '\t');
2165 /* For debug mode, "underline" a key at the
2166 specified offset and screen width. */
2169 mark_key (size_t offset, size_t width)
2175 printf (_("^ no match for key\n"));
2186 /* Return true if KEY is a numeric key. */
2189 key_numeric (struct keyfield const *key)
2191 return key->numeric || key->general_numeric || key->human_numeric;
2194 /* For LINE, output a debugging line that underlines KEY in LINE.
2195 If KEY is null, underline the whole line. */
2198 debug_key (struct line const *line, struct keyfield const *key)
2200 char *text = line->text;
2202 char *lim = text + line->length - 1;
2206 if (key->sword != SIZE_MAX)
2207 beg = begfield (line, key);
2208 if (key->eword != SIZE_MAX)
2209 lim = limfield (line, key);
2211 if (key->skipsblanks || key->month || key_numeric (key))
2216 while (blanks[to_uchar (*beg)])
2219 char *tighter_lim = beg;
2223 else if (key->month)
2224 getmonth (beg, &tighter_lim);
2225 else if (key->general_numeric)
2226 ignore_value (strtold (beg, &tighter_lim));
2227 else if (key->numeric || key->human_numeric)
2229 char *p = beg + (beg < lim && *beg == '-');
2230 bool found_digit = false;
2235 while (ISDIGIT (ch = *p++))
2238 while (ch == thousands_sep);
2240 if (ch == decimal_point)
2241 while (ISDIGIT (ch = *p++))
2245 tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
2255 size_t offset = debug_width (text, beg);
2256 size_t width = debug_width (beg, lim);
2257 mark_key (offset, width);
2260 /* Debug LINE by underlining its keys. */
2263 debug_line (struct line const *line)
2265 struct keyfield const *key = keylist;
2268 debug_key (line, key);
2269 while (key && ((key = key->next) || ! (unique || stable)));
2272 /* Return whether sorting options specified for key. */
2275 default_key_compare (struct keyfield const *key)
2277 return ! (key->ignore
2281 || key_numeric (key)
2285 /* || key->reverse */
2289 /* Convert a key to the short options used to specify it. */
2292 key_to_opts (struct keyfield const *key, char *opts)
2294 if (key->skipsblanks || key->skipeblanks)
2295 *opts++ = 'b';/* either disables global -b */
2296 if (key->ignore == nondictionary)
2300 if (key->general_numeric)
2302 if (key->human_numeric)
2304 if (key->ignore == nonprinting)
2319 /* Output data independent key warnings to stderr. */
2322 key_warnings (struct keyfield const *gkey, bool gkey_only)
2324 struct keyfield const *key;
2325 struct keyfield ugkey = *gkey;
2326 unsigned long keynum = 1;
2328 for (key = keylist; key; key = key->next, keynum++)
2330 if (key->obsolete_used)
2332 size_t sword = key->sword;
2333 size_t eword = key->eword;
2334 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2335 /* obsolescent syntax +A.x -B.y is equivalent to:
2336 -k A+1.x+1,B.y (when y = 0)
2337 -k A+1.x+1,B+1.y (when y > 0) */
2338 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2339 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2343 if (sword == SIZE_MAX)
2346 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2347 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2348 if (key->eword != SIZE_MAX)
2350 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2351 stpcpy (stpcpy (pn, ","),
2352 umaxtostr (eword + 1
2353 + (key->echar == SIZE_MAX), tmp));
2355 error (0, 0, _("obsolescent key `%s' used; consider `%s' instead"),
2359 /* Warn about field specs that will never match. */
2360 if (key->sword != SIZE_MAX && key->eword < key->sword)
2361 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2363 /* Warn about significant leading blanks. */
2364 bool implicit_skip = key_numeric (key) || key->month;
2365 bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
2366 && !(key->schar || key->echar);
2367 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2368 if (!gkey_only && tab == TAB_DEFAULT && !line_offset
2369 && ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
2370 || (!key->skipsblanks && key->schar)
2371 || (!key->skipeblanks && key->echar)))
2372 error (0, 0, _("leading blanks are significant in key %lu; "
2373 "consider also specifying `b'"), keynum);
2375 /* Warn about numeric comparisons spanning fields,
2376 as field delimiters could be interpreted as part
2377 of the number (maybe only in other locales). */
2378 if (!gkey_only && key_numeric (key))
2380 size_t sword = key->sword + 1;
2381 size_t eword = key->eword + 1;
2385 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2389 /* Flag global options not copied or specified in any key. */
2390 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2391 ugkey.ignore = NULL;
2392 if (ugkey.translate && (ugkey.translate == key->translate))
2393 ugkey.translate = NULL;
2394 ugkey.skipsblanks &= !key->skipsblanks;
2395 ugkey.skipeblanks &= !key->skipeblanks;
2396 ugkey.month &= !key->month;
2397 ugkey.numeric &= !key->numeric;
2398 ugkey.general_numeric &= !key->general_numeric;
2399 ugkey.human_numeric &= !key->human_numeric;
2400 ugkey.random &= !key->random;
2401 ugkey.version &= !key->version;
2402 ugkey.reverse &= !key->reverse;
2405 /* Warn about ignored global options flagged above.
2406 Note if gkey is the only one in the list, all flags are cleared. */
2407 if (!default_key_compare (&ugkey)
2408 || (ugkey.reverse && (stable || unique) && keylist))
2410 bool ugkey_reverse = ugkey.reverse;
2411 if (!(stable || unique))
2412 ugkey.reverse = false;
2413 /* The following is too big, but guaranteed to be "big enough". */
2414 char opts[sizeof short_options];
2415 key_to_opts (&ugkey, opts);
2417 ngettext ("option `-%s' is ignored",
2418 "options `-%s' are ignored",
2419 select_plural (strlen (opts))), opts);
2420 ugkey.reverse = ugkey_reverse;
2422 if (ugkey.reverse && !(stable || unique) && keylist)
2423 error (0, 0, _("option `-r' only applies to last-resort comparison"));
2426 /* Compare two lines A and B trying every key in sequence until there
2427 are no more keys or a difference is found. */
2430 keycompare (struct line const *a, struct line const *b)
2432 struct keyfield *key = keylist;
2434 /* For the first iteration only, the key positions have been
2435 precomputed for us. */
2436 char *texta = a->keybeg;
2437 char *textb = b->keybeg;
2438 char *lima = a->keylim;
2439 char *limb = b->keylim;
2445 char const *translate = key->translate;
2446 bool const *ignore = key->ignore;
2448 /* Treat field ends before field starts as empty fields. */
2449 lima = MAX (texta, lima);
2450 limb = MAX (textb, limb);
2452 /* Find the lengths. */
2453 size_t lena = lima - texta;
2454 size_t lenb = limb - textb;
2456 if (hard_LC_COLLATE || key_numeric (key)
2457 || key->month || key->random || key->version)
2464 char enda IF_LINT (= 0);
2465 char endb IF_LINT (= 0);
2466 void *allocated IF_LINT (= NULL);
2467 char stackbuf[4000];
2469 if (ignore || translate)
2471 /* Compute with copies of the keys, which are the result of
2472 translating or ignoring characters, and which need their
2477 /* Allocate space for copies. */
2478 size_t size = lena + 1 + lenb + 1;
2479 if (size <= sizeof stackbuf)
2480 ta = stackbuf, allocated = NULL;
2482 ta = allocated = xmalloc (size);
2485 /* Put into each copy a version of the key in which the
2486 requested characters are ignored or translated. */
2487 for (tlena = i = 0; i < lena; i++)
2488 if (! (ignore && ignore[to_uchar (texta[i])]))
2489 ta[tlena++] = (translate
2490 ? translate[to_uchar (texta[i])]
2494 for (tlenb = i = 0; i < lenb; i++)
2495 if (! (ignore && ignore[to_uchar (textb[i])]))
2496 tb[tlenb++] = (translate
2497 ? translate[to_uchar (textb[i])]
2503 /* Use the keys in-place, temporarily null-terminated. */
2504 ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
2505 tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
2509 diff = numcompare (ta, tb);
2510 else if (key->general_numeric)
2511 diff = general_numcompare (ta, tb);
2512 else if (key->human_numeric)
2513 diff = human_numcompare (ta, tb);
2514 else if (key->month)
2515 diff = getmonth (ta, NULL) - getmonth (tb, NULL);
2516 else if (key->random)
2517 diff = compare_random (ta, tlena, tb, tlenb);
2518 else if (key->version)
2519 diff = filevercmp (ta, tb);
2522 /* Locale-dependent string sorting. This is slower than
2523 C-locale sorting, which is implemented below. */
2525 diff = - NONZERO (tlenb);
2526 else if (tlenb == 0)
2529 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2532 if (ignore || translate)
2542 #define CMP_WITH_IGNORE(A, B) \
2547 while (texta < lima && ignore[to_uchar (*texta)]) \
2549 while (textb < limb && ignore[to_uchar (*textb)]) \
2551 if (! (texta < lima && textb < limb)) \
2553 diff = to_uchar (A) - to_uchar (B); \
2560 diff = (texta < lima) - (textb < limb); \
2565 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2566 translate[to_uchar (*textb)]);
2568 CMP_WITH_IGNORE (*texta, *textb);
2571 diff = - NONZERO (lenb);
2578 while (texta < lima && textb < limb)
2580 diff = (to_uchar (translate[to_uchar (*texta++)])
2581 - to_uchar (translate[to_uchar (*textb++)]));
2588 diff = memcmp (texta, textb, MIN (lena, lenb));
2592 diff = lena < lenb ? -1 : lena != lenb;
2602 /* Find the beginning and limit of the next field. */
2603 if (key->eword != SIZE_MAX)
2604 lima = limfield (a, key), limb = limfield (b, key);
2606 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2608 if (key->sword != SIZE_MAX)
2609 texta = begfield (a, key), textb = begfield (b, key);
2612 texta = a->text, textb = b->text;
2613 if (key->skipsblanks)
2615 while (texta < lima && blanks[to_uchar (*texta)])
2617 while (textb < limb && blanks[to_uchar (*textb)])
2628 return key->reverse ? -diff : diff;
2631 /* Compare two lines A and B, returning negative, zero, or positive
2632 depending on whether A compares less than, equal to, or greater than B. */
2635 compare (struct line const *a, struct line const *b)
2640 /* First try to compare on the specified keys (if any).
2641 The only two cases with no key at all are unadorned sort,
2642 and unadorned sort -r. */
2645 diff = keycompare (a, b);
2646 if (diff || unique || stable)
2650 /* If the keys all compare equal (or no keys were specified)
2651 fall through to the default comparison. */
2652 alen = a->length - 1, blen = b->length - 1;
2655 diff = - NONZERO (blen);
2658 else if (hard_LC_COLLATE)
2660 /* Note xmemcoll0 is a performance enhancement as
2661 it will not unconditionally write '\0' after the
2662 passed in buffers, which was seen to give around
2663 a 3% increase in performance for short lines. */
2664 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2666 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2667 diff = alen < blen ? -1 : alen != blen;
2669 return reverse ? -diff : diff;
2672 /* Write LINE to output stream FP; the output file's name is
2673 OUTPUT_FILE if OUTPUT_FILE is nonnull, and is the standard output
2674 otherwise. If debugging is enabled and FP is standard output,
2675 append some debugging information. */
2678 write_line (struct line const *line, FILE *fp, char const *output_file)
2680 char *buf = line->text;
2681 size_t n_bytes = line->length;
2682 char *ebuf = buf + n_bytes;
2684 if (!output_file && debug)
2686 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2687 char const *c = buf;
2696 if (fputc (wc, fp) == EOF)
2697 die (_("write failed"), output_file);
2705 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2706 die (_("write failed"), output_file);
2711 /* Check that the lines read from FILE_NAME come in order. Return
2712 true if they are in order. If CHECKONLY == 'c', also print a
2713 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2714 they are not in order. */
2717 check (char const *file_name, char checkonly)
2719 FILE *fp = xfopen (file_name, "r");
2720 struct buffer buf; /* Input buffer. */
2721 struct line temp; /* Copy of previous line. */
2723 uintmax_t line_number = 0;
2724 struct keyfield const *key = keylist;
2725 bool nonunique = ! unique;
2726 bool ordered = true;
2728 initbuf (&buf, sizeof (struct line),
2729 MAX (merge_buffer_size, sort_size));
2732 while (fillbuf (&buf, fp, file_name))
2734 struct line const *line = buffer_linelim (&buf);
2735 struct line const *linebase = line - buf.nlines;
2737 /* Make sure the line saved from the old buffer contents is
2738 less than or equal to the first line of the new buffer. */
2739 if (alloc && nonunique <= compare (&temp, line - 1))
2743 if (checkonly == 'c')
2745 struct line const *disorder_line = line - 1;
2746 uintmax_t disorder_line_number =
2747 buffer_linelim (&buf) - disorder_line + line_number;
2748 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2749 fprintf (stderr, _("%s: %s:%s: disorder: "),
2750 program_name, file_name,
2751 umaxtostr (disorder_line_number, hr_buf));
2752 write_line (disorder_line, stderr, _("standard error"));
2760 /* Compare each line in the buffer with its successor. */
2761 while (linebase < --line)
2762 if (nonunique <= compare (line, line - 1))
2763 goto found_disorder;
2765 line_number += buf.nlines;
2767 /* Save the last line of the buffer. */
2768 if (alloc < line->length)
2775 alloc = line->length;
2779 while (alloc < line->length);
2782 temp.text = xmalloc (alloc);
2784 memcpy (temp.text, line->text, line->length);
2785 temp.length = line->length;
2788 temp.keybeg = temp.text + (line->keybeg - line->text);
2789 temp.keylim = temp.text + (line->keylim - line->text);
2793 xfclose (fp, file_name);
2799 /* Open FILES (there are NFILES of them) and store the resulting array
2800 of stream pointers into (*PFPS). Allocate the array. Return the
2801 number of successfully opened files, setting errno if this value is
2802 less than NFILES. */
2805 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2807 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2810 /* Open as many input files as we can. */
2811 for (i = 0; i < nfiles; i++)
2813 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
2814 ? open_temp (files[i].temp)
2815 : stream_open (files[i].name, "r"));
2823 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2824 files (all of which are at the start of the FILES array), and
2825 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2826 FPS is the vector of open stream corresponding to the files.
2827 Close input and output streams before returning.
2828 OUTPUT_FILE gives the name of the output file. If it is NULL,
2829 the output file is standard output. */
2832 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2833 FILE *ofp, char const *output_file, FILE **fps)
2835 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2836 /* Input buffers for each file. */
2837 struct line saved; /* Saved line storage for unique check. */
2838 struct line const *savedline = NULL;
2839 /* &saved if there is a saved line. */
2840 size_t savealloc = 0; /* Size allocated for the saved line. */
2841 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2842 /* Current line in each line table. */
2843 struct line const **base = xnmalloc (nfiles, sizeof *base);
2844 /* Base of each line table. */
2845 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2846 /* Table representing a permutation of fps,
2847 such that cur[ord[0]] is the smallest line
2848 and will be next output. */
2852 struct keyfield const *key = keylist;
2855 /* Read initial lines from each input file. */
2856 for (i = 0; i < nfiles; )
2858 initbuf (&buffer[i], sizeof (struct line),
2859 MAX (merge_buffer_size, sort_size / nfiles));
2860 if (fillbuf (&buffer[i], fps[i], files[i].name))
2862 struct line const *linelim = buffer_linelim (&buffer[i]);
2863 cur[i] = linelim - 1;
2864 base[i] = linelim - buffer[i].nlines;
2869 /* fps[i] is empty; eliminate it from future consideration. */
2870 xfclose (fps[i], files[i].name);
2874 zaptemp (files[i].name);
2876 free (buffer[i].buf);
2878 for (j = i; j < nfiles; ++j)
2880 files[j] = files[j + 1];
2881 fps[j] = fps[j + 1];
2886 /* Set up the ord table according to comparisons among input lines.
2887 Since this only reorders two items if one is strictly greater than
2888 the other, it is stable. */
2889 for (i = 0; i < nfiles; ++i)
2891 for (i = 1; i < nfiles; ++i)
2892 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2893 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2895 /* Repeatedly output the smallest line until no input remains. */
2898 struct line const *smallest = cur[ord[0]];
2900 /* If uniquified output is turned on, output only the first of
2901 an identical series of lines. */
2904 if (savedline && compare (savedline, smallest))
2907 write_line (&saved, ofp, output_file);
2912 if (savealloc < smallest->length)
2917 savealloc = smallest->length;
2920 while ((savealloc *= 2) < smallest->length);
2923 saved.text = xmalloc (savealloc);
2925 saved.length = smallest->length;
2926 memcpy (saved.text, smallest->text, saved.length);
2930 saved.text + (smallest->keybeg - smallest->text);
2932 saved.text + (smallest->keylim - smallest->text);
2937 write_line (smallest, ofp, output_file);
2939 /* Check if we need to read more lines into core. */
2940 if (base[ord[0]] < smallest)
2941 cur[ord[0]] = smallest - 1;
2944 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2946 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2947 cur[ord[0]] = linelim - 1;
2948 base[ord[0]] = linelim - buffer[ord[0]].nlines;
2952 /* We reached EOF on fps[ord[0]]. */
2953 for (i = 1; i < nfiles; ++i)
2954 if (ord[i] > ord[0])
2957 xfclose (fps[ord[0]], files[ord[0]].name);
2958 if (ord[0] < ntemps)
2961 zaptemp (files[ord[0]].name);
2963 free (buffer[ord[0]].buf);
2964 for (i = ord[0]; i < nfiles; ++i)
2966 fps[i] = fps[i + 1];
2967 files[i] = files[i + 1];
2968 buffer[i] = buffer[i + 1];
2969 cur[i] = cur[i + 1];
2970 base[i] = base[i + 1];
2972 for (i = 0; i < nfiles; ++i)
2973 ord[i] = ord[i + 1];
2978 /* The new line just read in may be larger than other lines
2979 already in main memory; push it back in the queue until we
2980 encounter a line larger than it. Optimize for the common
2981 case where the new line is smallest. */
2986 size_t ord0 = ord[0];
2987 size_t count_of_smaller_lines;
2991 int cmp = compare (cur[ord0], cur[ord[probe]]);
2992 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2996 probe = (lo + hi) / 2;
2999 count_of_smaller_lines = lo - 1;
3000 for (j = 0; j < count_of_smaller_lines; j++)
3001 ord[j] = ord[j + 1];
3002 ord[count_of_smaller_lines] = ord0;
3006 if (unique && savedline)
3008 write_line (&saved, ofp, output_file);
3012 xfclose (ofp, output_file);
3020 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3021 files (all of which are at the start of the FILES array), and
3022 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3023 Close input and output files before returning.
3024 OUTPUT_FILE gives the name of the output file.
3026 Return the number of files successfully merged. This number can be
3027 less than NFILES if we ran low on file descriptors, but in this
3028 case it is never less than 2. */
3031 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3032 FILE *ofp, char const *output_file)
3035 size_t nopened = open_input_files (files, nfiles, &fps);
3036 if (nopened < nfiles && nopened < 2)
3037 die (_("open failed"), files[nopened].name);
3038 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3042 /* Merge into T (of size NLINES) the two sorted arrays of lines
3043 LO (with NLINES / 2 members), and
3044 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3045 T and LO point just past their respective arrays, and the arrays
3046 are in reverse order. NLINES must be at least 2. */
3049 mergelines (struct line *restrict t, size_t nlines,
3050 struct line const *restrict lo)
3052 size_t nlo = nlines / 2;
3053 size_t nhi = nlines - nlo;
3054 struct line *hi = t - nlo;
3057 if (compare (lo - 1, hi - 1) <= 0)
3062 /* HI must equal T now, and there is no need to copy from
3081 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3082 Do this all within one thread. NLINES must be at least 2.
3083 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3084 Otherwise the sort is in-place and TEMP is half-sized.
3085 The input and output arrays are in reverse order, and LINES and
3086 TEMP point just past the end of their respective arrays.
3088 Use a recursive divide-and-conquer algorithm, in the style
3089 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3090 the optimization suggested by exercise 5.2.4-10; this requires room
3091 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3092 writes that this memory optimization was originally published by
3093 D. A. Bell, Comp J. 1 (1958), 75. */
3096 sequential_sort (struct line *restrict lines, size_t nlines,
3097 struct line *restrict temp, bool to_temp)
3101 /* Declare `swap' as int, not bool, to work around a bug
3102 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3103 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3104 int swap = (0 < compare (&lines[-1], &lines[-2]));
3107 temp[-1] = lines[-1 - swap];
3108 temp[-2] = lines[-2 + swap];
3112 temp[-1] = lines[-1];
3113 lines[-1] = lines[-2];
3114 lines[-2] = temp[-1];
3119 size_t nlo = nlines / 2;
3120 size_t nhi = nlines - nlo;
3121 struct line *lo = lines;
3122 struct line *hi = lines - nlo;
3124 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3126 sequential_sort (lo, nlo, temp, !to_temp);
3131 struct line const *sorted_lo;
3142 mergelines (dest, nlines, sorted_lo);
3146 static struct merge_node *init_node (struct merge_node *restrict,
3147 struct merge_node *restrict,
3148 struct line *, size_t, size_t, bool);
3151 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3152 lines, with destination DEST. */
3153 static struct merge_node *
3154 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3156 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3158 struct merge_node *root = merge_tree;
3159 root->lo = root->hi = root->end_lo = root->end_hi = NULL;
3161 root->nlo = root->nhi = nlines;
3162 root->parent = NULL;
3163 root->level = MERGE_END;
3164 root->queued = false;
3165 pthread_mutex_init (&root->lock, NULL);
3167 init_node (root, root + 1, dest, nthreads, nlines, false);
3171 /* Destroy the merge tree. */
3173 merge_tree_destroy (struct merge_node *merge_tree)
3178 /* Initialize a merge tree node and its descendants. The node's
3179 parent is PARENT. The node and its descendants are taken from the
3180 array of nodes NODE_POOL. Their destination starts at DEST; they
3181 will consume NTHREADS threads. The total number of sort lines is
3182 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3185 static struct merge_node *
3186 init_node (struct merge_node *restrict parent,
3187 struct merge_node *restrict node_pool,
3188 struct line *dest, size_t nthreads,
3189 size_t total_lines, bool is_lo_child)
3191 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3192 size_t nlo = nlines / 2;
3193 size_t nhi = nlines - nlo;
3194 struct line *lo = dest - total_lines;
3195 struct line *hi = lo - nlo;
3196 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3198 struct merge_node *node = node_pool++;
3199 node->lo = node->end_lo = lo;
3200 node->hi = node->end_hi = hi;
3201 node->dest = parent_end;
3204 node->parent = parent;
3205 node->level = parent->level + 1;
3206 node->queued = false;
3207 pthread_mutex_init (&node->lock, NULL);
3211 size_t lo_threads = nthreads / 2;
3212 size_t hi_threads = nthreads - lo_threads;
3213 node->lo_child = node_pool;
3214 node_pool = init_node (node, node_pool, lo, lo_threads,
3216 node->hi_child = node_pool;
3217 node_pool = init_node (node, node_pool, hi, hi_threads,
3218 total_lines, false);
3222 node->lo_child = NULL;
3223 node->hi_child = NULL;
3229 /* Compare two merge nodes A and B for priority. */
3232 compare_nodes (void const *a, void const *b)
3234 struct merge_node const *nodea = a;
3235 struct merge_node const *nodeb = b;
3236 if (nodea->level == nodeb->level)
3237 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3238 return nodea->level < nodeb->level;
3241 /* Lock a merge tree NODE. */
3244 lock_node (struct merge_node *node)
3246 pthread_mutex_lock (&node->lock);
3249 /* Unlock a merge tree NODE. */
3252 unlock_node (struct merge_node *node)
3254 pthread_mutex_unlock (&node->lock);
3257 /* Destroy merge QUEUE. */
3260 queue_destroy (struct merge_node_queue *queue)
3262 heap_free (queue->priority_queue);
3263 pthread_cond_destroy (&queue->cond);
3264 pthread_mutex_destroy (&queue->mutex);
3267 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3268 NTHREADS threads. */
3271 queue_init (struct merge_node_queue *queue, size_t nthreads)
3273 /* Though it's highly unlikely all nodes are in the heap at the same
3274 time, the heap should accommodate all of them. Counting a NULL
3275 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3276 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3277 pthread_mutex_init (&queue->mutex, NULL);
3278 pthread_cond_init (&queue->cond, NULL);
3281 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3282 does not need to lock NODE. */
3285 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3287 pthread_mutex_lock (&queue->mutex);
3288 heap_insert (queue->priority_queue, node);
3289 node->queued = true;
3290 pthread_mutex_unlock (&queue->mutex);
3291 pthread_cond_signal (&queue->cond);
3294 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3296 static struct merge_node *
3297 queue_pop (struct merge_node_queue *queue)
3299 struct merge_node *node;
3300 pthread_mutex_lock (&queue->mutex);
3301 while (! (node = heap_remove_top (queue->priority_queue)))
3302 pthread_cond_wait (&queue->cond, &queue->mutex);
3303 pthread_mutex_unlock (&queue->mutex);
3305 node->queued = false;
3309 /* Output LINE to TFP, unless -u is specified and the line compares
3310 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3311 is null if TFP is standard output.
3313 This function does not save the line for comparison later, so it is
3314 appropriate only for internal sort. */
3317 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3319 static struct line saved;
3323 if (saved.text && ! compare (line, &saved))
3328 write_line (line, tfp, temp_output);
3331 /* Merge the lines currently available to a NODE in the binary
3332 merge tree. Merge a number of lines appropriate for this merge
3333 level, assuming TOTAL_LINES is the total number of lines.
3335 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3336 the name of TFP, or is null if TFP is standard output. */
3339 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3340 FILE *tfp, char const *temp_output)
3342 struct line *lo_orig = node->lo;
3343 struct line *hi_orig = node->hi;
3344 size_t to_merge = MAX_MERGE (total_lines, node->level);
3348 if (node->level > MERGE_ROOT)
3350 /* Merge to destination buffer. */
3351 struct line *dest = *node->dest;
3352 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3353 if (compare (node->lo - 1, node->hi - 1) <= 0)
3354 *--dest = *--node->lo;
3356 *--dest = *--node->hi;
3358 merged_lo = lo_orig - node->lo;
3359 merged_hi = hi_orig - node->hi;
3361 if (node->nhi == merged_hi)
3362 while (node->lo != node->end_lo && to_merge--)
3363 *--dest = *--node->lo;
3364 else if (node->nlo == merged_lo)
3365 while (node->hi != node->end_hi && to_merge--)
3366 *--dest = *--node->hi;
3371 /* Merge directly to output. */
3372 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3374 if (compare (node->lo - 1, node->hi - 1) <= 0)
3375 write_unique (--node->lo, tfp, temp_output);
3377 write_unique (--node->hi, tfp, temp_output);
3380 merged_lo = lo_orig - node->lo;
3381 merged_hi = hi_orig - node->hi;
3383 if (node->nhi == merged_hi)
3385 while (node->lo != node->end_lo && to_merge--)
3386 write_unique (--node->lo, tfp, temp_output);
3388 else if (node->nlo == merged_lo)
3390 while (node->hi != node->end_hi && to_merge--)
3391 write_unique (--node->hi, tfp, temp_output);
3396 merged_lo = lo_orig - node->lo;
3397 merged_hi = hi_orig - node->hi;
3398 node->nlo -= merged_lo;
3399 node->nhi -= merged_hi;
3402 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3403 NODE's children has available lines and the other either has
3404 available lines or has exhausted its lines. */
3407 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3411 bool lo_avail = (node->lo - node->end_lo) != 0;
3412 bool hi_avail = (node->hi - node->end_hi) != 0;
3413 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3414 queue_insert (queue, node);
3418 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3421 queue_check_insert_parent (struct merge_node_queue *queue,
3422 struct merge_node *node)
3424 if (node->level > MERGE_ROOT)
3426 lock_node (node->parent);
3427 queue_check_insert (queue, node->parent);
3428 unlock_node (node->parent);
3430 else if (node->nlo + node->nhi == 0)
3432 /* If the MERGE_ROOT NODE has finished merging, insert the
3434 queue_insert (queue, node->parent);
3438 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3439 some of those lines, until the MERGE_END node is popped.
3440 TOTAL_LINES is the total number of lines. If merging at the top
3441 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3442 null if TFP is standard output. */
3445 merge_loop (struct merge_node_queue *queue,
3446 size_t total_lines, FILE *tfp, char const *temp_output)
3450 struct merge_node *node = queue_pop (queue);
3452 if (node->level == MERGE_END)
3455 /* Reinsert so other threads can pop it. */
3456 queue_insert (queue, node);
3459 mergelines_node (node, total_lines, tfp, temp_output);
3460 queue_check_insert (queue, node);
3461 queue_check_insert_parent (queue, node);
3468 static void sortlines (struct line *restrict, size_t, size_t,
3469 struct merge_node *, bool, struct merge_node_queue *,
3470 FILE *, char const *);
3472 /* Thread arguments for sortlines_thread. */
3476 /* Source, i.e., the array of lines to sort. This points just past
3477 the end of the array. */
3480 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3483 /* Number of lines in LINES and DEST. */
3484 size_t const total_lines;
3486 /* Merge node. Lines from this node and this node's sibling will merged
3487 to this node's parent. */
3488 struct merge_node *const node;
3490 /* True if this node is sorting the lower half of the parent's work. */
3493 /* The priority queue controlling available work for the entire
3495 struct merge_node_queue *const queue;
3497 /* If at the top level, the file to output to, and the file's name.
3498 If the file is standard output, the file's name is null. */
3500 char const *output_temp;
3503 /* Like sortlines, except with a signature acceptable to pthread_create. */
3506 sortlines_thread (void *data)
3508 struct thread_args const *args = data;
3509 sortlines (args->lines, args->nthreads, args->total_lines,
3510 args->node, args->is_lo_child, args->queue, args->tfp,
3515 /* Sort lines, possibly in parallel. The arguments are as in struct
3518 The algorithm has three phases: node creation, sequential sort,
3521 During node creation, sortlines recursively visits each node in the
3522 binary merge tree and creates a NODE structure corresponding to all the
3523 future line merging NODE is responsible for. For each call to
3524 sortlines, half the available threads are assigned to each recursive
3525 call, until a leaf node having only 1 available thread is reached.
3527 Each leaf node then performs two sequential sorts, one on each half of
3528 the lines it is responsible for. It records in its NODE structure that
3529 there are two sorted sublists available to merge from, and inserts its
3530 NODE into the priority queue.
3532 The binary merge phase then begins. Each thread drops into a loop
3533 where the thread retrieves a NODE from the priority queue, merges lines
3534 available to that NODE, and potentially insert NODE or its parent back
3535 into the queue if there are sufficient available lines for them to
3536 merge. This continues until all lines at all nodes of the merge tree
3537 have been merged. */
3540 sortlines (struct line *restrict lines, size_t nthreads,
3541 size_t total_lines, struct merge_node *node, bool is_lo_child,
3542 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3544 size_t nlines = node->nlo + node->nhi;
3546 /* Calculate thread arguments. */
3547 size_t lo_threads = nthreads / 2;
3548 size_t hi_threads = nthreads - lo_threads;
3550 struct thread_args args = {lines, lo_threads, total_lines,
3551 node->lo_child, true, queue, tfp, temp_output};
3553 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3554 && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3556 sortlines (lines - node->nlo, hi_threads, total_lines,
3557 node->hi_child, false, queue, tfp, temp_output);
3558 pthread_join (thread, NULL);
3562 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3563 Sort with 1 thread. */
3564 size_t nlo = node->nlo;
3565 size_t nhi = node->nhi;
3566 struct line *temp = lines - total_lines;
3568 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3570 sequential_sort (lines, nlo, temp, false);
3572 /* Update merge NODE. No need to lock yet. */
3574 node->hi = lines - nlo;
3575 node->end_lo = lines - nlo;
3576 node->end_hi = lines - nlo - nhi;
3578 queue_insert (queue, node);
3579 merge_loop (queue, total_lines, tfp, temp_output);
3582 pthread_mutex_destroy (&node->lock);
3585 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3586 the same as OUTFILE. If found, replace each with the same
3587 temporary copy that can be merged into OUTFILE without destroying
3588 OUTFILE before it is completely read. This temporary copy does not
3589 count as a merge temp, so don't worry about incrementing NTEMPS in
3590 the caller; final cleanup will remove it, not zaptemp.
3592 This test ensures that an otherwise-erroneous use like
3593 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3594 It's not clear that POSIX requires this nicety.
3595 Detect common error cases, but don't try to catch obscure cases like
3596 "cat ... FILE ... | sort -m -o FILE"
3597 where traditional "sort" doesn't copy the input and where
3598 people should know that they're getting into trouble anyway.
3599 Catching these obscure cases would slow down performance in
3603 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3604 size_t nfiles, char const *outfile)
3607 bool got_outstat = false;
3608 struct stat outstat;
3609 struct tempnode *tempcopy = NULL;
3611 for (i = ntemps; i < nfiles; i++)
3613 bool is_stdin = STREQ (files[i].name, "-");
3617 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3624 ? stat (outfile, &outstat)
3625 : fstat (STDOUT_FILENO, &outstat))
3632 ? fstat (STDIN_FILENO, &instat)
3633 : stat (files[i].name, &instat))
3635 && SAME_INODE (instat, outstat));
3643 tempcopy = create_temp (&tftp);
3644 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3647 files[i].name = tempcopy->name;
3648 files[i].temp = tempcopy;
3653 /* Merge the input FILES. NTEMPS is the number of files at the
3654 start of FILES that are temporary; it is zero at the top level.
3655 NFILES is the total number of files. Put the output in
3656 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3659 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3660 char const *output_file)
3662 while (nmerge < nfiles)
3664 /* Number of input files processed so far. */
3667 /* Number of output files generated so far. */
3670 /* nfiles % NMERGE; this counts input files that are left over
3671 after all full-sized merges have been done. */
3674 /* Number of easily-available slots at the next loop iteration. */
3677 /* Do as many NMERGE-size merges as possible. In the case that
3678 nmerge is bogus, increment by the maximum number of file
3679 descriptors allowed. */
3680 for (out = in = 0; nmerge <= nfiles - in; out++)
3683 struct tempnode *temp = create_temp (&tfp);
3684 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3685 nmerge, tfp, temp->name);
3686 ntemps -= MIN (ntemps, num_merged);
3687 files[out].name = temp->name;
3688 files[out].temp = temp;
3692 remainder = nfiles - in;
3693 cheap_slots = nmerge - out % nmerge;
3695 if (cheap_slots < remainder)
3697 /* So many files remain that they can't all be put into the last
3698 NMERGE-sized output window. Do one more merge. Merge as few
3699 files as possible, to avoid needless I/O. */
3700 size_t nshortmerge = remainder - cheap_slots + 1;
3702 struct tempnode *temp = create_temp (&tfp);
3703 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3704 nshortmerge, tfp, temp->name);
3705 ntemps -= MIN (ntemps, num_merged);
3706 files[out].name = temp->name;
3707 files[out++].temp = temp;
3711 /* Put the remaining input files into the last NMERGE-sized output
3712 window, so they will be merged in the next pass. */
3713 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3718 avoid_trashing_input (files, ntemps, nfiles, output_file);
3720 /* We aren't guaranteed that this final mergefiles will work, therefore we
3721 try to merge into the output, and then merge as much as we can into a
3722 temp file if we can't. Repeat. */
3726 /* Merge directly into the output file if possible. */
3728 size_t nopened = open_input_files (files, nfiles, &fps);
3730 if (nopened == nfiles)
3732 FILE *ofp = stream_open (output_file, "w");
3735 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3738 if (errno != EMFILE || nopened <= 2)
3739 die (_("open failed"), output_file);
3741 else if (nopened <= 2)
3742 die (_("open failed"), files[nopened].name);
3744 /* We ran out of file descriptors. Close one of the input
3745 files, to gain a file descriptor. Then create a temporary
3746 file with our spare file descriptor. Retry if that failed
3747 (e.g., some other process could open a file between the time
3748 we closed and tried to create). */
3750 struct tempnode *temp;
3754 xfclose (fps[nopened], files[nopened].name);
3755 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
3759 /* Merge into the newly allocated temporary. */
3760 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
3762 ntemps -= MIN (ntemps, nopened);
3763 files[0].name = temp->name;
3764 files[0].temp = temp;
3766 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3768 nfiles -= nopened - 1;
3772 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3775 sort (char *const *files, size_t nfiles, char const *output_file,
3779 IF_LINT (buf.buf = NULL);
3781 bool output_file_created = false;
3787 char const *temp_output;
3788 char const *file = *files;
3789 FILE *fp = xfopen (file, "r");
3792 size_t bytes_per_line;
3798 while (tmp < nthreads)
3803 bytes_per_line = (mult * sizeof (struct line));
3806 bytes_per_line = sizeof (struct line) * 3 / 2;
3809 initbuf (&buf, bytes_per_line,
3810 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3815 while (fillbuf (&buf, fp, file))
3819 if (buf.eof && nfiles
3820 && (bytes_per_line + 1
3821 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3823 /* End of file, but there is more input and buffer room.
3824 Concatenate the next input file; this is faster in
3826 buf.left = buf.used;
3830 line = buffer_linelim (&buf);
3831 if (buf.eof && !nfiles && !ntemps && !buf.left)
3834 tfp = xfopen (output_file, "w");
3835 temp_output = output_file;
3836 output_file_created = true;
3841 temp_output = create_temp (&tfp)->name;
3845 struct merge_node_queue queue;
3846 queue_init (&queue, nthreads);
3847 struct merge_node *merge_tree =
3848 merge_tree_init (nthreads, buf.nlines, line);
3849 struct merge_node *root = merge_tree + 1;
3851 sortlines (line, nthreads, buf.nlines, root,
3852 true, &queue, tfp, temp_output);
3853 queue_destroy (&queue);
3854 pthread_mutex_destroy (&root->lock);
3855 merge_tree_destroy (merge_tree);
3858 write_unique (line - 1, tfp, temp_output);
3860 xfclose (tfp, temp_output);
3862 if (output_file_created)
3871 if (! output_file_created)
3874 struct tempnode *node = temphead;
3875 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3876 for (i = 0; node; i++)
3878 tempfiles[i].name = node->name;
3879 tempfiles[i].temp = node;
3882 merge (tempfiles, ntemps, ntemps, output_file);
3889 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3892 insertkey (struct keyfield *key_arg)
3894 struct keyfield **p;
3895 struct keyfield *key = xmemdup (key_arg, sizeof *key);
3897 for (p = &keylist; *p; p = &(*p)->next)
3903 /* Report a bad field specification SPEC, with extra info MSGID. */
3905 static void badfieldspec (char const *, char const *)
3908 badfieldspec (char const *spec, char const *msgid)
3910 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
3911 _(msgid), quote (spec));
3915 /* Report incompatible options. */
3917 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
3919 incompatible_options (char const *opts)
3921 error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
3925 /* Check compatibility of ordering options. */
3928 check_ordering_compatibility (void)
3930 struct keyfield *key;
3932 for (key = keylist; key; key = key->next)
3933 if (1 < (key->numeric + key->general_numeric + key->human_numeric
3934 + key->month + (key->version | key->random | !!key->ignore)))
3936 /* The following is too big, but guaranteed to be "big enough". */
3937 char opts[sizeof short_options];
3938 /* Clear flags we're not interested in. */
3939 key->skipsblanks = key->skipeblanks = key->reverse = false;
3940 key_to_opts (key, opts);
3941 incompatible_options (opts);
3945 /* Parse the leading integer in STRING and store the resulting value
3946 (which must fit into size_t) into *VAL. Return the address of the
3947 suffix after the integer. If the value is too large, silently
3948 substitute SIZE_MAX. If MSGID is NULL, return NULL after
3949 failure; otherwise, report MSGID and exit on failure. */
3952 parse_field_count (char const *string, size_t *val, char const *msgid)
3957 switch (xstrtoumax (string, &suffix, 10, &n, ""))
3960 case LONGINT_INVALID_SUFFIX_CHAR:
3965 case LONGINT_OVERFLOW:
3966 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
3970 case LONGINT_INVALID:
3972 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
3973 _(msgid), quote (string));
3980 /* Handle interrupts and hangups. */
3983 sighandler (int sig)
3986 signal (sig, SIG_IGN);
3990 signal (sig, SIG_DFL);
3994 /* Set the ordering options for KEY specified in S.
3995 Return the address of the first character in S that
3996 is not a valid ordering option.
3997 BLANKTYPE is the kind of blanks that 'b' should skip. */
4000 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4007 if (blanktype == bl_start || blanktype == bl_both)
4008 key->skipsblanks = true;
4009 if (blanktype == bl_end || blanktype == bl_both)
4010 key->skipeblanks = true;
4013 key->ignore = nondictionary;
4016 key->translate = fold_toupper;
4019 key->general_numeric = true;
4022 key->human_numeric = true;
4025 /* Option order should not matter, so don't let -i override
4026 -d. -d implies -i, but -i does not imply -d. */
4028 key->ignore = nonprinting;
4034 key->numeric = true;
4040 key->reverse = true;
4043 key->version = true;
4053 /* Initialize KEY. */
4055 static struct keyfield *
4056 key_init (struct keyfield *key)
4058 memset (key, 0, sizeof *key);
4059 key->eword = SIZE_MAX;
4064 main (int argc, char **argv)
4066 struct keyfield *key;
4067 struct keyfield key_buf;
4068 struct keyfield gkey;
4069 bool gkey_only = false;
4073 bool mergeonly = false;
4074 char *random_source = NULL;
4075 bool need_random = false;
4076 size_t nthreads = 0;
4078 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
4079 bool obsolete_usage = (posix2_version () < 200112);
4081 char *files_from = NULL;
4083 char const *outfile = NULL;
4085 initialize_main (&argc, &argv);
4086 set_program_name (argv[0]);
4087 setlocale (LC_ALL, "");
4088 bindtextdomain (PACKAGE, LOCALEDIR);
4089 textdomain (PACKAGE);
4091 initialize_exit_failure (SORT_FAILURE);
4093 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4094 #if HAVE_NL_LANGINFO
4095 hard_LC_TIME = hard_locale (LC_TIME);
4098 /* Get locale's representation of the decimal point. */
4100 struct lconv const *locale = localeconv ();
4102 /* If the locale doesn't define a decimal point, or if the decimal
4103 point is multibyte, use the C locale's decimal point. FIXME:
4104 add support for multibyte decimal points. */
4105 decimal_point = to_uchar (locale->decimal_point[0]);
4106 if (! decimal_point || locale->decimal_point[1])
4107 decimal_point = '.';
4109 /* FIXME: add support for multibyte thousands separators. */
4110 thousands_sep = to_uchar (*locale->thousands_sep);
4111 if (! thousands_sep || locale->thousands_sep[1])
4115 have_read_stdin = false;
4120 static int const sig[] =
4122 /* The usual suspects. */
4123 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4140 enum { nsigs = ARRAY_CARDINALITY (sig) };
4143 struct sigaction act;
4145 sigemptyset (&caught_signals);
4146 for (i = 0; i < nsigs; i++)
4148 sigaction (sig[i], NULL, &act);
4149 if (act.sa_handler != SIG_IGN)
4150 sigaddset (&caught_signals, sig[i]);
4153 act.sa_handler = sighandler;
4154 act.sa_mask = caught_signals;
4157 for (i = 0; i < nsigs; i++)
4158 if (sigismember (&caught_signals, sig[i]))
4159 sigaction (sig[i], &act, NULL);
4161 for (i = 0; i < nsigs; i++)
4162 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4164 signal (sig[i], sighandler);
4165 siginterrupt (sig[i], 1);
4169 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4171 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4172 atexit (exit_cleanup);
4175 gkey.sword = SIZE_MAX;
4177 files = xnmalloc (argc, sizeof *files);
4181 /* Parse an operand as a file after "--" was seen; or if
4182 pedantic and a file was seen, unless the POSIX version
4183 predates 1003.1-2001 and -c was not seen and the operand is
4184 "-o FILE" or "-oFILE". */
4188 || (posixly_correct && nfiles != 0
4189 && ! (obsolete_usage
4192 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4193 && (argv[optind][2] || optind + 1 != argc)))
4194 || ((c = getopt_long (argc, argv, short_options,
4200 files[nfiles++] = argv[optind++];
4206 if (optarg[0] == '+')
4208 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4209 && ISDIGIT (argv[optind][1]));
4210 obsolete_usage |= minus_pos_usage && !posixly_correct;
4213 /* Treat +POS1 [-POS2] as a key if possible; but silently
4214 treat an operand as a file if it is not a valid +POS1. */
4215 key = key_init (&key_buf);
4216 s = parse_field_count (optarg + 1, &key->sword, NULL);
4218 s = parse_field_count (s + 1, &key->schar, NULL);
4219 if (! (key->sword || key->schar))
4220 key->sword = SIZE_MAX;
4221 if (! s || *set_ordering (s, key, bl_start))
4225 if (minus_pos_usage)
4227 char const *optarg1 = argv[optind++];
4228 s = parse_field_count (optarg1 + 1, &key->eword,
4229 N_("invalid number after `-'"));
4231 s = parse_field_count (s + 1, &key->echar,
4232 N_("invalid number after `.'"));
4233 if (!key->echar && key->eword)
4235 /* obsolescent syntax +A.x -B.y is equivalent to:
4236 -k A+1.x+1,B.y (when y = 0)
4237 -k A+1.x+1,B+1.y (when y > 0)
4238 So eword is decremented as in the -k case
4239 only when the end field (B) is specified and
4243 if (*set_ordering (s, key, bl_end))
4244 badfieldspec (optarg1,
4245 N_("stray character in field spec"));
4247 key->obsolete_used = true;
4253 files[nfiles++] = optarg;
4257 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4274 set_ordering (str, &gkey, bl_both);
4280 ? XARGMATCH ("--check", optarg, check_args, check_types)
4285 if (checkonly && checkonly != c)
4286 incompatible_options ("cC");
4290 case COMPRESS_PROGRAM_OPTION:
4291 if (compress_program && !STREQ (compress_program, optarg))
4292 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4293 compress_program = optarg;
4296 case DEBUG_PROGRAM_OPTION:
4300 case FILES0_FROM_OPTION:
4301 files_from = optarg;
4305 key = key_init (&key_buf);
4308 s = parse_field_count (optarg, &key->sword,
4309 N_("invalid number at field start"));
4312 /* Provoke with `sort -k0' */
4313 badfieldspec (optarg, N_("field number is zero"));
4317 s = parse_field_count (s + 1, &key->schar,
4318 N_("invalid number after `.'"));
4321 /* Provoke with `sort -k1.0' */
4322 badfieldspec (optarg, N_("character offset is zero"));
4325 if (! (key->sword || key->schar))
4326 key->sword = SIZE_MAX;
4327 s = set_ordering (s, key, bl_start);
4330 key->eword = SIZE_MAX;
4336 s = parse_field_count (s + 1, &key->eword,
4337 N_("invalid number after `,'"));
4340 /* Provoke with `sort -k1,0' */
4341 badfieldspec (optarg, N_("field number is zero"));
4345 s = parse_field_count (s + 1, &key->echar,
4346 N_("invalid number after `.'"));
4348 s = set_ordering (s, key, bl_end);
4351 badfieldspec (optarg, N_("stray character in field spec"));
4360 specify_nmerge (oi, c, optarg);
4364 if (outfile && !STREQ (outfile, optarg))
4365 error (SORT_FAILURE, 0, _("multiple output files specified"));
4369 case RANDOM_SOURCE_OPTION:
4370 if (random_source && !STREQ (random_source, optarg))
4371 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4372 random_source = optarg;
4380 specify_sort_size (oi, c, optarg);
4385 char newtab = optarg[0];
4387 error (SORT_FAILURE, 0, _("empty tab"));
4390 if (STREQ (optarg, "\\0"))
4394 /* Provoke with `sort -txx'. Complain about
4395 "multi-character tab" instead of "multibyte tab", so
4396 that the diagnostic's wording does not need to be
4397 changed once multibyte characters are supported. */
4398 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4402 if (tab != TAB_DEFAULT && tab != newtab)
4403 error (SORT_FAILURE, 0, _("incompatible tabs"));
4409 add_temp_dir (optarg);
4412 case PARALLEL_OPTION:
4413 nthreads = specify_nthreads (oi, c, optarg);
4421 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4422 through Solaris 7. It is also accepted by many non-Solaris
4423 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4424 -y is marked as obsolete starting with Solaris 8 (1999), but is
4425 still accepted as of Solaris 10 prerelease (2004).
4427 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4428 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4429 and which in general ignores the argument after "-y" if it
4430 consists entirely of digits (it can even be empty). */
4431 if (optarg == argv[optind - 1])
4434 for (p = optarg; ISDIGIT (*p); p++)
4436 optind -= (*p != '\0');
4444 case_GETOPT_HELP_CHAR;
4446 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4449 usage (SORT_FAILURE);
4457 /* When using --files0-from=F, you may not specify any files
4458 on the command-line. */
4461 error (0, 0, _("extra operand %s"), quote (files[0]));
4462 fprintf (stderr, "%s\n",
4463 _("file operands cannot be combined with --files0-from"));
4464 usage (SORT_FAILURE);
4467 if (STREQ (files_from, "-"))
4471 stream = fopen (files_from, "r");
4473 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
4474 quote (files_from));
4477 readtokens0_init (&tok);
4479 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
4480 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4481 quote (files_from));
4489 for (i = 0; i < nfiles; i++)
4491 if (STREQ (files[i], "-"))
4492 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4493 "no file name of %s allowed"),
4495 else if (files[i][0] == '\0')
4497 /* Using the standard `filename:line-number:' prefix here is
4498 not totally appropriate, since NUL is the separator,
4499 not NL, but it might be better than nothing. */
4500 unsigned long int file_number = i + 1;
4501 error (SORT_FAILURE, 0,
4502 _("%s:%lu: invalid zero-length file name"),
4503 quotearg_colon (files_from), file_number);
4508 error (SORT_FAILURE, 0, _("no input from %s"),
4509 quote (files_from));
4512 /* Inheritance of global options to individual keys. */
4513 for (key = keylist; key; key = key->next)
4515 if (default_key_compare (key) && !key->reverse)
4517 key->ignore = gkey.ignore;
4518 key->translate = gkey.translate;
4519 key->skipsblanks = gkey.skipsblanks;
4520 key->skipeblanks = gkey.skipeblanks;
4521 key->month = gkey.month;
4522 key->numeric = gkey.numeric;
4523 key->general_numeric = gkey.general_numeric;
4524 key->human_numeric = gkey.human_numeric;
4525 key->version = gkey.version;
4526 key->random = gkey.random;
4527 key->reverse = gkey.reverse;
4530 need_random |= key->random;
4533 if (!keylist && !default_key_compare (&gkey))
4537 need_random |= gkey.random;
4540 check_ordering_compatibility ();
4544 if (checkonly || outfile)
4546 static char opts[] = "X --debug";
4547 opts[0] = (checkonly ? checkonly : 'o');
4548 incompatible_options (opts);
4551 /* Always output the locale in debug mode, since this
4552 is such a common source of confusion. */
4553 if (hard_LC_COLLATE)
4554 error (0, 0, _("using %s sorting rules"),
4555 quote (setlocale (LC_COLLATE, NULL)));
4557 error (0, 0, _("using simple byte comparison"));
4558 key_warnings (&gkey, gkey_only);
4561 reverse = gkey.reverse;
4564 random_md5_state_init (random_source);
4566 if (temp_dir_count == 0)
4568 char const *tmp_dir = getenv ("TMPDIR");
4569 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4574 static char *minus = (char *) "-";
4580 /* Need to re-check that we meet the minimum requirement for memory
4581 usage with the final value for NMERGE. */
4583 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4588 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4589 quote (files[1]), checkonly);
4593 static char opts[] = {0, 'o', 0};
4594 opts[0] = checkonly;
4595 incompatible_options (opts);
4598 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4599 input is not properly sorted. */
4600 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4605 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4608 for (i = 0; i < nfiles; ++i)
4609 sortfiles[i].name = files[i];
4611 merge (sortfiles, 0, nfiles, outfile);
4612 IF_LINT (free (sortfiles));
4618 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4619 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4622 /* Avoid integer overflow later. */
4623 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4624 nthreads = MIN (nthreads, nthreads_max);
4626 sort (files, nfiles, outfile, nthreads);
4629 if (have_read_stdin && fclose (stdin) == EOF)
4630 die (_("close failed"), "-");
4632 exit (EXIT_SUCCESS);