1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988-2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 Written December 1988 by Mike Haertel.
18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 or (US mail) as Mike Haertel c/o Free Software Foundation.
21 Ørn E. Hansen added NLS support in 1997. */
27 #include <sys/resource.h>
28 #include <sys/types.h>
36 #include "filevercmp.h"
37 #include "hard-locale.h"
40 #include "ignore-value.h"
49 #include "readtokens0.h"
52 #include "strnumcmp.h"
54 #include "xnanosleep.h"
58 struct rlimit { size_t rlim_cur; };
59 # define getrlimit(Resource, Rlp) (-1)
62 /* The official name of this program (e.g., no 'g' prefix). */
63 #define PROGRAM_NAME "sort"
66 proper_name ("Mike Haertel"), \
67 proper_name ("Paul Eggert")
69 #if HAVE_LANGINFO_CODESET
70 # include <langinfo.h>
73 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
76 # define SA_NOCLDSTOP 0
77 /* No sigprocmask. Always 'return' zero. */
78 # define sigprocmask(How, Set, Oset) (0)
80 # if ! HAVE_SIGINTERRUPT
81 # define siginterrupt(sig, flag) /* empty */
85 #if !defined OPEN_MAX && defined NR_OPEN
86 # define OPEN_MAX NR_OPEN
92 #define UCHAR_LIM (UCHAR_MAX + 1)
95 # define long_double long double
97 # define long_double double
99 # define strtold strtod
102 #ifndef DEFAULT_TMPDIR
103 # define DEFAULT_TMPDIR "/tmp"
106 /* Maximum number of lines to merge every time a NODE is taken from
107 the merge queue. Node is at LEVEL in the binary merge tree,
108 and is responsible for merging TOTAL lines. */
109 #define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
111 /* Heuristic value for the number of lines for which it is worth creating
112 a subthread, during an internal merge sort. I.e., it is a small number
113 of "average" lines for which sorting via two threads is faster than
114 sorting via one on an "average" system. On a dual-core 2.0 GHz i686
115 system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
116 lines of gensort -a output is sorted slightly faster with --parallel=2
117 than with --parallel=1. By contrast, using --parallel=1 is about 10%
118 faster than using --parallel=2 with a 64K-line input. */
119 enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
120 verify (4 <= SUBTHREAD_LINES_HEURISTIC);
122 /* The number of threads after which there are
123 diminishing performance gains. */
124 enum { DEFAULT_MAX_THREADS = 8 };
129 /* POSIX says to exit with status 1 if invoked with -c and the
130 input is not properly sorted. */
131 SORT_OUT_OF_ORDER = 1,
133 /* POSIX says any other irregular exit must exit with a status
134 code greater than 1. */
140 /* The number of times we should try to fork a compression process
141 (we retry if the fork call fails). We don't _need_ to compress
142 temp files, this is just to reduce disk access, so this number
143 can be small. Each retry doubles in duration. */
144 MAX_FORK_TRIES_COMPRESS = 4,
146 /* The number of times we should try to fork a decompression process.
147 If we can't fork a decompression process, we can't sort, so this
148 number should be big. Each retry doubles in duration. */
149 MAX_FORK_TRIES_DECOMPRESS = 9
154 /* Level of the end-of-merge node, one level above the root. */
157 /* Level of the root node in merge tree. */
161 /* The representation of the decimal point in the current locale. */
162 static int decimal_point;
164 /* Thousands separator; if -1, then there isn't one. */
165 static int thousands_sep;
167 /* Nonzero if the corresponding locales are hard. */
168 static bool hard_LC_COLLATE;
170 static bool hard_LC_TIME;
173 #define NONZERO(x) ((x) != 0)
175 /* The kind of blanks for '-b' to skip in various options. */
176 enum blanktype { bl_start, bl_end, bl_both };
178 /* The character marking end of line. Default to \n. */
179 static char eolchar = '\n';
181 /* Lines are held in core as counted strings. */
184 char *text; /* Text of the line. */
185 size_t length; /* Length including final newline. */
186 char *keybeg; /* Start of first key. */
187 char *keylim; /* Limit of first key. */
193 char *buf; /* Dynamically allocated buffer,
194 partitioned into 3 regions:
197 - an array of lines, in reverse order. */
198 size_t used; /* Number of bytes used for input data. */
199 size_t nlines; /* Number of lines in the line array. */
200 size_t alloc; /* Number of bytes allocated. */
201 size_t left; /* Number of bytes left from previous reads. */
202 size_t line_bytes; /* Number of bytes to reserve for each line. */
203 bool eof; /* An EOF has been read. */
209 size_t sword; /* Zero-origin 'word' to start at. */
210 size_t schar; /* Additional characters to skip. */
211 size_t eword; /* Zero-origin last 'word' of key. */
212 size_t echar; /* Additional characters in field. */
213 bool const *ignore; /* Boolean array of characters to ignore. */
214 char const *translate; /* Translation applied to characters. */
215 bool skipsblanks; /* Skip leading blanks when finding start. */
216 bool skipeblanks; /* Skip leading blanks when finding end. */
217 bool numeric; /* Flag for numeric comparison. Handle
218 strings of digits with optional decimal
219 point, but no exponential notation. */
220 bool random; /* Sort by random hash of key. */
221 bool general_numeric; /* Flag for general, numeric comparison.
222 Handle numbers in exponential notation. */
223 bool human_numeric; /* Flag for sorting by human readable
224 units with either SI xor IEC prefixes. */
225 bool month; /* Flag for comparison by month name. */
226 bool reverse; /* Reverse the sense of comparison. */
227 bool version; /* sort by version number */
228 bool obsolete_used; /* obsolescent key option format is used. */
229 struct keyfield *next; /* Next keyfield to try. */
238 /* Binary merge tree node. */
241 struct line *lo; /* Lines to merge from LO child node. */
242 struct line *hi; /* Lines to merge from HI child ndoe. */
243 struct line *end_lo; /* End of available lines from LO. */
244 struct line *end_hi; /* End of available lines from HI. */
245 struct line **dest; /* Pointer to destination of merge. */
246 size_t nlo; /* Total Lines remaining from LO. */
247 size_t nhi; /* Total lines remaining from HI. */
248 struct merge_node *parent; /* Parent node. */
249 struct merge_node *lo_child; /* LO child node. */
250 struct merge_node *hi_child; /* HI child node. */
251 unsigned int level; /* Level in merge tree. */
252 bool queued; /* Node is already in heap. */
253 pthread_mutex_t lock; /* Lock for node operations. */
256 /* Priority queue of merge nodes. */
257 struct merge_node_queue
259 struct heap *priority_queue; /* Priority queue of merge tree nodes. */
260 pthread_mutex_t mutex; /* Lock for queue operations. */
261 pthread_cond_t cond; /* Conditional wait for empty queue to populate
265 /* FIXME: None of these tables work with multibyte character sets.
266 Also, there are many other bugs when handling multibyte characters.
267 One way to fix this is to rewrite 'sort' to use wide characters
268 internally, but doing this with good performance is a bit
271 /* Table of blanks. */
272 static bool blanks[UCHAR_LIM];
274 /* Table of non-printing characters. */
275 static bool nonprinting[UCHAR_LIM];
277 /* Table of non-dictionary characters (not letters, digits, or blanks). */
278 static bool nondictionary[UCHAR_LIM];
280 /* Translation table folding lower case to upper. */
281 static char fold_toupper[UCHAR_LIM];
283 #define MONTHS_PER_YEAR 12
285 /* Table mapping month names to integers.
286 Alphabetic order allows binary search. */
287 static struct month monthtab[] =
303 /* During the merge phase, the number of files to merge at once. */
304 #define NMERGE_DEFAULT 16
306 /* Minimum size for a merge or check buffer. */
307 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
309 /* Minimum sort size; the code might not work with smaller sizes. */
310 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
312 /* The number of bytes needed for a merge or check buffer, which can
313 function relatively efficiently even if it holds only one line. If
314 a longer line is seen, this value is increased. */
315 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
317 /* The approximate maximum number of bytes of main memory to use, as
318 specified by the user. Zero if the user has not specified a size. */
319 static size_t sort_size;
321 /* The initial allocation factor for non-regular files.
322 This is used, e.g., when reading from a pipe.
323 Don't make it too big, since it is multiplied by ~130 to
324 obtain the size of the actual buffer sort will allocate.
325 Also, there may be 8 threads all doing this at the same time. */
326 #define INPUT_FILE_SIZE_GUESS (128 * 1024)
328 /* Array of directory names in which any temporary files are to be created. */
329 static char const **temp_dirs;
331 /* Number of temporary directory names used. */
332 static size_t temp_dir_count;
334 /* Number of allocated slots in temp_dirs. */
335 static size_t temp_dir_alloc;
337 /* Flag to reverse the order of all comparisons. */
340 /* Flag for stable sort. This turns off the last ditch bytewise
341 comparison of lines, and instead leaves lines in the same order
342 they were read if all keys compare equal. */
345 /* If TAB has this value, blanks separate fields. */
346 enum { TAB_DEFAULT = CHAR_MAX + 1 };
348 /* Tab character separating fields. If TAB_DEFAULT, then fields are
349 separated by the empty string between a non-blank character and a blank
351 static int tab = TAB_DEFAULT;
353 /* Flag to remove consecutive duplicate lines from the output.
354 Only the last of a sequence of equal lines will be output. */
357 /* Nonzero if any of the input files are the standard input. */
358 static bool have_read_stdin;
360 /* File descriptor associated with -o. */
361 static int outfd = -1;
363 /* List of key field comparisons to be tried. */
364 static struct keyfield *keylist;
366 /* Program used to (de)compress temp files. Must accept -d. */
367 static char const *compress_program;
369 /* Annotate the output with extra info to aid the user. */
372 /* Maximum number of files to merge in one go. If more than this
373 number are present, temp files will be used. */
374 static unsigned int nmerge = NMERGE_DEFAULT;
376 /* Report MESSAGE for FILE, then clean up and exit.
377 If FILE is null, it represents standard output. */
379 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
381 die (char const *message, char const *file)
383 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
390 if (status != EXIT_SUCCESS)
395 Usage: %s [OPTION]... [FILE]...\n\
396 or: %s [OPTION]... --files0-from=F\n\
398 program_name, program_name);
400 Write sorted concatenation of all FILE(s) to standard output.\n\
404 Mandatory arguments to long options are mandatory for short options too.\n\
411 -b, --ignore-leading-blanks ignore leading blanks\n\
412 -d, --dictionary-order consider only blanks and alphanumeric characters\
414 -f, --ignore-case fold lower case to upper case characters\n\
417 -g, --general-numeric-sort compare according to general numerical value\n\
418 -i, --ignore-nonprinting consider only printable characters\n\
419 -M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'\n\
422 -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n\
425 -n, --numeric-sort compare according to string numerical value\n\
426 -R, --random-sort sort by random hash of keys\n\
427 --random-source=FILE get random bytes from FILE\n\
428 -r, --reverse reverse the result of comparisons\n\
431 --sort=WORD sort according to WORD:\n\
432 general-numeric -g, human-numeric -h, month -M,\
434 numeric -n, random -R, version -V\n\
435 -V, --version-sort natural sort of (version) numbers within text\n\
443 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\
444 for more use temp files\n\
447 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
448 -C, --check=quiet, --check=silent like -c, but do not report first bad line\
450 --compress-program=PROG compress temporaries with PROG;\n\
451 decompress them with PROG -d\n\
454 --debug annotate the part of the line used to sort,\n\
455 and warn about questionable usage to stderr\n\
456 --files0-from=F read input from the files specified by\n\
457 NUL-terminated names in file F;\n\
458 If F is - then read names from standard input\n\
461 -k, --key=KEYDEF sort via a key; KEYDEF gives location and type\n\
462 -m, --merge merge already sorted files; do not sort\n\
465 -o, --output=FILE write result to FILE instead of standard output\n\
466 -s, --stable stabilize sort by disabling last-resort comparison\
468 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
471 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
472 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
473 multiple options specify multiple directories\n\
474 --parallel=N change the number of sorts run concurrently to N\n\
475 -u, --unique with -c, check for strict ordering;\n\
476 without -c, output only the first of an equal run\
480 -z, --zero-terminated end lines with 0 byte, not newline\n\
482 fputs (HELP_OPTION_DESCRIPTION, stdout);
483 fputs (VERSION_OPTION_DESCRIPTION, stdout);
486 KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
487 field number and C a character position in the field; both are origin 1, and\n\
488 the stop position defaults to the line's end. If neither -t nor -b is in\n\
489 effect, characters in a field are counted from the beginning of the preceding\n\
490 whitespace. OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
492 which override global ordering options for that key. If no key is given, use\n\
493 the entire line as the key.\n\
495 SIZE may be followed by the following multiplicative suffixes:\n\
498 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
500 With no FILE, or when FILE is -, read standard input.\n\
503 The locale specified by the environment affects sort order.\n\
504 Set LC_ALL=C to get the traditional sort order that uses\n\
505 native byte values.\n\
507 emit_ancillary_info ();
513 /* For long options that have no equivalent short option, use a
514 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
517 CHECK_OPTION = CHAR_MAX + 1,
518 COMPRESS_PROGRAM_OPTION,
519 DEBUG_PROGRAM_OPTION,
522 RANDOM_SOURCE_OPTION,
527 static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
529 static struct option const long_options[] =
531 {"ignore-leading-blanks", no_argument, NULL, 'b'},
532 {"check", optional_argument, NULL, CHECK_OPTION},
533 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
534 {"debug", no_argument, NULL, DEBUG_PROGRAM_OPTION},
535 {"dictionary-order", no_argument, NULL, 'd'},
536 {"ignore-case", no_argument, NULL, 'f'},
537 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},
538 {"general-numeric-sort", no_argument, NULL, 'g'},
539 {"ignore-nonprinting", no_argument, NULL, 'i'},
540 {"key", required_argument, NULL, 'k'},
541 {"merge", no_argument, NULL, 'm'},
542 {"month-sort", no_argument, NULL, 'M'},
543 {"numeric-sort", no_argument, NULL, 'n'},
544 {"human-numeric-sort", no_argument, NULL, 'h'},
545 {"version-sort", no_argument, NULL, 'V'},
546 {"random-sort", no_argument, NULL, 'R'},
547 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
548 {"sort", required_argument, NULL, SORT_OPTION},
549 {"output", required_argument, NULL, 'o'},
550 {"reverse", no_argument, NULL, 'r'},
551 {"stable", no_argument, NULL, 's'},
552 {"batch-size", required_argument, NULL, NMERGE_OPTION},
553 {"buffer-size", required_argument, NULL, 'S'},
554 {"field-separator", required_argument, NULL, 't'},
555 {"temporary-directory", required_argument, NULL, 'T'},
556 {"unique", no_argument, NULL, 'u'},
557 {"zero-terminated", no_argument, NULL, 'z'},
558 {"parallel", required_argument, NULL, PARALLEL_OPTION},
559 {GETOPT_HELP_OPTION_DECL},
560 {GETOPT_VERSION_OPTION_DECL},
564 #define CHECK_TABLE \
566 _ct_("silent", 'C') \
567 _ct_("diagnose-first", 'c')
569 static char const *const check_args[] =
571 #define _ct_(_s, _c) _s,
575 static char const check_types[] =
577 #define _ct_(_s, _c) _c,
583 _st_("general-numeric", 'g') \
584 _st_("human-numeric", 'h') \
586 _st_("numeric", 'n') \
587 _st_("random", 'R') \
590 static char const *const sort_args[] =
592 #define _st_(_s, _c) _s,
596 static char const sort_types[] =
598 #define _st_(_s, _c) _c,
603 /* The set of signals that are caught. */
604 static sigset_t caught_signals;
606 /* Critical section status. */
613 /* Enter a critical section. */
614 static struct cs_status
617 struct cs_status status;
618 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
622 /* Leave a critical section. */
624 cs_leave (struct cs_status status)
628 /* Ignore failure when restoring the signal mask. */
629 sigprocmask (SIG_SETMASK, &status.sigs, NULL);
633 /* Possible states for a temp file. If compressed, the file's status
634 is unreaped or reaped, depending on whether 'sort' has waited for
635 the subprocess to finish. */
636 enum { UNCOMPRESSED, UNREAPED, REAPED };
638 /* The list of temporary files. */
641 struct tempnode *volatile next;
642 pid_t pid; /* The subprocess PID; undefined if state == UNCOMPRESSED. */
644 char name[1]; /* Actual size is 1 + file name length. */
646 static struct tempnode *volatile temphead;
647 static struct tempnode *volatile *temptail = &temphead;
649 /* A file to be sorted. */
652 /* The file's name. */
655 /* Nonnull if this is a temporary file, in which case NAME == TEMP->name. */
656 struct tempnode *temp;
659 /* Map PIDs of unreaped subprocesses to their struct tempnode objects. */
660 static Hash_table *proctab;
662 enum { INIT_PROCTAB_SIZE = 47 };
665 proctab_hasher (void const *entry, size_t tabsize)
667 struct tempnode const *node = entry;
668 return node->pid % tabsize;
672 proctab_comparator (void const *e1, void const *e2)
674 struct tempnode const *n1 = e1;
675 struct tempnode const *n2 = e2;
676 return n1->pid == n2->pid;
679 /* The number of unreaped child processes. */
682 static bool delete_proc (pid_t);
684 /* If PID is positive, wait for the child process with that PID to
685 exit, and assume that PID has already been removed from the process
686 table. If PID is 0 or -1, clean up some child that has exited (by
687 waiting for it, and removing it from the proc table) and return the
688 child's process ID. However, if PID is 0 and no children have
689 exited, return 0 without waiting. */
695 pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));
698 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
700 else if (0 < cpid && (0 < pid || delete_proc (cpid)))
702 if (! WIFEXITED (status) || WEXITSTATUS (status))
703 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
711 /* TEMP represents a new process; add it to the process table. Create
712 the process table the first time it's called. */
715 register_proc (struct tempnode *temp)
719 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
727 temp->state = UNREAPED;
729 if (! hash_insert (proctab, temp))
733 /* If PID is in the process table, remove it and return true.
734 Otherwise, return false. */
737 delete_proc (pid_t pid)
739 struct tempnode test;
742 struct tempnode *node = hash_delete (proctab, &test);
745 node->state = REAPED;
749 /* Remove PID from the process table, and wait for it to exit if it
753 wait_proc (pid_t pid)
755 if (delete_proc (pid))
759 /* Reap any exited children. Do not block; reap only those that have
765 while (0 < nprocs && reap (0))
769 /* Reap at least one exited child, waiting if necessary. */
778 /* Reap all children, waiting if necessary. */
787 /* Clean up any remaining temporary files. */
792 struct tempnode const *node;
794 for (node = temphead; node; node = node->next)
799 /* Cleanup actions to take when exiting. */
806 /* Clean up any remaining temporary files in a critical section so
807 that a signal handler does not try to clean them too. */
808 struct cs_status cs = cs_enter ();
816 /* Create a new temporary file, returning its newly allocated tempnode.
817 Store into *PFD the file descriptor open for writing.
818 If the creation fails, return NULL and store -1 into *PFD if the
819 failure is due to file descriptor exhaustion and
820 SURVIVE_FD_EXHAUSTION; otherwise, die. */
822 static struct tempnode *
823 create_temp_file (int *pfd, bool survive_fd_exhaustion)
825 static char const slashbase[] = "/sortXXXXXX";
826 static size_t temp_dir_index;
829 char const *temp_dir = temp_dirs[temp_dir_index];
830 size_t len = strlen (temp_dir);
831 struct tempnode *node =
832 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
833 char *file = node->name;
836 memcpy (file, temp_dir, len);
837 memcpy (file + len, slashbase, sizeof slashbase);
839 if (++temp_dir_index == temp_dir_count)
842 /* Create the temporary file in a critical section, to avoid races. */
848 temptail = &node->next;
856 if (! (survive_fd_exhaustion && errno == EMFILE))
857 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
867 /* Return a stream for FILE, opened with mode HOW. A null FILE means
868 standard output; HOW should be "w". When opening for input, "-"
869 means standard input. To avoid confusion, do not return file
870 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
871 opening an ordinary FILE. Return NULL if unsuccessful.
873 fadvise() is used to specify an access pattern for input files.
874 There are a few hints we could possibly provide,
875 and after careful testing it was decided that
876 specifying POSIX_FADV_SEQUENTIAL was not detrimental
877 to any cases. On Linux 2.6.31, this option doubles
878 the size of read ahead performed and thus was seen to
881 Sorting with a smaller internal buffer
882 Reading from faster flash devices
884 In _addition_ one could also specify other hints...
886 POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
887 at least uses that to _synchronously_ prepopulate the cache
888 with the specified range. While sort does need to
889 read all of its input before outputting, a synchronous
890 read of the whole file up front precludes any processing
891 that sort could do in parallel with the system doing
892 read ahead of the data. This was seen to have negative effects
893 in a couple of cases:
895 Sorting with a smaller internal buffer
896 Note this option was seen to shorten the runtime for sort
897 on a multicore system with lots of RAM and other processes
898 competing for CPU. It could be argued that more explicit
899 scheduling hints with 'nice' et. al. are more appropriate
902 POSIX_FADV_NOREUSE is a possibility as it could lower
903 the priority of input data in the cache as sort will
904 only need to process it once. However its functionality
905 has changed over Linux kernel versions and as of 2.6.31
906 it does nothing and thus we can't depend on what it might
909 POSIX_FADV_DONTNEED is not appropriate for user specified
910 input files, but for temp files we do want to drop the
911 cache immediately after processing. This is done implicitly
912 however when the files are unlinked. */
915 stream_open (char const *file, char const *how)
923 if (STREQ (file, "-"))
925 have_read_stdin = true;
929 fp = fopen (file, how);
930 fadvise (fp, FADVISE_SEQUENTIAL);
932 else if (*how == 'w')
934 assert (outfd != -1);
935 if (ftruncate (outfd, 0) != 0)
936 error (EXIT_FAILURE, errno, _("%s: error truncating"), quote (file));
937 fp = fdopen (outfd, how);
940 assert (!"unexpected mode passed to stream_open");
945 /* Same as stream_open, except always return a non-null value; die on
949 xfopen (char const *file, char const *how)
951 FILE *fp = stream_open (file, how);
953 die (_("open failed"), file);
957 /* Close FP, whose name is FILE, and report any errors. */
960 xfclose (FILE *fp, char const *file)
965 /* Allow reading stdin from tty more than once. */
971 /* Don't close stdout just yet. close_stdout does that. */
972 if (fflush (fp) != 0)
973 die (_("fflush failed"), file);
977 if (fclose (fp) != 0)
978 die (_("close failed"), file);
984 dup2_or_die (int oldfd, int newfd)
986 if (dup2 (oldfd, newfd) < 0)
987 error (SORT_FAILURE, errno, _("dup2 failed"));
990 /* Fork a child process for piping to and do common cleanup. The
991 TRIES parameter tells us how many times to try to fork before
992 giving up. Return the PID of the child, or -1 (setting errno)
996 pipe_fork (int pipefds[2], size_t tries)
998 #if HAVE_WORKING_FORK
999 struct tempnode *saved_temphead;
1001 double wait_retry = 0.25;
1002 pid_t pid IF_LINT ( = -1);
1003 struct cs_status cs;
1005 if (pipe (pipefds) < 0)
1008 /* At least NMERGE + 1 subprocesses are needed. More could be created, but
1009 uncontrolled subprocess generation can hurt performance significantly.
1010 Allow at most NMERGE + 2 subprocesses, on the theory that there
1011 may be some useful parallelism by letting compression for the
1012 previous merge finish (1 subprocess) in parallel with the current
1013 merge (NMERGE + 1 subprocesses). */
1015 if (nmerge + 1 < nprocs)
1020 /* This is so the child process won't delete our temp files
1021 if it receives a signal before exec-ing. */
1023 saved_temphead = temphead;
1027 saved_errno = errno;
1029 temphead = saved_temphead;
1032 errno = saved_errno;
1034 if (0 <= pid || errno != EAGAIN)
1038 xnanosleep (wait_retry);
1046 saved_errno = errno;
1049 errno = saved_errno;
1053 close (STDIN_FILENO);
1054 close (STDOUT_FILENO);
1061 #else /* ! HAVE_WORKING_FORK */
1066 /* Create a temporary file and, if asked for, start a compressor
1067 to that file. Set *PFP to the file handle and return
1068 the address of the new temp node. If the creation
1069 fails, return NULL if the failure is due to file descriptor
1070 exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die. */
1072 static struct tempnode *
1073 maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
1076 struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
1080 node->state = UNCOMPRESSED;
1082 if (compress_program)
1086 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
1091 tempfd = pipefds[1];
1093 register_proc (node);
1095 else if (node->pid == 0)
1098 dup2_or_die (tempfd, STDOUT_FILENO);
1100 dup2_or_die (pipefds[0], STDIN_FILENO);
1103 if (execlp (compress_program, compress_program, (char *) NULL) < 0)
1104 error (SORT_FAILURE, errno, _("couldn't execute %s"),
1109 *pfp = fdopen (tempfd, "w");
1111 die (_("couldn't create temporary file"), node->name);
1116 /* Create a temporary file and, if asked for, start a compressor
1117 to that file. Set *PFP to the file handle and return the address
1118 of the new temp node. Die on failure. */
1120 static struct tempnode *
1121 create_temp (FILE **pfp)
1123 return maybe_create_temp (pfp, false);
1126 /* Open a compressed temp file and start a decompression process through
1127 which to filter the input. Return NULL (setting errno to
1128 EMFILE) if we ran out of file descriptors, and die on any other
1132 open_temp (struct tempnode *temp)
1134 int tempfd, pipefds[2];
1137 if (temp->state == UNREAPED)
1138 wait_proc (temp->pid);
1140 tempfd = open (temp->name, O_RDONLY);
1144 pid_t child = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
1149 if (errno != EMFILE)
1150 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
1158 dup2_or_die (tempfd, STDIN_FILENO);
1160 dup2_or_die (pipefds[1], STDOUT_FILENO);
1163 execlp (compress_program, compress_program, "-d", (char *) NULL);
1164 error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
1169 register_proc (temp);
1173 fp = fdopen (pipefds[0], "r");
1176 int saved_errno = errno;
1178 errno = saved_errno;
1186 /* Append DIR to the array of temporary directory names. */
1188 add_temp_dir (char const *dir)
1190 if (temp_dir_count == temp_dir_alloc)
1191 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
1193 temp_dirs[temp_dir_count++] = dir;
1196 /* Remove NAME from the list of temporary files. */
1199 zaptemp (char const *name)
1201 struct tempnode *volatile *pnode;
1202 struct tempnode *node;
1203 struct tempnode *next;
1205 int unlink_errno = 0;
1206 struct cs_status cs;
1208 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
1211 if (node->state == UNREAPED)
1212 wait_proc (node->pid);
1214 /* Unlink the temporary file in a critical section to avoid races. */
1217 unlink_status = unlink (name);
1218 unlink_errno = errno;
1222 if (unlink_status != 0)
1223 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
1229 #if HAVE_NL_LANGINFO
1232 struct_month_cmp (void const *m1, void const *m2)
1234 struct month const *month1 = m1;
1235 struct month const *month2 = m2;
1236 return strcmp (month1->name, month2->name);
1241 /* Initialize the character class tables. */
1248 for (i = 0; i < UCHAR_LIM; ++i)
1250 blanks[i] = !! isblank (i);
1251 nonprinting[i] = ! isprint (i);
1252 nondictionary[i] = ! isalnum (i) && ! isblank (i);
1253 fold_toupper[i] = toupper (i);
1256 #if HAVE_NL_LANGINFO
1257 /* If we're not in the "C" locale, read different names for months. */
1260 for (i = 0; i < MONTHS_PER_YEAR; i++)
1267 s = nl_langinfo (ABMON_1 + i);
1269 monthtab[i].name = name = xmalloc (s_len + 1);
1270 monthtab[i].val = i + 1;
1272 for (j = k = 0; j < s_len; j++)
1273 if (! isblank (to_uchar (s[j])))
1274 name[k++] = fold_toupper[to_uchar (s[j])];
1277 qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
1282 /* Specify how many inputs may be merged at once.
1283 This may be set on the command-line with the
1284 --batch-size option. */
1286 specify_nmerge (int oi, char c, char const *s)
1289 struct rlimit rlimit;
1290 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);
1292 /* Try to find out how many file descriptors we'll be able
1293 to open. We need at least nmerge + 3 (STDIN_FILENO,
1294 STDOUT_FILENO and STDERR_FILENO). */
1295 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
1300 if (e == LONGINT_OK)
1304 e = LONGINT_OVERFLOW;
1309 error (0, 0, _("invalid --%s argument %s"),
1310 long_options[oi].name, quote (s));
1311 error (SORT_FAILURE, 0,
1312 _("minimum --%s argument is %s"),
1313 long_options[oi].name, quote ("2"));
1315 else if (max_nmerge < nmerge)
1317 e = LONGINT_OVERFLOW;
1324 if (e == LONGINT_OVERFLOW)
1326 char max_nmerge_buf[INT_BUFSIZE_BOUND (max_nmerge)];
1327 error (0, 0, _("--%s argument %s too large"),
1328 long_options[oi].name, quote (s));
1329 error (SORT_FAILURE, 0,
1330 _("maximum --%s argument with current rlimit is %s"),
1331 long_options[oi].name,
1332 uinttostr (max_nmerge, max_nmerge_buf));
1335 xstrtol_fatal (e, oi, c, long_options, s);
1338 /* Specify the amount of main memory to use when sorting. */
1340 specify_sort_size (int oi, char c, char const *s)
1344 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1346 /* The default unit is KiB. */
1347 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1349 if (n <= UINTMAX_MAX / 1024)
1352 e = LONGINT_OVERFLOW;
1355 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1356 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1365 double mem = physmem_total () * n / 100;
1367 /* Use "<", not "<=", to avoid problems with rounding. */
1368 if (mem < UINTMAX_MAX)
1374 e = LONGINT_OVERFLOW;
1379 if (e == LONGINT_OK)
1381 /* If multiple sort sizes are specified, take the maximum, so
1382 that option order does not matter. */
1389 sort_size = MAX (sort_size, MIN_SORT_SIZE);
1393 e = LONGINT_OVERFLOW;
1396 xstrtol_fatal (e, oi, c, long_options, s);
1399 /* Specify the number of threads to spawn during internal sort. */
1401 specify_nthreads (int oi, char c, char const *s)
1403 unsigned long int nthreads;
1404 enum strtol_error e = xstrtoul (s, NULL, 10, &nthreads, "");
1405 if (e == LONGINT_OVERFLOW)
1407 if (e != LONGINT_OK)
1408 xstrtol_fatal (e, oi, c, long_options, s);
1409 if (SIZE_MAX < nthreads)
1410 nthreads = SIZE_MAX;
1412 error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
1416 /* Return the default sort size. */
1418 default_sort_size (void)
1420 /* Let SIZE be MEM, but no more than the maximum object size or
1421 system resource limits. Don't bother to check for values like
1422 RLIM_INFINITY since in practice they are not much less than SIZE_MAX. */
1423 size_t size = SIZE_MAX;
1424 struct rlimit rlimit;
1425 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1426 size = rlimit.rlim_cur;
1428 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1429 size = rlimit.rlim_cur;
1432 /* Leave a large safety margin for the above limits, as failure can
1433 occur when they are exceeded. */
1437 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1438 Exceeding RSS is not fatal, but can be quite slow. */
1439 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1440 size = rlimit.rlim_cur / 16 * 15;
1443 /* Let MEM be available memory or 1/8 of total memory, whichever
1445 double avail = physmem_available ();
1446 double total = physmem_total ();
1447 double mem = MAX (avail, total / 8);
1449 /* Return the minimum of MEM and SIZE, but no less than
1450 MIN_SORT_SIZE. Avoid the MIN macro here, as it is not quite
1451 right when only one argument is floating point. */
1454 return MAX (size, MIN_SORT_SIZE);
1457 /* Return the sort buffer size to use with the input files identified
1458 by FPS and FILES, which are alternate names of the same files.
1459 NFILES gives the number of input files; NFPS may be less. Assume
1460 that each input line requires LINE_BYTES extra bytes' worth of line
1461 information. Do not exceed the size bound specified by the user
1462 (or a default size bound, if the user does not specify one). */
1465 sort_buffer_size (FILE *const *fps, size_t nfps,
1466 char *const *files, size_t nfiles,
1469 /* A bound on the input size. If zero, the bound hasn't been
1471 static size_t size_bound;
1473 /* In the worst case, each input byte is a newline. */
1474 size_t worst_case_per_input_byte = line_bytes + 1;
1476 /* Keep enough room for one extra input line and an extra byte.
1477 This extra room might be needed when preparing to read EOF. */
1478 size_t size = worst_case_per_input_byte + 1;
1482 for (i = 0; i < nfiles; i++)
1488 if ((i < nfps ? fstat (fileno (fps[i]), &st)
1489 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1490 : stat (files[i], &st))
1492 die (_("stat failed"), files[i]);
1494 if (S_ISREG (st.st_mode))
1495 file_size = st.st_size;
1498 /* The file has unknown size. If the user specified a sort
1499 buffer size, use that; otherwise, guess the size. */
1502 file_size = INPUT_FILE_SIZE_GUESS;
1507 size_bound = sort_size;
1509 size_bound = default_sort_size ();
1512 /* Add the amount of memory needed to represent the worst case
1513 where the input consists entirely of newlines followed by a
1514 single non-newline. Check for overflow. */
1515 worst_case = file_size * worst_case_per_input_byte + 1;
1516 if (file_size != worst_case / worst_case_per_input_byte
1517 || size_bound - size <= worst_case)
1525 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1526 must be at least sizeof (struct line). Allocate ALLOC bytes
1530 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1532 /* Ensure that the line array is properly aligned. If the desired
1533 size cannot be allocated, repeatedly halve it until allocation
1534 succeeds. The smaller allocation may hurt overall performance,
1535 but that's better than failing. */
1538 alloc += sizeof (struct line) - alloc % sizeof (struct line);
1539 buf->buf = malloc (alloc);
1543 if (alloc <= line_bytes + 1)
1547 buf->line_bytes = line_bytes;
1549 buf->used = buf->left = buf->nlines = 0;
1553 /* Return one past the limit of the line array. */
1555 static inline struct line *
1556 buffer_linelim (struct buffer const *buf)
1558 return (struct line *) (buf->buf + buf->alloc);
1561 /* Return a pointer to the first character of the field specified
1565 begfield (struct line const *line, struct keyfield const *key)
1567 char *ptr = line->text, *lim = ptr + line->length - 1;
1568 size_t sword = key->sword;
1569 size_t schar = key->schar;
1571 /* The leading field separator itself is included in a field when -t
1574 if (tab != TAB_DEFAULT)
1575 while (ptr < lim && sword--)
1577 while (ptr < lim && *ptr != tab)
1583 while (ptr < lim && sword--)
1585 while (ptr < lim && blanks[to_uchar (*ptr)])
1587 while (ptr < lim && !blanks[to_uchar (*ptr)])
1591 /* If we're ignoring leading blanks when computing the Start
1592 of the field, skip past them here. */
1593 if (key->skipsblanks)
1594 while (ptr < lim && blanks[to_uchar (*ptr)])
1597 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1598 ptr = MIN (lim, ptr + schar);
1603 /* Return the limit of (a pointer to the first character after) the field
1604 in LINE specified by KEY. */
1607 limfield (struct line const *line, struct keyfield const *key)
1609 char *ptr = line->text, *lim = ptr + line->length - 1;
1610 size_t eword = key->eword, echar = key->echar;
1613 eword++; /* Skip all of end field. */
1615 /* Move PTR past EWORD fields or to one past the last byte on LINE,
1616 whichever comes first. If there are more than EWORD fields, leave
1617 PTR pointing at the beginning of the field having zero-based index,
1618 EWORD. If a delimiter character was specified (via -t), then that
1619 'beginning' is the first character following the delimiting TAB.
1620 Otherwise, leave PTR pointing at the first 'blank' character after
1621 the preceding field. */
1622 if (tab != TAB_DEFAULT)
1623 while (ptr < lim && eword--)
1625 while (ptr < lim && *ptr != tab)
1627 if (ptr < lim && (eword || echar))
1631 while (ptr < lim && eword--)
1633 while (ptr < lim && blanks[to_uchar (*ptr)])
1635 while (ptr < lim && !blanks[to_uchar (*ptr)])
1639 #ifdef POSIX_UNSPECIFIED
1640 /* The following block of code makes GNU sort incompatible with
1641 standard Unix sort, so it's ifdef'd out for now.
1642 The POSIX spec isn't clear on how to interpret this.
1643 FIXME: request clarification.
1645 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1646 Date: Thu, 30 May 96 12:20:41 -0400
1647 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1649 [...]I believe I've found another bug in 'sort'.
1654 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1657 $ /bin/sort -k1.7,1.7 </tmp/sort.in
1661 Unix sort produced the answer I expected: sort on the single character
1662 in column 7. GNU sort produced different results, because it disagrees
1663 on the interpretation of the key-end spec "M.N". Unix sort reads this
1664 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1665 "skip M-1 fields, then either N-1 characters or the rest of the current
1666 field, whichever comes first". This extra clause applies only to
1667 key-ends, not key-starts.
1670 /* Make LIM point to the end of (one byte past) the current field. */
1671 if (tab != TAB_DEFAULT)
1674 newlim = memchr (ptr, tab, lim - ptr);
1682 while (newlim < lim && blanks[to_uchar (*newlim)])
1684 while (newlim < lim && !blanks[to_uchar (*newlim)])
1690 if (echar != 0) /* We need to skip over a portion of the end field. */
1692 /* If we're ignoring leading blanks when computing the End
1693 of the field, skip past them here. */
1694 if (key->skipeblanks)
1695 while (ptr < lim && blanks[to_uchar (*ptr)])
1698 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1699 ptr = MIN (lim, ptr + echar);
1705 /* Fill BUF reading from FP, moving buf->left bytes from the end
1706 of buf->buf to the beginning first. If EOF is reached and the
1707 file wasn't terminated by a newline, supply one. Set up BUF's line
1708 table too. FILE is the name of the file corresponding to FP.
1709 Return true if some input was read. */
1712 fillbuf (struct buffer *buf, FILE *fp, char const *file)
1714 struct keyfield const *key = keylist;
1716 size_t line_bytes = buf->line_bytes;
1717 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1722 if (buf->used != buf->left)
1724 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1725 buf->used = buf->left;
1731 char *ptr = buf->buf + buf->used;
1732 struct line *linelim = buffer_linelim (buf);
1733 struct line *line = linelim - buf->nlines;
1734 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1735 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1737 while (line_bytes + 1 < avail)
1739 /* Read as many bytes as possible, but do not read so many
1740 bytes that there might not be enough room for the
1741 corresponding line array. The worst case is when the
1742 rest of the input file consists entirely of newlines,
1743 except that the last byte is not a newline. */
1744 size_t readsize = (avail - 1) / (line_bytes + 1);
1745 size_t bytes_read = fread (ptr, 1, readsize, fp);
1746 char *ptrlim = ptr + bytes_read;
1748 avail -= bytes_read;
1750 if (bytes_read != readsize)
1753 die (_("read failed"), file);
1757 if (buf->buf == ptrlim)
1759 if (line_start != ptrlim && ptrlim[-1] != eol)
1764 /* Find and record each line in the just-read input. */
1765 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1767 /* Delimit the line with NUL. This eliminates the need to
1768 temporarily replace the last byte with NUL when calling
1769 xmemcoll(), which increases performance. */
1773 line->text = line_start;
1774 line->length = ptr - line_start;
1775 mergesize = MAX (mergesize, line->length);
1776 avail -= line_bytes;
1780 /* Precompute the position of the first key for
1782 line->keylim = (key->eword == SIZE_MAX
1784 : limfield (line, key));
1786 if (key->sword != SIZE_MAX)
1787 line->keybeg = begfield (line, key);
1790 if (key->skipsblanks)
1791 while (blanks[to_uchar (*line_start)])
1793 line->keybeg = line_start;
1805 buf->used = ptr - buf->buf;
1806 buf->nlines = buffer_linelim (buf) - line;
1807 if (buf->nlines != 0)
1809 buf->left = ptr - line_start;
1810 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1815 /* The current input line is too long to fit in the buffer.
1816 Double the buffer size and try again, keeping it properly
1818 size_t line_alloc = buf->alloc / sizeof (struct line);
1819 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1820 buf->alloc = line_alloc * sizeof (struct line);
1825 /* Table that maps characters to order-of-magnitude values. */
1826 static char const unit_order[UCHAR_LIM] =
1828 #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
1829 && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
1830 /* This initializer syntax works on all C99 hosts. For now, use
1831 it only on non-ASCII hosts, to ease the pain of porting to
1832 pre-C99 ASCII hosts. */
1833 ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
1836 /* Generate the following table with this command:
1837 perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
1838 foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
1840 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1841 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1842 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
1843 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
1844 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1845 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1846 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1847 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1848 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1849 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1850 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1854 /* Return an integer that represents the order of magnitude of the
1855 unit following the number. The number may contain thousands
1856 separators and a decimal point, but it may not contain leading blanks.
1857 Negative numbers get negative orders; zero numbers have a zero order. */
1859 static int _GL_ATTRIBUTE_PURE
1860 find_unit_order (char const *number)
1862 bool minus_sign = (*number == '-');
1863 char const *p = number + minus_sign;
1867 /* Scan to end of number.
1868 Decimals or separators not followed by digits stop the scan.
1869 Numbers ending in decimals or separators are thus considered
1870 to be lacking in units.
1871 FIXME: add support for multibyte thousands_sep and decimal_point. */
1875 while (ISDIGIT (ch = *p++))
1876 nonzero |= ch - '0';
1878 while (ch == thousands_sep);
1880 if (ch == decimal_point)
1881 while (ISDIGIT (ch = *p++))
1882 nonzero |= ch - '0';
1886 int order = unit_order[ch];
1887 return (minus_sign ? -order : order);
1893 /* Compare numbers A and B ending in units with SI or IEC prefixes
1894 <none/unknown> < K/k < M < G < T < P < E < Z < Y */
1897 human_numcompare (char const *a, char const *b)
1899 while (blanks[to_uchar (*a)])
1901 while (blanks[to_uchar (*b)])
1904 int diff = find_unit_order (a) - find_unit_order (b);
1905 return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
1908 /* Compare strings A and B as numbers without explicitly converting them to
1909 machine numbers. Comparatively slow for short strings, but asymptotically
1913 numcompare (char const *a, char const *b)
1915 while (blanks[to_uchar (*a)])
1917 while (blanks[to_uchar (*b)])
1920 return strnumcmp (a, b, decimal_point, thousands_sep);
1923 /* Work around a problem whereby the long double value returned by glibc's
1924 strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
1925 A and B before calling strtold. FIXME: remove this function once
1926 gnulib guarantees that strtold's result is always well defined. */
1928 nan_compare (char const *sa, char const *sb)
1931 memset (&a, 0, sizeof a);
1932 a = strtold (sa, NULL);
1935 memset (&b, 0, sizeof b);
1936 b = strtold (sb, NULL);
1938 return memcmp (&a, &b, sizeof a);
1942 general_numcompare (char const *sa, char const *sb)
1944 /* FIXME: maybe add option to try expensive FP conversion
1945 only if A and B can't be compared more cheaply/accurately. */
1949 long_double a = strtold (sa, &ea);
1950 long_double b = strtold (sb, &eb);
1952 /* Put conversion errors at the start of the collating sequence. */
1954 return sb == eb ? 0 : -1;
1958 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1959 conversion errors but before numbers; sort them by internal
1960 bit-pattern, for lack of a more portable alternative. */
1966 : nan_compare (sa, sb));
1969 /* Return an integer in 1..12 of the month name MONTH.
1970 Return 0 if the name in S is not recognized. */
1973 getmonth (char const *month, char **ea)
1976 size_t hi = MONTHS_PER_YEAR;
1978 while (blanks[to_uchar (*month)])
1983 size_t ix = (lo + hi) / 2;
1984 char const *m = month;
1985 char const *n = monthtab[ix].name;
1993 return monthtab[ix].val;
1995 if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
2000 else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
2012 /* A randomly chosen MD5 state, used for random comparison. */
2013 static struct md5_ctx random_md5_state;
2015 /* Initialize the randomly chosen MD5 state. */
2018 random_md5_state_init (char const *random_source)
2020 unsigned char buf[MD5_DIGEST_SIZE];
2021 struct randread_source *r = randread_new (random_source, sizeof buf);
2023 die (_("open failed"), random_source);
2024 randread (r, buf, sizeof buf);
2025 if (randread_free (r) != 0)
2026 die (_("close failed"), random_source);
2027 md5_init_ctx (&random_md5_state);
2028 md5_process_bytes (buf, sizeof buf, &random_md5_state);
2031 /* This is like strxfrm, except it reports any error and exits. */
2034 xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
2037 size_t translated_size = strxfrm (dest, src, destsize);
2041 error (0, errno, _("string transformation failed"));
2042 error (0, 0, _("set LC_ALL='C' to work around the problem"));
2043 error (SORT_FAILURE, 0,
2044 _("the untransformed string was %s"),
2045 quotearg_n_style (0, locale_quoting_style, src));
2048 return translated_size;
2051 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
2052 using one or more random hash functions. TEXTA[LENA] and
2053 TEXTB[LENB] must be zero. */
2056 compare_random (char *restrict texta, size_t lena,
2057 char *restrict textb, size_t lenb)
2059 /* XFRM_DIFF records the equivalent of memcmp on the transformed
2060 data. This is used to break ties if there is a checksum
2061 collision, and this is good enough given the astronomically low
2062 probability of a collision. */
2065 char stackbuf[4000];
2066 char *buf = stackbuf;
2067 size_t bufsize = sizeof stackbuf;
2068 void *allocated = NULL;
2069 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
2070 struct md5_ctx s[2];
2071 s[0] = s[1] = random_md5_state;
2073 if (hard_LC_COLLATE)
2075 char const *lima = texta + lena;
2076 char const *limb = textb + lenb;
2080 /* Transform the text into the basis of comparison, so that byte
2081 strings that would otherwise considered to be equal are
2082 considered equal here even if their bytes differ.
2084 Each time through this loop, transform one
2085 null-terminated string's worth from TEXTA or from TEXTB
2086 or both. That way, there's no need to store the
2087 transformation of the whole line, if it contains many
2088 null-terminated strings. */
2090 /* Store the transformed data into a big-enough buffer. */
2092 /* A 3X size guess avoids the overhead of calling strxfrm
2093 twice on typical implementations. Don't worry about
2094 size_t overflow, as the guess need not be correct. */
2095 size_t guess_bufsize = 3 * (lena + lenb) + 2;
2096 if (bufsize < guess_bufsize)
2098 bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
2100 buf = allocated = malloc (bufsize);
2104 bufsize = sizeof stackbuf;
2109 (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
2110 bool a_fits = sizea <= bufsize;
2113 ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
2114 (a_fits ? bufsize - sizea : 0))
2118 if (! (a_fits && sizea + sizeb <= bufsize))
2120 bufsize = sizea + sizeb;
2121 if (bufsize < SIZE_MAX / 3)
2122 bufsize = bufsize * 3 / 2;
2124 buf = allocated = xmalloc (bufsize);
2126 strxfrm (buf, texta, sizea);
2128 strxfrm (buf + sizea, textb, sizeb);
2131 /* Advance past NULs to the next part of each input string,
2132 exiting the loop if both strings are exhausted. When
2133 exiting the loop, prepare to finish off the tiebreaker
2134 comparison properly. */
2136 texta += strlen (texta) + 1;
2138 textb += strlen (textb) + 1;
2139 if (! (texta < lima || textb < limb))
2141 lena = sizea; texta = buf;
2142 lenb = sizeb; textb = buf + sizea;
2146 /* Accumulate the transformed data in the corresponding
2148 md5_process_bytes (buf, sizea, &s[0]);
2149 md5_process_bytes (buf + sizea, sizeb, &s[1]);
2151 /* Update the tiebreaker comparison of the transformed data. */
2154 xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
2156 xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
2161 /* Compute and compare the checksums. */
2162 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
2163 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
2164 int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
2166 /* Fall back on the tiebreaker if the checksums collide. */
2171 xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
2173 xfrm_diff = (lena > lenb) - (lena < lenb);
2184 /* Return the printable width of the block of memory starting at
2185 TEXT and ending just before LIM, counting each tab as one byte.
2186 FIXME: Should we generally be counting non printable chars? */
2189 debug_width (char const *text, char const *lim)
2191 size_t width = mbsnwidth (text, lim - text, 0);
2193 width += (*text++ == '\t');
2197 /* For debug mode, "underline" a key at the
2198 specified offset and screen width. */
2201 mark_key (size_t offset, size_t width)
2207 printf (_("^ no match for key\n"));
2218 /* Return true if KEY is a numeric key. */
2221 key_numeric (struct keyfield const *key)
2223 return key->numeric || key->general_numeric || key->human_numeric;
2226 /* For LINE, output a debugging line that underlines KEY in LINE.
2227 If KEY is null, underline the whole line. */
2230 debug_key (struct line const *line, struct keyfield const *key)
2232 char *text = line->text;
2234 char *lim = text + line->length - 1;
2238 if (key->sword != SIZE_MAX)
2239 beg = begfield (line, key);
2240 if (key->eword != SIZE_MAX)
2241 lim = limfield (line, key);
2243 if (key->skipsblanks || key->month || key_numeric (key))
2248 while (blanks[to_uchar (*beg)])
2251 char *tighter_lim = beg;
2255 else if (key->month)
2256 getmonth (beg, &tighter_lim);
2257 else if (key->general_numeric)
2258 ignore_value (strtold (beg, &tighter_lim));
2259 else if (key->numeric || key->human_numeric)
2261 char *p = beg + (beg < lim && *beg == '-');
2262 bool found_digit = false;
2267 while (ISDIGIT (ch = *p++))
2270 while (ch == thousands_sep);
2272 if (ch == decimal_point)
2273 while (ISDIGIT (ch = *p++))
2277 tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
2287 size_t offset = debug_width (text, beg);
2288 size_t width = debug_width (beg, lim);
2289 mark_key (offset, width);
2292 /* Debug LINE by underlining its keys. */
2295 debug_line (struct line const *line)
2297 struct keyfield const *key = keylist;
2300 debug_key (line, key);
2301 while (key && ((key = key->next) || ! (unique || stable)));
2304 /* Return whether sorting options specified for key. */
2307 default_key_compare (struct keyfield const *key)
2309 return ! (key->ignore
2313 || key_numeric (key)
2317 /* || key->reverse */
2321 /* Convert a key to the short options used to specify it. */
2324 key_to_opts (struct keyfield const *key, char *opts)
2326 if (key->skipsblanks || key->skipeblanks)
2327 *opts++ = 'b';/* either disables global -b */
2328 if (key->ignore == nondictionary)
2332 if (key->general_numeric)
2334 if (key->human_numeric)
2336 if (key->ignore == nonprinting)
2351 /* Output data independent key warnings to stderr. */
2354 key_warnings (struct keyfield const *gkey, bool gkey_only)
2356 struct keyfield const *key;
2357 struct keyfield ugkey = *gkey;
2358 unsigned long keynum = 1;
2360 for (key = keylist; key; key = key->next, keynum++)
2362 if (key->obsolete_used)
2364 size_t sword = key->sword;
2365 size_t eword = key->eword;
2366 char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
2367 /* obsolescent syntax +A.x -B.y is equivalent to:
2368 -k A+1.x+1,B.y (when y = 0)
2369 -k A+1.x+1,B+1.y (when y > 0) */
2370 char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
2371 char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
2375 if (sword == SIZE_MAX)
2378 po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
2379 pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
2380 if (key->eword != SIZE_MAX)
2382 stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
2383 stpcpy (stpcpy (pn, ","),
2384 umaxtostr (eword + 1
2385 + (key->echar == SIZE_MAX), tmp));
2387 error (0, 0, _("obsolescent key %s used; consider %s instead"),
2388 quote_n (0, obuf), quote_n (1, nbuf));
2391 /* Warn about field specs that will never match. */
2392 if (key->sword != SIZE_MAX && key->eword < key->sword)
2393 error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
2395 /* Warn about significant leading blanks. */
2396 bool implicit_skip = key_numeric (key) || key->month;
2397 bool maybe_space_aligned = !hard_LC_COLLATE && default_key_compare (key)
2398 && !(key->schar || key->echar);
2399 bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
2400 if (!gkey_only && tab == TAB_DEFAULT && !line_offset
2401 && ((!key->skipsblanks && !(implicit_skip || maybe_space_aligned))
2402 || (!key->skipsblanks && key->schar)
2403 || (!key->skipeblanks && key->echar)))
2404 error (0, 0, _("leading blanks are significant in key %lu; "
2405 "consider also specifying 'b'"), keynum);
2407 /* Warn about numeric comparisons spanning fields,
2408 as field delimiters could be interpreted as part
2409 of the number (maybe only in other locales). */
2410 if (!gkey_only && key_numeric (key))
2412 size_t sword = key->sword + 1;
2413 size_t eword = key->eword + 1;
2416 if (!eword || sword < eword)
2417 error (0, 0, _("key %lu is numeric and spans multiple fields"),
2421 /* Flag global options not copied or specified in any key. */
2422 if (ugkey.ignore && (ugkey.ignore == key->ignore))
2423 ugkey.ignore = NULL;
2424 if (ugkey.translate && (ugkey.translate == key->translate))
2425 ugkey.translate = NULL;
2426 ugkey.skipsblanks &= !key->skipsblanks;
2427 ugkey.skipeblanks &= !key->skipeblanks;
2428 ugkey.month &= !key->month;
2429 ugkey.numeric &= !key->numeric;
2430 ugkey.general_numeric &= !key->general_numeric;
2431 ugkey.human_numeric &= !key->human_numeric;
2432 ugkey.random &= !key->random;
2433 ugkey.version &= !key->version;
2434 ugkey.reverse &= !key->reverse;
2437 /* Warn about ignored global options flagged above.
2438 Note if gkey is the only one in the list, all flags are cleared. */
2439 if (!default_key_compare (&ugkey)
2440 || (ugkey.reverse && (stable || unique) && keylist))
2442 bool ugkey_reverse = ugkey.reverse;
2443 if (!(stable || unique))
2444 ugkey.reverse = false;
2445 /* The following is too big, but guaranteed to be "big enough". */
2446 char opts[sizeof short_options];
2447 key_to_opts (&ugkey, opts);
2449 ngettext ("option '-%s' is ignored",
2450 "options '-%s' are ignored",
2451 select_plural (strlen (opts))), opts);
2452 ugkey.reverse = ugkey_reverse;
2454 if (ugkey.reverse && !(stable || unique) && keylist)
2455 error (0, 0, _("option '-r' only applies to last-resort comparison"));
2458 /* Compare two lines A and B trying every key in sequence until there
2459 are no more keys or a difference is found. */
2462 keycompare (struct line const *a, struct line const *b)
2464 struct keyfield *key = keylist;
2466 /* For the first iteration only, the key positions have been
2467 precomputed for us. */
2468 char *texta = a->keybeg;
2469 char *textb = b->keybeg;
2470 char *lima = a->keylim;
2471 char *limb = b->keylim;
2477 char const *translate = key->translate;
2478 bool const *ignore = key->ignore;
2480 /* Treat field ends before field starts as empty fields. */
2481 lima = MAX (texta, lima);
2482 limb = MAX (textb, limb);
2484 /* Find the lengths. */
2485 size_t lena = lima - texta;
2486 size_t lenb = limb - textb;
2488 if (hard_LC_COLLATE || key_numeric (key)
2489 || key->month || key->random || key->version)
2496 char enda IF_LINT (= 0);
2497 char endb IF_LINT (= 0);
2498 void *allocated IF_LINT (= NULL);
2499 char stackbuf[4000];
2501 if (ignore || translate)
2503 /* Compute with copies of the keys, which are the result of
2504 translating or ignoring characters, and which need their
2509 /* Allocate space for copies. */
2510 size_t size = lena + 1 + lenb + 1;
2511 if (size <= sizeof stackbuf)
2512 ta = stackbuf, allocated = NULL;
2514 ta = allocated = xmalloc (size);
2517 /* Put into each copy a version of the key in which the
2518 requested characters are ignored or translated. */
2519 for (tlena = i = 0; i < lena; i++)
2520 if (! (ignore && ignore[to_uchar (texta[i])]))
2521 ta[tlena++] = (translate
2522 ? translate[to_uchar (texta[i])]
2526 for (tlenb = i = 0; i < lenb; i++)
2527 if (! (ignore && ignore[to_uchar (textb[i])]))
2528 tb[tlenb++] = (translate
2529 ? translate[to_uchar (textb[i])]
2535 /* Use the keys in-place, temporarily null-terminated. */
2536 ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
2537 tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
2541 diff = numcompare (ta, tb);
2542 else if (key->general_numeric)
2543 diff = general_numcompare (ta, tb);
2544 else if (key->human_numeric)
2545 diff = human_numcompare (ta, tb);
2546 else if (key->month)
2547 diff = getmonth (ta, NULL) - getmonth (tb, NULL);
2548 else if (key->random)
2549 diff = compare_random (ta, tlena, tb, tlenb);
2550 else if (key->version)
2551 diff = filevercmp (ta, tb);
2554 /* Locale-dependent string sorting. This is slower than
2555 C-locale sorting, which is implemented below. */
2557 diff = - NONZERO (tlenb);
2558 else if (tlenb == 0)
2561 diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
2564 if (ignore || translate)
2574 #define CMP_WITH_IGNORE(A, B) \
2579 while (texta < lima && ignore[to_uchar (*texta)]) \
2581 while (textb < limb && ignore[to_uchar (*textb)]) \
2583 if (! (texta < lima && textb < limb)) \
2585 diff = to_uchar (A) - to_uchar (B); \
2592 diff = (texta < lima) - (textb < limb); \
2597 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
2598 translate[to_uchar (*textb)]);
2600 CMP_WITH_IGNORE (*texta, *textb);
2603 diff = - NONZERO (lenb);
2610 while (texta < lima && textb < limb)
2612 diff = (to_uchar (translate[to_uchar (*texta++)])
2613 - to_uchar (translate[to_uchar (*textb++)]));
2620 diff = memcmp (texta, textb, MIN (lena, lenb));
2624 diff = lena < lenb ? -1 : lena != lenb;
2634 /* Find the beginning and limit of the next field. */
2635 if (key->eword != SIZE_MAX)
2636 lima = limfield (a, key), limb = limfield (b, key);
2638 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
2640 if (key->sword != SIZE_MAX)
2641 texta = begfield (a, key), textb = begfield (b, key);
2644 texta = a->text, textb = b->text;
2645 if (key->skipsblanks)
2647 while (texta < lima && blanks[to_uchar (*texta)])
2649 while (textb < limb && blanks[to_uchar (*textb)])
2660 return key->reverse ? -diff : diff;
2663 /* Compare two lines A and B, returning negative, zero, or positive
2664 depending on whether A compares less than, equal to, or greater than B. */
2667 compare (struct line const *a, struct line const *b)
2672 /* First try to compare on the specified keys (if any).
2673 The only two cases with no key at all are unadorned sort,
2674 and unadorned sort -r. */
2677 diff = keycompare (a, b);
2678 if (diff || unique || stable)
2682 /* If the keys all compare equal (or no keys were specified)
2683 fall through to the default comparison. */
2684 alen = a->length - 1, blen = b->length - 1;
2687 diff = - NONZERO (blen);
2690 else if (hard_LC_COLLATE)
2692 /* Note xmemcoll0 is a performance enhancement as
2693 it will not unconditionally write '\0' after the
2694 passed in buffers, which was seen to give around
2695 a 3% increase in performance for short lines. */
2696 diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
2698 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
2699 diff = alen < blen ? -1 : alen != blen;
2701 return reverse ? -diff : diff;
2704 /* Write LINE to output stream FP; the output file's name is
2705 OUTPUT_FILE if OUTPUT_FILE is nonnull, and is the standard output
2706 otherwise. If debugging is enabled and FP is standard output,
2707 append some debugging information. */
2710 write_line (struct line const *line, FILE *fp, char const *output_file)
2712 char *buf = line->text;
2713 size_t n_bytes = line->length;
2714 char *ebuf = buf + n_bytes;
2716 if (!output_file && debug)
2718 /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
2719 char const *c = buf;
2728 if (fputc (wc, fp) == EOF)
2729 die (_("write failed"), output_file);
2737 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
2738 die (_("write failed"), output_file);
2743 /* Check that the lines read from FILE_NAME come in order. Return
2744 true if they are in order. If CHECKONLY == 'c', also print a
2745 diagnostic (FILE_NAME, line number, contents of line) to stderr if
2746 they are not in order. */
2749 check (char const *file_name, char checkonly)
2751 FILE *fp = xfopen (file_name, "r");
2752 struct buffer buf; /* Input buffer. */
2753 struct line temp; /* Copy of previous line. */
2755 uintmax_t line_number = 0;
2756 struct keyfield const *key = keylist;
2757 bool nonunique = ! unique;
2758 bool ordered = true;
2760 initbuf (&buf, sizeof (struct line),
2761 MAX (merge_buffer_size, sort_size));
2764 while (fillbuf (&buf, fp, file_name))
2766 struct line const *line = buffer_linelim (&buf);
2767 struct line const *linebase = line - buf.nlines;
2769 /* Make sure the line saved from the old buffer contents is
2770 less than or equal to the first line of the new buffer. */
2771 if (alloc && nonunique <= compare (&temp, line - 1))
2775 if (checkonly == 'c')
2777 struct line const *disorder_line = line - 1;
2778 uintmax_t disorder_line_number =
2779 buffer_linelim (&buf) - disorder_line + line_number;
2780 char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)];
2781 fprintf (stderr, _("%s: %s:%s: disorder: "),
2782 program_name, file_name,
2783 umaxtostr (disorder_line_number, hr_buf));
2784 write_line (disorder_line, stderr, _("standard error"));
2792 /* Compare each line in the buffer with its successor. */
2793 while (linebase < --line)
2794 if (nonunique <= compare (line, line - 1))
2795 goto found_disorder;
2797 line_number += buf.nlines;
2799 /* Save the last line of the buffer. */
2800 if (alloc < line->length)
2807 alloc = line->length;
2811 while (alloc < line->length);
2814 temp.text = xmalloc (alloc);
2816 memcpy (temp.text, line->text, line->length);
2817 temp.length = line->length;
2820 temp.keybeg = temp.text + (line->keybeg - line->text);
2821 temp.keylim = temp.text + (line->keylim - line->text);
2825 xfclose (fp, file_name);
2831 /* Open FILES (there are NFILES of them) and store the resulting array
2832 of stream pointers into (*PFPS). Allocate the array. Return the
2833 number of successfully opened files, setting errno if this value is
2834 less than NFILES. */
2837 open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
2839 FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
2842 /* Open as many input files as we can. */
2843 for (i = 0; i < nfiles; i++)
2845 fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
2846 ? open_temp (files[i].temp)
2847 : stream_open (files[i].name, "r"));
2855 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2856 files (all of which are at the start of the FILES array), and
2857 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2858 FPS is the vector of open stream corresponding to the files.
2859 Close input and output streams before returning.
2860 OUTPUT_FILE gives the name of the output file. If it is NULL,
2861 the output file is standard output. */
2864 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2865 FILE *ofp, char const *output_file, FILE **fps)
2867 struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
2868 /* Input buffers for each file. */
2869 struct line saved; /* Saved line storage for unique check. */
2870 struct line const *savedline = NULL;
2871 /* &saved if there is a saved line. */
2872 size_t savealloc = 0; /* Size allocated for the saved line. */
2873 struct line const **cur = xnmalloc (nfiles, sizeof *cur);
2874 /* Current line in each line table. */
2875 struct line const **base = xnmalloc (nfiles, sizeof *base);
2876 /* Base of each line table. */
2877 size_t *ord = xnmalloc (nfiles, sizeof *ord);
2878 /* Table representing a permutation of fps,
2879 such that cur[ord[0]] is the smallest line
2880 and will be next output. */
2884 struct keyfield const *key = keylist;
2887 /* Read initial lines from each input file. */
2888 for (i = 0; i < nfiles; )
2890 initbuf (&buffer[i], sizeof (struct line),
2891 MAX (merge_buffer_size, sort_size / nfiles));
2892 if (fillbuf (&buffer[i], fps[i], files[i].name))
2894 struct line const *linelim = buffer_linelim (&buffer[i]);
2895 cur[i] = linelim - 1;
2896 base[i] = linelim - buffer[i].nlines;
2901 /* fps[i] is empty; eliminate it from future consideration. */
2902 xfclose (fps[i], files[i].name);
2906 zaptemp (files[i].name);
2908 free (buffer[i].buf);
2910 for (j = i; j < nfiles; ++j)
2912 files[j] = files[j + 1];
2913 fps[j] = fps[j + 1];
2918 /* Set up the ord table according to comparisons among input lines.
2919 Since this only reorders two items if one is strictly greater than
2920 the other, it is stable. */
2921 for (i = 0; i < nfiles; ++i)
2923 for (i = 1; i < nfiles; ++i)
2924 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2925 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2927 /* Repeatedly output the smallest line until no input remains. */
2930 struct line const *smallest = cur[ord[0]];
2932 /* If uniquified output is turned on, output only the first of
2933 an identical series of lines. */
2936 if (savedline && compare (savedline, smallest))
2939 write_line (&saved, ofp, output_file);
2944 if (savealloc < smallest->length)
2949 savealloc = smallest->length;
2952 while ((savealloc *= 2) < smallest->length);
2955 saved.text = xmalloc (savealloc);
2957 saved.length = smallest->length;
2958 memcpy (saved.text, smallest->text, saved.length);
2962 saved.text + (smallest->keybeg - smallest->text);
2964 saved.text + (smallest->keylim - smallest->text);
2969 write_line (smallest, ofp, output_file);
2971 /* Check if we need to read more lines into core. */
2972 if (base[ord[0]] < smallest)
2973 cur[ord[0]] = smallest - 1;
2976 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2978 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2979 cur[ord[0]] = linelim - 1;
2980 base[ord[0]] = linelim - buffer[ord[0]].nlines;
2984 /* We reached EOF on fps[ord[0]]. */
2985 for (i = 1; i < nfiles; ++i)
2986 if (ord[i] > ord[0])
2989 xfclose (fps[ord[0]], files[ord[0]].name);
2990 if (ord[0] < ntemps)
2993 zaptemp (files[ord[0]].name);
2995 free (buffer[ord[0]].buf);
2996 for (i = ord[0]; i < nfiles; ++i)
2998 fps[i] = fps[i + 1];
2999 files[i] = files[i + 1];
3000 buffer[i] = buffer[i + 1];
3001 cur[i] = cur[i + 1];
3002 base[i] = base[i + 1];
3004 for (i = 0; i < nfiles; ++i)
3005 ord[i] = ord[i + 1];
3010 /* The new line just read in may be larger than other lines
3011 already in main memory; push it back in the queue until we
3012 encounter a line larger than it. Optimize for the common
3013 case where the new line is smallest. */
3018 size_t ord0 = ord[0];
3019 size_t count_of_smaller_lines;
3023 int cmp = compare (cur[ord0], cur[ord[probe]]);
3024 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
3028 probe = (lo + hi) / 2;
3031 count_of_smaller_lines = lo - 1;
3032 for (j = 0; j < count_of_smaller_lines; j++)
3033 ord[j] = ord[j + 1];
3034 ord[count_of_smaller_lines] = ord0;
3038 if (unique && savedline)
3040 write_line (&saved, ofp, output_file);
3044 xfclose (ofp, output_file);
3052 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
3053 files (all of which are at the start of the FILES array), and
3054 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
3055 Close input and output files before returning.
3056 OUTPUT_FILE gives the name of the output file.
3058 Return the number of files successfully merged. This number can be
3059 less than NFILES if we ran low on file descriptors, but in this
3060 case it is never less than 2. */
3063 mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
3064 FILE *ofp, char const *output_file)
3067 size_t nopened = open_input_files (files, nfiles, &fps);
3068 if (nopened < nfiles && nopened < 2)
3069 die (_("open failed"), files[nopened].name);
3070 mergefps (files, ntemps, nopened, ofp, output_file, fps);
3074 /* Merge into T (of size NLINES) the two sorted arrays of lines
3075 LO (with NLINES / 2 members), and
3076 T - (NLINES / 2) (with NLINES - NLINES / 2 members).
3077 T and LO point just past their respective arrays, and the arrays
3078 are in reverse order. NLINES must be at least 2. */
3081 mergelines (struct line *restrict t, size_t nlines,
3082 struct line const *restrict lo)
3084 size_t nlo = nlines / 2;
3085 size_t nhi = nlines - nlo;
3086 struct line *hi = t - nlo;
3089 if (compare (lo - 1, hi - 1) <= 0)
3094 /* HI must equal T now, and there is no need to copy from
3113 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
3114 Do this all within one thread. NLINES must be at least 2.
3115 If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
3116 Otherwise the sort is in-place and TEMP is half-sized.
3117 The input and output arrays are in reverse order, and LINES and
3118 TEMP point just past the end of their respective arrays.
3120 Use a recursive divide-and-conquer algorithm, in the style
3121 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
3122 the optimization suggested by exercise 5.2.4-10; this requires room
3123 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
3124 writes that this memory optimization was originally published by
3125 D. A. Bell, Comp J. 1 (1958), 75. */
3128 sequential_sort (struct line *restrict lines, size_t nlines,
3129 struct line *restrict temp, bool to_temp)
3133 /* Declare 'swap' as int, not bool, to work around a bug
3134 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
3135 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
3136 int swap = (0 < compare (&lines[-1], &lines[-2]));
3139 temp[-1] = lines[-1 - swap];
3140 temp[-2] = lines[-2 + swap];
3144 temp[-1] = lines[-1];
3145 lines[-1] = lines[-2];
3146 lines[-2] = temp[-1];
3151 size_t nlo = nlines / 2;
3152 size_t nhi = nlines - nlo;
3153 struct line *lo = lines;
3154 struct line *hi = lines - nlo;
3156 sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
3158 sequential_sort (lo, nlo, temp, !to_temp);
3163 struct line const *sorted_lo;
3174 mergelines (dest, nlines, sorted_lo);
3178 static struct merge_node *init_node (struct merge_node *restrict,
3179 struct merge_node *restrict,
3180 struct line *, size_t, size_t, bool);
3183 /* Create and return a merge tree for NTHREADS threads, sorting NLINES
3184 lines, with destination DEST. */
3185 static struct merge_node *
3186 merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
3188 struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
3190 struct merge_node *root = merge_tree;
3191 root->lo = root->hi = root->end_lo = root->end_hi = NULL;
3193 root->nlo = root->nhi = nlines;
3194 root->parent = NULL;
3195 root->level = MERGE_END;
3196 root->queued = false;
3197 pthread_mutex_init (&root->lock, NULL);
3199 init_node (root, root + 1, dest, nthreads, nlines, false);
3203 /* Destroy the merge tree. */
3205 merge_tree_destroy (struct merge_node *merge_tree)
3210 /* Initialize a merge tree node and its descendants. The node's
3211 parent is PARENT. The node and its descendants are taken from the
3212 array of nodes NODE_POOL. Their destination starts at DEST; they
3213 will consume NTHREADS threads. The total number of sort lines is
3214 TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
3217 static struct merge_node *
3218 init_node (struct merge_node *restrict parent,
3219 struct merge_node *restrict node_pool,
3220 struct line *dest, size_t nthreads,
3221 size_t total_lines, bool is_lo_child)
3223 size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
3224 size_t nlo = nlines / 2;
3225 size_t nhi = nlines - nlo;
3226 struct line *lo = dest - total_lines;
3227 struct line *hi = lo - nlo;
3228 struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
3230 struct merge_node *node = node_pool++;
3231 node->lo = node->end_lo = lo;
3232 node->hi = node->end_hi = hi;
3233 node->dest = parent_end;
3236 node->parent = parent;
3237 node->level = parent->level + 1;
3238 node->queued = false;
3239 pthread_mutex_init (&node->lock, NULL);
3243 size_t lo_threads = nthreads / 2;
3244 size_t hi_threads = nthreads - lo_threads;
3245 node->lo_child = node_pool;
3246 node_pool = init_node (node, node_pool, lo, lo_threads,
3248 node->hi_child = node_pool;
3249 node_pool = init_node (node, node_pool, hi, hi_threads,
3250 total_lines, false);
3254 node->lo_child = NULL;
3255 node->hi_child = NULL;
3261 /* Compare two merge nodes A and B for priority. */
3264 compare_nodes (void const *a, void const *b)
3266 struct merge_node const *nodea = a;
3267 struct merge_node const *nodeb = b;
3268 if (nodea->level == nodeb->level)
3269 return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
3270 return nodea->level < nodeb->level;
3273 /* Lock a merge tree NODE. */
3276 lock_node (struct merge_node *node)
3278 pthread_mutex_lock (&node->lock);
3281 /* Unlock a merge tree NODE. */
3284 unlock_node (struct merge_node *node)
3286 pthread_mutex_unlock (&node->lock);
3289 /* Destroy merge QUEUE. */
3292 queue_destroy (struct merge_node_queue *queue)
3294 heap_free (queue->priority_queue);
3295 pthread_cond_destroy (&queue->cond);
3296 pthread_mutex_destroy (&queue->mutex);
3299 /* Initialize merge QUEUE, allocating space suitable for a maximum of
3300 NTHREADS threads. */
3303 queue_init (struct merge_node_queue *queue, size_t nthreads)
3305 /* Though it's highly unlikely all nodes are in the heap at the same
3306 time, the heap should accommodate all of them. Counting a NULL
3307 dummy head for the heap, reserve 2 * NTHREADS nodes. */
3308 queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
3309 pthread_mutex_init (&queue->mutex, NULL);
3310 pthread_cond_init (&queue->cond, NULL);
3313 /* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
3314 does not need to lock NODE. */
3317 queue_insert (struct merge_node_queue *queue, struct merge_node *node)
3319 pthread_mutex_lock (&queue->mutex);
3320 heap_insert (queue->priority_queue, node);
3321 node->queued = true;
3322 pthread_mutex_unlock (&queue->mutex);
3323 pthread_cond_signal (&queue->cond);
3326 /* Pop the top node off the priority QUEUE, lock the node, return it. */
3328 static struct merge_node *
3329 queue_pop (struct merge_node_queue *queue)
3331 struct merge_node *node;
3332 pthread_mutex_lock (&queue->mutex);
3333 while (! (node = heap_remove_top (queue->priority_queue)))
3334 pthread_cond_wait (&queue->cond, &queue->mutex);
3335 pthread_mutex_unlock (&queue->mutex);
3337 node->queued = false;
3341 /* Output LINE to TFP, unless -u is specified and the line compares
3342 equal to the previous line. TEMP_OUTPUT is the name of TFP, or
3343 is null if TFP is standard output.
3345 This function does not save the line for comparison later, so it is
3346 appropriate only for internal sort. */
3349 write_unique (struct line const *line, FILE *tfp, char const *temp_output)
3351 static struct line saved;
3355 if (saved.text && ! compare (line, &saved))
3360 write_line (line, tfp, temp_output);
3363 /* Merge the lines currently available to a NODE in the binary
3364 merge tree. Merge a number of lines appropriate for this merge
3365 level, assuming TOTAL_LINES is the total number of lines.
3367 If merging at the top level, send output to TFP. TEMP_OUTPUT is
3368 the name of TFP, or is null if TFP is standard output. */
3371 mergelines_node (struct merge_node *restrict node, size_t total_lines,
3372 FILE *tfp, char const *temp_output)
3374 struct line *lo_orig = node->lo;
3375 struct line *hi_orig = node->hi;
3376 size_t to_merge = MAX_MERGE (total_lines, node->level);
3380 if (node->level > MERGE_ROOT)
3382 /* Merge to destination buffer. */
3383 struct line *dest = *node->dest;
3384 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3385 if (compare (node->lo - 1, node->hi - 1) <= 0)
3386 *--dest = *--node->lo;
3388 *--dest = *--node->hi;
3390 merged_lo = lo_orig - node->lo;
3391 merged_hi = hi_orig - node->hi;
3393 if (node->nhi == merged_hi)
3394 while (node->lo != node->end_lo && to_merge--)
3395 *--dest = *--node->lo;
3396 else if (node->nlo == merged_lo)
3397 while (node->hi != node->end_hi && to_merge--)
3398 *--dest = *--node->hi;
3403 /* Merge directly to output. */
3404 while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
3406 if (compare (node->lo - 1, node->hi - 1) <= 0)
3407 write_unique (--node->lo, tfp, temp_output);
3409 write_unique (--node->hi, tfp, temp_output);
3412 merged_lo = lo_orig - node->lo;
3413 merged_hi = hi_orig - node->hi;
3415 if (node->nhi == merged_hi)
3417 while (node->lo != node->end_lo && to_merge--)
3418 write_unique (--node->lo, tfp, temp_output);
3420 else if (node->nlo == merged_lo)
3422 while (node->hi != node->end_hi && to_merge--)
3423 write_unique (--node->hi, tfp, temp_output);
3428 merged_lo = lo_orig - node->lo;
3429 merged_hi = hi_orig - node->hi;
3430 node->nlo -= merged_lo;
3431 node->nhi -= merged_hi;
3434 /* Into QUEUE, insert NODE if it is not already queued, and if one of
3435 NODE's children has available lines and the other either has
3436 available lines or has exhausted its lines. */
3439 queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
3443 bool lo_avail = (node->lo - node->end_lo) != 0;
3444 bool hi_avail = (node->hi - node->end_hi) != 0;
3445 if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
3446 queue_insert (queue, node);
3450 /* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
3453 queue_check_insert_parent (struct merge_node_queue *queue,
3454 struct merge_node *node)
3456 if (node->level > MERGE_ROOT)
3458 lock_node (node->parent);
3459 queue_check_insert (queue, node->parent);
3460 unlock_node (node->parent);
3462 else if (node->nlo + node->nhi == 0)
3464 /* If the MERGE_ROOT NODE has finished merging, insert the
3466 queue_insert (queue, node->parent);
3470 /* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
3471 some of those lines, until the MERGE_END node is popped.
3472 TOTAL_LINES is the total number of lines. If merging at the top
3473 level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
3474 null if TFP is standard output. */
3477 merge_loop (struct merge_node_queue *queue,
3478 size_t total_lines, FILE *tfp, char const *temp_output)
3482 struct merge_node *node = queue_pop (queue);
3484 if (node->level == MERGE_END)
3487 /* Reinsert so other threads can pop it. */
3488 queue_insert (queue, node);
3491 mergelines_node (node, total_lines, tfp, temp_output);
3492 queue_check_insert (queue, node);
3493 queue_check_insert_parent (queue, node);
3500 static void sortlines (struct line *restrict, size_t, size_t,
3501 struct merge_node *, struct merge_node_queue *,
3502 FILE *, char const *);
3504 /* Thread arguments for sortlines_thread. */
3508 /* Source, i.e., the array of lines to sort. This points just past
3509 the end of the array. */
3512 /* Number of threads to use. If 0 or 1, sort single-threaded. */
3515 /* Number of lines in LINES and DEST. */
3516 size_t const total_lines;
3518 /* Merge node. Lines from this node and this node's sibling will merged
3519 to this node's parent. */
3520 struct merge_node *const node;
3522 /* The priority queue controlling available work for the entire
3524 struct merge_node_queue *const queue;
3526 /* If at the top level, the file to output to, and the file's name.
3527 If the file is standard output, the file's name is null. */
3529 char const *output_temp;
3532 /* Like sortlines, except with a signature acceptable to pthread_create. */
3535 sortlines_thread (void *data)
3537 struct thread_args const *args = data;
3538 sortlines (args->lines, args->nthreads, args->total_lines,
3539 args->node, args->queue, args->tfp,
3544 /* Sort lines, possibly in parallel. The arguments are as in struct
3547 The algorithm has three phases: node creation, sequential sort,
3550 During node creation, sortlines recursively visits each node in the
3551 binary merge tree and creates a NODE structure corresponding to all the
3552 future line merging NODE is responsible for. For each call to
3553 sortlines, half the available threads are assigned to each recursive
3554 call, until a leaf node having only 1 available thread is reached.
3556 Each leaf node then performs two sequential sorts, one on each half of
3557 the lines it is responsible for. It records in its NODE structure that
3558 there are two sorted sublists available to merge from, and inserts its
3559 NODE into the priority queue.
3561 The binary merge phase then begins. Each thread drops into a loop
3562 where the thread retrieves a NODE from the priority queue, merges lines
3563 available to that NODE, and potentially insert NODE or its parent back
3564 into the queue if there are sufficient available lines for them to
3565 merge. This continues until all lines at all nodes of the merge tree
3566 have been merged. */
3569 sortlines (struct line *restrict lines, size_t nthreads,
3570 size_t total_lines, struct merge_node *node,
3571 struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
3573 size_t nlines = node->nlo + node->nhi;
3575 /* Calculate thread arguments. */
3576 size_t lo_threads = nthreads / 2;
3577 size_t hi_threads = nthreads - lo_threads;
3579 struct thread_args args = {lines, lo_threads, total_lines,
3580 node->lo_child, queue, tfp, temp_output};
3582 if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
3583 && pthread_create (&thread, NULL, sortlines_thread, &args) == 0)
3585 sortlines (lines - node->nlo, hi_threads, total_lines,
3586 node->hi_child, queue, tfp, temp_output);
3587 pthread_join (thread, NULL);
3591 /* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
3592 Sort with 1 thread. */
3593 size_t nlo = node->nlo;
3594 size_t nhi = node->nhi;
3595 struct line *temp = lines - total_lines;
3597 sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
3599 sequential_sort (lines, nlo, temp, false);
3601 /* Update merge NODE. No need to lock yet. */
3603 node->hi = lines - nlo;
3604 node->end_lo = lines - nlo;
3605 node->end_hi = lines - nlo - nhi;
3607 queue_insert (queue, node);
3608 merge_loop (queue, total_lines, tfp, temp_output);
3611 pthread_mutex_destroy (&node->lock);
3614 /* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
3615 the same as OUTFILE. If found, replace each with the same
3616 temporary copy that can be merged into OUTFILE without destroying
3617 OUTFILE before it is completely read. This temporary copy does not
3618 count as a merge temp, so don't worry about incrementing NTEMPS in
3619 the caller; final cleanup will remove it, not zaptemp.
3621 This test ensures that an otherwise-erroneous use like
3622 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
3623 It's not clear that POSIX requires this nicety.
3624 Detect common error cases, but don't try to catch obscure cases like
3625 "cat ... FILE ... | sort -m -o FILE"
3626 where traditional "sort" doesn't copy the input and where
3627 people should know that they're getting into trouble anyway.
3628 Catching these obscure cases would slow down performance in
3632 avoid_trashing_input (struct sortfile *files, size_t ntemps,
3633 size_t nfiles, char const *outfile)
3636 bool got_outstat = false;
3637 struct stat outstat;
3638 struct tempnode *tempcopy = NULL;
3640 for (i = ntemps; i < nfiles; i++)
3642 bool is_stdin = STREQ (files[i].name, "-");
3646 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
3653 ? fstat (outfd, &outstat)
3654 : fstat (STDOUT_FILENO, &outstat))
3661 ? fstat (STDIN_FILENO, &instat)
3662 : stat (files[i].name, &instat))
3664 && SAME_INODE (instat, outstat));
3672 tempcopy = create_temp (&tftp);
3673 mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
3676 files[i].name = tempcopy->name;
3677 files[i].temp = tempcopy;
3682 /* Scan the input files to ensure all are accessible.
3683 Otherwise exit with a diagnostic.
3685 Note this will catch common issues with permissions etc.
3686 but will fail to notice issues where you can open() but not read(),
3687 like when a directory is specified on some systems.
3688 Catching these obscure cases could slow down performance in
3692 check_inputs (char *const *files, size_t nfiles)
3695 for (i = 0; i < nfiles; i++)
3697 if (STREQ (files[i], "-"))
3700 if (euidaccess (files[i], R_OK) != 0)
3701 die (_("cannot read"), files[i]);
3705 /* Ensure a specified output file can be created or written to,
3706 and cache the returned descriptor in the global OUTFD variable.
3707 Otherwise exit with a diagnostic. */
3710 check_output (char const *outfile)
3714 outfd = open (outfile, O_WRONLY | O_CREAT | O_BINARY, MODE_RW_UGO);
3716 die (_("open failed"), outfile);
3720 /* Merge the input FILES. NTEMPS is the number of files at the
3721 start of FILES that are temporary; it is zero at the top level.
3722 NFILES is the total number of files. Put the output in
3723 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
3726 merge (struct sortfile *files, size_t ntemps, size_t nfiles,
3727 char const *output_file)
3729 while (nmerge < nfiles)
3731 /* Number of input files processed so far. */
3734 /* Number of output files generated so far. */
3737 /* nfiles % NMERGE; this counts input files that are left over
3738 after all full-sized merges have been done. */
3741 /* Number of easily-available slots at the next loop iteration. */
3744 /* Do as many NMERGE-size merges as possible. In the case that
3745 nmerge is bogus, increment by the maximum number of file
3746 descriptors allowed. */
3747 for (out = in = 0; nmerge <= nfiles - in; out++)
3750 struct tempnode *temp = create_temp (&tfp);
3751 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
3752 nmerge, tfp, temp->name);
3753 ntemps -= MIN (ntemps, num_merged);
3754 files[out].name = temp->name;
3755 files[out].temp = temp;
3759 remainder = nfiles - in;
3760 cheap_slots = nmerge - out % nmerge;
3762 if (cheap_slots < remainder)
3764 /* So many files remain that they can't all be put into the last
3765 NMERGE-sized output window. Do one more merge. Merge as few
3766 files as possible, to avoid needless I/O. */
3767 size_t nshortmerge = remainder - cheap_slots + 1;
3769 struct tempnode *temp = create_temp (&tfp);
3770 size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
3771 nshortmerge, tfp, temp->name);
3772 ntemps -= MIN (ntemps, num_merged);
3773 files[out].name = temp->name;
3774 files[out++].temp = temp;
3778 /* Put the remaining input files into the last NMERGE-sized output
3779 window, so they will be merged in the next pass. */
3780 memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
3785 avoid_trashing_input (files, ntemps, nfiles, output_file);
3787 /* We aren't guaranteed that this final mergefiles will work, therefore we
3788 try to merge into the output, and then merge as much as we can into a
3789 temp file if we can't. Repeat. */
3793 /* Merge directly into the output file if possible. */
3795 size_t nopened = open_input_files (files, nfiles, &fps);
3797 if (nopened == nfiles)
3799 FILE *ofp = stream_open (output_file, "w");
3802 mergefps (files, ntemps, nfiles, ofp, output_file, fps);
3805 if (errno != EMFILE || nopened <= 2)
3806 die (_("open failed"), output_file);
3808 else if (nopened <= 2)
3809 die (_("open failed"), files[nopened].name);
3811 /* We ran out of file descriptors. Close one of the input
3812 files, to gain a file descriptor. Then create a temporary
3813 file with our spare file descriptor. Retry if that failed
3814 (e.g., some other process could open a file between the time
3815 we closed and tried to create). */
3817 struct tempnode *temp;
3821 xfclose (fps[nopened], files[nopened].name);
3822 temp = maybe_create_temp (&tfp, ! (nopened <= 2));
3826 /* Merge into the newly allocated temporary. */
3827 mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
3829 ntemps -= MIN (ntemps, nopened);
3830 files[0].name = temp->name;
3831 files[0].temp = temp;
3833 memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
3835 nfiles -= nopened - 1;
3839 /* Sort NFILES FILES onto OUTPUT_FILE. Use at most NTHREADS threads. */
3842 sort (char *const *files, size_t nfiles, char const *output_file,
3846 IF_LINT (buf.buf = NULL);
3848 bool output_file_created = false;
3854 char const *temp_output;
3855 char const *file = *files;
3856 FILE *fp = xfopen (file, "r");
3859 size_t bytes_per_line;
3865 while (tmp < nthreads)
3870 bytes_per_line = (mult * sizeof (struct line));
3873 bytes_per_line = sizeof (struct line) * 3 / 2;
3876 initbuf (&buf, bytes_per_line,
3877 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
3882 while (fillbuf (&buf, fp, file))
3886 if (buf.eof && nfiles
3887 && (bytes_per_line + 1
3888 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
3890 /* End of file, but there is more input and buffer room.
3891 Concatenate the next input file; this is faster in
3893 buf.left = buf.used;
3897 line = buffer_linelim (&buf);
3898 if (buf.eof && !nfiles && !ntemps && !buf.left)
3901 tfp = xfopen (output_file, "w");
3902 temp_output = output_file;
3903 output_file_created = true;
3908 temp_output = create_temp (&tfp)->name;
3912 struct merge_node_queue queue;
3913 queue_init (&queue, nthreads);
3914 struct merge_node *merge_tree =
3915 merge_tree_init (nthreads, buf.nlines, line);
3916 struct merge_node *root = merge_tree + 1;
3918 sortlines (line, nthreads, buf.nlines, root,
3919 &queue, tfp, temp_output);
3920 queue_destroy (&queue);
3921 pthread_mutex_destroy (&root->lock);
3922 merge_tree_destroy (merge_tree);
3925 write_unique (line - 1, tfp, temp_output);
3927 xfclose (tfp, temp_output);
3929 if (output_file_created)
3938 if (! output_file_created)
3941 struct tempnode *node = temphead;
3942 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
3943 for (i = 0; node; i++)
3945 tempfiles[i].name = node->name;
3946 tempfiles[i].temp = node;
3949 merge (tempfiles, ntemps, ntemps, output_file);
3956 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
3959 insertkey (struct keyfield *key_arg)
3961 struct keyfield **p;
3962 struct keyfield *key = xmemdup (key_arg, sizeof *key);
3964 for (p = &keylist; *p; p = &(*p)->next)
3970 /* Report a bad field specification SPEC, with extra info MSGID. */
3972 static void badfieldspec (char const *, char const *)
3975 badfieldspec (char const *spec, char const *msgid)
3977 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
3978 _(msgid), quote (spec));
3982 /* Report incompatible options. */
3984 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
3986 incompatible_options (char const *opts)
3988 error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), opts);
3992 /* Check compatibility of ordering options. */
3995 check_ordering_compatibility (void)
3997 struct keyfield *key;
3999 for (key = keylist; key; key = key->next)
4000 if (1 < (key->numeric + key->general_numeric + key->human_numeric
4001 + key->month + (key->version | key->random | !!key->ignore)))
4003 /* The following is too big, but guaranteed to be "big enough". */
4004 char opts[sizeof short_options];
4005 /* Clear flags we're not interested in. */
4006 key->skipsblanks = key->skipeblanks = key->reverse = false;
4007 key_to_opts (key, opts);
4008 incompatible_options (opts);
4012 /* Parse the leading integer in STRING and store the resulting value
4013 (which must fit into size_t) into *VAL. Return the address of the
4014 suffix after the integer. If the value is too large, silently
4015 substitute SIZE_MAX. If MSGID is NULL, return NULL after
4016 failure; otherwise, report MSGID and exit on failure. */
4019 parse_field_count (char const *string, size_t *val, char const *msgid)
4024 switch (xstrtoumax (string, &suffix, 10, &n, ""))
4027 case LONGINT_INVALID_SUFFIX_CHAR:
4032 case LONGINT_OVERFLOW:
4033 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
4037 case LONGINT_INVALID:
4039 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
4040 _(msgid), quote (string));
4047 /* Handle interrupts and hangups. */
4050 sighandler (int sig)
4053 signal (sig, SIG_IGN);
4057 signal (sig, SIG_DFL);
4061 /* Set the ordering options for KEY specified in S.
4062 Return the address of the first character in S that
4063 is not a valid ordering option.
4064 BLANKTYPE is the kind of blanks that 'b' should skip. */
4067 set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
4074 if (blanktype == bl_start || blanktype == bl_both)
4075 key->skipsblanks = true;
4076 if (blanktype == bl_end || blanktype == bl_both)
4077 key->skipeblanks = true;
4080 key->ignore = nondictionary;
4083 key->translate = fold_toupper;
4086 key->general_numeric = true;
4089 key->human_numeric = true;
4092 /* Option order should not matter, so don't let -i override
4093 -d. -d implies -i, but -i does not imply -d. */
4095 key->ignore = nonprinting;
4101 key->numeric = true;
4107 key->reverse = true;
4110 key->version = true;
4120 /* Initialize KEY. */
4122 static struct keyfield *
4123 key_init (struct keyfield *key)
4125 memset (key, 0, sizeof *key);
4126 key->eword = SIZE_MAX;
4131 main (int argc, char **argv)
4133 struct keyfield *key;
4134 struct keyfield key_buf;
4135 struct keyfield gkey;
4136 bool gkey_only = false;
4140 bool mergeonly = false;
4141 char *random_source = NULL;
4142 bool need_random = false;
4143 size_t nthreads = 0;
4145 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
4146 bool obsolete_usage = (posix2_version () < 200112);
4148 char *files_from = NULL;
4150 char const *outfile = NULL;
4152 initialize_main (&argc, &argv);
4153 set_program_name (argv[0]);
4154 setlocale (LC_ALL, "");
4155 bindtextdomain (PACKAGE, LOCALEDIR);
4156 textdomain (PACKAGE);
4158 initialize_exit_failure (SORT_FAILURE);
4160 hard_LC_COLLATE = hard_locale (LC_COLLATE);
4161 #if HAVE_NL_LANGINFO
4162 hard_LC_TIME = hard_locale (LC_TIME);
4165 /* Get locale's representation of the decimal point. */
4167 struct lconv const *locale = localeconv ();
4169 /* If the locale doesn't define a decimal point, or if the decimal
4170 point is multibyte, use the C locale's decimal point. FIXME:
4171 add support for multibyte decimal points. */
4172 decimal_point = to_uchar (locale->decimal_point[0]);
4173 if (! decimal_point || locale->decimal_point[1])
4174 decimal_point = '.';
4176 /* FIXME: add support for multibyte thousands separators. */
4177 thousands_sep = to_uchar (*locale->thousands_sep);
4178 if (! thousands_sep || locale->thousands_sep[1])
4182 have_read_stdin = false;
4187 static int const sig[] =
4189 /* The usual suspects. */
4190 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
4207 enum { nsigs = ARRAY_CARDINALITY (sig) };
4210 struct sigaction act;
4212 sigemptyset (&caught_signals);
4213 for (i = 0; i < nsigs; i++)
4215 sigaction (sig[i], NULL, &act);
4216 if (act.sa_handler != SIG_IGN)
4217 sigaddset (&caught_signals, sig[i]);
4220 act.sa_handler = sighandler;
4221 act.sa_mask = caught_signals;
4224 for (i = 0; i < nsigs; i++)
4225 if (sigismember (&caught_signals, sig[i]))
4226 sigaction (sig[i], &act, NULL);
4228 for (i = 0; i < nsigs; i++)
4229 if (signal (sig[i], SIG_IGN) != SIG_IGN)
4231 signal (sig[i], sighandler);
4232 siginterrupt (sig[i], 1);
4236 signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent. */
4238 /* The signal mask is known, so it is safe to invoke exit_cleanup. */
4239 atexit (exit_cleanup);
4242 gkey.sword = SIZE_MAX;
4244 files = xnmalloc (argc, sizeof *files);
4248 /* Parse an operand as a file after "--" was seen; or if
4249 pedantic and a file was seen, unless the POSIX version
4250 predates 1003.1-2001 and -c was not seen and the operand is
4251 "-o FILE" or "-oFILE". */
4255 || (posixly_correct && nfiles != 0
4256 && ! (obsolete_usage
4259 && argv[optind][0] == '-' && argv[optind][1] == 'o'
4260 && (argv[optind][2] || optind + 1 != argc)))
4261 || ((c = getopt_long (argc, argv, short_options,
4267 files[nfiles++] = argv[optind++];
4273 if (optarg[0] == '+')
4275 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
4276 && ISDIGIT (argv[optind][1]));
4277 obsolete_usage |= minus_pos_usage && !posixly_correct;
4280 /* Treat +POS1 [-POS2] as a key if possible; but silently
4281 treat an operand as a file if it is not a valid +POS1. */
4282 key = key_init (&key_buf);
4283 s = parse_field_count (optarg + 1, &key->sword, NULL);
4285 s = parse_field_count (s + 1, &key->schar, NULL);
4286 if (! (key->sword || key->schar))
4287 key->sword = SIZE_MAX;
4288 if (! s || *set_ordering (s, key, bl_start))
4292 if (minus_pos_usage)
4294 char const *optarg1 = argv[optind++];
4295 s = parse_field_count (optarg1 + 1, &key->eword,
4296 N_("invalid number after '-'"));
4297 /* When called with a non-NULL message ID,
4298 parse_field_count cannot return NULL. Tell static
4299 analysis tools that dereferencing S is safe. */
4302 s = parse_field_count (s + 1, &key->echar,
4303 N_("invalid number after '.'"));
4304 if (!key->echar && key->eword)
4306 /* obsolescent syntax +A.x -B.y is equivalent to:
4307 -k A+1.x+1,B.y (when y = 0)
4308 -k A+1.x+1,B+1.y (when y > 0)
4309 So eword is decremented as in the -k case
4310 only when the end field (B) is specified and
4314 if (*set_ordering (s, key, bl_end))
4315 badfieldspec (optarg1,
4316 N_("stray character in field spec"));
4318 key->obsolete_used = true;
4324 files[nfiles++] = optarg;
4328 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
4345 set_ordering (str, &gkey, bl_both);
4351 ? XARGMATCH ("--check", optarg, check_args, check_types)
4356 if (checkonly && checkonly != c)
4357 incompatible_options ("cC");
4361 case COMPRESS_PROGRAM_OPTION:
4362 if (compress_program && !STREQ (compress_program, optarg))
4363 error (SORT_FAILURE, 0, _("multiple compress programs specified"));
4364 compress_program = optarg;
4367 case DEBUG_PROGRAM_OPTION:
4371 case FILES0_FROM_OPTION:
4372 files_from = optarg;
4376 key = key_init (&key_buf);
4379 s = parse_field_count (optarg, &key->sword,
4380 N_("invalid number at field start"));
4383 /* Provoke with 'sort -k0' */
4384 badfieldspec (optarg, N_("field number is zero"));
4388 s = parse_field_count (s + 1, &key->schar,
4389 N_("invalid number after '.'"));
4392 /* Provoke with 'sort -k1.0' */
4393 badfieldspec (optarg, N_("character offset is zero"));
4396 if (! (key->sword || key->schar))
4397 key->sword = SIZE_MAX;
4398 s = set_ordering (s, key, bl_start);
4401 key->eword = SIZE_MAX;
4407 s = parse_field_count (s + 1, &key->eword,
4408 N_("invalid number after ','"));
4411 /* Provoke with 'sort -k1,0' */
4412 badfieldspec (optarg, N_("field number is zero"));
4416 s = parse_field_count (s + 1, &key->echar,
4417 N_("invalid number after '.'"));
4419 s = set_ordering (s, key, bl_end);
4422 badfieldspec (optarg, N_("stray character in field spec"));
4431 specify_nmerge (oi, c, optarg);
4435 if (outfile && !STREQ (outfile, optarg))
4436 error (SORT_FAILURE, 0, _("multiple output files specified"));
4440 case RANDOM_SOURCE_OPTION:
4441 if (random_source && !STREQ (random_source, optarg))
4442 error (SORT_FAILURE, 0, _("multiple random sources specified"));
4443 random_source = optarg;
4451 specify_sort_size (oi, c, optarg);
4456 char newtab = optarg[0];
4458 error (SORT_FAILURE, 0, _("empty tab"));
4461 if (STREQ (optarg, "\\0"))
4465 /* Provoke with 'sort -txx'. Complain about
4466 "multi-character tab" instead of "multibyte tab", so
4467 that the diagnostic's wording does not need to be
4468 changed once multibyte characters are supported. */
4469 error (SORT_FAILURE, 0, _("multi-character tab %s"),
4473 if (tab != TAB_DEFAULT && tab != newtab)
4474 error (SORT_FAILURE, 0, _("incompatible tabs"));
4480 add_temp_dir (optarg);
4483 case PARALLEL_OPTION:
4484 nthreads = specify_nthreads (oi, c, optarg);
4492 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
4493 through Solaris 7. It is also accepted by many non-Solaris
4494 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
4495 -y is marked as obsolete starting with Solaris 8 (1999), but is
4496 still accepted as of Solaris 10 prerelease (2004).
4498 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
4499 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
4500 and which in general ignores the argument after "-y" if it
4501 consists entirely of digits (it can even be empty). */
4502 if (optarg == argv[optind - 1])
4505 for (p = optarg; ISDIGIT (*p); p++)
4507 optind -= (*p != '\0');
4515 case_GETOPT_HELP_CHAR;
4517 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
4520 usage (SORT_FAILURE);
4528 /* When using --files0-from=F, you may not specify any files
4529 on the command-line. */
4532 error (0, 0, _("extra operand %s"), quote (files[0]));
4533 fprintf (stderr, "%s\n",
4534 _("file operands cannot be combined with --files0-from"));
4535 usage (SORT_FAILURE);
4538 if (STREQ (files_from, "-"))
4542 stream = fopen (files_from, "r");
4544 error (SORT_FAILURE, errno, _("cannot open %s for reading"),
4545 quote (files_from));
4548 readtokens0_init (&tok);
4550 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
4551 error (SORT_FAILURE, 0, _("cannot read file names from %s"),
4552 quote (files_from));
4560 for (i = 0; i < nfiles; i++)
4562 if (STREQ (files[i], "-"))
4563 error (SORT_FAILURE, 0, _("when reading file names from stdin, "
4564 "no file name of %s allowed"),
4566 else if (files[i][0] == '\0')
4568 /* Using the standard 'filename:line-number:' prefix here is
4569 not totally appropriate, since NUL is the separator,
4570 not NL, but it might be better than nothing. */
4571 unsigned long int file_number = i + 1;
4572 error (SORT_FAILURE, 0,
4573 _("%s:%lu: invalid zero-length file name"),
4574 quotearg_colon (files_from), file_number);
4579 error (SORT_FAILURE, 0, _("no input from %s"),
4580 quote (files_from));
4583 /* Inheritance of global options to individual keys. */
4584 for (key = keylist; key; key = key->next)
4586 if (default_key_compare (key) && !key->reverse)
4588 key->ignore = gkey.ignore;
4589 key->translate = gkey.translate;
4590 key->skipsblanks = gkey.skipsblanks;
4591 key->skipeblanks = gkey.skipeblanks;
4592 key->month = gkey.month;
4593 key->numeric = gkey.numeric;
4594 key->general_numeric = gkey.general_numeric;
4595 key->human_numeric = gkey.human_numeric;
4596 key->version = gkey.version;
4597 key->random = gkey.random;
4598 key->reverse = gkey.reverse;
4601 need_random |= key->random;
4604 if (!keylist && !default_key_compare (&gkey))
4608 need_random |= gkey.random;
4611 check_ordering_compatibility ();
4615 if (checkonly || outfile)
4617 static char opts[] = "X --debug";
4618 opts[0] = (checkonly ? checkonly : 'o');
4619 incompatible_options (opts);
4622 /* Always output the locale in debug mode, since this
4623 is such a common source of confusion. */
4624 if (hard_LC_COLLATE)
4625 error (0, 0, _("using %s sorting rules"),
4626 quote (setlocale (LC_COLLATE, NULL)));
4628 error (0, 0, _("using simple byte comparison"));
4629 key_warnings (&gkey, gkey_only);
4632 reverse = gkey.reverse;
4635 random_md5_state_init (random_source);
4637 if (temp_dir_count == 0)
4639 char const *tmp_dir = getenv ("TMPDIR");
4640 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
4645 static char *minus = (char *) "-";
4651 /* Need to re-check that we meet the minimum requirement for memory
4652 usage with the final value for NMERGE. */
4654 sort_size = MAX (sort_size, MIN_SORT_SIZE);
4659 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
4660 quote (files[1]), checkonly);
4664 static char opts[] = {0, 'o', 0};
4665 opts[0] = checkonly;
4666 incompatible_options (opts);
4669 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
4670 input is not properly sorted. */
4671 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
4674 /* Check all inputs are accessible, or exit immediately. */
4675 check_inputs (files, nfiles);
4677 /* Check output is writable, or exit immediately. */
4678 check_output (outfile);
4682 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
4685 for (i = 0; i < nfiles; ++i)
4686 sortfiles[i].name = files[i];
4688 merge (sortfiles, 0, nfiles, outfile);
4689 IF_LINT (free (sortfiles));
4695 unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
4696 nthreads = MIN (np, DEFAULT_MAX_THREADS);
4699 /* Avoid integer overflow later. */
4700 size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
4701 nthreads = MIN (nthreads, nthreads_max);
4703 sort (files, nfiles, outfile, nthreads);
4706 if (have_read_stdin && fclose (stdin) == EOF)
4707 die (_("close failed"), "-");
4709 exit (EXIT_SUCCESS);