1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988, 1991-2005 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 Written December 1988 by Mike Haertel.
19 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
20 or (US mail) as Mike Haertel c/o Free Software Foundation.
22 Ørn E. Hansen added NLS support in 1997. */
27 #include <sys/types.h>
31 #include "hard-locale.h"
38 #include "strnumcmp.h"
42 #if HAVE_SYS_RESOURCE_H
43 # include <sys/resource.h>
46 struct rlimit { size_t rlim_cur; };
47 # define getrlimit(Resource, Rlp) (-1)
50 /* The official name of this program (e.g., no `g' prefix). */
51 #define PROGRAM_NAME "sort"
53 #define AUTHORS "Mike Haertel", "Paul Eggert"
55 #if HAVE_LANGINFO_CODESET
56 # include <langinfo.h>
59 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
62 # define SA_NOCLDSTOP 0
63 # define sigprocmask(How, Set, Oset) /* empty */
65 # if ! HAVE_SIGINTERRUPT
66 # define siginterrupt(sig, flag) /* empty */
74 #define UCHAR_LIM (UCHAR_MAX + 1)
76 #ifndef DEFAULT_TMPDIR
77 # define DEFAULT_TMPDIR "/tmp"
80 #include "rand-isaac.c"
85 /* POSIX says to exit with status 1 if invoked with -c and the
86 input is not properly sorted. */
87 SORT_OUT_OF_ORDER = 1,
89 /* POSIX says any other irregular exit must exit with a status
90 code greater than 1. */
94 /* The representation of the decimal point in the current locale. */
95 static int decimal_point;
97 /* Thousands separator; if -1, then there isn't one. */
98 static int thousands_sep;
100 /* Nonzero if the corresponding locales are hard. */
101 static bool hard_LC_COLLATE;
103 static bool hard_LC_TIME;
106 #define NONZERO(x) ((x) != 0)
108 /* The kind of blanks for '-b' to skip in various options. */
109 enum blanktype { bl_start, bl_end, bl_both };
111 /* The character marking end of line. Default to \n. */
112 static char eolchar = '\n';
114 /* Lines are held in core as counted strings. */
117 char *text; /* Text of the line. */
118 size_t length; /* Length including final newline. */
119 char *keybeg; /* Start of first key. */
120 char *keylim; /* Limit of first key. */
126 char *buf; /* Dynamically allocated buffer,
127 partitioned into 3 regions:
130 - an array of lines, in reverse order. */
131 size_t used; /* Number of bytes used for input data. */
132 size_t nlines; /* Number of lines in the line array. */
133 size_t alloc; /* Number of bytes allocated. */
134 size_t left; /* Number of bytes left from previous reads. */
135 size_t line_bytes; /* Number of bytes to reserve for each line. */
136 bool eof; /* An EOF has been read. */
141 size_t sword; /* Zero-origin 'word' to start at. */
142 size_t schar; /* Additional characters to skip. */
143 size_t eword; /* Zero-origin first word after field. */
144 size_t echar; /* Additional characters in field. */
145 bool const *ignore; /* Boolean array of characters to ignore. */
146 char const *translate; /* Translation applied to characters. */
147 bool skipsblanks; /* Skip leading blanks when finding start. */
148 bool skipeblanks; /* Skip leading blanks when finding end. */
149 bool numeric; /* Flag for numeric comparison. Handle
150 strings of digits with optional decimal
151 point, but no exponential notation. */
152 bool random; /* Sort by random hash of key. */
153 bool general_numeric; /* Flag for general, numeric comparison.
154 Handle numbers in exponential notation. */
155 bool month; /* Flag for comparison by month name. */
156 bool reverse; /* Reverse the sense of comparison. */
157 struct keyfield *next; /* Next keyfield to try. */
166 /* The name this program was run with. */
169 /* FIXME: None of these tables work with multibyte character sets.
170 Also, there are many other bugs when handling multibyte characters.
171 One way to fix this is to rewrite `sort' to use wide characters
172 internally, but doing this with good performance is a bit
175 /* Table of blanks. */
176 static bool blanks[UCHAR_LIM];
178 /* Table of non-printing characters. */
179 static bool nonprinting[UCHAR_LIM];
181 /* Table of non-dictionary characters (not letters, digits, or blanks). */
182 static bool nondictionary[UCHAR_LIM];
184 /* Translation table folding lower case to upper. */
185 static char fold_toupper[UCHAR_LIM];
187 #define MONTHS_PER_YEAR 12
189 /* Table mapping month names to integers.
190 Alphabetic order allows binary search. */
191 static struct month monthtab[] =
207 /* During the merge phase, the number of files to merge at once. */
210 /* Minimum size for a merge or check buffer. */
211 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
213 /* Minimum sort size; the code might not work with smaller sizes. */
214 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
216 /* The number of bytes needed for a merge or check buffer, which can
217 function relatively efficiently even if it holds only one line. If
218 a longer line is seen, this value is increased. */
219 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
221 /* The approximate maximum number of bytes of main memory to use, as
222 specified by the user. Zero if the user has not specified a size. */
223 static size_t sort_size;
225 /* The guessed size for non-regular files. */
226 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
228 /* Array of directory names in which any temporary files are to be created. */
229 static char const **temp_dirs;
231 /* Number of temporary directory names used. */
232 static size_t temp_dir_count;
234 /* Number of allocated slots in temp_dirs. */
235 static size_t temp_dir_alloc;
237 /* Flag to reverse the order of all comparisons. */
240 /* Flag for stable sort. This turns off the last ditch bytewise
241 comparison of lines, and instead leaves lines in the same order
242 they were read if all keys compare equal. */
245 /* If TAB has this value, blanks separate fields. */
246 enum { TAB_DEFAULT = CHAR_MAX + 1 };
248 /* Tab character separating fields. If TAB_DEFAULT, then fields are
249 separated by the empty string between a non-blank character and a blank
251 static int tab = TAB_DEFAULT;
253 /* Flag to remove consecutive duplicate lines from the output.
254 Only the last of a sequence of equal lines will be output. */
257 /* Nonzero if any of the input files are the standard input. */
258 static bool have_read_stdin;
260 /* List of key field comparisons to be tried. */
261 static struct keyfield *keylist;
263 static void sortlines_temp (struct line *, size_t, struct line *);
265 /* Report MESSAGE for FILE, then clean up and exit.
266 If FILE is null, it represents standard output. */
268 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
270 die (char const *message, char const *file)
272 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
279 if (status != EXIT_SUCCESS)
280 fprintf (stderr, _("Try `%s --help' for more information.\n"),
285 Usage: %s [OPTION]... [FILE]...\n\
289 Write sorted concatenation of all FILE(s) to standard output.\n\
293 Mandatory arguments to long options are mandatory for short options too.\n\
300 -b, --ignore-leading-blanks ignore leading blanks\n\
301 -d, --dictionary-order consider only blanks and alphanumeric characters\n\
302 -f, --ignore-case fold lower case to upper case characters\n\
305 -g, --general-numeric-sort compare according to general numerical value\n\
306 -i, --ignore-nonprinting consider only printable characters\n\
307 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
308 -n, --numeric-sort compare according to string numerical value\n\
309 -R, --random-sort sort by random hash of keys\n\
310 -r, --reverse reverse the result of comparisons\n\
316 -c, --check check whether input is sorted; do not sort\n\
317 -k, --key=POS1[,POS2] start a key at POS1, end it at POS2 (origin 1)\n\
318 -m, --merge merge already sorted files; do not sort\n\
319 -o, --output=FILE write result to FILE instead of standard output\n\
320 -s, --stable stabilize sort by disabling last-resort comparison\n\
321 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
324 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
325 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
326 multiple options specify multiple directories\n\
327 -u, --unique with -c, check for strict ordering;\n\
328 without -c, output only the first of an equal run\n\
331 -z, --zero-terminated end lines with 0 byte, not newline\n\
333 fputs (HELP_OPTION_DESCRIPTION, stdout);
334 fputs (VERSION_OPTION_DESCRIPTION, stdout);
337 POS is F[.C][OPTS], where F is the field number and C the character position\n\
338 in the field. If neither the -t nor the -b option is in effect, the characters\n\
339 in a field are counted from the beginning of the preceding whitespace. OPTS is\n\
340 one or more single-letter ordering options, which override global ordering\n\
341 options for that key. If no key is given, use the entire line as the key.\n\
343 SIZE may be followed by the following multiplicative suffixes:\n\
346 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
348 With no FILE, or when FILE is -, read standard input.\n\
351 The locale specified by the environment affects sort order.\n\
352 Set LC_ALL=C to get the traditional sort order that uses\n\
353 native byte values.\n\
355 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
361 /* For long options that have no equivalent short option, use a
362 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
365 SEED_OPTION = CHAR_MAX + 1
368 static char const short_options[] = "-bcdfgik:mMno:rRsS:t:T:uy:z";
370 static struct option const long_options[] =
372 {"ignore-leading-blanks", no_argument, NULL, 'b'},
373 {"check", no_argument, NULL, 'c'},
374 {"dictionary-order", no_argument, NULL, 'd'},
375 {"ignore-case", no_argument, NULL, 'f'},
376 {"general-numeric-sort", no_argument, NULL, 'g'},
377 {"ignore-nonprinting", no_argument, NULL, 'i'},
378 {"key", required_argument, NULL, 'k'},
379 {"merge", no_argument, NULL, 'm'},
380 {"month-sort", no_argument, NULL, 'M'},
381 {"numeric-sort", no_argument, NULL, 'n'},
382 {"random-sort", no_argument, NULL, 'R'},
383 {"output", required_argument, NULL, 'o'},
384 {"reverse", no_argument, NULL, 'r'},
385 {"stable", no_argument, NULL, 's'},
386 {"buffer-size", required_argument, NULL, 'S'},
387 {"field-separator", required_argument, NULL, 't'},
388 {"temporary-directory", required_argument, NULL, 'T'},
389 {"unique", no_argument, NULL, 'u'},
390 {"zero-terminated", no_argument, NULL, 'z'},
391 {"seed", required_argument, NULL, SEED_OPTION}, /* This will go away soon. */
392 {GETOPT_HELP_OPTION_DECL},
393 {GETOPT_VERSION_OPTION_DECL},
397 /* The set of signals that are caught. */
398 static sigset_t caught_signals;
400 /* The list of temporary files. */
403 struct tempnode *volatile next;
404 char name[1]; /* Actual size is 1 + file name length. */
406 static struct tempnode *volatile temphead;
407 static struct tempnode *volatile *temptail = &temphead;
409 /* Clean up any remaining temporary files. */
414 struct tempnode const *node;
416 for (node = temphead; node; node = node->next)
420 /* Create a new temporary file, returning its newly allocated name.
421 Store into *PFP a stream open for writing. */
424 create_temp_file (FILE **pfp)
426 static char const slashbase[] = "/sortXXXXXX";
427 static size_t temp_dir_index;
431 char const *temp_dir = temp_dirs[temp_dir_index];
432 size_t len = strlen (temp_dir);
433 struct tempnode *node =
434 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
435 char *file = node->name;
437 memcpy (file, temp_dir, len);
438 memcpy (file + len, slashbase, sizeof slashbase);
440 if (++temp_dir_index == temp_dir_count)
443 /* Create the temporary file in a critical section, to avoid races. */
444 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
449 temptail = &node->next;
452 sigprocmask (SIG_SETMASK, &oldset, NULL);
455 if (fd < 0 || (*pfp = fdopen (fd, "w")) == NULL)
456 die (_("cannot create temporary file"), file);
461 /* Return a stream for FILE, opened with mode HOW. A null FILE means
462 standard output; HOW should be "w". When opening for input, "-"
463 means standard input. To avoid confusion, do not return file
464 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
465 opening an ordinary FILE. */
468 xfopen (const char *file, const char *how)
474 else if (STREQ (file, "-") && *how == 'r')
476 have_read_stdin = true;
481 fp = fopen (file, how);
483 die (_("open failed"), file);
489 /* Close FP, whose name is FILE, and report any errors. */
492 xfclose (FILE *fp, char const *file)
497 /* Allow reading stdin from tty more than once. */
503 /* Don't close stdout just yet. close_stdout does that. */
504 if (fflush (fp) != 0)
505 die (_("fflush failed"), file);
509 if (fclose (fp) != 0)
510 die (_("close failed"), file);
516 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
518 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
519 die (_("write failed"), output_file);
522 /* Append DIR to the array of temporary directory names. */
524 add_temp_dir (char const *dir)
526 if (temp_dir_count == temp_dir_alloc)
527 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
529 temp_dirs[temp_dir_count++] = dir;
532 /* Remove NAME from the list of temporary files. */
535 zaptemp (const char *name)
537 struct tempnode *volatile *pnode;
538 struct tempnode *node;
539 struct tempnode *next;
542 int unlink_errno = 0;
544 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
547 /* Unlink the temporary file in a critical section to avoid races. */
549 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
550 unlink_status = unlink (name);
551 unlink_errno = errno;
553 sigprocmask (SIG_SETMASK, &oldset, NULL);
555 if (unlink_status != 0)
556 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
565 struct_month_cmp (const void *m1, const void *m2)
567 struct month const *month1 = m1;
568 struct month const *month2 = m2;
569 return strcmp (month1->name, month2->name);
574 /* Initialize the character class tables. */
581 for (i = 0; i < UCHAR_LIM; ++i)
583 blanks[i] = !!ISBLANK (i);
584 nonprinting[i] = !ISPRINT (i);
585 nondictionary[i] = !ISALNUM (i) && !ISBLANK (i);
586 fold_toupper[i] = (ISLOWER (i) ? toupper (i) : i);
590 /* If we're not in the "C" locale, read different names for months. */
593 for (i = 0; i < MONTHS_PER_YEAR; i++)
600 s = (char *) nl_langinfo (ABMON_1 + i);
602 monthtab[i].name = name = xmalloc (s_len + 1);
603 monthtab[i].val = i + 1;
605 for (j = 0; j < s_len; j++)
606 name[j] = fold_toupper[to_uchar (s[j])];
609 qsort ((void *) monthtab, MONTHS_PER_YEAR,
610 sizeof *monthtab, struct_month_cmp);
615 /* Specify the amount of main memory to use when sorting. */
617 specify_sort_size (char const *s)
621 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
623 /* The default unit is KiB. */
624 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
626 if (n <= UINTMAX_MAX / 1024)
629 e = LONGINT_OVERFLOW;
632 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
633 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
642 double mem = physmem_total () * n / 100;
644 /* Use "<", not "<=", to avoid problems with rounding. */
645 if (mem < UINTMAX_MAX)
651 e = LONGINT_OVERFLOW;
658 /* If multiple sort sizes are specified, take the maximum, so
659 that option order does not matter. */
666 sort_size = MAX (sort_size, MIN_SORT_SIZE);
670 e = LONGINT_OVERFLOW;
673 STRTOL_FATAL_ERROR (s, _("sort size"), e);
676 /* Return the default sort size. */
678 default_sort_size (void)
680 /* Let MEM be available memory or 1/8 of total memory, whichever
682 double avail = physmem_available ();
683 double total = physmem_total ();
684 double mem = MAX (avail, total / 8);
685 struct rlimit rlimit;
687 /* Let SIZE be MEM, but no more than the maximum object size or
688 system resource limits. Avoid the MIN macro here, as it is not
689 quite right when only one argument is floating point. Don't
690 bother to check for values like RLIM_INFINITY since in practice
691 they are not much less than SIZE_MAX. */
692 size_t size = SIZE_MAX;
695 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
696 size = rlimit.rlim_cur;
698 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
699 size = rlimit.rlim_cur;
702 /* Leave a large safety margin for the above limits, as failure can
703 occur when they are exceeded. */
707 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
708 Exceeding RSS is not fatal, but can be quite slow. */
709 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
710 size = rlimit.rlim_cur / 16 * 15;
713 /* Use no less than the minimum. */
714 return MAX (size, MIN_SORT_SIZE);
717 /* Return the sort buffer size to use with the input files identified
718 by FPS and FILES, which are alternate names of the same files.
719 NFILES gives the number of input files; NFPS may be less. Assume
720 that each input line requires LINE_BYTES extra bytes' worth of line
721 information. Do not exceed the size bound specified by the user
722 (or a default size bound, if the user does not specify one). */
725 sort_buffer_size (FILE *const *fps, size_t nfps,
726 char *const *files, size_t nfiles,
729 /* A bound on the input size. If zero, the bound hasn't been
731 static size_t size_bound;
733 /* In the worst case, each input byte is a newline. */
734 size_t worst_case_per_input_byte = line_bytes + 1;
736 /* Keep enough room for one extra input line and an extra byte.
737 This extra room might be needed when preparing to read EOF. */
738 size_t size = worst_case_per_input_byte + 1;
742 for (i = 0; i < nfiles; i++)
748 if ((i < nfps ? fstat (fileno (fps[i]), &st)
749 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
750 : stat (files[i], &st))
752 die (_("stat failed"), files[i]);
754 if (S_ISREG (st.st_mode))
755 file_size = st.st_size;
758 /* The file has unknown size. If the user specified a sort
759 buffer size, use that; otherwise, guess the size. */
762 file_size = INPUT_FILE_SIZE_GUESS;
767 size_bound = sort_size;
769 size_bound = default_sort_size ();
772 /* Add the amount of memory needed to represent the worst case
773 where the input consists entirely of newlines followed by a
774 single non-newline. Check for overflow. */
775 worst_case = file_size * worst_case_per_input_byte + 1;
776 if (file_size != worst_case / worst_case_per_input_byte
777 || size_bound - size <= worst_case)
785 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
786 must be at least sizeof (struct line). Allocate ALLOC bytes
790 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
792 /* Ensure that the line array is properly aligned. If the desired
793 size cannot be allocated, repeatedly halve it until allocation
794 succeeds. The smaller allocation may hurt overall performance,
795 but that's better than failing. */
798 alloc += sizeof (struct line) - alloc % sizeof (struct line);
799 buf->buf = malloc (alloc);
803 if (alloc <= line_bytes + 1)
807 buf->line_bytes = line_bytes;
809 buf->used = buf->left = buf->nlines = 0;
813 /* Return one past the limit of the line array. */
815 static inline struct line *
816 buffer_linelim (struct buffer const *buf)
818 return (struct line *) (buf->buf + buf->alloc);
821 /* Return a pointer to the first character of the field specified
825 begfield (const struct line *line, const struct keyfield *key)
827 char *ptr = line->text, *lim = ptr + line->length - 1;
828 size_t sword = key->sword;
829 size_t schar = key->schar;
830 size_t remaining_bytes;
832 /* The leading field separator itself is included in a field when -t
835 if (tab != TAB_DEFAULT)
836 while (ptr < lim && sword--)
838 while (ptr < lim && *ptr != tab)
844 while (ptr < lim && sword--)
846 while (ptr < lim && blanks[to_uchar (*ptr)])
848 while (ptr < lim && !blanks[to_uchar (*ptr)])
852 if (key->skipsblanks)
853 while (ptr < lim && blanks[to_uchar (*ptr)])
856 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
857 remaining_bytes = lim - ptr;
858 if (schar < remaining_bytes)
866 /* Return the limit of (a pointer to the first character after) the field
867 in LINE specified by KEY. */
870 limfield (const struct line *line, const struct keyfield *key)
872 char *ptr = line->text, *lim = ptr + line->length - 1;
873 size_t eword = key->eword, echar = key->echar;
874 size_t remaining_bytes;
876 /* Move PTR past EWORD fields or to one past the last byte on LINE,
877 whichever comes first. If there are more than EWORD fields, leave
878 PTR pointing at the beginning of the field having zero-based index,
879 EWORD. If a delimiter character was specified (via -t), then that
880 `beginning' is the first character following the delimiting TAB.
881 Otherwise, leave PTR pointing at the first `blank' character after
882 the preceding field. */
883 if (tab != TAB_DEFAULT)
884 while (ptr < lim && eword--)
886 while (ptr < lim && *ptr != tab)
888 if (ptr < lim && (eword | echar))
892 while (ptr < lim && eword--)
894 while (ptr < lim && blanks[to_uchar (*ptr)])
896 while (ptr < lim && !blanks[to_uchar (*ptr)])
900 #ifdef POSIX_UNSPECIFIED
901 /* The following block of code makes GNU sort incompatible with
902 standard Unix sort, so it's ifdef'd out for now.
903 The POSIX spec isn't clear on how to interpret this.
904 FIXME: request clarification.
906 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
907 Date: Thu, 30 May 96 12:20:41 -0400
908 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
910 [...]I believe I've found another bug in `sort'.
915 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
918 $ /bin/sort -k1.7,1.7 </tmp/sort.in
922 Unix sort produced the answer I expected: sort on the single character
923 in column 7. GNU sort produced different results, because it disagrees
924 on the interpretation of the key-end spec "M.N". Unix sort reads this
925 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
926 "skip M-1 fields, then either N-1 characters or the rest of the current
927 field, whichever comes first". This extra clause applies only to
928 key-ends, not key-starts.
931 /* Make LIM point to the end of (one byte past) the current field. */
932 if (tab != TAB_DEFAULT)
935 newlim = memchr (ptr, tab, lim - ptr);
943 while (newlim < lim && blanks[to_uchar (*newlim)])
945 while (newlim < lim && !blanks[to_uchar (*newlim)])
951 /* If we're ignoring leading blanks when computing the End
952 of the field, don't start counting bytes until after skipping
953 past any leading blanks. */
954 if (key->skipeblanks)
955 while (ptr < lim && blanks[to_uchar (*ptr)])
958 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
959 remaining_bytes = lim - ptr;
960 if (echar < remaining_bytes)
968 /* Fill BUF reading from FP, moving buf->left bytes from the end
969 of buf->buf to the beginning first. If EOF is reached and the
970 file wasn't terminated by a newline, supply one. Set up BUF's line
971 table too. FILE is the name of the file corresponding to FP.
972 Return true if some input was read. */
975 fillbuf (struct buffer *buf, FILE *fp, char const *file)
977 struct keyfield const *key = keylist;
979 size_t line_bytes = buf->line_bytes;
980 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
985 if (buf->used != buf->left)
987 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
988 buf->used = buf->left;
994 char *ptr = buf->buf + buf->used;
995 struct line *linelim = buffer_linelim (buf);
996 struct line *line = linelim - buf->nlines;
997 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
998 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1000 while (line_bytes + 1 < avail)
1002 /* Read as many bytes as possible, but do not read so many
1003 bytes that there might not be enough room for the
1004 corresponding line array. The worst case is when the
1005 rest of the input file consists entirely of newlines,
1006 except that the last byte is not a newline. */
1007 size_t readsize = (avail - 1) / (line_bytes + 1);
1008 size_t bytes_read = fread (ptr, 1, readsize, fp);
1009 char *ptrlim = ptr + bytes_read;
1011 avail -= bytes_read;
1013 if (bytes_read != readsize)
1016 die (_("read failed"), file);
1020 if (buf->buf == ptrlim)
1022 if (ptrlim[-1] != eol)
1027 /* Find and record each line in the just-read input. */
1028 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1032 line->text = line_start;
1033 line->length = ptr - line_start;
1034 mergesize = MAX (mergesize, line->length);
1035 avail -= line_bytes;
1039 /* Precompute the position of the first key for
1041 line->keylim = (key->eword == SIZE_MAX
1043 : limfield (line, key));
1045 if (key->sword != SIZE_MAX)
1046 line->keybeg = begfield (line, key);
1049 if (key->skipsblanks)
1050 while (blanks[to_uchar (*line_start)])
1052 line->keybeg = line_start;
1064 buf->used = ptr - buf->buf;
1065 buf->nlines = buffer_linelim (buf) - line;
1066 if (buf->nlines != 0)
1068 buf->left = ptr - line_start;
1069 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1073 /* The current input line is too long to fit in the buffer.
1074 Double the buffer size and try again. */
1075 buf->buf = X2REALLOC (buf->buf, &buf->alloc);
1079 /* Compare strings A and B as numbers without explicitly converting them to
1080 machine numbers. Comparatively slow for short strings, but asymptotically
1084 numcompare (const char *a, const char *b)
1086 while (blanks[to_uchar (*a)])
1088 while (blanks[to_uchar (*b)])
1091 return strnumcmp (a, b, decimal_point, thousands_sep);
1095 general_numcompare (const char *sa, const char *sb)
1097 /* FIXME: add option to warn about failed conversions. */
1098 /* FIXME: maybe add option to try expensive FP conversion
1099 only if A and B can't be compared more cheaply/accurately. */
1103 double a = strtod (sa, &ea);
1104 double b = strtod (sb, &eb);
1106 /* Put conversion errors at the start of the collating sequence. */
1108 return sb == eb ? 0 : -1;
1112 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1113 conversion errors but before numbers; sort them by internal
1114 bit-pattern, for lack of a more portable alternative. */
1120 : memcmp ((char *) &a, (char *) &b, sizeof a));
1123 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1124 Return 0 if the name in S is not recognized. */
1127 getmonth (char const *month, size_t len)
1130 size_t hi = MONTHS_PER_YEAR;
1131 char const *monthlim = month + len;
1135 if (month == monthlim)
1137 if (!blanks[to_uchar (*month)])
1144 size_t ix = (lo + hi) / 2;
1145 char const *m = month;
1146 char const *n = monthtab[ix].name;
1151 return monthtab[ix].val;
1152 if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1157 else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1169 /* The ISAAC state resulting from the user-specified seed, or from
1170 random data taken from the environment. */
1171 static struct isaac_state rand_state;
1173 /* Set *RESULT to the ISAAC state resulting from applying TEXT (with
1174 length LEN) to rand_state. */
1176 get_hash (char const *text, size_t len, uint32_t resbuf[ISAAC_WORDS])
1178 struct isaac_state s = rand_state;
1179 isaac_seed_data (&s, text, len);
1180 isaac_seed_finish (&s);
1181 isaac_refill (&s, resbuf);
1184 /* Compare two lines A and B trying every key in sequence until there
1185 are no more keys or a difference is found. */
1188 keycompare (const struct line *a, const struct line *b)
1190 struct keyfield const *key = keylist;
1192 /* For the first iteration only, the key positions have been
1193 precomputed for us. */
1194 char *texta = a->keybeg;
1195 char *textb = b->keybeg;
1196 char *lima = a->keylim;
1197 char *limb = b->keylim;
1203 char const *translate = key->translate;
1204 bool const *ignore = key->ignore;
1206 /* Find the lengths. */
1207 size_t lena = lima <= texta ? 0 : lima - texta;
1208 size_t lenb = limb <= textb ? 0 : limb - textb;
1210 /* Actually compare the fields. */
1215 uint32_t diga[ISAAC_WORDS];
1216 uint32_t digb[ISAAC_WORDS];
1217 get_hash (texta, lena, diga);
1218 get_hash (textb, lenb, digb);
1220 /* Go backwards, since the last words are marginally better
1222 for (i = ISAAC_WORDS - 1; 0 <= i; i--)
1223 if (diga[i] != digb[i])
1225 diff = (diga[i] < digb[i] ? -1 : 1);
1229 /* FIXME: Should break ties by rehashing with a different
1230 random hashing function (and repeat until the tie is
1231 broken) unless --seed was specified. The probability of
1232 this being needed should be infinitesimal. */
1235 if (key->numeric | key->general_numeric)
1237 char savea = *lima, saveb = *limb;
1239 *lima = *limb = '\0';
1240 diff = ((key->numeric ? numcompare : general_numcompare)
1242 *lima = savea, *limb = saveb;
1244 else if (key->month)
1245 diff = getmonth (texta, lena) - getmonth (textb, lenb);
1246 /* Sorting like this may become slow, so in a simple locale the user
1247 can select a faster sort that is similar to ascii sort. */
1248 else if (hard_LC_COLLATE)
1250 if (ignore || translate)
1253 size_t size = lena + 1 + lenb + 1;
1254 char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1255 char *copy_b = copy_a + lena + 1;
1256 size_t new_len_a, new_len_b, i;
1258 /* Ignore and/or translate chars before comparing. */
1259 for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1263 copy_a[new_len_a] = (translate
1264 ? translate[to_uchar (texta[i])]
1266 if (!ignore || !ignore[to_uchar (texta[i])])
1271 copy_b[new_len_b] = (translate
1272 ? translate[to_uchar (textb[i])]
1274 if (!ignore || !ignore[to_uchar (textb[i])])
1279 diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1281 if (sizeof buf < size)
1285 diff = - NONZERO (lenb);
1289 diff = xmemcoll (texta, lena, textb, lenb);
1293 #define CMP_WITH_IGNORE(A, B) \
1298 while (texta < lima && ignore[to_uchar (*texta)]) \
1300 while (textb < limb && ignore[to_uchar (*textb)]) \
1302 if (! (texta < lima && textb < limb)) \
1304 diff = to_uchar (A) - to_uchar (B); \
1311 diff = (texta < lima) - (textb < limb); \
1316 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1317 translate[to_uchar (*textb)]);
1319 CMP_WITH_IGNORE (*texta, *textb);
1322 diff = - NONZERO (lenb);
1329 while (texta < lima && textb < limb)
1331 diff = (to_uchar (translate[to_uchar (*texta++)])
1332 - to_uchar (translate[to_uchar (*textb++)]));
1339 diff = memcmp (texta, textb, MIN (lena, lenb));
1343 diff = lena < lenb ? -1 : lena != lenb;
1353 /* Find the beginning and limit of the next field. */
1354 if (key->eword != SIZE_MAX)
1355 lima = limfield (a, key), limb = limfield (b, key);
1357 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1359 if (key->sword != SIZE_MAX)
1360 texta = begfield (a, key), textb = begfield (b, key);
1363 texta = a->text, textb = b->text;
1364 if (key->skipsblanks)
1366 while (texta < lima && blanks[to_uchar (*texta)])
1368 while (textb < limb && blanks[to_uchar (*textb)])
1379 return key->reverse ? -diff : diff;
1382 /* Compare two lines A and B, returning negative, zero, or positive
1383 depending on whether A compares less than, equal to, or greater than B. */
1386 compare (const struct line *a, const struct line *b)
1391 /* First try to compare on the specified keys (if any).
1392 The only two cases with no key at all are unadorned sort,
1393 and unadorned sort -r. */
1396 diff = keycompare (a, b);
1397 if (diff | unique | stable)
1401 /* If the keys all compare equal (or no keys were specified)
1402 fall through to the default comparison. */
1403 alen = a->length - 1, blen = b->length - 1;
1406 diff = - NONZERO (blen);
1409 else if (hard_LC_COLLATE)
1410 diff = xmemcoll (a->text, alen, b->text, blen);
1411 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1412 diff = alen < blen ? -1 : alen != blen;
1414 return reverse ? -diff : diff;
1417 /* Check that the lines read from FILE_NAME come in order. Print a
1418 diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1419 false if they are not in order. Otherwise, print no diagnostic
1423 check (char const *file_name)
1425 FILE *fp = xfopen (file_name, "r");
1426 struct buffer buf; /* Input buffer. */
1427 struct line temp; /* Copy of previous line. */
1429 uintmax_t line_number = 0;
1430 struct keyfield const *key = keylist;
1431 bool nonunique = ! unique;
1432 bool ordered = true;
1434 initbuf (&buf, sizeof (struct line),
1435 MAX (merge_buffer_size, sort_size));
1438 while (fillbuf (&buf, fp, file_name))
1440 struct line const *line = buffer_linelim (&buf);
1441 struct line const *linebase = line - buf.nlines;
1443 /* Make sure the line saved from the old buffer contents is
1444 less than or equal to the first line of the new buffer. */
1445 if (alloc && nonunique <= compare (&temp, line - 1))
1449 struct line const *disorder_line = line - 1;
1450 uintmax_t disorder_line_number =
1451 buffer_linelim (&buf) - disorder_line + line_number;
1452 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1453 fprintf (stderr, _("%s: %s:%s: disorder: "),
1454 program_name, file_name,
1455 umaxtostr (disorder_line_number, hr_buf));
1456 write_bytes (disorder_line->text, disorder_line->length, stderr,
1457 _("standard error"));
1463 /* Compare each line in the buffer with its successor. */
1464 while (linebase < --line)
1465 if (nonunique <= compare (line, line - 1))
1466 goto found_disorder;
1468 line_number += buf.nlines;
1470 /* Save the last line of the buffer. */
1471 if (alloc < line->length)
1478 alloc = line->length;
1482 while (alloc < line->length);
1484 temp.text = xrealloc (temp.text, alloc);
1486 memcpy (temp.text, line->text, line->length);
1487 temp.length = line->length;
1490 temp.keybeg = temp.text + (line->keybeg - line->text);
1491 temp.keylim = temp.text + (line->keylim - line->text);
1495 xfclose (fp, file_name);
1501 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
1502 files (all of which are at the start of the FILES array), and
1503 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
1504 Close input and output files before returning.
1505 OUTPUT_FILE gives the name of the output file. If it is NULL,
1506 the output file is standard output. If OFP is NULL, the output
1507 file has not been opened yet (or written to, if standard output). */
1510 mergefps (char **files, size_t ntemps, size_t nfiles,
1511 FILE *ofp, char const *output_file)
1513 FILE *fps[NMERGE]; /* Input streams for each file. */
1514 struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1515 struct line saved; /* Saved line storage for unique check. */
1516 struct line const *savedline = NULL;
1517 /* &saved if there is a saved line. */
1518 size_t savealloc = 0; /* Size allocated for the saved line. */
1519 struct line const *cur[NMERGE]; /* Current line in each line table. */
1520 struct line const *base[NMERGE]; /* Base of each line table. */
1521 size_t ord[NMERGE]; /* Table representing a permutation of fps,
1522 such that cur[ord[0]] is the smallest line
1523 and will be next output. */
1527 struct keyfield const *key = keylist;
1530 /* Read initial lines from each input file. */
1531 for (i = 0; i < nfiles; )
1533 fps[i] = xfopen (files[i], "r");
1534 initbuf (&buffer[i], sizeof (struct line),
1535 MAX (merge_buffer_size, sort_size / nfiles));
1536 if (fillbuf (&buffer[i], fps[i], files[i]))
1538 struct line const *linelim = buffer_linelim (&buffer[i]);
1539 cur[i] = linelim - 1;
1540 base[i] = linelim - buffer[i].nlines;
1545 /* fps[i] is empty; eliminate it from future consideration. */
1546 xfclose (fps[i], files[i]);
1552 free (buffer[i].buf);
1554 for (j = i; j < nfiles; ++j)
1555 files[j] = files[j + 1];
1560 ofp = xfopen (output_file, "w");
1562 /* Set up the ord table according to comparisons among input lines.
1563 Since this only reorders two items if one is strictly greater than
1564 the other, it is stable. */
1565 for (i = 0; i < nfiles; ++i)
1567 for (i = 1; i < nfiles; ++i)
1568 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
1569 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1571 /* Repeatedly output the smallest line until no input remains. */
1574 struct line const *smallest = cur[ord[0]];
1576 /* If uniquified output is turned on, output only the first of
1577 an identical series of lines. */
1580 if (savedline && compare (savedline, smallest))
1583 write_bytes (saved.text, saved.length, ofp, output_file);
1588 if (savealloc < smallest->length)
1593 savealloc = smallest->length;
1596 while ((savealloc *= 2) < smallest->length);
1598 saved.text = xrealloc (saved.text, savealloc);
1600 saved.length = smallest->length;
1601 memcpy (saved.text, smallest->text, saved.length);
1605 saved.text + (smallest->keybeg - smallest->text);
1607 saved.text + (smallest->keylim - smallest->text);
1612 write_bytes (smallest->text, smallest->length, ofp, output_file);
1614 /* Check if we need to read more lines into core. */
1615 if (base[ord[0]] < smallest)
1616 cur[ord[0]] = smallest - 1;
1619 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]]))
1621 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
1622 cur[ord[0]] = linelim - 1;
1623 base[ord[0]] = linelim - buffer[ord[0]].nlines;
1627 /* We reached EOF on fps[ord[0]]. */
1628 for (i = 1; i < nfiles; ++i)
1629 if (ord[i] > ord[0])
1632 xfclose (fps[ord[0]], files[ord[0]]);
1633 if (ord[0] < ntemps)
1636 zaptemp (files[ord[0]]);
1638 free (buffer[ord[0]].buf);
1639 for (i = ord[0]; i < nfiles; ++i)
1641 fps[i] = fps[i + 1];
1642 files[i] = files[i + 1];
1643 buffer[i] = buffer[i + 1];
1644 cur[i] = cur[i + 1];
1645 base[i] = base[i + 1];
1647 for (i = 0; i < nfiles; ++i)
1648 ord[i] = ord[i + 1];
1653 /* The new line just read in may be larger than other lines
1654 already in main memory; push it back in the queue until we
1655 encounter a line larger than it. Optimize for the common
1656 case where the new line is smallest. */
1661 size_t ord0 = ord[0];
1662 size_t count_of_smaller_lines;
1666 int cmp = compare (cur[ord0], cur[ord[probe]]);
1667 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
1671 probe = (lo + hi) / 2;
1674 count_of_smaller_lines = lo - 1;
1675 for (j = 0; j < count_of_smaller_lines; j++)
1676 ord[j] = ord[j + 1];
1677 ord[count_of_smaller_lines] = ord0;
1681 if (unique && savedline)
1683 write_bytes (saved.text, saved.length, ofp, output_file);
1687 xfclose (ofp, output_file);
1690 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1691 and HI (with NHI members). T, LO, and HI point just past their
1692 respective arrays, and the arrays are in reverse order. NLO and
1693 NHI must be positive, and HI - NHI must equal T - (NLO + NHI). */
1696 mergelines (struct line *t,
1697 struct line const *lo, size_t nlo,
1698 struct line const *hi, size_t nhi)
1701 if (compare (lo - 1, hi - 1) <= 0)
1706 /* HI - NHI equalled T - (NLO + NHI) when this function
1707 began. Therefore HI must equal T now, and there is no
1708 need to copy from HI to T. */
1726 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1727 NLINES must be at least 2.
1728 The input and output arrays are in reverse order, and LINES and
1729 TEMP point just past the end of their respective arrays.
1731 Use a recursive divide-and-conquer algorithm, in the style
1732 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
1733 the optimization suggested by exercise 5.2.4-10; this requires room
1734 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
1735 writes that this memory optimization was originally published by
1736 D. A. Bell, Comp J. 1 (1958), 75. */
1739 sortlines (struct line *lines, size_t nlines, struct line *temp)
1743 if (0 < compare (&lines[-1], &lines[-2]))
1745 struct line tmp = lines[-1];
1746 lines[-1] = lines[-2];
1752 size_t nlo = nlines / 2;
1753 size_t nhi = nlines - nlo;
1754 struct line *lo = lines;
1755 struct line *hi = lines - nlo;
1756 struct line *sorted_lo = temp;
1758 sortlines (hi, nhi, temp);
1760 sortlines_temp (lo, nlo, sorted_lo);
1762 sorted_lo[-1] = lo[-1];
1764 mergelines (lines, sorted_lo, nlo, hi, nhi);
1768 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1769 rather than sorting in place. */
1772 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
1776 /* Declare `swap' as int, not bool, to work around a bug
1777 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
1778 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
1779 int swap = (0 < compare (&lines[-1], &lines[-2]));
1780 temp[-1] = lines[-1 - swap];
1781 temp[-2] = lines[-2 + swap];
1785 size_t nlo = nlines / 2;
1786 size_t nhi = nlines - nlo;
1787 struct line *lo = lines;
1788 struct line *hi = lines - nlo;
1789 struct line *sorted_hi = temp - nlo;
1791 sortlines_temp (hi, nhi, sorted_hi);
1793 sortlines (lo, nlo, temp);
1795 mergelines (temp, lo, nlo, sorted_hi, nhi);
1799 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
1800 the same as OUTFILE. If found, merge the found instances (and perhaps
1801 some other files) into a temporary file so that it can in turn be
1802 merged into OUTFILE without destroying OUTFILE before it is completely
1803 read. Return the new value of NFILES, which differs from the old if
1804 some merging occurred.
1806 This test ensures that an otherwise-erroneous use like
1807 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
1808 It's not clear that POSIX requires this nicety.
1809 Detect common error cases, but don't try to catch obscure cases like
1810 "cat ... FILE ... | sort -m -o FILE"
1811 where traditional "sort" doesn't copy the input and where
1812 people should know that they're getting into trouble anyway.
1813 Catching these obscure cases would slow down performance in
1817 avoid_trashing_input (char **files, size_t ntemps, size_t nfiles,
1818 char const *outfile)
1821 bool got_outstat = false;
1822 struct stat outstat;
1824 for (i = ntemps; i < nfiles; i++)
1826 bool is_stdin = STREQ (files[i], "-");
1830 if (outfile && STREQ (outfile, files[i]) && !is_stdin)
1837 ? stat (outfile, &outstat)
1838 : fstat (STDOUT_FILENO, &outstat))
1845 ? fstat (STDIN_FILENO, &instat)
1846 : stat (files[i], &instat))
1848 && SAME_INODE (instat, outstat));
1854 char *temp = create_temp_file (&tftp);
1855 mergefps (&files[i], 0, nfiles - i, tftp, temp);
1864 /* Merge the input FILES. NTEMPS is the number of files at the
1865 start of FILES that are temporary; it is zero at the top level.
1866 NFILES is the total number of files. Put the output in
1867 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
1870 merge (char **files, size_t ntemps, size_t nfiles, char const *output_file)
1872 while (NMERGE < nfiles)
1874 /* Number of input files processed so far. */
1877 /* Number of output files generated so far. */
1880 /* nfiles % NMERGE; this counts input files that are left over
1881 after all full-sized merges have been done. */
1884 /* Number of easily-available slots at the next loop iteration. */
1887 /* Do as many NMERGE-size merges as possible. */
1888 for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
1891 char *temp = create_temp_file (&tfp);
1892 size_t nt = MIN (ntemps, NMERGE);
1894 mergefps (&files[in], nt, NMERGE, tfp, temp);
1898 remainder = nfiles - in;
1899 cheap_slots = NMERGE - out % NMERGE;
1901 if (cheap_slots < remainder)
1903 /* So many files remain that they can't all be put into the last
1904 NMERGE-sized output window. Do one more merge. Merge as few
1905 files as possible, to avoid needless I/O. */
1906 size_t nshortmerge = remainder - cheap_slots + 1;
1908 char *temp = create_temp_file (&tfp);
1909 size_t nt = MIN (ntemps, nshortmerge);
1911 mergefps (&files[in], nt, nshortmerge, tfp, temp);
1912 files[out++] = temp;
1916 /* Put the remaining input files into the last NMERGE-sized output
1917 window, so they will be merged in the next pass. */
1918 memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
1923 nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
1924 mergefps (files, ntemps, nfiles, NULL, output_file);
1927 /* Sort NFILES FILES onto OUTPUT_FILE. */
1930 sort (char * const *files, size_t nfiles, char const *output_file)
1934 bool output_file_created = false;
1940 char const *temp_output;
1941 char const *file = *files;
1942 FILE *fp = xfopen (file, "r");
1944 size_t bytes_per_line = (2 * sizeof (struct line)
1945 - sizeof (struct line) / 2);
1948 initbuf (&buf, bytes_per_line,
1949 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
1954 while (fillbuf (&buf, fp, file))
1957 struct line *linebase;
1959 if (buf.eof && nfiles
1960 && (bytes_per_line + 1
1961 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
1963 /* End of file, but there is more input and buffer room.
1964 Concatenate the next input file; this is faster in
1966 buf.left = buf.used;
1970 line = buffer_linelim (&buf);
1971 linebase = line - buf.nlines;
1973 sortlines (line, buf.nlines, linebase);
1974 if (buf.eof && !nfiles && !ntemps && !buf.left)
1977 tfp = xfopen (output_file, "w");
1978 temp_output = output_file;
1979 output_file_created = true;
1984 temp_output = create_temp_file (&tfp);
1990 write_bytes (line->text, line->length, tfp, temp_output);
1992 while (linebase < line && compare (line, line - 1) == 0)
1995 while (linebase < line);
1997 xfclose (tfp, temp_output);
1999 if (output_file_created)
2008 if (! output_file_created)
2011 struct tempnode *node = temphead;
2012 char **tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2013 for (i = 0; node; i++)
2015 tempfiles[i] = node->name;
2018 merge (tempfiles, ntemps, ntemps, output_file);
2023 /* Insert key KEY at the end of the key list. */
2026 insertkey (struct keyfield *key)
2028 struct keyfield **p;
2030 for (p = &keylist; *p; p = &(*p)->next)
2036 /* Report a bad field specification SPEC, with extra info MSGID. */
2038 static void badfieldspec (char const *, char const *)
2041 badfieldspec (char const *spec, char const *msgid)
2043 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
2044 _(msgid), quote (spec));
2048 /* Report incompatible options. */
2050 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
2052 incompatible_options (char const *opts)
2054 error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
2058 /* Check compatibility of ordering options. */
2061 check_ordering_compatibility (void)
2063 struct keyfield const *key;
2065 for (key = keylist; key; key = key->next)
2066 if ((1 < (key->random + key->numeric + key->general_numeric + key->month
2068 || (key->random && key->translate))
2072 if (key->ignore == nondictionary)
2076 if (key->general_numeric)
2078 if (key->ignore == nonprinting)
2087 incompatible_options (opts);
2091 /* Parse the leading integer in STRING and store the resulting value
2092 (which must fit into size_t) into *VAL. Return the address of the
2093 suffix after the integer. If MSGID is NULL, return NULL after
2094 failure; otherwise, report MSGID and exit on failure. */
2097 parse_field_count (char const *string, size_t *val, char const *msgid)
2102 switch (xstrtoumax (string, &suffix, 10, &n, ""))
2105 case LONGINT_INVALID_SUFFIX_CHAR:
2110 case LONGINT_OVERFLOW:
2111 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2113 error (SORT_FAILURE, 0, _("%s: count `%.*s' too large"),
2114 _(msgid), (int) (suffix - string), string);
2117 case LONGINT_INVALID:
2119 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
2120 _(msgid), quote (string));
2127 /* Handle interrupts and hangups. */
2130 sighandler (int sig)
2133 signal (sig, SIG_IGN);
2137 signal (sig, SIG_DFL);
2141 /* Set the ordering options for KEY specified in S.
2142 Return the address of the first character in S that
2143 is not a valid ordering option.
2144 BLANKTYPE is the kind of blanks that 'b' should skip. */
2147 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2154 if (blanktype == bl_start || blanktype == bl_both)
2155 key->skipsblanks = true;
2156 if (blanktype == bl_end || blanktype == bl_both)
2157 key->skipeblanks = true;
2160 key->ignore = nondictionary;
2163 key->translate = fold_toupper;
2166 key->general_numeric = true;
2169 /* Option order should not matter, so don't let -i override
2170 -d. -d implies -i, but -i does not imply -d. */
2172 key->ignore = nonprinting;
2178 key->numeric = true;
2184 key->reverse = true;
2194 static struct keyfield *
2197 struct keyfield *key = xzalloc (sizeof *key);
2198 key->eword = SIZE_MAX;
2203 main (int argc, char **argv)
2205 struct keyfield *key;
2206 struct keyfield gkey;
2209 bool checkonly = false;
2210 bool mergeonly = false;
2211 char const *seed = NULL;
2212 bool need_random = false;
2214 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2215 bool obsolete_usage = (posix2_version () < 200112);
2216 char *minus = "-", **files;
2217 char const *outfile = NULL;
2219 initialize_main (&argc, &argv);
2220 program_name = argv[0];
2221 setlocale (LC_ALL, "");
2222 bindtextdomain (PACKAGE, LOCALEDIR);
2223 textdomain (PACKAGE);
2227 initialize_exit_failure (SORT_FAILURE);
2228 atexit (close_stdout);
2230 hard_LC_COLLATE = hard_locale (LC_COLLATE);
2231 #if HAVE_NL_LANGINFO
2232 hard_LC_TIME = hard_locale (LC_TIME);
2235 /* Get locale's representation of the decimal point. */
2237 struct lconv const *locale = localeconv ();
2239 /* If the locale doesn't define a decimal point, or if the decimal
2240 point is multibyte, use the C locale's decimal point. FIXME:
2241 add support for multibyte decimal points. */
2242 decimal_point = to_uchar (locale->decimal_point[0]);
2243 if (! decimal_point || locale->decimal_point[1])
2244 decimal_point = '.';
2246 /* FIXME: add support for multibyte thousands separators. */
2247 thousands_sep = to_uchar (*locale->thousands_sep);
2248 if (! thousands_sep || locale->thousands_sep[1])
2252 have_read_stdin = false;
2257 static int const sig[] = { SIGHUP, SIGINT, SIGPIPE, SIGTERM };
2258 enum { nsigs = sizeof sig / sizeof sig[0] };
2261 struct sigaction act;
2263 sigemptyset (&caught_signals);
2264 for (i = 0; i < nsigs; i++)
2266 sigaction (sig[i], NULL, &act);
2267 if (act.sa_handler != SIG_IGN)
2268 sigaddset (&caught_signals, sig[i]);
2271 act.sa_handler = sighandler;
2272 act.sa_mask = caught_signals;
2275 for (i = 0; i < nsigs; i++)
2276 if (sigismember (&caught_signals, sig[i]))
2277 sigaction (sig[i], &act, NULL);
2279 for (i = 0; i < nsigs; i++)
2280 if (signal (sig[i], SIG_IGN) != SIG_IGN)
2282 signal (sig[i], sighandler);
2283 siginterrupt (sig[i], 1);
2288 gkey.sword = gkey.eword = SIZE_MAX;
2290 gkey.translate = NULL;
2291 gkey.numeric = gkey.general_numeric = gkey.random = false;
2292 gkey.month = gkey.reverse = false;
2293 gkey.skipsblanks = gkey.skipeblanks = false;
2295 files = xnmalloc (argc, sizeof *files);
2299 /* Parse an operand as a file after "--" was seen; or if
2300 pedantic and a file was seen, unless the POSIX version
2301 predates 1003.1-2001 and -c was not seen and the operand is
2302 "-o FILE" or "-oFILE". */
2305 || (posixly_correct && nfiles != 0
2306 && ! (obsolete_usage
2309 && argv[optind][0] == '-' && argv[optind][1] == 'o'
2310 && (argv[optind][2] || optind + 1 != argc)))
2311 || ((c = getopt_long (argc, argv, short_options,
2312 long_options, NULL))
2317 files[nfiles++] = argv[optind++];
2323 if (optarg[0] == '+')
2325 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
2326 && ISDIGIT (argv[optind][1]));
2327 obsolete_usage |= minus_pos_usage & ~posixly_correct;
2330 /* Treat +POS1 [-POS2] as a key if possible; but silently
2331 treat an operand as a file if it is not a valid +POS1. */
2333 s = parse_field_count (optarg + 1, &key->sword, NULL);
2335 s = parse_field_count (s + 1, &key->schar, NULL);
2336 if (! (key->sword | key->schar))
2337 key->sword = SIZE_MAX;
2338 if (! s || *set_ordering (s, key, bl_start))
2345 if (minus_pos_usage)
2347 char const *optarg1 = argv[optind++];
2348 s = parse_field_count (optarg1 + 1, &key->eword,
2349 N_("invalid number after `-'"));
2351 s = parse_field_count (s + 1, &key->echar,
2352 N_("invalid number after `.'"));
2353 if (*set_ordering (s, key, bl_end))
2354 badfieldspec (optarg1,
2355 N_("stray character in field spec"));
2362 files[nfiles++] = optarg;
2378 set_ordering (str, &gkey, bl_both);
2390 s = parse_field_count (optarg, &key->sword,
2391 N_("invalid number at field start"));
2394 /* Provoke with `sort -k0' */
2395 badfieldspec (optarg, N_("field number is zero"));
2399 s = parse_field_count (s + 1, &key->schar,
2400 N_("invalid number after `.'"));
2403 /* Provoke with `sort -k1.0' */
2404 badfieldspec (optarg, N_("character offset is zero"));
2407 if (! (key->sword | key->schar))
2408 key->sword = SIZE_MAX;
2409 s = set_ordering (s, key, bl_start);
2412 key->eword = SIZE_MAX;
2418 s = parse_field_count (s + 1, &key->eword,
2419 N_("invalid number after `,'"));
2422 /* Provoke with `sort -k1,0' */
2423 badfieldspec (optarg, N_("field number is zero"));
2426 s = parse_field_count (s + 1, &key->echar,
2427 N_("invalid number after `.'"));
2430 /* `-k 2,3' is equivalent to `+1 -3'. */
2433 s = set_ordering (s, key, bl_end);
2436 badfieldspec (optarg, N_("stray character in field spec"));
2445 if (outfile && !STREQ (outfile, optarg))
2446 error (SORT_FAILURE, 0, _("multiple output files specified"));
2459 specify_sort_size (optarg);
2464 char newtab = optarg[0];
2466 error (SORT_FAILURE, 0, _("empty tab"));
2469 if (STREQ (optarg, "\\0"))
2473 /* Provoke with `sort -txx'. Complain about
2474 "multi-character tab" instead of "multibyte tab", so
2475 that the diagnostic's wording does not need to be
2476 changed once multibyte characters are supported. */
2477 error (SORT_FAILURE, 0, _("multi-character tab %s"),
2481 if (tab != TAB_DEFAULT && tab != newtab)
2482 error (SORT_FAILURE, 0, _("incompatible tabs"));
2488 add_temp_dir (optarg);
2496 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
2497 through Solaris 7. It is also accepted by many non-Solaris
2498 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
2499 -y is marked as obsolete starting with Solaris 8 (1999), but is
2500 still accepted as of Solaris 10 prerelease (2004).
2502 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
2503 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
2504 and which in general ignores the argument after "-y" if it
2505 consists entirely of digits (it can even be empty). */
2506 if (optarg == argv[optind - 1])
2509 for (p = optarg; ISDIGIT (*p); p++)
2511 optind -= (*p != '\0');
2519 case_GETOPT_HELP_CHAR;
2521 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2524 usage (SORT_FAILURE);
2528 /* Inheritance of global options to individual keys. */
2529 for (key = keylist; key; key = key->next)
2531 if (! (key->ignore || key->translate
2532 || (key->skipsblanks | key->reverse
2533 | key->skipeblanks | key->month | key->numeric
2534 | key->general_numeric
2537 key->ignore = gkey.ignore;
2538 key->translate = gkey.translate;
2539 key->skipsblanks = gkey.skipsblanks;
2540 key->skipeblanks = gkey.skipeblanks;
2541 key->month = gkey.month;
2542 key->numeric = gkey.numeric;
2543 key->general_numeric = gkey.general_numeric;
2544 key->random = gkey.random;
2545 key->reverse = gkey.reverse;
2548 need_random |= key->random;
2551 if (!keylist && (gkey.ignore || gkey.translate
2552 || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
2553 | gkey.numeric | gkey.general_numeric
2557 need_random |= gkey.random;
2560 check_ordering_compatibility ();
2562 reverse = gkey.reverse;
2568 isaac_seed_start (&rand_state);
2569 isaac_seed_data (&rand_state, seed, strlen (seed));
2572 isaac_seed (&rand_state);
2575 if (temp_dir_count == 0)
2577 char const *tmp_dir = getenv ("TMPDIR");
2578 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
2590 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -c"),
2594 incompatible_options ("co");
2596 /* POSIX requires that sort return 1 IFF invoked with -c and the
2597 input is not properly sorted. */
2598 exit (check (files[0]) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
2602 merge (files, 0, nfiles, outfile);
2604 sort (files, nfiles, outfile);
2606 if (have_read_stdin && fclose (stdin) == EOF)
2607 die (_("close failed"), "-");
2609 exit (EXIT_SUCCESS);