1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 88, 1991-2003 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written December 1988 by Mike Haertel.
19 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
20 or (US mail) as Mike Haertel c/o Free Software Foundation.
22 Ørn E. Hansen added NLS support in 1997. */
27 #include <sys/types.h>
34 #include "hard-locale.h"
36 #include "long-options.h"
39 #include "stdio-safer.h"
43 #if HAVE_SYS_RESOURCE_H
44 # include <sys/resource.h>
47 struct rlimit { size_t rlim_cur; };
48 # define getrlimit(Resource, Rlp) (-1)
51 /* The official name of this program (e.g., no `g' prefix). */
52 #define PROGRAM_NAME "sort"
54 #define AUTHORS "Mike Haertel", "Paul Eggert"
56 #if HAVE_LANGINFO_CODESET
57 # include <langinfo.h>
61 # define sigprocmask(How, Set, Oset) /* empty */
69 #define UCHAR_LIM (UCHAR_MAX + 1)
70 #define UCHAR(c) ((unsigned char) (c))
72 #ifndef DEFAULT_TMPDIR
73 # define DEFAULT_TMPDIR "/tmp"
76 /* Use this as exit status in case of error, not EXIT_FAILURE. This
77 is necessary because EXIT_FAILURE is usually 1 and POSIX requires
78 that sort exit with status 1 IFF invoked with -c and the input is
79 not properly sorted. Any other irregular exit must exit with a
80 status code greater than 1. */
81 #define SORT_FAILURE 2
82 #define SORT_OUT_OF_ORDER 1
84 #define C_DECIMAL_POINT '.'
85 #define NEGATION_SIGN '-'
86 #define NUMERIC_ZERO '0'
90 static char decimal_point;
91 static int th_sep; /* if CHAR_MAX + 1, then there is no thousands separator */
93 /* Nonzero if the corresponding locales are hard. */
94 static bool hard_LC_COLLATE;
96 static bool hard_LC_TIME;
99 # define IS_THOUSANDS_SEP(x) ((x) == th_sep)
103 # define decimal_point C_DECIMAL_POINT
104 # define IS_THOUSANDS_SEP(x) 0
108 #define NONZERO(x) (x != 0)
110 /* The kind of blanks for '-b' to skip in various options. */
111 enum blanktype { bl_start, bl_end, bl_both };
113 /* The character marking end of line. Default to \n. */
114 static char eolchar = '\n';
116 /* Lines are held in core as counted strings. */
119 char *text; /* Text of the line. */
120 size_t length; /* Length including final newline. */
121 char *keybeg; /* Start of first key. */
122 char *keylim; /* Limit of first key. */
128 char *buf; /* Dynamically allocated buffer,
129 partitioned into 3 regions:
132 - an array of lines, in reverse order. */
133 size_t used; /* Number of bytes used for input data. */
134 size_t nlines; /* Number of lines in the line array. */
135 size_t alloc; /* Number of bytes allocated. */
136 size_t left; /* Number of bytes left from previous reads. */
137 size_t line_bytes; /* Number of bytes to reserve for each line. */
138 bool eof; /* An EOF has been read. */
143 size_t sword; /* Zero-origin 'word' to start at. */
144 size_t schar; /* Additional characters to skip. */
145 size_t eword; /* Zero-origin first word after field. */
146 size_t echar; /* Additional characters in field. */
147 bool const *ignore; /* Boolean array of characters to ignore. */
148 char const *translate; /* Translation applied to characters. */
149 bool skipsblanks; /* Skip leading blanks at start. */
150 bool skipeblanks; /* Skip trailing blanks at finish. */
151 bool numeric; /* Flag for numeric comparison. Handle
152 strings of digits with optional decimal
153 point, but no exponential notation. */
154 bool general_numeric; /* Flag for general, numeric comparison.
155 Handle numbers in exponential notation. */
156 bool month; /* Flag for comparison by month name. */
157 bool reverse; /* Reverse the sense of comparison. */
158 struct keyfield *next; /* Next keyfield to try. */
167 /* The name this program was run with. */
170 /* FIXME: None of these tables work with multibyte character sets.
171 Also, there are many other bugs when handling multibyte characters.
172 One way to fix this is to rewrite `sort' to use wide characters
173 internally, but doing this with good performance is a bit
176 /* Table of blanks. */
177 static bool blanks[UCHAR_LIM];
179 /* Table of non-printing characters. */
180 static bool nonprinting[UCHAR_LIM];
182 /* Table of non-dictionary characters (not letters, digits, or blanks). */
183 static bool nondictionary[UCHAR_LIM];
185 /* Translation table folding lower case to upper. */
186 static char fold_toupper[UCHAR_LIM];
188 #define MONTHS_PER_YEAR 12
190 /* Table mapping month names to integers.
191 Alphabetic order allows binary search. */
192 static struct month monthtab[] =
208 /* During the merge phase, the number of files to merge at once. */
211 /* Minimum size for a merge or check buffer. */
212 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
214 /* Minimum sort size; the code might not work with smaller sizes. */
215 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
217 /* The number of bytes needed for a merge or check buffer, which can
218 function relatively efficiently even if it holds only one line. If
219 a longer line is seen, this value is increased. */
220 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
222 /* The approximate maximum number of bytes of main memory to use, as
223 specified by the user. Zero if the user has not specified a size. */
224 static size_t sort_size;
226 /* The guessed size for non-regular files. */
227 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
229 /* Array of directory names in which any temporary files are to be created. */
230 static char const **temp_dirs;
232 /* Number of temporary directory names used. */
233 static size_t temp_dir_count;
235 /* Number of allocated slots in temp_dirs. */
236 static size_t temp_dir_alloc;
238 /* Flag to reverse the order of all comparisons. */
241 /* Flag for stable sort. This turns off the last ditch bytewise
242 comparison of lines, and instead leaves lines in the same order
243 they were read if all keys compare equal. */
246 /* If TAB has this value, blanks separate fields. */
247 enum { TAB_DEFAULT = CHAR_MAX + 1 };
249 /* Tab character separating fields. If TAB_DEFAULT, then fields are
250 separated by the empty string between a non-blank character and a blank
252 static int tab = TAB_DEFAULT;
254 /* Flag to remove consecutive duplicate lines from the output.
255 Only the last of a sequence of equal lines will be output. */
258 /* Nonzero if any of the input files are the standard input. */
259 static bool have_read_stdin;
261 /* List of key field comparisons to be tried. */
262 static struct keyfield *keylist;
264 static void sortlines_temp (struct line *, size_t, struct line *);
270 fprintf (stderr, _("Try `%s --help' for more information.\n"),
275 Usage: %s [OPTION]... [FILE]...\n\
279 Write sorted concatenation of all FILE(s) to standard output.\n\
285 Mandatory arguments to long options are mandatory for short options too.\n\
288 -b, --ignore-leading-blanks ignore leading blanks\n\
289 -d, --dictionary-order consider only blanks and alphanumeric characters\n\
290 -f, --ignore-case fold lower case to upper case characters\n\
293 -g, --general-numeric-sort compare according to general numerical value\n\
294 -i, --ignore-nonprinting consider only printable characters\n\
295 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
296 -n, --numeric-sort compare according to string numerical value\n\
297 -r, --reverse reverse the result of comparisons\n\
303 -c, --check check whether input is sorted; do not sort\n\
304 -k, --key=POS1[,POS2] start a key at POS1, end it at POS 2 (origin 1)\n\
305 -m, --merge merge already sorted files; do not sort\n\
306 -o, --output=FILE write result to FILE instead of standard output\n\
307 -s, --stable stabilize sort by disabling last-resort comparison\n\
308 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
311 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
312 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s\n\
313 multiple options specify multiple directories\n\
314 -u, --unique with -c: check for strict ordering\n\
315 otherwise: output only the first of an equal run\n\
318 -z, --zero-terminated end lines with 0 byte, not newline\n\
320 fputs (HELP_OPTION_DESCRIPTION, stdout);
321 fputs (VERSION_OPTION_DESCRIPTION, stdout);
324 POS is F[.C][OPTS], where F is the field number and C the character position\n\
325 in the field. OPTS is one or more single-letter ordering options, which\n\
326 override global ordering options for that key. If no key is given, use the\n\
327 entire line as the key.\n\
329 SIZE may be followed by the following multiplicative suffixes:\n\
332 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
334 With no FILE, or when FILE is -, read standard input.\n\
337 The locale specified by the environment affects sort order.\n\
338 Set LC_ALL=C to get the traditional sort order that uses\n\
339 native byte values.\n\
341 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
343 /* Don't use EXIT_FAILURE here in case it is defined to be 1.
344 POSIX requires that sort return 1 IFF invoked with -c and
345 the input is not properly sorted. */
346 assert (status == 0 || status == SORT_FAILURE);
350 #define COMMON_SHORT_OPTIONS "-bcdfgik:mMno:rsS:t:T:uz"
352 static struct option const long_options[] =
354 {"ignore-leading-blanks", no_argument, NULL, 'b'},
355 {"check", no_argument, NULL, 'c'},
356 {"dictionary-order", no_argument, NULL, 'd'},
357 {"ignore-case", no_argument, NULL, 'f'},
358 {"general-numeric-sort", no_argument, NULL, 'g'},
359 {"ignore-nonprinting", no_argument, NULL, 'i'},
360 {"key", required_argument, NULL, 'k'},
361 {"merge", no_argument, NULL, 'm'},
362 {"month-sort", no_argument, NULL, 'M'},
363 {"numeric-sort", no_argument, NULL, 'n'},
364 {"output", required_argument, NULL, 'o'},
365 {"reverse", no_argument, NULL, 'r'},
366 {"stable", no_argument, NULL, 's'},
367 {"buffer-size", required_argument, NULL, 'S'},
368 {"field-separator", required_argument, NULL, 't'},
369 {"temporary-directory", required_argument, NULL, 'T'},
370 {"unique", no_argument, NULL, 'u'},
371 {"zero-terminated", no_argument, NULL, 'z'},
372 {GETOPT_HELP_OPTION_DECL},
373 {GETOPT_VERSION_OPTION_DECL},
377 /* The set of signals that are caught. */
378 static sigset_t caught_signals;
380 /* The list of temporary files. */
383 struct tempnode *volatile next;
384 char name[1]; /* Actual size is 1 + file name length. */
386 static struct tempnode *volatile temphead;
388 /* Clean up any remaining temporary files. */
393 struct tempnode const *node;
395 for (node = temphead; node; node = node->next)
399 /* Report MESSAGE for FILE, then clean up and exit. */
401 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
403 die (char const *message, char const *file)
405 error (0, errno, "%s: %s", message, file);
409 /* Create a new temporary file, returning its newly allocated name.
410 Store into *PFP a stream open for writing. */
413 create_temp_file (FILE **pfp)
415 static char const slashbase[] = "/sortXXXXXX";
416 static size_t temp_dir_index;
420 char const *temp_dir = temp_dirs[temp_dir_index];
421 size_t len = strlen (temp_dir);
422 struct tempnode *node =
423 xmalloc (sizeof node->next + len + sizeof slashbase);
424 char *file = node->name;
426 memcpy (file, temp_dir, len);
427 memcpy (file + len, slashbase, sizeof slashbase);
428 node->next = temphead;
429 if (++temp_dir_index == temp_dir_count)
432 /* Create the temporary file in a critical section, to avoid races. */
433 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
438 sigprocmask (SIG_SETMASK, &oldset, NULL);
441 if (fd < 0 || (*pfp = fdopen (fd, "w")) == NULL)
442 die (_("cannot create temporary file"), file);
448 xfopen (const char *file, const char *how)
452 if (STREQ (file, "-"))
456 have_read_stdin = true;
464 if ((fp = fopen_safer (file, how)) == NULL)
465 die (_("open failed"), file);
471 /* Close FP, whose name is FILE, and report any errors. */
474 xfclose (FILE *fp, char const *file)
478 /* Allow reading stdin from tty more than once. */
484 if (fclose (fp) != 0)
485 die (_("close failed"), file);
490 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
492 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
493 die (_("write failed"), output_file);
496 /* Append DIR to the array of temporary directory names. */
498 add_temp_dir (char const *dir)
500 if (temp_dir_count == temp_dir_alloc)
502 temp_dir_alloc = temp_dir_alloc ? temp_dir_alloc * 2 : 16;
503 temp_dirs = xrealloc (temp_dirs, sizeof (temp_dirs) * temp_dir_alloc);
506 temp_dirs[temp_dir_count++] = dir;
509 /* Search through the list of temporary files for NAME;
510 remove it if it is found on the list. */
513 zaptemp (const char *name)
515 struct tempnode *volatile *pnode;
516 struct tempnode *node;
518 for (pnode = &temphead; (node = *pnode); pnode = &node->next)
519 if (node->name == name)
531 struct_month_cmp (const void *m1, const void *m2)
533 struct month const *month1 = m1;
534 struct month const *month2 = m2;
535 return strcmp (month1->name, month2->name);
540 /* Initialize the character class tables. */
547 for (i = 0; i < UCHAR_LIM; ++i)
549 blanks[i] = !!ISBLANK (i);
550 nonprinting[i] = !ISPRINT (i);
551 nondictionary[i] = !ISALNUM (i) && !ISBLANK (i);
552 fold_toupper[i] = (ISLOWER (i) ? toupper (i) : i);
556 /* If we're not in the "C" locale, read different names for months. */
559 for (i = 0; i < MONTHS_PER_YEAR; i++)
566 s = (char *) nl_langinfo (ABMON_1 + i);
568 monthtab[i].name = name = xmalloc (s_len + 1);
569 monthtab[i].val = i + 1;
571 for (j = 0; j < s_len; j++)
572 name[j] = fold_toupper[UCHAR (s[j])];
575 qsort ((void *) monthtab, MONTHS_PER_YEAR,
576 sizeof (struct month), struct_month_cmp);
581 /* Specify the amount of main memory to use when sorting. */
583 specify_sort_size (char const *s)
587 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
589 /* The default unit is KiB. */
590 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
592 if (n <= UINTMAX_MAX / 1024)
595 e = LONGINT_OVERFLOW;
598 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
599 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
608 double mem = physmem_total () * n / 100;
610 /* Use "<", not "<=", to avoid problems with rounding. */
611 if (mem < UINTMAX_MAX)
617 e = LONGINT_OVERFLOW;
624 /* If multiple sort sizes are specified, take the maximum, so
625 that option order does not matter. */
632 sort_size = MAX (sort_size, MIN_SORT_SIZE);
636 e = LONGINT_OVERFLOW;
639 STRTOL_FATAL_ERROR (s, _("sort size"), e);
642 /* Return the default sort size. */
644 default_sort_size (void)
646 /* Let MEM be available memory or 1/8 of total memory, whichever
648 double avail = physmem_available ();
649 double total = physmem_total ();
650 double mem = MAX (avail, total / 8);
651 struct rlimit rlimit;
653 /* Let SIZE be MEM, but no more than the maximum object size or
654 system resource limits. Avoid the MIN macro here, as it is not
655 quite right when only one argument is floating point. Don't
656 bother to check for values like RLIM_INFINITY since in practice
657 they are not much less than SIZE_MAX. */
658 size_t size = SIZE_MAX;
661 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
662 size = rlimit.rlim_cur;
664 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
665 size = rlimit.rlim_cur;
668 /* Leave a large safety margin for the above limits, as failure can
669 occur when they are exceeded. */
673 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
674 Exceeding RSS is not fatal, but can be quite slow. */
675 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
676 size = rlimit.rlim_cur / 16 * 15;
679 /* Use no less than the minimum. */
680 return MAX (size, MIN_SORT_SIZE);
683 /* Return the sort buffer size to use with the input files identified
684 by FPS and FILES, which are alternate paths to the same files.
685 NFILES gives the number of input files; NFPS may be less. Assume
686 that each input line requires LINE_BYTES extra bytes' worth of line
687 information. Do not exceed a bound on the size: if the bound is
688 not specified by the user, use a default. */
691 sort_buffer_size (FILE *const *fps, int nfps,
692 char *const *files, int nfiles,
695 /* A bound on the input size. If zero, the bound hasn't been
697 static size_t size_bound;
699 /* In the worst case, each input byte is a newline. */
700 size_t worst_case_per_input_byte = line_bytes + 1;
702 /* Keep enough room for one extra input line and an extra byte.
703 This extra room might be needed when preparing to read EOF. */
704 size_t size = worst_case_per_input_byte + 1;
708 for (i = 0; i < nfiles; i++)
714 if ((i < nfps ? fstat (fileno (fps[i]), &st)
715 : strcmp (files[i], "-") == 0 ? fstat (STDIN_FILENO, &st)
716 : stat (files[i], &st))
718 die (_("stat failed"), files[i]);
720 if (S_ISREG (st.st_mode))
721 file_size = st.st_size;
724 /* The file has unknown size. If the user specified a sort
725 buffer size, use that; otherwise, guess the size. */
728 file_size = INPUT_FILE_SIZE_GUESS;
733 size_bound = sort_size;
735 size_bound = default_sort_size ();
738 /* Add the amount of memory needed to represent the worst case
739 where the input consists entirely of newlines followed by a
740 single non-newline. Check for overflow. */
741 worst_case = file_size * worst_case_per_input_byte + 1;
742 if (file_size != worst_case / worst_case_per_input_byte
743 || size_bound - size <= worst_case)
751 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
752 must be at least sizeof (struct line). Allocate ALLOC bytes
756 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
758 /* Ensure that the line array is properly aligned. If the desired
759 size cannot be allocated, repeatedly halve it until allocation
760 succeeds. The smaller allocation may hurt overall performance,
761 but that's better than failing. */
764 alloc += sizeof (struct line) - alloc % sizeof (struct line);
765 buf->buf = malloc (alloc);
769 if (alloc <= line_bytes + 1)
773 buf->line_bytes = line_bytes;
775 buf->used = buf->left = buf->nlines = 0;
779 /* Return one past the limit of the line array. */
781 static inline struct line *
782 buffer_linelim (struct buffer const *buf)
784 return (struct line *) (buf->buf + buf->alloc);
787 /* Return a pointer to the first character of the field specified
791 begfield (const struct line *line, const struct keyfield *key)
793 register char *ptr = line->text, *lim = ptr + line->length - 1;
794 register size_t sword = key->sword;
795 register size_t schar = key->schar;
796 register size_t remaining_bytes;
798 /* The leading field separator itself is included in a field when -t
801 if (tab != TAB_DEFAULT)
802 while (ptr < lim && sword--)
804 while (ptr < lim && *ptr != tab)
810 while (ptr < lim && sword--)
812 while (ptr < lim && blanks[UCHAR (*ptr)])
814 while (ptr < lim && !blanks[UCHAR (*ptr)])
818 if (key->skipsblanks)
819 while (ptr < lim && blanks[UCHAR (*ptr)])
822 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
823 remaining_bytes = lim - ptr;
824 if (schar < remaining_bytes)
832 /* Return the limit of (a pointer to the first character after) the field
833 in LINE specified by KEY. */
836 limfield (const struct line *line, const struct keyfield *key)
838 register char *ptr = line->text, *lim = ptr + line->length - 1;
839 register size_t eword = key->eword, echar = key->echar;
840 register size_t remaining_bytes;
842 /* Move PTR past EWORD fields or to one past the last byte on LINE,
843 whichever comes first. If there are more than EWORD fields, leave
844 PTR pointing at the beginning of the field having zero-based index,
845 EWORD. If a delimiter character was specified (via -t), then that
846 `beginning' is the first character following the delimiting TAB.
847 Otherwise, leave PTR pointing at the first `blank' character after
848 the preceding field. */
849 if (tab != TAB_DEFAULT)
850 while (ptr < lim && eword--)
852 while (ptr < lim && *ptr != tab)
854 if (ptr < lim && (eword | echar))
858 while (ptr < lim && eword--)
860 while (ptr < lim && blanks[UCHAR (*ptr)])
862 while (ptr < lim && !blanks[UCHAR (*ptr)])
866 #ifdef POSIX_UNSPECIFIED
867 /* The following block of code makes GNU sort incompatible with
868 standard Unix sort, so it's ifdef'd out for now.
869 The POSIX spec isn't clear on how to interpret this.
870 FIXME: request clarification.
872 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
873 Date: Thu, 30 May 96 12:20:41 -0400
874 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
876 [...]I believe I've found another bug in `sort'.
881 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
884 $ /bin/sort -k1.7,1.7 </tmp/sort.in
888 Unix sort produced the answer I expected: sort on the single character
889 in column 7. GNU sort produced different results, because it disagrees
890 on the interpretation of the key-end spec "M.N". Unix sort reads this
891 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
892 "skip M-1 fields, then either N-1 characters or the rest of the current
893 field, whichever comes first". This extra clause applies only to
894 key-ends, not key-starts.
897 /* Make LIM point to the end of (one byte past) the current field. */
898 if (tab != TAB_DEFAULT)
901 newlim = memchr (ptr, tab, lim - ptr);
909 while (newlim < lim && blanks[UCHAR (*newlim)])
911 while (newlim < lim && !blanks[UCHAR (*newlim)])
917 /* If we're skipping leading blanks, don't start counting characters
918 until after skipping past any leading blanks. */
919 if (key->skipsblanks)
920 while (ptr < lim && blanks[UCHAR (*ptr)])
923 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
924 remaining_bytes = lim - ptr;
925 if (echar < remaining_bytes)
933 /* Return the number of trailing blanks in FIELD, with LEN bytes. */
936 trailing_blanks (char const *field, size_t len)
939 for (i = len; 0 < i && blanks[UCHAR (field[i - 1])]; i--)
944 /* Fill BUF reading from FP, moving buf->left bytes from the end
945 of buf->buf to the beginning first. If EOF is reached and the
946 file wasn't terminated by a newline, supply one. Set up BUF's line
947 table too. FILE is the name of the file corresponding to FP.
948 Return true if some input was read. */
951 fillbuf (struct buffer *buf, register FILE *fp, char const *file)
953 struct keyfield const *key = keylist;
955 size_t line_bytes = buf->line_bytes;
956 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
961 if (buf->used != buf->left)
963 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
964 buf->used = buf->left;
970 char *ptr = buf->buf + buf->used;
971 struct line *linelim = buffer_linelim (buf);
972 struct line *line = linelim - buf->nlines;
973 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
974 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
976 while (line_bytes + 1 < avail)
978 /* Read as many bytes as possible, but do not read so many
979 bytes that there might not be enough room for the
980 corresponding line array. The worst case is when the
981 rest of the input file consists entirely of newlines,
982 except that the last byte is not a newline. */
983 size_t readsize = (avail - 1) / (line_bytes + 1);
984 size_t bytes_read = fread (ptr, 1, readsize, fp);
985 char *ptrlim = ptr + bytes_read;
989 if (bytes_read != readsize)
992 die (_("read failed"), file);
996 if (buf->buf == ptrlim)
998 if (ptrlim[-1] != eol)
1003 /* Find and record each line in the just-read input. */
1004 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1008 line->text = line_start;
1009 line->length = ptr - line_start;
1010 mergesize = MAX (mergesize, line->length);
1011 avail -= line_bytes;
1015 /* Precompute the position of the first key for
1017 line->keylim = (key->eword == SIZE_MAX
1019 : limfield (line, key));
1021 if (key->sword != SIZE_MAX)
1022 line->keybeg = begfield (line, key);
1025 if (key->skipsblanks)
1026 while (blanks[UCHAR (*line_start)])
1028 line->keybeg = line_start;
1030 if (key->skipeblanks)
1032 size_t keylen = line->keylim - line->keybeg;
1033 line->keylim -= trailing_blanks (line->keybeg, keylen);
1045 buf->used = ptr - buf->buf;
1046 buf->nlines = buffer_linelim (buf) - line;
1047 if (buf->nlines != 0)
1049 buf->left = ptr - line_start;
1050 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1054 /* The current input line is too long to fit in the buffer.
1055 Double the buffer size and try again. */
1056 if (2 * buf->alloc < buf->alloc)
1059 buf->buf = xrealloc (buf->buf, buf->alloc);
1063 /* Compare strings A and B containing decimal fractions < 1. Each string
1064 should begin with a decimal point followed immediately by the digits
1065 of the fraction. Strings not of this form are considered to be zero. */
1067 /* The goal here, is to take two numbers a and b... compare these
1068 in parallel. Instead of converting each, and then comparing the
1069 outcome. Most likely stopping the comparison before the conversion
1070 is complete. The algorithm used, in the old sort:
1072 Algorithm: fraccompare
1073 Action : compare two decimal fractions
1074 accepts : char *a, char *b
1075 returns : -1 if a<b, 0 if a=b, 1 if a>b.
1078 if *a == decimal_point AND *b == decimal_point
1079 find first character different in a and b.
1080 if both are digits, return the difference *a - *b.
1083 if digit return 1, else 0
1086 if digit return -1, else 0
1087 if *a is a decimal_point
1088 skip past decimal_point and zeros
1089 if digit return 1, else 0
1090 if *b is a decimal_point
1091 skip past decimal_point and zeros
1092 if digit return -1, else 0
1096 fraccompare (register const char *a, register const char *b)
1098 if (*a == decimal_point && *b == decimal_point)
1100 while (*++a == *++b)
1103 if (ISDIGIT (*a) && ISDIGIT (*b))
1106 goto a_trailing_nonzero;
1108 goto b_trailing_nonzero;
1111 else if (*a++ == decimal_point)
1114 while (*a == NUMERIC_ZERO)
1116 return ISDIGIT (*a);
1118 else if (*b++ == decimal_point)
1121 while (*b == NUMERIC_ZERO)
1123 return - ISDIGIT (*b);
1128 /* Compare strings A and B as numbers without explicitly converting them to
1129 machine numbers. Comparatively slow for short strings, but asymptotically
1133 numcompare (register const char *a, register const char *b)
1135 register int tmpa, tmpb, tmp;
1136 register size_t log_a, log_b;
1141 while (blanks[UCHAR (tmpa)])
1143 while (blanks[UCHAR (tmpb)])
1146 if (tmpa == NEGATION_SIGN)
1150 while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa));
1151 if (tmpb != NEGATION_SIGN)
1153 if (tmpa == decimal_point)
1156 while (tmpa == NUMERIC_ZERO);
1159 while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb))
1161 if (tmpb == decimal_point)
1164 while (tmpb == NUMERIC_ZERO);
1171 while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb));
1173 while (tmpa == tmpb && ISDIGIT (tmpa))
1177 while (IS_THOUSANDS_SEP (tmpa));
1180 while (IS_THOUSANDS_SEP (tmpb));
1183 if ((tmpa == decimal_point && !ISDIGIT (tmpb))
1184 || (tmpb == decimal_point && !ISDIGIT (tmpa)))
1185 return -fraccompare (a, b);
1189 for (log_a = 0; ISDIGIT (tmpa); ++log_a)
1192 while (IS_THOUSANDS_SEP (tmpa));
1194 for (log_b = 0; ISDIGIT (tmpb); ++log_b)
1197 while (IS_THOUSANDS_SEP (tmpb));
1200 return log_a < log_b ? 1 : -1;
1207 else if (tmpb == NEGATION_SIGN)
1211 while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb));
1212 if (tmpb == decimal_point)
1215 while (tmpb == NUMERIC_ZERO);
1218 while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa))
1220 if (tmpa == decimal_point)
1223 while (tmpa == NUMERIC_ZERO);
1230 while (tmpa == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpa))
1232 while (tmpb == NUMERIC_ZERO || IS_THOUSANDS_SEP (tmpb))
1235 while (tmpa == tmpb && ISDIGIT (tmpa))
1239 while (IS_THOUSANDS_SEP (tmpa));
1242 while (IS_THOUSANDS_SEP (tmpb));
1245 if ((tmpa == decimal_point && !ISDIGIT (tmpb))
1246 || (tmpb == decimal_point && !ISDIGIT (tmpa)))
1247 return fraccompare (a, b);
1251 for (log_a = 0; ISDIGIT (tmpa); ++log_a)
1254 while (IS_THOUSANDS_SEP (tmpa));
1256 for (log_b = 0; ISDIGIT (tmpb); ++log_b)
1259 while (IS_THOUSANDS_SEP (tmpb));
1262 return log_a < log_b ? -1 : 1;
1272 general_numcompare (const char *sa, const char *sb)
1274 /* FIXME: add option to warn about failed conversions. */
1275 /* FIXME: maybe add option to try expensive FP conversion
1276 only if A and B can't be compared more cheaply/accurately. */
1280 double a = strtod (sa, &ea);
1281 double b = strtod (sb, &eb);
1283 /* Put conversion errors at the start of the collating sequence. */
1285 return sb == eb ? 0 : -1;
1289 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1290 conversion errors but before numbers; sort them by internal
1291 bit-pattern, for lack of a more portable alternative. */
1297 : memcmp ((char *) &a, (char *) &b, sizeof a));
1300 /* Return an integer in 1..12 of the month name S with length LEN.
1301 Return 0 if the name in S is not recognized. */
1304 getmonth (const char *s, size_t len)
1308 register int lo = 0, hi = MONTHS_PER_YEAR, result;
1310 while (len > 0 && blanks[UCHAR (*s)])
1319 month = alloca (len + 1);
1320 for (i = 0; i < len; ++i)
1321 month[i] = fold_toupper[UCHAR (s[i])];
1322 len -= trailing_blanks (month, len);
1327 int ix = (lo + hi) / 2;
1329 if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
1334 while (hi - lo > 1);
1336 result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
1337 ? monthtab[lo].val : 0);
1342 /* Compare two lines A and B trying every key in sequence until there
1343 are no more keys or a difference is found. */
1346 keycompare (const struct line *a, const struct line *b)
1348 struct keyfield const *key = keylist;
1350 /* For the first iteration only, the key positions have been
1351 precomputed for us. */
1352 register char *texta = a->keybeg;
1353 register char *textb = b->keybeg;
1354 register char *lima = a->keylim;
1355 register char *limb = b->keylim;
1361 register char const *translate = key->translate;
1362 register bool const *ignore = key->ignore;
1364 /* Find the lengths. */
1365 size_t lena = lima <= texta ? 0 : lima - texta;
1366 size_t lenb = limb <= textb ? 0 : limb - textb;
1368 if (key->skipeblanks)
1370 lena -= trailing_blanks (texta, lena);
1371 lenb -= trailing_blanks (textb, lenb);
1374 /* Actually compare the fields. */
1375 if (key->numeric | key->general_numeric)
1377 char savea = *lima, saveb = *limb;
1379 *lima = *limb = '\0';
1380 diff = ((key->numeric ? numcompare : general_numcompare)
1382 *lima = savea, *limb = saveb;
1384 else if (key->month)
1385 diff = getmonth (texta, lena) - getmonth (textb, lenb);
1386 /* Sorting like this may become slow, so in a simple locale the user
1387 can select a faster sort that is similar to ascii sort */
1388 else if (HAVE_SETLOCALE && hard_LC_COLLATE)
1390 if (ignore || translate)
1392 char *copy_a = alloca (lena + 1 + lenb + 1);
1393 char *copy_b = copy_a + lena + 1;
1394 size_t new_len_a, new_len_b, i;
1396 /* Ignore and/or translate chars before comparing. */
1397 for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1401 copy_a[new_len_a] = (translate
1402 ? translate[UCHAR (texta[i])]
1404 if (!ignore || !ignore[UCHAR (texta[i])])
1409 copy_b[new_len_b] = (translate
1410 ? translate[UCHAR (textb[i])]
1412 if (!ignore || !ignore[UCHAR (textb[i])])
1417 diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1420 diff = - NONZERO (lenb);
1424 diff = xmemcoll (texta, lena, textb, lenb);
1428 #define CMP_WITH_IGNORE(A, B) \
1433 while (texta < lima && ignore[UCHAR (*texta)]) \
1435 while (textb < limb && ignore[UCHAR (*textb)]) \
1437 if (! (texta < lima && textb < limb)) \
1439 diff = UCHAR (A) - UCHAR (B); \
1446 diff = (texta < lima) - (textb < limb); \
1451 CMP_WITH_IGNORE (translate[UCHAR (*texta)],
1452 translate[UCHAR (*textb)]);
1454 CMP_WITH_IGNORE (*texta, *textb);
1457 diff = - NONZERO (lenb);
1464 while (texta < lima && textb < limb)
1466 diff = (UCHAR (translate[UCHAR (*texta++)])
1467 - UCHAR (translate[UCHAR (*textb++)]));
1474 diff = memcmp (texta, textb, MIN (lena, lenb));
1478 diff = lena < lenb ? -1 : lena != lenb;
1488 /* Find the beginning and limit of the next field. */
1489 if (key->eword != SIZE_MAX)
1490 lima = limfield (a, key), limb = limfield (b, key);
1492 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1494 if (key->sword != SIZE_MAX)
1495 texta = begfield (a, key), textb = begfield (b, key);
1498 texta = a->text, textb = b->text;
1499 if (key->skipsblanks)
1501 while (texta < lima && blanks[UCHAR (*texta)])
1503 while (textb < limb && blanks[UCHAR (*textb)])
1514 return key->reverse ? -diff : diff;
1517 /* Compare two lines A and B, returning negative, zero, or positive
1518 depending on whether A compares less than, equal to, or greater than B. */
1521 compare (register const struct line *a, register const struct line *b)
1526 /* First try to compare on the specified keys (if any).
1527 The only two cases with no key at all are unadorned sort,
1528 and unadorned sort -r. */
1531 diff = keycompare (a, b);
1533 if (diff | unique | stable)
1537 /* If the keys all compare equal (or no keys were specified)
1538 fall through to the default comparison. */
1539 alen = a->length - 1, blen = b->length - 1;
1542 diff = - NONZERO (blen);
1545 else if (HAVE_SETLOCALE && hard_LC_COLLATE)
1546 diff = xmemcoll (a->text, alen, b->text, blen);
1547 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1548 diff = alen < blen ? -1 : alen != blen;
1550 return reverse ? -diff : diff;
1553 /* Check that the lines read from FILE_NAME come in order. Print a
1554 diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1555 false if they are not in order. Otherwise, print no diagnostic
1559 check (char const *file_name)
1561 FILE *fp = xfopen (file_name, "r");
1562 struct buffer buf; /* Input buffer. */
1563 struct line temp; /* Copy of previous line. */
1565 uintmax_t line_number = 0;
1566 struct keyfield const *key = keylist;
1567 bool nonunique = ! unique;
1568 bool ordered = true;
1570 initbuf (&buf, sizeof (struct line),
1571 MAX (merge_buffer_size, sort_size));
1574 while (fillbuf (&buf, fp, file_name))
1576 struct line const *line = buffer_linelim (&buf);
1577 struct line const *linebase = line - buf.nlines;
1579 /* Make sure the line saved from the old buffer contents is
1580 less than or equal to the first line of the new buffer. */
1581 if (alloc && nonunique <= compare (&temp, line - 1))
1585 struct line const *disorder_line = line - 1;
1586 uintmax_t disorder_line_number =
1587 buffer_linelim (&buf) - disorder_line + line_number;
1588 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1589 fprintf (stderr, _("%s: %s:%s: disorder: "),
1590 program_name, file_name,
1591 umaxtostr (disorder_line_number, hr_buf));
1592 write_bytes (disorder_line->text, disorder_line->length, stderr,
1593 _("standard error"));
1599 /* Compare each line in the buffer with its successor. */
1600 while (linebase < --line)
1601 if (nonunique <= compare (line, line - 1))
1602 goto found_disorder;
1604 line_number += buf.nlines;
1606 /* Save the last line of the buffer. */
1607 if (alloc < line->length)
1614 alloc = line->length;
1618 while (alloc < line->length);
1620 temp.text = xrealloc (temp.text, alloc);
1622 memcpy (temp.text, line->text, line->length);
1623 temp.length = line->length;
1626 temp.keybeg = temp.text + (line->keybeg - line->text);
1627 temp.keylim = temp.text + (line->keylim - line->text);
1631 xfclose (fp, file_name);
1638 /* Merge lines from FILES onto OFP. NFILES cannot be greater than
1639 NMERGE. Close input and output files before returning.
1640 OUTPUT_FILE gives the name of the output file; if OFP is NULL, the
1641 output file has not been opened yet. */
1644 mergefps (char **files, register int nfiles,
1645 FILE *ofp, const char *output_file)
1647 FILE *fps[NMERGE]; /* Input streams for each file. */
1648 struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1649 struct line saved; /* Saved line storage for unique check. */
1650 struct line const *savedline = NULL;
1651 /* &saved if there is a saved line. */
1652 size_t savealloc = 0; /* Size allocated for the saved line. */
1653 struct line const *cur[NMERGE]; /* Current line in each line table. */
1654 struct line const *base[NMERGE]; /* Base of each line table. */
1655 int ord[NMERGE]; /* Table representing a permutation of fps,
1656 such that cur[ord[0]] is the smallest line
1657 and will be next output. */
1658 register int i, j, t;
1659 struct keyfield const *key = keylist;
1662 /* Read initial lines from each input file. */
1663 for (i = 0; i < nfiles; )
1665 fps[i] = xfopen (files[i], "r");
1666 initbuf (&buffer[i], sizeof (struct line),
1667 MAX (merge_buffer_size, sort_size / nfiles));
1668 if (fillbuf (&buffer[i], fps[i], files[i]))
1670 struct line const *linelim = buffer_linelim (&buffer[i]);
1671 cur[i] = linelim - 1;
1672 base[i] = linelim - buffer[i].nlines;
1677 /* fps[i] is empty; eliminate it from future consideration. */
1678 xfclose (fps[i], files[i]);
1680 free (buffer[i].buf);
1682 for (j = i; j < nfiles; ++j)
1683 files[j] = files[j + 1];
1688 ofp = xfopen (output_file, "w");
1690 /* Set up the ord table according to comparisons among input lines.
1691 Since this only reorders two items if one is strictly greater than
1692 the other, it is stable. */
1693 for (i = 0; i < nfiles; ++i)
1695 for (i = 1; i < nfiles; ++i)
1696 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
1697 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1699 /* Repeatedly output the smallest line until no input remains. */
1702 struct line const *smallest = cur[ord[0]];
1704 /* If uniquified output is turned on, output only the first of
1705 an identical series of lines. */
1708 if (savedline && compare (savedline, smallest))
1711 write_bytes (saved.text, saved.length, ofp, output_file);
1716 if (savealloc < smallest->length)
1721 savealloc = smallest->length;
1724 while ((savealloc *= 2) < smallest->length);
1726 saved.text = xrealloc (saved.text, savealloc);
1728 saved.length = smallest->length;
1729 memcpy (saved.text, smallest->text, saved.length);
1733 saved.text + (smallest->keybeg - smallest->text);
1735 saved.text + (smallest->keylim - smallest->text);
1740 write_bytes (smallest->text, smallest->length, ofp, output_file);
1742 /* Check if we need to read more lines into core. */
1743 if (base[ord[0]] < smallest)
1744 cur[ord[0]] = smallest - 1;
1747 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]]))
1749 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
1750 cur[ord[0]] = linelim - 1;
1751 base[ord[0]] = linelim - buffer[ord[0]].nlines;
1755 /* We reached EOF on fps[ord[0]]. */
1756 for (i = 1; i < nfiles; ++i)
1757 if (ord[i] > ord[0])
1760 xfclose (fps[ord[0]], files[ord[0]]);
1761 zaptemp (files[ord[0]]);
1762 free (buffer[ord[0]].buf);
1763 for (i = ord[0]; i < nfiles; ++i)
1765 fps[i] = fps[i + 1];
1766 files[i] = files[i + 1];
1767 buffer[i] = buffer[i + 1];
1768 cur[i] = cur[i + 1];
1769 base[i] = base[i + 1];
1771 for (i = 0; i < nfiles; ++i)
1772 ord[i] = ord[i + 1];
1777 /* The new line just read in may be larger than other lines
1778 already in core; push it back in the queue until we encounter
1779 a line larger than it. */
1780 for (i = 1; i < nfiles; ++i)
1782 t = compare (cur[ord[0]], cur[ord[i]]);
1784 t = ord[0] - ord[i];
1789 for (j = 1; j < i; ++j)
1790 ord[j - 1] = ord[j];
1794 if (unique && savedline)
1796 write_bytes (saved.text, saved.length, ofp, output_file);
1800 xfclose (ofp, output_file);
1803 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1804 and HI (with NHI members). T, LO, and HI point just past their
1805 respective arrays, and the arrays are in reverse order. NLO and
1806 NHI must be positive, and HI - NHI must equal T - (NLO + NHI). */
1809 mergelines (struct line *t,
1810 struct line const *lo, size_t nlo,
1811 struct line const *hi, size_t nhi)
1814 if (compare (lo - 1, hi - 1) <= 0)
1819 /* HI - NHI equalled T - (NLO + NHI) when this function
1820 began. Therefore HI must equal T now, and there is no
1821 need to copy from HI to T. */
1839 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1840 NLINES must be at least 2.
1841 The input and output arrays are in reverse order, and LINES and
1842 TEMP point just past the end of their respective arrays.
1844 Use a recursive divide-and-conquer algorithm, in the style
1845 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
1846 the optimization suggested by exercise 5.2.4-10; this requires room
1847 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
1848 writes that this memory optimization was originally published by
1849 D. A. Bell, Comp J. 1 (1958), 75. */
1852 sortlines (struct line *lines, size_t nlines, struct line *temp)
1856 if (0 < compare (&lines[-1], &lines[-2]))
1858 struct line tmp = lines[-1];
1859 lines[-1] = lines[-2];
1865 size_t nlo = nlines / 2;
1866 size_t nhi = nlines - nlo;
1867 struct line *lo = lines;
1868 struct line *hi = lines - nlo;
1869 struct line *sorted_lo = temp;
1871 sortlines (hi, nhi, temp);
1873 sortlines_temp (lo, nlo, sorted_lo);
1875 sorted_lo[-1] = lo[-1];
1877 mergelines (lines, sorted_lo, nlo, hi, nhi);
1881 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1882 rather than sorting in place. */
1885 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
1889 bool swap = (0 < compare (&lines[-1], &lines[-2]));
1890 temp[-1] = lines[-1 - swap];
1891 temp[-2] = lines[-2 + swap];
1895 size_t nlo = nlines / 2;
1896 size_t nhi = nlines - nlo;
1897 struct line *lo = lines;
1898 struct line *hi = lines - nlo;
1899 struct line *sorted_hi = temp - nlo;
1901 sortlines_temp (hi, nhi, sorted_hi);
1903 sortlines (lo, nlo, temp);
1905 mergelines (temp, lo, nlo, sorted_hi, nhi);
1909 /* Return the index of the first of NFILES FILES that is the same file
1910 as OUTFILE. If none can be the same, return NFILES. Consider an
1911 input pipe to be the same as OUTFILE, since the pipe might be the
1912 output of a command like "cat OUTFILE". */
1915 first_same_file (char * const *files, int nfiles, char const *outfile)
1918 bool got_outstat = false;
1919 struct stat instat, outstat;
1921 for (i = 0; i < nfiles; i++)
1923 bool standard_input = STREQ (files[i], "-");
1925 if (STREQ (outfile, files[i]) && ! standard_input)
1931 if ((STREQ (outfile, "-")
1932 ? fstat (STDOUT_FILENO, &outstat)
1933 : stat (outfile, &outstat))
1938 if (((standard_input
1939 ? fstat (STDIN_FILENO, &instat)
1940 : stat (files[i], &instat))
1942 && (S_ISFIFO (instat.st_mode) || SAME_INODE (instat, outstat)))
1949 /* Merge NFILES FILES onto OUTPUT_FILE. However, merge at most
1950 MAX_MERGE input files directly onto OUTPUT_FILE. MAX_MERGE cannot
1954 merge (char **files, int nfiles, int max_merge, char const *output_file)
1956 while (max_merge < nfiles)
1961 for (i = 0; i < nfiles / NMERGE; ++i)
1963 temp = create_temp_file (&tfp);
1964 mergefps (&files[i * NMERGE], NMERGE, tfp, temp);
1967 temp = create_temp_file (&tfp);
1968 mergefps (&files[i * NMERGE], nfiles % NMERGE, tfp, temp);
1975 mergefps (files, nfiles, NULL, output_file);
1978 /* Sort NFILES FILES onto OUTPUT_FILE. */
1981 sort (char * const *files, int nfiles, char const *output_file)
1984 int n_temp_files = 0;
1985 bool output_file_created = false;
1991 char const *temp_output;
1992 char const *file = *files;
1993 FILE *fp = xfopen (file, "r");
1995 size_t bytes_per_line = 2 * sizeof (struct line) - sizeof (struct line) / 2;
1998 initbuf (&buf, bytes_per_line,
1999 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2004 while (fillbuf (&buf, fp, file))
2007 struct line *linebase;
2009 if (buf.eof && nfiles
2010 && (bytes_per_line + 1
2011 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2013 /* End of file, but there is more input and buffer room.
2014 Concatenate the next input file; this is faster in
2016 buf.left = buf.used;
2020 line = buffer_linelim (&buf);
2021 linebase = line - buf.nlines;
2023 sortlines (line, buf.nlines, linebase);
2024 if (buf.eof && !nfiles && !n_temp_files && !buf.left)
2027 tfp = xfopen (output_file, "w");
2028 temp_output = output_file;
2029 output_file_created = true;
2034 temp_output = create_temp_file (&tfp);
2040 write_bytes (line->text, line->length, tfp, temp_output);
2042 while (linebase < line && compare (line, line - 1) == 0)
2045 while (linebase < line);
2047 xfclose (tfp, temp_output);
2049 if (output_file_created)
2058 if (! output_file_created)
2060 int i = n_temp_files;
2061 struct tempnode *node;
2062 char **tempfiles = xmalloc (n_temp_files * sizeof (char *));
2063 for (node = temphead; i > 0; node = node->next)
2064 tempfiles[--i] = node->name;
2065 merge (tempfiles, n_temp_files, NMERGE, output_file);
2070 /* Insert key KEY at the end of the key list. */
2073 insertkey (struct keyfield *key)
2075 struct keyfield **p;
2077 for (p = &keylist; *p; p = &(*p)->next)
2083 /* Report a bad field specification SPEC, with extra info MSGID. */
2085 static void badfieldspec (char const *, char const *)
2088 badfieldspec (char const *spec, char const *msgid)
2090 error (SORT_FAILURE, 0, _("%s: invalid field specification `%s'"),
2095 /* Parse the leading integer in STRING and store the resulting value
2096 (which must fit into size_t) into *VAL. Return the address of the
2097 suffix after the integer. If MSGID is NULL, return NULL after
2098 failure; otherwise, report MSGID and exit on failure. */
2101 parse_field_count (char const *string, size_t *val, char const *msgid)
2106 switch (xstrtoumax (string, &suffix, 10, &n, ""))
2109 case LONGINT_INVALID_SUFFIX_CHAR:
2114 case LONGINT_OVERFLOW:
2115 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2117 error (SORT_FAILURE, 0, _("%s: count `%.*s' too large"),
2118 _(msgid), (int) (suffix - string), string);
2121 case LONGINT_INVALID:
2123 error (SORT_FAILURE, 0, _("%s: invalid count at start of `%s'"),
2131 /* Handle interrupts and hangups. */
2134 sighandler (int sig)
2136 #ifndef SA_NOCLDSTOP
2137 signal (sig, SIG_IGN);
2144 struct sigaction sigact;
2146 sigact.sa_handler = SIG_DFL;
2147 sigemptyset (&sigact.sa_mask);
2148 sigact.sa_flags = 0;
2149 sigaction (sig, &sigact, NULL);
2152 signal (sig, SIG_DFL);
2158 /* Set the ordering options for KEY specified in S.
2159 Return the address of the first character in S that
2160 is not a valid ordering option.
2161 BLANKTYPE is the kind of blanks that 'b' should skip. */
2164 set_ordering (register const char *s, struct keyfield *key,
2165 enum blanktype blanktype)
2172 if (blanktype == bl_start || blanktype == bl_both)
2173 key->skipsblanks = true;
2174 if (blanktype == bl_end || blanktype == bl_both)
2175 key->skipeblanks = true;
2178 key->ignore = nondictionary;
2181 key->translate = fold_toupper;
2184 key->general_numeric = true;
2187 /* Option order should not matter, so don't let -i override
2188 -d. -d implies -i, but -i does not imply -d. */
2190 key->ignore = nonprinting;
2196 key->numeric = true;
2199 key->reverse = true;
2209 static struct keyfield *
2212 struct keyfield *key = xcalloc (1, sizeof *key);
2213 key->eword = SIZE_MAX;
2218 main (int argc, char **argv)
2220 struct keyfield *key;
2221 struct keyfield gkey;
2224 bool checkonly = false;
2225 bool mergeonly = false;
2227 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2228 bool obsolete_usage = (posix2_version () < 200112);
2229 char const *short_options = (obsolete_usage
2230 ? COMMON_SHORT_OPTIONS "y::"
2231 : COMMON_SHORT_OPTIONS "y:");
2232 char *minus = "-", **files;
2233 char const *outfile = minus;
2234 static int const sigs[] = { SIGHUP, SIGINT, SIGPIPE, SIGTERM };
2235 unsigned int nsigs = sizeof sigs / sizeof *sigs;
2237 struct sigaction oldact, newact;
2240 initialize_main (&argc, &argv);
2241 program_name = argv[0];
2242 setlocale (LC_ALL, "");
2243 bindtextdomain (PACKAGE, LOCALEDIR);
2244 textdomain (PACKAGE);
2248 exit_failure = SORT_FAILURE;
2249 atexit (close_stdout);
2251 hard_LC_COLLATE = hard_locale (LC_COLLATE);
2252 #if HAVE_NL_LANGINFO
2253 hard_LC_TIME = hard_locale (LC_TIME);
2257 /* Let's get locale's representation of the decimal point */
2259 struct lconv const *lconvp = localeconv ();
2261 /* If the locale doesn't define a decimal point, or if the decimal
2262 point is multibyte, use the C decimal point. We don't support
2263 multibyte decimal points yet. */
2264 decimal_point = *lconvp->decimal_point;
2265 if (! decimal_point || lconvp->decimal_point[1])
2266 decimal_point = C_DECIMAL_POINT;
2268 /* We don't support multibyte thousands separators yet. */
2269 th_sep = *lconvp->thousands_sep;
2270 if (! th_sep || lconvp->thousands_sep[1])
2271 th_sep = CHAR_MAX + 1;
2275 have_read_stdin = false;
2278 /* Change the way library functions fail. */
2279 exit_failure = SORT_FAILURE;
2284 sigemptyset (&caught_signals);
2285 for (i = 0; i < nsigs; i++)
2286 sigaddset (&caught_signals, sigs[i]);
2287 newact.sa_handler = sighandler;
2288 newact.sa_mask = caught_signals;
2289 newact.sa_flags = 0;
2295 for (i = 0; i < nsigs; i++)
2299 sigaction (sig, NULL, &oldact);
2300 if (oldact.sa_handler != SIG_IGN)
2301 sigaction (sig, &newact, NULL);
2303 if (signal (sig, SIG_IGN) != SIG_IGN)
2304 signal (sig, sighandler);
2309 gkey.sword = gkey.eword = SIZE_MAX;
2311 gkey.translate = NULL;
2312 gkey.numeric = gkey.general_numeric = gkey.month = gkey.reverse = false;
2313 gkey.skipsblanks = gkey.skipeblanks = false;
2315 files = xmalloc (sizeof (char *) * argc);
2319 /* Parse an operand as a file after "--" was seen; or if
2320 pedantic and a file was seen, unless the POSIX version
2321 predates 1003.1-2001 and -c was not seen and the operand is
2322 "-o FILE" or "-oFILE". */
2325 || (posixly_correct && nfiles != 0
2326 && ! (obsolete_usage
2329 && argv[optind][0] == '-' && argv[optind][1] == 'o'
2330 && (argv[optind][2] || optind + 1 != argc)))
2331 || ((c = getopt_long (argc, argv, short_options,
2332 long_options, NULL))
2337 files[nfiles++] = argv[optind++];
2343 if (obsolete_usage && optarg[0] == '+')
2345 /* Treat +POS1 [-POS2] as a key if possible; but silently
2346 treat an operand as a file if it is not a valid +POS1. */
2348 s = parse_field_count (optarg + 1, &key->sword, NULL);
2350 s = parse_field_count (s + 1, &key->schar, NULL);
2351 if (! (key->sword | key->schar))
2352 key->sword = SIZE_MAX;
2353 if (! s || *set_ordering (s, key, bl_start))
2360 if (optind != argc && argv[optind][0] == '-'
2361 && ISDIGIT (argv[optind][1]))
2363 char const *optarg1 = argv[optind++];
2364 s = parse_field_count (optarg1 + 1, &key->eword,
2365 N_("invalid number after `-'"));
2367 s = parse_field_count (s + 1, &key->echar,
2368 N_("invalid number after `.'"));
2369 if (*set_ordering (s, key, bl_end))
2370 badfieldspec (optarg1,
2371 N_("stray character in field spec"));
2377 files[nfiles++] = optarg;
2392 set_ordering (str, &gkey, bl_both);
2404 s = parse_field_count (optarg, &key->sword,
2405 N_("invalid number at field start"));
2408 /* Provoke with `sort -k0' */
2409 badfieldspec (optarg, N_("field number is zero"));
2413 s = parse_field_count (s + 1, &key->schar,
2414 N_("invalid number after `.'"));
2417 /* Provoke with `sort -k1.0' */
2418 badfieldspec (optarg, N_("character offset is zero"));
2421 if (! (key->sword | key->schar))
2422 key->sword = SIZE_MAX;
2423 s = set_ordering (s, key, bl_start);
2426 key->eword = SIZE_MAX;
2432 s = parse_field_count (s + 1, &key->eword,
2433 N_("invalid number after `,'"));
2436 /* Provoke with `sort -k1,0' */
2437 badfieldspec (optarg, N_("field number is zero"));
2440 s = parse_field_count (s + 1, &key->echar,
2441 N_("invalid number after `.'"));
2444 /* `-k 2,3' is equivalent to `+1 -3'. */
2447 s = set_ordering (s, key, bl_end);
2450 badfieldspec (optarg, N_("stray character in field spec"));
2459 if (outfile != minus && strcmp (outfile, optarg) != 0)
2460 error (SORT_FAILURE, 0, _("multiple output files specified"));
2469 specify_sort_size (optarg);
2474 int newtab = optarg[0];
2476 error (SORT_FAILURE, 0, _("empty tab"));
2479 if (strcmp (optarg, "\\0") == 0)
2483 /* Provoke with `sort -txx'. Complain about
2484 "multi-character tab" instead of "multibyte tab", so
2485 that the diagnostic's wording does not need to be
2486 changed once multibyte characters are supported. */
2487 error (SORT_FAILURE, 0, _("multi-character tab `%s'"),
2491 if (tab != TAB_DEFAULT && tab != newtab)
2492 error (SORT_FAILURE, 0, _("incompatible tabs"));
2498 add_temp_dir (optarg);
2506 /* Accept and ignore e.g. -y0 for compatibility with Solaris
2507 2.x through Solaris 7. -y is marked as obsolete starting
2515 case_GETOPT_HELP_CHAR;
2517 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2520 usage (SORT_FAILURE);
2524 /* Inheritance of global options to individual keys. */
2525 for (key = keylist; key; key = key->next)
2526 if (! (key->ignore || key->translate
2527 || (key->skipsblanks | key->reverse
2528 | key->skipeblanks | key->month | key->numeric
2529 | key->general_numeric)))
2531 key->ignore = gkey.ignore;
2532 key->translate = gkey.translate;
2533 key->skipsblanks = gkey.skipsblanks;
2534 key->skipeblanks = gkey.skipeblanks;
2535 key->month = gkey.month;
2536 key->numeric = gkey.numeric;
2537 key->general_numeric = gkey.general_numeric;
2538 key->reverse = gkey.reverse;
2541 if (!keylist && (gkey.ignore || gkey.translate
2542 || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
2543 | gkey.numeric | gkey.general_numeric)))
2545 reverse = gkey.reverse;
2547 if (temp_dir_count == 0)
2549 char const *tmp_dir = getenv ("TMPDIR");
2550 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
2562 error (SORT_FAILURE, 0, _("extra operand `%s' not allowed with -c"),
2565 /* POSIX requires that sort return 1 IFF invoked with -c and the
2566 input is not properly sorted. */
2567 exit (check (files[0]) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
2572 int max_merge = first_same_file (files, MIN (nfiles, NMERGE), outfile);
2573 merge (files, nfiles, max_merge, outfile);
2576 sort (files, nfiles, outfile);
2578 if (have_read_stdin && fclose (stdin) == EOF)
2579 die (_("close failed"), "-");
2581 exit (EXIT_SUCCESS);