1 /* sort - sort lines of text (with all kinds of options).
2 Copyright (C) 1988, 1991-2005 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 Written December 1988 by Mike Haertel.
19 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
20 or (US mail) as Mike Haertel c/o Free Software Foundation.
22 Ørn E. Hansen added NLS support in 1997. */
27 #include <sys/types.h>
31 #include "hard-locale.h"
38 #include "strnumcmp.h"
42 #if HAVE_SYS_RESOURCE_H
43 # include <sys/resource.h>
46 struct rlimit { size_t rlim_cur; };
47 # define getrlimit(Resource, Rlp) (-1)
50 /* The official name of this program (e.g., no `g' prefix). */
51 #define PROGRAM_NAME "sort"
53 #define AUTHORS "Mike Haertel", "Paul Eggert"
55 #if HAVE_LANGINFO_CODESET
56 # include <langinfo.h>
59 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
62 # define SA_NOCLDSTOP 0
63 # define sigprocmask(How, Set, Oset) /* empty */
65 # if ! HAVE_SIGINTERRUPT
66 # define siginterrupt(sig, flag) /* empty */
74 #define UCHAR_LIM (UCHAR_MAX + 1)
76 #ifndef DEFAULT_TMPDIR
77 # define DEFAULT_TMPDIR "/tmp"
83 /* POSIX says to exit with status 1 if invoked with -c and the
84 input is not properly sorted. */
85 SORT_OUT_OF_ORDER = 1,
87 /* POSIX says any other irregular exit must exit with a status
88 code greater than 1. */
92 /* The representation of the decimal point in the current locale. */
93 static int decimal_point;
95 /* Thousands separator; if -1, then there isn't one. */
96 static int thousands_sep;
98 /* Nonzero if the corresponding locales are hard. */
99 static bool hard_LC_COLLATE;
101 static bool hard_LC_TIME;
104 #define NONZERO(x) ((x) != 0)
106 /* The kind of blanks for '-b' to skip in various options. */
107 enum blanktype { bl_start, bl_end, bl_both };
109 /* The character marking end of line. Default to \n. */
110 static char eolchar = '\n';
112 /* Lines are held in core as counted strings. */
115 char *text; /* Text of the line. */
116 size_t length; /* Length including final newline. */
117 char *keybeg; /* Start of first key. */
118 char *keylim; /* Limit of first key. */
124 char *buf; /* Dynamically allocated buffer,
125 partitioned into 3 regions:
128 - an array of lines, in reverse order. */
129 size_t used; /* Number of bytes used for input data. */
130 size_t nlines; /* Number of lines in the line array. */
131 size_t alloc; /* Number of bytes allocated. */
132 size_t left; /* Number of bytes left from previous reads. */
133 size_t line_bytes; /* Number of bytes to reserve for each line. */
134 bool eof; /* An EOF has been read. */
139 size_t sword; /* Zero-origin 'word' to start at. */
140 size_t schar; /* Additional characters to skip. */
141 size_t eword; /* Zero-origin first word after field. */
142 size_t echar; /* Additional characters in field. */
143 bool const *ignore; /* Boolean array of characters to ignore. */
144 char const *translate; /* Translation applied to characters. */
145 bool skipsblanks; /* Skip leading blanks when finding start. */
146 bool skipeblanks; /* Skip leading blanks when finding end. */
147 bool numeric; /* Flag for numeric comparison. Handle
148 strings of digits with optional decimal
149 point, but no exponential notation. */
150 bool general_numeric; /* Flag for general, numeric comparison.
151 Handle numbers in exponential notation. */
152 bool month; /* Flag for comparison by month name. */
153 bool reverse; /* Reverse the sense of comparison. */
154 struct keyfield *next; /* Next keyfield to try. */
163 /* The name this program was run with. */
166 /* FIXME: None of these tables work with multibyte character sets.
167 Also, there are many other bugs when handling multibyte characters.
168 One way to fix this is to rewrite `sort' to use wide characters
169 internally, but doing this with good performance is a bit
172 /* Table of blanks. */
173 static bool blanks[UCHAR_LIM];
175 /* Table of non-printing characters. */
176 static bool nonprinting[UCHAR_LIM];
178 /* Table of non-dictionary characters (not letters, digits, or blanks). */
179 static bool nondictionary[UCHAR_LIM];
181 /* Translation table folding lower case to upper. */
182 static char fold_toupper[UCHAR_LIM];
184 #define MONTHS_PER_YEAR 12
186 /* Table mapping month names to integers.
187 Alphabetic order allows binary search. */
188 static struct month monthtab[] =
204 /* During the merge phase, the number of files to merge at once. */
207 /* Minimum size for a merge or check buffer. */
208 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
210 /* Minimum sort size; the code might not work with smaller sizes. */
211 #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
213 /* The number of bytes needed for a merge or check buffer, which can
214 function relatively efficiently even if it holds only one line. If
215 a longer line is seen, this value is increased. */
216 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
218 /* The approximate maximum number of bytes of main memory to use, as
219 specified by the user. Zero if the user has not specified a size. */
220 static size_t sort_size;
222 /* The guessed size for non-regular files. */
223 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
225 /* Array of directory names in which any temporary files are to be created. */
226 static char const **temp_dirs;
228 /* Number of temporary directory names used. */
229 static size_t temp_dir_count;
231 /* Number of allocated slots in temp_dirs. */
232 static size_t temp_dir_alloc;
234 /* Flag to reverse the order of all comparisons. */
237 /* Flag for stable sort. This turns off the last ditch bytewise
238 comparison of lines, and instead leaves lines in the same order
239 they were read if all keys compare equal. */
242 /* If TAB has this value, blanks separate fields. */
243 enum { TAB_DEFAULT = CHAR_MAX + 1 };
245 /* Tab character separating fields. If TAB_DEFAULT, then fields are
246 separated by the empty string between a non-blank character and a blank
248 static int tab = TAB_DEFAULT;
250 /* Flag to remove consecutive duplicate lines from the output.
251 Only the last of a sequence of equal lines will be output. */
254 /* Nonzero if any of the input files are the standard input. */
255 static bool have_read_stdin;
257 /* List of key field comparisons to be tried. */
258 static struct keyfield *keylist;
260 static void sortlines_temp (struct line *, size_t, struct line *);
262 /* Report MESSAGE for FILE, then clean up and exit.
263 If FILE is null, it represents standard output. */
265 static void die (char const *, char const *) ATTRIBUTE_NORETURN;
267 die (char const *message, char const *file)
269 error (0, errno, "%s: %s", message, file ? file : _("standard output"));
276 if (status != EXIT_SUCCESS)
277 fprintf (stderr, _("Try `%s --help' for more information.\n"),
282 Usage: %s [OPTION]... [FILE]...\n\
286 Write sorted concatenation of all FILE(s) to standard output.\n\
290 Mandatory arguments to long options are mandatory for short options too.\n\
297 -b, --ignore-leading-blanks ignore leading blanks\n\
298 -d, --dictionary-order consider only blanks and alphanumeric characters\n\
299 -f, --ignore-case fold lower case to upper case characters\n\
302 -g, --general-numeric-sort compare according to general numerical value\n\
303 -i, --ignore-nonprinting consider only printable characters\n\
304 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
305 -n, --numeric-sort compare according to string numerical value\n\
306 -r, --reverse reverse the result of comparisons\n\
312 -c, --check check whether input is sorted; do not sort\n\
313 -k, --key=POS1[,POS2] start a key at POS1, end it at POS2 (origin 1)\n\
314 -m, --merge merge already sorted files; do not sort\n\
315 -o, --output=FILE write result to FILE instead of standard output\n\
316 -s, --stable stabilize sort by disabling last-resort comparison\n\
317 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
320 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
321 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
322 multiple options specify multiple directories\n\
323 -u, --unique with -c, check for strict ordering;\n\
324 without -c, output only the first of an equal run\n\
327 -z, --zero-terminated end lines with 0 byte, not newline\n\
329 fputs (HELP_OPTION_DESCRIPTION, stdout);
330 fputs (VERSION_OPTION_DESCRIPTION, stdout);
333 POS is F[.C][OPTS], where F is the field number and C the character position\n\
334 in the field. OPTS is one or more single-letter ordering options, which\n\
335 override global ordering options for that key. If no key is given, use the\n\
336 entire line as the key.\n\
338 SIZE may be followed by the following multiplicative suffixes:\n\
341 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
343 With no FILE, or when FILE is -, read standard input.\n\
346 The locale specified by the environment affects sort order.\n\
347 Set LC_ALL=C to get the traditional sort order that uses\n\
348 native byte values.\n\
350 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
356 static char const short_options[] = "-bcdfgik:mMno:rsS:t:T:uy:z";
358 static struct option const long_options[] =
360 {"ignore-leading-blanks", no_argument, NULL, 'b'},
361 {"check", no_argument, NULL, 'c'},
362 {"dictionary-order", no_argument, NULL, 'd'},
363 {"ignore-case", no_argument, NULL, 'f'},
364 {"general-numeric-sort", no_argument, NULL, 'g'},
365 {"ignore-nonprinting", no_argument, NULL, 'i'},
366 {"key", required_argument, NULL, 'k'},
367 {"merge", no_argument, NULL, 'm'},
368 {"month-sort", no_argument, NULL, 'M'},
369 {"numeric-sort", no_argument, NULL, 'n'},
370 {"output", required_argument, NULL, 'o'},
371 {"reverse", no_argument, NULL, 'r'},
372 {"stable", no_argument, NULL, 's'},
373 {"buffer-size", required_argument, NULL, 'S'},
374 {"field-separator", required_argument, NULL, 't'},
375 {"temporary-directory", required_argument, NULL, 'T'},
376 {"unique", no_argument, NULL, 'u'},
377 {"zero-terminated", no_argument, NULL, 'z'},
378 {GETOPT_HELP_OPTION_DECL},
379 {GETOPT_VERSION_OPTION_DECL},
383 /* The set of signals that are caught. */
384 static sigset_t caught_signals;
386 /* The list of temporary files. */
389 struct tempnode *volatile next;
390 char name[1]; /* Actual size is 1 + file name length. */
392 static struct tempnode *volatile temphead;
393 static struct tempnode *volatile *temptail = &temphead;
395 /* Clean up any remaining temporary files. */
400 struct tempnode const *node;
402 for (node = temphead; node; node = node->next)
406 /* Create a new temporary file, returning its newly allocated name.
407 Store into *PFP a stream open for writing. */
410 create_temp_file (FILE **pfp)
412 static char const slashbase[] = "/sortXXXXXX";
413 static size_t temp_dir_index;
417 char const *temp_dir = temp_dirs[temp_dir_index];
418 size_t len = strlen (temp_dir);
419 struct tempnode *node =
420 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
421 char *file = node->name;
423 memcpy (file, temp_dir, len);
424 memcpy (file + len, slashbase, sizeof slashbase);
426 if (++temp_dir_index == temp_dir_count)
429 /* Create the temporary file in a critical section, to avoid races. */
430 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
435 temptail = &node->next;
438 sigprocmask (SIG_SETMASK, &oldset, NULL);
441 if (fd < 0 || (*pfp = fdopen (fd, "w")) == NULL)
442 die (_("cannot create temporary file"), file);
447 /* Return a stream for FILE, opened with mode HOW. A null FILE means
448 standard output; HOW should be "w". When opening for input, "-"
449 means standard input. To avoid confusion, do not return file
450 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
451 opening an ordinary FILE. */
454 xfopen (const char *file, const char *how)
460 else if (STREQ (file, "-") && *how == 'r')
462 have_read_stdin = true;
467 fp = fopen (file, how);
469 die (_("open failed"), file);
475 /* Close FP, whose name is FILE, and report any errors. */
478 xfclose (FILE *fp, char const *file)
483 /* Allow reading stdin from tty more than once. */
489 /* Don't close stdout just yet. close_stdout does that. */
490 if (fflush (fp) != 0)
491 die (_("fflush failed"), file);
495 if (fclose (fp) != 0)
496 die (_("close failed"), file);
502 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
504 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
505 die (_("write failed"), output_file);
508 /* Append DIR to the array of temporary directory names. */
510 add_temp_dir (char const *dir)
512 if (temp_dir_count == temp_dir_alloc)
513 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
515 temp_dirs[temp_dir_count++] = dir;
518 /* Remove NAME from the list of temporary files. */
521 zaptemp (const char *name)
523 struct tempnode *volatile *pnode;
524 struct tempnode *node;
525 struct tempnode *next;
528 int unlink_errno = 0;
530 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
533 /* Unlink the temporary file in a critical section to avoid races. */
535 sigprocmask (SIG_BLOCK, &caught_signals, &oldset);
536 unlink_status = unlink (name);
537 unlink_errno = errno;
539 sigprocmask (SIG_SETMASK, &oldset, NULL);
541 if (unlink_status != 0)
542 error (0, unlink_errno, _("warning: cannot remove: %s"), name);
551 struct_month_cmp (const void *m1, const void *m2)
553 struct month const *month1 = m1;
554 struct month const *month2 = m2;
555 return strcmp (month1->name, month2->name);
560 /* Initialize the character class tables. */
567 for (i = 0; i < UCHAR_LIM; ++i)
569 blanks[i] = !!ISBLANK (i);
570 nonprinting[i] = !ISPRINT (i);
571 nondictionary[i] = !ISALNUM (i) && !ISBLANK (i);
572 fold_toupper[i] = (ISLOWER (i) ? toupper (i) : i);
576 /* If we're not in the "C" locale, read different names for months. */
579 for (i = 0; i < MONTHS_PER_YEAR; i++)
586 s = (char *) nl_langinfo (ABMON_1 + i);
588 monthtab[i].name = name = xmalloc (s_len + 1);
589 monthtab[i].val = i + 1;
591 for (j = 0; j < s_len; j++)
592 name[j] = fold_toupper[to_uchar (s[j])];
595 qsort ((void *) monthtab, MONTHS_PER_YEAR,
596 sizeof *monthtab, struct_month_cmp);
601 /* Specify the amount of main memory to use when sorting. */
603 specify_sort_size (char const *s)
607 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
609 /* The default unit is KiB. */
610 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
612 if (n <= UINTMAX_MAX / 1024)
615 e = LONGINT_OVERFLOW;
618 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
619 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
628 double mem = physmem_total () * n / 100;
630 /* Use "<", not "<=", to avoid problems with rounding. */
631 if (mem < UINTMAX_MAX)
637 e = LONGINT_OVERFLOW;
644 /* If multiple sort sizes are specified, take the maximum, so
645 that option order does not matter. */
652 sort_size = MAX (sort_size, MIN_SORT_SIZE);
656 e = LONGINT_OVERFLOW;
659 STRTOL_FATAL_ERROR (s, _("sort size"), e);
662 /* Return the default sort size. */
664 default_sort_size (void)
666 /* Let MEM be available memory or 1/8 of total memory, whichever
668 double avail = physmem_available ();
669 double total = physmem_total ();
670 double mem = MAX (avail, total / 8);
671 struct rlimit rlimit;
673 /* Let SIZE be MEM, but no more than the maximum object size or
674 system resource limits. Avoid the MIN macro here, as it is not
675 quite right when only one argument is floating point. Don't
676 bother to check for values like RLIM_INFINITY since in practice
677 they are not much less than SIZE_MAX. */
678 size_t size = SIZE_MAX;
681 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
682 size = rlimit.rlim_cur;
684 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
685 size = rlimit.rlim_cur;
688 /* Leave a large safety margin for the above limits, as failure can
689 occur when they are exceeded. */
693 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
694 Exceeding RSS is not fatal, but can be quite slow. */
695 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
696 size = rlimit.rlim_cur / 16 * 15;
699 /* Use no less than the minimum. */
700 return MAX (size, MIN_SORT_SIZE);
703 /* Return the sort buffer size to use with the input files identified
704 by FPS and FILES, which are alternate names of the same files.
705 NFILES gives the number of input files; NFPS may be less. Assume
706 that each input line requires LINE_BYTES extra bytes' worth of line
707 information. Do not exceed the size bound specified by the user
708 (or a default size bound, if the user does not specify one). */
711 sort_buffer_size (FILE *const *fps, size_t nfps,
712 char *const *files, size_t nfiles,
715 /* A bound on the input size. If zero, the bound hasn't been
717 static size_t size_bound;
719 /* In the worst case, each input byte is a newline. */
720 size_t worst_case_per_input_byte = line_bytes + 1;
722 /* Keep enough room for one extra input line and an extra byte.
723 This extra room might be needed when preparing to read EOF. */
724 size_t size = worst_case_per_input_byte + 1;
728 for (i = 0; i < nfiles; i++)
734 if ((i < nfps ? fstat (fileno (fps[i]), &st)
735 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
736 : stat (files[i], &st))
738 die (_("stat failed"), files[i]);
740 if (S_ISREG (st.st_mode))
741 file_size = st.st_size;
744 /* The file has unknown size. If the user specified a sort
745 buffer size, use that; otherwise, guess the size. */
748 file_size = INPUT_FILE_SIZE_GUESS;
753 size_bound = sort_size;
755 size_bound = default_sort_size ();
758 /* Add the amount of memory needed to represent the worst case
759 where the input consists entirely of newlines followed by a
760 single non-newline. Check for overflow. */
761 worst_case = file_size * worst_case_per_input_byte + 1;
762 if (file_size != worst_case / worst_case_per_input_byte
763 || size_bound - size <= worst_case)
771 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
772 must be at least sizeof (struct line). Allocate ALLOC bytes
776 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
778 /* Ensure that the line array is properly aligned. If the desired
779 size cannot be allocated, repeatedly halve it until allocation
780 succeeds. The smaller allocation may hurt overall performance,
781 but that's better than failing. */
784 alloc += sizeof (struct line) - alloc % sizeof (struct line);
785 buf->buf = malloc (alloc);
789 if (alloc <= line_bytes + 1)
793 buf->line_bytes = line_bytes;
795 buf->used = buf->left = buf->nlines = 0;
799 /* Return one past the limit of the line array. */
801 static inline struct line *
802 buffer_linelim (struct buffer const *buf)
804 return (struct line *) (buf->buf + buf->alloc);
807 /* Return a pointer to the first character of the field specified
811 begfield (const struct line *line, const struct keyfield *key)
813 char *ptr = line->text, *lim = ptr + line->length - 1;
814 size_t sword = key->sword;
815 size_t schar = key->schar;
816 size_t remaining_bytes;
818 /* The leading field separator itself is included in a field when -t
821 if (tab != TAB_DEFAULT)
822 while (ptr < lim && sword--)
824 while (ptr < lim && *ptr != tab)
830 while (ptr < lim && sword--)
832 while (ptr < lim && blanks[to_uchar (*ptr)])
834 while (ptr < lim && !blanks[to_uchar (*ptr)])
838 if (key->skipsblanks)
839 while (ptr < lim && blanks[to_uchar (*ptr)])
842 /* Advance PTR by SCHAR (if possible), but no further than LIM. */
843 remaining_bytes = lim - ptr;
844 if (schar < remaining_bytes)
852 /* Return the limit of (a pointer to the first character after) the field
853 in LINE specified by KEY. */
856 limfield (const struct line *line, const struct keyfield *key)
858 char *ptr = line->text, *lim = ptr + line->length - 1;
859 size_t eword = key->eword, echar = key->echar;
860 size_t remaining_bytes;
862 /* Move PTR past EWORD fields or to one past the last byte on LINE,
863 whichever comes first. If there are more than EWORD fields, leave
864 PTR pointing at the beginning of the field having zero-based index,
865 EWORD. If a delimiter character was specified (via -t), then that
866 `beginning' is the first character following the delimiting TAB.
867 Otherwise, leave PTR pointing at the first `blank' character after
868 the preceding field. */
869 if (tab != TAB_DEFAULT)
870 while (ptr < lim && eword--)
872 while (ptr < lim && *ptr != tab)
874 if (ptr < lim && (eword | echar))
878 while (ptr < lim && eword--)
880 while (ptr < lim && blanks[to_uchar (*ptr)])
882 while (ptr < lim && !blanks[to_uchar (*ptr)])
886 #ifdef POSIX_UNSPECIFIED
887 /* The following block of code makes GNU sort incompatible with
888 standard Unix sort, so it's ifdef'd out for now.
889 The POSIX spec isn't clear on how to interpret this.
890 FIXME: request clarification.
892 From: kwzh@gnu.ai.mit.edu (Karl Heuer)
893 Date: Thu, 30 May 96 12:20:41 -0400
894 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
896 [...]I believe I've found another bug in `sort'.
901 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
904 $ /bin/sort -k1.7,1.7 </tmp/sort.in
908 Unix sort produced the answer I expected: sort on the single character
909 in column 7. GNU sort produced different results, because it disagrees
910 on the interpretation of the key-end spec "M.N". Unix sort reads this
911 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
912 "skip M-1 fields, then either N-1 characters or the rest of the current
913 field, whichever comes first". This extra clause applies only to
914 key-ends, not key-starts.
917 /* Make LIM point to the end of (one byte past) the current field. */
918 if (tab != TAB_DEFAULT)
921 newlim = memchr (ptr, tab, lim - ptr);
929 while (newlim < lim && blanks[to_uchar (*newlim)])
931 while (newlim < lim && !blanks[to_uchar (*newlim)])
937 /* If we're ignoring leading blanks when computing the End
938 of the field, don't start counting bytes until after skipping
939 past any leading blanks. */
940 if (key->skipeblanks)
941 while (ptr < lim && blanks[to_uchar (*ptr)])
944 /* Advance PTR by ECHAR (if possible), but no further than LIM. */
945 remaining_bytes = lim - ptr;
946 if (echar < remaining_bytes)
954 /* Fill BUF reading from FP, moving buf->left bytes from the end
955 of buf->buf to the beginning first. If EOF is reached and the
956 file wasn't terminated by a newline, supply one. Set up BUF's line
957 table too. FILE is the name of the file corresponding to FP.
958 Return true if some input was read. */
961 fillbuf (struct buffer *buf, FILE *fp, char const *file)
963 struct keyfield const *key = keylist;
965 size_t line_bytes = buf->line_bytes;
966 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
971 if (buf->used != buf->left)
973 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
974 buf->used = buf->left;
980 char *ptr = buf->buf + buf->used;
981 struct line *linelim = buffer_linelim (buf);
982 struct line *line = linelim - buf->nlines;
983 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
984 char *line_start = buf->nlines ? line->text + line->length : buf->buf;
986 while (line_bytes + 1 < avail)
988 /* Read as many bytes as possible, but do not read so many
989 bytes that there might not be enough room for the
990 corresponding line array. The worst case is when the
991 rest of the input file consists entirely of newlines,
992 except that the last byte is not a newline. */
993 size_t readsize = (avail - 1) / (line_bytes + 1);
994 size_t bytes_read = fread (ptr, 1, readsize, fp);
995 char *ptrlim = ptr + bytes_read;
999 if (bytes_read != readsize)
1002 die (_("read failed"), file);
1006 if (buf->buf == ptrlim)
1008 if (ptrlim[-1] != eol)
1013 /* Find and record each line in the just-read input. */
1014 while ((p = memchr (ptr, eol, ptrlim - ptr)))
1018 line->text = line_start;
1019 line->length = ptr - line_start;
1020 mergesize = MAX (mergesize, line->length);
1021 avail -= line_bytes;
1025 /* Precompute the position of the first key for
1027 line->keylim = (key->eword == SIZE_MAX
1029 : limfield (line, key));
1031 if (key->sword != SIZE_MAX)
1032 line->keybeg = begfield (line, key);
1035 if (key->skipsblanks)
1036 while (blanks[to_uchar (*line_start)])
1038 line->keybeg = line_start;
1050 buf->used = ptr - buf->buf;
1051 buf->nlines = buffer_linelim (buf) - line;
1052 if (buf->nlines != 0)
1054 buf->left = ptr - line_start;
1055 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1059 /* The current input line is too long to fit in the buffer.
1060 Double the buffer size and try again. */
1061 buf->buf = X2REALLOC (buf->buf, &buf->alloc);
1065 /* Compare strings A and B as numbers without explicitly converting them to
1066 machine numbers. Comparatively slow for short strings, but asymptotically
1070 numcompare (const char *a, const char *b)
1072 while (blanks[to_uchar (*a)])
1074 while (blanks[to_uchar (*b)])
1077 return strnumcmp (a, b, decimal_point, thousands_sep);
1081 general_numcompare (const char *sa, const char *sb)
1083 /* FIXME: add option to warn about failed conversions. */
1084 /* FIXME: maybe add option to try expensive FP conversion
1085 only if A and B can't be compared more cheaply/accurately. */
1089 double a = strtod (sa, &ea);
1090 double b = strtod (sb, &eb);
1092 /* Put conversion errors at the start of the collating sequence. */
1094 return sb == eb ? 0 : -1;
1098 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1099 conversion errors but before numbers; sort them by internal
1100 bit-pattern, for lack of a more portable alternative. */
1106 : memcmp ((char *) &a, (char *) &b, sizeof a));
1109 /* Return an integer in 1..12 of the month name MONTH with length LEN.
1110 Return 0 if the name in S is not recognized. */
1113 getmonth (char const *month, size_t len)
1116 size_t hi = MONTHS_PER_YEAR;
1117 char const *monthlim = month + len;
1121 if (month == monthlim)
1123 if (!blanks[to_uchar (*month)])
1130 size_t ix = (lo + hi) / 2;
1131 char const *m = month;
1132 char const *n = monthtab[ix].name;
1137 return monthtab[ix].val;
1138 if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1143 else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1155 /* Compare two lines A and B trying every key in sequence until there
1156 are no more keys or a difference is found. */
1159 keycompare (const struct line *a, const struct line *b)
1161 struct keyfield const *key = keylist;
1163 /* For the first iteration only, the key positions have been
1164 precomputed for us. */
1165 char *texta = a->keybeg;
1166 char *textb = b->keybeg;
1167 char *lima = a->keylim;
1168 char *limb = b->keylim;
1174 char const *translate = key->translate;
1175 bool const *ignore = key->ignore;
1177 /* Find the lengths. */
1178 size_t lena = lima <= texta ? 0 : lima - texta;
1179 size_t lenb = limb <= textb ? 0 : limb - textb;
1181 /* Actually compare the fields. */
1182 if (key->numeric | key->general_numeric)
1184 char savea = *lima, saveb = *limb;
1186 *lima = *limb = '\0';
1187 diff = ((key->numeric ? numcompare : general_numcompare)
1189 *lima = savea, *limb = saveb;
1191 else if (key->month)
1192 diff = getmonth (texta, lena) - getmonth (textb, lenb);
1193 /* Sorting like this may become slow, so in a simple locale the user
1194 can select a faster sort that is similar to ascii sort. */
1195 else if (hard_LC_COLLATE)
1197 if (ignore || translate)
1200 size_t size = lena + 1 + lenb + 1;
1201 char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1202 char *copy_b = copy_a + lena + 1;
1203 size_t new_len_a, new_len_b, i;
1205 /* Ignore and/or translate chars before comparing. */
1206 for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1210 copy_a[new_len_a] = (translate
1211 ? translate[to_uchar (texta[i])]
1213 if (!ignore || !ignore[to_uchar (texta[i])])
1218 copy_b[new_len_b] = (translate
1219 ? translate[to_uchar (textb[i])]
1221 if (!ignore || !ignore[to_uchar (textb[i])])
1226 diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1228 if (sizeof buf < size)
1232 diff = - NONZERO (lenb);
1236 diff = xmemcoll (texta, lena, textb, lenb);
1240 #define CMP_WITH_IGNORE(A, B) \
1245 while (texta < lima && ignore[to_uchar (*texta)]) \
1247 while (textb < limb && ignore[to_uchar (*textb)]) \
1249 if (! (texta < lima && textb < limb)) \
1251 diff = to_uchar (A) - to_uchar (B); \
1258 diff = (texta < lima) - (textb < limb); \
1263 CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1264 translate[to_uchar (*textb)]);
1266 CMP_WITH_IGNORE (*texta, *textb);
1269 diff = - NONZERO (lenb);
1276 while (texta < lima && textb < limb)
1278 diff = (to_uchar (translate[to_uchar (*texta++)])
1279 - to_uchar (translate[to_uchar (*textb++)]));
1286 diff = memcmp (texta, textb, MIN (lena, lenb));
1290 diff = lena < lenb ? -1 : lena != lenb;
1300 /* Find the beginning and limit of the next field. */
1301 if (key->eword != SIZE_MAX)
1302 lima = limfield (a, key), limb = limfield (b, key);
1304 lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1306 if (key->sword != SIZE_MAX)
1307 texta = begfield (a, key), textb = begfield (b, key);
1310 texta = a->text, textb = b->text;
1311 if (key->skipsblanks)
1313 while (texta < lima && blanks[to_uchar (*texta)])
1315 while (textb < limb && blanks[to_uchar (*textb)])
1326 return key->reverse ? -diff : diff;
1329 /* Compare two lines A and B, returning negative, zero, or positive
1330 depending on whether A compares less than, equal to, or greater than B. */
1333 compare (const struct line *a, const struct line *b)
1338 /* First try to compare on the specified keys (if any).
1339 The only two cases with no key at all are unadorned sort,
1340 and unadorned sort -r. */
1343 diff = keycompare (a, b);
1344 if (diff | unique | stable)
1348 /* If the keys all compare equal (or no keys were specified)
1349 fall through to the default comparison. */
1350 alen = a->length - 1, blen = b->length - 1;
1353 diff = - NONZERO (blen);
1356 else if (hard_LC_COLLATE)
1357 diff = xmemcoll (a->text, alen, b->text, blen);
1358 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1359 diff = alen < blen ? -1 : alen != blen;
1361 return reverse ? -diff : diff;
1364 /* Check that the lines read from FILE_NAME come in order. Print a
1365 diagnostic (FILE_NAME, line number, contents of line) to stderr and return
1366 false if they are not in order. Otherwise, print no diagnostic
1370 check (char const *file_name)
1372 FILE *fp = xfopen (file_name, "r");
1373 struct buffer buf; /* Input buffer. */
1374 struct line temp; /* Copy of previous line. */
1376 uintmax_t line_number = 0;
1377 struct keyfield const *key = keylist;
1378 bool nonunique = ! unique;
1379 bool ordered = true;
1381 initbuf (&buf, sizeof (struct line),
1382 MAX (merge_buffer_size, sort_size));
1385 while (fillbuf (&buf, fp, file_name))
1387 struct line const *line = buffer_linelim (&buf);
1388 struct line const *linebase = line - buf.nlines;
1390 /* Make sure the line saved from the old buffer contents is
1391 less than or equal to the first line of the new buffer. */
1392 if (alloc && nonunique <= compare (&temp, line - 1))
1396 struct line const *disorder_line = line - 1;
1397 uintmax_t disorder_line_number =
1398 buffer_linelim (&buf) - disorder_line + line_number;
1399 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1400 fprintf (stderr, _("%s: %s:%s: disorder: "),
1401 program_name, file_name,
1402 umaxtostr (disorder_line_number, hr_buf));
1403 write_bytes (disorder_line->text, disorder_line->length, stderr,
1404 _("standard error"));
1410 /* Compare each line in the buffer with its successor. */
1411 while (linebase < --line)
1412 if (nonunique <= compare (line, line - 1))
1413 goto found_disorder;
1415 line_number += buf.nlines;
1417 /* Save the last line of the buffer. */
1418 if (alloc < line->length)
1425 alloc = line->length;
1429 while (alloc < line->length);
1431 temp.text = xrealloc (temp.text, alloc);
1433 memcpy (temp.text, line->text, line->length);
1434 temp.length = line->length;
1437 temp.keybeg = temp.text + (line->keybeg - line->text);
1438 temp.keylim = temp.text + (line->keylim - line->text);
1442 xfclose (fp, file_name);
1448 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
1449 files (all of which are at the start of the FILES array), and
1450 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
1451 Close input and output files before returning.
1452 OUTPUT_FILE gives the name of the output file. If it is NULL,
1453 the output file is standard output. If OFP is NULL, the output
1454 file has not been opened yet (or written to, if standard output). */
1457 mergefps (char **files, size_t ntemps, size_t nfiles,
1458 FILE *ofp, char const *output_file)
1460 FILE *fps[NMERGE]; /* Input streams for each file. */
1461 struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1462 struct line saved; /* Saved line storage for unique check. */
1463 struct line const *savedline = NULL;
1464 /* &saved if there is a saved line. */
1465 size_t savealloc = 0; /* Size allocated for the saved line. */
1466 struct line const *cur[NMERGE]; /* Current line in each line table. */
1467 struct line const *base[NMERGE]; /* Base of each line table. */
1468 size_t ord[NMERGE]; /* Table representing a permutation of fps,
1469 such that cur[ord[0]] is the smallest line
1470 and will be next output. */
1474 struct keyfield const *key = keylist;
1477 /* Read initial lines from each input file. */
1478 for (i = 0; i < nfiles; )
1480 fps[i] = xfopen (files[i], "r");
1481 initbuf (&buffer[i], sizeof (struct line),
1482 MAX (merge_buffer_size, sort_size / nfiles));
1483 if (fillbuf (&buffer[i], fps[i], files[i]))
1485 struct line const *linelim = buffer_linelim (&buffer[i]);
1486 cur[i] = linelim - 1;
1487 base[i] = linelim - buffer[i].nlines;
1492 /* fps[i] is empty; eliminate it from future consideration. */
1493 xfclose (fps[i], files[i]);
1499 free (buffer[i].buf);
1501 for (j = i; j < nfiles; ++j)
1502 files[j] = files[j + 1];
1507 ofp = xfopen (output_file, "w");
1509 /* Set up the ord table according to comparisons among input lines.
1510 Since this only reorders two items if one is strictly greater than
1511 the other, it is stable. */
1512 for (i = 0; i < nfiles; ++i)
1514 for (i = 1; i < nfiles; ++i)
1515 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
1516 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1518 /* Repeatedly output the smallest line until no input remains. */
1521 struct line const *smallest = cur[ord[0]];
1523 /* If uniquified output is turned on, output only the first of
1524 an identical series of lines. */
1527 if (savedline && compare (savedline, smallest))
1530 write_bytes (saved.text, saved.length, ofp, output_file);
1535 if (savealloc < smallest->length)
1540 savealloc = smallest->length;
1543 while ((savealloc *= 2) < smallest->length);
1545 saved.text = xrealloc (saved.text, savealloc);
1547 saved.length = smallest->length;
1548 memcpy (saved.text, smallest->text, saved.length);
1552 saved.text + (smallest->keybeg - smallest->text);
1554 saved.text + (smallest->keylim - smallest->text);
1559 write_bytes (smallest->text, smallest->length, ofp, output_file);
1561 /* Check if we need to read more lines into core. */
1562 if (base[ord[0]] < smallest)
1563 cur[ord[0]] = smallest - 1;
1566 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]]))
1568 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
1569 cur[ord[0]] = linelim - 1;
1570 base[ord[0]] = linelim - buffer[ord[0]].nlines;
1574 /* We reached EOF on fps[ord[0]]. */
1575 for (i = 1; i < nfiles; ++i)
1576 if (ord[i] > ord[0])
1579 xfclose (fps[ord[0]], files[ord[0]]);
1580 if (ord[0] < ntemps)
1583 zaptemp (files[ord[0]]);
1585 free (buffer[ord[0]].buf);
1586 for (i = ord[0]; i < nfiles; ++i)
1588 fps[i] = fps[i + 1];
1589 files[i] = files[i + 1];
1590 buffer[i] = buffer[i + 1];
1591 cur[i] = cur[i + 1];
1592 base[i] = base[i + 1];
1594 for (i = 0; i < nfiles; ++i)
1595 ord[i] = ord[i + 1];
1600 /* The new line just read in may be larger than other lines
1601 already in main memory; push it back in the queue until we
1602 encounter a line larger than it. Optimize for the common
1603 case where the new line is smallest. */
1608 size_t ord0 = ord[0];
1609 size_t count_of_smaller_lines;
1613 int cmp = compare (cur[ord0], cur[ord[probe]]);
1614 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
1618 probe = (lo + hi) / 2;
1621 count_of_smaller_lines = lo - 1;
1622 for (j = 0; j < count_of_smaller_lines; j++)
1623 ord[j] = ord[j + 1];
1624 ord[count_of_smaller_lines] = ord0;
1628 if (unique && savedline)
1630 write_bytes (saved.text, saved.length, ofp, output_file);
1634 xfclose (ofp, output_file);
1637 /* Merge into T the two sorted arrays of lines LO (with NLO members)
1638 and HI (with NHI members). T, LO, and HI point just past their
1639 respective arrays, and the arrays are in reverse order. NLO and
1640 NHI must be positive, and HI - NHI must equal T - (NLO + NHI). */
1643 mergelines (struct line *t,
1644 struct line const *lo, size_t nlo,
1645 struct line const *hi, size_t nhi)
1648 if (compare (lo - 1, hi - 1) <= 0)
1653 /* HI - NHI equalled T - (NLO + NHI) when this function
1654 began. Therefore HI must equal T now, and there is no
1655 need to copy from HI to T. */
1673 /* Sort the array LINES with NLINES members, using TEMP for temporary space.
1674 NLINES must be at least 2.
1675 The input and output arrays are in reverse order, and LINES and
1676 TEMP point just past the end of their respective arrays.
1678 Use a recursive divide-and-conquer algorithm, in the style
1679 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
1680 the optimization suggested by exercise 5.2.4-10; this requires room
1681 for only 1.5*N lines, rather than the usual 2*N lines. Knuth
1682 writes that this memory optimization was originally published by
1683 D. A. Bell, Comp J. 1 (1958), 75. */
1686 sortlines (struct line *lines, size_t nlines, struct line *temp)
1690 if (0 < compare (&lines[-1], &lines[-2]))
1692 struct line tmp = lines[-1];
1693 lines[-1] = lines[-2];
1699 size_t nlo = nlines / 2;
1700 size_t nhi = nlines - nlo;
1701 struct line *lo = lines;
1702 struct line *hi = lines - nlo;
1703 struct line *sorted_lo = temp;
1705 sortlines (hi, nhi, temp);
1707 sortlines_temp (lo, nlo, sorted_lo);
1709 sorted_lo[-1] = lo[-1];
1711 mergelines (lines, sorted_lo, nlo, hi, nhi);
1715 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
1716 rather than sorting in place. */
1719 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
1723 bool swap = (0 < compare (&lines[-1], &lines[-2]));
1724 temp[-1] = lines[-1 - swap];
1725 temp[-2] = lines[-2 + swap];
1729 size_t nlo = nlines / 2;
1730 size_t nhi = nlines - nlo;
1731 struct line *lo = lines;
1732 struct line *hi = lines - nlo;
1733 struct line *sorted_hi = temp - nlo;
1735 sortlines_temp (hi, nhi, sorted_hi);
1737 sortlines (lo, nlo, temp);
1739 mergelines (temp, lo, nlo, sorted_hi, nhi);
1743 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
1744 the same as OUTFILE. If found, merge the found instances (and perhaps
1745 some other files) into a temporary file so that it can in turn be
1746 merged into OUTFILE without destroying OUTFILE before it is completely
1747 read. Return the new value of NFILES, which differs from the old if
1748 some merging occurred.
1750 This test ensures that an otherwise-erroneous use like
1751 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
1752 It's not clear that POSIX requires this nicety.
1753 Detect common error cases, but don't try to catch obscure cases like
1754 "cat ... FILE ... | sort -m -o FILE"
1755 where traditional "sort" doesn't copy the input and where
1756 people should know that they're getting into trouble anyway.
1757 Catching these obscure cases would slow down performance in
1761 avoid_trashing_input (char **files, size_t ntemps, size_t nfiles,
1762 char const *outfile)
1765 bool got_outstat = false;
1766 struct stat outstat;
1768 for (i = ntemps; i < nfiles; i++)
1770 bool is_stdin = STREQ (files[i], "-");
1774 if (outfile && STREQ (outfile, files[i]) && !is_stdin)
1781 ? stat (outfile, &outstat)
1782 : fstat (STDOUT_FILENO, &outstat))
1789 ? fstat (STDIN_FILENO, &instat)
1790 : stat (files[i], &instat))
1792 && SAME_INODE (instat, outstat));
1798 char *temp = create_temp_file (&tftp);
1799 mergefps (&files[i], 0, nfiles - i, tftp, temp);
1808 /* Merge the input FILES. NTEMPS is the number of files at the
1809 start of FILES that are temporary; it is zero at the top level.
1810 NFILES is the total number of files. Put the output in
1811 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
1814 merge (char **files, size_t ntemps, size_t nfiles, char const *output_file)
1816 while (NMERGE < nfiles)
1818 /* Number of input files processed so far. */
1821 /* Number of output files generated so far. */
1824 /* nfiles % NMERGE; this counts input files that are left over
1825 after all full-sized merges have been done. */
1828 /* Number of easily-available slots at the next loop iteration. */
1831 /* Do as many NMERGE-size merges as possible. */
1832 for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
1835 char *temp = create_temp_file (&tfp);
1836 size_t nt = MIN (ntemps, NMERGE);
1838 mergefps (&files[in], nt, NMERGE, tfp, temp);
1842 remainder = nfiles - in;
1843 cheap_slots = NMERGE - out % NMERGE;
1845 if (cheap_slots < remainder)
1847 /* So many files remain that they can't all be put into the last
1848 NMERGE-sized output window. Do one more merge. Merge as few
1849 files as possible, to avoid needless I/O. */
1850 size_t nshortmerge = remainder - cheap_slots + 1;
1852 char *temp = create_temp_file (&tfp);
1853 size_t nt = MIN (ntemps, nshortmerge);
1855 mergefps (&files[in], nt, nshortmerge, tfp, temp);
1856 files[out++] = temp;
1860 /* Put the remaining input files into the last NMERGE-sized output
1861 window, so they will be merged in the next pass. */
1862 memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
1867 nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
1868 mergefps (files, ntemps, nfiles, NULL, output_file);
1871 /* Sort NFILES FILES onto OUTPUT_FILE. */
1874 sort (char * const *files, size_t nfiles, char const *output_file)
1878 bool output_file_created = false;
1884 char const *temp_output;
1885 char const *file = *files;
1886 FILE *fp = xfopen (file, "r");
1888 size_t bytes_per_line = (2 * sizeof (struct line)
1889 - sizeof (struct line) / 2);
1892 initbuf (&buf, bytes_per_line,
1893 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
1898 while (fillbuf (&buf, fp, file))
1901 struct line *linebase;
1903 if (buf.eof && nfiles
1904 && (bytes_per_line + 1
1905 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
1907 /* End of file, but there is more input and buffer room.
1908 Concatenate the next input file; this is faster in
1910 buf.left = buf.used;
1914 line = buffer_linelim (&buf);
1915 linebase = line - buf.nlines;
1917 sortlines (line, buf.nlines, linebase);
1918 if (buf.eof && !nfiles && !ntemps && !buf.left)
1921 tfp = xfopen (output_file, "w");
1922 temp_output = output_file;
1923 output_file_created = true;
1928 temp_output = create_temp_file (&tfp);
1934 write_bytes (line->text, line->length, tfp, temp_output);
1936 while (linebase < line && compare (line, line - 1) == 0)
1939 while (linebase < line);
1941 xfclose (tfp, temp_output);
1943 if (output_file_created)
1952 if (! output_file_created)
1955 struct tempnode *node = temphead;
1956 char **tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
1957 for (i = 0; node; i++)
1959 tempfiles[i] = node->name;
1962 merge (tempfiles, ntemps, ntemps, output_file);
1967 /* Insert key KEY at the end of the key list. */
1970 insertkey (struct keyfield *key)
1972 struct keyfield **p;
1974 for (p = &keylist; *p; p = &(*p)->next)
1980 /* Report a bad field specification SPEC, with extra info MSGID. */
1982 static void badfieldspec (char const *, char const *)
1985 badfieldspec (char const *spec, char const *msgid)
1987 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
1988 _(msgid), quote (spec));
1992 /* Parse the leading integer in STRING and store the resulting value
1993 (which must fit into size_t) into *VAL. Return the address of the
1994 suffix after the integer. If MSGID is NULL, return NULL after
1995 failure; otherwise, report MSGID and exit on failure. */
1998 parse_field_count (char const *string, size_t *val, char const *msgid)
2003 switch (xstrtoumax (string, &suffix, 10, &n, ""))
2006 case LONGINT_INVALID_SUFFIX_CHAR:
2011 case LONGINT_OVERFLOW:
2012 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2014 error (SORT_FAILURE, 0, _("%s: count `%.*s' too large"),
2015 _(msgid), (int) (suffix - string), string);
2018 case LONGINT_INVALID:
2020 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
2021 _(msgid), quote (string));
2028 /* Handle interrupts and hangups. */
2031 sighandler (int sig)
2034 signal (sig, SIG_IGN);
2038 signal (sig, SIG_DFL);
2042 /* Set the ordering options for KEY specified in S.
2043 Return the address of the first character in S that
2044 is not a valid ordering option.
2045 BLANKTYPE is the kind of blanks that 'b' should skip. */
2048 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2055 if (blanktype == bl_start || blanktype == bl_both)
2056 key->skipsblanks = true;
2057 if (blanktype == bl_end || blanktype == bl_both)
2058 key->skipeblanks = true;
2061 key->ignore = nondictionary;
2064 key->translate = fold_toupper;
2067 key->general_numeric = true;
2070 /* Option order should not matter, so don't let -i override
2071 -d. -d implies -i, but -i does not imply -d. */
2073 key->ignore = nonprinting;
2079 key->numeric = true;
2082 key->reverse = true;
2092 static struct keyfield *
2095 struct keyfield *key = xzalloc (sizeof *key);
2096 key->eword = SIZE_MAX;
2101 main (int argc, char **argv)
2103 struct keyfield *key;
2104 struct keyfield gkey;
2107 bool checkonly = false;
2108 bool mergeonly = false;
2110 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2111 bool obsolete_usage = (posix2_version () < 200112);
2112 char *minus = "-", **files;
2113 char const *outfile = NULL;
2115 initialize_main (&argc, &argv);
2116 program_name = argv[0];
2117 setlocale (LC_ALL, "");
2118 bindtextdomain (PACKAGE, LOCALEDIR);
2119 textdomain (PACKAGE);
2123 initialize_exit_failure (SORT_FAILURE);
2124 atexit (close_stdout);
2126 hard_LC_COLLATE = hard_locale (LC_COLLATE);
2127 #if HAVE_NL_LANGINFO
2128 hard_LC_TIME = hard_locale (LC_TIME);
2131 /* Get locale's representation of the decimal point. */
2133 struct lconv const *locale = localeconv ();
2135 /* If the locale doesn't define a decimal point, or if the decimal
2136 point is multibyte, use the C locale's decimal point. FIXME:
2137 add support for multibyte decimal points. */
2138 decimal_point = to_uchar (locale->decimal_point[0]);
2139 if (! decimal_point || locale->decimal_point[1])
2140 decimal_point = '.';
2142 /* FIXME: add support for multibyte thousands separators. */
2143 thousands_sep = to_uchar (*locale->thousands_sep);
2144 if (! thousands_sep || locale->thousands_sep[1])
2148 have_read_stdin = false;
2153 static int const sig[] = { SIGHUP, SIGINT, SIGPIPE, SIGTERM };
2154 enum { nsigs = sizeof sig / sizeof sig[0] };
2157 struct sigaction act;
2159 sigemptyset (&caught_signals);
2160 for (i = 0; i < nsigs; i++)
2162 sigaction (sig[i], NULL, &act);
2163 if (act.sa_handler != SIG_IGN)
2164 sigaddset (&caught_signals, sig[i]);
2167 act.sa_handler = sighandler;
2168 act.sa_mask = caught_signals;
2171 for (i = 0; i < nsigs; i++)
2172 if (sigismember (&caught_signals, sig[i]))
2173 sigaction (sig[i], &act, NULL);
2175 for (i = 0; i < nsigs; i++)
2176 if (signal (sig[i], SIG_IGN) != SIG_IGN)
2178 signal (sig[i], sighandler);
2179 siginterrupt (sig[i], 1);
2184 gkey.sword = gkey.eword = SIZE_MAX;
2186 gkey.translate = NULL;
2187 gkey.numeric = gkey.general_numeric = gkey.month = gkey.reverse = false;
2188 gkey.skipsblanks = gkey.skipeblanks = false;
2190 files = xnmalloc (argc, sizeof *files);
2194 /* Parse an operand as a file after "--" was seen; or if
2195 pedantic and a file was seen, unless the POSIX version
2196 predates 1003.1-2001 and -c was not seen and the operand is
2197 "-o FILE" or "-oFILE". */
2200 || (posixly_correct && nfiles != 0
2201 && ! (obsolete_usage
2204 && argv[optind][0] == '-' && argv[optind][1] == 'o'
2205 && (argv[optind][2] || optind + 1 != argc)))
2206 || ((c = getopt_long (argc, argv, short_options,
2207 long_options, NULL))
2212 files[nfiles++] = argv[optind++];
2218 if (obsolete_usage && optarg[0] == '+')
2220 /* Treat +POS1 [-POS2] as a key if possible; but silently
2221 treat an operand as a file if it is not a valid +POS1. */
2223 s = parse_field_count (optarg + 1, &key->sword, NULL);
2225 s = parse_field_count (s + 1, &key->schar, NULL);
2226 if (! (key->sword | key->schar))
2227 key->sword = SIZE_MAX;
2228 if (! s || *set_ordering (s, key, bl_start))
2235 if (optind != argc && argv[optind][0] == '-'
2236 && ISDIGIT (argv[optind][1]))
2238 char const *optarg1 = argv[optind++];
2239 s = parse_field_count (optarg1 + 1, &key->eword,
2240 N_("invalid number after `-'"));
2242 s = parse_field_count (s + 1, &key->echar,
2243 N_("invalid number after `.'"));
2244 if (*set_ordering (s, key, bl_end))
2245 badfieldspec (optarg1,
2246 N_("stray character in field spec"));
2252 files[nfiles++] = optarg;
2267 set_ordering (str, &gkey, bl_both);
2279 s = parse_field_count (optarg, &key->sword,
2280 N_("invalid number at field start"));
2283 /* Provoke with `sort -k0' */
2284 badfieldspec (optarg, N_("field number is zero"));
2288 s = parse_field_count (s + 1, &key->schar,
2289 N_("invalid number after `.'"));
2292 /* Provoke with `sort -k1.0' */
2293 badfieldspec (optarg, N_("character offset is zero"));
2296 if (! (key->sword | key->schar))
2297 key->sword = SIZE_MAX;
2298 s = set_ordering (s, key, bl_start);
2301 key->eword = SIZE_MAX;
2307 s = parse_field_count (s + 1, &key->eword,
2308 N_("invalid number after `,'"));
2311 /* Provoke with `sort -k1,0' */
2312 badfieldspec (optarg, N_("field number is zero"));
2315 s = parse_field_count (s + 1, &key->echar,
2316 N_("invalid number after `.'"));
2319 /* `-k 2,3' is equivalent to `+1 -3'. */
2322 s = set_ordering (s, key, bl_end);
2325 badfieldspec (optarg, N_("stray character in field spec"));
2334 if (outfile && !STREQ (outfile, optarg))
2335 error (SORT_FAILURE, 0, _("multiple output files specified"));
2344 specify_sort_size (optarg);
2349 char newtab = optarg[0];
2351 error (SORT_FAILURE, 0, _("empty tab"));
2354 if (STREQ (optarg, "\\0"))
2358 /* Provoke with `sort -txx'. Complain about
2359 "multi-character tab" instead of "multibyte tab", so
2360 that the diagnostic's wording does not need to be
2361 changed once multibyte characters are supported. */
2362 error (SORT_FAILURE, 0, _("multi-character tab %s"),
2366 if (tab != TAB_DEFAULT && tab != newtab)
2367 error (SORT_FAILURE, 0, _("incompatible tabs"));
2373 add_temp_dir (optarg);
2381 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
2382 through Solaris 7. It is also accepted by many non-Solaris
2383 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
2384 -y is marked as obsolete starting with Solaris 8 (1999), but is
2385 still accepted as of Solaris 10 prerelease (2004).
2387 Solaris 2.5.1 "sort -y 100" reads the input file "100", but
2388 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
2389 and which in general ignores the argument after "-y" if it
2390 consists entirely of digits (it can even be empty). */
2391 if (optarg == argv[optind - 1])
2394 for (p = optarg; ISDIGIT (*p); p++)
2396 optind -= (*p != '\0');
2404 case_GETOPT_HELP_CHAR;
2406 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2409 usage (SORT_FAILURE);
2413 /* Inheritance of global options to individual keys. */
2414 for (key = keylist; key; key = key->next)
2415 if (! (key->ignore || key->translate
2416 || (key->skipsblanks | key->reverse
2417 | key->skipeblanks | key->month | key->numeric
2418 | key->general_numeric)))
2420 key->ignore = gkey.ignore;
2421 key->translate = gkey.translate;
2422 key->skipsblanks = gkey.skipsblanks;
2423 key->skipeblanks = gkey.skipeblanks;
2424 key->month = gkey.month;
2425 key->numeric = gkey.numeric;
2426 key->general_numeric = gkey.general_numeric;
2427 key->reverse = gkey.reverse;
2430 if (!keylist && (gkey.ignore || gkey.translate
2431 || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
2432 | gkey.numeric | gkey.general_numeric)))
2434 reverse = gkey.reverse;
2436 if (temp_dir_count == 0)
2438 char const *tmp_dir = getenv ("TMPDIR");
2439 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
2452 error (0, 0, _("extra operand %s not allowed with -c"),
2454 usage (SORT_FAILURE);
2457 /* POSIX requires that sort return 1 IFF invoked with -c and the
2458 input is not properly sorted. */
2459 exit (check (files[0]) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
2463 merge (files, 0, nfiles, outfile);
2465 sort (files, nfiles, outfile);
2467 if (have_read_stdin && fclose (stdin) == EOF)
2468 die (_("close failed"), "-");
2470 exit (EXIT_SUCCESS);