From 94615d2acfdccbbeb8eb6f8931d0e252b05e1484 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Mon, 2 Aug 2010 19:18:01 -0700 Subject: [PATCH] sort: revert recent -h changes and use a more-conservative approach * NEWS: Document changes to sort -h, which are now minor with respect to the pre-July-30th version. * doc/coreutils.texi (sort invocation): Likewise. The documentation now describes how -h comparison is done rather than being vague with border cases. * src/sort.c (long_double, strtold): Move back to general_numcompare. (LD, compute_human): Remove. (find_unit_order): Remove THOU_SEP parameter, since thousands separators are now allowed by all callers. Revert to previous behavior of sorting by suffix, and returning the order rather than 2 * order + binary, since we no longer care whether binary powers are being used. However, treat all zeros the same, instead of sorting 0M before 0G; this is more consistent with the desired behavior of sorting -1G before -1M. * tests/misc/sort (h1, h3, h6): Adjust to match mostly-reverted behavior. However, check that all zeros sort together. * tests/misc/sort-debug-keys: Omit a "_", since the trailing "i" in "1234Gi" is no longer part of the key. --- NEWS | 12 ++--- doc/coreutils.texi | 25 +++++------ src/sort.c | 108 +++++++++++++-------------------------------- tests/misc/sort | 12 ++--- tests/misc/sort-debug-keys | 2 +- 5 files changed, 53 insertions(+), 106 deletions(-) diff --git a/NEWS b/NEWS index 4e2cb3d..4662173 100644 --- a/NEWS +++ b/NEWS @@ -39,15 +39,9 @@ GNU coreutils NEWS -*- outline -*- sort -g now uses long doubles for greater range and precision. - sort -h no longer mishandles comparisons such as 5MiB vs 5MB, or - 6000K vs 5M. It uses floating-point arithmetic for these cases, - though, which means that the comparisons are not exact. This is not - a problem when sorting the output of df, du, and ls because this - output contains so few digits before suffixes. - - sort -h no longer rejects numbers ending in trailing "." or having - leading ".". It no longer accepts numbers with multiple "." or - numbers with thousands separators. + sort -h no longer rejects numbers with leading or trailing ".", and + no longer accepts numbers with multiple ".". It now considers all + zeros to be equal. sort now uses the number of available processors to parallelize the sorting operation. The number of sorts run concurrently can be diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 4a41430..66309b1 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3802,19 +3802,18 @@ converting to floating point. @opindex --sort @cindex human numeric sort @vindex LC_NUMERIC -Sort numerically, and in addition handle IEC or SI suffixes like MiB, -MB etc (@ref{Block size}). Each number consists of optional blanks, -an optional @samp{-} sign, zero or more digits, optionally followed by -a decimal-point character and zero or more digits, and optionally -followed by an IEC or SI suffix. A number with no digits is treated -as @samp{0}. The @env{LC_NUMERIC} locale specifies the decimal-point -character. - -Numbers containing differing suffixes are compared using -floating-point arithmetic, and therefore may not be compared exactly -due to rounding error. However, the output of @command{df}, -@command{du}, and @command{ls} contains so few digits before suffixes -that rounding error is not significant and comparisons are exact. +Sort numerically, first by numeric sign (negative, zero, or positive); +then by @acronym{SI} suffix (either empty, or @samp{k} or @samp{K}, or +one of @samp{MGTPEZY}, in that order; @xref{Block size}); and finally +by numeric value. For example, @samp{1023M} sorts before @samp{1G} +because @samp{M} (mega) precedes @samp{G} (giga) as an @acronym{SI} +suffix. This option sorts values that are consistently scaled to the +nearest suffix, regardless of whether suffixes denote powers of 1000 +or 1024, and it therefore sorts the output of any single invocation of +the @command{df}, @command{du}, or @command{ls} commands that are +invoked with their @option{--human-readable} or @option{--si} options. +The syntax for numbers is the same as for the @option{--numeric-sort} +option; the @acronym{SI} suffix must immediately follow the number. @item -i @itemx --ignore-nonprinting diff --git a/src/sort.c b/src/sort.c index ac5a079..e02e547 100644 --- a/src/sort.c +++ b/src/sort.c @@ -92,16 +92,6 @@ struct rlimit { size_t rlim_cur; }; #define UCHAR_LIM (UCHAR_MAX + 1) -#if HAVE_C99_STRTOLD -# define long_double long double -# define LD(x) x##L -#else -# define long_double double -# undef strtold -# define strtold strtod -# define LD(x) x -#endif - #ifndef DEFAULT_TMPDIR # define DEFAULT_TMPDIR "/tmp" #endif @@ -1803,15 +1793,15 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file) } /* Return an integer that represents the order of magnitude of the - unit following the number. If THOU_SEP is not negative, NUMBER can - contain thousands separators equal to THOU_SEP. It can also - contain a decimal point. But it may not contain leading blanks. + unit following the number. The number may contain thousands + separators and a decimal point, but it may not contain leading blanks. + Negative numbers get negative orders; zero numbers have a zero order. Store the address of the end of the number into *ENDPTR. */ static int -find_unit_order (char const *number, int thou_sep, char **endptr) +find_unit_order (char const *number, char **endptr) { - static char const powers[UCHAR_LIM] = + static char const orders[UCHAR_LIM] = { #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \ && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107) @@ -1839,7 +1829,10 @@ find_unit_order (char const *number, int thou_sep, char **endptr) #endif }; - char const *p = number + (*number == '-'); + bool minus_sign = (*number == '-'); + char const *p = number + minus_sign; + int nonzero = 0; + unsigned char ch; /* Scan to end of number. Decimals or separators not followed by digits stop the scan. @@ -1849,50 +1842,18 @@ find_unit_order (char const *number, int thou_sep, char **endptr) do { - while (*p == thousands_sep) - p++; + while (ISDIGIT (ch = *p++)) + nonzero |= ch - '0'; } - while (ISDIGIT (*p++)); + while (ch == thousands_sep); - if (p[-1] == decimal_point) - while (ISDIGIT (*p++)) - continue; - - unsigned char ch = p[-1]; - int power = powers[ch]; - int binary = (power ? *p == 'i': 0); - *endptr = (char *) p + (power ? binary : -1); - return 2 * power + binary; -} - -/* Convert the string P (ending at ENDP) to a floating point value. - The string is assumed to be followed by a SI or IEC prefix of type - ORDER. */ - -static long_double -compute_human (char const *p, char *endp, int order) -{ - static long_double const multiplier[] = - { - LD (1e00), LD ( 1.0), - LD (1e03), LD ( 1024.0), - LD (1e06), LD ( 1048576.0), - LD (1e09), LD ( 1073741824.0), - LD (1e12), LD ( 1099511627776.0), - LD (1e15), LD ( 1125899906842624.0), - LD (1e18), LD ( 1152921504606846976.0), - LD (1e21), LD ( 1180591620717411303424.0), - LD (1e24), LD (1208925819614629174706176.0) - }; + if (ch == decimal_point) + while (ISDIGIT (ch = *p++)) + nonzero |= ch - '0'; - char *e = endp; - if (order) - e -= 1 + (order & 1); - char ch = *e; - *e = '\0'; - long_double v = strtold (p, NULL); - *e = ch; - return v * multiplier[order]; + int order = (nonzero ? orders[ch] : 0); + *endptr = (char *) p - !order; + return (minus_sign ? -order : order); } /* Compare numbers A and B ending in units with SI or IEC prefixes @@ -1910,24 +1871,8 @@ human_numcompare (char *a, char *b, char **ea) while (blanks[to_uchar (*b)]) b++; - int order_a = find_unit_order (a, -1, ea); - int order_b = find_unit_order (b, -1, &endb); - - if (order_a == order_b) - { - /* Use strnumcmp if the orders are the same, since it has no - rounding problems and is faster. Do not allow thousands - separators since strtold does not. */ - return strnumcmp (a, b, decimal_point, -1); - } - else - { - /* Fall back on floating point, despite its rounding errors, - since strnumcmp can't handle mixed orders. */ - long_double aval = compute_human (a, *ea, order_a); - long_double bval = compute_human (b, endb, order_b); - return (aval < bval ? -1 : aval > bval); - } + int diff = find_unit_order (a, ea) - find_unit_order (b, &endb); + return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep)); } /* Compare strings A and B as numbers without explicitly converting them to @@ -1945,8 +1890,8 @@ numcompare (char const *a, char const *b, char **ea) if (debug) { /* Approximate strnumcmp extents with find_unit_order. */ - int order = find_unit_order (a, thousands_sep, ea); - *ea -= !!order + (order & 1); + int order = find_unit_order (a, ea); + *ea -= !!order; } return strnumcmp (a, b, decimal_point, thousands_sep); @@ -1957,6 +1902,15 @@ general_numcompare (char const *sa, char const *sb, char **ea) { /* FIXME: maybe add option to try expensive FP conversion only if A and B can't be compared more cheaply/accurately. */ + +#if HAVE_C99_STRTOLD +# define long_double long double +#else +# define long_double double +# undef strtold +# define strtold strtod +#endif + char *eb; long_double a = strtold (sa, ea); long_double b = strtold (sb, &eb); diff --git a/tests/misc/sort b/tests/misc/sort index 12222e1..202951c 100755 --- a/tests/misc/sort +++ b/tests/misc/sort @@ -55,17 +55,17 @@ my @Tests = ["n11b", '-s -n -k1,1', {IN=>".010\n.01a\n"}, {OUT=>".010\n.01a\n"}], # human readable suffixes -["h1", '-h', {IN=>"1Y\n1Z\n1E\n1P\n1T\n1G\n1M\n1K\n02\n1\n"}, - {OUT=>"1\n02\n1K\n1M\n1G\n1T\n1P\n1E\n1Z\n1Y\n"}], +["h1", '-h', {IN=>"1Y\n1Z\n1E\n1P\n1T\n1G\n1M\n1K\n02\n1\nY\n-1k\n-1M\n-1G\n-1T\n-1P\n-1E\n-1Z\n-1Y\n"}, + {OUT=>"-1Y\n-1Z\n-1E\n-1P\n-1T\n-1G\n-1M\n-1k\nY\n1\n02\n1K\n1M\n1G\n1T\n1P\n1E\n1Z\n1Y\n"}], ["h2", '-h', {IN=>"1M\n-2G\n-3K"}, {OUT=>"-2G\n-3K\n1M\n"}], -# check that powers of 1024 beat powers of 1000 -["h3", '-k 2,2h -k 1,1', {IN=>"a 1Mi\nb 1M\n"}, {OUT=>"b 1M\na 1Mi\n"}], +# check that it works with powers of 1024 +["h3", '-k 2,2h -k 1,1', {IN=>"a 1G\nb 1023M\n"}, {OUT=>"b 1023M\na 1G\n"}], # decimal at end => allowed ["h4", '-h', {IN=>"1.E\n2.M\n"}, {OUT=>"2.M\n1.E\n"}], # double decimal => ignore suffix ["h5", '-h', {IN=>"1..2E\n2..2M\n"}, {OUT=>"1..2E\n2..2M\n"}], -# handle inconsistent use of multiplers -["h6", '-h', {IN=>"1GiB\n1030MiB\n"}, {OUT=>"1GiB\n1030MiB\n"}], +# "M" sorts before "G" regardless of the positive number attached. +["h6", '-h', {IN=>"1GiB\n1030MiB\n"}, {OUT=>"1030MiB\n1GiB\n"}], # check option incompatibility ["h7", '-hn', {IN=>""}, {OUT=>""}, {EXIT=>2}, {ERR=>"$prog: options `-hn' are incompatible\n"}], diff --git a/tests/misc/sort-debug-keys b/tests/misc/sort-debug-keys index 89f8b9b..36b67d2 100755 --- a/tests/misc/sort-debug-keys +++ b/tests/misc/sort-debug-keys @@ -293,7 +293,7 @@ _____ ^ no match for key ____ ____ - ______ + _____ _____ _____ ______ -- 2.7.4