sort: revert recent -h changes and use a more-conservative approach

author Paul Eggert <eggert@cs.ucla.edu>

Tue, 3 Aug 2010 02:18:01 +0000 (19:18 -0700)

committer Paul Eggert <eggert@cs.ucla.edu>

Tue, 3 Aug 2010 02:21:02 +0000 (19:21 -0700)
author Paul Eggert <eggert@cs.ucla.edu>
Tue, 3 Aug 2010 02:18:01 +0000 (19:18 -0700)
committer Paul Eggert <eggert@cs.ucla.edu>
Tue, 3 Aug 2010 02:21:02 +0000 (19:21 -0700)
diff --git a/NEWS b/NEWS

index 4e2cb3d..4662173 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -39,15 +39,9 @@ GNU coreutils NEWS                                    -*- outline -*-
  
    sort -g now uses long doubles for greater range and precision.
  
-  sort -h no longer mishandles comparisons such as 5MiB vs 5MB, or
-  6000K vs 5M.  It uses floating-point arithmetic for these cases,
-  though, which means that the comparisons are not exact.  This is not
-  a problem when sorting the output of df, du, and ls because this
-  output contains so few digits before suffixes.
-
-  sort -h no longer rejects numbers ending in trailing "." or having
-  leading ".".  It no longer accepts numbers with multiple "." or
-  numbers with thousands separators.
+  sort -h no longer rejects numbers with leading or trailing ".", and
+  no longer accepts numbers with multiple ".".  It now considers all
+  zeros to be equal.
  
    sort now uses the number of available processors to parallelize
    the sorting operation.  The number of sorts run concurrently can be
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index 4a41430..66309b1 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3802,19 +3802,18 @@ converting to floating point.
  @opindex --sort
  @cindex human numeric sort
  @vindex LC_NUMERIC
-Sort numerically, and in addition handle IEC or SI suffixes like MiB,
-MB etc (@ref{Block size}).  Each number consists of optional blanks,
-an optional @samp{-} sign, zero or more digits, optionally followed by
-a decimal-point character and zero or more digits, and optionally
-followed by an IEC or SI suffix.  A number with no digits is treated
-as @samp{0}. The @env{LC_NUMERIC} locale specifies the decimal-point
-character.
-
-Numbers containing differing suffixes are compared using
-floating-point arithmetic, and therefore may not be compared exactly
-due to rounding error.  However, the output of @command{df},
-@command{du}, and @command{ls} contains so few digits before suffixes
-that rounding error is not significant and comparisons are exact.
+Sort numerically, first by numeric sign (negative, zero, or positive);
+then by @acronym{SI} suffix (either empty, or @samp{k} or @samp{K}, or
+one of @samp{MGTPEZY}, in that order; @xref{Block size}); and finally
+by numeric value.  For example, @samp{1023M} sorts before @samp{1G}
+because @samp{M} (mega) precedes @samp{G} (giga) as an @acronym{SI}
+suffix.  This option sorts values that are consistently scaled to the
+nearest suffix, regardless of whether suffixes denote powers of 1000
+or 1024, and it therefore sorts the output of any single invocation of
+the @command{df}, @command{du}, or @command{ls} commands that are
+invoked with their @option{--human-readable} or @option{--si} options.
+The syntax for numbers is the same as for the @option{--numeric-sort}
+option; the @acronym{SI} suffix must immediately follow the number.
  
  @item -i
  @itemx --ignore-nonprinting
diff --git a/src/sort.c b/src/sort.c

index ac5a079..e02e547 100644 (file)
--- a/src/sort.c
+++ b/src/sort.c
@@ -92,16 +92,6 @@ struct rlimit { size_t rlim_cur; };
  
  #define UCHAR_LIM (UCHAR_MAX + 1)
  
-#if HAVE_C99_STRTOLD
-# define long_double long double
-# define LD(x) x##L
-#else
-# define long_double double
-# undef strtold
-# define strtold strtod
-# define LD(x) x
-#endif
-
  #ifndef DEFAULT_TMPDIR
  # define DEFAULT_TMPDIR "/tmp"
  #endif
@@ -1803,15 +1793,15 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file)
  }
  
  /* Return an integer that represents the order of magnitude of the
-   unit following the number.  If THOU_SEP is not negative, NUMBER can
-   contain thousands separators equal to THOU_SEP.  It can also
-   contain a decimal point.  But it may not contain leading blanks.
+   unit following the number.  The number may contain thousands
+   separators and a decimal point, but it may not contain leading blanks.
+   Negative numbers get negative orders; zero numbers have a zero order.
     Store the address of the end of the number into *ENDPTR.  */
  
  static int
-find_unit_order (char const *number, int thou_sep, char **endptr)
+find_unit_order (char const *number, char **endptr)
  {
-  static char const powers[UCHAR_LIM] =
+  static char const orders[UCHAR_LIM] =
      {
  #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
         && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
@@ -1839,7 +1829,10 @@ find_unit_order (char const *number, int thou_sep, char **endptr)
  #endif
      };
  
-  char const *p = number + (*number == '-');
+  bool minus_sign = (*number == '-');
+  char const *p = number + minus_sign;
+  int nonzero = 0;
+  unsigned char ch;
  
    /* Scan to end of number.
       Decimals or separators not followed by digits stop the scan.
@@ -1849,50 +1842,18 @@ find_unit_order (char const *number, int thou_sep, char **endptr)
  
    do
      {
-      while (*p == thousands_sep)
-        p++;
+      while (ISDIGIT (ch = *p++))
+        nonzero |= ch - '0';
      }
-  while (ISDIGIT (*p++));
+  while (ch == thousands_sep);
  
-  if (p[-1] == decimal_point)
-    while (ISDIGIT (*p++))
-      continue;
-
-  unsigned char ch = p[-1];
-  int power = powers[ch];
-  int binary = (power ? *p == 'i': 0);
-  *endptr = (char *) p + (power ? binary : -1);
-  return 2 * power + binary;
-}
-
-/* Convert the string P (ending at ENDP) to a floating point value.
-   The string is assumed to be followed by a SI or IEC prefix of type
-   ORDER.  */
-
-static long_double
-compute_human (char const *p, char *endp, int order)
-{
-  static long_double const multiplier[] =
-    {
-      LD (1e00), LD (                        1.0),
-      LD (1e03), LD (                     1024.0),
-      LD (1e06), LD (                  1048576.0),
-      LD (1e09), LD (               1073741824.0),
-      LD (1e12), LD (            1099511627776.0),
-      LD (1e15), LD (         1125899906842624.0),
-      LD (1e18), LD (      1152921504606846976.0),
-      LD (1e21), LD (   1180591620717411303424.0),
-      LD (1e24), LD (1208925819614629174706176.0)
-    };
+  if (ch == decimal_point)
+    while (ISDIGIT (ch = *p++))
+      nonzero |= ch - '0';
  
-  char *e = endp;
-  if (order)
-    e -= 1 + (order & 1);
-  char ch = *e;
-  *e = '\0';
-  long_double v = strtold (p, NULL);
-  *e = ch;
-  return v * multiplier[order];
+  int order = (nonzero ? orders[ch] : 0);
+  *endptr = (char *) p - !order;
+  return (minus_sign ? -order : order);
  }
  
  /* Compare numbers A and B ending in units with SI or IEC prefixes
@@ -1910,24 +1871,8 @@ human_numcompare (char *a, char *b, char **ea)
    while (blanks[to_uchar (*b)])
      b++;
  
-  int order_a = find_unit_order (a, -1, ea);
-  int order_b = find_unit_order (b, -1, &endb);
-
-  if (order_a == order_b)
-    {
-      /* Use strnumcmp if the orders are the same, since it has no
-         rounding problems and is faster.  Do not allow thousands
-         separators since strtold does not.  */
-      return strnumcmp (a, b, decimal_point, -1);
-    }
-  else
-    {
-      /* Fall back on floating point, despite its rounding errors,
-         since strnumcmp can't handle mixed orders.  */
-      long_double aval = compute_human (a, *ea, order_a);
-      long_double bval = compute_human (b, endb, order_b);
-      return (aval < bval ? -1 : aval > bval);
-    }
+  int diff = find_unit_order (a, ea) - find_unit_order (b, &endb);
+  return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
  }
  
  /* Compare strings A and B as numbers without explicitly converting them to
@@ -1945,8 +1890,8 @@ numcompare (char const *a, char const *b, char **ea)
    if (debug)
      {
        /* Approximate strnumcmp extents with find_unit_order.  */
-      int order = find_unit_order (a, thousands_sep, ea);
-      *ea -= !!order + (order & 1);
+      int order = find_unit_order (a, ea);
+      *ea -= !!order;
      }
  
    return strnumcmp (a, b, decimal_point, thousands_sep);
@@ -1957,6 +1902,15 @@ general_numcompare (char const *sa, char const *sb, char **ea)
  {
    /* FIXME: maybe add option to try expensive FP conversion
       only if A and B can't be compared more cheaply/accurately.  */
+
+#if HAVE_C99_STRTOLD
+# define long_double long double
+#else
+# define long_double double
+# undef strtold
+# define strtold strtod
+#endif
+
    char *eb;
    long_double a = strtold (sa, ea);
    long_double b = strtold (sb, &eb);
diff --git a/tests/misc/sort b/tests/misc/sort

index 12222e1..202951c 100755 (executable)
--- a/tests/misc/sort
+++ b/tests/misc/sort
@@ -55,17 +55,17 @@ my @Tests =
  ["n11b", '-s -n -k1,1', {IN=>".010\n.01a\n"}, {OUT=>".010\n.01a\n"}],
  
  # human readable suffixes
-["h1", '-h', {IN=>"1Y\n1Z\n1E\n1P\n1T\n1G\n1M\n1K\n02\n1\n"},
- {OUT=>"1\n02\n1K\n1M\n1G\n1T\n1P\n1E\n1Z\n1Y\n"}],
+["h1", '-h', {IN=>"1Y\n1Z\n1E\n1P\n1T\n1G\n1M\n1K\n02\n1\nY\n-1k\n-1M\n-1G\n-1T\n-1P\n-1E\n-1Z\n-1Y\n"},
+ {OUT=>"-1Y\n-1Z\n-1E\n-1P\n-1T\n-1G\n-1M\n-1k\nY\n1\n02\n1K\n1M\n1G\n1T\n1P\n1E\n1Z\n1Y\n"}],
  ["h2", '-h', {IN=>"1M\n-2G\n-3K"}, {OUT=>"-2G\n-3K\n1M\n"}],
-# check that powers of 1024 beat powers of 1000
-["h3", '-k 2,2h -k 1,1', {IN=>"a 1Mi\nb 1M\n"}, {OUT=>"b 1M\na 1Mi\n"}],
+# check that it works with powers of 1024
+["h3", '-k 2,2h -k 1,1', {IN=>"a 1G\nb 1023M\n"}, {OUT=>"b 1023M\na 1G\n"}],
  # decimal at end => allowed
  ["h4", '-h', {IN=>"1.E\n2.M\n"}, {OUT=>"2.M\n1.E\n"}],
  # double decimal => ignore suffix
  ["h5", '-h', {IN=>"1..2E\n2..2M\n"}, {OUT=>"1..2E\n2..2M\n"}],
-# handle inconsistent use of multiplers
-["h6", '-h', {IN=>"1GiB\n1030MiB\n"}, {OUT=>"1GiB\n1030MiB\n"}],
+# "M" sorts before "G" regardless of the positive number attached.
+["h6", '-h', {IN=>"1GiB\n1030MiB\n"}, {OUT=>"1030MiB\n1GiB\n"}],
  # check option incompatibility
  ["h7", '-hn', {IN=>""}, {OUT=>""}, {EXIT=>2},
   {ERR=>"$prog: options `-hn' are incompatible\n"}],
diff --git a/tests/misc/sort-debug-keys b/tests/misc/sort-debug-keys

index 89f8b9b..36b67d2 100755 (executable)
--- a/tests/misc/sort-debug-keys
+++ b/tests/misc/sort-debug-keys
@@ -293,7 +293,7 @@ _____
  ^ no match for key
        ____
        ____
-      ______
+      _____
               _____
               _____
               ______
author	Paul Eggert <eggert@cs.ucla.edu>
	Tue, 3 Aug 2010 02:18:01 +0000 (19:18 -0700)
committer	Paul Eggert <eggert@cs.ucla.edu>
	Tue, 3 Aug 2010 02:21:02 +0000 (19:21 -0700)
NEWS		patch \| blob \| history
doc/coreutils.texi		patch \| blob \| history
src/sort.c		patch \| blob \| history
tests/misc/sort		patch \| blob \| history
tests/misc/sort-debug-keys		patch \| blob \| history