[flang] Work around slow clang-7

author peter klausler <pklausler@nvidia.com>

Tue, 20 Aug 2019 19:58:27 +0000 (12:58 -0700)

committer peter klausler <pklausler@nvidia.com>

Fri, 23 Aug 2019 18:31:32 +0000 (11:31 -0700)
author peter klausler <pklausler@nvidia.com>
Tue, 20 Aug 2019 19:58:27 +0000 (12:58 -0700)
committer peter klausler <pklausler@nvidia.com>
Fri, 23 Aug 2019 18:31:32 +0000 (11:31 -0700)
diff --git a/flang/lib/decimal/big-radix-floating-point.h b/flang/lib/decimal/big-radix-floating-point.h

index 445a144..49e9efc 100644 (file)
--- a/flang/lib/decimal/big-radix-floating-point.h
+++ b/flang/lib/decimal/big-radix-floating-point.h
@@ -27,6 +27,7 @@
  
  #include "binary-floating-point.h"
  #include "decimal.h"
+#include "int-divide-workaround.h"
  #include "../common/bit-population-count.h"
  #include "../common/leading-zero-bit-count.h"
  #include <cinttypes>
@@ -133,7 +134,7 @@ private:
          std::is_same_v<UINT, __uint128_t> || std::is_unsigned_v<UINT>);
      SetToZero();
      while (n != 0) {
-      auto q{n / 10};
+      auto q{FastDivision<UINT, 10>(n)};
        if (n != 10 * q) {
          break;
        }
@@ -147,7 +148,7 @@ private:
        return 0;
      } else {
        while (n != 0 && digits_ < digitLimit_) {
-        auto q{n / radix};
+        auto q{FastDivision<UINT, radix>(n)};
          digit_[digits_++] = n - radix * q;
          n = q;
        }
@@ -195,7 +196,7 @@ private:
      Digit remainder{0};
      for (int j{digits_ - 1}; j >= 0; --j) {
        // N.B. Because DIVISOR is a constant, these operations should be cheap.
-      Digit q{digit_[j] / DIVISOR};
+      Digit q{FastDivision<Digit, DIVISOR>(digit_[j])};
        Digit nrem{digit_[j] - DIVISOR * q};
        digit_[j] = q + (radix / DIVISOR) * remainder;
        remainder = nrem;
@@ -244,9 +245,9 @@ private:
  
    template<int N> int MultiplyByHelper(int carry = 0) {
      for (int j{0}; j < digits_; ++j) {
-      digit_[j] = N * digit_[j] + carry;
-      carry = digit_[j] / radix;  // N.B. radix is constant, this is fast
-      digit_[j] -= carry * radix;
+      auto v{N * digit_[j] + carry};
+      carry = FastDivision<Digit, radix>(v);
+      digit_[j] = v - carry * radix;  // i.e., v % radix
      }
      return carry;
    }
diff --git a/flang/lib/decimal/binary-to-decimal.cc b/flang/lib/decimal/binary-to-decimal.cc

index 6a8a188..cec8ba2 100644 (file)
--- a/flang/lib/decimal/binary-to-decimal.cc
+++ b/flang/lib/decimal/binary-to-decimal.cc
@@ -139,7 +139,7 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToDecimal(char *buffer,
    // Treat the MSD specially: don't emit leading zeroes.
    Digit dig{digit_[digits_ - 1]};
    for (int k{0}; k < LOG10RADIX; k += 2) {
-    Digit d{dig / hundredth};
+    Digit d{FastDivision<Digit, hundredth>(dig)};
      dig = 100 * (dig - d * hundredth);
      const char *q{lut + 2 * d};
      if (q[0] != '0' || p > start) {
@@ -152,7 +152,7 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToDecimal(char *buffer,
    for (int j{digits_ - 1}; j-- > 0;) {
      Digit dig{digit_[j]};
      for (int k{0}; k < log10Radix; k += 2) {
-      Digit d{dig / hundredth};
+      Digit d{FastDivision<Digit, hundredth>(dig)};
        dig = 100 * (dig - d * hundredth);
        const char *q = lut + 2 * d;
        *p++ = q[0];
@@ -276,9 +276,9 @@ void BigRadixFloatingPointNumber<PREC, LOG10RADIX>::Minimize(
    Digit least{less.digit_[offset]};
    Digit my{digit_[0]};
    while (true) {
-    Digit q{my / 10};
+    Digit q{FastDivision<Digit, 10>(my)};
      Digit r{my - 10 * q};
-    Digit lq{least / 10};
+    Digit lq{FastDivision<Digit, 10>(least)};
      Digit lr{least - 10 * lq};
      if (r != 0 && lq == q) {
        Digit sub{(r - lr) >> 1};
diff --git a/flang/lib/decimal/decimal-to-binary.cc b/flang/lib/decimal/decimal-to-binary.cc

index a1fbec7..f388d64 100644 (file)
--- a/flang/lib/decimal/decimal-to-binary.cc
+++ b/flang/lib/decimal/decimal-to-binary.cc
@@ -50,8 +50,8 @@ bool BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ParseNumber(
    if (q == start || (q == start + 1 && *start == '.')) {
      return false;  // require at least one digit
    }
-  auto times{radix};
    const char *d{q};
+  // Strip off trailing zeroes
    if (point != nullptr) {
      while (d > firstDigit && d[-1] == '0') {
        --d;
@@ -68,11 +68,12 @@ bool BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ParseNumber(
      }
    }
    if (d == firstDigit) {
-    exponent_ = 0;
+    exponent_ = 0;  // all zeros
    }
    if (point != nullptr) {
      exponent_ -= static_cast<int>(d - point - 1);
    }
+  // Trim any excess digits
    const char *limit{firstDigit + maxDigits * log10Radix + (point != nullptr)};
    if (d > limit) {
      inexact = true;
@@ -85,7 +86,8 @@ bool BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ParseNumber(
        }
      }
    }
-  while (d-- > firstDigit) {
+  // Rack the decimal digits up into big Digits.
+  for (auto times{radix}; d-- > firstDigit;) {
      if (*d != '.') {
        if (times == radix) {
          digit_[digits_++] = *d - '0';
@@ -253,7 +255,7 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToBinary() {
    // large power of ten.  Its radix point is defined to be to the right of its
    // digits, and "exponent_" is the power of ten by which it is to be scaled.
    Normalize();
-  if (digits_ == 0) {
+  if (digits_ == 0) {  // zero value
      if (isNegative_) {
        using Raw = typename Binary::RawType;
        Raw negZero{static_cast<Raw>(1) << (Binary::bits - 1)};
@@ -263,12 +265,22 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToBinary() {
      }
    }
    // The value is not zero.
+  IntermediateFloat<PREC> f;
+#if 0  // actually a small loss
+  // Make the value odd.
+  if (int trailing0s{common::TrailingZeroBitCount(digit_[0])}) {
+    f.AdjustExponent(trailing0s);
+    for (; trailing0s > log10Radix; trailing0s -= log10Radix) {
+      DivideByPowerOfTwo(log10Radix);
+    }
+    DivideByPowerOfTwo(trailing0s);
+  }
+#endif
    // Shift our perspective on the radix (& decimal) point so that
    // it sits to the *left* of the digits.
    exponent_ += digits_ * log10Radix;
    // Apply any negative decimal exponent by multiplication
    // by a power of two, adjusting the binary exponent to compensate.
-  IntermediateFloat<PREC> f;
    while (exponent_ < log10Radix) {
      f.AdjustExponent(-9);
      digitLimit_ = digits_;
diff --git a/flang/lib/decimal/int-divide-workaround.h b/flang/lib/decimal/int-divide-workaround.h

new file mode 100644 (file)

index 0000000..8ccb459
--- /dev/null
+++ b/flang/lib/decimal/int-divide-workaround.h
@@ -0,0 +1,80 @@
+// Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef INT_DIVIDE_H_
+#define INT_DIVIDE_H_
+
+// Work around unoptimized implementations of unsigned integer division
+// by constant values in some compilers (looking at YOU, clang 7!)
+
+#ifdef __clang__
+#if __clang_major__ < 8
+#define USE_INT_DIVIDE_WORKAROUNDS 1
+#endif
+#endif
+
+#include <cinttypes>
+
+namespace Fortran::decimal {
+
+template<typename UINT, UINT DENOM> inline constexpr UINT FastDivision(UINT n) {
+  return n / DENOM;
+}
+
+#if USE_INT_DIVIDE_WORKAROUNDS
+template<> inline constexpr std::uint64_t FastDivision<std::uint64_t, 10000000000000000u>(std::uint64_t n) {
+  return (static_cast<__uint128_t>(0x39a5652fb1137857) * n) >> (64 + 51);
+}
+
+template<> inline constexpr std::uint64_t FastDivision<std::uint64_t, 100000000000000u>(std::uint64_t n) {
+  return (static_cast<__uint128_t>(0xb424dc35095cd81) * n) >> (64 + 42);
+}
+
+template<> inline constexpr std::uint32_t FastDivision<std::uint32_t, 1000000u>(std::uint32_t n) {
+  return (static_cast<std::uint64_t>(0x431bde83) * n) >> (32 + 18);
+}
+
+template<> inline constexpr std::uint32_t FastDivision<std::uint32_t, 10000u>(std::uint32_t n) {
+  return (static_cast<std::uint64_t>(0xd1b71759) * n) >> (32 + 13);
+}
+
+template<> inline constexpr std::uint64_t FastDivision<std::uint64_t, 10u>(std::uint64_t n) {
+  return (static_cast<__uint128_t>(0xcccccccccccccccd) * n) >> (64 + 3);
+}
+
+template<> inline constexpr std::uint32_t FastDivision<std::uint32_t, 10u>(std::uint32_t n) {
+  return (static_cast<std::uint64_t>(0xcccccccd) * n) >> (32 + 3);
+}
+
+template<> inline constexpr std::uint64_t FastDivision<std::uint64_t, 5u>(std::uint64_t n) {
+  return (static_cast<__uint128_t>(0xcccccccccccccccd) * n) >> (64 + 2);
+}
+
+template<> inline constexpr std::uint32_t FastDivision<std::uint32_t, 5u>(std::uint32_t n) {
+  return (static_cast<std::uint64_t>(0xcccccccd) * n) >> (32 + 2);
+}
+#endif
+
+static_assert(FastDivision<std::uint64_t, 10000000000000000u>(9999999999999999u) == 0);
+static_assert(FastDivision<std::uint64_t, 10000000000000000u>(10000000000000000u) == 1);
+static_assert(FastDivision<std::uint64_t, 100000000000000u>(99999999999999u) == 0);
+static_assert(FastDivision<std::uint64_t, 100000000000000u>(100000000000000u) == 1);
+static_assert(FastDivision<std::uint32_t, 1000000u>(999999u) == 0);
+static_assert(FastDivision<std::uint32_t, 1000000u>(1000000u) == 1);
+static_assert(FastDivision<std::uint64_t, 10>(18446744073709551615u) == 1844674407370955161u);
+static_assert(FastDivision<std::uint32_t, 10>(4294967295u) == 429496729u);
+static_assert(FastDivision<std::uint64_t, 5>(18446744073709551615u) == 3689348814741910323u);
+static_assert(FastDivision<std::uint32_t, 5>(4294967295u) == 858993459u);
+}
+#endif
author	peter klausler <pklausler@nvidia.com>
	Tue, 20 Aug 2019 19:58:27 +0000 (12:58 -0700)
committer	peter klausler <pklausler@nvidia.com>
	Fri, 23 Aug 2019 18:31:32 +0000 (11:31 -0700)
flang/lib/decimal/big-radix-floating-point.h		patch \| blob \| history
flang/lib/decimal/binary-to-decimal.cc		patch \| blob \| history
flang/lib/decimal/decimal-to-binary.cc		patch \| blob \| history
flang/lib/decimal/int-divide-workaround.h	[new file with mode: 0644]	patch \| blob