From 64a0a58bed55063fb311da8c47adcd58f0334563 Mon Sep 17 00:00:00 2001
From: peter klausler <pklausler@nvidia.com>
Date: Tue, 20 Aug 2019 12:58:27 -0700
Subject: [PATCH] [flang] Work around slow clang-7

Original-commit: flang-compiler/f18@ed634d72e3d51d56010ef5aedef614f3cda6e076
Reviewed-on: https://github.com/flang-compiler/f18/pull/671
Tree-same-pre-rewrite: false
---
 flang/lib/decimal/big-radix-floating-point.h | 13 ++---
 flang/lib/decimal/binary-to-decimal.cc       |  8 +--
 flang/lib/decimal/decimal-to-binary.cc       | 22 ++++++--
 flang/lib/decimal/int-divide-workaround.h    | 80 ++++++++++++++++++++++++++++
 4 files changed, 108 insertions(+), 15 deletions(-)
 create mode 100644 flang/lib/decimal/int-divide-workaround.h
diff --git a/flang/lib/decimal/big-radix-floating-point.h b/flang/lib/decimal/big-radix-floating-point.h
index 445a144..49e9efc 100644
--- a/flang/lib/decimal/big-radix-floating-point.h
+++ b/flang/lib/decimal/big-radix-floating-point.h
@@ -27,6 +27,7 @@
 
 #include "binary-floating-point.h"
 #include "decimal.h"
+#include "int-divide-workaround.h"
 #include "../common/bit-population-count.h"
 #include "../common/leading-zero-bit-count.h"
 #include <cinttypes>
@@ -133,7 +134,7 @@ private:
         std::is_same_v<UINT, __uint128_t> || std::is_unsigned_v<UINT>);
     SetToZero();
     while (n != 0) {
-      auto q{n / 10};
+      auto q{FastDivision<UINT, 10>(n)};
       if (n != 10 * q) {
         break;
       }
@@ -147,7 +148,7 @@ private:
       return 0;
     } else {
       while (n != 0 && digits_ < digitLimit_) {
-        auto q{n / radix};
+        auto q{FastDivision<UINT, radix>(n)};
         digit_[digits_++] = n - radix * q;
         n = q;
       }
@@ -195,7 +196,7 @@ private:
     Digit remainder{0};
     for (int j{digits_ - 1}; j >= 0; --j) {
       // N.B. Because DIVISOR is a constant, these operations should be cheap.
-      Digit q{digit_[j] / DIVISOR};
+      Digit q{FastDivision<Digit, DIVISOR>(digit_[j])};
       Digit nrem{digit_[j] - DIVISOR * q};
       digit_[j] = q + (radix / DIVISOR) * remainder;
       remainder = nrem;
@@ -244,9 +245,9 @@ private:
 
   template<int N> int MultiplyByHelper(int carry = 0) {
     for (int j{0}; j < digits_; ++j) {
-      digit_[j] = N * digit_[j] + carry;
-      carry = digit_[j] / radix;  // N.B. radix is constant, this is fast
-      digit_[j] -= carry * radix;
+      auto v{N * digit_[j] + carry};
+      carry = FastDivision<Digit, radix>(v);
+      digit_[j] = v - carry * radix;  // i.e., v % radix
     }
     return carry;
   }
diff --git a/flang/lib/decimal/binary-to-decimal.cc b/flang/lib/decimal/binary-to-decimal.cc
index 6a8a188..cec8ba2 100644
--- a/flang/lib/decimal/binary-to-decimal.cc
+++ b/flang/lib/decimal/binary-to-decimal.cc
@@ -139,7 +139,7 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToDecimal(char *buffer,
   // Treat the MSD specially: don't emit leading zeroes.
   Digit dig{digit_[digits_ - 1]};
   for (int k{0}; k < LOG10RADIX; k += 2) {
-    Digit d{dig / hundredth};
+    Digit d{FastDivision<Digit, hundredth>(dig)};
     dig = 100 * (dig - d * hundredth);
     const char *q{lut + 2 * d};
     if (q[0] != '0' || p > start) {
@@ -152,7 +152,7 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToDecimal(char *buffer,
   for (int j{digits_ - 1}; j-- > 0;) {
     Digit dig{digit_[j]};
     for (int k{0}; k < log10Radix; k += 2) {
-      Digit d{dig / hundredth};
+      Digit d{FastDivision<Digit, hundredth>(dig)};
       dig = 100 * (dig - d * hundredth);
       const char *q = lut + 2 * d;
       *p++ = q[0];
@@ -276,9 +276,9 @@ void BigRadixFloatingPointNumber<PREC, LOG10RADIX>::Minimize(
   Digit least{less.digit_[offset]};
   Digit my{digit_[0]};
   while (true) {
-    Digit q{my / 10};
+    Digit q{FastDivision<Digit, 10>(my)};
     Digit r{my - 10 * q};
-    Digit lq{least / 10};
+    Digit lq{FastDivision<Digit, 10>(least)};
     Digit lr{least - 10 * lq};
     if (r != 0 && lq == q) {
       Digit sub{(r - lr) >> 1};
diff --git a/flang/lib/decimal/decimal-to-binary.cc b/flang/lib/decimal/decimal-to-binary.cc
index a1fbec7..f388d64 100644
--- a/flang/lib/decimal/decimal-to-binary.cc
+++ b/flang/lib/decimal/decimal-to-binary.cc
@@ -50,8 +50,8 @@ bool BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ParseNumber(
   if (q == start || (q == start + 1 && *start == '.')) {
     return false;  // require at least one digit
   }
-  auto times{radix};
   const char *d{q};
+  // Strip off trailing zeroes
   if (point != nullptr) {
     while (d > firstDigit && d[-1] == '0') {
       --d;
@@ -68,11 +68,12 @@ bool BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ParseNumber(
     }
   }
   if (d == firstDigit) {
-    exponent_ = 0;
+    exponent_ = 0;  // all zeros
   }
   if (point != nullptr) {
     exponent_ -= static_cast<int>(d - point - 1);
   }
+  // Trim any excess digits
   const char *limit{firstDigit + maxDigits * log10Radix + (point != nullptr)};
   if (d > limit) {
     inexact = true;
@@ -85,7 +86,8 @@ bool BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ParseNumber(
       }
     }
   }
-  while (d-- > firstDigit) {
+  // Rack the decimal digits up into big Digits.
+  for (auto times{radix}; d-- > firstDigit;) {
     if (*d != '.') {
       if (times == radix) {
         digit_[digits_++] = *d - '0';
@@ -253,7 +255,7 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToBinary() {
   // large power of ten.  Its radix point is defined to be to the right of its
   // digits, and "exponent_" is the power of ten by which it is to be scaled.
   Normalize();
-  if (digits_ == 0) {
+  if (digits_ == 0) {  // zero value
     if (isNegative_) {
       using Raw = typename Binary::RawType;
       Raw negZero{static_cast<Raw>(1) << (Binary::bits - 1)};
@@ -263,12 +265,22 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToBinary() {
     }
   }
   // The value is not zero.
+  IntermediateFloat<PREC> f;
+#if 0  // actually a small loss
+  // Make the value odd.
+  if (int trailing0s{common::TrailingZeroBitCount(digit_[0])}) {
+    f.AdjustExponent(trailing0s);
+    for (; trailing0s > log10Radix; trailing0s -= log10Radix) {
+      DivideByPowerOfTwo(log10Radix);
+    }
+    DivideByPowerOfTwo(trailing0s);
+  }
+#endif
   // Shift our perspective on the radix (& decimal) point so that
   // it sits to the *left* of the digits.
   exponent_ += digits_ * log10Radix;
   // Apply any negative decimal exponent by multiplication
   // by a power of two, adjusting the binary exponent to compensate.
-  IntermediateFloat<PREC> f;
   while (exponent_ < log10Radix) {
     f.AdjustExponent(-9);
     digitLimit_ = digits_;
diff --git a/flang/lib/decimal/int-divide-workaround.h b/flang/lib/decimal/int-divide-workaround.h
new file mode 100644
index 0000000..8ccb459
--- /dev/null
+++ b/flang/lib/decimal/int-divide-workaround.h
@@ -0,0 +1,80 @@
+// Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef INT_DIVIDE_H_
+#define INT_DIVIDE_H_
+
+// Work around unoptimized implementations of unsigned integer division
+// by constant values in some compilers (looking at YOU, clang 7!)
+
+#ifdef __clang__
+#if __clang_major__ < 8
+#define USE_INT_DIVIDE_WORKAROUNDS 1
+#endif
+#endif
+
+#include <cinttypes>
+
+namespace Fortran::decimal {
+
+template<typename UINT, UINT DENOM> inline constexpr UINT FastDivision(UINT n) {
+  return n / DENOM;
+}
+
+#if USE_INT_DIVIDE_WORKAROUNDS
+template<> inline constexpr std::uint64_t FastDivision<std::uint64_t, 10000000000000000u>(std::uint64_t n) {
+  return (static_cast<__uint128_t>(0x39a5652fb1137857) * n) >> (64 + 51);
+}
+
+template<> inline constexpr std::uint64_t FastDivision<std::uint64_t, 100000000000000u>(std::uint64_t n) {
+  return (static_cast<__uint128_t>(0xb424dc35095cd81) * n) >> (64 + 42);
+}
+
+template<> inline constexpr std::uint32_t FastDivision<std::uint32_t, 1000000u>(std::uint32_t n) {
+  return (static_cast<std::uint64_t>(0x431bde83) * n) >> (32 + 18);
+}
+
+template<> inline constexpr std::uint32_t FastDivision<std::uint32_t, 10000u>(std::uint32_t n) {
+  return (static_cast<std::uint64_t>(0xd1b71759) * n) >> (32 + 13);
+}
+
+template<> inline constexpr std::uint64_t FastDivision<std::uint64_t, 10u>(std::uint64_t n) {
+  return (static_cast<__uint128_t>(0xcccccccccccccccd) * n) >> (64 + 3);
+}
+
+template<> inline constexpr std::uint32_t FastDivision<std::uint32_t, 10u>(std::uint32_t n) {
+  return (static_cast<std::uint64_t>(0xcccccccd) * n) >> (32 + 3);
+}
+
+template<> inline constexpr std::uint64_t FastDivision<std::uint64_t, 5u>(std::uint64_t n) {
+  return (static_cast<__uint128_t>(0xcccccccccccccccd) * n) >> (64 + 2);
+}
+
+template<> inline constexpr std::uint32_t FastDivision<std::uint32_t, 5u>(std::uint32_t n) {
+  return (static_cast<std::uint64_t>(0xcccccccd) * n) >> (32 + 2);
+}
+#endif
+
+static_assert(FastDivision<std::uint64_t, 10000000000000000u>(9999999999999999u) == 0);
+static_assert(FastDivision<std::uint64_t, 10000000000000000u>(10000000000000000u) == 1);
+static_assert(FastDivision<std::uint64_t, 100000000000000u>(99999999999999u) == 0);
+static_assert(FastDivision<std::uint64_t, 100000000000000u>(100000000000000u) == 1);
+static_assert(FastDivision<std::uint32_t, 1000000u>(999999u) == 0);
+static_assert(FastDivision<std::uint32_t, 1000000u>(1000000u) == 1);
+static_assert(FastDivision<std::uint64_t, 10>(18446744073709551615u) == 1844674407370955161u);
+static_assert(FastDivision<std::uint32_t, 10>(4294967295u) == 429496729u);
+static_assert(FastDivision<std::uint64_t, 5>(18446744073709551615u) == 3689348814741910323u);
+static_assert(FastDivision<std::uint32_t, 5>(4294967295u) == 858993459u);
+}
+#endif
-- 
2.7.4