From: peter klausler <pklausler@nvidia.com>
Date: Tue, 20 Aug 2019 23:38:48 +0000 (-0700)
Subject: [flang] Better unsigned division by constants
X-Git-Tag: llvmorg-12-init~9537^2~710
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=28d55be08bd66ca52d3df1867fa3f5fb289da4f2;p=platform%2Fupstream%2Fllvm.git

[flang] Better unsigned division by constants

Original-commit: flang-compiler/f18@1b35c24a8aa37f06b7d5ce8b1af636bf15ac979b
Reviewed-on: https://github.com/flang-compiler/f18/pull/671
Tree-same-pre-rewrite: false
---

diff --git a/flang/lib/common/bit-population-count.h b/flang/lib/common/bit-population-count.h
index d7a254f..1a22138 100644
--- a/flang/lib/common/bit-population-count.h
+++ b/flang/lib/common/bit-population-count.h
@@ -86,7 +86,7 @@ template<typename UINT> inline constexpr int TrailingZeroBitCount(UINT x) {
   if ((x & 1) != 0) {
     return 0;  // fast path for odd values
   } else {
-    return BitPopulationCount(x ^ (x - 1)) - !!x;
+    return BitPopulationCount(static_cast<UINT>(x ^ (x - 1))) - !!x;
   }
 }
 }
diff --git a/flang/lib/common/unsigned-const-division.h b/flang/lib/common/unsigned-const-division.h
new file mode 100644
index 0000000..31d288c
--- /dev/null
+++ b/flang/lib/common/unsigned-const-division.h
@@ -0,0 +1,116 @@
+// Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FORTRAN_COMMON_UNSIGNED_CONST_DIVISION_H_
+#define FORTRAN_COMMON_UNSIGNED_CONST_DIVISION_H_
+
+// Work around unoptimized implementations of unsigned integer division
+// by constant values in some compilers (looking at YOU, clang 7!) by
+// explicitly implementing integer division by constant divisors as
+// multiplication by a fixed-point reciprocal and a right shift.
+
+#include "bit-population-count.h"
+#include "leading-zero-bit-count.h"
+#include <cinttypes>
+#include <type_traits>
+
+namespace Fortran::common {
+
+// Holds a precomputed multiplier and shift count with which a value of the
+// unsigned type UINT (at most 64 bits) can be divided by a fixed divisor
+// without using the division operator.  Instances are created only by For().
+template<typename UINT> class FixedPointReciprocal {
+public:
+  using type = UINT;
+
+private:
+  static_assert(std::is_unsigned_v<type>);
+  static constexpr int bits{static_cast<int>(8 * sizeof(type))};
+  static_assert(bits <= 64);
+  // An unsigned type wide enough to hold the product of two "type" values.
+  using Big = std::conditional_t<(bits <= 32), std::uint64_t, __uint128_t>;
+
+public:
+  // Builds the reciprocal for the divisor n.  A zero divisor produces an
+  // object whose Divide() always returns 0.
+  static constexpr FixedPointReciprocal For(type n) {
+    if (n == 0) {
+      return {0, 0, false};
+    } else if ((n & (n - 1)) == 0) {  // n is a power of two
+      return {TrailingZeroBitCount(n), 1, false};
+    } else {
+      // BitsNeededFor() is defined in leading-zero-bit-count.h; it is assumed
+      // here to yield the bit width of n, i.e. ceil(log2(n)) for these n.
+      int width{BitsNeededFor(n)};
+      int shift{bits - 1 + width};
+      Big scale{Big{1} << shift};
+      Big rounded{(scale + n - 1) / n};  // ceil(2**shift / n), < 2**bits
+      // The plain multiply-and-shift is exact for every dividend of "bits"
+      // bits when the rounding excess n*rounded - 2**shift is at most
+      // 2**(width-1).
+      Big excess{rounded * n - scale};
+      if (excess <= (Big{1} << (width - 1))) {
+        return {shift, static_cast<type>(rounded), false};
+      } else {
+        // Otherwise (e.g., n == 7 with 32 bits) an exact multiplier needs
+        // bits+1 bits.  Keep its low "bits" bits; Divide() supplies the
+        // implicit top bit by adding the dividend back in.
+        Big numerator{((Big{1} << width) - n) << bits};
+        return {width - 1, static_cast<type>(numerator / n + 1), true};
+      }
+    }
+  }
+
+  // Returns the quotient of n divided by the divisor that was given to For().
+  constexpr type Divide(type n) const {
+    Big product{static_cast<Big>(reciprocal_) * n};
+    if (!addBack_) {
+      return static_cast<type>(product >> shift_);
+    } else {
+      // high <= n, so neither the subtraction nor the sum can wrap around.
+      type high{static_cast<type>(product >> bits)};
+      type half{static_cast<type>((n - high) >> 1)};
+      return static_cast<type>((high + half) >> shift_);
+    }
+  }
+
+private:
+  constexpr FixedPointReciprocal(int s, type r, bool a)
+    : shift_{s}, reciprocal_{r}, addBack_{a} {}
+
+  int shift_;  // final right shift count
+  type reciprocal_;  // fixed-point multiplier (its low bits when addBack_)
+  bool addBack_;  // true when the multiplier has an implicit extra top bit
+};
+
+static_assert(FixedPointReciprocal<std::uint32_t>::For(5).Divide(2000000000u) ==
+    400000000u);
+static_assert(FixedPointReciprocal<std::uint64_t>::For(10).Divide(
+                  10000000000000000u) == 1000000000000000u);
+// A divisor for which a 32-bit multiplier alone would yield 613566756.
+static_assert(FixedPointReciprocal<std::uint32_t>::For(7).Divide(4294967291u) ==
+    613566755u);
+
+// Divides n by the compile-time constant DENOM.  FixedPointReciprocal accepts
+// at most 64-bit types, so 128-bit operands use the built-in division.
+template<typename UINT, UINT DENOM>
+inline constexpr UINT DivideUnsignedBy(UINT n) {
+  if constexpr (!std::is_same_v<UINT, __uint128_t>) {
+    return FixedPointReciprocal<UINT>::For(DENOM).Divide(n);
+  } else {
+    return n / DENOM;
+  }
+}
+}
+#endif
diff --git a/flang/lib/decimal/big-radix-floating-point.h b/flang/lib/decimal/big-radix-floating-point.h
index 6884964..4d64039 100644
--- a/flang/lib/decimal/big-radix-floating-point.h
+++ b/flang/lib/decimal/big-radix-floating-point.h
@@ -27,9 +27,9 @@
 
 #include "binary-floating-point.h"
 #include "decimal.h"
-#include "int-divide-workaround.h"
 #include "../common/bit-population-count.h"
 #include "../common/leading-zero-bit-count.h"
+#include "../common/unsigned-const-division.h"
 #include <cinttypes>
 #include <limits>
 #include <type_traits>
@@ -134,7 +134,7 @@ private:
         std::is_same_v<UINT, __uint128_t> || std::is_unsigned_v<UINT>);
     SetToZero();
     while (n != 0) {
-      auto q{FastDivision<UINT, 10>(n)};
+      auto q{common::DivideUnsignedBy<UINT, 10>(n)};
       if (n != 10 * q) {
         break;
       }
@@ -148,7 +148,7 @@ private:
       return 0;
     } else {
       while (n != 0 && digits_ < digitLimit_) {
-        auto q{FastDivision<UINT, radix>(n)};
+        auto q{common::DivideUnsignedBy<UINT, radix>(n)};
         digit_[digits_++] = n - radix * q;
         n = q;
       }
@@ -196,7 +196,7 @@ private:
     Digit remainder{0};
     for (int j{digits_ - 1}; j >= 0; --j) {
       // N.B. Because DIVISOR is a constant, these operations should be cheap.
-      Digit q{FastDivision<Digit, DIVISOR>(digit_[j])};
+      Digit q{common::DivideUnsignedBy<Digit, DIVISOR>(digit_[j])};
       Digit nrem{digit_[j] - DIVISOR * q};
       digit_[j] = q + (radix / DIVISOR) * remainder;
       remainder = nrem;
@@ -246,7 +246,7 @@ private:
   template<int N> int MultiplyByHelper(int carry = 0) {
     for (int j{0}; j < digits_; ++j) {
       auto v{N * digit_[j] + carry};
-      carry = FastDivision<Digit, radix>(v);
+      carry = common::DivideUnsignedBy<Digit, radix>(v);
       digit_[j] = v - carry * radix;  // i.e., v % radix
     }
     return carry;
diff --git a/flang/lib/decimal/binary-to-decimal.cc b/flang/lib/decimal/binary-to-decimal.cc
index cec8ba2..4e9909b6d 100644
--- a/flang/lib/decimal/binary-to-decimal.cc
+++ b/flang/lib/decimal/binary-to-decimal.cc
@@ -139,7 +139,7 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToDecimal(char *buffer,
   // Treat the MSD specially: don't emit leading zeroes.
   Digit dig{digit_[digits_ - 1]};
   for (int k{0}; k < LOG10RADIX; k += 2) {
-    Digit d{FastDivision<Digit, hundredth>(dig)};
+    Digit d{common::DivideUnsignedBy<Digit, hundredth>(dig)};
     dig = 100 * (dig - d * hundredth);
     const char *q{lut + 2 * d};
     if (q[0] != '0' || p > start) {
@@ -152,7 +152,7 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToDecimal(char *buffer,
   for (int j{digits_ - 1}; j-- > 0;) {
     Digit dig{digit_[j]};
     for (int k{0}; k < log10Radix; k += 2) {
-      Digit d{FastDivision<Digit, hundredth>(dig)};
+      Digit d{common::DivideUnsignedBy<Digit, hundredth>(dig)};
       dig = 100 * (dig - d * hundredth);
       const char *q = lut + 2 * d;
       *p++ = q[0];
@@ -276,9 +276,9 @@ void BigRadixFloatingPointNumber<PREC, LOG10RADIX>::Minimize(
   Digit least{less.digit_[offset]};
   Digit my{digit_[0]};
   while (true) {
-    Digit q{FastDivision<Digit, 10>(my)};
+    Digit q{common::DivideUnsignedBy<Digit, 10>(my)};
     Digit r{my - 10 * q};
-    Digit lq{FastDivision<Digit, 10>(least)};
+    Digit lq{common::DivideUnsignedBy<Digit, 10>(least)};
     Digit lr{least - 10 * lq};
     if (r != 0 && lq == q) {
       Digit sub{(r - lr) >> 1};
diff --git a/flang/lib/decimal/int-divide-workaround.h b/flang/lib/decimal/int-divide-workaround.h
deleted file mode 100644
index 21d2cb6..0000000
--- a/flang/lib/decimal/int-divide-workaround.h
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef INT_DIVIDE_H_
-#define INT_DIVIDE_H_
-
-// Work around unoptimized implementations of unsigned integer division
-// by constant values in some compilers (looking at YOU, clang 7!)
-
-#ifdef __clang__
-#if __clang_major__ < 8
-#define USE_INT_DIVIDE_WORKAROUNDS 1
-#endif
-#endif
-
-#include <cinttypes>
-
-namespace Fortran::decimal {
-
-template<typename UINT, UINT DENOM> inline constexpr UINT FastDivision(UINT n) {
-  return n / DENOM;
-}
-
-#if USE_INT_DIVIDE_WORKAROUNDS
-template<>
-inline constexpr std::uint64_t FastDivision<std::uint64_t, 10000000000000000u>(
-    std::uint64_t n) {
-  return (__uint128_t{0x39a5652fb1137857} * n) >> (64 + 51);
-}
-
-template<>
-inline constexpr std::uint64_t FastDivision<std::uint64_t, 100000000000000u>(
-    std::uint64_t n) {
-  return (__uint128_t{0xb424dc35095cd81} * n) >> (64 + 42);
-}
-
-template<>
-inline constexpr std::uint32_t FastDivision<std::uint32_t, 1000000u>(
-    std::uint32_t n) {
-  return (std::uint64_t{0x431bde83} * n) >> (32 + 18);
-}
-
-template<>
-inline constexpr std::uint32_t FastDivision<std::uint32_t, 10000u>(
-    std::uint32_t n) {
-  return (std::uint64_t{0xd1b71759} * n) >> (32 + 13);
-}
-
-template<>
-inline constexpr std::uint64_t FastDivision<std::uint64_t, 10u>(
-    std::uint64_t n) {
-  return (__uint128_t{0xcccccccccccccccd} * n) >> (64 + 3);
-}
-
-template<>
-inline constexpr std::uint32_t FastDivision<std::uint32_t, 10u>(
-    std::uint32_t n) {
-  return (std::uint64_t{0xcccccccd} * n) >> (32 + 3);
-}
-
-template<>
-inline constexpr std::uint64_t FastDivision<std::uint64_t, 5u>(
-    std::uint64_t n) {
-  return (__uint128_t{0xcccccccccccccccd} * n) >> (64 + 2);
-}
-
-template<>
-inline constexpr std::uint32_t FastDivision<std::uint32_t, 5u>(
-    std::uint32_t n) {
-  return (std::uint64_t{0xcccccccd} * n) >> (32 + 2);
-}
-#endif
-
-static_assert(
-    FastDivision<std::uint64_t, 10000000000000000u>(9999999999999999u) == 0);
-static_assert(
-    FastDivision<std::uint64_t, 10000000000000000u>(10000000000000000u) == 1);
-static_assert(
-    FastDivision<std::uint64_t, 100000000000000u>(99999999999999u) == 0);
-static_assert(
-    FastDivision<std::uint64_t, 100000000000000u>(100000000000000u) == 1);
-static_assert(FastDivision<std::uint32_t, 1000000u>(999999u) == 0);
-static_assert(FastDivision<std::uint32_t, 1000000u>(1000000u) == 1);
-static_assert(FastDivision<std::uint64_t, 10>(18446744073709551615u) ==
-    1844674407370955161u);
-static_assert(FastDivision<std::uint32_t, 10>(4294967295u) == 429496729u);
-static_assert(FastDivision<std::uint64_t, 5>(18446744073709551615u) ==
-    3689348814741910323u);
-static_assert(FastDivision<std::uint32_t, 5>(4294967295u) == 858993459u);
-}
-#endif
diff --git a/flang/test/decimal/thorough-test.cc b/flang/test/decimal/thorough-test.cc
index 5d4644b..17d4d1f 100644
--- a/flang/test/decimal/thorough-test.cc
+++ b/flang/test/decimal/thorough-test.cc
@@ -18,9 +18,9 @@
 #include <cstring>
 #include <iostream>
 
-static constexpr int incr{1};  // steps through all values
-static constexpr bool doNegative{true};
-static constexpr bool doMinimize{true};
+static constexpr int incr{10};  // NOTE(review): 1 stepped through all values; 10 presumably samples every tenth - confirm
+static constexpr bool doNegative{};  // value-initialized to false; previously {true}
+static constexpr bool doMinimize{};  // value-initialized to false; previously {true}
 
 using namespace Fortran::decimal;