[flang] Better unsigned division by constants

author peter klausler <pklausler@nvidia.com>

Tue, 20 Aug 2019 23:38:48 +0000 (16:38 -0700)

committer peter klausler <pklausler@nvidia.com>

Fri, 23 Aug 2019 18:31:37 +0000 (11:31 -0700)
author peter klausler <pklausler@nvidia.com>
Tue, 20 Aug 2019 23:38:48 +0000 (16:38 -0700)
committer peter klausler <pklausler@nvidia.com>
Fri, 23 Aug 2019 18:31:37 +0000 (11:31 -0700)
diff --git a/flang/lib/common/bit-population-count.h b/flang/lib/common/bit-population-count.h

index d7a254f..1a22138 100644 (file)
--- a/flang/lib/common/bit-population-count.h
+++ b/flang/lib/common/bit-population-count.h
@@ -86,7 +86,7 @@ template<typename UINT> inline constexpr int TrailingZeroBitCount(UINT x) {
    if ((x & 1) != 0) {
      return 0;  // fast path for odd values
    } else {
-    return BitPopulationCount(x ^ (x - 1)) - !!x;
+    return BitPopulationCount(static_cast<UINT>(x ^ (x - 1))) - !!x;
    }
  }
  }
diff --git a/flang/lib/common/unsigned-const-division.h b/flang/lib/common/unsigned-const-division.h

new file mode 100644 (file)

index 0000000..31d288c
--- /dev/null
+++ b/flang/lib/common/unsigned-const-division.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FORTRAN_COMMON_UNSIGNED_CONST_DIVISION_H_
+#define FORTRAN_COMMON_UNSIGNED_CONST_DIVISION_H_
+
+// Work around unoptimized implementations of unsigned integer division
+// by constant values in some compilers (looking at YOU, clang 7!) by
+// explicitly implementing integer division by constant divisors as
+// multiplication by a fixed-point reciprocal and a right shift.
+
+#include "bit-population-count.h"
+#include "leading-zero-bit-count.h"
+#include <cinttypes>
+#include <type_traits>
+
+namespace Fortran::common {
+
+// Precomputed fixed-point reciprocal of an unsigned divisor.  For(d) chooses
+// a shift count and a multiplier so that Divide(n) computes
+// (multiplier * n) >> shift with a product twice as wide as UINT.
+//
+// The multiplier is limited to the width of UINT, so the result is NOT the
+// exact quotient for every divisor: e.g., with 32-bit operands and d == 7,
+// Divide(4294967291) yields 613566756 rather than 613566755.  IsExactFor(d)
+// tells whether Divide() can be trusted for every dividend.
+template<typename UINT> class FixedPointReciprocal {
+public:
+  using type = UINT;
+
+private:
+  static_assert(std::is_unsigned_v<type>);
+  static constexpr int bits{static_cast<int>(8 * sizeof(type))};
+  static_assert(bits <= 64);
+  // Unsigned type wide enough to hold the product of two "type" values.
+  using Big = std::conditional_t<(bits <= 32), std::uint64_t, __uint128_t>;
+
+public:
+  // Builds the reciprocal of the divisor n:
+  //   n == 0:            multiplier 0, so Divide() always yields 0;
+  //   n a power of two:  multiplier 1, shift TrailingZeroBitCount(n);
+  //   otherwise:         shift = bits - 1 + BitsNeededFor(n) and
+  //                      multiplier = ceil(2**shift / n).
+  static constexpr FixedPointReciprocal For(type n) {
+    if (n == 0) {
+      return {0, 0};
+    } else if ((n & (n - 1)) == 0) {  // n is a power of two
+      return {TrailingZeroBitCount(n), 1};
+    } else {
+      int shift{bits - 1 + BitsNeededFor(n)};
+      return {shift, static_cast<type>(((Big{1} << shift) + n - 1) / n)};
+    }
+  }
+
+  // True when For(n).Divide(x) == x / n is guaranteed for every x of "type".
+  // This is a sufficient (slightly conservative) test; a divisor that fails
+  // it should be divided with the built-in operator instead.  n == 0 is
+  // reported as exact so that the defined always-zero result of For(0) is
+  // kept rather than replaced by a division by zero.
+  static constexpr bool IsExactFor(type n) {
+    if (n == 0 || (n & (n - 1)) == 0) {
+      return true;  // constant zero, or a pure shift: no rounding involved
+    } else {
+      int shift{bits - 1 + BitsNeededFor(n)};
+      Big power{Big{1} << shift};
+      Big multiplier{(power + n - 1) / n};
+      // Rounding the multiplier up makes multiplier * n overshoot 2**shift
+      // by "excess" (< n).  The computed quotient cannot be too large as
+      // long as excess * dividend < 2**shift for the largest dividend.
+      Big excess{multiplier * n - power};
+      Big maxDividend{static_cast<type>(~type{0})};
+      return excess * maxDividend < power;
+    }
+  }
+
+  // Returns (multiplier * n) >> shift, i.e. n divided by the divisor that
+  // was passed to For() whenever IsExactFor() holds for that divisor.
+  constexpr type Divide(type n) const {
+    return static_cast<type>((static_cast<Big>(reciprocal_) * n) >> shift_);
+  }
+
+private:
+  constexpr FixedPointReciprocal(int s, type r) : shift_{s}, reciprocal_{r} {}
+
+  int shift_;  // right shift applied to the double-width product
+  type reciprocal_;  // fixed-point multiplier
+};
+
+static_assert(FixedPointReciprocal<std::uint32_t>::For(5).Divide(2000000000u) ==
+    400000000u);
+static_assert(FixedPointReciprocal<std::uint64_t>::For(10).Divide(
+                  10000000000000000u) == 1000000000000000u);
+static_assert(FixedPointReciprocal<std::uint32_t>::IsExactFor(10));
+static_assert(FixedPointReciprocal<std::uint64_t>::IsExactFor(10));
+static_assert(!FixedPointReciprocal<std::uint32_t>::IsExactFor(7));
+
+// Divides n by the compile-time constant DENOM.  The reciprocal
+// multiplication is used only when it is known to give the exact quotient;
+// __uint128_t (for which no wider product type is available) and divisors
+// whose reciprocal would round incorrectly use the built-in operator.
+template<typename UINT, UINT DENOM>
+inline constexpr UINT DivideUnsignedBy(UINT n) {
+  if constexpr (std::is_same_v<UINT, __uint128_t>) {
+    return n / DENOM;
+  } else if constexpr (FixedPointReciprocal<UINT>::IsExactFor(DENOM)) {
+    return FixedPointReciprocal<UINT>::For(DENOM).Divide(n);
+  } else {
+    return n / DENOM;
+  }
+}
+
+// 7 has no exact 32-bit reciprocal; the fallback must still be correct.
+static_assert(DivideUnsignedBy<std::uint32_t, 7>(4294967291u) == 613566755u);
+static_assert(DivideUnsignedBy<std::uint32_t, 10>(4294967295u) == 429496729u);
+}
+#endif
diff --git a/flang/lib/decimal/big-radix-floating-point.h b/flang/lib/decimal/big-radix-floating-point.h

index 6884964..4d64039 100644 (file)
--- a/flang/lib/decimal/big-radix-floating-point.h
+++ b/flang/lib/decimal/big-radix-floating-point.h
@@ -27,9 +27,9 @@
  
  #include "binary-floating-point.h"
  #include "decimal.h"
-#include "int-divide-workaround.h"
  #include "../common/bit-population-count.h"
  #include "../common/leading-zero-bit-count.h"
+#include "../common/unsigned-const-division.h"
  #include <cinttypes>
  #include <limits>
  #include <type_traits>
@@ -134,7 +134,7 @@ private:
          std::is_same_v<UINT, __uint128_t> || std::is_unsigned_v<UINT>);
      SetToZero();
      while (n != 0) {
-      auto q{FastDivision<UINT, 10>(n)};
+      auto q{common::DivideUnsignedBy<UINT, 10>(n)};
        if (n != 10 * q) {
          break;
        }
@@ -148,7 +148,7 @@ private:
        return 0;
      } else {
        while (n != 0 && digits_ < digitLimit_) {
-        auto q{FastDivision<UINT, radix>(n)};
+        auto q{common::DivideUnsignedBy<UINT, radix>(n)};
          digit_[digits_++] = n - radix * q;
          n = q;
        }
@@ -196,7 +196,7 @@ private:
      Digit remainder{0};
      for (int j{digits_ - 1}; j >= 0; --j) {
        // N.B. Because DIVISOR is a constant, these operations should be cheap.
-      Digit q{FastDivision<Digit, DIVISOR>(digit_[j])};
+      Digit q{common::DivideUnsignedBy<Digit, DIVISOR>(digit_[j])};
        Digit nrem{digit_[j] - DIVISOR * q};
        digit_[j] = q + (radix / DIVISOR) * remainder;
        remainder = nrem;
@@ -246,7 +246,7 @@ private:
    template<int N> int MultiplyByHelper(int carry = 0) {
      for (int j{0}; j < digits_; ++j) {
        auto v{N * digit_[j] + carry};
-      carry = FastDivision<Digit, radix>(v);
+      carry = common::DivideUnsignedBy<Digit, radix>(v);
        digit_[j] = v - carry * radix;  // i.e., v % radix
      }
      return carry;
diff --git a/flang/lib/decimal/binary-to-decimal.cc b/flang/lib/decimal/binary-to-decimal.cc

index cec8ba2..4e9909b 100644 (file)
--- a/flang/lib/decimal/binary-to-decimal.cc
+++ b/flang/lib/decimal/binary-to-decimal.cc
@@ -139,7 +139,7 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToDecimal(char *buffer,
    // Treat the MSD specially: don't emit leading zeroes.
    Digit dig{digit_[digits_ - 1]};
    for (int k{0}; k < LOG10RADIX; k += 2) {
-    Digit d{FastDivision<Digit, hundredth>(dig)};
+    Digit d{common::DivideUnsignedBy<Digit, hundredth>(dig)};
      dig = 100 * (dig - d * hundredth);
      const char *q{lut + 2 * d};
      if (q[0] != '0' || p > start) {
@@ -152,7 +152,7 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToDecimal(char *buffer,
    for (int j{digits_ - 1}; j-- > 0;) {
      Digit dig{digit_[j]};
      for (int k{0}; k < log10Radix; k += 2) {
-      Digit d{FastDivision<Digit, hundredth>(dig)};
+      Digit d{common::DivideUnsignedBy<Digit, hundredth>(dig)};
        dig = 100 * (dig - d * hundredth);
        const char *q = lut + 2 * d;
        *p++ = q[0];
@@ -276,9 +276,9 @@ void BigRadixFloatingPointNumber<PREC, LOG10RADIX>::Minimize(
    Digit least{less.digit_[offset]};
    Digit my{digit_[0]};
    while (true) {
-    Digit q{FastDivision<Digit, 10>(my)};
+    Digit q{common::DivideUnsignedBy<Digit, 10>(my)};
      Digit r{my - 10 * q};
-    Digit lq{FastDivision<Digit, 10>(least)};
+    Digit lq{common::DivideUnsignedBy<Digit, 10>(least)};
      Digit lr{least - 10 * lq};
      if (r != 0 && lq == q) {
        Digit sub{(r - lr) >> 1};
diff --git a/flang/lib/decimal/int-divide-workaround.h b/flang/lib/decimal/int-divide-workaround.h

deleted file mode 100644 (file)

index 21d2cb6..0000000
--- a/flang/lib/decimal/int-divide-workaround.h
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef INT_DIVIDE_H_
-#define INT_DIVIDE_H_
-
-// Work around unoptimized implementations of unsigned integer division
-// by constant values in some compilers (looking at YOU, clang 7!)
-
-#ifdef __clang__
-#if __clang_major__ < 8
-#define USE_INT_DIVIDE_WORKAROUNDS 1
-#endif
-#endif
-
-#include <cinttypes>
-
-namespace Fortran::decimal {
-
-template<typename UINT, UINT DENOM> inline constexpr UINT FastDivision(UINT n) {
-  return n / DENOM;
-}
-
-#if USE_INT_DIVIDE_WORKAROUNDS
-template<>
-inline constexpr std::uint64_t FastDivision<std::uint64_t, 10000000000000000u>(
-    std::uint64_t n) {
-  return (__uint128_t{0x39a5652fb1137857} * n) >> (64 + 51);
-}
-
-template<>
-inline constexpr std::uint64_t FastDivision<std::uint64_t, 100000000000000u>(
-    std::uint64_t n) {
-  return (__uint128_t{0xb424dc35095cd81} * n) >> (64 + 42);
-}
-
-template<>
-inline constexpr std::uint32_t FastDivision<std::uint32_t, 1000000u>(
-    std::uint32_t n) {
-  return (std::uint64_t{0x431bde83} * n) >> (32 + 18);
-}
-
-template<>
-inline constexpr std::uint32_t FastDivision<std::uint32_t, 10000u>(
-    std::uint32_t n) {
-  return (std::uint64_t{0xd1b71759} * n) >> (32 + 13);
-}
-
-template<>
-inline constexpr std::uint64_t FastDivision<std::uint64_t, 10u>(
-    std::uint64_t n) {
-  return (__uint128_t{0xcccccccccccccccd} * n) >> (64 + 3);
-}
-
-template<>
-inline constexpr std::uint32_t FastDivision<std::uint32_t, 10u>(
-    std::uint32_t n) {
-  return (std::uint64_t{0xcccccccd} * n) >> (32 + 3);
-}
-
-template<>
-inline constexpr std::uint64_t FastDivision<std::uint64_t, 5u>(
-    std::uint64_t n) {
-  return (__uint128_t{0xcccccccccccccccd} * n) >> (64 + 2);
-}
-
-template<>
-inline constexpr std::uint32_t FastDivision<std::uint32_t, 5u>(
-    std::uint32_t n) {
-  return (std::uint64_t{0xcccccccd} * n) >> (32 + 2);
-}
-#endif
-
-static_assert(
-    FastDivision<std::uint64_t, 10000000000000000u>(9999999999999999u) == 0);
-static_assert(
-    FastDivision<std::uint64_t, 10000000000000000u>(10000000000000000u) == 1);
-static_assert(
-    FastDivision<std::uint64_t, 100000000000000u>(99999999999999u) == 0);
-static_assert(
-    FastDivision<std::uint64_t, 100000000000000u>(100000000000000u) == 1);
-static_assert(FastDivision<std::uint32_t, 1000000u>(999999u) == 0);
-static_assert(FastDivision<std::uint32_t, 1000000u>(1000000u) == 1);
-static_assert(FastDivision<std::uint64_t, 10>(18446744073709551615u) ==
-    1844674407370955161u);
-static_assert(FastDivision<std::uint32_t, 10>(4294967295u) == 429496729u);
-static_assert(FastDivision<std::uint64_t, 5>(18446744073709551615u) ==
-    3689348814741910323u);
-static_assert(FastDivision<std::uint32_t, 5>(4294967295u) == 858993459u);
-}
-#endif
diff --git a/flang/test/decimal/thorough-test.cc b/flang/test/decimal/thorough-test.cc

index 5d4644b..17d4d1f 100644 (file)
--- a/flang/test/decimal/thorough-test.cc
+++ b/flang/test/decimal/thorough-test.cc
@@ -18,9 +18,9 @@
  #include <cstring>
  #include <iostream>
  
-static constexpr int incr{1};  // steps through all values
-static constexpr bool doNegative{true};
-static constexpr bool doMinimize{true};
+static constexpr int incr{10};  // step size; 1 would step through all values
+static constexpr bool doNegative{};  // value-initialized to false (was true)
+static constexpr bool doMinimize{};  // value-initialized to false (was true)
  
  using namespace Fortran::decimal;
author	peter klausler <pklausler@nvidia.com>
	Tue, 20 Aug 2019 23:38:48 +0000 (16:38 -0700)
committer	peter klausler <pklausler@nvidia.com>
	Fri, 23 Aug 2019 18:31:37 +0000 (11:31 -0700)
flang/lib/common/bit-population-count.h		patch \| blob \| history
flang/lib/common/unsigned-const-division.h	[new file with mode: 0644]	patch \| blob
flang/lib/decimal/big-radix-floating-point.h		patch \| blob \| history
flang/lib/decimal/binary-to-decimal.cc		patch \| blob \| history
flang/lib/decimal/int-divide-workaround.h	[deleted file]	patch \| blob \| history
flang/test/decimal/thorough-test.cc		patch \| blob \| history