From 64a0a58bed55063fb311da8c47adcd58f0334563 Mon Sep 17 00:00:00 2001 From: peter klausler Date: Tue, 20 Aug 2019 12:58:27 -0700 Subject: [PATCH] [flang] Work around slow clang-7 Original-commit: flang-compiler/f18@ed634d72e3d51d56010ef5aedef614f3cda6e076 Reviewed-on: https://github.com/flang-compiler/f18/pull/671 Tree-same-pre-rewrite: false --- flang/lib/decimal/big-radix-floating-point.h | 13 ++--- flang/lib/decimal/binary-to-decimal.cc | 8 +-- flang/lib/decimal/decimal-to-binary.cc | 22 ++++++-- flang/lib/decimal/int-divide-workaround.h | 80 ++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 15 deletions(-) create mode 100644 flang/lib/decimal/int-divide-workaround.h diff --git a/flang/lib/decimal/big-radix-floating-point.h b/flang/lib/decimal/big-radix-floating-point.h index 445a144..49e9efc 100644 --- a/flang/lib/decimal/big-radix-floating-point.h +++ b/flang/lib/decimal/big-radix-floating-point.h @@ -27,6 +27,7 @@ #include "binary-floating-point.h" #include "decimal.h" +#include "int-divide-workaround.h" #include "../common/bit-population-count.h" #include "../common/leading-zero-bit-count.h" #include @@ -133,7 +134,7 @@ private: std::is_same_v || std::is_unsigned_v); SetToZero(); while (n != 0) { - auto q{n / 10}; + auto q{FastDivision(n)}; if (n != 10 * q) { break; } @@ -147,7 +148,7 @@ private: return 0; } else { while (n != 0 && digits_ < digitLimit_) { - auto q{n / radix}; + auto q{FastDivision(n)}; digit_[digits_++] = n - radix * q; n = q; } @@ -195,7 +196,7 @@ private: Digit remainder{0}; for (int j{digits_ - 1}; j >= 0; --j) { // N.B. Because DIVISOR is a constant, these operations should be cheap. - Digit q{digit_[j] / DIVISOR}; + Digit q{FastDivision(digit_[j])}; Digit nrem{digit_[j] - DIVISOR * q}; digit_[j] = q + (radix / DIVISOR) * remainder; remainder = nrem; @@ -244,9 +245,9 @@ private: template int MultiplyByHelper(int carry = 0) { for (int j{0}; j < digits_; ++j) { - digit_[j] = N * digit_[j] + carry; - carry = digit_[j] / radix; // N.B. radix is constant, this is fast - digit_[j] -= carry * radix; + auto v{N * digit_[j] + carry}; + carry = FastDivision(v); + digit_[j] = v - carry * radix; // i.e., v % radix } return carry; } diff --git a/flang/lib/decimal/binary-to-decimal.cc b/flang/lib/decimal/binary-to-decimal.cc index 6a8a188..cec8ba2 100644 --- a/flang/lib/decimal/binary-to-decimal.cc +++ b/flang/lib/decimal/binary-to-decimal.cc @@ -139,7 +139,7 @@ BigRadixFloatingPointNumber::ConvertToDecimal(char *buffer, // Treat the MSD specially: don't emit leading zeroes. Digit dig{digit_[digits_ - 1]}; for (int k{0}; k < LOG10RADIX; k += 2) { - Digit d{dig / hundredth}; + Digit d{FastDivision(dig)}; dig = 100 * (dig - d * hundredth); const char *q{lut + 2 * d}; if (q[0] != '0' || p > start) { @@ -152,7 +152,7 @@ BigRadixFloatingPointNumber::ConvertToDecimal(char *buffer, for (int j{digits_ - 1}; j-- > 0;) { Digit dig{digit_[j]}; for (int k{0}; k < log10Radix; k += 2) { - Digit d{dig / hundredth}; + Digit d{FastDivision(dig)}; dig = 100 * (dig - d * hundredth); const char *q = lut + 2 * d; *p++ = q[0]; @@ -276,9 +276,9 @@ void BigRadixFloatingPointNumber::Minimize( Digit least{less.digit_[offset]}; Digit my{digit_[0]}; while (true) { - Digit q{my / 10}; + Digit q{FastDivision(my)}; Digit r{my - 10 * q}; - Digit lq{least / 10}; + Digit lq{FastDivision(least)}; Digit lr{least - 10 * lq}; if (r != 0 && lq == q) { Digit sub{(r - lr) >> 1}; diff --git a/flang/lib/decimal/decimal-to-binary.cc b/flang/lib/decimal/decimal-to-binary.cc index a1fbec7..f388d64 100644 --- a/flang/lib/decimal/decimal-to-binary.cc +++ b/flang/lib/decimal/decimal-to-binary.cc @@ -50,8 +50,8 @@ bool BigRadixFloatingPointNumber::ParseNumber( if (q == start || (q == start + 1 && *start == '.')) { return false; // require at least one digit } - auto times{radix}; const char *d{q}; + // Strip off trailing zeroes if (point != nullptr) { while (d > firstDigit && d[-1] == '0') { --d; @@ -68,11 +68,12 @@ bool BigRadixFloatingPointNumber::ParseNumber( } } if (d == firstDigit) { - exponent_ = 0; + exponent_ = 0; // all zeros } if (point != nullptr) { exponent_ -= static_cast(d - point - 1); } + // Trim any excess digits const char *limit{firstDigit + maxDigits * log10Radix + (point != nullptr)}; if (d > limit) { inexact = true; @@ -85,7 +86,8 @@ bool BigRadixFloatingPointNumber::ParseNumber( } } } - while (d-- > firstDigit) { + // Rack the decimal digits up into big Digits. + for (auto times{radix}; d-- > firstDigit;) { if (*d != '.') { if (times == radix) { digit_[digits_++] = *d - '0'; @@ -253,7 +255,7 @@ BigRadixFloatingPointNumber::ConvertToBinary() { // large power of ten. Its radix point is defined to be to the right of its // digits, and "exponent_" is the power of ten by which it is to be scaled. Normalize(); - if (digits_ == 0) { + if (digits_ == 0) { // zero value if (isNegative_) { using Raw = typename Binary::RawType; Raw negZero{static_cast(1) << (Binary::bits - 1)}; @@ -263,12 +265,22 @@ BigRadixFloatingPointNumber::ConvertToBinary() { } } // The value is not zero. + IntermediateFloat f; +#if 0 // actually a small loss + // Make the value odd. + if (int trailing0s{common::TrailingZeroBitCount(digit_[0])}) { + f.AdjustExponent(trailing0s); + for (; trailing0s > log10Radix; trailing0s -= log10Radix) { + DivideByPowerOfTwo(log10Radix); + } + DivideByPowerOfTwo(trailing0s); + } +#endif // Shift our perspective on the radix (& decimal) point so that // it sits to the *left* of the digits. exponent_ += digits_ * log10Radix; // Apply any negative decimal exponent by multiplication // by a power of two, adjusting the binary exponent to compensate. - IntermediateFloat f; while (exponent_ < log10Radix) { f.AdjustExponent(-9); digitLimit_ = digits_; diff --git a/flang/lib/decimal/int-divide-workaround.h b/flang/lib/decimal/int-divide-workaround.h new file mode 100644 index 0000000..8ccb459 --- /dev/null +++ b/flang/lib/decimal/int-divide-workaround.h @@ -0,0 +1,80 @@ +// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef INT_DIVIDE_H_ +#define INT_DIVIDE_H_ + +// Work around unoptimized implementations of unsigned integer division +// by constant values in some compilers (looking at YOU, clang 7!) + +#ifdef __clang__ +#if __clang_major__ < 8 +#define USE_INT_DIVIDE_WORKAROUNDS 1 +#endif +#endif + +#include + +namespace Fortran::decimal { + +template inline constexpr UINT FastDivision(UINT n) { + return n / DENOM; +} + +#if USE_INT_DIVIDE_WORKAROUNDS +template<> inline constexpr std::uint64_t FastDivision(std::uint64_t n) { + return (static_cast<__uint128_t>(0x39a5652fb1137857) * n) >> (64 + 51); +} + +template<> inline constexpr std::uint64_t FastDivision(std::uint64_t n) { + return (static_cast<__uint128_t>(0xb424dc35095cd81) * n) >> (64 + 42); +} + +template<> inline constexpr std::uint32_t FastDivision(std::uint32_t n) { + return (static_cast(0x431bde83) * n) >> (32 + 18); +} + +template<> inline constexpr std::uint32_t FastDivision(std::uint32_t n) { + return (static_cast(0xd1b71759) * n) >> (32 + 13); +} + +template<> inline constexpr std::uint64_t FastDivision(std::uint64_t n) { + return (static_cast<__uint128_t>(0xcccccccccccccccd) * n) >> (64 + 3); +} + +template<> inline constexpr std::uint32_t FastDivision(std::uint32_t n) { + return (static_cast(0xcccccccd) * n) >> (32 + 3); +} + +template<> inline constexpr std::uint64_t FastDivision(std::uint64_t n) { + return (static_cast<__uint128_t>(0xcccccccccccccccd) * n) >> (64 + 2); +} + +template<> inline constexpr std::uint32_t FastDivision(std::uint32_t n) { + return (static_cast(0xcccccccd) * n) >> (32 + 2); +} +#endif + +static_assert(FastDivision(9999999999999999u) == 0); +static_assert(FastDivision(10000000000000000u) == 1); +static_assert(FastDivision(99999999999999u) == 0); +static_assert(FastDivision(100000000000000u) == 1); +static_assert(FastDivision(999999u) == 0); +static_assert(FastDivision(1000000u) == 1); +static_assert(FastDivision(18446744073709551615u) == 1844674407370955161u); +static_assert(FastDivision(4294967295u) == 429496729u); +static_assert(FastDivision(18446744073709551615u) == 3689348814741910323u); +static_assert(FastDivision(4294967295u) == 858993459u); +} +#endif -- 2.7.4