From: Matt Arsenault Date: Tue, 13 Jan 2015 20:53:23 +0000 (+0000) Subject: R600: Implement getRecipEstimate X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=bf0db918b23cb28b8ed35fe0371d0d9be6fd27f4;p=platform%2Fupstream%2Fllvm.git R600: Implement getRecipEstimate This requires a new hook to prevent expanding sqrt in terms of rsqrt and reciprocal. v_rcp_f32, v_rsq_f32, and v_sqrt_f32 are all the same rate, so this expansion would just double the number of instructions and cycles. llvm-svn: 225828 --- diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h index b18c15b..a6e46ce 100644 --- a/llvm/include/llvm/Target/TargetLowering.h +++ b/llvm/include/llvm/Target/TargetLowering.h @@ -217,6 +217,11 @@ public: /// several shifts, adds, and multiplies for this target. bool isIntDivCheap() const { return IntDivIsCheap; } + /// Return true if sqrt(x) is as cheap or cheaper than 1 / rsqrt(x) + bool isFsqrtCheap() const { + return FsqrtIsCheap; + } + /// Returns true if target has indicated at least one type should be bypassed. bool isSlowDivBypassed() const { return !BypassSlowDivWidths.empty(); } @@ -1183,7 +1188,11 @@ protected: /// possible, should be replaced by an alternate sequence of instructions not /// containing an integer divide. void setIntDivIsCheap(bool isCheap = true) { IntDivIsCheap = isCheap; } - + + /// Tells the code generator that fsqrt is cheap, and should not be replaced + /// with an alternative sequence of instructions. + void setFsqrtIsCheap(bool isCheap = true) { FsqrtIsCheap = isCheap; } + /// Tells the code generator that this target supports floating point /// exceptions and cares about preserving floating point exception behavior. void setHasFloatingPointExceptions(bool FPExceptions = true) { @@ -1625,6 +1634,9 @@ private: /// unconditionally. bool IntDivIsCheap; + // Don't expand fsqrt with an approximation based on the inverse sqrt. + bool FsqrtIsCheap; + /// Tells the code generator to bypass slow divide or remainder /// instructions. For example, BypassSlowDivWidths[32,8] tells the code /// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 171ee1e..e57c5a2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7538,7 +7538,8 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { } SDValue DAGCombiner::visitFSQRT(SDNode *N) { - if (DAG.getTarget().Options.UnsafeFPMath) { + if (DAG.getTarget().Options.UnsafeFPMath && + !TLI.isFsqrtCheap()) { // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5) if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) { EVT VT = RV.getValueType(); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index fae54d2..9b2fdff 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -710,6 +710,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) HasMultipleConditionRegisters = false; HasExtractBitsInsn = false; IntDivIsCheap = false; + FsqrtIsCheap = false; Pow2SDivIsCheap = false; JumpIsExpensive = false; PredictableSelectIsExpensive = false; diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp index 55a35f7..1ad2a69 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp @@ -403,6 +403,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : // large sequence of instructions. setIntDivIsCheap(false); setPow2SDivIsCheap(false); + setFsqrtIsCheap(true); // FIXME: Need to really handle these. MaxStoresPerMemcpy = 4096; @@ -2585,6 +2586,28 @@ SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, return SDValue(); } +SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32) { + // Reciprocal, < 1 ulp error. + // + // This reciprocal approximation converges to < 0.5 ulp error with one + // newton rhapson performed with two fused multiple adds (FMAs). + + RefinementSteps = 0; + return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); + } + + // TODO: There is also f64 rcp instruction, but the documentation is less + // clear on its precision. + + return SDValue(); +} + static void computeKnownBitsForMinMax(const SDValue Op0, const SDValue Op1, APInt &KnownZero, diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h index 21c8e98..15f529c 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.h +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h @@ -171,6 +171,9 @@ public: DAGCombinerInfo &DCI, unsigned &RefinementSteps, bool &UseOneConstNR) const override; + SDValue getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const {