R600: Implement getRecipEstimate

author Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 13 Jan 2015 20:53:23 +0000 (20:53 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 13 Jan 2015 20:53:23 +0000 (20:53 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 13 Jan 2015 20:53:23 +0000 (20:53 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 13 Jan 2015 20:53:23 +0000 (20:53 +0000)
diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h

index b18c15b..a6e46ce 100644 (file)
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@@ -217,6 +217,11 @@ public:
    /// several shifts, adds, and multiplies for this target.
    bool isIntDivCheap() const { return IntDivIsCheap; }
  
+  /// Return true if sqrt(x) is as cheap as or cheaper than 1 / rsqrt(x).
+  bool isFsqrtCheap() const {
+    return FsqrtIsCheap;
+  }
+
    /// Returns true if target has indicated at least one type should be bypassed.
    bool isSlowDivBypassed() const { return !BypassSlowDivWidths.empty(); }
  
@@ -1183,7 +1188,11 @@ protected:
    /// possible, should be replaced by an alternate sequence of instructions not
    /// containing an integer divide.
    void setIntDivIsCheap(bool isCheap = true) { IntDivIsCheap = isCheap; }
-  
+
+  /// Tells the code generator that fsqrt is cheap, and should not be replaced
+  /// with an alternative sequence of instructions (e.g. an rsqrt estimate).
+  void setFsqrtIsCheap(bool isCheap = true) { FsqrtIsCheap = isCheap; }
+
    /// Tells the code generator that this target supports floating point
    /// exceptions and cares about preserving floating point exception behavior.
    void setHasFloatingPointExceptions(bool FPExceptions = true) {
@@ -1625,6 +1634,9 @@ private:
    /// unconditionally.
    bool IntDivIsCheap;
  
+  /// Don't expand fsqrt with an approximation based on the inverse sqrt.
+  bool FsqrtIsCheap;
+
    /// Tells the code generator to bypass slow divide or remainder
    /// instructions. For example, BypassSlowDivWidths[32,8] tells the code
    /// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index 171ee1e..e57c5a2 100644 (file)
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7538,7 +7538,8 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
  }
  
  SDValue DAGCombiner::visitFSQRT(SDNode *N) {
-  if (DAG.getTarget().Options.UnsafeFPMath) {
+  if (DAG.getTarget().Options.UnsafeFPMath &&
+      !TLI.isFsqrtCheap()) {
      // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
      if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) {
        EVT VT = RV.getValueType();
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp

index fae54d2..9b2fdff 100644 (file)
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -710,6 +710,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
    HasMultipleConditionRegisters = false;
    HasExtractBitsInsn = false;
    IntDivIsCheap = false;
+  FsqrtIsCheap = false;
    Pow2SDivIsCheap = false;
    JumpIsExpensive = false;
    PredictableSelectIsExpensive = false;
diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp

index 55a35f7..1ad2a69 100644 (file)
--- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -403,6 +403,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
    // large sequence of instructions.
    setIntDivIsCheap(false);
    setPow2SDivIsCheap(false);
+  setFsqrtIsCheap(true);
  
    // FIXME: Need to really handle these.
    MaxStoresPerMemcpy  = 4096;
@@ -2585,6 +2586,28 @@ SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
    return SDValue();
  }
  
+SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
+                                               DAGCombinerInfo &DCI,
+                                               unsigned &RefinementSteps) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = Operand.getValueType();
+
+  if (VT == MVT::f32) {
+    // Reciprocal, < 1 ulp error.
+    //
+    // This reciprocal approximation converges to < 0.5 ulp error with one
+    // Newton-Raphson step performed with two fused multiply-adds (FMAs).
+
+    RefinementSteps = 0;
+    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
+  }
+
+  // TODO: There is also an f64 rcp instruction, but the documentation is less
+  // clear on its precision.
+
+  return SDValue();
+}
+
  static void computeKnownBitsForMinMax(const SDValue Op0,
                                        const SDValue Op1,
                                        APInt &KnownZero,
diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h

index 21c8e98..15f529c 100644 (file)
--- a/llvm/lib/Target/R600/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h
@@ -171,6 +171,9 @@ public:
                             DAGCombinerInfo &DCI,
                             unsigned &RefinementSteps,
                             bool &UseOneConstNR) const override;
+  SDValue getRecipEstimate(SDValue Operand,
+                           DAGCombinerInfo &DCI,
+                           unsigned &RefinementSteps) const override;
  
    virtual SDNode *PostISelFolding(MachineSDNode *N,
                                    SelectionDAG &DAG) const {
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 13 Jan 2015 20:53:23 +0000 (20:53 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 13 Jan 2015 20:53:23 +0000 (20:53 +0000)
llvm/include/llvm/Target/TargetLowering.h		patch \| blob \| history
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch \| blob \| history
llvm/lib/CodeGen/TargetLoweringBase.cpp		patch \| blob \| history
llvm/lib/Target/R600/AMDGPUISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/R600/AMDGPUISelLowering.h		patch \| blob \| history