return true;
}
/// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X).
///
/// Targets override this hook to keep the hardware square root when it is
/// at least as good as the reciprocal-estimate expansion for the given
/// operand; the DAG combiner consults it before building the estimate.
virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const {
  // Default behavior is to replace SQRT(X) with X*RSQRT(X).
  return false;
}
/// Tells the code generator that jumps (conditional control flow) are
/// expensive on this target, so branchless sequences may be preferred.
void setJumpIsExpensive(bool isExpensive = true);
- /// Tells the code generator that fsqrt is cheap, and should not be replaced
- /// with an alternative sequence of instructions.
- void setFsqrtIsCheap(bool isCheap = true) { FsqrtIsCheap = isCheap; }
-
/// Tells the code generator that this target supports floating point
/// exceptions and cares about preserving floating point exception behavior.
void setHasFloatingPointExceptions(bool FPExceptions = true) {
/// combined with "shift" to BitExtract instructions.
bool HasExtractBitsInsn;
- // Don't expand fsqrt with an approximation based on the inverse sqrt.
- bool FsqrtIsCheap;
-
/// Tells the code generator to bypass slow divide or remainder
/// instructions. For example, BypassSlowDivWidths[32,8] tells the code
/// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer
}
SDValue DAGCombiner::visitFSQRT(SDNode *N) {
  // This transform trades precision for speed, so it is only legal under
  // unsafe FP math.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // Give the target a chance to keep the hardware sqrt when it is at least
  // as good as the X*RSQRT(X) expansion for this operand.
  SDValue Arg = N->getOperand(0);
  if (TLI.isFsqrtCheap(Arg, DAG))
    return SDValue();

  // TODO: FSQRT nodes should have flags that propagate to the created nodes.
  // For now, create a Flags object for use with all unsafe math transforms.
  SDNodeFlags Flags;
  Flags.setUnsafeAlgebra(true);
  return buildSqrtEstimate(Arg, &Flags);
}
/// copysign(x, fp_extend(y)) -> copysign(x, y)
SelectIsExpensive = false;
HasMultipleConditionRegisters = false;
HasExtractBitsInsn = false;
- FsqrtIsCheap = false;
JumpIsExpensive = JumpIsExpensiveOverride;
PredictableSelectIsExpensive = false;
MaskAndBranchFoldingIsLegal = false;
setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
- setFsqrtIsCheap(true);
-
// We want to find all load dependencies for long chains of stores to enable
// merging into very wide vectors. The problem is with vectors with > 4
// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
const char* getTargetNodeName(unsigned Opcode) const override;
/// This target always reports FSQRT as cheap, so the DAG combiner never
/// rewrites SQRT(X) into the X*RSQRT(X) estimate sequence here.
bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
  return true;
}
SDValue getRsqrtEstimate(SDValue Operand,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
// Partial writes to YMM registers do not incur a penalty on this subtarget.
def FeatureFastPartialYMMWrite
    : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
                       "true", "Partial writes to YMM registers are fast">;
// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
// The idea is that throughput bound code is likely to be vectorized, so for
// vectorized code we should care about the throughput of SQRT operations.
// But if the code is scalar that probably means that the code has some kind of
// dependency and we should care more about reducing the latency.
def FeatureFastScalarFSQRT
    : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
                       "true", "Scalar SQRT is fast (disable Newton-Raphson)">;
def FeatureFastVectorFSQRT
    : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
                       "true", "Vector SQRT is fast (disable Newton-Raphson)">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
FeaturePCLMUL,
FeatureXSAVE,
FeatureXSAVEOPT,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureFastScalarFSQRT
]>;
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
FeatureXSAVEC,
FeatureXSAVES,
FeatureSGX,
- FeatureCLFLUSHOPT
+ FeatureCLFLUSHOPT,
+ FeatureFastVectorFSQRT
]>;
// FIXME: define SKL model
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
+/// Check if replacement of SQRT with RSQRT should be disabled.
+bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ // We never want to use both SQRT and RSQRT instructions for the same input.
+ if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
+ return false;
+
+ if (VT.isVector())
+ return Subtarget.hasFastVectorFSQRT();
+ return Subtarget.hasFastScalarFSQRT();
+}
+
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
/// Convert a comparison if required by the subtarget.
SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+ /// Check if replacement of SQRT with RSQRT should be disabled.
+ bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+
/// Use rsqrt* to speed up sqrt calculations.
SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
HasCmpxchg16b = false;
UseLeaForSP = false;
HasFastPartialYMMWrite = false;
+ HasFastScalarFSQRT = false;
+ HasFastVectorFSQRT = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;
/// of a YMM register without clearing the upper part.
bool HasFastPartialYMMWrite;
+ /// True if hardware SQRTSS instruction is at least as fast (latency) as
+ /// RSQRTSS followed by a Newton-Raphson iteration.
+ bool HasFastScalarFSQRT;
+
+ /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
+ /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
+ bool HasFastVectorFSQRT;
+
/// True if 8-bit divisions are significantly faster than
/// 32-bit divisions and should be used when possible.
bool HasSlowDivide32;
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
// True when the hardware scalar SQRT is at least as fast (latency) as the
// RSQRT-based Newton-Raphson sequence.
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
// True when the hardware vector SQRT is at least as fast (throughput) as the
// RSQRT-based Newton-Raphson sequence.
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
--- /dev/null
; Verify that the fast-scalar-fsqrt / fast-vector-fsqrt subtarget features
; choose between the hardware SQRT instruction (ACC prefixes) and the
; RSQRT-based Newton-Raphson estimate (EST prefixes) under unsafe-fp-math.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=nehalem | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-EST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=sandybridge | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=broadwell | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=skylake | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-ACC

; Explicit attribute toggles, independent of any CPU model.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=+fast-scalar-fsqrt,-fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=-fast-scalar-fsqrt,+fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-ACC

declare float @llvm.sqrt.f32(float) #0
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0

; Scalar sqrt: estimate uses rsqrtss, accurate path uses (v)sqrtss.
define float @foo_x1(float %f) #0 {
; SCALAR-EST-LABEL: foo_x1:
; SCALAR-EST: # BB#0:
; SCALAR-EST-NEXT: rsqrtss %xmm0
; SCALAR-EST: retq
;
; SCALAR-ACC-LABEL: foo_x1:
; SCALAR-ACC: # BB#0:
; SCALAR-ACC-NEXT: {{^ *v?sqrtss %xmm0}}
; SCALAR-ACC-NEXT: retq
  %call = tail call float @llvm.sqrt.f32(float %f) #1
  ret float %call
}

; 128-bit vector sqrt: estimate uses rsqrtps, accurate path uses (v)sqrtps.
define <4 x float> @foo_x4(<4 x float> %f) #0 {
; VECTOR-EST-LABEL: foo_x4:
; VECTOR-EST: # BB#0:
; VECTOR-EST-NEXT: rsqrtps %xmm0
; VECTOR-EST: retq
;
; VECTOR-ACC-LABEL: foo_x4:
; VECTOR-ACC: # BB#0:
; VECTOR-ACC-NEXT: {{^ *v?sqrtps %xmm0}}
; VECTOR-ACC-NEXT: retq
  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #1
  ret <4 x float> %call
}

; 256-bit vector sqrt: may be split or use ymm depending on AVX availability.
define <8 x float> @foo_x8(<8 x float> %f) #0 {
; VECTOR-EST-LABEL: foo_x8:
; VECTOR-EST: # BB#0:
; VECTOR-EST-NEXT: rsqrtps
; VECTOR-EST: retq
;
; VECTOR-ACC-LABEL: foo_x8:
; VECTOR-ACC: # BB#0:
; VECTOR-ACC-NEXT: {{^ *v?sqrtps %[xy]mm0}}
; VECTOR-ACC-NOT: rsqrt
; VECTOR-ACC: retq
  %call = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #1
  ret <8 x float> %call
}

attributes #0 = { "unsafe-fp-math"="true" }
attributes #1 = { nounwind readnone }