From 9c588f53fc423dd0ed69250fbc93b37b40c0ef44 Mon Sep 17 00:00:00 2001 From: QingShan Zhang Date: Wed, 25 Nov 2020 05:37:15 +0000 Subject: [PATCH] [DAGCombine] Add hook to allow target specific test for sqrt input PowerPC has instruction ftsqrt/xstsqrtdp etc to do the input test for software square root. LLVM now tests it with smallest normalized value using abs + setcc. We should add hook to target that has test instructions. Reviewed By: Spatel, Chen Zheng, Qiu Chao Fang Differential Revision: https://reviews.llvm.org/D80706 --- llvm/include/llvm/CodeGen/TargetLowering.h | 9 ++++++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 41 +++++++++++++----------- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 29 +++++++++++++++++ llvm/lib/Target/PowerPC/PPCISelLowering.h | 5 +++ llvm/lib/Target/PowerPC/PPCInstrFormats.td | 3 +- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 7 ++++- llvm/lib/Target/PowerPC/PPCInstrVSX.td | 3 +- llvm/test/CodeGen/PowerPC/fma-mutate.ll | 7 ++--- llvm/test/CodeGen/PowerPC/recipest.ll | 45 +++++++++++---------------- 9 files changed, 96 insertions(+), 53 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 164cbd7..16580a9 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4277,6 +4277,15 @@ public: return SDValue(); } + /// Return a target-dependent comparison result if the input operand is + /// suitable for use with a square root estimate calculation. For example, the + /// comparison may check if the operand is NAN, INF, zero, normal, etc. The + /// result should be used as the condition operand for a select or branch. + virtual SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG, + const DenormalMode &Mode) const { + return SDValue(); + } + //===--------------------------------------------------------------------===// // Legalization utility functions // diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cae602d..4ac1743 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -22056,26 +22056,31 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, // possibly a denormal. Force the answer to 0.0 for those cases. SDLoc DL(Op); EVT CCVT = getSetCCResultType(VT); - ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; + SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); DenormalMode DenormMode = DAG.getDenormalMode(VT); - if (DenormMode.Input == DenormalMode::IEEE) { - // This is specifically a check for the handling of denormal inputs, - // not the result. - - // fabs(X) < SmallestNormal ? 0.0 : Est - const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT); - APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem); - SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT); - SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); - SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op); - SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT); - Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est); - } else { - // X == 0.0 ? 0.0 : Est - SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); - SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); - Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est); + // Try the target specific test first. + SDValue Test = TLI.getSqrtInputTest(Op, DAG, DenormMode); + if (!Test) { + // If no test provided by target, testing it with denormal inputs to + // avoid wrong estimate. + if (DenormMode.Input == DenormalMode::IEEE) { + // This is specifically a check for the handling of denormal inputs, + // not the result. + + // Test = fabs(X) < SmallestNormal + const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT); + APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem); + SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT); + SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op); + Test = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT); + } else + // Test = X == 0.0 + Test = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); } + // Test ? 0.0 : Est + Est = DAG.getNode(Test.getValueType().isVector() ? ISD::VSELECT + : ISD::SELECT, + DL, VT, Test, FPZero, Est); } } return Est; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 10aecf9..d19fbd4 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1447,6 +1447,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { return "PPCISD::FP_TO_SINT_IN_VSR"; case PPCISD::FRE: return "PPCISD::FRE"; case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; + case PPCISD::FTSQRT: + return "PPCISD::FTSQRT"; case PPCISD::STFIWX: return "PPCISD::STFIWX"; case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; @@ -12758,6 +12760,33 @@ static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) { return RefinementSteps; } +SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, + const DenormalMode &Mode) const { + // TODO - add support for v2f64/v4f32 + EVT VT = Op.getValueType(); + if (VT != MVT::f64) + return SDValue(); + + SDLoc DL(Op); + // The output register of FTSQRT is CR field. + SDValue FTSQRT = DAG.getNode(PPCISD::FTSQRT, DL, MVT::i32, Op); + // ftsqrt BF,FRB + // Let e_b be the unbiased exponent of the double-precision + // floating-point operand in register FRB. + // fe_flag is set to 1 if either of the following conditions occurs. + // - The double-precision floating-point operand in register FRB is a zero, + // a NaN, or an infinity, or a negative value. + // - e_b is less than or equal to -970. + // Otherwise fe_flag is set to 0. + // Both VSX and non-VSX versions would set EQ bit in the CR if the number is + // not eligible for iteration. (zero/negative/infinity/nan or unbiased + // exponent is less than -970) + SDValue SRIdxVal = DAG.getTargetConstant(PPC::sub_eq, DL, MVT::i32); + return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i1, + FTSQRT, SRIdxVal), + 0); +} + SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 414a355..6c4899f 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -89,6 +89,9 @@ namespace llvm { FRE, FRSQRTE, + /// Test instruction for software square root. + FTSQRT, + /// VPERM - The PPC VPERM Instruction. /// VPERM, @@ -1283,6 +1286,8 @@ namespace llvm { bool Reciprocal) const override; SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override; + SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG, + const DenormalMode &Mode) const override; unsigned combineRepeatedFPDivisors() const override; SDValue diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td index 5ff5fc7..646efe6 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -637,9 +637,10 @@ class XForm_17 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, } class XForm_17a opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin> + InstrItinClass itin, list pattern> : XForm_17 { let FRA = 0; + let Pattern = pattern; } class XForm_18 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 2e77d04..de9ae99 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -74,6 +74,9 @@ def SDT_PPCcondbr : SDTypeProfile<0, 3, [ SDTCisVT<0, i32>, SDTCisVT<2, OtherVT> ]>; +def SDT_PPCFtsqrt : SDTypeProfile<1, 1, [ + SDTCisVT<0, i32>]>; + def SDT_PPClbrx : SDTypeProfile<1, 2, [ SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT> ]>; @@ -124,6 +127,7 @@ def SDT_PPCFPMinMax : SDTypeProfile<1, 2, [ def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>; def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>; +def PPCftsqrt : SDNode<"PPCISD::FTSQRT", SDT_PPCFtsqrt,[]>; def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>; def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>; @@ -2643,7 +2647,8 @@ let isCompare = 1, mayRaiseFPException = 1, hasSideEffects = 0 in { def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), "ftdiv $crD, $fA, $fB", IIC_FPCompare>; def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB), - "ftsqrt $crD, $fB", IIC_FPCompare>; + "ftsqrt $crD, $fB", IIC_FPCompare, + [(set i32:$crD, (PPCftsqrt f64:$fB))]>; let mayRaiseFPException = 1, hasSideEffects = 0 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 1ffbd40..b023c05 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -629,7 +629,8 @@ let hasSideEffects = 0 in { "xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>; def XSTSQRTDP : XX2Form_1<60, 106, (outs crrc:$crD), (ins vsfrc:$XB), - "xstsqrtdp $crD, $XB", IIC_FPCompare, []>; + "xstsqrtdp $crD, $XB", IIC_FPCompare, + [(set i32:$crD, (PPCftsqrt f64:$XB))]>; def XVTDIVDP : XX3Form_1<60, 125, (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>; diff --git a/llvm/test/CodeGen/PowerPC/fma-mutate.ll b/llvm/test/CodeGen/PowerPC/fma-mutate.ll index a1e3473..62cce73 100644 --- a/llvm/test/CodeGen/PowerPC/fma-mutate.ll +++ b/llvm/test/CodeGen/PowerPC/fma-mutate.ll @@ -9,12 +9,9 @@ declare double @llvm.sqrt.f64(double) define double @foo3_fmf(double %a) nounwind { ; CHECK-LABEL: foo3_fmf: ; CHECK: # %bb.0: -; CHECK-NEXT: xsabsdp 0, 1 -; CHECK-NEXT: addis 3, 2, .LCPI0_2@toc@ha -; CHECK-NEXT: lfd 2, .LCPI0_2@toc@l(3) -; CHECK-NEXT: xscmpudp 0, 0, 2 +; CHECK-NEXT: xstsqrtdp 0, 1 ; CHECK-NEXT: xxlxor 0, 0, 0 -; CHECK-NEXT: blt 0, .LBB0_2 +; CHECK-NEXT: bc 12, 2, .LBB0_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: xsrsqrtedp 0, 1 ; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll index e3894bc..cd8520b 100644 --- a/llvm/test/CodeGen/PowerPC/recipest.ll +++ b/llvm/test/CodeGen/PowerPC/recipest.ll @@ -749,11 +749,8 @@ define <4 x float> @hoo2_safe(<4 x float> %a, <4 x float> %b) nounwind { define double @foo3_fmf(double %a) nounwind { ; CHECK-P7-LABEL: foo3_fmf: ; CHECK-P7: # %bb.0: -; CHECK-P7-NEXT: fabs 0, 1 -; CHECK-P7-NEXT: addis 3, 2, .LCPI20_2@toc@ha -; CHECK-P7-NEXT: lfd 2, .LCPI20_2@toc@l(3) -; CHECK-P7-NEXT: fcmpu 0, 0, 2 -; CHECK-P7-NEXT: blt 0, .LBB20_2 +; CHECK-P7-NEXT: ftsqrt 0, 1 +; CHECK-P7-NEXT: bc 12, 2, .LBB20_2 ; CHECK-P7-NEXT: # %bb.1: ; CHECK-P7-NEXT: frsqrte 0, 1 ; CHECK-P7-NEXT: addis 3, 2, .LCPI20_0@toc@ha @@ -770,18 +767,15 @@ define double @foo3_fmf(double %a) nounwind { ; CHECK-P7-NEXT: fmul 1, 1, 0 ; CHECK-P7-NEXT: blr ; CHECK-P7-NEXT: .LBB20_2: -; CHECK-P7-NEXT: addis 3, 2, .LCPI20_3@toc@ha -; CHECK-P7-NEXT: lfs 1, .LCPI20_3@toc@l(3) +; CHECK-P7-NEXT: addis 3, 2, .LCPI20_2@toc@ha +; CHECK-P7-NEXT: lfs 1, .LCPI20_2@toc@l(3) ; CHECK-P7-NEXT: blr ; ; CHECK-P8-LABEL: foo3_fmf: ; CHECK-P8: # %bb.0: -; CHECK-P8-NEXT: xsabsdp 0, 1 -; CHECK-P8-NEXT: addis 3, 2, .LCPI20_2@toc@ha -; CHECK-P8-NEXT: lfd 2, .LCPI20_2@toc@l(3) -; CHECK-P8-NEXT: xscmpudp 0, 0, 2 +; CHECK-P8-NEXT: xstsqrtdp 0, 1 ; CHECK-P8-NEXT: xxlxor 0, 0, 0 -; CHECK-P8-NEXT: blt 0, .LBB20_2 +; CHECK-P8-NEXT: bc 12, 2, .LBB20_2 ; CHECK-P8-NEXT: # %bb.1: ; CHECK-P8-NEXT: xsrsqrtedp 0, 1 ; CHECK-P8-NEXT: addis 3, 2, .LCPI20_0@toc@ha @@ -803,12 +797,9 @@ define double @foo3_fmf(double %a) nounwind { ; ; CHECK-P9-LABEL: foo3_fmf: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: addis 3, 2, .LCPI20_2@toc@ha -; CHECK-P9-NEXT: xsabsdp 0, 1 -; CHECK-P9-NEXT: lfd 2, .LCPI20_2@toc@l(3) -; CHECK-P9-NEXT: xscmpudp 0, 0, 2 +; CHECK-P9-NEXT: xstsqrtdp 0, 1 ; CHECK-P9-NEXT: xxlxor 0, 0, 0 -; CHECK-P9-NEXT: blt 0, .LBB20_2 +; CHECK-P9-NEXT: bc 12, 2, .LBB20_2 ; CHECK-P9-NEXT: # %bb.1: ; CHECK-P9-NEXT: xsrsqrtedp 0, 1 ; CHECK-P9-NEXT: addis 3, 2, .LCPI20_0@toc@ha @@ -1038,18 +1029,18 @@ define <2 x double> @hoo4_fmf(<2 x double> %a) #1 { ; CHECK-P7-LABEL: hoo4_fmf: ; CHECK-P7: # %bb.0: ; CHECK-P7-NEXT: addis 3, 2, .LCPI26_2@toc@ha +; CHECK-P7-NEXT: ftsqrt 0, 1 ; CHECK-P7-NEXT: fmr 3, 1 -; CHECK-P7-NEXT: addis 4, 2, .LCPI26_1@toc@ha +; CHECK-P7-NEXT: addis 4, 2, .LCPI26_0@toc@ha ; CHECK-P7-NEXT: lfs 0, .LCPI26_2@toc@l(3) -; CHECK-P7-NEXT: addis 3, 2, .LCPI26_0@toc@ha -; CHECK-P7-NEXT: lfs 4, .LCPI26_1@toc@l(4) -; CHECK-P7-NEXT: lfs 5, .LCPI26_0@toc@l(3) -; CHECK-P7-NEXT: fcmpu 0, 1, 0 +; CHECK-P7-NEXT: addis 3, 2, .LCPI26_1@toc@ha +; CHECK-P7-NEXT: lfs 5, .LCPI26_0@toc@l(4) +; CHECK-P7-NEXT: lfs 4, .LCPI26_1@toc@l(3) ; CHECK-P7-NEXT: fmr 1, 0 -; CHECK-P7-NEXT: bne 0, .LBB26_3 +; CHECK-P7-NEXT: bc 4, 2, .LBB26_3 ; CHECK-P7-NEXT: # %bb.1: -; CHECK-P7-NEXT: fcmpu 0, 2, 0 -; CHECK-P7-NEXT: bne 0, .LBB26_4 +; CHECK-P7-NEXT: ftsqrt 0, 2 +; CHECK-P7-NEXT: bc 4, 2, .LBB26_4 ; CHECK-P7-NEXT: .LBB26_2: ; CHECK-P7-NEXT: fmr 2, 0 ; CHECK-P7-NEXT: blr @@ -1063,8 +1054,8 @@ define <2 x double> @hoo4_fmf(<2 x double> %a) #1 { ; CHECK-P7-NEXT: fmadd 1, 3, 1, 5 ; CHECK-P7-NEXT: fmul 3, 3, 4 ; CHECK-P7-NEXT: fmul 1, 3, 1 -; CHECK-P7-NEXT: fcmpu 0, 2, 0 -; CHECK-P7-NEXT: beq 0, .LBB26_2 +; CHECK-P7-NEXT: ftsqrt 0, 2 +; CHECK-P7-NEXT: bc 12, 2, .LBB26_2 ; CHECK-P7-NEXT: .LBB26_4: ; CHECK-P7-NEXT: frsqrte 0, 2 ; CHECK-P7-NEXT: fmul 3, 2, 0 -- 2.7.4