From 68b102b97ac340698e31ab5af0c4394a9789a49c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 31 Dec 2019 21:54:33 -0500 Subject: [PATCH] AMDGPU: Directly select 16-bank LDS case of llvm.amdgcn.interp.p1.f16 Manually select this is as a tablegen workraound. Both SelectionDAG and GlobalISel end up misplacing the copy to m0 when both instructions in the output need it. Neither considers that both output instructions depend on m0. I don't know of any other pattern we need to handle this case, so it's less effort to just workaround this for now. --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 62 +++++++++++++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 -- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 3 -- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 12 ------ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 32 -------------- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 10 +---- 6 files changed, 63 insertions(+), 59 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 07f500b..ae9221d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -300,6 +300,7 @@ private: void SelectATOMIC_CMP_SWAP(SDNode *N); void SelectDSAppendConsume(SDNode *N, unsigned IntrID); void SelectDS_GWS(SDNode *N, unsigned IntrID); + void SelectInterpP1F16(SDNode *N); void SelectINTRINSIC_W_CHAIN(SDNode *N); void SelectINTRINSIC_WO_CHAIN(SDNode *N); void SelectINTRINSIC_VOID(SDNode *N); @@ -2339,6 +2340,64 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { CurDAG->setNodeMemRefs(cast(Selected), {MMO}); } +void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) { + if (Subtarget->getLDSBankCount() != 16) { + // This is a single instruction with a pattern. + SelectCode(N); + return; + } + + SDLoc DL(N); + + // This requires 2 instructions. It is possible to write a pattern to support + // this, but the generated isel emitter doesn't correctly deal with multiple + // output instructions using the same physical register input. The copy to m0 + // is incorrectly placed before the second instruction. + // + // TODO: Match source modifiers. + // + // def : Pat < + // (int_amdgcn_interp_p1_f16 + // (VOP3Mods f32:$src0, i32:$src0_modifiers), + // (i32 timm:$attrchan), (i32 timm:$attr), + // (i1 timm:$high), M0), + // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr, + // timm:$attrchan, 0, + // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> { + // let Predicates = [has16BankLDS]; + // } + + // 16 bank LDS + SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0, + N->getOperand(5), SDValue()); + + SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other); + + SDNode *InterpMov = + CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, { + CurDAG->getTargetConstant(2, DL, MVT::i32), // P0 + N->getOperand(3), // Attr + N->getOperand(2), // Attrchan + ToM0.getValue(1) // In glue + }); + + SDNode *InterpP1LV = + CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, { + CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers + N->getOperand(1), // Src0 + N->getOperand(3), // Attr + N->getOperand(2), // Attrchan + CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers + SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high + N->getOperand(4), // high + CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp + CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod + SDValue(InterpMov, 1) + }); + + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0)); +} + void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { unsigned IntrID = cast(N->getOperand(1))->getZExtValue(); switch (IntrID) { @@ -2367,6 +2426,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { case Intrinsic::amdgcn_wwm: Opcode = AMDGPU::WWM; break; + case Intrinsic::amdgcn_interp_p1_f16: + SelectInterpP1F16(N); + return; default: SelectCode(N); return; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 9232b65..927e173 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4323,9 +4323,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; - NODE_NAME_CASE(INTERP_P1LL_F16) - NODE_NAME_CASE(INTERP_P1LV_F16) - NODE_NAME_CASE(INTERP_P2_F16) NODE_NAME_CASE(LOAD_D16_HI) NODE_NAME_CASE(LOAD_D16_LO) NODE_NAME_CASE(LOAD_D16_HI_I8) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 087983e..7b269e8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -474,9 +474,6 @@ enum NodeType : unsigned { BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. CONST_DATA_PTR, - INTERP_P1LL_F16, - INTERP_P1LV_F16, - INTERP_P2_F16, PC_ADD_REL_OFFSET, LDS, KILL, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 4d7c6b6..56eb2e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -329,18 +329,6 @@ def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; -def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16", - SDTypeProfile<1, 7, [SDTCisFP<0>]>, - [SDNPInGlue, SDNPOutGlue]>; - -def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16", - SDTypeProfile<1, 9, [SDTCisFP<0>]>, - [SDNPInGlue, SDNPOutGlue]>; - -def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16", - SDTypeProfile<1, 8, [SDTCisFP<0>]>, - [SDNPInGlue]>; - def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT, [SDNPHasChain, SDNPSideEffect]>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index dd4a996..48f2bad 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5872,38 +5872,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); - case Intrinsic::amdgcn_interp_p1_f16: { - if (getSubtarget()->getLDSBankCount() == 16) { - // 16 bank LDS - SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0, - Op.getOperand(5), SDValue()); - - // FIXME: This implicitly will insert a second CopyToReg to M0. - SDValue S = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32, - DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32), - DAG.getTargetConstant(2, DL, MVT::i32), // P0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - Op.getOperand(5)); // m0 - - SDValue Ops[] = { - Op.getOperand(1), // Src0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers - S, // Src2 - holds two f16 values selected by high - DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers - Op.getOperand(4), // high - DAG.getTargetConstant(0, DL, MVT::i1), // $clamp - DAG.getTargetConstant(0, DL, MVT::i32), // $omod - ToM0.getValue(1) - }; - return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops); - } - - return SDValue(); - } case Intrinsic::amdgcn_sin: return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 0141f4f..176d0e06 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -491,15 +491,7 @@ def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32 } -def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>, - [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 timm:$attrchan), - (i32 timm:$attr), - (i32 timm:$src0_modifiers), - (f32 VRegSrc_32:$src2), - (i32 timm:$src2_modifiers), - (i1 timm:$high), - (i1 timm:$clamp), - (i32 timm:$omod)))]>; +def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; } // End Uses = [M0, EXEC], FPDPRounding = 1 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 -- 2.7.4