From 79f3ade51ae0ebe8e0f07aeee2bbafdf449de8d7 Mon Sep 17 00:00:00 2001
From: Connor Abbott
Date: Mon, 7 Aug 2017 19:10:56 +0000
Subject: [PATCH] [AMDGPU] Add pseudo "old" source to all DPP instructions

Summary:
Instructions with the DPP modifier may not write to certain lanes of the
output if bound_ctrl=1 is set or if any bits in bank_mask or row_mask aren't
set, so the destination register may be both defined and modified. The right
way to handle this is to add a constraint that the destination register is
the same as one of the inputs. We could tie the destination to the first
source, but that would be too restrictive for some use cases where we want
the destination to hold some other value before the instruction executes.
Instead, add a fake "old" source and tie it to the destination. Effectively,
the "old" source defines what value unwritten lanes will get. We'll expose
this functionality to users with a new intrinsic later.

Also, we want to use DPP instructions for computing derivatives, which means
we need to set WQM for them. We also need to enable the entire wavefront when
using DPP intrinsics to implement nonuniform subgroup reductions, since
otherwise we'll get incorrect results in some cases. To accommodate this, add
a new operand to all DPP instructions which will be interpreted by the SI WQM
pass. This will be exposed with a new intrinsic later. We'll also add support
for Whole Wavefront Mode later.

I also fixed llvm.amdgcn.mov.dpp to overwrite the source and fixed up the
test. However, I could also keep the old behavior (where lanes that aren't
written are undefined) if people want it.

Reviewers: tstellar, arsenm

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye

Differential Revision: https://reviews.llvm.org/D34716

llvm-svn: 310283
---
 .../Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp   | 15 +++++--------
 llvm/lib/Target/AMDGPU/SIInstrInfo.td             | 26 +++++++++++++---------
 llvm/lib/Target/AMDGPU/VOP1Instructions.td        | 11 +++++----
 llvm/lib/Target/AMDGPU/VOP2Instructions.td        |  9 ++++----
 llvm/lib/Target/AMDGPU/VOPInstructions.td         |  2 ++
 llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir |  4 ++--
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll   | 11 ++++++---
 7 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 728f352..17e7288 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4458,6 +4458,11 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
     ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
   }
 
+  // All DPP instructions with at least one source operand have a fake "old"
+  // source at the beginning that's tied to the dst operand. Handle it here.
+  if (Desc.getNumOperands() >= 2)
+    Inst.addOperand(Inst.getOperand(0));
+
   for (unsigned E = Operands.size(); I != E; ++I) {
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
     // Add the register arguments
@@ -4480,16 +4485,6 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
-
-  // special case v_mac_{f16, f32}:
-  // it has src2 register operand that is tied to dst operand
-  if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp ||
-      Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) {
-    auto it = Inst.begin();
-    std::advance(
-        it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
-    Inst.insert(it, Inst.getOperand(0)); // src2 = dst
-  }
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 4b8dff7..e869935 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1184,8 +1184,9 @@ class getInsVOP3OpSel {
+class getInsDPP {
   dag ret = !if (!eq(NumSrcArgs, 0),
                 // VOP1 without input operands (V_NOP)
@@ -1194,26 +1195,29 @@ class getInsDPP _ArgVT> {
                getOpSelMod.ret, getOpSelMod.ret, getOpSelMod.ret>.ret;
-  field dag InsDPP = getInsDPP.ret;
   field dag InsSDWA = getInsSDWA {
   let Outs = (outs);
   let Ins32 = (ins Src0RC32:$vdst, VSrc_b32:$src0);
   let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
-  let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+  let InsDPP = (ins DstRC:$vdst, DstRC:$old, Src0RC32:$src0,
+                    dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
   let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
@@ -504,8 +505,6 @@ class VOP1_DPP op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
   let Uses = ps.Uses;
   let SchedRW = ps.SchedRW;
   let hasSideEffects = ps.hasSideEffects;
-  let Constraints = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   bits<8> vdst;
   let Inst{8-0} = 0xfa; // dpp
@@ -659,11 +658,11 @@ let Predicates = [isVI] in {
 def : Pat <
   (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, imm:$bound_ctrl)),
-  (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
-                 (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
+  (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl),
+                 (as_i32imm $row_mask), (as_i32imm $bank_mask),
+                 (as_i1imm $bound_ctrl))
 >;
-
 def : Pat<
   (i32 (anyext i16:$src)),
   (COPY $src)
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 9f3df2b..29266fb 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -209,9 +209,9 @@ class VOP_MAC : VOPProfile <[vt, vt, vt, vt]> {
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
   let Ins64 = getIns64, 3, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
-  let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+  let InsDPP = (ins DstRCDPP:$old,
+                    Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                     Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
-                    VGPR_32:$src2, // stub argument
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
@@ -282,7 +282,8 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
                        dst_sel:$dst_sel, dst_unused:$dst_unused,
                        src0_sel:$src0_sel, src1_sel:$src1_sel);
-  let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
+  let InsDPP = (ins DstRCDPP:$old,
+                    Src0Mod:$src0_modifiers, Src0DPP:$src0,
                     Src1Mod:$src1_modifiers, Src1DPP:$src1,
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
@@ -665,8 +666,6 @@ class VOP2_DPP op, VOP2_Pseudo ps, VOPProfile P = ps.Pfl> :
   let Uses = ps.Uses;
   let SchedRW = ps.SchedRW;
   let hasSideEffects = ps.hasSideEffects;
-  let Constraints = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   bits<8> vdst;
   bits<8> src1;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index dad7666..cb754ef 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -510,6 +510,8 @@ class VOP_DPP :
   let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst);
   let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
                                      AMDGPUAsmVariants.Disable);
+  let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
+  let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
   let DecoderNamespace = "DPP";
 }
diff --git a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
index bf4c7cb..15006e5 100644
--- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -504,12 +504,12 @@ name: dpp
 body: |
   bb.0:
     %vgpr0 = V_MOV_B32_e32 0, implicit %exec
-    %vgpr1 = V_MOV_B32_dpp %vgpr0, 0, 15, 15, 0, implicit %exec
+    %vgpr1 = V_MOV_B32_dpp %vgpr1, %vgpr0, 0, 15, 15, 0, implicit %exec
     S_BRANCH %bb.1
 
   bb.1:
     implicit %exec, implicit %vcc = V_CMPX_EQ_I32_e32 %vgpr0, %vgpr1, implicit %exec
-    %vgpr3 = V_MOV_B32_dpp %vgpr0, 0, 15, 15, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_dpp %vgpr3, %vgpr0, 0, 15, 15, 0, implicit %exec
     S_ENDPGM
 ...
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
index d305165..1ee8320 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -5,8 +5,10 @@
 ; VI-LABEL: {{^}}dpp_test:
 ; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
+; VI-NOOPT: v_mov_b32_e32 v1, s{{[0-9]+}}
 ; VI: s_nop 1
-; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; VI-OPT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; VI-NOOPT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11]
 define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
   store i32 %tmp0, i32 addrspace(1)* %out
@@ -14,11 +16,14 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
 }
 
 ; VI-LABEL: {{^}}dpp_wait_states:
+; VI-NOOPT: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s{{[0-9]+}}
 ; VI: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
 ; VI: s_nop 1
-; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI-OPT: v_mov_b32_dpp [[VGPR0]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI-NOOPT: v_mov_b32_dpp [[VGPR1]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 ; VI: s_nop 1
-; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI-OPT: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
  %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
-- 
2.7.4
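
For reference, a minimal LLVM IR sketch (not part of the patch) modeled on the dpp_test function in llvm.amdgcn.mov.dpp.ll above; the kernel name and the contents of attribute group #0 are illustrative assumptions. With the updated selection pattern, the move is emitted as V_MOV_B32_dpp with the source also passed as the tied "old" operand, so lanes the DPP move leaves unwritten (because of row_mask/bank_mask/bound_ctrl) should now receive the input value rather than being undefined.

; Sketch only: kernel name and attribute group contents are assumptions.
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #0

define amdgpu_kernel void @dpp_old_sketch(i32 addrspace(1)* %out, i32 %in) {
  ; Same immediates as dpp_test above; the checks there expect
  ; quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0.
  ; After this patch the tied "old" operand is %in, so lanes masked off by
  ; row_mask/bank_mask keep %in instead of an undefined value.
  %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
  store i32 %tmp0, i32 addrspace(1)* %out
  ret void
}

attributes #0 = { convergent nounwind readnone }

Compiled for a VI target with optimization, this should produce the same "v_mov_b32_dpp v0, v0 ..." sequence that the VI-OPT checks above expect.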