From c4500de255c3b3088f903d6be1913e8e8c504482 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 14 Mar 2022 12:39:52 -0700 Subject: [PATCH] [AMDGPU] gfx940: disable OP_SEL on V_DOT instructions Differential Revision: https://reviews.llvm.org/D121634 --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4 ++++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 10 ++++++++-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 4 +++- .../AMDGPU/AMDGPUInstructionSelector.cpp | 18 +++++++++++++++++- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 6 +++++- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 14 ++++++++++++++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 +++++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 2 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 ++ llvm/lib/Target/AMDGPU/VOP3Instructions.td | 10 ++++++---- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 5 ++--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll | 9 ++++++--- llvm/test/MC/AMDGPU/gfx940_err.s | 3 +++ 13 files changed, 77 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index fd409d6270cc..c77ff94d7621 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -47,6 +47,10 @@ def gi_vop3pmods : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_vop3pmodsdot : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_vop3opselmods : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index f5b51abd58ea..ca3f46012a00 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2673,7 +2673,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, } bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { + SDValue &SrcMods, bool IsDOT) const { unsigned Mods = 0; Src = In; @@ -2682,7 +2682,8 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, Src = Src.getOperand(0); } - if (Src.getOpcode() == ISD::BUILD_VECTOR) { + if (Src.getOpcode() == ISD::BUILD_VECTOR && + (!IsDOT || !Subtarget->hasDOTOpSelHazard())) { unsigned VecMods = Mods; SDValue Lo = stripBitcast(Src.getOperand(0)); @@ -2770,6 +2771,11 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, return true; } +bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + return SelectVOP3PMods(In, Src, SrcMods, true); +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index acf82d6d15e4..7cd4f613df19 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -219,7 +219,9 @@ private: bool SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp, SDValue &Omod) const; - bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods, + bool IsDOT = false) const; + bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index bf0f9fa976d4..e388ee4a8da6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3407,7 +3407,7 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { std::pair AMDGPUInstructionSelector::selectVOP3PModsImpl( - Register Src, const MachineRegisterInfo &MRI) const { + Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const { unsigned Mods = 0; MachineInstr *MI = MRI.getVRegDef(Src); @@ -3421,6 +3421,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl( } // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. + (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() // Packed instructions do not have abs modifiers. Mods |= SISrcMods::OP_SEL_1; @@ -3443,6 +3444,21 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { + MachineRegisterInfo &MRI + = Root.getParent()->getParent()->getParent()->getRegInfo(); + + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index a879b9a73309..dea592a59870 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -173,11 +173,15 @@ private: selectVOP3Mods_nnan(MachineOperand &Root) const; std::pair - selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI) const; + selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI, + bool IsDOT = false) const; InstructionSelector::ComplexRendererFns selectVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsDOT(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index f46136df5529..ae2ab2aa1f54 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -4047,6 +4047,20 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) { if (OpSel & ~3) return false; } + + if (isGFX940() && (MII.get(Opc).TSFlags & SIInstrFlags::IsDOT)) { + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + if (OpSelIdx != -1) { + if (Inst.getOperand(OpSelIdx).getImm() != 0) + return false; + } + int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); + if (OpSelHiIdx != -1) { + if (Inst.getOperand(OpSelHiIdx).getImm() != -1) + return false; + } + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 1029822573bb..2b48ba6024b1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -964,6 +964,11 @@ public: return HasLdsBranchVmemWARHazard; } + // Cannot use op_sel with v_dot instructions. + bool hasDOTOpSelHazard() const { + return GFX940Insts; + } + bool hasNSAtoVMEMBug() const { return HasNSAtoVMEMBug; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index a6470f85a313..7dac64d0808a 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -210,6 +210,8 @@ static bool updateOperand(FoldCandidate &Fold, if (Fold.isImm()) { if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked && !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) && + (!ST.hasDOTOpSelHazard() || + !(MI->getDesc().TSFlags & SIInstrFlags::IsDOT)) && AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST.hasInv2PiInlineImm())) { // Set op_sel/op_sel_hi on this operand or bail out if op_sel is diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 33b7bc7008f5..30c67d92b4cd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1428,6 +1428,8 @@ def VOP3OMods : ComplexPattern; def VOP3PMods : ComplexPattern; +def VOP3PModsDOT : ComplexPattern; + def VOP3OpSel : ComplexPattern; def VOP3OpSelMods : ComplexPattern; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 024be304a190..1d57fcd925dc 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -32,10 +32,12 @@ class getVOP3ModPat { ret1)); } -class getVOP3PModPat { - dag src0_dag = (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)); - dag src1_dag = (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)); - dag src2_dag = (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers)); +class getVOP3PModPat { + dag src0_dag = (P.Src0VT (SrcPat P.Src0VT:$src0, i32:$src0_modifiers)); + dag src1_dag = (P.Src1VT (SrcPat P.Src1VT:$src1, i32:$src1_modifiers)); + dag src2_dag = (P.Src2VT (SrcPat P.Src2VT:$src2, i32:$src2_modifiers)); dag clamp_dag = (i1 timm:$clamp); list ret3 = [(set P.DstVT:$vdst, diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 8a3548cd89f2..9f1120833ffb 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -40,14 +40,13 @@ class VOP3P_Mix_Profile { + SDPatternOperator node = null_frag, bit IsDOT = 0> { def NAME : VOP3P_Pseudo.ret, + getVOP3PModPat.ret, getVOP3Pat.ret)>; } - // Non-packed instructions that use the VOP3P encoding. // VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed. multiclass VOP3_VOP3PInst { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll index 5a413ebc18cb..f241a2378102 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll @@ -1,4 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX906 +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX906 +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940 +; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940 ; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 @@ -6,7 +8,7 @@ declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp) declare i32 @llvm.amdgcn.workitem.id.x() ; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_clamp: -; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +; GFX9: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} ; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} define amdgpu_kernel void @test_llvm_amdgcn_udot2_clamp( i32 addrspace(1)* %r, @@ -23,7 +25,7 @@ entry: } ; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_no_clamp: -; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @test_llvm_amdgcn_udot2_no_clamp( i32 addrspace(1)* %r, @@ -41,6 +43,7 @@ entry: ; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_op_sel: ; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1,0] op_sel_hi:[0,0,1]{{$}} +; GFX940: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}}{{$}} ; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1,0] op_sel_hi:[0,0,1]{{$}} define amdgpu_kernel void @test_llvm_amdgcn_udot2_op_sel( i32 addrspace(1)* %r, diff --git a/llvm/test/MC/AMDGPU/gfx940_err.s b/llvm/test/MC/AMDGPU/gfx940_err.s index 2b3ad0a193e3..a0b9ec84e657 100644 --- a/llvm/test/MC/AMDGPU/gfx940_err.s +++ b/llvm/test/MC/AMDGPU/gfx940_err.s @@ -58,6 +58,9 @@ buffer_wbl2 glc buffer_wbl2 scc // GFX940: error: invalid operand for instruction +v_dot2_u32_u16 v0, 1, v0, s2 op_sel:[0,1,0,1] op_sel_hi:[0,0,1,1] +// GFX940: error: invalid op_sel operand + s_getreg_b32 s1, hwreg(HW_REG_FLAT_SCR_LO) // GFX940: error: specified hardware register is not supported on this GPU -- 2.34.1