From 5b91a6a88bd681f63702116f4a7f28976f4fa848 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 27 Oct 2020 12:29:11 +0000 Subject: [PATCH] [AMDGPU] Allow some modifiers on VOP3B instructions V_DIV_SCALE_F32/F64 are VOP3B encoded so they can't use the ABS src modifier, but they can still use NEG and the usual output modifiers. This partially reverts 3b99f12a4e6f "AMDGPU: Remove modifiers from v_div_scale_*". Differential Revision: https://reviews.llvm.org/D90296 --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 39 +++++++++++++++++-- .../Target/AMDGPU/AMDGPUInstructionSelector.cpp | 44 +++++++++++++++++++--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 8 +++- .../Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 33 ++++++++++++++++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 14 +++---- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 9 +++++ llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8 +--- llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll | 8 ++-- llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir | 2 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll | 6 +-- llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir | 8 ++-- llvm/test/MC/AMDGPU/vop3-errs.s | 10 +++++ llvm/test/MC/AMDGPU/vop3.s | 26 +++++++++++-- 13 files changed, 174 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 0f8c9d0..6074f49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -256,11 +256,15 @@ private: bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const; + bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods, + bool AllowAbs = true) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3BMods(SDValue In, SDValue 
&Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; + bool SelectVOP3BMods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, SDValue &Omod) const; bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; @@ -1129,7 +1133,12 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { unsigned Opc = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, + // omod + SDValue Ops[8]; + SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]); + SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]); CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } @@ -2630,7 +2639,8 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { } bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const { + unsigned &Mods, + bool AllowAbs) const { Mods = 0; Src = In; @@ -2639,7 +2649,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, Src = Src.getOperand(0); } - if (Src.getOpcode() == ISD::FABS) { + if (AllowAbs && Src.getOpcode() == ISD::FABS) { Mods |= SISrcMods::ABS; Src = Src.getOperand(0); } @@ -2658,6 +2668,17 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, return false; } +bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods; + if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) { + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + + return false; +} + bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const { SelectVOP3Mods(In, Src, SrcMods); @@ 
-2682,6 +2703,16 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src, + SDValue &SrcMods, SDValue &Clamp, + SDValue &Omod) const { + SDLoc DL(In); + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); + + return SelectVOP3BMods(In, Src, SrcMods); +} + bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp, SDValue &Omod) const { Src = In; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 7d7e7dc..fb1b06a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -871,6 +871,8 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const { else return false; + // TODO: Match source modifiers. + const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock *MBB = MI.getParent(); @@ -882,9 +884,14 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const { auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0) .addDef(Dst1) - .addUse(Src0) - .addUse(Denom) - .addUse(Numer); + .addImm(0) // $src0_modifiers + .addUse(Src0) // $src0 + .addImm(0) // $src1_modifiers + .addUse(Denom) // $src1 + .addImm(0) // $src2_modifiers + .addUse(Numer) // $src2 + .addImm(0) // $clamp + .addImm(0); // $omod MI.eraseFromParent(); return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); @@ -3157,7 +3164,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { } std::pair -AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const { +AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, + bool AllowAbs) const { Register Src = Root.getReg(); Register OrigSrc = Src; unsigned Mods = 0; @@ -3169,7 +3177,7 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) 
const { MI = getDefIgnoringCopies(Src, *MRI); } - if (MI && MI->getOpcode() == AMDGPU::G_FABS) { + if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) { Src = MI->getOperand(1).getReg(); Mods |= SISrcMods::ABS; } @@ -3216,6 +3224,20 @@ AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod + }}; +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { return {{ [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, @@ -3237,6 +3259,18 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { Register Reg = Root.getReg(); const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index c744190..9a11c37 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -145,8 +145,8 @@ private: bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const; bool 
selectBVHIntrinsic(MachineInstr &I) const; - std::pair - selectVOP3ModsImpl(MachineOperand &Root) const; + std::pair selectVOP3ModsImpl(MachineOperand &Root, + bool AllowAbs = true) const; InstructionSelector::ComplexRendererFns selectVCSRC(MachineOperand &Root) const; @@ -157,9 +157,13 @@ private: InstructionSelector::ComplexRendererFns selectVOP3Mods0(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + selectVOP3BMods0(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3Mods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3BMods(MachineOperand &Root) const; ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d33d9a2..5a0e777 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1369,6 +1369,7 @@ private: bool validateVccOperand(unsigned Reg) const; bool validateVOP3Literal(const MCInst &Inst) const; bool validateMAIAccWrite(const MCInst &Inst); + bool validateDivScale(const MCInst &Inst); unsigned getConstantBusLimit(unsigned Opcode) const; bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; @@ -3304,6 +3305,35 @@ bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst) { return true; } +bool AMDGPUAsmParser::validateDivScale(const MCInst &Inst) { + switch (Inst.getOpcode()) { + default: + return true; + case V_DIV_SCALE_F32_gfx6_gfx7: + case V_DIV_SCALE_F32_vi: + case V_DIV_SCALE_F32_gfx10: + case V_DIV_SCALE_F64_gfx6_gfx7: + case V_DIV_SCALE_F64_vi: + case V_DIV_SCALE_F64_gfx10: + break; + } + + // TODO: Check that src0 = src1 or src2. 
+ + for (auto Name : {AMDGPU::OpName::src0_modifiers, + AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}) { + if (Inst.getOperand(AMDGPU::getNamedOperandIdx(Inst.getOpcode(), Name)) + .getImm() & + SISrcMods::ABS) { + Error(getLoc(), "ABS not allowed in VOP3B instructions"); + return false; + } + } + + return true; +} + bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) { const unsigned Opc = Inst.getOpcode(); @@ -3777,6 +3807,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateMAIAccWrite(Inst)) { return false; } + if (!validateDivScale(Inst)) { + return false; + } return true; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index bfff781..8294044 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11085,9 +11085,9 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, // Satisfy the operand register constraint when one of the inputs is // undefined. Ordinarily each undef value will have its own implicit_def of // a vreg, so force these to use a single register.
- SDValue Src0 = Node->getOperand(0); - SDValue Src1 = Node->getOperand(1); - SDValue Src2 = Node->getOperand(2); + SDValue Src0 = Node->getOperand(1); + SDValue Src1 = Node->getOperand(3); + SDValue Src2 = Node->getOperand(5); if ((Src0.isMachineOpcode() && Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) && @@ -11122,10 +11122,10 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, } else break; - SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 }; - for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I) - Ops.push_back(Node->getOperand(I)); - + SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end()); + Ops[1] = Src0; + Ops[3] = Src1; + Ops[5] = Src2; Ops.push_back(ImpDef.getValue(1)); return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 21e108e..3d9a654 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3887,6 +3887,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } } + if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & + SISrcMods::ABS) || + (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() & + SISrcMods::ABS) || + (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & + SISrcMods::ABS)) { + ErrInfo = "ABS not allowed in VOP3B instructions"; + return false; + } } if (isSOP2(MI) || isSOPC(MI)) { diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index cb8a1c6..a2ff89f 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -193,12 +193,8 @@ class VOP3_Profile : VOPProf } class VOP3b_Profile : VOPProfile<[vt, vt, vt, vt]> { - // v_div_scale_{f32|f64} do not support input modifiers. 
- let HasModifiers = 0; - let HasClamp = 0; - let HasOMod = 0; let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); - let Asm64 = " $vdst, $sdst, $src0, $src1, $src2"; + let Asm64 = " $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod"; } def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile { @@ -388,13 +384,11 @@ def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile, AMDGPU let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does. def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> { let SchedRW = [WriteFloatFMA, WriteSALU]; - let AsmMatchConverter = ""; } // Double precision division pre-scale. def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> { let SchedRW = [WriteDouble, WriteSALU]; - let AsmMatchConverter = ""; let FPDPRounding = 1; } } // End mayRaiseFPException = 0 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll index 216ab53..c7fc21e 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll @@ -11,8 +11,8 @@ define float @fdiv_f32(float %a, float %b) #0 { ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32 [[COPY2]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec - ; GCN: %8:vgpr_32, %9:sreg_64 = nofpexcept V_DIV_SCALE_F32 [[COPY1]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec + ; GCN: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: %8:vgpr_32, %9:sreg_64 = nofpexcept V_DIV_SCALE_F32 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GCN: %10:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec ; GCN: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 @@ -44,8 +44,8 @@ define float @fdiv_nnan_f32(float %a, float %b) #0 { ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32 [[COPY2]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec - ; GCN: %8:vgpr_32, %9:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32 [[COPY1]], [[COPY1]], [[COPY2]], implicit $mode, implicit $exec + ; GCN: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN: %8:vgpr_32, %9:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GCN: %10:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 diff --git a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir index 2a1442b..ab725ed 100644 --- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -55,7 +55,7 @@ body: | S_BRANCH %bb.3 bb.3: - $vgpr4, $vcc = V_DIV_SCALE_F32 $vgpr1, $vgpr1, $vgpr3, implicit $mode, implicit $exec + $vgpr4, $vcc = V_DIV_SCALE_F32 0, $vgpr1, 0, $vgpr1, 0, $vgpr3, 0, 0, implicit $mode, implicit $exec $vgpr0 = V_DIV_FMAS_F32 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll index 25c606c..2dd7430 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -322,8 +322,7 @@ define amdgpu_kernel 
void @test_div_scale_f32_inline_imm_den(float addrspace(1)* ; SI-LABEL: {{^}}test_div_scale_f32_fneg_num: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[NEG_A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], -[[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_fneg_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -368,8 +367,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, ; SI-LABEL: {{^}}test_div_scale_f32_fneg_den: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_xor_b32_e32 [[NEG_B:v[0-9]+]], 0x80000000, [[B]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[NEG_B]], [[NEG_B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], -[[B]], -[[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_fneg_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir index 6e0d016..0fab87d 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -288,9 +288,9 @@ body: | %87:vgpr_32 = IMPLICIT_DEF %88:vgpr_32 = IMPLICIT_DEF %90:vgpr_32 = IMPLICIT_DEF - %91:vgpr_32, dead %92:sreg_64 = nofpexcept V_DIV_SCALE_F32 %90, %90, 1065353216, implicit $mode, 
implicit $exec + %91:vgpr_32, dead %92:sreg_64 = nofpexcept V_DIV_SCALE_F32 0, %90, 0, %90, 0, 1065353216, 0, 0, implicit $mode, implicit $exec %95:vgpr_32 = nofpexcept V_FMA_F32 0, 0, 0, 0, 0, undef %93:vgpr_32, 0, 0, implicit $mode, implicit $exec - %96:vgpr_32, %97:sreg_64 = nofpexcept V_DIV_SCALE_F32 1065353216, %90, 1065353216, implicit $mode, implicit $exec + %96:vgpr_32, %97:sreg_64 = nofpexcept V_DIV_SCALE_F32 0, 1065353216, 0, %90, 0, 1065353216, 0, 0, implicit $mode, implicit $exec %98:vgpr_32 = IMPLICIT_DEF %99:vgpr_32 = IMPLICIT_DEF %100:vgpr_32 = IMPLICIT_DEF @@ -299,11 +299,11 @@ body: | %103:vgpr_32 = IMPLICIT_DEF %104:vgpr_32 = IMPLICIT_DEF %105:vgpr_32 = IMPLICIT_DEF - %106:vgpr_32, dead %107:sreg_64 = nofpexcept V_DIV_SCALE_F32 %90, %90, %105, implicit $mode, implicit $exec + %106:vgpr_32, dead %107:sreg_64 = nofpexcept V_DIV_SCALE_F32 0, %90, 0, %90, 0, %105, 0, 0, implicit $mode, implicit $exec %108:vgpr_32 = nofpexcept V_RCP_F32_e32 0, implicit $mode, implicit $exec %109:vgpr_32 = IMPLICIT_DEF %110:vgpr_32 = nofpexcept V_FMA_F32 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - %111:vgpr_32, %112:sreg_64 = nofpexcept V_DIV_SCALE_F32 0, 0, 0, implicit $mode, implicit $exec + %111:vgpr_32, %112:sreg_64 = nofpexcept V_DIV_SCALE_F32 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %113:vgpr_32 = nofpexcept V_MUL_F32_e32 0, %110, implicit $mode, implicit $exec %114:vgpr_32 = IMPLICIT_DEF %115:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/MC/AMDGPU/vop3-errs.s b/llvm/test/MC/AMDGPU/vop3-errs.s index 2d59d55..166e0a1 100644 --- a/llvm/test/MC/AMDGPU/vop3-errs.s +++ b/llvm/test/MC/AMDGPU/vop3-errs.s @@ -92,3 +92,13 @@ v_interp_p1ll_f16 v5, v2, attr31.x v0 v_interp_p2_f16 v5, v2, attr1.x, v3 mul:2 // GFX67: error: instruction not supported on this GPU // GFX89: error: invalid operand for instruction + +// +// v_div_scale_* +// + +v_div_scale_f32 v24, vcc, v22, v22, |v20| +// GCN: error: ABS not allowed in VOP3B instructions + 
+v_div_scale_f64 v[24:25], vcc, -|v[22:23]|, v[22:23], v[20:21] +// GCN: error: ABS not allowed in VOP3B instructions diff --git a/llvm/test/MC/AMDGPU/vop3.s b/llvm/test/MC/AMDGPU/vop3.s index 580b928..eb11d6e 100644 --- a/llvm/test/MC/AMDGPU/vop3.s +++ b/llvm/test/MC/AMDGPU/vop3.s @@ -411,14 +411,34 @@ v_div_scale_f64 v[24:25], vcc, v[22:23], v[22:23], v[20:21] // SICI: v_div_scale_f64 v[24:25], vcc, v[22:23], v[22:23], v[20:21] ; encoding: [0x18,0x6a,0xdc,0xd2,0x16,0x2d,0x52,0x04] // VI: v_div_scale_f64 v[24:25], vcc, v[22:23], v[22:23], v[20:21] ; encoding: [0x18,0x6a,0xe1,0xd1,0x16,0x2d,0x52,0x04] -v_div_scale_f64 v[24:25], s[10:11], v[22:23], v[20:21], v[20:21] -// SICI: v_div_scale_f64 v[24:25], s[10:11], v[22:23], v[20:21], v[20:21] ; encoding: [0x18,0x0a,0xdc,0xd2,0x16,0x29,0x52,0x04] -// VI: v_div_scale_f64 v[24:25], s[10:11], v[22:23], v[20:21], v[20:21] ; encoding: [0x18,0x0a,0xe1,0xd1,0x16,0x29,0x52,0x04] +v_div_scale_f64 v[24:25], s[10:11], -v[22:23], v[20:21], v[20:21] clamp +// SICI: v_div_scale_f64 v[24:25], s[10:11], -v[22:23], v[20:21], v[20:21] clamp ; encoding: [0x18,0x0a,0xdc,0xd2,0x16,0x29,0x52,0x24] +// VI: v_div_scale_f64 v[24:25], s[10:11], -v[22:23], v[20:21], v[20:21] clamp ; encoding: [0x18,0x8a,0xe1,0xd1,0x16,0x29,0x52,0x24] + +v_div_scale_f64 v[24:25], s[10:11], v[22:23], -v[20:21], v[20:21] clamp mul:2 +// SICI: v_div_scale_f64 v[24:25], s[10:11], v[22:23], -v[20:21], v[20:21] clamp mul:2 ; encoding: [0x18,0x0a,0xdc,0xd2,0x16,0x29,0x52,0x4c] +// VI: v_div_scale_f64 v[24:25], s[10:11], v[22:23], -v[20:21], v[20:21] clamp mul:2 ; encoding: [0x18,0x8a,0xe1,0xd1,0x16,0x29,0x52,0x4c] + +v_div_scale_f64 v[24:25], s[10:11], v[22:23], v[20:21], -v[20:21] +// SICI: v_div_scale_f64 v[24:25], s[10:11], v[22:23], v[20:21], -v[20:21] ; encoding: [0x18,0x0a,0xdc,0xd2,0x16,0x29,0x52,0x84] +// VI: v_div_scale_f64 v[24:25], s[10:11], v[22:23], v[20:21], -v[20:21] ; encoding: [0x18,0x0a,0xe1,0xd1,0x16,0x29,0x52,0x84] v_div_scale_f32 v24, vcc, v22, v22, 
v20 // SICI: v_div_scale_f32 v24, vcc, v22, v22, v20 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0x52,0x04] // VI: v_div_scale_f32 v24, vcc, v22, v22, v20 ; encoding: [0x18,0x6a,0xe0,0xd1,0x16,0x2d,0x52,0x04] +v_div_scale_f32 v24, vcc, -v22, v22, v20 +// SICI: v_div_scale_f32 v24, vcc, -v22, v22, v20 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0x52,0x24] +// VI: v_div_scale_f32 v24, vcc, -v22, v22, v20 ; encoding: [0x18,0x6a,0xe0,0xd1,0x16,0x2d,0x52,0x24] + +v_div_scale_f32 v24, vcc, v22, -v22, v20 clamp +// SICI: v_div_scale_f32 v24, vcc, v22, -v22, v20 clamp ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0x52,0x44] +// VI: v_div_scale_f32 v24, vcc, v22, -v22, v20 clamp ; encoding: [0x18,0xea,0xe0,0xd1,0x16,0x2d,0x52,0x44] + +v_div_scale_f32 v24, vcc, v22, v22, -v20 clamp div:2 +// SICI: v_div_scale_f32 v24, vcc, v22, v22, -v20 clamp div:2 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0x52,0x9c] +// VI: v_div_scale_f32 v24, vcc, v22, v22, -v20 clamp div:2 ; encoding: [0x18,0xea,0xe0,0xd1,0x16,0x2d,0x52,0x9c] + v_div_scale_f32 v24, s[10:11], v22, v22, v20 // SICI: v_div_scale_f32 v24, s[10:11], v22, v22, v20 ; encoding: [0x18,0x0a,0xda,0xd2,0x16,0x2d,0x52,0x04] // VI: v_div_scale_f32 v24, s[10:11], v22, v22, v20 ; encoding: [0x18,0x0a,0xe0,0xd1,0x16,0x2d,0x52,0x04] -- 2.7.4