From 4672bac1776e2245abfd671e208fd56d21cca1ad Mon Sep 17 00:00:00 2001
From: Piotr Sobczak
Date: Wed, 3 Mar 2021 10:18:28 +0100
Subject: [PATCH] [AMDGPU] Introduce Strict WQM mode

* Add amdgcn_strict_wqm intrinsic.
* Add a corresponding STRICT_WQM machine instruction.
* The semantics are similar to amdgcn_strict_wwm, with the notable
  difference that not all threads are forcibly enabled during the
  computation of the intrinsic's argument, but only the threads in
  quads that have at least one active thread.
* The difference between amdgcn_wqm and amdgcn_strict_wqm is that in
  strict mode an inactive lane will always be enabled, irrespective of
  control flow decisions.

Reviewed By: critson

Differential Revision: https://reviews.llvm.org/D96258
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td          |   4 +
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp     |   3 +
 .../Target/AMDGPU/AMDGPUInstructionSelector.cpp   |   2 +
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp |   1 +
 llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp        |   1 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp            |  22 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td          |  15 +
 llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp   |  40 ++-
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp        | 232 +++++++++-----
 llvm/test/CodeGen/AMDGPU/wqm.ll                   | 353 +++++++++++++++++++++
 10 files changed, 579 insertions(+), 94 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 82bb012..bba6972 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1621,6 +1621,10 @@ def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
   [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
   IntrConvergent, IntrWillReturn]
 >;
+def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
+  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
+  IntrConvergent, IntrWillReturn]
+>;
 
 // Given a value, copies it while setting all the inactive lanes to a given
 // value.
Note that OpenGL helper lanes are considered active, so if the diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 415c92a..619a891 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2645,6 +2645,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { case Intrinsic::amdgcn_strict_wwm: Opcode = AMDGPU::STRICT_WWM; break; + case Intrinsic::amdgcn_strict_wqm: + Opcode = AMDGPU::STRICT_WQM; + break; case Intrinsic::amdgcn_interp_p1_f16: SelectInterpP1F16(N); return; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ffd801d..8e83100 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -930,6 +930,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { case Intrinsic::amdgcn_strict_wwm: case Intrinsic::amdgcn_wwm: return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM); + case Intrinsic::amdgcn_strict_wqm: + return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM); case Intrinsic::amdgcn_writelane: return selectWritelane(I); case Intrinsic::amdgcn_div_scale: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 244d4b3..477f04e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3958,6 +3958,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mov_dpp: case Intrinsic::amdgcn_strict_wwm: case Intrinsic::amdgcn_wwm: + case Intrinsic::amdgcn_strict_wqm: case Intrinsic::amdgcn_wqm: case Intrinsic::amdgcn_softwqm: case Intrinsic::amdgcn_set_inactive: diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 93a8935..d5c56bf 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -581,6 +581,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { continue; case AMDGPU::COPY: case AMDGPU::WQM: + case AMDGPU::STRICT_WQM: case AMDGPU::SOFT_WQM: case AMDGPU::STRICT_WWM: { Register DstReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f195208..5a7ef04 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1949,9 +1949,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { : AMDGPU::S_OR_SAVEEXEC_B64)); break; } - case AMDGPU::EXIT_STRICT_WWM: { + case AMDGPU::ENTER_STRICT_WQM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when - // Whole Wave Mode is exited. + // STRICT_WQM is entered. + const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; + const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec); + BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec); + + MI.eraseFromParent(); + break; + } + case AMDGPU::EXIT_STRICT_WWM: + case AMDGPU::EXIT_STRICT_WQM: { + // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when + // WWM/STICT_WQM is exited. MI.setDesc(get(ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } @@ -4407,6 +4420,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::WQM: return AMDGPU::WQM; case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; + case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; case AMDGPU::S_MOV_B32: { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); return MI.getOperand(1).isReg() || @@ -6643,6 +6657,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( case AMDGPU::WQM: case AMDGPU::SOFT_WQM: case AMDGPU::STRICT_WWM: + case AMDGPU::STRICT_WQM: case AMDGPU::REG_SEQUENCE: case AMDGPU::PHI: case AMDGPU::INSERT_SUBREG: @@ -6800,7 +6815,8 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::INSERT_SUBREG: case AMDGPU::WQM: case AMDGPU::SOFT_WQM: - case AMDGPU::STRICT_WWM: { + case AMDGPU::STRICT_WWM: + case AMDGPU::STRICT_WQM: { const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); if (RI.hasAGPRs(SrcRC)) { if (RI.hasAGPRs(NewDstRC)) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 9263a14..20d591c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -125,6 +125,7 @@ def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; // accidentally clobber inactive channels of $vdst. let Constraints = "@earlyclobber $vdst" in { def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; +def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; } } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] @@ -143,6 +144,20 @@ def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { let mayStore = 0; } +def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { + let Uses = [EXEC]; + let Defs = [EXEC, SCC]; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + +def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + // Invert the exec mask and overwrite the inactive lanes of dst with inactive, // restoring it after we're done. 
let Defs = [SCC] in { diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index 51f4ea7..07ff8ef 100644 --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -38,6 +38,9 @@ private: RegisterClassInfo RegClassInfo; std::vector<unsigned> RegsToRewrite; +#ifndef NDEBUG + void printWWMInfo(const MachineInstr &MI); +#endif public: static char ID; @@ -154,6 +157,31 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { MRI->freezeReservedRegs(MF); } +#ifndef NDEBUG +LLVM_DUMP_METHOD void +SIPreAllocateWWMRegs::printWWMInfo(const MachineInstr &MI) { + + unsigned Opc = MI.getOpcode(); + + if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::ENTER_STRICT_WQM) { + dbgs() << "Entering "; + } else { + assert(Opc == AMDGPU::EXIT_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WQM); + dbgs() << "Exiting "; + } + + if (Opc == AMDGPU::ENTER_STRICT_WWM || Opc == AMDGPU::EXIT_STRICT_WWM) { + dbgs() << "Strict WWM "; + } else { + assert(Opc == AMDGPU::ENTER_STRICT_WQM || Opc == AMDGPU::EXIT_STRICT_WQM); + dbgs() << "Strict WQM "; + } + + dbgs() << "region: " << MI; +} + +#endif + bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "SIPreAllocateWWMRegs: function " << MF.getName() << "\n"); @@ -185,21 +213,23 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64) RegsAssigned |= processDef(MI.getOperand(0)); - if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM) { - LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n"); + if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM || + MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM) { + LLVM_DEBUG(printWWMInfo(MI)); InWWM = true; continue; } - if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM) { - LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n"); + if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM || + MI.getOpcode() == AMDGPU::EXIT_STRICT_WQM) { + LLVM_DEBUG(printWWMInfo(MI)); InWWM = false; } if (!InWWM) continue; - LLVM_DEBUG(dbgs() << "processing " << MI << "\n"); + LLVM_DEBUG(dbgs() << "Processing " << MI); for (MachineOperand &DefOpnd : MI.defs()) { RegsAssigned |= processDef(DefOpnd); diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 2fe7a73..f2047f2 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -7,8 +7,13 @@ //===----------------------------------------------------------------------===// // /// \file -/// This pass adds instructions to enable whole quad mode for pixel -/// shaders, and whole wavefront mode for all programs. +/// This pass adds instructions to enable whole quad mode (strict or non-strict) +/// for pixel shaders, and strict whole wavefront mode for all programs. +/// +/// The "strict" prefix indicates that inactive lanes do not take part in +/// control flow; specifically, an inactive lane enabled by a strict WQM/WWM will +/// always be enabled irrespective of control flow decisions. Conversely, in +/// non-strict WQM inactive lanes may take part in control flow decisions. /// /// Whole quad mode is required for derivative computations, but it interferes /// with shader side effects (stores and atomics). It ensures that WQM is /// @@ -26,12 +31,21 @@ /// ...
/// S_MOV_B64 EXEC, Tmp /// -/// We also compute when a sequence of instructions requires Whole Wavefront -/// Mode (StrictWWM) and insert instructions to save and restore it: +/// We also compute when a sequence of instructions requires strict whole +/// wavefront mode (StrictWWM) and insert instructions to save and restore it: +/// +/// S_OR_SAVEEXEC_B64 Tmp, -1 +/// ... +/// S_MOV_B64 EXEC, Tmp +/// +/// When a sequence of instructions requires strict whole quad mode (StrictWQM) +/// we use a similar save and restore mechanism and force whole quad mode for +/// those instructions: /// -/// S_OR_SAVEEXEC_B64 Tmp, -1 -/// ... -/// S_MOV_B64 EXEC, Tmp +/// S_MOV_B64 Tmp, EXEC +/// S_WQM_B64 EXEC, EXEC +/// ... +/// S_MOV_B64 EXEC, Tmp /// /// In order to avoid excessive switching during sequences of Exact /// instructions, the pass first analyzes which instructions must be run in WQM @@ -77,7 +91,9 @@ namespace { enum { StateWQM = 0x1, StateStrictWWM = 0x2, - StateExact = 0x4, + StateStrictWQM = 0x4, + StateExact = 0x8, + StateStrict = StateStrictWWM | StateStrictWQM, }; struct PrintState { @@ -89,19 +105,23 @@ public: #ifndef NDEBUG static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { - if (PS.State & StateWQM) - OS << "WQM"; - if (PS.State & StateStrictWWM) { - if (PS.State & StateWQM) - OS << '|'; - OS << "StrictWWM"; - } - if (PS.State & StateExact) { - if (PS.State & (StateWQM | StateStrictWWM)) - OS << '|'; - OS << "Exact"; - } + static const std::pair Mapping[] = { + std::make_pair(StateWQM, "WQM"), + std::make_pair(StateStrictWWM, "StrictWWM"), + std::make_pair(StateStrictWQM, "StrictWQM"), + std::make_pair(StateExact, "Exact")}; + char State = PS.State; + for (auto M : Mapping) { + if (State & M.first) { + OS << M.second; + State &= ~M.first; + + if (State) + OS << '|'; + } + } + assert(State == 0); return OS; } #endif @@ -151,7 +171,7 @@ private: DenseMap Instructions; MapVector Blocks; - // Tracks state (WQM/StrictWWM/Exact) after a given instruction + // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction DenseMap StateTransition; SmallVector LiveMaskQueries; @@ -184,10 +204,11 @@ private: Register SaveWQM); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, Register SavedWQM); - void toStrictWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - Register SaveOrig); - void fromStrictWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - Register SavedOrig, char NonStrictWWMState); + void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + Register SaveOrig, char StrictStateNeeded); + void fromStrictMode(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, Register SavedOrig, + char NonStrictState, char CurrentStrictState); MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); @@ -473,9 +494,17 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, GlobalFlags |= StateStrictWWM; LowerToMovInstrs.push_back(&MI); continue; + } else if (Opcode == AMDGPU::STRICT_WQM) { + // STRICT_WQM is similar to STRICTWWM, but instead of enabling all + // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in + // quads that have at least one active thread. 
+ markInstructionUses(MI, StateStrictWQM, Worklist); + GlobalFlags |= StateStrictWQM; + LowerToMovInstrs.push_back(&MI); + continue; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || Opcode == AMDGPU::V_SET_INACTIVE_B64) { - III.Disabled = StateStrictWWM; + III.Disabled = StateStrict; MachineOperand &Inactive = MI.getOperand(2); if (Inactive.isReg()) { if (Inactive.isUndef()) { @@ -493,7 +522,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, Worklist.push_back(&MBB); } GlobalFlags |= StateExact; - III.Disabled = StateWQM | StateStrictWWM; + III.Disabled = StateWQM | StateStrict; continue; } else { if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) { @@ -570,7 +599,7 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, // Propagate backwards within block if (MachineInstr *PrevMI = MI.getPrevNode()) { - char InNeeds = (II.Needs & ~StateStrictWWM) | II.OutNeeds; + char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds; if (!PrevMI->isPHI()) { InstrInfo &PrevII = Instructions[PrevMI]; if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { @@ -586,10 +615,12 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, if (II.Needs != 0) markInstructionUses(MI, II.Needs, Worklist); - // Ensure we process a block containing StrictWWM, even if it does not require - // any WQM transitions. + // Ensure we process a block containing StrictWWM/StrictWQM, even if it does + // not require any WQM transitions. if (II.Needs & StateStrictWWM) BI.Needs |= StateStrictWWM; + if (II.Needs & StateStrictWQM) + BI.Needs |= StateStrictWQM; } void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, @@ -1105,30 +1136,48 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, StateTransition[MI] = StateWQM; } -void SIWholeQuadMode::toStrictWWM(MachineBasicBlock &MBB, - MachineBasicBlock::iterator Before, - Register SaveOrig) { +void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + Register SaveOrig, char StrictStateNeeded) { MachineInstr *MI; - assert(SaveOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM), - SaveOrig) - .addImm(-1); + assert(StrictStateNeeded == StateStrictWWM || + StrictStateNeeded == StateStrictWQM); + + if (StrictStateNeeded == StateStrictWWM) { + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM), + SaveOrig) + .addImm(-1); + } else { + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM), + SaveOrig) + .addImm(-1); + } LIS->InsertMachineInstrInMaps(*MI); StateTransition[MI] = StateStrictWWM; } -void SIWholeQuadMode::fromStrictWWM(MachineBasicBlock &MBB, - MachineBasicBlock::iterator Before, - Register SavedOrig, - char NonStrictWWMState) { +void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + Register SavedOrig, char NonStrictState, + char CurrentStrictState) { MachineInstr *MI; assert(SavedOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), Exec) - .addReg(SavedOrig); + assert(CurrentStrictState == StateStrictWWM || + CurrentStrictState == StateStrictWQM); + + if (CurrentStrictState == StateStrictWWM) { + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), + Exec) + .addReg(SavedOrig); + } else { + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM), + Exec) + .addReg(SavedOrig); + } LIS->InsertMachineInstrInMaps(*MI); - StateTransition[MI] = NonStrictWWMState; + StateTransition[MI] = NonStrictState; } void 
SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { @@ -1149,10 +1198,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { << ":\n"); Register SavedWQMReg; - Register SavedNonStrictWWMReg; + Register SavedNonStrictReg; bool WQMFromExec = IsEntry; char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; - char NonStrictWWMState = 0; + char NonStrictState = 0; const TargetRegisterClass *BoolRC = TRI->getBoolRC(); auto II = MBB.getFirstNonPHI(), IE = MBB.end(); @@ -1166,25 +1215,25 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { // Exact or vice versa. MachineBasicBlock::iterator FirstWQM = IE; - // This stores the first instruction where it's safe to switch from StrictWWM - // to Exact/WQM or to switch to StrictWWM. It must always be the same as, or - // after, FirstWQM since if it's safe to switch to/from StrictWWM, it must be - // safe to switch to/from WQM as well. - MachineBasicBlock::iterator FirstStrictWWM = IE; + // This stores the first instruction where it's safe to switch from Strict + // mode to Exact/WQM or to switch to Strict mode. It must always be the same + // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must + // be safe to switch to/from WQM as well. + MachineBasicBlock::iterator FirstStrict = IE; // Record initial state is block information. BI.InitialState = State; for (;;) { MachineBasicBlock::iterator Next = II; - char Needs = StateExact | StateWQM; // StrictWWM is disabled by default + char Needs = StateExact | StateWQM; // Strict mode is disabled by default. char OutNeeds = 0; if (FirstWQM == IE) FirstWQM = II; - if (FirstStrictWWM == IE) - FirstStrictWWM = II; + if (FirstStrict == IE) + FirstStrict = II; // First, figure out the allowed states (Needs) based on the propagated // flags. @@ -1196,6 +1245,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { if (III != Instructions.end()) { if (III->second.Needs & StateStrictWWM) Needs = StateStrictWWM; + else if (III->second.Needs & StateStrictWQM) + Needs = StateStrictWQM; else if (III->second.Needs & StateWQM) Needs = StateWQM; else @@ -1204,8 +1255,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { } } else { // If the instruction doesn't actually need a correct EXEC, then we can - // safely leave StrictWWM enabled. - Needs = StateExact | StateWQM | StateStrictWWM; + // safely leave Strict mode enabled. + Needs = StateExact | StateWQM | StateStrict; } if (MI.isTerminator() && OutNeeds == StateExact) @@ -1225,27 +1276,28 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { // Now, transition if necessary. if (!(Needs & State)) { MachineBasicBlock::iterator First; - if (State == StateStrictWWM || Needs == StateStrictWWM) { - // We must switch to or from StrictWWM - First = FirstStrictWWM; + if (State == StateStrictWWM || Needs == StateStrictWWM || + State == StateStrictWQM || Needs == StateStrictWQM) { + // We must switch to or from Strict mode. + First = FirstStrict; } else { - // We only need to switch to/from WQM, so we can use FirstWQM + // We only need to switch to/from WQM, so we can use FirstWQM. First = FirstWQM; } - // Whether we need to save SCC depends on start and end states + // Whether we need to save SCC depends on start and end states. 
bool SaveSCC = false; switch (State) { case StateExact: case StateStrictWWM: - // Exact/WWM -> WWM: save SCC - // Exact/WWM -> WQM: save SCC if WQM mask is generated from exec - // Exact/WWM -> Exact: no save - SaveSCC = - (Needs & StateStrictWWM) || ((Needs & StateWQM) && WQMFromExec); + case StateStrictWQM: + // Exact/Strict -> Strict: save SCC + // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec + // Exact/Strict -> Exact: no save + SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec); break; case StateWQM: - // WQM -> Exact/WMM: save SCC + // WQM -> Exact/Strict: save SCC SaveSCC = !(Needs & StateWQM); break; default: @@ -1255,20 +1307,25 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { MachineBasicBlock::iterator Before = prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC); - if (State == StateStrictWWM) { - assert(SavedNonStrictWWMReg); - fromStrictWWM(MBB, Before, SavedNonStrictWWMReg, NonStrictWWMState); - LIS->createAndComputeVirtRegInterval(SavedNonStrictWWMReg); - SavedNonStrictWWMReg = 0; - State = NonStrictWWMState; + if (State & StateStrict) { + assert(State == StateStrictWWM || State == StateStrictWQM); + assert(SavedNonStrictReg); + fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State); + + LIS->createAndComputeVirtRegInterval(SavedNonStrictReg); + SavedNonStrictReg = 0; + State = NonStrictState; } - if (Needs == StateStrictWWM) { - NonStrictWWMState = State; - assert(!SavedNonStrictWWMReg); - SavedNonStrictWWMReg = MRI->createVirtualRegister(BoolRC); - toStrictWWM(MBB, Before, SavedNonStrictWWMReg); - State = StateStrictWWM; + if (Needs & StateStrict) { + NonStrictState = State; + assert(Needs == StateStrictWWM || Needs == StateStrictWQM); + assert(!SavedNonStrictReg); + SavedNonStrictReg = MRI->createVirtualRegister(BoolRC); + + toStrictMode(MBB, Before, SavedNonStrictReg, Needs); + State = Needs; + } else { if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { if (!WQMFromExec && (OutNeeds & StateWQM)) { @@ -1298,10 +1355,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { } } - if (Needs != (StateExact | StateWQM | StateStrictWWM)) { + if (Needs != (StateExact | StateWQM | StateStrict)) { if (Needs != (StateExact | StateWQM)) FirstWQM = IE; - FirstStrictWWM = IE; + FirstStrict = IE; } if (II == IE) @@ -1310,7 +1367,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { II = Next; } assert(!SavedWQMReg); - assert(!SavedNonStrictWWMReg); + assert(!SavedNonStrictReg); } void SIWholeQuadMode::lowerLiveMaskQueries() { @@ -1402,6 +1459,10 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { } bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName() + << " ------------- \n"); + LLVM_DEBUG(MF.dump();); + Instructions.clear(); Blocks.clear(); LiveMaskQueries.clear(); @@ -1442,10 +1503,9 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LiveMaskReg = Exec; - // Shader is simple does not need WQM/StrictWWM or any complex lowering - if (!(GlobalFlags & (StateWQM | StateStrictWWM)) && - LowerToCopyInstrs.empty() && LowerToMovInstrs.empty() && - KillInstrs.empty()) { + // Shader is simple does not need any state changes or any complex lowering + if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && + LowerToMovInstrs.empty() && KillInstrs.empty()) { lowerLiveMaskQueries(); return !LiveMaskQueries.empty(); } 
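As a reader's aid before the test changes (this note and the sketch are not part of the patch), here is a minimal LLVM IR example of how the new intrinsic is expected to be used and lowered, based on the SIWholeQuadMode comments above. The function name and buffer arguments are made up for illustration, and the annotated machine code shows the intended ENTER_STRICT_WQM/EXIT_STRICT_WQM expansion pattern rather than guaranteed compiler output.

; Illustrative sketch only. The computation feeding llvm.amdgcn.strict.wqm is
; expected to be wrapped by the pass in an ENTER_STRICT_WQM / EXIT_STRICT_WQM
; pair, which expands roughly to:
;   s_mov_b64 s[N:N+1], exec   ; save the current exec mask
;   s_wqm_b64 exec, exec       ; enable every lane of any partially active quad
;   ...                        ; strict-WQM computation
;   s_mov_b64 exec, s[N:N+1]   ; restore the saved exec mask
declare float @llvm.amdgcn.strict.wqm.f32(float)
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32)

define amdgpu_ps float @strict_wqm_sketch(<4 x i32> inreg %rsrc, i32 inreg %idx) {
main_body:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
  ; The computation of %sum (and, transitively, its inputs) is expected to run
  ; in strict WQM; the final fadd below runs with the original exec mask.
  %sum = fadd float %src, %src
  %sum.wqm = call float @llvm.amdgcn.strict.wqm.f32(float %sum)
  %out = fadd float %sum.wqm, %src
  ret float %out
}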
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 23f568f..20c44ab 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -186,13 +186,17 @@ main_body: ; Check that we don't leave WWM on for computations that don't require WWM, ; since that will lead clobbering things that aren't supposed to be clobbered ; in cases like this. +; We enforce this by checking that v_add gets emitted in the same block as +; WWM computations. ; ;CHECK-LABEL: {{^}}test_wwm3: +;CHECK: %if ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK: v_add_f32_e32 +;CHECK: %endif define amdgpu_ps float @test_wwm3(i32 inreg %idx) { main_body: ; use mbcnt to make sure the branch is divergent @@ -215,13 +219,17 @@ endif: ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM ; write could clobber disabled channels in the non-WWM one. +; We enforce this by checking that v_mov gets emitted in the same block as +; WWM computations. ; ;CHECK-LABEL: {{^}}test_wwm4: +;CHECK: %if ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK-NEXT: v_mov_b32_e32 +;CHECK: %endif define amdgpu_ps float @test_wwm4(i32 inreg %idx) { main_body: ; use mbcnt to make sure the branch is divergent @@ -277,6 +285,7 @@ main_body: ;VI-CHECK: flat_load_dword ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG2]] +;CHECK: %endif define amdgpu_ps float @test_wwm6_then() { main_body: %src0 = load volatile float, float addrspace(1)* undef @@ -310,6 +319,7 @@ endif: ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword ;CHECK: s_mov_b64 exec, [[ORIG2]] +;CHECK: %endloop define amdgpu_ps float @test_wwm6_loop() { main_body: %src0 = load volatile float, float addrspace(1)* undef @@ -352,6 +362,208 @@ main_body: ret void } +; Check that Strict WQM is triggered by the strict_wqm intrinsic. +; +;CHECK-LABEL: {{^}}test_strict_wqm1: +;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) + ret float %out.0 +} + +; Same as above, but with an integer type. 
+; +;CHECK-LABEL: {{^}}test_strict_wqm2: +;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: buffer_load_dword +;CHECK: v_add_{{[iu]}}32_e32 +define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0.0 = bitcast float %src0 to i32 + %src1.0 = bitcast float %src1 to i32 + %out = add i32 %src0.0, %src1.0 + %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out) + %out.1 = bitcast i32 %out.0 to float + ret float %out.1 +} + +; Check that we don't leave Strict WQM on for computations that don't require it, +; since that will lead clobbering things that aren't supposed to be clobbered +; in cases like this. +; We enforce this by checking that v_add gets emitted in the same block as +; WWM computations. +; +;CHECK-LABEL: {{^}}test_strict_wqm3: +;CHECK: %if +;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK: v_add_f32_e32 +;CHECK: %endif +define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 32 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) + %out.1 = fadd float %src, %out.0 + br label %endif + +endif: + %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] + ret float %out.2 +} + +; Check that Strict WQM writes aren't coalesced with non-strict writes, since +; the Strict WQM write could clobber disabled channels in the non-strict one. +; We enforce this by checking that v_mov gets emitted in the same block as +; WWM computations. +; +;CHECK-LABEL: {{^}}test_strict_wqm4: +;CHECK: %if +;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK-NEXT: v_mov_b32_e32 +;CHECK: %endif +define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 32 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) + br label %endif + +endif: + %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] + ret float %out.1 +} + +; Make sure the transition from Exact to Strict WQM then WQM works properly. 
+; +;CHECK-LABEL: {{^}}test_strict_wqm5: +;CHECK: buffer_load_dword +;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: buffer_store_dword +;CHECK: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK: s_wqm_b64 exec, exec +define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %temp = fadd float %src1, %src1 + %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp) + %out = fadd float %temp.0, %temp.0 + %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) + ret float %out.0 +} + +; Check that Strict WQM is turned on correctly across basic block boundaries. +; if..then..endif version +; +;CHECK-LABEL: {{^}}test_strict_wqm6_then: +;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;SI-CHECK: buffer_load_dword +;VI-CHECK: flat_load_dword +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK: %if +;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;SI-CHECK: buffer_load_dword +;VI-CHECK: flat_load_dword +;CHECK: v_add_f32_e32 +;CHECK: s_mov_b64 exec, [[ORIG2]] +;CHECK: %endif +define amdgpu_ps float @test_strict_wqm6_then() { +main_body: + %src0 = load volatile float, float addrspace(1)* undef + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 32 + br i1 %cc, label %endif, label %if + +if: + %src1 = load volatile float, float addrspace(1)* undef + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) + br label %endif + +endif: + %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] + ret float %out.1 +} + +; Check that Strict WQM is turned on correctly across basic block boundaries. +; loop version +; +;CHECK-LABEL: {{^}}test_strict_wqm6_loop: +;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;SI-CHECK: buffer_load_dword +;VI-CHECK: flat_load_dword +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK: %loop +;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;SI-CHECK: buffer_load_dword +;VI-CHECK: flat_load_dword +;CHECK: s_mov_b64 exec, [[ORIG2]] +;CHECK: %endloop +define amdgpu_ps float @test_strict_wqm6_loop() { +main_body: + %src0 = load volatile float, float addrspace(1)* undef + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + br label %loop + +loop: + %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ] + %src1 = load volatile float, float addrspace(1)* undef + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) + %counter.1 = sub i32 %counter, 1 + %cc = icmp ne i32 %counter.1, 0 + br i1 %cc, label %loop, label %endloop + +endloop: + ret float %out.0 +} + ; Check that enabling WQM anywhere enables WQM for the set.inactive source. 
; ;CHECK-LABEL: {{^}}test_set_inactive2: @@ -862,13 +1074,17 @@ main_body: ; Check that we don't leave WWM on for computations that don't require WWM, ; since that will lead clobbering things that aren't supposed to be clobbered ; in cases like this. +; We enforce this by checking that v_add gets emitted in the same block as +; WWM computations. ; ;CHECK-LABEL: {{^}}test_strict_wwm3: +;CHECK: %if ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK: v_add_f32_e32 +;CHECK: %endif define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { main_body: ; use mbcnt to make sure the branch is divergent @@ -891,13 +1107,17 @@ endif: ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM ; write could clobber disabled channels in the non-WWM one. +; We enforce this by checking that v_mov gets emitted in the same block as +; WWM computations. ; ;CHECK-LABEL: {{^}}test_strict_wwm4: +;CHECK: %if ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK-NEXT: v_mov_b32_e32 +;CHECK: %endif define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { main_body: ; use mbcnt to make sure the branch is divergent @@ -953,6 +1173,7 @@ main_body: ;VI-CHECK: flat_load_dword ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG2]] +;CHECK: %endif define amdgpu_ps float @test_strict_wwm6_then() { main_body: %src0 = load volatile float, float addrspace(1)* undef @@ -986,6 +1207,7 @@ endif: ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword ;CHECK: s_mov_b64 exec, [[ORIG2]] +;CHECK: %endloop define amdgpu_ps float @test_strict_wwm6_loop() { main_body: %src0 = load volatile float, float addrspace(1)* undef @@ -1059,7 +1281,135 @@ ENDIF: ret float %r } +; Check a case of a block being entirely WQM except for a bit of STRICT WQM. 
+; +;CHECK-LABEL: {{^}}test_strict_wqm_within_wqm: +;CHECK: %IF +;CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: ds_swizzle +; +define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +main_body: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %IF, label %ENDIF +IF: + %dataf = extractelement <4 x float> %dtex, i32 0 + %data1 = fptosi float %dataf to i32 + %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079) + %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2) + %data3f = sitofp i32 %data3 to float + br label %ENDIF + +ENDIF: + %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ] + ret float %r +} + +;CHECK-LABEL: {{^}}test_strict_wqm_strict_wwm_wqm: +;CHECK: buffer_store_dword + +;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: s_mov_b64 exec, [[ORIG]] + +;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1 +;CHECK: buffer_load_dword +;CHECK: s_mov_b64 exec, [[ORIG2]] + +;CHECK: s_mov_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: v_add +;CHECK: s_mov_b64 exec, [[ORIG3]] + +;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again. +;CHECK: s_wqm_b64 exec, exec +;CHECK: image_sample + +define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, <4 x i32> inreg %res2, float %inp, <8 x i32> inreg %res3) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) + %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) + %temp = fadd float %reload, %reload + %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp) + %temp3 = fadd float %temp2, %temp2 + %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res2, i32 %idx0, i32 0, i32 0, i32 0) + %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm) + %temp5 = fadd float %temp3, %temp4 + %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res, i1 false, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) + %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) + ret float %out +} + +;CHECK-LABEL: {{^}}test_strict_wwm_strict_wqm_wqm: +;CHECK: buffer_store_dword + +;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 +;CHECK: buffer_load_dword +;CHECK: s_mov_b64 exec, [[ORIG]] + +;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: s_mov_b64 exec, [[ORIG2]] + +;CHECK: s_or_saveexec_b64 [[ORIG3:s\[[0-9]+:[0-9]+\]]], -1 +;CHECK: v_add +;CHECK: s_mov_b64 exec, [[ORIG3]] + +;CHECK: s_wqm_b64 exec, exec +;CHECK: image_sample +define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, 
float %inp, <8 x i32> inreg %res2) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) + %temp = fadd float %reload, %reload + %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp) + %temp3 = fadd float %temp2, %temp2 + %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm) + %temp5 = fadd float %temp3, %temp4 + %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + ret float %out +} + +;CHECK-LABEL: {{^}}test_wqm_strict_wqm_wqm: +;CHECK: buffer_store_dword + +;CHECK: s_wqm_b64 exec, exec + +;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again. +;CHECK: s_mov_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: buffer_load_dword +;CHECK: s_mov_b64 exec, [[ORIG2]] + +;CHECK: image_sample + +define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0) + %temp = fadd float %reload, %reload + %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0) + %temp2 = fadd float %tex, %tex + %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm) + %temp4 = fadd float %temp2, %temp3 + %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %tex2, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0) + ret float %out +} declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1 @@ -1074,6 +1424,7 @@ declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 +declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 declare void @llvm.amdgcn.kill(i1) #1 declare float @llvm.amdgcn.wqm.f32(float) #3 declare i32 @llvm.amdgcn.wqm.i32(i32) #3 @@ -1081,6 +1432,8 @@ declare float @llvm.amdgcn.strict.wwm.f32(float) #3 declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3 declare float 
@llvm.amdgcn.wwm.f32(float) #3 declare i32 @llvm.amdgcn.wwm.i32(i32) #3 +declare float @llvm.amdgcn.strict.wqm.f32(float) #3 +declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3 -- 2.7.4
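Appended editorial sketch (not part of the commit): a small LLVM IR example contrasting the non-strict and strict WQM intrinsics as described in the commit message. The function name is hypothetical and the comments paraphrase the semantics stated above.

declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
declare float @llvm.amdgcn.wqm.f32(float)
declare float @llvm.amdgcn.strict.wqm.f32(float)

define amdgpu_ps float @wqm_vs_strict_wqm_sketch(float %x) {
main_body:
  ; Use mbcnt to create a divergent branch, as the tests above do.
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %t = fadd float %x, %x
  ; amdgcn.wqm: helper lanes follow the branch, so a quad neighbor that took
  ; the other path is not re-enabled for the computation of %a.
  %a = call float @llvm.amdgcn.wqm.f32(float %t)
  ; amdgcn.strict.wqm: every lane of a quad with at least one active lane is
  ; enabled for the computation of %t, irrespective of the branch above.
  %b = call float @llvm.amdgcn.strict.wqm.f32(float %t)
  %s = fadd float %a, %b
  br label %endif

endif:
  %r = phi float [ %s, %if ], [ 0.0, %main_body ]
  ret float %r
}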