bool shouldSkip(const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
- bool dominatesAllReachable(MachineBasicBlock &MBB);
void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec);
- void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- DebugLoc DL);
- bool kill(MachineInstr &MI);
void earlyTerm(MachineInstr &MI);
bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
public:
static char ID;
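+  // Wave-size dependent move opcode and exec register, set up once in
+  // runOnMachineFunction.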
+ unsigned MovOpc;
+ Register ExecReg;
+
SIInsertSkips() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
return false;
}
-/// Check whether \p MBB dominates all blocks that are reachable from it.
-bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
- for (MachineBasicBlock *Other : depth_first(&MBB)) {
- if (!MDT->dominates(&MBB, Other))
- return false;
- }
- return true;
-}
-
static void generateEndPgm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
const SIInstrInfo *TII, bool IsPS) {
}
if (ClearExec && !EarlyExitClearsExec) {
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- Register Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
auto ExitI = EarlyExitBlock->getFirstNonPHI();
- BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0);
+ BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(MovOpc), ExecReg).addImm(0);
EarlyExitClearsExec = true;
}
}
MDT->getBase().applyUpdates(DTUpdates);
}
-/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
-/// iterator. Only applies to pixel shaders.
-void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, DebugLoc DL) {
- MachineFunction *MF = MBB.getParent();
- (void)MF;
- assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
-
- // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
- // basic block that has no further successors (e.g., there was an
- // `unreachable` there in IR). This can happen with original source of the
- // form:
- //
- // if (uniform_condition) {
- // write_to_memory();
- // discard;
- // }
- //
- // In this case, we write the "null_export; s_endpgm" skip code in the
- // already-existing basic block.
- auto NextBBI = std::next(MBB.getIterator());
- bool NoSuccessor =
- I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI);
-
- if (NoSuccessor) {
- generateEndPgm(MBB, I, DL, TII, true);
- } else {
- ensureEarlyExitBlock(MBB, false);
-
- MachineInstr *BranchMI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addMBB(EarlyExitBlock);
-
- // Split the block if the branch will not come at the end.
- auto Next = std::next(BranchMI->getIterator());
- if (Next != MBB.end() && !Next->isTerminator())
- splitBlock(MBB, *BranchMI, MDT);
-
- MBB.addSuccessor(EarlyExitBlock);
- MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
- }
-}
-
-/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
-/// Return true unless the terminator is a no-op.
-bool SIInsertSkips::kill(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
-
- switch (MI.getOpcode()) {
- case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
- unsigned Opcode = 0;
-
- // The opcodes are inverted because the inline immediate has to be
- // the first operand, e.g. from "x < imm" to "imm > x"
- switch (MI.getOperand(2).getImm()) {
- case ISD::SETOEQ:
- case ISD::SETEQ:
- Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
- break;
- case ISD::SETOGT:
- case ISD::SETGT:
- Opcode = AMDGPU::V_CMPX_LT_F32_e64;
- break;
- case ISD::SETOGE:
- case ISD::SETGE:
- Opcode = AMDGPU::V_CMPX_LE_F32_e64;
- break;
- case ISD::SETOLT:
- case ISD::SETLT:
- Opcode = AMDGPU::V_CMPX_GT_F32_e64;
- break;
- case ISD::SETOLE:
- case ISD::SETLE:
- Opcode = AMDGPU::V_CMPX_GE_F32_e64;
- break;
- case ISD::SETONE:
- case ISD::SETNE:
- Opcode = AMDGPU::V_CMPX_LG_F32_e64;
- break;
- case ISD::SETO:
- Opcode = AMDGPU::V_CMPX_O_F32_e64;
- break;
- case ISD::SETUO:
- Opcode = AMDGPU::V_CMPX_U_F32_e64;
- break;
- case ISD::SETUEQ:
- Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
- break;
- case ISD::SETUGT:
- Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
- break;
- case ISD::SETUGE:
- Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
- break;
- case ISD::SETULT:
- Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
- break;
- case ISD::SETULE:
- Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
- break;
- case ISD::SETUNE:
- Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
- break;
- default:
- llvm_unreachable("invalid ISD:SET cond code");
- }
-
- const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
- if (ST.hasNoSdstCMPX())
- Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);
-
- assert(MI.getOperand(0).isReg());
-
- if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
- MI.getOperand(0).getReg())) {
- Opcode = AMDGPU::getVOPe32(Opcode);
- BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .add(MI.getOperand(1))
- .add(MI.getOperand(0));
- } else {
- auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
- if (!ST.hasNoSdstCMPX())
- I.addReg(AMDGPU::VCC, RegState::Define);
-
- I.addImm(0) // src0 modifiers
- .add(MI.getOperand(1))
- .addImm(0) // src1 modifiers
- .add(MI.getOperand(0));
-
- I.addImm(0); // omod
- }
- return true;
- }
- case AMDGPU::SI_KILL_I1_TERMINATOR: {
- const MachineFunction *MF = MI.getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- const MachineOperand &Op = MI.getOperand(0);
- int64_t KillVal = MI.getOperand(1).getImm();
- assert(KillVal == 0 || KillVal == -1);
-
- // Kill all threads if Op0 is an immediate and equal to the Kill value.
- if (Op.isImm()) {
- int64_t Imm = Op.getImm();
- assert(Imm == 0 || Imm == -1);
-
- if (Imm == KillVal) {
- BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
- : AMDGPU::S_MOV_B64), Exec)
- .addImm(0);
- return true;
- }
- return false;
- }
-
- unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
- if (ST.isWave32())
- Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
- BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
- .addReg(Exec)
- .add(Op);
- return true;
- }
- default:
- llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
- }
-}
-
void SIInsertSkips::earlyTerm(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc DL = MI.getDebugLoc();
MDT = &getAnalysis<MachineDominatorTree>();
SkipThreshold = SkipThresholdFlag;
- SmallVector<MachineInstr *, 4> KillInstrs;
+ MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
SmallVector<MachineInstr *, 4> EarlyTermInstrs;
bool MadeChange = false;
}
break;
- case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
- case AMDGPU::SI_KILL_I1_TERMINATOR: {
- MadeChange = true;
- bool CanKill = kill(MI);
-
- // Check if we can add an early "if exec=0 { end shader }".
- //
- // Note that we _always_ do this if it is correct, even if the kill
- // happens fairly late in the shader, because the null export should
- // generally still be cheaper than normal export(s).
- //
- // TODO: The dominatesAllReachable check is conservative: if the
- // dominance is only missing due to _uniform_ branches, we could
- // in fact insert the early-exit as well.
- if (CanKill &&
- MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
- dominatesAllReachable(MBB)) {
- // Mark the instruction for kill-if-dead insertion. We delay this
- // change because it modifies the CFG.
- KillInstrs.push_back(&MI);
- } else {
- MI.eraseFromParent();
- }
- break;
- }
-
- case AMDGPU::SI_KILL_CLEANUP:
- if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
- dominatesAllReachable(MBB)) {
- KillInstrs.push_back(&MI);
- } else {
- MI.eraseFromParent();
- }
- break;
-
case AMDGPU::SI_EARLY_TERMINATE_SCC0:
EarlyTermInstrs.push_back(&MI);
break;
earlyTerm(*Instr);
Instr->eraseFromParent();
}
- for (MachineInstr *Kill : KillInstrs) {
- skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
- Kill->getDebugLoc());
- Kill->eraseFromParent();
- }
- KillInstrs.clear();
EarlyTermInstrs.clear();
EarlyExitBlock = nullptr;
MI.setDesc(get(AMDGPU::S_ANDN2_B32));
break;
+ case AMDGPU::S_AND_B64_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_AND_B64));
+ break;
+
+ case AMDGPU::S_AND_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_AND_B32));
+ break;
+
case AMDGPU::V_MOV_B64_PSEUDO: {
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
case AMDGPU::S_XOR_B64_term:
case AMDGPU::S_OR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
+ case AMDGPU::S_AND_B64_term:
case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_XOR_B32_term:
case AMDGPU::S_OR_B32_term:
case AMDGPU::S_ANDN2_B32_term:
+ case AMDGPU::S_AND_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
+def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
}
let WaveSizePredicate = isWave32 in {
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
+def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
}
// required in degenerate cases (when V_CMPX cannot be used due to constant
// bus limitations) and because it allows us to avoid having to track SCC
// liveness across basic blocks.
- let Defs = [EXEC,VCC,SCC] in
+ let Defs = [EXEC,SCC] in
def _PSEUDO : PseudoInstSI <(outs), ins> {
let isConvergent = 1;
let usesCustomInserter = 1;
}
- let Defs = [EXEC,VCC,SCC] in
+ let Defs = [EXEC,SCC] in
def _TERMINATOR : SPseudoInstSI <(outs), ins> {
let isTerminator = 1;
}
}
defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
+let Defs = [VCC] in
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
-let Defs = [EXEC] in
-def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
-
let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
(outs unknown:$dst), (ins unknown:$src),
MachineRegisterInfo *MRI = nullptr;
SetVector<MachineInstr*> LoweredEndCf;
DenseSet<Register> LoweredIf;
- SmallSet<MachineInstr *, 16> NeedsKillCleanup;
const TargetRegisterClass *BoolRC = nullptr;
- bool InsertKillCleanups;
unsigned AndOpc;
unsigned OrOpc;
unsigned XorOpc;
// just cleared bits.
bool SimpleIf = isSimpleIf(MI, MRI);
- if (InsertKillCleanups) {
- // Check for SI_KILL_*_TERMINATOR on full path of control flow and
- // flag the associated SI_END_CF for insertion of a kill cleanup.
- auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
- while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
- assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
- assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
- MachineOperand &NextExec = UseMI->getOperand(0);
- Register NextExecReg = NextExec.getReg();
- if (NextExec.isDead()) {
- assert(!SimpleIf);
- break;
- }
- UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
- }
- if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
- if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
- NeedsKillCleanup.insert(&*UseMI);
- SimpleIf = false;
- }
- }
- } else if (SimpleIf) {
+ if (SimpleIf) {
// Check for SI_KILL_*_TERMINATOR on path from if to endif.
// If there is any such terminator, simplifications are not safe.
auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
auto E = B->end();
for ( ; It != E; ++It) {
- if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
- continue;
if (TII->mayReadEXEC(*MRI, *It))
break;
}
LoweredEndCf.insert(NewMI);
- // If this ends control flow which contains kills (as flagged in emitIf)
- // then insert an SI_KILL_CLEANUP immediately following the exec mask
- // manipulation. This can be lowered to early termination if appropriate.
- MachineInstr *CleanUpMI = nullptr;
- if (NeedsKillCleanup.count(&MI))
- CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
-
- if (LIS) {
+ if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
- if (CleanUpMI)
- LIS->InsertMachineInstrInMaps(*CleanUpMI);
- }
MI.eraseFromParent();
LIS = getAnalysisIfAvailable<LiveIntervals>();
MRI = &MF.getRegInfo();
BoolRC = TRI->getBoolRC();
- InsertKillCleanups =
- MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
if (ST.isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
Exec = AMDGPU::EXEC;
}
- SmallVector<MachineInstr *, 32> Worklist;
-
MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin();
BI != MF.end(); BI = NextBB) {
case AMDGPU::SI_LOOP:
case AMDGPU::SI_END_CF:
// Only build worklist if SI_IF instructions must be processed first.
- if (InsertKillCleanups)
- Worklist.push_back(&MI);
- else
- SplitMBB = process(MI);
+ SplitMBB = process(MI);
break;
// FIXME: find a better place for this
}
}
- for (MachineInstr *MI : Worklist)
- process(*MI);
-
optimizeEndCf();
LoweredEndCf.clear();
LoweredIf.clear();
- NeedsKillCleanup.clear();
return true;
}
MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32));
return true;
}
+ case AMDGPU::S_AND_B64_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_AND_B64));
+ return true;
+ }
+ case AMDGPU::S_AND_B32_term: {
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(TII.get(AMDGPU::S_AND_B32));
+ return true;
+ }
default:
return false;
}
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
-/// with shader side effects (stores and atomics). This pass is run on the
-/// scheduled machine IR but before register coalescing, so that machine SSA is
-/// available for analysis. It ensures that WQM is enabled when necessary, but
-/// disabled around stores and atomics.
+/// with shader side effects (stores and atomics). It ensures that WQM is
+/// enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/raw_ostream.h"
char Needs = 0;
char InNeeds = 0;
char OutNeeds = 0;
+ char InitialState = 0;
+ bool NeedsLowering = false;
};
struct WorkItem {
class SIWholeQuadMode : public MachineFunctionPass {
private:
- CallingConv::ID CallingConv;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const GCNSubtarget *ST;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
+ MachineDominatorTree *MDT;
+ MachinePostDominatorTree *PDT;
unsigned AndOpc;
- unsigned XorTermrOpc;
+ unsigned AndN2Opc;
+ unsigned XorOpc;
+ unsigned AndSaveExecOpc;
unsigned OrSaveExecOpc;
- unsigned Exec;
+ unsigned WQMOpc;
+ Register Exec;
+ Register LiveMaskReg;
DenseMap<const MachineInstr *, InstrInfo> Instructions;
MapVector<MachineBasicBlock *, BlockInfo> Blocks;
- SmallVector<MachineInstr *, 1> LiveMaskQueries;
+
+ // Tracks state (WQM/WWM/Exact) after a given instruction
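+  // lowerBlock uses this to choose the correct kill expansion at each point.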
+ DenseMap<const MachineInstr *, char> StateTransition;
+
+ SmallVector<MachineInstr *, 2> LiveMaskQueries;
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
+ SmallVector<MachineInstr *, 4> KillInstrs;
void printInfo();
MachineBasicBlock::iterator Last, bool PreferLast,
bool SaveSCC);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SaveWQM, unsigned LiveMaskReg);
+ Register SaveWQM);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SavedWQM);
+ Register SavedWQM);
void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SaveOrig);
+ Register SaveOrig);
void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
- unsigned SavedOrig);
- void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+ Register SavedOrig, char NonWWMState);
+
+ MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
- void lowerLiveMaskQueries(unsigned LiveMaskReg);
+ MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
+ bool IsWQM);
+ MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
+
+ void lowerBlock(MachineBasicBlock &MBB);
+ void processBlock(MachineBasicBlock &MBB, bool IsEntry);
+
+ void lowerLiveMaskQueries();
void lowerCopyInstrs();
+ void lowerKillInstrs(bool IsWQM);
public:
static char ID;
AU.addRequired<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
- AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addPreserved<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
};
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
false)
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
Register Reg, unsigned SubReg, char Flag,
std::vector<WorkItem> &Worklist) {
- assert(!MRI->isSSA());
-
LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
if (!Value)
continue;
- if (MRI->isSSA()) {
- // Since we're in machine SSA, we do not need to track physical
- // registers across basic blocks.
- if (Value->isPHIDef())
- continue;
- markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
- Worklist);
- } else {
- markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
- }
+ markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
}
continue;
}
- if (MRI->isSSA()) {
- for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
- markInstruction(DefMI, Flag, Worklist);
- } else {
- LiveRange &LR = LIS->getInterval(Reg);
- markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
- }
+ LiveRange &LR = LIS->getInterval(Reg);
+ markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
}
}
} else {
if (Opcode == AMDGPU::SI_PS_LIVE) {
LiveMaskQueries.push_back(&MI);
+ } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
+ Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR) {
+ KillInstrs.push_back(&MI);
+ BBI.NeedsLowering = true;
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are
// only used, outputs are only defined.
+ // FIXME: is this still valid?
for (const MachineOperand &MO : MI.defs()) {
if (!MO.isReg())
continue;
return Restore;
}
+MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
+ MachineInstr *TermMI) {
+ LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
+ << *TermMI << "\n");
+
+ MachineBasicBlock *SplitBB =
+ BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
+
+ // Convert last instruction in block to a terminator.
+ // Note: this only covers the expected patterns
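+  // (e.g. the S_AND/S_MOV exec updates produced by the kill lowering).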
+ unsigned NewOpcode = 0;
+ switch (TermMI->getOpcode()) {
+ case AMDGPU::S_AND_B32:
+ NewOpcode = AMDGPU::S_AND_B32_term;
+ break;
+ case AMDGPU::S_AND_B64:
+ NewOpcode = AMDGPU::S_AND_B64_term;
+ break;
+ case AMDGPU::S_MOV_B32:
+ NewOpcode = AMDGPU::S_MOV_B32_term;
+ break;
+ case AMDGPU::S_MOV_B64:
+ NewOpcode = AMDGPU::S_MOV_B64_term;
+ break;
+ default:
+ break;
+ }
+ if (NewOpcode)
+ TermMI->setDesc(TII->get(NewOpcode));
+
+ if (SplitBB != BB) {
+ // Update dominator trees
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+ for (MachineBasicBlock *Succ : SplitBB->successors()) {
+ DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
+ }
+ DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
+ if (MDT)
+ MDT->getBase().applyUpdates(DTUpdates);
+ if (PDT)
+ PDT->getBase().applyUpdates(DTUpdates);
+
+ // Link blocks
+ MachineInstr *MI =
+ BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(SplitBB);
+ LIS->InsertMachineInstrInMaps(*MI);
+ }
+
+ return SplitBB;
+}
+
+MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
+ MachineInstr &MI) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opcode = 0;
+
+ assert(MI.getOperand(0).isReg());
+
+ // Comparison is for live lanes; however here we compute the inverse
+ // (killed lanes). This is because VCMP will always generate 0 bits
+ // for inactive lanes so a mask of live lanes would not be correct
+ // inside control flow.
+ // Invert the comparison by swapping the operands and adjusting
+ // the comparison codes.
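+  // For example, a kill that keeps lanes where src0 < imm (SETOLT) becomes
+  // V_CMP_NGT_F32(imm, src0), whose result marks exactly the lanes to kill.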
+
+ switch (MI.getOperand(2).getImm()) {
+ case ISD::SETUEQ:
+ Opcode = AMDGPU::V_CMP_LG_F32_e64;
+ break;
+ case ISD::SETUGT:
+ Opcode = AMDGPU::V_CMP_GE_F32_e64;
+ break;
+ case ISD::SETUGE:
+ Opcode = AMDGPU::V_CMP_GT_F32_e64;
+ break;
+ case ISD::SETULT:
+ Opcode = AMDGPU::V_CMP_LE_F32_e64;
+ break;
+ case ISD::SETULE:
+ Opcode = AMDGPU::V_CMP_LT_F32_e64;
+ break;
+ case ISD::SETUNE:
+ Opcode = AMDGPU::V_CMP_EQ_F32_e64;
+ break;
+ case ISD::SETO:
+ Opcode = AMDGPU::V_CMP_O_F32_e64;
+ break;
+ case ISD::SETUO:
+ Opcode = AMDGPU::V_CMP_U_F32_e64;
+ break;
+ case ISD::SETOEQ:
+ case ISD::SETEQ:
+ Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
+ break;
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ Opcode = AMDGPU::V_CMP_NLT_F32_e64;
+ break;
+ case ISD::SETOGE:
+ case ISD::SETGE:
+ Opcode = AMDGPU::V_CMP_NLE_F32_e64;
+ break;
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ Opcode = AMDGPU::V_CMP_NGT_F32_e64;
+ break;
+ case ISD::SETOLE:
+ case ISD::SETLE:
+ Opcode = AMDGPU::V_CMP_NGE_F32_e64;
+ break;
+ case ISD::SETONE:
+ case ISD::SETNE:
+ Opcode = AMDGPU::V_CMP_NLG_F32_e64;
+ break;
+ default:
+ llvm_unreachable("invalid ISD:SET cond code");
+ }
+
+  // Use the 32-bit VCMP encoding when the compared value is in a VGPR;
+  // otherwise the e64 form is required.
+ MachineInstr *VcmpMI;
+ const MachineOperand &Op0 = MI.getOperand(0);
+ const MachineOperand &Op1 = MI.getOperand(1);
+ if (TRI->isVGPR(*MRI, Op0.getReg())) {
+ Opcode = AMDGPU::getVOPe32(Opcode);
+ VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
+ } else {
+ VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
+ .addReg(AMDGPU::VCC, RegState::Define)
+ .addImm(0) // src0 modifiers
+ .add(Op1)
+ .addImm(0) // src1 modifiers
+ .add(Op0)
+ .addImm(0); // omod
+ }
+
+ // VCC represents lanes killed.
+ Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+
+ MachineInstr *MaskUpdateMI =
+ BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .addReg(VCC);
+
+  // State of SCC represents whether any lanes are live in the mask;
+  // if SCC is 0 then no lanes will be alive anymore.
+ MachineInstr *EarlyTermMI =
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
+
+ MachineInstr *ExecMaskMI =
+ BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
+
+ assert(MBB.succ_size() == 1);
+ MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+ .addMBB(*MBB.succ_begin());
+
+ // Update live intervals
+ LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
+ MBB.remove(&MI);
+
+ LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
+ LIS->InsertMachineInstrInMaps(*ExecMaskMI);
+ LIS->InsertMachineInstrInMaps(*EarlyTermMI);
+ LIS->InsertMachineInstrInMaps(*NewTerm);
+
+ return NewTerm;
+}
+
+MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
+ MachineInstr &MI, bool IsWQM) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineInstr *MaskUpdateMI = nullptr;
+
+ const MachineOperand &Op = MI.getOperand(0);
+ int64_t KillVal = MI.getOperand(1).getImm();
+ MachineInstr *ComputeKilledMaskMI = nullptr;
+ Register CndReg = !Op.isImm() ? Op.getReg() : Register();
+ Register TmpReg;
+
+ // Is this a static or dynamic kill?
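+  // A static kill has an immediate operand and is resolved here; a dynamic
+  // kill reads a lane-mask register whose value is only known at run time.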
+ if (Op.isImm()) {
+ if (Op.getImm() == KillVal) {
+ // Static: all active lanes are killed
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .addReg(Exec);
+ } else {
+ // Static: kill does nothing
+ MachineInstr *NewTerm = nullptr;
+ assert(MBB.succ_size() == 1);
+ NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+ .addMBB(*MBB.succ_begin());
+ LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
+ MBB.remove(&MI);
+ return NewTerm;
+ }
+ } else {
+ if (!KillVal) {
+ // Op represents live lanes after kill,
+ // so exec mask needs to be factored in.
+ TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
+ ComputeKilledMaskMI =
+ BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .addReg(TmpReg);
+ } else {
+ // Op represents lanes to kill
+ MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
+ .addReg(LiveMaskReg)
+ .add(Op);
+ }
+ }
+
+  // State of SCC represents whether any lanes are live in the mask;
+  // if SCC is 0 then no lanes will be alive anymore.
+ MachineInstr *EarlyTermMI =
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
+
+  // If we got this far, some lanes are still live;
+  // update EXEC to deactivate lanes as appropriate.
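+  // Immediate operand: every active lane dies, so clear EXEC outright.
+  // Otherwise AND EXEC with the updated live mask (Exact mode) or apply the
+  // kill condition to EXEC directly (WQM).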
+ MachineInstr *NewTerm;
+ if (Op.isImm()) {
+ unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
+ } else if (!IsWQM) {
+ NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
+ .addReg(Exec)
+ .addReg(LiveMaskReg);
+ } else {
+ unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
+ NewTerm =
+ BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
+ }
+
+ // Update live intervals
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MBB.remove(&MI);
+ assert(EarlyTermMI);
+ assert(MaskUpdateMI);
+ assert(NewTerm);
+ if (ComputeKilledMaskMI)
+ LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
+ LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
+ LIS->InsertMachineInstrInMaps(*EarlyTermMI);
+ LIS->InsertMachineInstrInMaps(*NewTerm);
+
+ if (CndReg) {
+ LIS->removeInterval(CndReg);
+ LIS->createAndComputeVirtRegInterval(CndReg);
+ }
+ if (TmpReg)
+ LIS->createAndComputeVirtRegInterval(TmpReg);
+
+ return NewTerm;
+}
+
+// Replace (or supplement) instructions accessing the live mask.
+// This can only happen once all the live mask registers have been created
+// and the execution state (WQM/WWM/Exact) of each instruction is known.
+void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
+ auto BII = Blocks.find(&MBB);
+ if (BII == Blocks.end())
+ return;
+
+ const BlockInfo &BI = BII->second;
+ if (!BI.NeedsLowering)
+ return;
+
+ LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
+
+ SmallVector<MachineInstr *, 4> SplitPoints;
+ char State = BI.InitialState;
+
+ auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+ while (II != IE) {
+ auto Next = std::next(II);
+ MachineInstr &MI = *II;
+
+ if (StateTransition.count(&MI))
+ State = StateTransition[&MI];
+
+ MachineInstr *SplitPoint = nullptr;
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_KILL_I1_TERMINATOR:
+ SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
+ break;
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+ SplitPoint = lowerKillF32(MBB, MI);
+ break;
+ default:
+ break;
+ }
+ if (SplitPoint)
+ SplitPoints.push_back(SplitPoint);
+
+ II = Next;
+ }
+
+ // Perform splitting after instruction scan to simplify iteration.
+ if (!SplitPoints.empty()) {
+ MachineBasicBlock *BB = &MBB;
+ for (MachineInstr *MI : SplitPoints) {
+ BB = splitBlock(BB, MI);
+ }
+ }
+}
+
// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
- unsigned SaveWQM, unsigned LiveMaskReg) {
+ Register SaveWQM) {
MachineInstr *MI;
if (SaveWQM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
- AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
- SaveWQM)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
.addReg(LiveMaskReg);
} else {
- unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
- AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
- Exec)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
.addReg(Exec)
.addReg(LiveMaskReg);
}
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = StateExact;
}
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
- unsigned SavedWQM) {
+ Register SavedWQM) {
MachineInstr *MI;
- unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (SavedWQM) {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
.addReg(SavedWQM);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
- AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
- Exec)
- .addReg(Exec);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
}
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = StateWQM;
}
void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
- unsigned SaveOrig) {
+ Register SaveOrig) {
MachineInstr *MI;
assert(SaveOrig);
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
.addImm(-1);
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = StateWWM;
}
void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
- unsigned SavedOrig) {
+ Register SavedOrig, char NonWWMState) {
MachineInstr *MI;
assert(SavedOrig);
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
- ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), Exec)
.addReg(SavedOrig);
LIS->InsertMachineInstrInMaps(*MI);
+ StateTransition[MI] = NonWWMState;
}
-void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
- bool isEntry) {
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
auto BII = Blocks.find(&MBB);
if (BII == Blocks.end())
return;
- const BlockInfo &BI = BII->second;
+ BlockInfo &BI = BII->second;
// This is a non-entry block that is WQM throughout, so no need to do
// anything.
- if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
+ if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
+ BI.InitialState = StateWQM;
return;
+ }
LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
<< ":\n");
- unsigned SavedWQMReg = 0;
- unsigned SavedNonWWMReg = 0;
- bool WQMFromExec = isEntry;
- char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
+ Register SavedWQMReg;
+ Register SavedNonWWMReg;
+ bool WQMFromExec = IsEntry;
+ char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
char NonWWMState = 0;
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
- if (isEntry) {
+ if (IsEntry) {
// Skip the instruction that saves LiveMask
if (II != IE && II->getOpcode() == AMDGPU::COPY)
++II;
// switch to/from WQM as well.
MachineBasicBlock::iterator FirstWWM = IE;
+  // Record the initial state in the block information.
+ BI.InitialState = State;
+
for (;;) {
MachineBasicBlock::iterator Next = II;
char Needs = StateExact | StateWQM; // WWM is disabled by default
if (State == StateWWM) {
assert(SavedNonWWMReg);
- fromWWM(MBB, Before, SavedNonWWMReg);
+ fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState);
LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
SavedNonWWMReg = 0;
State = NonWWMState;
SavedWQMReg = MRI->createVirtualRegister(BoolRC);
}
- toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+ toExact(MBB, Before, SavedWQMReg);
State = StateExact;
} else if (State == StateExact && (Needs & StateWQM) &&
!(Needs & StateExact)) {
assert(!SavedNonWWMReg);
}
-void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+void SIWholeQuadMode::lowerLiveMaskQueries() {
for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();
Register Dest = MI->getOperand(0).getReg();
// And make it implicitly depend on exec (like all VALU movs should do).
MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
- } else if (!MRI->isSSA()) {
+ } else {
// Remove early-clobber and exec dependency from simple SGPR copies.
// This allows some to be eliminated during/post RA.
LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
}
}
+void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
+ for (MachineInstr *MI : KillInstrs) {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *SplitPoint = nullptr;
+ switch (MI->getOpcode()) {
+ case AMDGPU::SI_KILL_I1_TERMINATOR:
+ SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
+ break;
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+ SplitPoint = lowerKillF32(*MBB, *MI);
+ break;
+ default:
+ continue;
+ }
+ if (SplitPoint)
+ splitBlock(MBB, SplitPoint);
+ }
+}
+
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
Instructions.clear();
Blocks.clear();
LiveMaskQueries.clear();
LowerToCopyInstrs.clear();
LowerToMovInstrs.clear();
- CallingConv = MF.getFunction().getCallingConv();
+ KillInstrs.clear();
+ StateTransition.clear();
ST = &MF.getSubtarget<GCNSubtarget>();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ PDT = &getAnalysis<MachinePostDominatorTree>();
if (ST->isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
- XorTermrOpc = AMDGPU::S_XOR_B32_term;
+ AndN2Opc = AMDGPU::S_ANDN2_B32;
+ XorOpc = AMDGPU::S_XOR_B32;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+ WQMOpc = AMDGPU::S_WQM_B32;
Exec = AMDGPU::EXEC_LO;
} else {
AndOpc = AMDGPU::S_AND_B64;
- XorTermrOpc = AMDGPU::S_XOR_B64_term;
+ AndN2Opc = AMDGPU::S_ANDN2_B64;
+ XorOpc = AMDGPU::S_XOR_B64;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+ WQMOpc = AMDGPU::S_WQM_B64;
Exec = AMDGPU::EXEC;
}
- char GlobalFlags = analyzeFunction(MF);
- unsigned LiveMaskReg = 0;
- if (!(GlobalFlags & StateWQM)) {
- lowerLiveMaskQueries(Exec);
- if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty())
- return !LiveMaskQueries.empty();
- } else {
- // Store a copy of the original live mask when required
- MachineBasicBlock &Entry = MF.front();
- MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
-
- if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
- LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
- MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
- TII->get(AMDGPU::COPY), LiveMaskReg)
- .addReg(Exec);
- LIS->InsertMachineInstrInMaps(*MI);
- }
+ const char GlobalFlags = analyzeFunction(MF);
+ const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
- lowerLiveMaskQueries(LiveMaskReg);
+ LiveMaskReg = Exec;
- if (GlobalFlags == StateWQM) {
- // For a shader that needs only WQM, we can just set it once.
- auto MI = BuildMI(Entry, EntryMI, DebugLoc(),
- TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
- : AMDGPU::S_WQM_B64),
- Exec)
- .addReg(Exec);
- LIS->InsertMachineInstrInMaps(*MI);
+  // Shader is simple and does not need WQM/WWM or any complex lowering
+ if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty() &&
+ LowerToMovInstrs.empty() && KillInstrs.empty()) {
+ lowerLiveMaskQueries();
+ return !LiveMaskQueries.empty();
+ }
- lowerCopyInstrs();
- // EntryMI may become invalid here
- return true;
- }
+ MachineBasicBlock &Entry = MF.front();
+ MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
+
+ // Store a copy of the original live mask when required
+ if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
+ LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
+ MachineInstr *MI =
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+ .addReg(Exec);
+ LIS->InsertMachineInstrInMaps(*MI);
}
LLVM_DEBUG(printInfo());
+ lowerLiveMaskQueries();
lowerCopyInstrs();
- // Handle the general case
- for (auto BII : Blocks)
- processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+ // Shader only needs WQM
+ if (GlobalFlags == StateWQM) {
+ auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
+ .addReg(Exec);
+ LIS->InsertMachineInstrInMaps(*MI);
+ lowerKillInstrs(true);
+ } else {
+ for (auto BII : Blocks)
+ processBlock(*BII.first, BII.first == &Entry);
+    // Lowering blocks causes block splitting, so perform it as a second pass.
+ for (auto BII : Blocks)
+ lowerBlock(*BII.first);
+ }
- if (LiveMaskReg)
+ // Compute live range for live mask
+ if (LiveMaskReg != Exec)
LIS->createAndComputeVirtRegInterval(LiveMaskReg);
// Physical registers like SCC aren't tracked by default anyway, so just
// removing the ranges we computed is the simplest option for maintaining
// the analysis results.
LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+ // If we performed any kills then recompute EXEC
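+  // (the exec writes inserted for kills invalidate its cached live range)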
+ if (!KillInstrs.empty())
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+
return true;
}
;
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_mov_b64 s[10:11], exec
+; GFX8-NEXT: s_mov_b64 s[8:9], exec
+; GFX8-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
;
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b64 s[10:11], exec
+; GFX9-NEXT: s_mov_b64 s[8:9], exec
+; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_mov_b64 s[10:11], exec
+; GFX1064-NEXT: s_mov_b64 s[8:9], exec
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[8:9]
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT: s_cbranch_execz BB1_4
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s9, exec_lo
+; GFX1032-NEXT: s_mov_b32 s8, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: s_mov_b32 s9, s8
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s8, s9
; GFX1032-NEXT: s_cbranch_execz BB1_4
ret void
}
- define amdgpu_ps void @early_term_scc0_with_kill() {
- ret void
- }
-
define amdgpu_gs void @early_term_scc0_gs() {
ret void
}
...
---
-name: early_term_scc0_with_kill
-tracksRegLiveness: true
-liveins:
- - { reg: '$sgpr0' }
- - { reg: '$sgpr1' }
- - { reg: '$vgpr2' }
-body: |
- ; CHECK-LABEL: name: early_term_scc0_with_kill
- ; CHECK: bb.0:
- ; CHECK: successors: %bb.1(0x80000000), %bb.3(0x00000000)
- ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr2
- ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK: V_CMPX_LE_F32_nosdst_e32 0, killed $vgpr2, implicit-def $exec, implicit $mode, implicit $exec
- ; CHECK: S_CBRANCH_EXECZ %bb.3, implicit $exec
- ; CHECK: bb.1:
- ; CHECK: successors: %bb.4(0x40000000), %bb.3(0x40000000)
- ; CHECK: liveins: $sgpr0, $sgpr1, $vgpr0
- ; CHECK: dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
- ; CHECK: S_CBRANCH_SCC0 %bb.3, implicit $scc
- ; CHECK: bb.4:
- ; CHECK: successors: %bb.2(0x80000000)
- ; CHECK: liveins: $vgpr0, $scc
- ; CHECK: $vgpr1 = V_MOV_B32_e32 1, implicit $exec
- ; CHECK: bb.2:
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK: EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
- ; CHECK: EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
- ; CHECK: S_ENDPGM 0
- ; CHECK: bb.3:
- ; CHECK: $exec_lo = S_MOV_B32 0
- ; CHECK: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
- ; CHECK: S_ENDPGM 0
- bb.0:
- liveins: $sgpr0, $sgpr1, $vgpr2
- successors: %bb.1
- $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR killed $vgpr2, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
-
- bb.1:
- liveins: $sgpr0, $sgpr1, $vgpr0
- successors: %bb.2
- dead $sgpr0 = S_AND_B32 $sgpr0, killed $sgpr1, implicit-def $scc
- SI_EARLY_TERMINATE_SCC0 implicit $scc, implicit $exec
- $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-
- bb.2:
- liveins: $vgpr0, $vgpr1
- EXP 1, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
- EXP_DONE 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
- S_ENDPGM 0
-...
-
----
name: early_term_scc0_gs
tracksRegLiveness: true
liveins:
+++ /dev/null
-# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-insert-skips -amdgpu-skip-threshold-legacy=1 %s -o - | FileCheck %s
-# https://bugs.freedesktop.org/show_bug.cgi?id=99019
---- |
- define amdgpu_ps void @kill_uncond_branch() {
- ret void
- }
-...
----
-
-# CHECK-LABEL: name: kill_uncond_branch
-
-# CHECK: bb.0:
-# CHECK: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
-
-# CHECK: bb.1:
-# CHECK: V_CMPX_LE_F32_e32
-# CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
-
-# CHECK: bb.2:
-# CHECK: S_ENDPGM 0
-
-# CHECK: bb.3:
-# CHECK-NEXT: EXP_DONE
-# CHECK: S_ENDPGM 0
-
-name: kill_uncond_branch
-
-body: |
- bb.0:
- successors: %bb.1
- S_CBRANCH_VCCNZ %bb.1, implicit $vcc
-
- bb.1:
- successors: %bb.2
- $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
- S_BRANCH %bb.2
-
- bb.2:
- S_ENDPGM 0
%tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
%c2 = fcmp oge float %tmp3, 0.0
call void @llvm.amdgcn.kill(i1 %c2)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}vcc_implicit_def:
-; GCN-NOT: v_cmp_gt_f32_e32 vcc,
+; GCN: v_cmp_nle_f32_e32 vcc, 0, v{{[0-9]+}}
; GCN: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
-; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
-; GFX10: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
+; GCN: s_andn2_b64 exec, exec, vcc
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) {
%tmp0 = fcmp olt float %arg13, 0.000000e+00
call void @llvm.amdgcn.kill(i1 %c1)
%tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}true:
; GCN-NEXT: %bb.
-; GCN-NEXT: %bb.
; GCN-NEXT: s_endpgm
define amdgpu_gs void @true() {
call void @llvm.amdgcn.kill(i1 true)
; GCN: s_mov_b64 exec, 0
define amdgpu_gs void @false() {
call void @llvm.amdgcn.kill(i1 false)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN: v_cmp_lt_i32
; GCN: v_cmp_lt_i32
; GCN: s_or_b64 s[0:1]
-; GCN: s_and_b64 exec, exec, s[0:1]
+; GCN: s_xor_b64 s[0:1], s[0:1], exec
+; GCN: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GCN: s_and_b64 exec, exec, s[2:3]
define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) {
%c1 = icmp slt i32 %a, %b
%c2 = icmp slt i32 %c, %d
%x = or i1 %c1, %c2
call void @llvm.amdgcn.kill(i1 %x)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN: v_cmp_lt_i32
; GCN: v_cmp_lt_i32
; GCN: s_xor_b64 s[0:1]
-; GCN: s_andn2_b64 exec, exec, s[0:1]
+; GCN: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GCN: s_and_b64 exec, exec, s[2:3]
define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) {
%c1 = icmp slt i32 %a, %b
%c2 = icmp slt i32 %c, %d
%x = xor i1 %c1, %c2
%y = xor i1 %x, 1
call void @llvm.amdgcn.kill(i1 %y)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}oeq:
-; GCN: v_cmpx_eq_f32
-; GCN-NOT: s_and
+; GCN: v_cmp_neq_f32
define amdgpu_gs void @oeq(float %a) {
%c1 = fcmp oeq float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}ogt:
-; GCN: v_cmpx_lt_f32
-; GCN-NOT: s_and
+; GCN: v_cmp_nlt_f32
define amdgpu_gs void @ogt(float %a) {
%c1 = fcmp ogt float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}oge:
-; GCN: v_cmpx_le_f32
-; GCN-NOT: s_and
+; GCN: v_cmp_nle_f32
define amdgpu_gs void @oge(float %a) {
%c1 = fcmp oge float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}olt:
-; GCN: v_cmpx_gt_f32
-; GCN-NOT: s_and
+; GCN: v_cmp_ngt_f32
define amdgpu_gs void @olt(float %a) {
%c1 = fcmp olt float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}ole:
-; GCN: v_cmpx_ge_f32
-; GCN-NOT: s_and
+; GCN: v_cmp_nge_f32
define amdgpu_gs void @ole(float %a) {
%c1 = fcmp ole float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}one:
-; GCN: v_cmpx_lg_f32
-; GCN-NOT: s_and
+; GCN: v_cmp_nlg_f32
define amdgpu_gs void @one(float %a) {
%c1 = fcmp one float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}ord:
-; FIXME: This is absolutely unimportant, but we could use the cmpx variant here.
; GCN: v_cmp_o_f32
define amdgpu_gs void @ord(float %a) {
%c1 = fcmp ord float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}uno:
-; FIXME: This is absolutely unimportant, but we could use the cmpx variant here.
; GCN: v_cmp_u_f32
define amdgpu_gs void @uno(float %a) {
%c1 = fcmp uno float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}ueq:
-; GCN: v_cmpx_nlg_f32
-; GCN-NOT: s_and
+; GCN: v_cmp_lg_f32
define amdgpu_gs void @ueq(float %a) {
%c1 = fcmp ueq float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}ugt:
-; GCN: v_cmpx_nge_f32
-; GCN-NOT: s_and
+; GCN: v_cmp_ge_f32
define amdgpu_gs void @ugt(float %a) {
%c1 = fcmp ugt float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}uge:
-; SI: v_cmpx_ngt_f32_e32 vcc, -1.0
-; GFX10: v_cmpx_ngt_f32_e32 -1.0
-; GCN-NOT: s_and
+; GCN: v_cmp_gt_f32_e32 vcc, -1.0
define amdgpu_gs void @uge(float %a) {
%c1 = fcmp uge float %a, -1.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}ult:
-; SI: v_cmpx_nle_f32_e32 vcc, -2.0
-; GFX10: v_cmpx_nle_f32_e32 -2.0
-; GCN-NOT: s_and
+; GCN: v_cmp_le_f32_e32 vcc, -2.0
define amdgpu_gs void @ult(float %a) {
%c1 = fcmp ult float %a, -2.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}ule:
-; SI: v_cmpx_nlt_f32_e32 vcc, 2.0
-; GFX10: v_cmpx_nlt_f32_e32 2.0
-; GCN-NOT: s_and
+; GCN: v_cmp_lt_f32_e32 vcc, 2.0
define amdgpu_gs void @ule(float %a) {
%c1 = fcmp ule float %a, 2.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}une:
-; SI: v_cmpx_neq_f32_e32 vcc, 0
-; GFX10: v_cmpx_neq_f32_e32 0
-; GCN-NOT: s_and
+; GCN: v_cmp_eq_f32_e32 vcc, 0
define amdgpu_gs void @une(float %a) {
%c1 = fcmp une float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; GCN-LABEL: {{^}}neg_olt:
-; SI: v_cmpx_ngt_f32_e32 vcc, 1.0
-; GFX10: v_cmpx_ngt_f32_e32 1.0
-; GCN-NOT: s_and
+; GCN: v_cmp_gt_f32_e32 vcc, 1.0
define amdgpu_gs void @neg_olt(float %a) {
%c1 = fcmp olt float %a, 1.0
%c2 = xor i1 %c1, 1
call void @llvm.amdgcn.kill(i1 %c2)
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
; SI: v_cmp_lt_f32_e32 vcc, s{{[0-9]+}}, v0
; GFX10: v_cmp_lt_f32_e32 vcc, 0x3e800000, v0
; GCN: v_cndmask_b32
-; GCN: v_cmpx_le_f32
+; GCN: v_cmp_nle_f32
define amdgpu_ps void @fcmp_x2(float %a) #0 {
%ogt = fcmp nsz ogt float %a, 2.500000e-01
%k = select i1 %ogt, float -1.000000e+00, float 0.000000e+00
ret void
}
+; Note: an almost identical test for this exists in llvm.amdgcn.wqm.vote.ll
; GCN-LABEL: {{^}}wqm:
; GCN: v_cmp_neq_f32_e32 vcc, 0
-; GCN: s_wqm_b64 s[0:1], vcc
+; GCN-DAG: s_wqm_b64 s[2:3], vcc
+; GCN-DAG: s_mov_b64 s[0:1], exec
+; GCN: s_xor_b64 s[2:3], s[2:3], exec
+; GCN: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GCN: s_and_b64 exec, exec, s[0:1]
-define amdgpu_ps void @wqm(float %a) {
+define amdgpu_ps float @wqm(float %a) {
%c1 = fcmp une float %a, 0.0
%c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
call void @llvm.amdgcn.kill(i1 %c2)
- ret void
+ ret float 0.0
}
; This checks that we use the 64-bit encoding when the operand is a SGPR.
; GCN-LABEL: {{^}}test_sgpr:
-; GCN: v_cmpx_ge_f32_e64
+; GCN: v_cmp_nle_f32_e64
define amdgpu_ps void @test_sgpr(float inreg %a) #0 {
%c = fcmp ole float %a, 1.000000e+00
call void @llvm.amdgcn.kill(i1 %c) #1
}
; GCN-LABEL: {{^}}test_non_inline_imm_sgpr:
-; GCN-NOT: v_cmpx_ge_f32_e64
+; GCN-NOT: v_cmp_le_f32_e64
define amdgpu_ps void @test_non_inline_imm_sgpr(float inreg %a) #0 {
%c = fcmp ole float %a, 1.500000e+00
call void @llvm.amdgcn.kill(i1 %c) #1
ret void
}
+; Check that this compiles.
+; If kill is marked as defining VCC then this will fail with live interval issues.
+; GCN-LABEL: {{^}}kill_with_loop_exit:
+; GCN: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; GCN: s_andn2_b64 [[LIVE]], [[LIVE]], exec
+; GCN-NEXT: s_cbranch_scc0
+define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, <4 x i32> inreg %inp2, float inreg %inp3) {
+.entry:
+ %tmp24 = fcmp olt float %inp0, 1.280000e+02
+ %tmp25 = fcmp olt float %inp1, 1.280000e+02
+ %tmp26 = and i1 %tmp24, %tmp25
+ br i1 %tmp26, label %bb35, label %.preheader1.preheader
+
+.preheader1.preheader: ; preds = %.entry
+ %tmp31 = fcmp ogt float %inp3, 0.0
+ br label %bb
+
+bb: ; preds = %bb, %.preheader1.preheader
+ %tmp30 = phi float [ %tmp32, %bb ], [ 1.500000e+00, %.preheader1.preheader ]
+ %tmp32 = fadd reassoc nnan nsz arcp contract float %tmp30, 2.500000e-01
+ %tmp34 = fadd reassoc nnan nsz arcp contract float %tmp30, 2.500000e-01
+ br i1 %tmp31, label %bb, label %bb33
+
+bb33: ; preds = %bb
+ call void @llvm.amdgcn.kill(i1 false)
+ br label %bb35
+
+bb35: ; preds = %bb33, %.entry
+ %tmp36 = phi float [ %tmp34, %bb33 ], [ 1.000000e+00, %.entry ]
+ call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 15, float %tmp36, float %tmp36, float %tmp36, float %tmp36, i1 immarg true, i1 immarg true) #3
+ ret void
+}
+
declare void @llvm.amdgcn.kill(i1) #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
declare i1 @llvm.amdgcn.wqm.vote(i1)
attributes #0 = { nounwind }
ret float %r
}
+; Note: an almost identical test for this exists in llvm.amdgcn.kill.ll
;CHECK-LABEL: {{^}}kill:
;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1
;WAVE64: s_wqm_b64 [[WQM:[^,]+]], [[CMP]]
-;WAVE64: s_and_b64 exec, exec, [[WQM]]
+;WAVE64: s_xor_b64 [[KILL:[^,]+]], [[WQM]], exec
+;WAVE64: s_andn2_b64 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]]
+;WAVE64: s_and_b64 exec, exec, [[MASK]]
;WAVE32: s_wqm_b32 [[WQM:[^,]+]], [[CMP]]
-;WAVE32: s_and_b32 exec_lo, exec_lo, [[WQM]]
+;WAVE32: s_xor_b32 [[KILL:[^,]+]], [[WQM]], exec
+;WAVE32: s_andn2_b32 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]]
+;WAVE32: s_and_b32 exec_lo, exec_lo, [[MASK]]
;CHECK: s_endpgm
-define amdgpu_ps void @kill(i32 %v0, i32 %v1) #1 {
+define amdgpu_ps float @kill(i32 %v0, i32 %v1) #1 {
main_body:
%c = icmp eq i32 %v0, %v1
%w = call i1 @llvm.amdgcn.wqm.vote(i1 %c)
call void @llvm.amdgcn.kill(i1 %w)
- ret void
+ ret float 0.0
}
declare void @llvm.amdgcn.kill(i1) #1
define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
; GCN-LABEL: test_kill_depth_0_imm_pos:
; GCN: ; %bb.0:
-; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_endpgm
call void @llvm.amdgcn.kill(i1 true)
ret void
define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
; GCN-LABEL: test_kill_depth_0_imm_neg:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 exec, 0
-; GCN-NEXT: s_cbranch_execz BB1_2
-; GCN-NEXT: ; %bb.1:
+; GCN-NEXT: s_andn2_b64 exec, exec, exec
+; GCN-NEXT: s_cbranch_scc0 BB1_1
; GCN-NEXT: s_endpgm
-; GCN-NEXT: BB1_2:
+; GCN-NEXT: BB1_1:
+; GCN-NEXT: s_mov_b64 exec, 0
; GCN-NEXT: exp null off, off, off, off done vm
; GCN-NEXT: s_endpgm
call void @llvm.amdgcn.kill(i1 false)
define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
; GCN-LABEL: test_kill_depth_0_imm_neg_x2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 exec, 0
-; GCN-NEXT: s_cbranch_execz BB2_3
+; GCN-NEXT: s_mov_b64 s[0:1], exec
+; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_cbranch_scc0 BB2_2
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_mov_b64 exec, 0
-; GCN-NEXT: s_cbranch_execz BB2_3
-; GCN-NEXT: ; %bb.2:
+; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_cbranch_scc0 BB2_2
; GCN-NEXT: s_endpgm
-; GCN-NEXT: BB2_3:
+; GCN-NEXT: BB2_2:
+; GCN-NEXT: s_mov_b64 exec, 0
; GCN-NEXT: exp null off, off, off, off done vm
; GCN-NEXT: s_endpgm
call void @llvm.amdgcn.kill(i1 false)
}
define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
-; SI-LABEL: test_kill_depth_var:
-; SI: ; %bb.0:
-; SI-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; SI-NEXT: s_cbranch_execz BB3_2
-; SI-NEXT: ; %bb.1:
-; SI-NEXT: s_endpgm
-; SI-NEXT: BB3_2:
-; SI-NEXT: exp null off, off, off, off done vm
-; SI-NEXT: s_endpgm
-;
-; GFX10-LABEL: test_kill_depth_var:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_cmpx_gt_f32_e32 0, v0
-; GFX10-NEXT: s_cbranch_execz BB3_2
-; GFX10-NEXT: ; %bb.1:
-; GFX10-NEXT: s_endpgm
-; GFX10-NEXT: BB3_2:
-; GFX10-NEXT: exp null off, off, off, off done vm
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: test_kill_depth_var:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
+; GCN-NEXT: s_andn2_b64 exec, exec, vcc
+; GCN-NEXT: s_cbranch_scc0 BB3_1
+; GCN-NEXT: s_endpgm
+; GCN-NEXT: BB3_1:
+; GCN-NEXT: s_mov_b64 exec, 0
+; GCN-NEXT: exp null off, off, off, off done vm
+; GCN-NEXT: s_endpgm
%cmp = fcmp olt float %x, 0.0
call void @llvm.amdgcn.kill(i1 %cmp)
ret void
; FIXME: Ideally only one early-exit would be emitted
define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
-; SI-LABEL: test_kill_depth_var_x2_same:
-; SI: ; %bb.0:
-; SI-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; SI-NEXT: s_cbranch_execz BB4_3
-; SI-NEXT: ; %bb.1:
-; SI-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; SI-NEXT: s_cbranch_execz BB4_3
-; SI-NEXT: ; %bb.2:
-; SI-NEXT: s_endpgm
-; SI-NEXT: BB4_3:
-; SI-NEXT: exp null off, off, off, off done vm
-; SI-NEXT: s_endpgm
-;
-; GFX10-LABEL: test_kill_depth_var_x2_same:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_cmpx_gt_f32_e32 0, v0
-; GFX10-NEXT: s_cbranch_execz BB4_3
-; GFX10-NEXT: ; %bb.1:
-; GFX10-NEXT: s_waitcnt_depctr 0xfffe
-; GFX10-NEXT: v_cmpx_gt_f32_e32 0, v0
-; GFX10-NEXT: s_cbranch_execz BB4_3
-; GFX10-NEXT: ; %bb.2:
-; GFX10-NEXT: s_endpgm
-; GFX10-NEXT: BB4_3:
-; GFX10-NEXT: exp null off, off, off, off done vm
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: test_kill_depth_var_x2_same:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b64 s[0:1], exec
+; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; GCN-NEXT: s_cbranch_scc0 BB4_2
+; GCN-NEXT: ; %bb.1:
+; GCN-NEXT: s_andn2_b64 exec, exec, vcc
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
+; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; GCN-NEXT: s_cbranch_scc0 BB4_2
+; GCN-NEXT: s_endpgm
+; GCN-NEXT: BB4_2:
+; GCN-NEXT: s_mov_b64 exec, 0
+; GCN-NEXT: exp null off, off, off, off done vm
+; GCN-NEXT: s_endpgm
%cmp = fcmp olt float %x, 0.0
call void @llvm.amdgcn.kill(i1 %cmp)
call void @llvm.amdgcn.kill(i1 %cmp)
; FIXME: Ideally only one early-exit would be emitted
define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
-; SI-LABEL: test_kill_depth_var_x2:
-; SI: ; %bb.0:
-; SI-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; SI-NEXT: s_cbranch_execz BB5_3
-; SI-NEXT: ; %bb.1:
-; SI-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1
-; SI-NEXT: s_cbranch_execz BB5_3
-; SI-NEXT: ; %bb.2:
-; SI-NEXT: s_endpgm
-; SI-NEXT: BB5_3:
-; SI-NEXT: exp null off, off, off, off done vm
-; SI-NEXT: s_endpgm
-;
-; GFX10-LABEL: test_kill_depth_var_x2:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_cmpx_gt_f32_e32 0, v0
-; GFX10-NEXT: s_cbranch_execz BB5_3
-; GFX10-NEXT: ; %bb.1:
-; GFX10-NEXT: s_waitcnt_depctr 0xfffe
-; GFX10-NEXT: v_cmpx_gt_f32_e32 0, v1
-; GFX10-NEXT: s_cbranch_execz BB5_3
-; GFX10-NEXT: ; %bb.2:
-; GFX10-NEXT: s_endpgm
-; GFX10-NEXT: BB5_3:
-; GFX10-NEXT: exp null off, off, off, off done vm
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: test_kill_depth_var_x2:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b64 s[0:1], exec
+; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; GCN-NEXT: s_cbranch_scc0 BB5_2
+; GCN-NEXT: ; %bb.1:
+; GCN-NEXT: s_andn2_b64 exec, exec, vcc
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
+; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; GCN-NEXT: s_cbranch_scc0 BB5_2
+; GCN-NEXT: s_endpgm
+; GCN-NEXT: BB5_2:
+; GCN-NEXT: s_mov_b64 exec, 0
+; GCN-NEXT: exp null off, off, off, off done vm
+; GCN-NEXT: s_endpgm
%cmp.x = fcmp olt float %x, 0.0
call void @llvm.amdgcn.kill(i1 %cmp.x)
%cmp.y = fcmp olt float %y, 0.0
}
define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
-; SI-LABEL: test_kill_depth_var_x2_instructions:
-; SI: ; %bb.0:
-; SI-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; SI-NEXT: s_cbranch_execz BB6_3
-; SI-NEXT: ; %bb.1:
-; SI-NEXT: ;;#ASMSTART
-; SI-NEXT: v_mov_b32_e64 v7, -1
-; SI-NEXT: ;;#ASMEND
-; SI-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v7
-; SI-NEXT: s_cbranch_execz BB6_3
-; SI-NEXT: ; %bb.2:
-; SI-NEXT: s_endpgm
-; SI-NEXT: BB6_3:
-; SI-NEXT: exp null off, off, off, off done vm
-; SI-NEXT: s_endpgm
-;
-; GFX10-LABEL: test_kill_depth_var_x2_instructions:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_cmpx_gt_f32_e32 0, v0
-; GFX10-NEXT: s_cbranch_execz BB6_3
-; GFX10-NEXT: ; %bb.1:
-; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: v_mov_b32_e64 v7, -1
-; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: s_waitcnt_depctr 0xfffe
-; GFX10-NEXT: v_cmpx_gt_f32_e32 0, v7
-; GFX10-NEXT: s_cbranch_execz BB6_3
-; GFX10-NEXT: ; %bb.2:
-; GFX10-NEXT: s_endpgm
-; GFX10-NEXT: BB6_3:
-; GFX10-NEXT: exp null off, off, off, off done vm
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: test_kill_depth_var_x2_instructions:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
+; GCN-NEXT: s_mov_b64 s[0:1], exec
+; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; GCN-NEXT: s_cbranch_scc0 BB6_2
+; GCN-NEXT: ; %bb.1:
+; GCN-NEXT: s_andn2_b64 exec, exec, vcc
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_mov_b32_e64 v7, -1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7
+; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; GCN-NEXT: s_cbranch_scc0 BB6_2
+; GCN-NEXT: s_endpgm
+; GCN-NEXT: BB6_2:
+; GCN-NEXT: s_mov_b64 exec, 0
+; GCN-NEXT: exp null off, off, off, off done vm
+; GCN-NEXT: s_endpgm
%cmp.x = fcmp olt float %x, 0.0
call void @llvm.amdgcn.kill(i1 %cmp.x)
%y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={v7}"()
; FIXME: Why does the skip depend on the asm length in the same block?
define amdgpu_ps float @test_kill_control_flow(i32 inreg %arg) #0 {
-; SI-LABEL: test_kill_control_flow:
-; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_cmp_lg_u32 s0, 0
-; SI-NEXT: s_cbranch_scc1 BB7_2
-; SI-NEXT: ; %bb.1: ; %bb
-; SI-NEXT: ;;#ASMSTART
-; SI-NEXT: v_mov_b32_e64 v7, -1
-; SI-NEXT: v_nop_e64
-; SI-NEXT: v_nop_e64
-; SI-NEXT: v_nop_e64
-; SI-NEXT: v_nop_e64
-; SI-NEXT: v_nop_e64
-; SI-NEXT: v_nop_e64
-; SI-NEXT: v_nop_e64
-; SI-NEXT: v_nop_e64
-; SI-NEXT: v_nop_e64
-; SI-NEXT: v_nop_e64
-; SI-NEXT: ;;#ASMEND
-; SI-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v7
-; SI-NEXT: BB7_2: ; %exit
-; SI-NEXT: v_mov_b32_e32 v0, 1.0
-; SI-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: test_kill_control_flow:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_cmp_lg_u32 s0, 0
-; GFX10-NEXT: s_cbranch_scc1 BB7_2
-; GFX10-NEXT: ; %bb.1: ; %bb
-; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: v_mov_b32_e64 v7, -1
-; GFX10-NEXT: v_nop_e64
-; GFX10-NEXT: v_nop_e64
-; GFX10-NEXT: v_nop_e64
-; GFX10-NEXT: v_nop_e64
-; GFX10-NEXT: v_nop_e64
-; GFX10-NEXT: v_nop_e64
-; GFX10-NEXT: v_nop_e64
-; GFX10-NEXT: v_nop_e64
-; GFX10-NEXT: v_nop_e64
-; GFX10-NEXT: v_nop_e64
-; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_cmpx_gt_f32_e32 0, v7
-; GFX10-NEXT: BB7_2: ; %exit
-; GFX10-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: test_kill_control_flow:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_cbranch_scc0 BB7_2
+; GCN-NEXT: ; %bb.1: ; %exit
+; GCN-NEXT: v_mov_b32_e32 v0, 1.0
+; GCN-NEXT: s_branch BB7_5
+; GCN-NEXT: BB7_2: ; %bb
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_mov_b32_e64 v7, -1
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7
+; GCN-NEXT: s_mov_b64 s[2:3], exec
+; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
+; GCN-NEXT: s_cbranch_scc0 BB7_4
+; GCN-NEXT: ; %bb.3: ; %bb
+; GCN-NEXT: s_andn2_b64 exec, exec, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, 1.0
+; GCN-NEXT: s_branch BB7_5
+; GCN-NEXT: BB7_4:
+; GCN-NEXT: s_mov_b64 exec, 0
+; GCN-NEXT: exp null off, off, off, off done vm
+; GCN-NEXT: s_endpgm
+; GCN-NEXT: BB7_5:
entry:
%cmp = icmp eq i32 %arg, 0
br i1 %cmp, label %bb, label %exit
; SI-NEXT: v_nop_e64
; SI-NEXT: v_nop_e64
; SI-NEXT: ;;#ASMEND
+; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7
+; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: v_mov_b32_e64 v8, -1
; SI-NEXT: ;;#ASMEND
-; SI-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v7
+; SI-NEXT: s_cbranch_scc0 BB8_4
; SI-NEXT: ; %bb.2: ; %bb
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0
; SI-NEXT: s_endpgm
+; SI-NEXT: BB8_4:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: exp null off, off, off, off done vm
+; SI-NEXT: s_endpgm
;
; GFX10-LABEL: test_kill_control_flow_remainder:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_nop_e64
; GFX10-NEXT: v_nop_e64
; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7
+; GFX10-NEXT: s_mov_b64 s[2:3], exec
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: v_mov_b32_e64 v8, -1
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: v_cmpx_gt_f32_e32 0, v7
-; GFX10-NEXT: s_cbranch_execz BB8_4
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
+; GFX10-NEXT: s_cbranch_scc0 BB8_4
; GFX10-NEXT: ; %bb.3: ; %bb
-; GFX10-NEXT: s_nop 3
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
; GFX10-NEXT: global_store_dword v[0:1], v8, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: global_store_dword v[0:1], v9, off
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: BB8_4:
+; GFX10-NEXT: s_mov_b64 exec, 0
; GFX10-NEXT: exp null off, off, off, off done vm
; GFX10-NEXT: s_endpgm
entry:
define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 {
; SI-LABEL: test_kill_control_flow_return:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: v_cmp_eq_u32_e64 s[2:3], s0, 1
-; SI-NEXT: s_and_b64 exec, exec, s[2:3]
-; SI-NEXT: s_cbranch_execz BB9_4
+; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s0, 1
+; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], exec
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; SI-NEXT: s_cbranch_scc0 BB9_4
; SI-NEXT: ; %bb.1: ; %entry
+; SI-NEXT: s_and_b64 exec, exec, s[2:3]
; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_cbranch_scc0 BB9_3
; SI-NEXT: v_mov_b32_e32 v0, v7
; SI-NEXT: s_branch BB9_5
; SI-NEXT: BB9_4:
+; SI-NEXT: s_mov_b64 exec, 0
; SI-NEXT: exp null off, off, off, off done vm
; SI-NEXT: s_endpgm
; SI-NEXT: BB9_5:
;
; GFX10-LABEL: test_kill_control_flow_return:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: v_cmp_eq_u32_e64 s[2:3], s0, 1
-; GFX10-NEXT: s_and_b64 exec, exec, s[2:3]
-; GFX10-NEXT: s_cbranch_execz BB9_4
+; GFX10-NEXT: v_cmp_eq_u32_e64 s[4:5], s0, 1
+; GFX10-NEXT: s_mov_b64 s[2:3], exec
+; GFX10-NEXT: s_xor_b64 s[4:5], s[4:5], exec
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT: s_cbranch_scc0 BB9_4
; GFX10-NEXT: ; %bb.1: ; %entry
+; GFX10-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_cmp_lg_u32 s0, 0
; GFX10-NEXT: s_cbranch_scc0 BB9_3
; GFX10-NEXT: v_mov_b32_e32 v0, v7
; GFX10-NEXT: s_branch BB9_5
; GFX10-NEXT: BB9_4:
+; GFX10-NEXT: s_mov_b64 exec, 0
; GFX10-NEXT: exp null off, off, off, off done vm
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: BB9_5:
define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
; SI-LABEL: test_kill_divergent_loop:
; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; SI-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; SI-NEXT: s_cbranch_execz BB10_4
; SI-NEXT: ; %bb.1: ; %bb.preheader
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: BB10_2: ; %bb
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: v_nop_e64
; SI-NEXT: v_nop_e64
; SI-NEXT: ;;#ASMEND
-; SI-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v7
+; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7
+; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; SI-NEXT: s_cbranch_scc0 BB10_5
; SI-NEXT: ; %bb.3: ; %bb
; SI-NEXT: ; in Loop: Header=BB10_2 Depth=1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT: s_and_b64 vcc, exec, vcc
; SI-NEXT: s_cbranch_vccnz BB10_2
; SI-NEXT: BB10_4: ; %Flow1
-; SI-NEXT: s_or_b64 exec, exec, s[0:1]
-; SI-NEXT: s_cbranch_execz BB10_6
-; SI-NEXT: ; %bb.5: ; %Flow1
+; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
-; SI-NEXT: BB10_6:
+; SI-NEXT: BB10_5:
+; SI-NEXT: s_mov_b64 exec, 0
; SI-NEXT: exp null off, off, off, off done vm
; SI-NEXT: s_endpgm
;
; GFX10-LABEL: test_kill_divergent_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX10-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX10-NEXT: s_mov_b64 s[0:1], exec
+; GFX10-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX10-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX10-NEXT: s_cbranch_execz BB10_3
; GFX10-NEXT: BB10_1: ; %bb
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_nop_e64
; GFX10-NEXT: v_nop_e64
; GFX10-NEXT: ;;#ASMEND
-; GFX10-NEXT: s_waitcnt_depctr 0xfffe
-; GFX10-NEXT: v_cmpx_gt_f32_e32 0, v7
+; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7
+; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
+; GFX10-NEXT: s_cbranch_scc0 BB10_4
; GFX10-NEXT: ; %bb.2: ; %bb
; GFX10-NEXT: ; in Loop: Header=BB10_1 Depth=1
-; GFX10-NEXT: s_nop 4
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10-NEXT: s_and_b64 vcc, exec, vcc
; GFX10-NEXT: s_cbranch_vccnz BB10_1
; GFX10-NEXT: BB10_3: ; %Flow1
-; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX10-NEXT: s_cbranch_execz BB10_5
-; GFX10-NEXT: ; %bb.4: ; %Flow1
+; GFX10-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-NEXT: v_mov_b32_e32 v0, 8
; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
-; GFX10-NEXT: BB10_5:
+; GFX10-NEXT: BB10_4:
+; GFX10-NEXT: s_mov_b64 exec, 0
; GFX10-NEXT: exp null off, off, off, off done vm
; GFX10-NEXT: s_endpgm
entry:
; SI-NEXT: v_add_f32_e64 v1, s0, 1.0
; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
-; SI-NEXT: v_cmpx_lt_f32_e32 vcc, 0, v1
-; SI-NEXT: s_cbranch_execz BB11_6
+; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_cbranch_scc0 BB11_6
; SI-NEXT: ; %bb.1: ; %bb
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
; SI-NEXT: s_cbranch_scc0 BB11_3
; SI-NEXT: ; %bb.2: ; %bb8
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: BB11_5: ; %end
; SI-NEXT: s_endpgm
; SI-NEXT: BB11_6:
+; SI-NEXT: s_mov_b64 exec, 0
; SI-NEXT: exp null off, off, off, off done vm
; SI-NEXT: s_endpgm
;
; GFX10-NEXT: v_add_f32_e64 v1, s0, 1.0
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
-; GFX10-NEXT: v_cmpx_lt_f32_e32 0, v1
-; GFX10-NEXT: s_cbranch_execz BB11_6
+; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_cbranch_scc0 BB11_6
; GFX10-NEXT: ; %bb.1: ; %bb
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
; GFX10-NEXT: s_cbranch_scc0 BB11_3
; GFX10-NEXT: ; %bb.2: ; %bb8
; GFX10-NEXT: v_mov_b32_e32 v1, 8
; GFX10-NEXT: v_mov_b32_e32 v0, 4.0
-; GFX10-NEXT: s_nop 0
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: BB11_3: ; %phibb
; GFX10-NEXT: BB11_5: ; %end
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: BB11_6:
+; GFX10-NEXT: s_mov_b64 exec, 0
; GFX10-NEXT: exp null off, off, off, off done vm
; GFX10-NEXT: s_endpgm
bb:
define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 {
; SI-LABEL: no_skip_no_successors:
; SI: ; %bb.0: ; %bb
-; SI-NEXT: v_cmp_nge_f32_e64 s[2:3], s1, 0
-; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
-; SI-NEXT: s_cbranch_vccz BB12_2
+; SI-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0
+; SI-NEXT: s_and_b64 vcc, exec, s[4:5]
+; SI-NEXT: s_cbranch_vccz BB12_3
; SI-NEXT: ; %bb.1: ; %bb6
+; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; SI-NEXT: s_cbranch_scc0 BB12_5
+; SI-NEXT: ; %bb.2: ; %bb6
; SI-NEXT: s_mov_b64 exec, 0
-; SI-NEXT: exp null off, off, off, off done vm
-; SI-NEXT: s_endpgm
-; SI-NEXT: BB12_2: ; %bb3
+; SI-NEXT: BB12_3: ; %bb3
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7ae148
; SI-NEXT: v_cmp_nge_f32_e32 vcc, s0, v0
; SI-NEXT: s_and_b64 vcc, exec, vcc
-; SI-NEXT: ; %bb.3: ; %bb5
+; SI-NEXT: ; %bb.4: ; %bb5
+; SI-NEXT: BB12_5:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: exp null off, off, off, off done vm
+; SI-NEXT: s_endpgm
;
; GFX10-LABEL: no_skip_no_successors:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: v_cmp_nge_f32_e64 s[2:3], s1, 0
-; GFX10-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GFX10-NEXT: s_cbranch_vccz BB12_2
+; GFX10-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0
+; GFX10-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX10-NEXT: s_cbranch_vccz BB12_3
; GFX10-NEXT: ; %bb.1: ; %bb6
+; GFX10-NEXT: s_mov_b64 s[2:3], exec
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; GFX10-NEXT: s_cbranch_scc0 BB12_5
+; GFX10-NEXT: ; %bb.2: ; %bb6
; GFX10-NEXT: s_mov_b64 exec, 0
-; GFX10-NEXT: exp null off, off, off, off done vm
-; GFX10-NEXT: s_endpgm
-; GFX10-NEXT: BB12_2: ; %bb3
+; GFX10-NEXT: BB12_3: ; %bb3
; GFX10-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x3e7ae148, s0
; GFX10-NEXT: s_and_b64 vcc, exec, s[0:1]
-; GFX10-NEXT: ; %bb.3: ; %bb5
+; GFX10-NEXT: ; %bb.4: ; %bb5
+; GFX10-NEXT: BB12_5:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: exp null off, off, off, off done vm
+; GFX10-NEXT: s_endpgm
bb:
%tmp = fcmp ult float %arg1, 0.000000e+00
%tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000
define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 {
; SI-LABEL: if_after_kill_block:
; SI: ; %bb.0: ; %bb
+; SI-NEXT: s_mov_b64 s[2:3], exec
; SI-NEXT: s_wqm_b64 exec, exec
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1
-; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; SI-NEXT: s_cbranch_execz BB13_2
+; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: ; %bb.1: ; %bb3
-; SI-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; SI-NEXT: BB13_2: ; %bb4
-; SI-NEXT: s_or_b64 exec, exec, s[2:3]
-; SI-NEXT: s_cbranch_execz BB13_6
+; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
+; SI-NEXT: s_cbranch_scc0 BB13_6
+; SI-NEXT: ; %bb.2: ; %bb3
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
; SI-NEXT: ; %bb.3: ; %bb4
+; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: s_mov_b32 s2, s0
; SI-NEXT: s_mov_b32 s3, s0
; SI-NEXT: BB13_5: ; %UnifiedReturnBlock
; SI-NEXT: s_endpgm
; SI-NEXT: BB13_6:
+; SI-NEXT: s_mov_b64 exec, 0
; SI-NEXT: exp null off, off, off, off done vm
; SI-NEXT: s_endpgm
;
; GFX10-LABEL: if_after_kill_block:
; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_mov_b64 s[2:3], exec
; GFX10-NEXT: s_wqm_b64 exec, exec
; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1
; GFX10-NEXT: s_mov_b32 s0, 0
-; GFX10-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX10-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX10-NEXT: s_cbranch_execz BB13_2
+; GFX10-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX10-NEXT: ; %bb.1: ; %bb3
-; GFX10-NEXT: s_waitcnt_depctr 0xfffe
-; GFX10-NEXT: v_cmpx_gt_f32_e32 0, v0
-; GFX10-NEXT: BB13_2: ; %bb4
-; GFX10-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10-NEXT: s_cbranch_execz BB13_6
+; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
+; GFX10-NEXT: s_cbranch_scc0 BB13_6
+; GFX10-NEXT: ; %bb.2: ; %bb3
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
; GFX10-NEXT: ; %bb.3: ; %bb4
+; GFX10-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10-NEXT: s_mov_b32 s1, s0
; GFX10-NEXT: s_mov_b32 s2, s0
; GFX10-NEXT: s_mov_b32 s3, s0
; GFX10-NEXT: BB13_5: ; %UnifiedReturnBlock
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: BB13_6:
+; GFX10-NEXT: s_mov_b64 exec, 0
; GFX10-NEXT: exp null off, off, off, off done vm
; GFX10-NEXT: s_endpgm
bb:
; SI-NEXT: s_mov_b32 m0, s0
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: v_interp_p1_f32 v2, v0, attr1.x
+; SI-NEXT: s_mov_b64 s[2:3], exec
; SI-NEXT: v_mov_b32_e32 v3, v2
; SI-NEXT: v_mov_b32_e32 v4, v2
; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: image_sample_lz v2, v[2:4], s[4:11], s[0:3] dmask:0x1 da
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ge_f32_e32 vcc, 0, v2
-; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; SI-NEXT: s_cbranch_execz BB14_2
+; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: ; %bb.1: ; %kill
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; SI-NEXT: s_cbranch_scc0 BB14_6
+; SI-NEXT: ; %bb.2: ; %kill
; SI-NEXT: s_mov_b64 exec, 0
-; SI-NEXT: BB14_2: ; %Flow
-; SI-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
+; SI-NEXT: ; %bb.3: ; %Flow
+; SI-NEXT: s_or_saveexec_b64 s[2:3], s[4:5]
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: ; implicit-def: $vgpr6
; SI-NEXT: s_xor_b64 exec, exec, s[2:3]
-; SI-NEXT: ; %bb.3: ; %live
+; SI-NEXT: ; %bb.4: ; %live
; SI-NEXT: s_mov_b32 m0, s0
; SI-NEXT: v_interp_p1_f32 v4, v0, attr0.x
; SI-NEXT: v_interp_p1_f32 v0, v0, attr0.y
; SI-NEXT: v_interp_p2_f32 v0, v1, attr0.y
; SI-NEXT: v_mul_f32_e32 v4, v4, v2
; SI-NEXT: v_mul_f32_e32 v6, v0, v2
-; SI-NEXT: ; %bb.4: ; %export
-; SI-NEXT: s_or_b64 exec, exec, s[2:3]
-; SI-NEXT: s_cbranch_execz BB14_6
; SI-NEXT: ; %bb.5: ; %export
+; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v3, v4
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v5, v6
; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; SI-NEXT: s_endpgm
; SI-NEXT: BB14_6:
+; SI-NEXT: s_mov_b64 exec, 0
; SI-NEXT: exp null off, off, off, off done vm
; SI-NEXT: s_endpgm
;
; GFX10-NEXT: s_mov_b32 m0, s0
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_interp_p1_f32_e32 v2, v0, attr1.x
+; GFX10-NEXT: s_mov_b64 s[2:3], exec
; GFX10-NEXT: s_mov_b32 s5, s4
; GFX10-NEXT: s_mov_b32 s6, s4
; GFX10-NEXT: s_mov_b32 s7, s4
; GFX10-NEXT: image_sample_lz v2, [v2, v2, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 0, v2
-; GFX10-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX10-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX10-NEXT: s_cbranch_execz BB14_2
+; GFX10-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX10-NEXT: ; %bb.1: ; %kill
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; GFX10-NEXT: s_cbranch_scc0 BB14_6
+; GFX10-NEXT: ; %bb.2: ; %kill
; GFX10-NEXT: s_mov_b64 exec, 0
-; GFX10-NEXT: BB14_2: ; %Flow
-; GFX10-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
+; GFX10-NEXT: ; %bb.3: ; %Flow
+; GFX10-NEXT: s_or_saveexec_b64 s[2:3], s[4:5]
; GFX10-NEXT: ; implicit-def: $vgpr3
; GFX10-NEXT: ; implicit-def: $vgpr5
; GFX10-NEXT: ; implicit-def: $vgpr4
; GFX10-NEXT: ; implicit-def: $vgpr6
; GFX10-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX10-NEXT: ; %bb.3: ; %live
+; GFX10-NEXT: ; %bb.4: ; %live
; GFX10-NEXT: s_mov_b32 m0, s0
; GFX10-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.x
; GFX10-NEXT: v_interp_p1_f32_e32 v0, v0, attr0.y
; GFX10-NEXT: v_interp_p2_f32_e32 v11, v1, attr0.y
; GFX10-NEXT: v_mul_f32_e32 v5, v7, v2
; GFX10-NEXT: v_mul_f32_e32 v6, v11, v2
-; GFX10-NEXT: ; %bb.4: ; %export
-; GFX10-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10-NEXT: s_cbranch_execz BB14_6
; GFX10-NEXT: ; %bb.5: ; %export
+; GFX10-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v3, v5
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v4, v6
; GFX10-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: BB14_6:
+; GFX10-NEXT: s_mov_b64 exec, 0
; GFX10-NEXT: exp null off, off, off, off done vm
; GFX10-NEXT: s_endpgm
.entry:
; SI: ; %bb.0: ; %.entry
; SI-NEXT: s_cmp_lt_i32 s0, 1
; SI-NEXT: v_mov_b32_e32 v2, -1
-; SI-NEXT: s_cbranch_scc1 BB15_6
+; SI-NEXT: s_cbranch_scc1 BB15_7
; SI-NEXT: ; %bb.1: ; %.lr.ph
-; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b64 s[0:1], 0
-; SI-NEXT: s_branch BB15_4
-; SI-NEXT: BB15_2: ; %kill
-; SI-NEXT: ; in Loop: Header=BB15_4 Depth=1
-; SI-NEXT: s_mov_b64 exec, 0
-; SI-NEXT: BB15_3: ; %latch
-; SI-NEXT: ; in Loop: Header=BB15_4 Depth=1
-; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_add_i32 s2, s2, 1
-; SI-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1
+; SI-NEXT: s_branch BB15_3
+; SI-NEXT: BB15_2: ; %latch
+; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
+; SI-NEXT: s_add_i32 s4, s4, 1
+; SI-NEXT: v_cmp_ge_i32_e32 vcc, s4, v1
; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v2, s4
; SI-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; SI-NEXT: s_cbranch_execz BB15_5
-; SI-NEXT: BB15_4: ; %hdr
+; SI-NEXT: s_cbranch_execz BB15_6
+; SI-NEXT: BB15_3: ; %hdr
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: v_cmp_gt_u32_e32 vcc, s2, v0
-; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; SI-NEXT: s_cbranch_execnz BB15_2
-; SI-NEXT: s_branch BB15_3
-; SI-NEXT: BB15_5: ; %Flow
+; SI-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
+; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SI-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; SI-NEXT: s_cbranch_execz BB15_2
+; SI-NEXT: ; %bb.4: ; %kill
+; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; SI-NEXT: s_cbranch_scc0 BB15_8
+; SI-NEXT: ; %bb.5: ; %kill
+; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_branch BB15_2
+; SI-NEXT: BB15_6: ; %Flow
; SI-NEXT: s_or_b64 exec, exec, s[0:1]
-; SI-NEXT: BB15_6: ; %._crit_edge
+; SI-NEXT: BB15_7: ; %._crit_edge
; SI-NEXT: exp mrt0 v2, v2, v0, v0 done compr vm
; SI-NEXT: s_endpgm
+; SI-NEXT: BB15_8:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: exp null off, off, off, off done vm
+; SI-NEXT: s_endpgm
;
; GFX10-LABEL: complex_loop:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_mov_b32_e32 v2, -1
; GFX10-NEXT: s_cmp_lt_i32 s0, 1
-; GFX10-NEXT: s_cbranch_scc1 BB15_6
+; GFX10-NEXT: s_cbranch_scc1 BB15_7
; GFX10-NEXT: ; %bb.1: ; %.lr.ph
-; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: s_mov_b64 s[2:3], exec
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b64 s[0:1], 0
-; GFX10-NEXT: s_branch BB15_4
-; GFX10-NEXT: BB15_2: ; %kill
-; GFX10-NEXT: ; in Loop: Header=BB15_4 Depth=1
-; GFX10-NEXT: s_mov_b64 exec, 0
-; GFX10-NEXT: BB15_3: ; %latch
-; GFX10-NEXT: ; in Loop: Header=BB15_4 Depth=1
-; GFX10-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX10-NEXT: s_add_i32 s2, s2, 1
-; GFX10-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: s_branch BB15_3
+; GFX10-NEXT: BB15_2: ; %latch
+; GFX10-NEXT: ; in Loop: Header=BB15_3 Depth=1
+; GFX10-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX10-NEXT: s_add_i32 s4, s4, 1
+; GFX10-NEXT: v_cmp_ge_i32_e32 vcc, s4, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX10-NEXT: s_cbranch_execz BB15_5
-; GFX10-NEXT: BB15_4: ; %hdr
+; GFX10-NEXT: s_cbranch_execz BB15_6
+; GFX10-NEXT: BB15_3: ; %hdr
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, s2, v0
-; GFX10-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX10-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX10-NEXT: s_cbranch_execnz BB15_2
-; GFX10-NEXT: s_branch BB15_3
-; GFX10-NEXT: BB15_5: ; %Flow
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
+; GFX10-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX10-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX10-NEXT: s_cbranch_execz BB15_2
+; GFX10-NEXT: ; %bb.4: ; %kill
+; GFX10-NEXT: ; in Loop: Header=BB15_3 Depth=1
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; GFX10-NEXT: s_cbranch_scc0 BB15_8
+; GFX10-NEXT: ; %bb.5: ; %kill
+; GFX10-NEXT: ; in Loop: Header=BB15_3 Depth=1
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_branch BB15_2
+; GFX10-NEXT: BB15_6: ; %Flow
; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX10-NEXT: BB15_6: ; %._crit_edge
+; GFX10-NEXT: BB15_7: ; %._crit_edge
; GFX10-NEXT: exp mrt0 v2, v2, v0, v0 done compr vm
; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: BB15_8:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: exp null off, off, off, off done vm
+; GFX10-NEXT: s_endpgm
.entry:
%flaga = icmp sgt i32 %cmpa, 0
br i1 %flaga, label %.lr.ph, label %._crit_edge
; GCN: successors: %bb.1(0x40000000), %bb.4(0x40000000)
; GCN: liveins: $vgpr0
; GCN: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec
+ ; GCN: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec
- ; GCN: $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
- ; GCN: renamable $sgpr0_sgpr1 = S_XOR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
+ ; GCN: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GCN: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc
; GCN: S_CBRANCH_EXECZ %bb.4, implicit $exec
; GCN: bb.1.flow.preheader:
; GCN: successors: %bb.2(0x80000000)
- ; GCN: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GCN: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3
; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec
- ; GCN: renamable $sgpr2_sgpr3 = S_MOV_B64 0
+ ; GCN: renamable $sgpr4_sgpr5 = S_MOV_B64 0
; GCN: bb.2.flow:
; GCN: successors: %bb.3(0x04000000), %bb.2(0x7c000000)
- ; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3
- ; GCN: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
- ; GCN: renamable $sgpr2_sgpr3 = S_OR_B64 killed renamable $sgpr4_sgpr5, killed renamable $sgpr2_sgpr3, implicit-def $scc
- ; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc
+ ; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
+ ; GCN: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc
+ ; GCN: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc
+ ; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc
; GCN: S_CBRANCH_EXECNZ %bb.2, implicit $exec
; GCN: bb.3.Flow:
; GCN: successors: %bb.4(0x80000000)
- ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
- ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
+ ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
+ ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GCN: bb.4.Flow1:
- ; GCN: successors: %bb.5(0x40000000), %bb.6(0x40000000)
- ; GCN: liveins: $sgpr0_sgpr1
- ; GCN: renamable $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec
- ; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
- ; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec
+ ; GCN: successors: %bb.5(0x40000000)
+ ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; GCN: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc
; GCN: bb.5.kill0:
+ ; GCN: successors: %bb.8(0x40000000), %bb.7(0x40000000)
+ ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; GCN: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc
+ ; GCN: S_CBRANCH_SCC0 %bb.7, implicit $scc
+ ; GCN: bb.8.kill0:
; GCN: successors: %bb.6(0x80000000)
- ; GCN: liveins: $sgpr0_sgpr1
+ ; GCN: liveins: $sgpr2_sgpr3, $scc
; GCN: $exec = S_MOV_B64 0
; GCN: bb.6.end:
- ; GCN: successors: %bb.7(0x40000000), %bb.8(0x40000000)
- ; GCN: liveins: $sgpr0_sgpr1
- ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
- ; GCN: S_CBRANCH_EXECZ %bb.7, implicit $exec
- ; GCN: S_BRANCH %bb.8
+ ; GCN: successors: %bb.9(0x80000000)
+ ; GCN: liveins: $sgpr2_sgpr3
+ ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
+ ; GCN: S_BRANCH %bb.9
; GCN: bb.7:
+ ; GCN: $exec = S_MOV_B64 0
; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
; GCN: S_ENDPGM 0
- ; GCN: bb.8:
+ ; GCN: bb.9:
entry:
%.i0 = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, %val
%cmp0 = fcmp olt float %.i0, 0.000000e+00
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass si-insert-skips,post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
# GCN-LABEL: name: hazard_vcmpx_smov_exec_lo
# GCN: $sgpr0 = S_MOV_B32 $exec_lo
successors: %bb.1
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr0 = S_MOV_B32 $exec_lo
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr0_sgpr1 = S_MOV_B64 $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
bb.0:
successors: %bb.1
$vgpr0 = V_MOV_B32_e32 $exec_lo, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
bb.0:
successors: %bb.1
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr0 = S_MOV_B32 $exec_lo
$vgpr0 = V_ADDC_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr0 = S_MOV_B32 $exec_lo
$sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $vgpr0, 0, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr0 = S_MOV_B32 $exec_lo
S_WAITCNT_DEPCTR 65534
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr0 = S_MOV_B32 $exec_lo
S_WAITCNT_DEPCTR 65535
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr0 = S_MOV_B32 $exec_lo
S_WAITCNT_DEPCTR 61438
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-skips,post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
# GCN-LABEL: name: hazard_vcmpx_permlane16
# GCN: V_CMPX_LE_F32_nosdst_e32
bb.0:
successors: %bb.1
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
bb.0:
successors: %bb.1
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
bb.0:
successors: %bb.1
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
bb.0:
successors: %bb.1
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
bb.0:
successors: %bb.1
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
bb.0:
successors: %bb.1
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
- SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec
S_BRANCH %bb.1
bb.1:
ret void
}
-; GCN-LABEL: {{^}}test_vopc_vcmpx:
-; GFX1032: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
-; GFX1064: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
-define amdgpu_ps void @test_vopc_vcmpx(float %x) {
+; GCN-LABEL: {{^}}test_vopc_vcmp:
+; GFX1032: v_cmp_nle_f32_e32 vcc_lo, 0, v{{[0-9]+}}
+; GFX1064: v_cmp_nle_f32_e32 vcc, 0, v{{[0-9]+}}
+define amdgpu_ps void @test_vopc_vcmp(float %x) {
%cmp = fcmp oge float %x, 0.0
call void @llvm.amdgcn.kill(i1 %cmp)
ret void
}
; GCN-LABEL: {{^}}test_kill_i1_terminator_i1:
+; GFX1032: s_mov_b32 [[LIVE:s[0-9]+]], exec_lo
; GFX1032: s_or_b32 [[OR:s[0-9]+]],
-; GFX1032: s_and_b32 exec_lo, exec_lo, [[OR]]
+; GFX1032: s_xor_b32 [[KILL:s[0-9]+]], [[OR]], exec_lo
+; GFX1032: s_andn2_b32 [[MASK:s[0-9]+]], [[LIVE]], [[KILL]]
+; GFX1032: s_and_b32 exec_lo, exec_lo, [[MASK]]
+; GFX1064: s_mov_b64 [[LIVE:s\[[0-9:]+\]]], exec
; GFX1064: s_or_b64 [[OR:s\[[0-9:]+\]]],
-; GFX1064: s_and_b64 exec, exec, [[OR]]
+; GFX1064: s_xor_b64 [[KILL:s\[[0-9:]+\]]], [[OR]], exec
+; GFX1064: s_andn2_b64 [[MASK:s\[[0-9:]+\]]], [[LIVE]], [[KILL]]
+; GFX1064: s_and_b64 exec, exec, [[MASK]]
define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
%c1 = icmp slt i32 %a, %b
%c2 = icmp slt i32 %c, %d
%x = or i1 %c1, %c2
call void @llvm.amdgcn.kill(i1 %x)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
ret void
}
; GCN-LABEL: {{^}}test_wqm_vote:
; GFX1032: v_cmp_neq_f32_e32 vcc_lo, 0
+; GFX1032: s_mov_b32 [[LIVE:s[0-9]+]], exec_lo
; GFX1032: s_wqm_b32 [[WQM:s[0-9]+]], vcc_lo
-; GFX1032: s_and_b32 exec_lo, exec_lo, [[WQM]]
+; GFX1032: s_xor_b32 [[KILL:s[0-9]+]], [[WQM]], exec_lo
+; GFX1032: s_andn2_b32 [[MASK:s[0-9]+]], [[LIVE]], [[KILL]]
+; GFX1032: s_and_b32 exec_lo, exec_lo, [[MASK]]
; GFX1064: v_cmp_neq_f32_e32 vcc, 0
-; GFX1064: s_wqm_b64 [[WQM:s\[[0-9:]+\]]], vcc{{$}}
-; GFX1064: s_and_b64 exec, exec, [[WQM]]
+; GFX1064: s_mov_b64 [[LIVE:s\[[0-9:]+\]]], exec
+; GFX1064: s_wqm_b64 [[WQM:s\[[0-9:]+\]]], vcc
+; GFX1064: s_xor_b64 [[KILL:s\[[0-9:]+\]]], [[WQM]], exec
+; GFX1064: s_andn2_b64 [[MASK:s\[[0-9:]+\]]], [[LIVE]], [[KILL]]
+; GFX1064: s_and_b64 exec, exec, [[MASK]]
define amdgpu_ps void @test_wqm_vote(float %a) {
%c1 = fcmp une float %a, 0.0
%c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
call void @llvm.amdgcn.kill(i1 %c2)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
ret void
}
declare i1 @llvm.amdgcn.ps.live()
declare i64 @llvm.cttz.i64(i64, i1)
declare i32 @llvm.cttz.i32(i32, i1)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5
attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind }
attributes #2 = { nounwind readnone optnone noinline }
attributes #3 = { "target-features"="+wavefrontsize32" }
attributes #4 = { "target-features"="+wavefrontsize64" }
+attributes #5 = { inaccessiblememonly nounwind }
;CHECK: image_sample
;CHECK: buffer_store_dword
;CHECK: s_wqm_b64 exec, exec
-;CHECK: v_cmpx_
+;CHECK: v_cmp_
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
-; CHECK: buffer_store_dword
; CHECK-NOT: wqm
-; CHECK: v_cmpx_
+; CHECK-DAG: buffer_store_dword
+; CHECK-DAG: v_cmp_
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
main_body:
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0