return findRegisterDefOperandIdx(Reg, true, false, TRI) != -1;
}
+ /// Returns true if the MachineInstr has an implicit-use operand of exactly
+ /// the given register (not considering sub/super-registers).
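+ /// For example, a query for a super-register such as AMDGPU's EXEC matches
+ /// only an implicit use of EXEC itself, never of EXEC_LO or EXEC_HI.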
+ bool hasRegisterImplicitUseOperand(unsigned Reg) const;
+
/// Returns the operand index that is a use of the specific register or -1
/// if it is not found. It further tightens the search criteria to a use
/// that kills the register if isKill is true.
[IntrNoMem]>; // See int_amdgcn_v_interp_p1 for why this is
// IntrNoMem.
+// Pixel shaders only: whether the current pixel is live (i.e. not a helper
+// invocation for derivative computation).
+def int_amdgcn_ps_live : Intrinsic <
+ [llvm_i1_ty],
+ [],
+ [IntrNoMem]>;
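+//
+// Illustrative IR usage (the same pattern the new test below exercises):
+//   %live = call i1 @llvm.amdgcn.ps.live()
+//   br i1 %live, label %live.path, label %helper.path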
+
def int_amdgcn_mbcnt_lo :
GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
return Size;
}
+/// Returns true if the MachineInstr has an implicit-use operand of exactly
+/// the given register (not considering sub/super-registers).
+bool MachineInstr::hasRegisterImplicitUseOperand(unsigned Reg) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == Reg)
+ return true;
+ }
+ return false;
+}
+
/// findRegisterUseOperandIdx() - Returns the operand index that is a use of
/// the specific register, or -1 if it is not found. It further tightens
/// the search criteria to a use that kills the register if isKill is true.
case Intrinsic::amdgcn_buffer_atomic_or:
case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_ps_live:
return true;
}
// Make sure we aren't losing exec uses in the td files. This mostly requires
// being careful when using let Uses to add other use registers, since that
// replaces the implicit uses rather than appending to them.
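// For example, a definition wrapped in "let Uses = [M0] in ..." silently
// drops the implicit EXEC use unless EXEC is listed as well, as in
// "let Uses = [EXEC, M0] in ...".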
if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
- const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
- if (!Exec || !Exec->isImplicit()) {
+ if (!MI->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
ErrInfo = "VALU instruction does not implicitly read exec mask";
return false;
}
} // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
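+// Pseudo that reads whether the current pixel is truly live. The live mask is
+// a 64-bit lane mask held in an SGPR pair, hence SALU = 1 and the SReg_64
+// result; SIWholeQuadMode lowers it to a copy of the live mask register.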
+let SALU = 1 in
+def SI_PS_LIVE : InstSI <
+ (outs SReg_64:$dst),
+ (ins),
+ "si_ps_live $dst",
+ [(set i1:$dst, (int_amdgcn_ps_live))]
+>;
+
// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<const MachineInstr *, 2> ExecExports;
+ SmallVector<MachineInstr *, 1> LiveMaskQueries;
- char scanInstructions(const MachineFunction &MF, std::vector<WorkItem>& Worklist);
+ char scanInstructions(MachineFunction &MF, std::vector<WorkItem>& Worklist);
void propagateInstruction(const MachineInstr &MI, std::vector<WorkItem>& Worklist);
void propagateBlock(const MachineBasicBlock &MBB, std::vector<WorkItem>& Worklist);
- char analyzeFunction(const MachineFunction &MF);
+ char analyzeFunction(MachineFunction &MF);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
           unsigned SavedWQM);
void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+ void lowerLiveMaskQueries(unsigned LiveMaskReg);
+
public:
static char ID;
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
-char SIWholeQuadMode::scanInstructions(const MachineFunction &MF,
+char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
- const MachineBasicBlock &MBB = *BI;
+ MachineBasicBlock &MBB = *BI;
for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
- const MachineInstr &MI = *II;
+ MachineInstr &MI = *II;
unsigned Opcode = MI.getOpcode();
char Flags;
Flags = StateExact;
} else {
// Handle export instructions with the exec mask valid flag set
- if (Opcode == AMDGPU::EXP && MI.getOperand(4).getImm() != 0)
- ExecExports.push_back(&MI);
+ if (Opcode == AMDGPU::EXP) {
+ if (MI.getOperand(4).getImm() != 0)
+ ExecExports.push_back(&MI);
+ } else if (Opcode == AMDGPU::SI_PS_LIVE) {
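+        // Record the query; it is lowered once the live mask register is
+        // known (see lowerLiveMaskQueries).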
+ LiveMaskQueries.push_back(&MI);
+ }
+
continue;
}
}
}
-char SIWholeQuadMode::analyzeFunction(const MachineFunction &MF) {
+char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
std::vector<WorkItem> Worklist;
char GlobalFlags = scanInstructions(MF, Worklist);
}
}
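+// Replace all live mask queries (SI_PS_LIVE) with copies of the register that
+// holds the live mask, and erase the query instructions.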
+void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+ for (MachineInstr *MI : LiveMaskQueries) {
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Dest = MI->getOperand(0).getReg();
+ BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+ .addReg(LiveMaskReg);
+ MI->eraseFromParent();
+ }
+}
+
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
return false;
Instructions.clear();
Blocks.clear();
ExecExports.clear();
+ LiveMaskQueries.clear();
TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
TRI = static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
MRI = &MF.getRegInfo();
char GlobalFlags = analyzeFunction(MF);
- if (!(GlobalFlags & StateWQM))
- return false;
+ if (!(GlobalFlags & StateWQM)) {
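+    // No WQM switching is needed anywhere, so EXEC itself is the live mask
+    // for the entire function and the queries can simply read it.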
+ lowerLiveMaskQueries(AMDGPU::EXEC);
+ return !LiveMaskQueries.empty();
+ }
+ // Store a copy of the original live mask when required
MachineBasicBlock &Entry = MF.front();
MachineInstr *EntryMI = Entry.getFirstNonPHI();
+ unsigned LiveMaskReg = 0;
+
+  if ((GlobalFlags & StateExact) || !LiveMaskQueries.empty()) {
+ LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+ .addReg(AMDGPU::EXEC);
+ }
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
+
+ lowerLiveMaskQueries(LiveMaskReg);
+    // EntryMI may have been erased by lowerLiveMaskQueries above; do not
+    // use it past this point.
return true;
}
- // Handle the general case
- unsigned LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
- BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
- .addReg(AMDGPU::EXEC);
+ lowerLiveMaskQueries(LiveMaskReg);
+ EntryMI = nullptr;
+ // Handle the general case
for (const auto &BII : Blocks)
processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
BII.first == &*MF.begin());
--- /dev/null
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
+;
+; Note: We could generate better code here if we recognized earlier that
+; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
+; the expectation is that the intrinsic will be used in non-trivial shaders,
+; so such an optimization doesn't seem worth the effort.
+define amdgpu_ps float @test1() {
+ %live = call i1 @llvm.amdgcn.ps.live()
+ %live.32 = zext i1 %live to i32
+ %r = bitcast i32 %live.32 to float
+ ret float %r
+}
+
+; CHECK-LABEL: {{^}}test2:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]
+; CHECK: image_sample v0, [[VAR]],
+define amdgpu_ps float @test2() {
+ %live = call i1 @llvm.amdgcn.ps.live()
+ %live.32 = zext i1 %live to i32
+
+ %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %r = extractelement <4 x float> %t, i32 0
+ ret float %r
+}
+
+; CHECK-LABEL: {{^}}test3:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1
+; CHECK-DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
+; CHECK: ; %dead
+define amdgpu_ps float @test3(i32 %in) {
+entry:
+ %live = call i1 @llvm.amdgcn.ps.live()
+ br i1 %live, label %end, label %dead
+
+dead:
+ %tc.dead = mul i32 %in, 2
+ br label %end
+
+end:
+ %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ]
+ %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %r = extractelement <4 x float> %t, i32 0
+ ret float %r
+}
+
+declare i1 @llvm.amdgcn.ps.live() #0
+
+declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }