}
assert(ScratchWaveOffsetReg);
- if (MF.getFrameInfo().hasCalls()) {
+ if (requiresStackPointerReference(MF)) {
Register SPReg = MFI->getStackPtrOffsetReg();
assert(SPReg != AMDGPU::SP_REG);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
return MBB.erase(I);
}
+/// Returns true if the frame will require a reference to the stack pointer.
+///
+/// This is the set of conditions common to setting up the stack pointer in a
+/// kernel, and for using a frame pointer in a callable function.
+///
+/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
+/// references SP.
+static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
+ return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
+}
+
+// The FP for kernels is always known 0, so we never really need to setup an
+// explicit register for it. However, DisableFramePointerElim will force us to
+// use a register for it.
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
return MFI.getStackSize() != 0;
}
- return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
- MFI.hasStackMap() || MFI.hasPatchPoint() ||
+ return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->needsStackRealignment(MF) ||
MF.getTarget().Options.DisableFramePointerElim(MF);
}
+
+// This is essentially a reduced version of hasFP for entry functions. Since the
+// stack pointer is known 0 on entry to kernels, we never really need an FP
+// register. We may need to initialize the stack pointer depending on the frame
+// properties, which logically overlaps many of the cases where an ordinary
+// function would require an FP.
+bool SIFrameLowering::requiresStackPointerReference(
+ const MachineFunction &MF) const {
+ // Callable functions always require a stack pointer reference.
+ assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
+ "only expected to call this for entry points");
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Entry points ordinarily don't need to initialize SP. We have to set it up
+ // for callees if there are any. Also note tail calls are impossible/don't
+ // make any sense for kernels.
+ if (MFI.hasCalls())
+ return true;
+
+ // We still need to initialize the SP if we're doing anything weird that
+ // references the SP, like variable sized stack objects.
+ return frameTriviallyRequiresSP(MFI);
+}
public:
bool hasFP(const MachineFunction &MF) const override;
+
+ bool requiresStackPointerReference(const MachineFunction &MF) const;
};
} // end namespace llvm
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT: s_and_b32 s4, s4, -16
+; GFX9-NEXT: s_movk_i32 s32, 0x400
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s4, s32, s4
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s6, s6, s9
+; GFX10-NEXT: s_movk_i32 s32, 0x200
; GFX10-NEXT: s_mov_b32 s33, 0
; GFX10-NEXT: s_addc_u32 s7, s7, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT: s_and_b32 s4, s4, -16
+; GFX9-NEXT: s_movk_i32 s32, 0x400
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s4, s32, s4
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s6, s6, s9
+; GFX10-NEXT: s_movk_i32 s32, 0x200
; GFX10-NEXT: s_mov_b32 s33, 0
; GFX10-NEXT: s_addc_u32 s7, s7, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT: s_and_b32 s4, s4, -16
+; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s4, s32, s4
; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s6, s6, s9
+; GFX10-NEXT: s_movk_i32 s32, 0x400
; GFX10-NEXT: s_mov_b32 s33, 0
; GFX10-NEXT: s_addc_u32 s7, s7, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_mov_b32 s33, 0
+; GCN-NEXT: s_movk_i32 s32, 0x400
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s6, 1, 0
; GCN-NEXT: s_and_b32 s6, s6, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-NEXT: s_mov_b32 s33, 0
; GCN-NEXT: s_cbranch_scc1 BB0_3
; GCN-NEXT: ; %bb.1: ; %bb.0
; GCN-NEXT: s_load_dword s6, s[4:5], 0xc
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_mov_b32 s33, 0
+; GCN-NEXT: s_movk_i32 s32, 0x1000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s6, 1, 0
; GCN-NEXT: s_and_b32 s6, s6, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-NEXT: s_mov_b32 s33, 0
; GCN-NEXT: s_cbranch_scc1 BB1_2
; GCN-NEXT: ; %bb.1: ; %bb.0
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_movk_i32 s32, 0x400
; GCN-NEXT: s_mov_b32 s33, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_movk_i32 s32, 0x1000
; GCN-NEXT: s_mov_b32 s33, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s6, 0