const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
assert(MFI->isEntryFunction());
Register SPReg = MFI->getStackPtrOffsetReg();
assert(SPReg != AMDGPU::SP_REG);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
- .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST));
+ .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
}
if (hasFP(MF)) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
- if ((MFI->hasFlatScratchInit() || ScratchRsrcReg) &&
+ bool NeedsFlatScratchInit =
+ MFI->hasFlatScratchInit() &&
+ (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
+ (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+
+ if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
!ST.flatScratchIsArchitected()) {
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
- if (MFI->hasFlatScratchInit()) {
+ if (NeedsFlatScratchInit) {
emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
}
if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
KernargSegmentPtr = true;
+ // TODO: This could be refined a lot. The attribute is a poor way of
+ // detecting calls or stack objects that may require it before argument
+ // lowering.
if (ST.hasFlatAddressSpace() && isEntryFunction() &&
(isAmdHsaOrMesa || ST.enableFlatScratch()) &&
+ (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
!ST.flatScratchIsArchitected()) {
- // TODO: This could be refined a lot. The attribute is a poor way of
- // detecting calls or stack objects that may require it before argument
- // lowering.
- if (HasCalls || HasStackObjects || ST.enableFlatScratch())
- FlatScratchInit = true;
+ FlatScratchInit = true;
}
Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GFX9-NEXT: s_add_u32 s0, s0, s9
; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_movk_i32 s32, 0x400
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT: s_and_b32 s4, s4, -16
-; GFX9-NEXT: s_movk_i32 s32, 0x400
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s4, s32, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_mov_b32 s33, 0
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
;
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s6, s6, s9
-; GFX10-NEXT: s_movk_i32 s32, 0x200
-; GFX10-NEXT: s_mov_b32 s33, 0
-; GFX10-NEXT: s_addc_u32 s7, s7, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_add_u32 s0, s0, s9
; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_movk_i32 s32, 0x200
; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s33, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GFX9-NEXT: s_add_u32 s0, s0, s9
; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_movk_i32 s32, 0x400
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT: s_and_b32 s4, s4, -16
-; GFX9-NEXT: s_movk_i32 s32, 0x400
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s4, s32, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_mov_b32 s33, 0
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
;
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s6, s6, s9
-; GFX10-NEXT: s_movk_i32 s32, 0x200
-; GFX10-NEXT: s_mov_b32 s33, 0
-; GFX10-NEXT: s_addc_u32 s7, s7, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_add_u32 s0, s0, s9
; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_movk_i32 s32, 0x200
; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s33, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT: s_and_b32 s4, s4, -16
; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GFX9-NEXT: s_add_u32 s0, s0, s9
; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT: s_and_b32 s4, s4, -16
-; GFX9-NEXT: s_movk_i32 s32, 0x800
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
; GFX9-NEXT: s_add_u32 s4, s32, s4
; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_mov_b32 s33, 0
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
;
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s6, s6, s9
-; GFX10-NEXT: s_movk_i32 s32, 0x400
-; GFX10-NEXT: s_mov_b32 s33, 0
-; GFX10-NEXT: s_addc_u32 s7, s7, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_add_u32 s0, s0, s9
; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_movk_i32 s32, 0x400
; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s33, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT: s_and_b32 s4, s4, -16
; TODO: Could optimize out in this case
; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls:
-; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7
-; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0
; RO-FLAT-NOT: flat_scratch
; RW-FLAT: buffer_store_dword
; RO-FLAT: scratch_store_dword
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
; RO-FLAT: .amdhsa_enable_private_segment 1
-; GCN-NOT: .amdhsa_reserve_flat_scratch
+; RW-FLAT: .amdhsa_reserve_flat_scratch 0
+; RO-FLAT-NOT: .amdhsa_reserve_flat_scratch
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dword s6, s[4:5], 0x8
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_movk_i32 s32, 0x400
+; GCN-NEXT: s_mov_b32 s33, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_mov_b32 s33, 0
; GCN-NEXT: s_cbranch_scc1 BB0_3
; GCN-NEXT: ; %bb.1: ; %bb.0
; GCN-NEXT: s_load_dword s6, s[4:5], 0xc
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dword s6, s[4:5], 0x8
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_movk_i32 s32, 0x1000
+; GCN-NEXT: s_mov_b32 s33, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_mov_b32 s33, 0
; GCN-NEXT: s_cbranch_scc1 BB1_2
; GCN-NEXT: ; %bb.1: ; %bb.0
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; RUN: llc -global-isel=0 -amdgpu-fixed-function-abi=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,SDAG %s
; RUN: llc -global-isel=1 -amdgpu-fixed-function-abi=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,GISEL %s
-; FIXME: Emitting unnecessary flat_scratch setup
-
; GCN-LABEL: {{^}}test_call_undef:
-; SDAG: s_mov_b32 flat_scratch_lo, s13
-; SDAG: s_add_i32 s12, s12, s17
-; SDAG: s_lshr_b32
; GCN: s_endpgm
define amdgpu_kernel void @test_call_undef() #0 {
%val = call i32 undef(i32 1)
}
; GCN-LABEL: {{^}}test_call_null:
-; SDAG: s_mov_b32 flat_scratch_lo, s13
-; SDAG: s_add_i32 s12, s12, s17
-; SDAG: s_lshr_b32
-
; GISEL: s_swappc_b64 s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm
define amdgpu_kernel void @test_call_null() #0 {
define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack:
; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_i32 s4, s4, s7
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_mov_b32_e32 v0, 0
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_stack:
; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_mov_b32_e32 v0, 0
;
; GFX1010-LABEL: test_kern_stack:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s4, s4, s7
-; GFX1010-NEXT: s_addc_u32 s5, s5, 0
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack:
; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_i32 s4, s4, s7
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_mov_b32_e32 v0, 0
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack:
; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_addc_u32 s1, s1, 0
;
; GFX1010-LABEL: test_force_fp_kern_stack:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s4, s4, s7
-; GFX1010-NEXT: s_mov_b32 s33, 0
-; GFX1010-NEXT: s_addc_u32 s5, s5, 0
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s7
+; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX803-LABEL: test_sgpr_offset_kernel:
; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_i32 s4, s4, s7
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_mov_b32 s4, 0x40000
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX803-NEXT: ;;#ASMSTART
; GFX803-NEXT: ;;#ASMEND
;
; GFX900-LABEL: test_sgpr_offset_kernel:
; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: s_mov_b32 s6, 0x40000
-; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b32 s4, 0x40000
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: s_mov_b32 s6, 0x40000
-; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b32 s4, 0x40000
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX900-NEXT: s_waitcnt vmcnt(0)
;
; GFX1010-LABEL: test_sgpr_offset_kernel:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s4, s4, s7
-; GFX1010-NEXT: s_addc_u32 s5, s5, 0
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
-; GFX1010-NEXT: s_mov_b32 s6, 0x20000
+; GFX1010-NEXT: s_mov_b32 s4, 0x20000
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
; GFX1010-NEXT: s_waitcnt vmcnt(0)
-; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1010-NEXT: s_mov_b32 s6, 0x20000
+; GFX1010-NEXT: s_mov_b32 s4, 0x20000
; GFX1010-NEXT: ;;#ASMSTART
; GFX1010-NEXT: ;;#ASMEND
-; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX1010-NEXT: s_waitcnt vmcnt(0)
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
; GFX900-LABEL: vload2_private:
; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: s_add_u32 s0, s0, s9
;
; GFX10_DEFAULT-LABEL: vload2_private:
; GFX10_DEFAULT: ; %bb.0: ; %entry
-; GFX10_DEFAULT-NEXT: s_add_u32 s6, s6, s9
-; GFX10_DEFAULT-NEXT: s_addc_u32 s7, s7, 0
-; GFX10_DEFAULT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; GFX10_DEFAULT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v2, 0
; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: v_mov_b32_e32 v0, vcc_lo
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 s2, exec_lo
; GCN-NEXT: s_mov_b32 exec_lo, 3
; GCN-NEXT: s_mov_b32 s3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GCN-NEXT: s_mov_b32 exec_lo, s2
+; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: global_store_dword v1, v0, s[0:1]
; GCN-NEXT: s_endpgm
call void asm sideeffect "", "~{s[0:7]}" ()
define amdgpu_kernel void @kernel_no_calls_no_stack() {
; GCN-LABEL: kernel_no_calls_no_stack:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 s0, s0, s3
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
-; GCN-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GCN-NEXT: s_endpgm
ret void
}
; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,GFX9,ALL %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 < %s -mattr=-flat-for-global | FileCheck --check-prefixes=GCNHSA,ALL %s
; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,GFX10HSA,ALL %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
; FIXME: align on alloca seems to be ignored for private_segment_alignment
; VI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe80000
; GFX9-DAG: s_mov_b32 s{{[0-9]+}}, 0xe00000
-
-; GFX10HSA: s_add_u32 [[FLAT_SCR_LO:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
-; GFX10HSA-DAG: s_addc_u32 [[FLAT_SCR_HI:s[0-9]+]], s{{[0-9]+}}, 0
-; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), [[FLAT_SCR_LO]]
-; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), [[FLAT_SCR_HI]]
-
; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen
; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen
; MUBUF-LABEL: local_stack_offset_uses_sp:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s9
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-LABEL: local_stack_offset_uses_sp_flat:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s9
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000
;
; GCN-SCRATCH-LABEL: vector_clause:
; GCN-SCRATCH: ; %bb.0: ; %bb
-; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5
-; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0
-; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v16, 4, v0
; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
;
; GCN-SCRATCH-LABEL: scalar_clause:
; GCN-SCRATCH: ; %bb.0: ; %bb
-; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5
-; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0
-; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GCN-SCRATCH-NEXT: s_clause 0x1
; GCN-SCRATCH-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
; GCN-SCRATCH-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x2c
;
; GCN-SCRATCH-LABEL: vector_clause_indirect:
; GCN-SCRATCH: ; %bb.0: ; %bb
-; GCN-SCRATCH-NEXT: s_add_u32 s2, s2, s5
-; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0
-; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; MUBUF: ; %bb.0: ; %entry
-; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s9
; MUBUF-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; MUBUF: ; %bb.0: ; %entry
-; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; MUBUF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; MUBUF-NEXT: s_add_u32 s0, s0, s9
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; 0x40000 / 64 = 4096 (for wave64)
- ; MUBUF: s_mov_b32 s6, 0x40000
- ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
+ ; MUBUF: s_mov_b32 s4, 0x40000
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
; FLATSCR: s_movk_i32 s2, 0x1000
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill
%a = load volatile i32, i32 addrspace(5)* %aptr
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; 0x3ff00 / 64 = 4092 (for wave64)
- ; MUBUF: s_mov_b32 s6, 0x3ff00
- ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
- ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
+ ; MUBUF: s_mov_b32 s4, 0x3ff00
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
+ ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc
; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]] ; 8-byte Folded Spill
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
define amdgpu_kernel void @max_alignment_128() #0 {
; VI-LABEL: max_alignment_128:
; VI: ; %bb.0:
-; VI-NEXT: s_add_i32 s4, s4, s7
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, 9
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; VI-NEXT: .amdhsa_next_free_vgpr 1
; VI-NEXT: .amdhsa_next_free_sgpr 8
; VI-NEXT: .amdhsa_reserve_vcc 0
+; VI-NEXT: .amdhsa_reserve_flat_scratch 0
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
;
; GFX9-LABEL: max_alignment_128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 9
; GFX9-NEXT: .amdhsa_next_free_vgpr 1
; GFX9-NEXT: .amdhsa_next_free_sgpr 8
; GFX9-NEXT: .amdhsa_reserve_vcc 0
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-LABEL: stackrealign_attr:
; VI: ; %bb.0:
-; VI-NEXT: s_add_i32 s4, s4, s7
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, 9
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; VI-NEXT: .amdhsa_next_free_vgpr 1
; VI-NEXT: .amdhsa_next_free_sgpr 8
; VI-NEXT: .amdhsa_reserve_vcc 0
+; VI-NEXT: .amdhsa_reserve_flat_scratch 0
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
;
; GFX9-LABEL: stackrealign_attr:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 9
; GFX9-NEXT: .amdhsa_next_free_vgpr 1
; GFX9-NEXT: .amdhsa_next_free_sgpr 8
; GFX9-NEXT: .amdhsa_reserve_vcc 0
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
define amdgpu_kernel void @alignstack_attr() #2 {
; VI-LABEL: alignstack_attr:
; VI: ; %bb.0:
-; VI-NEXT: s_add_i32 s4, s4, s7
-; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, 9
-; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
; VI-NEXT: .amdhsa_next_free_vgpr 1
; VI-NEXT: .amdhsa_next_free_sgpr 8
; VI-NEXT: .amdhsa_reserve_vcc 0
+; VI-NEXT: .amdhsa_reserve_flat_scratch 0
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
;
; GFX9-LABEL: alignstack_attr:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 9
; GFX9-NEXT: .amdhsa_next_free_vgpr 1
; GFX9-NEXT: .amdhsa_next_free_sgpr 8
; GFX9-NEXT: .amdhsa_reserve_vcc 0
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0