; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-PAL-NEXT: s_endpgm
;
-; GFX10-PAL-LABEL: zero_init_kernel:
-; GFX10-PAL: ; %bb.0:
-; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT: s_mov_b32 s0, 0
-; GFX10-PAL-NEXT: s_mov_b32 s1, s0
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_mov_b32 s3, s0
-; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16
-; GFX10-PAL-NEXT: s_endpgm
+; GFX1010-PAL-LABEL: zero_init_kernel:
+; GFX1010-PAL: ; %bb.0:
+; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: s_mov_b32 s1, s0
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX1010-PAL-NEXT: s_endpgm
+;
+; GFX1030-PAL-LABEL: zero_init_kernel:
+; GFX1030-PAL: ; %bb.0:
+; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16
+; GFX1030-PAL-NEXT: s_endpgm
%alloca = alloca [32 x i16], align 2, addrspace(5)
%cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-PAL-NEXT: s_endpgm
;
-; GFX10-PAL-LABEL: zero_init_small_offset_kernel:
-; GFX10-PAL: ; %bb.0:
-; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_mov_b32 s0, 0
-; GFX10-PAL-NEXT: s_mov_b32 s1, s0
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_mov_b32 s3, s0
-; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320
-; GFX10-PAL-NEXT: s_endpgm
+; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
+; GFX1010-PAL: ; %bb.0:
+; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_mov_b32 s1, s0
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320
+; GFX1010-PAL-NEXT: s_endpgm
+;
+; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
+; GFX1030-PAL: ; %bb.0:
+; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320
+; GFX1030-PAL-NEXT: s_endpgm
%padding = alloca [64 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
%pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
-; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel:
-; GFX10-PAL: ; %bb.0: ; %bb
-; GFX10-PAL-NEXT: s_getpc_b64 s[4:5]
-; GFX10-PAL-NEXT: s_mov_b32 s4, s0
-; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3
-; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
-; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
-; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0
-; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1
-; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
-; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_endpgm
+; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
+; GFX1010-PAL: ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX1010-PAL-NEXT: s_mov_b32 s4, s0
+; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3
+; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX1010-PAL-NEXT: s_add_u32 s0, 0x104, s0
+; GFX1010-PAL-NEXT: s_add_u32 s1, 0x104, s1
+; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
+; GFX1030-PAL: ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX1030-PAL-NEXT: s_mov_b32 s4, s0
+; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3
+; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX1030-PAL-NEXT: s_add_u32 s0, 0x104, s0
+; GFX1030-PAL-NEXT: s_add_u32 s1, 0x104, s1
+; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_endpgm
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
-; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo:
-; GFX10-PAL: ; %bb.0: ; %bb
-; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
-; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
-; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0
-; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1
-; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
-; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_endpgm
+; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
+; GFX1010-PAL: ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX1010-PAL-NEXT: s_add_u32 s0, 0x104, s0
+; GFX1010-PAL-NEXT: s_add_u32 s1, 0x104, s1
+; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
+; GFX1030-PAL: ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX1030-PAL-NEXT: s_add_u32 s0, 0x104, s0
+; GFX1030-PAL-NEXT: s_add_u32 s1, 0x104, s1
+; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_endpgm
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
-; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel:
-; GFX10-PAL: ; %bb.0: ; %bb
-; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x104
-; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
-; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
-; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off
-; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_endpgm
+; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
+; GFX1010-PAL: ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104
+; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
+; GFX1030-PAL: ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104
+; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_endpgm
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-PAL-NEXT: s_endpgm
;
-; GFX10-PAL-LABEL: zero_init_large_offset_kernel:
-; GFX10-PAL: ; %bb.0:
-; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_mov_b32 s0, 0
-; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-PAL-NEXT: s_mov_b32 s1, s0
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_mov_b32 s3, s0
-; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
-; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
-; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
-; GFX10-PAL-NEXT: s_endpgm
+; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
+; GFX1010-PAL: ; %bb.0:
+; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:16 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_mov_b32 s1, s0
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1010-PAL-NEXT: s_endpgm
+;
+; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
+; GFX1030-PAL: ; %bb.0:
+; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
+; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1030-PAL-NEXT: s_endpgm
%padding = alloca [4096 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
%pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-PAL-LABEL: zero_init_large_offset_foo:
-; GFX10-PAL: ; %bb.0:
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_mov_b32 s0, 0
-; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-PAL-NEXT: s_mov_b32 s1, s0
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_mov_b32 s3, s0
-; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
-; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
-; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
-; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
+; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
+; GFX1010-PAL: ; %bb.0:
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
+; GFX1010-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT: s_mov_b32 s1, s0
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
+; GFX1030-PAL: ; %bb.0:
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
+; GFX1030-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX1030-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX1030-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31]
%padding = alloca [4096 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
%pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
-; GFX10-PAL-LABEL: store_load_sindex_large_offset_kernel:
-; GFX10-PAL: ; %bb.0: ; %bb
-; GFX10-PAL-NEXT: s_getpc_b64 s[4:5]
-; GFX10-PAL-NEXT: s_mov_b32 s4, s0
-; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3
-; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
-; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
-; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0
-; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1
-; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
-; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_endpgm
+; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
+; GFX1010-PAL: ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX1010-PAL-NEXT: s_mov_b32 s4, s0
+; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3
+; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX1010-PAL-NEXT: s_add_u32 s0, 0x4004, s0
+; GFX1010-PAL-NEXT: s_add_u32 s1, 0x4004, s1
+; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
+; GFX1030-PAL: ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX1030-PAL-NEXT: s_mov_b32 s4, s0
+; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3
+; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX1030-PAL-NEXT: s_add_u32 s0, 0x4004, s0
+; GFX1030-PAL-NEXT: s_add_u32 s1, 0x4004, s1
+; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_endpgm
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
-; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo:
-; GFX10-PAL: ; %bb.0: ; %bb
-; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
-; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
-; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
-; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
-; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0
-; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1
-; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
-; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_endpgm
+; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
+; GFX1010-PAL: ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX1010-PAL-NEXT: s_add_u32 s0, 0x4004, s0
+; GFX1010-PAL-NEXT: s_add_u32 s1, 0x4004, s1
+; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
+; GFX1030-PAL: ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX1030-PAL-NEXT: s_add_u32 s0, 0x4004, s0
+; GFX1030-PAL-NEXT: s_add_u32 s1, 0x4004, s1
+; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_endpgm
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
-; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel:
-; GFX10-PAL: ; %bb.0: ; %bb
-; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
-; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
-; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
-; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off
-; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_endpgm
+; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
+; GFX1010-PAL: ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
+; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
+; GFX1030-PAL: ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
+; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_endpgm
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
-; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel:
-; GFX10-PAL: ; %bb.0: ; %bb
-; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT: s_mov_b32 s2, s0
-; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
-; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
-; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0
-; GFX10-PAL-NEXT: scratch_store_dword off, v0, off offset:4
-; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
-; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
-; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT: s_endpgm
+; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
+; GFX1010-PAL: ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800
+; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT: s_add_u32 s0, 4, s0
+; GFX1010-PAL-NEXT: scratch_store_dword off, v0, vcc_lo offset:4
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
+; GFX1030-PAL: ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800
+; GFX1030-PAL-NEXT: s_add_u32 s0, 4, s0
+; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_endpgm
bb:
%i = alloca [4096 x i32], align 4, addrspace(5)
%i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
ret void
}
+; Flat-scratch byte access at a small negative, unaligned offset: the address
+; is %arg - 1, written and then read back with volatile i8 accesses.
+; Expected selection according to the checks below: both gfx900 runs and the
+; gfx1010 PAL run materialize the -1 with a VALU add and use no immediate
+; offset, while both gfx1030 runs fold it into the instruction as offset:-1.
+; NOTE(review): the gfx1010 difference presumably reflects a restriction on
+; negative unaligned scratch immediates on that target - confirm against the
+; backend change this test accompanies.
+; NOTE(review): the function name says i32 but the accesses are i8.
+define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
+; GFX9-LABEL: store_load_i32_negative_unaligned:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-NEXT: scratch_store_byte v0, v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: store_load_i32_negative_unaligned:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_mov_b32_e32 v1, 1
+; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-1
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: v_add_u32_e32 v0, -1, v0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX1010-PAL: ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, -1, v0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX1030-PAL: ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-1
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31]
+bb:
+ ; Address one byte below the incoming private (addrspace 5) pointer.
+ %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
+ ; Both accesses are volatile so they stay in the output even though %load
+ ; is never used.
+ store volatile i8 1, i8 addrspace(5)* %ptr, align 1
+ %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
+ ret void
+}
+
+; Flat-scratch byte access at a large negative, unaligned offset: the address
+; is %arg - 4225, written and then read back with volatile i8 accesses.
+; The checks below expect the constant to be split differently per target:
+;   gfx900 runs      - add 0xffffef7f (-4225), no immediate offset
+;   gfx1030 runs     - add 0xfffff000 (-4096), then offset:-129
+;   gfx1010 PAL run  - add 0xffffefff (-4097), then offset:-128
+; Each split sums to -4225.
+; NOTE(review): on gfx1010 the leftover immediate is a multiple of 4, which
+; presumably avoids a negative unaligned scratch immediate on that target -
+; confirm against the backend change this test accompanies.
+; NOTE(review): the function name says i32 but the accesses are i8.
+define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
+; GFX9-LABEL: store_load_i32_large_negative_unaligned:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-NEXT: scratch_store_byte v0, v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: store_load_i32_large_negative_unaligned:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 1
+; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-129
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX1010-PAL: ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, 0xffffefff, v0
+; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off offset:-128
+; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-128 glc dlc
+; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX1030-PAL: ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0
+; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-129
+; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc
+; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31]
+bb:
+ ; Address 4225 bytes below the incoming private (addrspace 5) pointer; every
+ ; run has to emit an explicit add for at least part of this offset.
+ %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
+ ; Both accesses are volatile so they stay in the output even though %load
+ ; is never used.
+ store volatile i8 1, i8 addrspace(5)* %ptr, align 1
+ %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
+ ret void
+}
+
declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()