; GFX9-LABEL: zero_init_kernel:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
+; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_mov_b32 s2, s0
+; GFX9-NEXT: s_mov_b32 s3, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:76
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:72
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:68
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:64
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:76
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:72
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:68
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:64
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:60
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:56
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:52
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:48
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:44
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:40
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:36
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:32
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:28
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:24
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:20
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:16
+; GFX10-NEXT: s_mov_b32 s1, s0
+; GFX10-NEXT: s_mov_b32 s2, s0
+; GFX10-NEXT: s_mov_b32 s3, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX10-NEXT: s_endpgm
%alloca = alloca [32 x i16], align 2, addrspace(5)
%cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
; GFX9-LABEL: zero_init_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:60
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:56
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:52
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:48
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:44
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:40
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:36
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:32
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:28
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:24
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:20
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:16
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:12
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:8
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
-; GFX9-NEXT: scratch_store_dword off, v0, s32
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_mov_b32 s2, s0
+; GFX9-NEXT: s_mov_b32 s3, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:60
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:56
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:52
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:48
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:44
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:40
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:36
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:32
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:28
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:24
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:20
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:16
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:12
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:8
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
-; GFX10-NEXT: scratch_store_dword off, v0, s32
+; GFX10-NEXT: s_mov_b32 s1, s0
+; GFX10-NEXT: s_mov_b32 s2, s0
+; GFX10-NEXT: s_mov_b32 s3, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca [32 x i16], align 2, addrspace(5)
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_mov_b32 s2, s0
+; GFX9-NEXT: s_mov_b32 s3, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:284
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:280
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:276
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:272
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:300
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:296
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:292
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:288
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:316
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:312
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:308
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:304
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:332
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:328
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:324
-; GFX9-NEXT: s_mov_b32 vcc_hi, 0
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:320
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:284
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:280
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:276
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:272
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:300
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:296
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:292
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:288
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:316
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:312
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:308
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:304
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:332
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:328
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:324
-; GFX10-NEXT: scratch_store_dword off, v0, off offset:320
+; GFX10-NEXT: s_mov_b32 s1, s0
+; GFX10-NEXT: s_mov_b32 s2, s0
+; GFX10-NEXT: s_mov_b32 s3, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX10-NEXT: s_endpgm
%padding = alloca [64 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s32
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_mov_b32 s2, s0
+; GFX9-NEXT: s_mov_b32 s3, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:268
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:264
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:260
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:256
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:284
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:280
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:276
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:272
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:300
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:296
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:292
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:288
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:316
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:312
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:308
-; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:304
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s32
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:268
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:264
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:260
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:256
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:284
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:280
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:276
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:272
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:300
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:296
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:292
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:288
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:316
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:312
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:308
-; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:304
+; GFX10-NEXT: s_mov_b32 s1, s0
+; GFX10-NEXT: s_mov_b32 s2, s0
+; GFX10-NEXT: s_mov_b32 s3, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%padding = alloca [64 x i32], align 4, addrspace(5)
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_mov_b32 s2, s0
+; GFX9-NEXT: s_mov_b32 s3, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:8
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52
-; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_large_offset_kernel:
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX10-NEXT: s_mov_b32 s1, s0
+; GFX10-NEXT: s_mov_b32 s2, s0
+; GFX10-NEXT: s_mov_b32 s3, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52
-; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX10-NEXT: s_endpgm
%padding = alloca [4096 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s32
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_mov_b32 s2, s0
+; GFX9-NEXT: s_mov_b32 s3, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:8
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52
-; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
-; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48
+; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s32
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX10-NEXT: s_mov_b32 s1, s0
+; GFX10-NEXT: s_mov_b32 s2, s0
+; GFX10-NEXT: s_mov_b32 s3, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32
-; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48
+; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%padding = alloca [4096 x i32], align 4, addrspace(5)
ret void
}
-; FIXME: Multi-DWORD scratch shall be supported
+; Flat-scratch test for a 64-bit access through an addrspace(5) pointer.
+; The GFX9 and GFX10 check lines below both expect the value pair {15, 0} to
+; be written with one scratch_store_dwordx2 and read back with one
+; scratch_load_dwordx2.
+; NOTE(review): the IR body (the bb block) is elided from this hunk;
+; presumably it is an i64 store/load pair on %arg with natural alignment --
+; confirm against the full test file.
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT: v_mov_b32_e32 v1, 15
-; GFX9-NEXT: scratch_store_dword v0, v1, off
-; GFX9-NEXT: scratch_load_dword v1, v0, off offset:4
-; GFX9-NEXT: scratch_load_dword v0, v0, off
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
+; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, 15
+; GFX10-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4
-; GFX10-NEXT: scratch_store_dword v0, v2, off
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: scratch_load_dword v1, v0, off offset:4
-; GFX10-NEXT: scratch_load_dword v0, v0, off
+; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
+; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
ret void
}
-; FIXME: Multi-DWORD unaligned scratch shall be supported
+; Flat-scratch test for a 64-bit access through an addrspace(5) pointer.
+; The GFX9 and GFX10 check lines below both expect one scratch_store_dwordx2
+; and one scratch_load_dwordx2 for the value pair {15, 0}, i.e. no splitting
+; into per-byte scratch accesses.
+; NOTE(review): the IR body (the bb block) is elided from this hunk; going by
+; the function name the store/load presumably use an under-aligned access
+; (e.g. align 1) -- confirm against the full test file.
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: scratch_store_byte v0, v1, off offset:7
-; GFX9-NEXT: scratch_store_byte v0, v1, off offset:6
-; GFX9-NEXT: scratch_store_byte v0, v1, off offset:5
-; GFX9-NEXT: scratch_store_byte v0, v1, off offset:4
-; GFX9-NEXT: scratch_store_byte v0, v1, off offset:3
-; GFX9-NEXT: scratch_store_byte v0, v1, off offset:2
-; GFX9-NEXT: scratch_store_byte v0, v1, off offset:1
; GFX9-NEXT: v_mov_b32_e32 v1, 15
-; GFX9-NEXT: scratch_store_byte v0, v1, off
-; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:7
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:5
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:3
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: scratch_load_ubyte v1, v0, off
-; GFX9-NEXT: scratch_load_ubyte v0, v0, off offset:1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
+; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, 15
+; GFX10-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: scratch_store_byte v0, v1, off offset:7
-; GFX10-NEXT: scratch_store_byte v0, v1, off offset:6
-; GFX10-NEXT: scratch_store_byte v0, v1, off offset:5
-; GFX10-NEXT: scratch_store_byte v0, v1, off offset:4
-; GFX10-NEXT: scratch_store_byte v0, v1, off offset:3
-; GFX10-NEXT: scratch_store_byte v0, v1, off offset:2
-; GFX10-NEXT: scratch_store_byte v0, v1, off offset:1
-; GFX10-NEXT: scratch_store_byte v0, v2, off
-; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:6
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:7
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:5
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:2
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:3
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: scratch_load_ubyte v1, v0, off
-; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1
+; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
+; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
ret void
}
+; Stores the constant vector <1, 2, 3> to %arg (an addrspace(5) pointer) and
+; loads it back, both with align 1. The GFX9 and GFX10 check lines below
+; expect the byte-aligned 96-bit access to be selected as a single
+; scratch_store_dwordx3 / scratch_load_dwordx3 pair.
+define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
+; GFX9-LABEL: store_load_v3i32_unaligned:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-NEXT: v_mov_b32_e32 v2, 2
+; GFX9-NEXT: v_mov_b32_e32 v3, 3
+; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off
+; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: store_load_v3i32_unaligned:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_mov_b32_e32 v1, 1
+; GFX10-NEXT: v_mov_b32_e32 v2, 2
+; GFX10-NEXT: v_mov_b32_e32 v3, 3
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off
+; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+; Both accesses are volatile: %load has no users, yet a scratch load is still
+; expected in the checked output above.
+ store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
+ %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
+ ret void
+}
+
+; Stores the constant vector <1, 2, 3, 4> to %arg (an addrspace(5) pointer)
+; and loads it back, both with align 1. The GFX9 and GFX10 check lines below
+; expect the byte-aligned 128-bit access to be selected as a single
+; scratch_store_dwordx4 / scratch_load_dwordx4 pair.
+define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
+; GFX9-LABEL: store_load_v4i32_unaligned:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-NEXT: v_mov_b32_e32 v2, 2
+; GFX9-NEXT: v_mov_b32_e32 v3, 3
+; GFX9-NEXT: v_mov_b32_e32 v4, 4
+; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off
+; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: store_load_v4i32_unaligned:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_mov_b32_e32 v1, 1
+; GFX10-NEXT: v_mov_b32_e32 v2, 2
+; GFX10-NEXT: v_mov_b32_e32 v3, 3
+; GFX10-NEXT: v_mov_b32_e32 v4, 4
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off
+; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+; Both accesses are volatile: %load has no users, yet a scratch load is still
+; expected in the checked output above.
+ store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
+ %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
+ ret void
+}
+
declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()