; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_signext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_i8 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_zeroext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_signext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_zeroext@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 1
; GFX10-NEXT: v_mov_b32_e32 v5, 2
-; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 1
; GFX10-NEXT: v_mov_b32_e32 v5, 2
; GFX10-NEXT: v_mov_b32_e32 v6, 3
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v7, 4
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 4.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_mov_b32_e32 v2, 4.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 4.0
; GFX10-NEXT: v_mov_b32_e32 v3, -1.0
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0.5
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v4, 0.5
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0.5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x40200000
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT: v_mov_b32_e32 v1, 3
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 4
; GFX10-NEXT: v_mov_b32_e32 v2, 5
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT: v_mov_b32_e32 v2, 5
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 4
; GFX10-NEXT: v_mov_b32_e32 v2, 5
; GFX10-NEXT: v_mov_b32_e32 v3, 6
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 5
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v4, 5
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35]
; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[34:35] offset:16
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1]
; GFX11-NEXT: global_load_b128 v[4:7], v4, s[0:1] offset:16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: v_mov_b32_e32 v2, 3
; GFX10-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 5
; GFX10-NEXT: v_mov_b32_e32 v5, 6
; GFX10-NEXT: v_mov_b32_e32 v6, 7
; GFX10-NEXT: v_mov_b32_e32 v7, 8
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6
; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 6
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 7
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v16, 0
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v16, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[34:35] offset:16
; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[34:35] offset:32
; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[34:35] offset:48
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v16i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v16i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v12, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[4:7], v12, s[0:1] offset:16
; GFX11-NEXT: global_load_b128 v[8:11], v12, s[0:1] offset:32
; GFX11-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:48
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v28, 0
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v28, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80
; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96
; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v28, 0
; GFX9-NEXT: global_load_dword v32, v[0:1], off
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[34:35]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: global_load_dword v33, v[0:1], off
; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32_i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(8)
; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v28, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: global_load_b32 v32, v[0:1], off
; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80
; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96
; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: scratch_store_b32 off, v32, s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: global_load_dword v33, v[0:1], off
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8)
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v43, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: s_addk_i32 s32, 0xf800
+; GFX9-NEXT: v_readlane_b32 s33, v43, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v43, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT: v_mov_b32_e32 v41, v0
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_mov_b32_e32 v41, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
-; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: v_mov_b32_e32 v42, v1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: global_store_dword v[41:42], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: s_addk_i32 s32, 0xfc00
+; GFX10-NEXT: v_readlane_b32 s33, v43, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v43, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4
; GFX11-NEXT: scratch_store_b32 off, v42, s33
-; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
-; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_add_i32 s32, s32, 32
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: global_store_b32 v[41:42], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: s_addk_i32 s32, 0xffe0
+; GFX11-NEXT: v_readlane_b32 s33, v43, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s32 offset:12 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v43, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8
+; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s32 offset:12
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ubyte v0, v2, s[34:35]
; GFX10-NEXT: global_load_dword v1, v2, s[34:35] offset:4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_struct_i8_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_struct_i8_i32@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u8 v0, v1, s[0:1]
; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] offset:4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1]
; GFX10-SCRATCH-NEXT: global_load_dword v1, v2, s[0:1] offset:4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33
; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: s_addk_i32 s32, 0xf800
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 8
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
+; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: s_addk_i32 s32, 0xfc00
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b8 off, v0, s33
; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4
; GFX11-NEXT: v_mov_b32_e32 v0, s33
+; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: s_addk_i32 s32, 0xffe0
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:12 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:12
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 3
; GFX10-NEXT: v_mov_b32_e32 v1, 8
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_addk_i32 s32, 0x400
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4
; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33
+; GFX10-NEXT: s_addk_i32 s32, 0x400
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfc00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:20
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_addk_i32 s32, 0xffe0
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:20
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:16 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:20 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v0, off
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: global_store_dword v[0:1], v1, off
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:20
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1]
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 32
+; GFX9-NEXT: s_mov_b32 s6, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-NEXT: v_readlane_b32 s33, v40, 32
+; GFX9-NEXT: s_mov_b32 s33, s6
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: scratch_store_b8 off, v0, s32
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_movk_i32 s4, 0x7b
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_movk_i32 s4, 0x7b
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 42
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 42
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 42
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 42
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_movk_i32 s4, 0x7b
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b64 s[34:35], 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 8
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 6
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 8
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 8
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b64 s[34:35], 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 8
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 8
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 8
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 10
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
; GFX9-NEXT: v_writelane_b32 v40, s10, 6
; GFX9-NEXT: v_writelane_b32 v40, s11, 7
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 8
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 10
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b64 s[34:35], 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 10
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 10
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_movk_i32 s4, 0x4400
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x4400
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_movk_i32 s4, 0x4400
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x4400
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 4.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 4.0
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 4.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 4.0
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1.0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 5
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 5
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 5
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1.0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 4.0
; GFX10-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 5
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 5
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 4.0
; GFX11-NEXT: v_writelane_b32 v40, s30, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 5
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 7
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 5
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 7
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 7
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: s_getpc_b64 s[34:35]
-; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4
-; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b32 s4, 1.0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_mov_b32 s5, 2.0
+; GFX10-NEXT: s_getpc_b64 s[34:35]
+; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4
+; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 4.0
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 7
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 7
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1.0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 4.0
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 7
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 7
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 7
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 0x40100000
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0x40100000
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 0
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 8
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 6
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 8
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 8
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f64_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 8
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 8
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 0
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 8
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
-; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 3
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 3
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0x20001
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 3
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0x20001
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 3
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0x40003c00
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_movk_i32 s5, 0x4400
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x40003c00
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_movk_i32 s5, 0x4400
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0x40003c00
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_movk_i32 s5, 0x4400
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0x40003c00
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_movk_i32 s5, 0x4400
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 0x40003
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0x40003
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 0x20001
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 0x40003
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0x20001
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0x40003
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 3
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 3
-; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
+; GFX10-NEXT: s_mov_b32 s33, s32
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 3
-; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 3
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 4
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 4
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s30, 2
; GFX11-NEXT: v_writelane_b32 v40, s31, 3
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 4
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 5
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 3
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 5
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 5
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 3
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 4
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 3
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 4
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 5
; GFX10-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 5
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 5
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 3
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 4
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 3
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 4
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 5
; GFX11-NEXT: v_writelane_b32 v40, s30, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 5
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 3
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 4
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 3
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 4
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 5
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 3
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 4
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 3
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 4
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 5
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 4
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 6
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 6
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 6
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 7
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 5
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 7
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 7
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 7
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 7
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 7
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 7
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 7
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 10
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s11, 7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 8
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 10
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 10
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 10
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 10
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
; GFX9-NEXT: v_writelane_b32 v40, s10, 6
; GFX9-NEXT: v_writelane_b32 v40, s11, 7
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 8
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 10
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 10
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 10
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 18
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s19, 15
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s30, 16
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 18
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 18
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 18
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 18
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 18
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 18
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 18
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 28
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s25, 21
; GFX9-NEXT: v_writelane_b32 v40, s26, 22
; GFX9-NEXT: v_writelane_b32 v40, s27, 23
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s28, 24
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 28
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 28
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 28
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 28
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 28
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 28
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 28
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 28
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s24, 20
; GFX9-NEXT: v_writelane_b32 v40, s25, 21
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s26, 22
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 28
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 28
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 28
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 28
+; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 28
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 28
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 28
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4
-; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[34:35]
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: s_addk_i32 s32, 0xf800
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4
-; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_getpc_b64 s[34:35]
; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s35, s35, stack_passed_f64_arg@rel32@hi+12
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: s_addk_i32 s32, 0xfc00
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33
+; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: s_addk_i32 s32, 0xffe0
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:12
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:12 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:12
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_mov_b32_e32 v0, 12
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: v_mov_b32_e32 v0, 12
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_mov_b32_e32 v2, 14
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13
; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1
+; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1
; GFX11-NEXT: v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, 2
; GFX11-NEXT: v_dual_mov_b32 v8, 2 :: v_dual_mov_b32 v9, 3
; GFX11-NEXT: v_dual_mov_b32 v10, 3 :: v_dual_mov_b32 v11, 3
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 1
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_mov_b32_e32 v0, 8
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 8
; GFX10-NEXT: v_mov_b32_e32 v1, 9
; GFX10-NEXT: v_mov_b32_e32 v2, 10
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13
; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15
; GFX11-NEXT: v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9
; GFX11-NEXT: v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 9
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 2
+; GFX9-NEXT: v_writelane_b32 v41, s33, 0
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2
+; GFX9-NEXT: v_readlane_b32 s33, v41, 0
; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000
; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000
-; GFX10-NEXT: v_writelane_b32 v40, s33, 2
+; GFX10-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfe00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-NEXT: s_or_saveexec_b32 s34, -1
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s34
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b32 off, v40, s32
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: v_mov_b32_e32 v0, 0x41400000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x41500000
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41600000
; GFX11-NEXT: v_mov_b32_e32 v5, 0x41100000
; GFX11-NEXT: v_mov_b32_e32 v6, 0x41200000
; GFX11-NEXT: v_mov_b32_e32 v7, 0x41300000
+; GFX11-NEXT: v_writelane_b32 v41, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_add_i32 s32, s32, -16
-; GFX11-NEXT: v_readlane_b32 s33, v40, 2
+; GFX11-NEXT: v_readlane_b32 s33, v41, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v40, off, s32
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s32 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41400000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41500000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41600000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x41100000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s33, 0
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v41, 0
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
-; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s32 offset:4
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)