case AMDGPU::V_SET_INACTIVE_B32: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
+ // optimizations (mainly Register Coalescer) aware of WWM register liveness.
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
+ .add(MI.getOperand(1));
auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
+ .add(MI.getOperand(1));
+ expandPostRAPseudo(*Copy);
+ auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
+ FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+ Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(2));
expandPostRAPseudo(*Copy);
BuildMI(MBB, MI, DL, get(NotOpc), Exec)
.addReg(Exec);
// restoring it after we're done.
let Defs = [SCC] in {
// Both pseudos below are declared as defining SCC. The V_SET_INACTIVE_B32/B64
// expansion cases shown in this patch flip EXEC with S_NOT_B32/S_NOT_B64 and
// mark SCC as dead on that instruction ("SCC is overwritten").

// 32-bit form, selected from int_amdgcn_set_inactive on i32 operands.
// The '-'/'+' lines switch $src from VGPR_32 to VSrc_b32 and remove the
// "$src = $vdst" tie; the expansion instead emits a V_MOV_B32 of operand 1
// into operand 0 before EXEC is inverted.
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins VGPR_32: $src, VSrc_b32:$inactive),
+ (ins VSrc_b32: $src, VSrc_b32:$inactive),
[(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
- let Constraints = "$src = $vdst";
}
// 64-bit form, selected from int_amdgcn_set_inactive on i64 operands.
// Same change as the 32-bit form: $src goes from VReg_64 to VSrc_b64 and the
// "$src = $vdst" tie is removed; the expansion copies operand 1 and then
// operand 2 into operand 0 with V_MOV_B64_PSEUDO, inverting EXEC in between.
def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VReg_64: $src, VSrc_b64:$inactive),
+ (ins VSrc_b64: $src, VSrc_b64:$inactive),
[(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
- let Constraints = "$src = $vdst";
}
} // End Defs = [SCC]
; GCN-LABEL: set_inactive_scc:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0
+; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s3, 56
+; GCN-NEXT: s_cmp_lg_u32 s2, 56
; GCN-NEXT: s_cselect_b32 s2, 1, 0
+; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_cbranch_execz .LBB2_2
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_2
; GFX8-LABEL: sub_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_cbranch_execz .LBB8_2
; GFX9-LABEL: sub_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB8_2
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB2_2
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_2
;
; GFX8-LABEL: add_i32_varying_nouse:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_readlane_b32 s2, v1, 63
; GFX8-NEXT: s_mov_b64 exec, s[0:1]
; GFX8-NEXT: s_mov_b32 s0, s2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB3_2
; GFX8-NEXT: ; %bb.1:
;
; GFX9-LABEL: add_i32_varying_nouse:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_readlane_b32 s2, v1, 63
; GFX9-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB3_2
; GFX9-NEXT: ; %bb.1:
; GFX8-LABEL: sub_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB9_2
; GFX9-LABEL: sub_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_2
;
; GFX8-LABEL: sub_i32_varying_nouse:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_readlane_b32 s2, v1, 63
; GFX8-NEXT: s_mov_b64 exec, s[0:1]
; GFX8-NEXT: s_mov_b32 s0, s2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB10_2
; GFX8-NEXT: ; %bb.1:
;
; GFX9-LABEL: sub_i32_varying_nouse:
; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_readlane_b32 s2, v1, 63
; GFX9-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB10_2
; GFX9-NEXT: ; %bb.1:
; GFX8-LABEL: and_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, -1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB14_2
; GFX9-LABEL: and_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, -1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, -1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB14_2
; GFX8-LABEL: or_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB15_2
; GFX9-LABEL: or_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB15_2
; GFX8-LABEL: xor_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB16_2
; GFX9-LABEL: xor_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB16_2
; GFX8-LABEL: max_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, v1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB17_2
; GFX9-LABEL: max_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB17_2
;
; GFX1064-LABEL: max_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: s_not_b64 exec, exec
;
; GFX1032-LABEL: max_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX8-LABEL: min_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_bfrev_b32_e32 v1, -2
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, v1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB19_2
; GFX9-LABEL: min_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_bfrev_b32_e32 v1, -2
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB19_2
;
; GFX1064-LABEL: min_i32_varying:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: s_not_b64 exec, exec
;
; GFX1032-LABEL: min_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX8-LABEL: umax_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB21_2
; GFX9-LABEL: umax_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB21_2
; GFX8-LABEL: umin_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, -1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB23_2
; GFX9-LABEL: umin_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, -1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, -1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB23_2
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b64 s[8:9], exec
; GFX8-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: ; implicit-def: $vgpr0
+; GFX8-NEXT: ; implicit-def: $vgpr3
; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX8-NEXT: s_cbranch_execz .LBB1_4
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[10:11]
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[10:11]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GFX8-NEXT: s_cbranch_execz .LBB1_3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v0
; GFX8-NEXT: .LBB1_4: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_wqm_b64 s[4:5], -1
; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX8-NEXT: s_cbranch_vccnz .LBB1_6
; GFX8-NEXT: ; %bb.5: ; %if
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GFX8-NEXT: .LBB1_6: ; %UnifiedReturnBlock
; GFX8-NEXT: s_endpgm
;
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b64 s[8:9], exec
; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: ; implicit-def: $vgpr0
+; GFX9-NEXT: ; implicit-def: $vgpr3
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX9-NEXT: s_cbranch_execz .LBB1_4
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[10:11]
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[10:11]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX9-NEXT: v_add_u32_e32 v3, s4, v0
; GFX9-NEXT: .LBB1_4: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_wqm_b64 s[4:5], -1
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX9-NEXT: s_cbranch_vccnz .LBB1_6
; GFX9-NEXT: ; %bb.5: ; %if
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GFX9-NEXT: .LBB1_6: ; %UnifiedReturnBlock
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_mov_b64 s[8:9], exec
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
+; GFX1064-NEXT: ; implicit-def: $vgpr4
; GFX1064-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT: s_cbranch_execz .LBB1_4
; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX1064-NEXT: v_add_nc_u32_e32 v4, s4, v0
; GFX1064-NEXT: .LBB1_4: ; %Flow
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT: s_wqm_b64 s[4:5], -1
; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT: s_cbranch_vccnz .LBB1_6
; GFX1064-NEXT: ; %bb.5: ; %if
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], 0
; GFX1064-NEXT: .LBB1_6: ; %UnifiedReturnBlock
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_mov_b32 s8, exec_lo
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
+; GFX1032-NEXT: ; implicit-def: $vgpr4
; GFX1032-NEXT: s_mov_b32 s9, s8
-; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s8, s9
; GFX1032-NEXT: s_cbranch_execz .LBB1_4
; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX1032-NEXT: v_add_nc_u32_e32 v4, s4, v0
; GFX1032-NEXT: .LBB1_4: ; %Flow
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT: s_wqm_b32 s4, -1
; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_vccnz .LBB1_6
; GFX1032-NEXT: ; %bb.5: ; %if
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0
; GFX1032-NEXT: .LBB1_6: ; %UnifiedReturnBlock
; GFX1032-NEXT: s_endpgm
entry:
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-LABEL: if_then:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: ; %bb.1: ; %.bb0
-; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: v_mov_b32_e32 v3, 1
; GCN-NEXT: ; %bb.2: ; %.merge
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: s_cbranch_execz .LBB0_4
; GCN-NEXT: ; %bb.3: ; %.then
+; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_not_b32 exec_lo, exec_lo
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b32 exec_lo, exec_lo
; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s1
; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v3, -1
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v4, -1
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen
; GCN-NEXT: .LBB0_4: ; %.end
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GCN-NEXT: v_mov_b32_e32 v0, -1
-; GCN-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen
; GCN-NEXT: s_endpgm
.entry:
%LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0
; GCN-LABEL: if_else_vgpr_opt:
; GCN: ; %bb.0: ; %.entry
; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: ; %bb.1: ; %.bb0
-; GCN-NEXT: v_mov_b32_e32 v2, 1
+; GCN-NEXT: v_mov_b32_e32 v3, 1
; GCN-NEXT: ; %bb.2: ; %.merge
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0
; GCN-NEXT: s_or_saveexec_b32 s1, -1
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 exec_lo, s1
+; GCN-NEXT: v_mov_b32_e32 v2, v3
; GCN-NEXT: s_not_b32 exec_lo, exec_lo
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_not_b32 exec_lo, exec_lo
; GCN-NEXT: s_mov_b32 exec_lo, s1
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: v_mov_b32_e32 v3, -1
-; GCN-NEXT: ; implicit-def: $vgpr2
; GCN-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen
+; GCN-NEXT: ; implicit-def: $vgpr3
; GCN-NEXT: .LBB1_4: ; %Flow
; GCN-NEXT: s_or_saveexec_b32 s0, s0
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GCN-NEXT: s_cbranch_execz .LBB1_6
; GCN-NEXT: ; %bb.5: ; %.then
; GCN-NEXT: v_mov_b32_e32 v0, -1
-; GCN-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen
+; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen
; GCN-NEXT: .LBB1_6: ; %.end
; GCN-NEXT: s_endpgm
.entry:
; GFX9-W64-LABEL: test_wwm_set_inactive1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_not_b64 exec, exec
+; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX10-W32-LABEL: test_wwm_set_inactive1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v0
+; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v0
+; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX9-W64-LABEL: test_strict_wwm_set_inactive1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_not_b64 exec, exec
+; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX10-W32-LABEL: test_strict_wwm_set_inactive1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v0
+; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v0
+; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX9-O3-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc
+; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2
; GFX9-O3-NEXT: ; %bb.1: ; %if
; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
-; GFX9-O3-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-O3-NEXT: v_add_u32_e32 v1, v3, v1
+; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: ; %bb.2: ; %merge
+; GFX9-O3-NEXT: .LBB1_2: ; %merge
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35]
-; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0
; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: s_nop 0
-; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
; GFX9-O0-NEXT: s_mov_b32 s39, s7
; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39
; GFX9-O0-NEXT: s_mov_b32 s34, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00
; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1
-; GFX9-O0-NEXT: s_mov_b32 s34, s8
+; GFX9-O0-NEXT: s_mov_b32 s30, s8
; GFX9-O0-NEXT: s_mov_b32 s36, s4
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
; GFX9-O0-NEXT: s_mov_b32 s37, s5
; GFX9-O0-NEXT: v_writelane_b32 v10, s37, 3
; GFX9-O0-NEXT: v_writelane_b32 v10, s38, 4
; GFX9-O0-NEXT: v_writelane_b32 v10, s39, 5
-; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT: s_mov_b32 s35, s9
-; GFX9-O0-NEXT: ; kill: def $sgpr30_sgpr31 killed $sgpr34_sgpr35
-; GFX9-O0-NEXT: s_mov_b64 s[30:31], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s35
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: ; kill: def $sgpr30 killed $sgpr30 def $sgpr30_sgpr31
+; GFX9-O0-NEXT: s_mov_b32 s31, s9
+; GFX9-O0-NEXT: ; kill: def $sgpr34_sgpr35 killed $sgpr30_sgpr31
+; GFX9-O0-NEXT: s_mov_b64 s[34:35], 0
; GFX9-O0-NEXT: v_mov_b32_e32 v8, s30
; GFX9-O0-NEXT: v_mov_b32_e32 v9, s31
; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35
+; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1
; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 6
; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 7
; GFX9-O3-NEXT: s_mov_b32 s33, s32
; GFX9-O3-NEXT: s_addk_i32 s32, 0x800
; GFX9-O3-NEXT: s_mov_b64 s[36:37], s[30:31]
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9
; GFX9-O3-NEXT: s_or_saveexec_b64 s[30:31], -1
; GFX9-O3-NEXT: s_getpc_b64 s[34:35]
; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4
; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12
; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-O3-NEXT: s_mov_b64 exec, s[30:31]
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT: buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[5:6], v0, s[4:7], 0 offen offset:16
+; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
; GFX9-O3-NEXT: s_mov_b32 s34, -1
; GFX9-O3-NEXT: s_brev_b32 s35, -2
-; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
+; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: v_mov_b32_e32 v1, s34
; GFX9-O3-NEXT: v_mov_b32_e32 v2, s35
; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: v_mov_b32_e32 v3, s34
; GFX9-O3-NEXT: v_mov_b32_e32 v4, s35
; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
+; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: v_mov_b32_e32 v5, s34
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s35
; GFX9-O3-NEXT: s_not_b64 exec, exec
define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]]
; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}}
-; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]]
-; GFX9-O0-DAG: v_mov_b32_e32 v2, v0
+; GFX9-O0-DAG: v_mov_b32_e32 v2, [[ARG]]
; GFX9-O3: v_mov_b32_e32 v2, [[ARG]]
; GFX9: s_load_dwordx2 s{{\[}}[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]{{\]}}
; GFX9-O0: s_mov_b64 s{{\[}}[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]{{\]}}, 0{{$}}
-; GFX9-O0: v_mov_b32_e32 v0, s[[ARG_LO]]
-; GFX9-O0: v_mov_b32_e32 v1, s[[ARG_HI]]
-; GFX9-O0-DAG: v_mov_b32_e32 v9, v1
-; GFX9-O0-DAG: v_mov_b32_e32 v8, v0
+; GFX9-O0-DAG: v_mov_b32_e32 v9, s[[ARG_HI]]
+; GFX9-O0-DAG: v_mov_b32_e32 v8, s[[ARG_LO]]
; GFX9-O3-DAG: v_mov_b32_e32 v7, s[[ARG_HI]]
; GFX9-O3-DAG: v_mov_b32_e32 v6, s[[ARG_LO]]
define amdgpu_kernel void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]]
; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}}
-; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]]
-; GFX9-O0-DAG: v_mov_b32_e32 v2, v0
+; GFX9-O0-DAG: v_mov_b32_e32 v2, [[ARG]]
; GFX9-O3: v_mov_b32_e32 v2, [[ARG]]
; GFX9: s_load_dwordx2 s{{\[}}[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]{{\]}}
; GFX9-O0: s_mov_b64 s{{\[}}[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]{{\]}}, 0{{$}}
-; GFX9-O0: v_mov_b32_e32 v0, s[[ARG_LO]]
-; GFX9-O0: v_mov_b32_e32 v1, s[[ARG_HI]]
-; GFX9-O0-DAG: v_mov_b32_e32 v9, v1
-; GFX9-O0-DAG: v_mov_b32_e32 v8, v0
+; GFX9-O0-DAG: v_mov_b32_e32 v9, s[[ARG_HI]]
+; GFX9-O0-DAG: v_mov_b32_e32 v8, s[[ARG_LO]]
; GFX9-O3-DAG: v_mov_b32_e32 v7, s[[ARG_HI]]
; GFX9-O3-DAG: v_mov_b32_e32 v6, s[[ARG_LO]]