From 0719c43735b2a40ba11f5431aaf1b64c2e1cb084 Mon Sep 17 00:00:00 2001 From: Ruiling Song Date: Mon, 17 Jan 2022 14:54:38 +0800 Subject: [PATCH] AMDGPU: Don't clobber source register for V_SET_INACTIVE_* The WWM register has unmodeled register liveness, For v_set_inactive_*, clobberring source register is dangerous because it will overwrite the inactive lanes. When the source vgpr is dead at v_set_inactive_lane, the inactive lanes may be not really dead. This may make common optimizations doing wrong. For example in a simple if-then cfg in Machine IR: bb.if: %src = bb.then: %src1 = COPY %src %dst = V_SET_INACTIVE %src1(tied-def 0), %inactive bb.end ... = PHI [0, %bb.then] [%src, %bb.if] The register coalescer will think it is safe to optimize "%src1 = COPY %src" in bb.then. And at the same time, there is no interference for the PHI in bb.end. The source and destination values of the PHI will be assigned the same register. The single PHI register will be overwritten by the v_set_inactive, then we would get wrong value in bb.end. With this change, we will copy the content of the source register before setting inactive lanes after register allocation. Yes, this will sacrifice the WWM code generation a little, but I don't have any better idea to do things correctly. Differential Revision: https://reviews.llvm.org/D117482 --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 14 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 6 +- .../AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll | 8 +- .../AMDGPU/atomic_optimizations_global_pointer.ll | 32 ++-- .../AMDGPU/atomic_optimizations_local_pointer.ll | 176 ++++++++++----------- .../AMDGPU/atomic_optimizations_pixelshader.ll | 44 +++--- .../CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll | 2 +- .../CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll | 22 +-- llvm/test/CodeGen/AMDGPU/wqm.ll | 32 ++-- llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 73 ++++----- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 18 +-- 11 files changed, 219 insertions(+), 208 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index b422750..972bef2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1887,6 +1887,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_SET_INACTIVE_B32: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + // FIXME: We may possibly optimize the COPY once we find ways to make LLVM + // optimizations (mainly Register Coalescer) aware of WWM register liveness. + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) + .add(MI.getOperand(1)); auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) @@ -1899,11 +1903,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::V_SET_INACTIVE_B64: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); - FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), MI.getOperand(0).getReg()) - .add(MI.getOperand(2)); + .add(MI.getOperand(1)); + expandPostRAPseudo(*Copy); + auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); + FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten + Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), + MI.getOperand(0).getReg()) + .add(MI.getOperand(2)); expandPostRAPseudo(*Copy); BuildMI(MBB, MI, DL, get(NotOpc), Exec) .addReg(Exec); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 3f1d10b..7fe3524 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -180,15 +180,13 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { // restoring it after we're done. let Defs = [SCC] in { def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), - (ins VGPR_32: $src, VSrc_b32:$inactive), + (ins VSrc_b32: $src, VSrc_b32:$inactive), [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { - let Constraints = "$src = $vdst"; } def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), - (ins VReg_64: $src, VSrc_b64:$inactive), + (ins VSrc_b64: $src, VSrc_b64:$inactive), [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> { - let Constraints = "$src = $vdst"; } } // End Defs = [SCC] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 73130be..d9061eb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -44,14 +44,14 @@ define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4 ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0 +; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0 +; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s3, 56 +; GCN-NEXT: s_cmp_lg_u32 s2, 56 ; GCN-NEXT: s_cselect_b32 s2, 1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 4c4def0..a9f5d28 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -363,12 +363,12 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -388,7 +388,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB2_2 @@ -417,12 +417,12 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -442,7 +442,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 @@ -1448,12 +1448,12 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -1473,7 +1473,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB8_2 @@ -1502,12 +1502,12 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -1527,7 +1527,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB8_2 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index bf91960..7d1e3f1 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -360,12 +360,12 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -385,7 +385,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB2_2 @@ -411,12 +411,12 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -436,7 +436,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 @@ -587,9 +587,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX8-LABEL: add_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -608,7 +608,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8-NEXT: s_mov_b64 exec, s[0:1] ; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB3_2 ; GFX8-NEXT: ; %bb.1: @@ -623,9 +623,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; ; GFX9-LABEL: add_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -644,7 +644,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 ; GFX9-NEXT: ; %bb.1: @@ -1525,12 +1525,12 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -1550,7 +1550,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB9_2 @@ -1576,12 +1576,12 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -1601,7 +1601,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_2 @@ -1752,9 +1752,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX8-LABEL: sub_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -1773,7 +1773,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8-NEXT: s_mov_b64 exec, s[0:1] ; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB10_2 ; GFX8-NEXT: ; %bb.1: @@ -1788,9 +1788,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; ; GFX9-LABEL: sub_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -1809,7 +1809,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: @@ -2364,12 +2364,12 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, -1 ; GFX8-NEXT: s_not_b64 exec, exec @@ -2389,7 +2389,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB14_2 @@ -2415,12 +2415,12 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, -1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, -1 ; GFX9-NEXT: s_not_b64 exec, exec @@ -2440,7 +2440,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB14_2 @@ -2598,12 +2598,12 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -2623,7 +2623,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB15_2 @@ -2649,12 +2649,12 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -2674,7 +2674,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB15_2 @@ -2832,12 +2832,12 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -2857,7 +2857,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB16_2 @@ -2883,12 +2883,12 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -2908,7 +2908,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_2 @@ -3066,12 +3066,12 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, v1 ; GFX8-NEXT: s_not_b64 exec, exec @@ -3091,7 +3091,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB17_2 @@ -3117,12 +3117,12 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_not_b64 exec, exec @@ -3142,7 +3142,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB17_2 @@ -3166,10 +3166,10 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064-NEXT: s_not_b64 exec, exec @@ -3229,10 +3229,10 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo @@ -3483,12 +3483,12 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, v1 ; GFX8-NEXT: s_not_b64 exec, exec @@ -3508,7 +3508,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB19_2 @@ -3534,12 +3534,12 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_not_b64 exec, exec @@ -3559,7 +3559,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB19_2 @@ -3583,10 +3583,10 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064-NEXT: s_not_b64 exec, exec @@ -3646,10 +3646,10 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo @@ -3900,12 +3900,12 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -3925,7 +3925,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB21_2 @@ -3951,12 +3951,12 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -3976,7 +3976,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB21_2 @@ -4310,12 +4310,12 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, -1 ; GFX8-NEXT: s_not_b64 exec, exec @@ -4335,7 +4335,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB23_2 @@ -4361,12 +4361,12 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, -1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, -1 ; GFX9-NEXT: s_not_b64 exec, exec @@ -4386,7 +4386,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 253df34..3a8e0b2 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -179,16 +179,16 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 +; GFX8-NEXT: ; implicit-def: $vgpr3 ; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX8-NEXT: s_cbranch_execz .LBB1_4 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b64 exec, s[10:11] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -208,7 +208,7 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[10:11] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX8-NEXT: s_cbranch_execz .LBB1_3 @@ -220,14 +220,14 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v0 ; GFX8-NEXT: .LBB1_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_wqm_b64 s[4:5], -1 ; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX8-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX8-NEXT: ; %bb.5: ; %if -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX8-NEXT: .LBB1_6: ; %UnifiedReturnBlock ; GFX8-NEXT: s_endpgm ; @@ -235,16 +235,16 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX9-NEXT: s_cbranch_execz .LBB1_4 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b64 exec, s[10:11] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -264,7 +264,7 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[10:11] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_3 @@ -276,26 +276,26 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v3, s4, v0 ; GFX9-NEXT: .LBB1_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_wqm_b64 s[4:5], -1 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX9-NEXT: ; %bb.5: ; %if -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX9-NEXT: .LBB1_6: ; %UnifiedReturnBlock ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr4 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[8:9] -; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1064-NEXT: s_cbranch_execz .LBB1_4 ; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec @@ -339,26 +339,26 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX1064-NEXT: v_add_nc_u32_e32 v4, s4, v0 ; GFX1064-NEXT: .LBB1_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 ; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX1064-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX1064-NEXT: ; %bb.5: ; %if -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], 0 ; GFX1064-NEXT: .LBB1_6: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr4 ; GFX1032-NEXT: s_mov_b32 s9, s8 -; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1032-NEXT: s_cbranch_execz .LBB1_4 ; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo @@ -393,14 +393,14 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX1032-NEXT: v_add_nc_u32_e32 v4, s4, v0 ; GFX1032-NEXT: .LBB1_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: s_wqm_b32 s4, -1 ; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX1032-NEXT: ; %bb.5: ; %if -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0 ; GFX1032-NEXT: .LBB1_6: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index d411ec7..21a36c5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -28,9 +28,9 @@ define amdgpu_kernel void @set_inactive_64(i64 addrspace(1)* %out, i64 %in) { ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index e5d17d1..19bdd90 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -5,16 +5,17 @@ define amdgpu_cs void @if_then(<4 x i32> inreg %input, <4 x i32> inreg %output, ; GCN-LABEL: if_then: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: ; %bb.1: ; %.bb0 -; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: ; %bb.2: ; %.merge ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: ; %bb.3: ; %.then +; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo @@ -23,14 +24,14 @@ define amdgpu_cs void @if_then(<4 x i32> inreg %input, <4 x i32> inreg %output, ; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GCN-NEXT: s_mov_b32 exec_lo, s1 ; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v3, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen +; GCN-NEXT: v_mov_b32_e32 v4, -1 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: buffer_store_dword v4, v0, s[4:7], 0 offen ; GCN-NEXT: .LBB0_4: ; %.end ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_mov_b32_e32 v0, -1 -; GCN-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen ; GCN-NEXT: s_endpgm .entry: %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0 @@ -63,10 +64,10 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg ; GCN-LABEL: if_else_vgpr_opt: ; GCN: ; %bb.0: ; %.entry ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: ; %bb.1: ; %.bb0 -; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: ; %bb.2: ; %.merge ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v0 @@ -77,6 +78,7 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg ; GCN-NEXT: s_or_saveexec_b32 s1, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 exec_lo, s1 +; GCN-NEXT: v_mov_b32_e32 v2, v3 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_not_b32 exec_lo, exec_lo @@ -85,8 +87,8 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg ; GCN-NEXT: s_mov_b32 exec_lo, s1 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: v_mov_b32_e32 v3, -1 -; GCN-NEXT: ; implicit-def: $vgpr2 ; GCN-NEXT: buffer_store_dword v3, v0, s[4:7], 0 offen +; GCN-NEXT: ; implicit-def: $vgpr3 ; GCN-NEXT: .LBB1_4: ; %Flow ; GCN-NEXT: s_or_saveexec_b32 s0, s0 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 @@ -94,7 +96,7 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg ; GCN-NEXT: s_cbranch_execz .LBB1_6 ; GCN-NEXT: ; %bb.5: ; %.then ; GCN-NEXT: v_mov_b32_e32 v0, -1 -; GCN-NEXT: buffer_store_dword v0, v2, s[4:7], 0 offen +; GCN-NEXT: buffer_store_dword v0, v3, s[4:7], 0 offen ; GCN-NEXT: .LBB1_6: ; %.end ; GCN-NEXT: s_endpgm .entry: diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 8addc51..3234d66 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -691,9 +691,10 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) { ; GFX9-W64-LABEL: test_wwm_set_inactive1: ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -706,9 +707,10 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) { ; GFX10-W32-LABEL: test_wwm_set_inactive1: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 @@ -2315,7 +2317,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-W64-NEXT: s_not_b64 exec, exec @@ -2344,7 +2347,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo @@ -2795,9 +2799,10 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) { ; GFX9-W64-LABEL: test_strict_wwm_set_inactive1: ; GFX9-W64: ; %bb.0: ; %main_body ; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen -; GFX9-W64-NEXT: s_not_b64 exec, exec +; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 @@ -2810,9 +2815,10 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) { ; GFX10-W32-LABEL: test_strict_wwm_set_inactive1: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen -; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 @@ -2848,7 +2854,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-W64-NEXT: s_not_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-W64-NEXT: s_not_b64 exec, exec @@ -2877,7 +2884,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v0 +; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 516c154..a39bed7 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -66,18 +66,19 @@ define amdgpu_gfx void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) { ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec @@ -97,9 +98,7 @@ define amdgpu_gfx void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) { ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) @@ -233,50 +232,48 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc +; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O3-NEXT: ; %bb.1: ; %if ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-O3-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-O3-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O3-NEXT: ; %bb.2: ; %merge +; GFX9-O3-NEXT: .LBB1_2: ; %merge ; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] @@ -355,8 +352,7 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) ; GFX9-O0-NEXT: s_mov_b32 s39, s7 ; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -534,7 +530,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 -; GFX9-O0-NEXT: s_mov_b32 s34, s8 +; GFX9-O0-NEXT: s_mov_b32 s30, s8 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 @@ -544,18 +540,16 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a ; GFX9-O0-NEXT: v_writelane_b32 v10, s37, 3 ; GFX9-O0-NEXT: v_writelane_b32 v10, s38, 4 ; GFX9-O0-NEXT: v_writelane_b32 v10, s39, 5 -; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 -; GFX9-O0-NEXT: s_mov_b32 s35, s9 -; GFX9-O0-NEXT: ; kill: def $sgpr30_sgpr31 killed $sgpr34_sgpr35 -; GFX9-O0-NEXT: s_mov_b64 s[30:31], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s35 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: ; kill: def $sgpr30 killed $sgpr30 def $sgpr30_sgpr31 +; GFX9-O0-NEXT: s_mov_b32 s31, s9 +; GFX9-O0-NEXT: ; kill: def $sgpr34_sgpr35 killed $sgpr30_sgpr31 +; GFX9-O0-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, s30 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s31 ; GFX9-O0-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 +; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 6 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 7 @@ -638,14 +632,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: s_mov_b64 s[36:37], s[30:31] -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-O3-NEXT: s_getpc_b64 s[34:35] ; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 ; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[30:31] +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 @@ -772,27 +766,34 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX9-O3-NEXT: buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen -; GFX9-O3-NEXT: buffer_load_dwordx2 v[5:6], v0, s[4:7], 0 offen offset:16 +; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen +; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 ; GFX9-O3-NEXT: s_mov_b32 s34, -1 ; GFX9-O3-NEXT: s_brev_b32 s35, -2 -; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v1, s34 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s35 ; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v3, s34 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, s35 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v5, s34 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s35 ; GFX9-O3-NEXT: s_not_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 3bc6595..ca83115 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -106,8 +106,7 @@ define hidden i32 @called(i32 %a) noinline { define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) { ; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]] ; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}} -; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]] -; GFX9-O0-DAG: v_mov_b32_e32 v2, v0 +; GFX9-O0-DAG: v_mov_b32_e32 v2, [[ARG]] ; GFX9-O3: v_mov_b32_e32 v2, [[ARG]] @@ -142,10 +141,8 @@ define amdgpu_kernel void @call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) { ; GFX9: s_load_dwordx2 s{{\[}}[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]{{\]}} ; GFX9-O0: s_mov_b64 s{{\[}}[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]{{\]}}, 0{{$}} -; GFX9-O0: v_mov_b32_e32 v0, s[[ARG_LO]] -; GFX9-O0: v_mov_b32_e32 v1, s[[ARG_HI]] -; GFX9-O0-DAG: v_mov_b32_e32 v9, v1 -; GFX9-O0-DAG: v_mov_b32_e32 v8, v0 +; GFX9-O0-DAG: v_mov_b32_e32 v9, s[[ARG_HI]] +; GFX9-O0-DAG: v_mov_b32_e32 v8, s[[ARG_LO]] ; GFX9-O3-DAG: v_mov_b32_e32 v7, s[[ARG_HI]] ; GFX9-O3-DAG: v_mov_b32_e32 v6, s[[ARG_LO]] @@ -300,8 +297,7 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline { define amdgpu_kernel void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) { ; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]] ; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}} -; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]] -; GFX9-O0-DAG: v_mov_b32_e32 v2, v0 +; GFX9-O0-DAG: v_mov_b32_e32 v2, [[ARG]] ; GFX9-O3: v_mov_b32_e32 v2, [[ARG]] @@ -336,10 +332,8 @@ define amdgpu_kernel void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg ; GFX9: s_load_dwordx2 s{{\[}}[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]{{\]}} ; GFX9-O0: s_mov_b64 s{{\[}}[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]{{\]}}, 0{{$}} -; GFX9-O0: v_mov_b32_e32 v0, s[[ARG_LO]] -; GFX9-O0: v_mov_b32_e32 v1, s[[ARG_HI]] -; GFX9-O0-DAG: v_mov_b32_e32 v9, v1 -; GFX9-O0-DAG: v_mov_b32_e32 v8, v0 +; GFX9-O0-DAG: v_mov_b32_e32 v9, s[[ARG_HI]] +; GFX9-O0-DAG: v_mov_b32_e32 v8, s[[ARG_LO]] ; GFX9-O3-DAG: v_mov_b32_e32 v7, s[[ARG_HI]] ; GFX9-O3-DAG: v_mov_b32_e32 v6, s[[ARG_LO]] -- 2.7.4