From 3828ea6181fd007438379de70fc7b9fc9c8dbb02 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Thu, 16 Sep 2021 14:36:51 +0100
Subject: [PATCH] [AMDGPU] Divergence-driven instruction selection for mul i32

Differential Revision: https://reviews.llvm.org/D109881
---
 llvm/lib/Target/AMDGPU/SOPInstructions.td      |  3 +-
 llvm/lib/Target/AMDGPU/VOP3Instructions.td     |  2 +-
 .../CodeGen/AMDGPU/urem-seteq-illegal-types.ll | 16 +++----
 llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll  | 11 +++--
 llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 52 ++++++++--------
 llvm/test/CodeGen/AMDGPU/wwm-reserved.ll       |  8 ++--
 6 files changed, 38 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 0a6afe0..ff7c0a8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -622,9 +622,8 @@ def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
   [(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>;
 def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
 
-// TODO: S_MUL_I32 require V_MUL_LO_I32 from VOP3 change
 def S_MUL_I32 : SOP2_32 <"s_mul_i32",
-  [(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
+  [(set i32:$sdst, (UniformBinFrag<mul> i32:$src0, i32:$src1))]> {
   let isCommutable = 1;
 }
 } // End isReMaterializable = 1
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index f317a6d..4b216a4 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -304,7 +304,7 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_l
 } // End SchedRW = [WriteDoubleAdd]
 
 let SchedRW = [WriteIntMul] in {
-defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
+defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, DivergentBinFrag<mul>>;
 defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
 defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
 defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
index 96b1c77..7972360 100644
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -76,22 +76,22 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; CHECK-NEXT: s_mov_b32 s5, 0x8311eb33
 ; CHECK-NEXT: s_mov_b32 s6, 0x20140c
 ; CHECK-NEXT: s_mov_b32 s7, 0xb6db6db7
-; CHECK-NEXT: s_mov_b32 s11, 0x49249249
-; CHECK-NEXT: s_mov_b32 s8, 0x24924924
-; CHECK-NEXT: s_mov_b32 s9, 0xaaaaaaab
-; CHECK-NEXT: s_mov_b32 s10, 0x2aaaaaaa
+; CHECK-NEXT: s_mov_b32 s8, 0x49249249
+; CHECK-NEXT: s_mov_b32 s9, 0x24924924
+; CHECK-NEXT: s_mov_b32 s10, 0xaaaaaaab
+; CHECK-NEXT: s_mov_b32 s11, 0x2aaaaaaa
 ; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
 ; CHECK-NEXT: v_and_b32_e32 v1, s4, v1
 ; CHECK-NEXT: v_and_b32_e32 v2, s4, v2
 ; CHECK-NEXT: v_mul_lo_u32 v2, v2, s5
 ; CHECK-NEXT: v_mul_lo_u32 v1, v1, s7
-; CHECK-NEXT: v_mul_lo_u32 v0, v0, s9
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, s10
 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xf9dc299a, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, s11, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, s8, v1
 ; CHECK-NEXT: v_alignbit_b32 v0, v0, v0, 1
-; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s11, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s8, v1
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s9, v1
 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s6, v2
 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 1641913..b19867e 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -119,9 +119,9 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
 ; SI: S_BRANCH %bb.4
 ; SI: bb.2.Flow:
 ; SI: successors: %bb.3(0x40000000), %bb.5(0x40000000)
- ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %34:vgpr_32, %bb.1, %10, %bb.4
- ; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %35:vgpr_32, %bb.1, %9, %bb.4
- ; SI: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %38:vgpr_32, %bb.4
+ ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %10, %bb.4
+ ; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %9, %bb.4
+ ; SI: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %35:vgpr_32, %bb.4
 ; SI: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 ; SI: S_BRANCH %bb.3
 ; SI: bb.3.if:
@@ -133,7 +133,6 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
 ; SI: successors: %bb.2(0x80000000)
 ; SI: %9:vgpr_32 = nofpexcept V_MUL_F32_e32 [[COPY2]], [[PHI1]], implicit $mode, implicit $exec
 ; SI: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec
- ; SI: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_MUL_LO_U32_e64_]]
 ; SI: S_BRANCH %bb.2
 ; SI: bb.5.if.end:
 ; SI: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
@@ -146,8 +145,8 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
 ; SI: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
 ; SI: S_BRANCH %bb.6
 ; SI: bb.6.for.end:
- ; SI: %33:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI6]], killed [[PHI5]], implicit $mode, implicit $exec
- ; SI: $vgpr0 = COPY killed %33
+ ; SI: %31:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI6]], killed [[PHI5]], implicit $mode, implicit $exec
+ ; SI: $vgpr0 = COPY killed %31
 ; SI: SI_RETURN_TO_EPILOG killed $vgpr0
 entry:
 ; %break = icmp sgt i32 %bound, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 7061a23..3d655d2 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -322,8 +322,6 @@ define hidden amdgpu_gfx i32 @strict_wwm_called(i32 %a) noinline {
 ; GFX9-O0: ; %bb.0:
 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT: v_add_u32_e64 v1, v0, v0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT: v_mul_lo_u32 v0, v1, v0
 ; GFX9-O0-NEXT: v_sub_u32_e64 v0, v0, v1
 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
@@ -350,42 +348,36 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
-; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 7
+; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 2
 ; GFX9-O0-NEXT: s_mov_b32 s33, s32
 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
 ; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-O0-NEXT: v_writelane_b32 v3, s8, 2
-; GFX9-O0-NEXT: s_mov_b32 s8, s4
-; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 2
-; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
-; GFX9-O0-NEXT: s_mov_b32 s9, s5
+; GFX9-O0-NEXT: s_mov_b32 s9, s8
+; GFX9-O0-NEXT: s_mov_b32 s8, s7
 ; GFX9-O0-NEXT: s_mov_b32 s10, s6
-; GFX9-O0-NEXT: s_mov_b32 s11, s7
-; GFX9-O0-NEXT: v_writelane_b32 v3, s8, 3
-; GFX9-O0-NEXT: v_writelane_b32 v3, s9, 4
-; GFX9-O0-NEXT: v_writelane_b32 v3, s10, 5
-; GFX9-O0-NEXT: v_writelane_b32 v3, s11, 6
+; GFX9-O0-NEXT: s_mov_b32 s11, s5
+; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; GFX9-O0-NEXT: s_mov_b32 s5, s11
+; GFX9-O0-NEXT: s_mov_b32 s6, s10
+; GFX9-O0-NEXT: s_mov_b32 s7, s8
+; GFX9-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15 killed $sgpr4_sgpr5_sgpr6_sgpr7
 ; GFX9-O0-NEXT: s_mov_b32 s8, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s9
 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT: s_not_b64 exec, exec
 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
 ; GFX9-O0-NEXT: s_not_b64 exec, exec
 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
-; GFX9-O0-NEXT: s_getpc_b64 s[4:5]
-; GFX9-O0-NEXT: s_add_u32 s4, s4, strict_wwm_called@rel32@lo+4
-; GFX9-O0-NEXT: s_addc_u32 s5, s5, strict_wwm_called@rel32@hi+12
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[2:3]
-; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[0:1]
-; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[12:13]
-; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[14:15]
+; GFX9-O0-NEXT: s_getpc_b64 s[12:13]
+; GFX9-O0-NEXT: s_add_u32 s12, s12, strict_wwm_called@rel32@lo+4
+; GFX9-O0-NEXT: s_addc_u32 s13, s13, strict_wwm_called@rel32@hi+12
+; GFX9-O0-NEXT: s_mov_b64 s[18:19], s[2:3]
+; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[16:17]
+; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[18:19]
 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 3
-; GFX9-O0-NEXT: v_readlane_b32 s5, v3, 4
-; GFX9-O0-NEXT: v_readlane_b32 s6, v3, 5
-; GFX9-O0-NEXT: v_readlane_b32 s7, v3, 6
+; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[12:13]
 ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0
 ; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1
 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
@@ -394,7 +386,7 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[4:7], s8 offset:4
 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00
-; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 7
+; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 2
 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -467,15 +459,11 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT: v_mul_lo_u32 v2, v0, v1
 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT: v_mul_hi_u32 v1, v0, v6
 ; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[4:5]
 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT: v_mul_lo_u32 v3, v3, v6
 ; GFX9-O0-NEXT: v_add3_u32 v1, v1, v2, v3
 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5
@@ -485,8 +473,6 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
 ; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[1:2]
 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
 ; GFX9-O0-NEXT: v_mul_lo_u32 v6, v0, v6
 ; GFX9-O0-NEXT: s_mov_b32 s5, 0
 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 340e7e5..1a7e432 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -105,14 +105,14 @@ define hidden i32 @called(i32 %a) noinline {
 ; GFX9-LABEL: {{^}}call:
 define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
 ; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]]
-; GFX9-O0-DAG: s_mov_b32 s0, 0{{$}}
+; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}}
 ; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]]
 ; GFX9-O0-DAG: v_mov_b32_e32 v2, v0

 ; GFX9-O3: v_mov_b32_e32 v2, [[ARG]]

 ; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: s_not_b64 exec, exec
 %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
@@ -299,14 +299,14 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline {
 ; GFX9-LABEL: {{^}}strict_wwm_call:
 define amdgpu_kernel void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
 ; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]]
-; GFX9-O0-DAG: s_mov_b32 s0, 0{{$}}
+; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}}
 ; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]]
 ; GFX9-O0-DAG: v_mov_b32_e32 v2, v0

 ; GFX9-O3: v_mov_b32_e32 v2, [[ARG]]

 ; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: s_not_b64 exec, exec
 %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
--
2.7.4
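
A quick illustration of the intended selection behaviour (a minimal sketch, not taken from the patch; the kernel name and IR shape below are hypothetical): with UniformBinFrag<mul> on S_MUL_I32 and DivergentBinFrag<mul> on V_MUL_LO_U32, a mul i32 whose operands are uniform (for example scalar kernel arguments) should now select s_mul_i32, while a mul i32 that consumes a divergent value (for example the workitem id) should still select v_mul_lo_u32, as the remaining v_mul_lo_u32 checks in the tests above show.

declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @mul_example(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  ; %a and %b are uniform kernel arguments, so this multiply is expected
  ; to be selected as s_mul_i32 (SALU).
  %u = mul i32 %a, %b
  ; The workitem id is divergent, so this multiply is expected to be
  ; selected as v_mul_lo_u32 (VALU).
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %d = mul i32 %u, %tid
  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  store i32 %d, i32 addrspace(1)* %gep
  ret void
}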