From e3cbf1d4374129ae814f9c3b572e03c6d92ee65c Mon Sep 17 00:00:00 2001 From: alex-t Date: Wed, 1 Sep 2021 23:31:33 +0300 Subject: [PATCH] [AMDGPU] enable scalar compare in truncate selection Currently, the truncate selection dag node is expanded as a bitwise AND plus compare to 1. This change enables scalar comparison in the pattern if the truncate node is uniform. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D108925 --- llvm/lib/Target/AMDGPU/SIInstructions.td | 16 ++++++++++++++++ .../AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll | 3 ++- .../CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll | 6 ++++-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll | 2 +- llvm/test/CodeGen/AMDGPU/select-i1.ll | 6 ++++-- llvm/test/CodeGen/AMDGPU/trunc.ll | 4 ++-- llvm/test/CodeGen/AMDGPU/wave32.ll | 12 ++++++++---- 7 files changed, 37 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 540b27b..10f0813 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2108,6 +2108,22 @@ def : GCNPat < >; def : GCNPat < + (i1 (UniformUnaryFrag<trunc> i32:$a)), + (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1)) +>; + +def : GCNPat < + (i1 (UniformUnaryFrag<trunc> i16:$a)), + (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1)) +>; + +def : GCNPat < + (i1 (UniformUnaryFrag<trunc> i64:$a)), + (S_CMP_EQ_U32 (S_AND_B32 (i32 1), + (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) +>; + +def : GCNPat < (i1 (trunc i32:$a)), (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) >; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index e0037f0..0543a4e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -397,7 +397,8 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 
%cond) { ; GCN-NEXT: v_mov_b32_e32 v1, 0x80 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s0, 1, s0 -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GCN-NEXT: s_cmp_eq_u32 s0, 1 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: flat_store_short v[0:1], v0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 9c0a3e1..b642457 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -175,7 +175,8 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 { ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, 1, s4 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_cbranch_vccnz BB4_2 @@ -220,7 +221,8 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 { ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, 1, s4 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_cbranch_vccnz BB5_2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll index 49f3504..2902b02 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -17,7 +17,7 @@ declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind re ; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94 ; GCN-DAG: s_and_b32 [[AND_I1:s[0-9]+]], 1, s{{[0-9]+}} -; GCN: v_cmp_eq_u32_e64 vcc, [[AND_I1]], 1 +; GCN: s_cmp_eq_u32 [[AND_I1]], 1 ; GCN-DAG: v_mov_b32_e32 
[[VC:v[0-9]+]], [[SC]] ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll index 951449c..a546923 100644 --- a/llvm/test/CodeGen/AMDGPU/select-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll @@ -15,12 +15,14 @@ define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 ; GCN-LABEL: {{^}}s_minmax_i1: ; GCN: s_load_dword [[LOAD:s[0-9]+]], +; GCN: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]] +; GCN: s_cmp_eq_u32 [[COND]], 1 +; GCN: s_cselect_b64 vcc, -1, 0 ; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8 ; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16 -; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]] + ; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] ; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]] -; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1 ; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]] ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]] define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index 29604b7..ac19b52 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -97,9 +97,9 @@ define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) ; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]] -; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}} +; GCN: s_cmp_eq_u32 [[MASKED]], 1{{$}} +; SI: s_cselect_b64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], -1, 0 ; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]] -; VI: s_cmp_lg_u64 s{{\[}}[[VLO]]:[[VHI]]], 0 ; VI: s_cselect_b32 {{s[0-9]+}}, 63, -12 define amdgpu_kernel void @s_trunc_i64_to_i1(i32 
addrspace(1)* %out, [8 x i32], i64 %x) { %trunc = trunc i64 %x to i1 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 367c5ec..390ec5e 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -418,8 +418,10 @@ define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 { } ; GCN-LABEL: {{^}}test_div_fmas_f32: -; GFX1032: v_cmp_eq_u32_e64 vcc_lo, -; GFX1064: v_cmp_eq_u32_e64 vcc, +; GFX1032: s_cmp_eq_u32 s0, 1 +; GFX1032: s_cselect_b32 vcc_lo, -1, 0 +; GFX1064: s_cmp_eq_u32 s0, 1 +; GFX1064: s_cselect_b64 vcc, -1, 0 ; GCN: v_div_fmas_f32 v{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone @@ -428,8 +430,10 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, } ; GCN-LABEL: {{^}}test_div_fmas_f64: -; GFX1032: v_cmp_eq_u32_e64 vcc_lo, -; GFX1064: v_cmp_eq_u32_e64 vcc, +; GFX1032: s_cmp_eq_u32 s0, 1 +; GFX1032: s_cselect_b32 vcc_lo, -1, 0 +; GFX1064: s_cmp_eq_u32 s0, 1 +; GFX1064: s_cselect_b64 vcc, -1, 0 ; GCN-DAG: v_div_fmas_f64 v[{{[0-9:]+}}], {{[vs]}}[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone -- 2.7.4