Currently, the truncate selection dag node is expanded as a bitwise AND plus compare to 1. This change enables scalar comparison in the pattern if the truncate node is uniform.
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D108925
>;
def : GCNPat <
+ (i1 (UniformUnaryFrag<trunc> i32:$a)),
+ (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1))
+>;
+
+def : GCNPat <
+ (i1 (UniformUnaryFrag<trunc> i16:$a)),
+ (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1))
+>;
+
+def : GCNPat <
+ (i1 (UniformUnaryFrag<trunc> i64:$a)),
+ (S_CMP_EQ_U32 (S_AND_B32 (i32 1),
+ (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
+>;
+
+def : GCNPat <
(i1 (trunc i32:$a)),
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;
; GCN-NEXT: v_mov_b32_e32 v1, 0x80
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s0, 1, s0
-; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
+; GCN-NEXT: s_cmp_eq_u32 s0, 1
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: flat_store_short v[0:1], v0
; GCN-NEXT: s_endpgm
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, 1, s4
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
+; GCN-NEXT: s_cmp_eq_u32 s4, 1
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_cbranch_vccnz BB4_2
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, 1, s4
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
+; GCN-NEXT: s_cmp_eq_u32 s4, 1
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_cbranch_vccnz BB5_2
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
; GCN-DAG: s_and_b32 [[AND_I1:s[0-9]+]], 1, s{{[0-9]+}}
-; GCN: v_cmp_eq_u32_e64 vcc, [[AND_I1]], 1
+; GCN: s_cmp_eq_u32 [[AND_I1]], 1
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; GCN-LABEL: {{^}}s_minmax_i1:
; GCN: s_load_dword [[LOAD:s[0-9]+]],
+; GCN: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]]
+; GCN: s_cmp_eq_u32 [[COND]], 1
+; GCN: s_cselect_b64 vcc, -1, 0
; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8
; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16
-; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]]
+
; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
-; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1
; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]]
; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]]
define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]]
-; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}}
+; GCN: s_cmp_eq_u32 [[MASKED]], 1{{$}}
+; SI: s_cselect_b64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], -1, 0
; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]]
-; VI: s_cmp_lg_u64 s{{\[}}[[VLO]]:[[VHI]]], 0
; VI: s_cselect_b32 {{s[0-9]+}}, 63, -12
define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32], i64 %x) {
%trunc = trunc i64 %x to i1
}
; GCN-LABEL: {{^}}test_div_fmas_f32:
-; GFX1032: v_cmp_eq_u32_e64 vcc_lo,
-; GFX1064: v_cmp_eq_u32_e64 vcc,
+; GFX1032: s_cmp_eq_u32 s0, 1
+; GFX1032: s_cselect_b32 vcc_lo, -1, 0
+; GFX1064: s_cmp_eq_u32 s0, 1
+; GFX1064: s_cselect_b64 vcc, -1, 0
; GCN: v_div_fmas_f32 v{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
}
; GCN-LABEL: {{^}}test_div_fmas_f64:
-; GFX1032: v_cmp_eq_u32_e64 vcc_lo,
-; GFX1064: v_cmp_eq_u32_e64 vcc,
+; GFX1032: s_cmp_eq_u32 s0, 1
+; GFX1032: s_cselect_b32 vcc_lo, -1, 0
+; GFX1064: s_cmp_eq_u32 s0, 1
+; GFX1064: s_cselect_b64 vcc, -1, 0
; GCN-DAG: v_div_fmas_f64 v[{{[0-9:]+}}], {{[vs]}}[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
%result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone