From: Matt Arsenault Date: Thu, 26 Apr 2018 19:21:32 +0000 (+0000) Subject: AMDGPU: Extend extract_vector_elt fneg combine to fabs X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=fcc5ba46b796eaab5d218321e9cb8a76c18811ac;p=platform%2Fupstream%2Fllvm.git AMDGPU: Extend extract_vector_elt fneg combine to fabs Fixes a regression in a future commit. llvm-svn: 330980 --- diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8b9447b..0d71bdd 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6600,13 +6600,14 @@ SDValue SITargetLowering::performExtractVectorEltCombine( SDValue Vec = N->getOperand(0); SelectionDAG &DAG = DCI.DAG; - if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { + if ((Vec.getOpcode() == ISD::FNEG || + Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) { SDLoc SL(N); EVT EltVT = N->getValueType(0); SDValue Idx = N->getOperand(1); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec.getOperand(0), Idx); - return DAG.getNode(ISD::FNEG, SL, EltVT, Elt); + return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt); } return SDValue(); diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 4bbaf0e..dafabdf 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -142,6 +142,49 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x ret void } +; GCN-LABEL: {{^}}v_extract_fabs_fold_v2f16: +; GCN-DAG: {{flat|global}}_load_dword [[VAL:v[0-9]+]] +; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +; CI-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} + +; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]] +; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 +; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + %elt0 = extractelement <2 x half> %fabs, i32 0 + %elt1 = extractelement <2 x half> %fabs, i32 1 + + %fmul0 = fmul half %elt0, 4.0 + %fadd1 = fadd half %elt1, 2.0 + store volatile half %fmul0, half addrspace(1)* undef + store volatile half %fadd1, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_extract_fabs_no_fold_v2f16: +; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] + +; FIXME: Extra bfe on VI +; GFX9-NOT: v_bfe_u32 +; VI: v_bfe_u32 +; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 0x7fff7fff, [[VAL]] +; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[AND]], off +define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + %elt0 = extractelement <2 x half> %fabs, i32 0 + %elt1 = extractelement <2 x half> %fabs, i32 1 + store volatile half %elt0, half addrspace(1)* undef + store volatile half %elt1, half addrspace(1)* undef + ret void +} + declare half @llvm.fabs.f16(half) #1 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1 declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index c95b7aa..58f8ab9 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -461,11 +461,9 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half ret float %result } -; FIXME: Should be able to fold ; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: ; GFX9: s_waitcnt -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] ; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> @@ -478,11 +476,9 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half ret float %result } -; FIXME: Should be able to fold ; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: ; GFX9: s_waitcnt -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 -; GFX9-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] ; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>