From ebf46143eaf7fa3ad956baf8cc876cbe2c6ae306 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 18 Sep 2018 02:34:54 +0000
Subject: [PATCH] AMDGPU: Don't form fmed3 if it will require materialization

If there is a single use constant, it can be folded into the min/max,
but not into med3.

llvm-svn: 342443
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +++++-
 llvm/test/CodeGen/AMDGPU/clamp.ll         |  4 +-
 llvm/test/CodeGen/AMDGPU/fmed3.ll         | 62 +++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 47217a0..3de6a54 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7733,8 +7733,15 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
     if (!DAG.isKnownNeverSNaN(Var))
       return SDValue();
 
-    return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
-                       Var, SDValue(K0, 0), SDValue(K1, 0));
+    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+    if ((!K0->hasOneUse() ||
+         TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
+        (!K1->hasOneUse() ||
+         TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
+      return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
+                         Var, SDValue(K0, 0), SDValue(K1, 0));
+    }
   }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index ec9ec10..e73f286 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -54,8 +54,8 @@ define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float ad
 ; GCN-LABEL: {{^}}v_clamp_negzero_f32:
 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
-; GCN-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1
-; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[SIGNBIT]], 1.0
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[ADD]]
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 1.0, [[MAX]]
 define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index e27f430..8b88167 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -951,6 +951,68 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out,
   ret void
 }
 
+; GCN-LABEL: {{^}}two_non_inline_constant:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x41000000, [[ADD]]
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 0x41800000, [[MAX]]
+define amdgpu_kernel void @two_non_inline_constant(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add = fadd nnan float %a, 0.5
+  %max = call float @llvm.maxnum.f32(float %add, float 8.0)
+  %med = call float @llvm.minnum.f32(float %max, float 16.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants.
+; GCN-LABEL: {{^}}one_non_inline_constant:
+; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
+; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 1.0, [[K1]]
+define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add = fadd nnan float %a, 0.5
+  %max = call float @llvm.maxnum.f32(float %add, float 1.0)
+  %med = call float @llvm.minnum.f32(float %max, float 16.0)
+
+  store float %med, float addrspace(1)* %out.gep
+
+  %extra.use = fadd float %a, 16.0
+  store volatile float %extra.use, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}two_non_inline_constant_multi_use:
+; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000
+; GCN-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x41000000
+; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], [[K1]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
+; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[K0]], [[VK1]]
+define amdgpu_kernel void @two_non_inline_constant_multi_use(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add = fadd nnan float %a, 0.5
+  %max = call float @llvm.maxnum.f32(float %add, float 8.0)
+  %med = call float @llvm.minnum.f32(float %max, float 16.0)
+
+  store float %med, float addrspace(1)* %out.gep
+
+  %extra.use0 = fadd float %a, 16.0
+  store volatile float %extra.use0, float addrspace(1)* undef
+  %extra.use1 = fadd float %a, 8.0
+  store volatile float %extra.use1, float addrspace(1)* undef
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare float @llvm.fabs.f32(float) #0
 declare float @llvm.minnum.f32(float, float) #0
-- 
2.7.4
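
For context on the cost model behind this change: v_min_f32/v_max_f32 are VOP2 instructions that can encode one arbitrary 32-bit literal operand, while v_med3_f32 is VOP3 and, on the subtargets this combine runs on, only accepts inline constants, so any other constant has to be materialized first with s_mov_b32/v_mov_b32. Below is a minimal standalone sketch of that inline-constant test, using a hypothetical isInlineFP32 helper rather than LLVM's SIInstrInfo::isInlineConstant (the real check also accepts small-integer bit patterns and, on subtargets with the encoding, 1/(2*pi)):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <initializer_list>

    // Hedged model, not LLVM code: the single-precision FP values that GCN can
    // encode directly as VOP inline constants, i.e. without materialization.
    static bool isInlineFP32(float F) {
      const float Inline[] = {0.0f, 0.5f, -0.5f, 1.0f, -1.0f,
                              2.0f, -2.0f, 4.0f, -4.0f};
      std::uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits)); // bit-pattern compare, so -0.0 != +0.0
      for (float V : Inline) {
        std::uint32_t VBits;
        std::memcpy(&VBits, &V, sizeof(VBits));
        if (Bits == VBits)
          return true;
      }
      return false;
    }

    int main() {
      // Mirrors the new tests: 1.0 is inline, so forming med3 around it is still
      // free; 8.0, 16.0 and -0.0 are not, so with a single use they are cheaper
      // as literals folded into v_min_f32/v_max_f32 than materialized for
      // v_med3_f32.
      for (float F : {1.0f, 8.0f, 16.0f, -0.0f})
        std::printf("%g inline? %d\n", F, isInlineFP32(F));
      return 0;
    }

Under that model, two_non_inline_constant now compiles to v_max/v_min with folded literals, while two_non_inline_constant_multi_use keeps v_med3_f32 because its constants are materialized into registers for their other uses anyway.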