From 5b39b34ca53da637cbf7bfa1f3ff8351bc97fdd6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 28 Jan 2016 20:53:48 +0000 Subject: [PATCH] AMDGPU: Match fmed3 patterns with legacy fmin/fmax llvm-svn: 259090 --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 9 ++++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 58 +++++++++++++++------------ llvm/test/CodeGen/AMDGPU/fmed3.ll | 23 +++++++++++ 3 files changed, 62 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 0d5a808..ce9d5b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2705,8 +2705,13 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, SDValue True = N->getOperand(1); SDValue False = N->getOperand(2); - if (VT == MVT::f32 && Cond.hasOneUse()) - return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + if (VT == MVT::f32 && Cond.hasOneUse()) { + SDValue MinMax + = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + // Revisit this node so we can catch min3/max3/med3 patterns. + //DCI.AddToWorklist(MinMax.getNode()); + return MinMax; + } // There's no reason to not do this if the condition has other uses. return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 90f74d4..c00323335 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2204,29 +2204,31 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. - // max(max(a, b), c) -> max3(a, b, c) - // min(min(a, b), c) -> min3(a, b, c) - if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0.getOperand(0), - Op0.getOperand(1), - Op1); - } - - // Try commuted. - // max(a, max(b, c)) -> max3(a, b, c) - // min(a, min(b, c)) -> min3(a, b, c) - if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0, - Op1.getOperand(0), - Op1.getOperand(1)); + if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) { + // max(max(a, b), c) -> max3(a, b, c) + // min(min(a, b), c) -> min3(a, b, c) + if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0.getOperand(0), + Op0.getOperand(1), + Op1); + } + + // Try commuted. + // max(a, max(b, c)) -> max3(a, b, c) + // min(a, min(b, c)) -> min3(a, b, c) + if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0, + Op1.getOperand(0), + Op1.getOperand(1)); + } } // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) @@ -2241,7 +2243,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, } // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) - if (Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM && + if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || + (Opc == AMDGPUISD::FMIN_LEGACY && + Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) return Res; @@ -2291,12 +2295,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); - case ISD::FMAXNUM: // TODO: What about fmax_legacy? + case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: - case ISD::UMIN: { + case ISD::UMIN: + case AMDGPUISD::FMIN_LEGACY: + case AMDGPUISD::FMAX_LEGACY: { if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && N->getValueType(0) != MVT::f64 && getTargetMachine().getOptLevel() > CodeGenOpt::None) diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index c02c084..4094311 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -126,6 +126,29 @@ define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addr ret void } +; GCN-LABEL: {{^}}v_test_legacy_fmed3_r_i_i_f32: +; NOSNAN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 + +; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} +; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() + %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep0 + + ; fmax_legacy + %cmp0 = fcmp ule float %a, 2.0 + %max = select i1 %cmp0, float 2.0, float %a + + ; fmin_legacy + %cmp1 = fcmp uge float %max, 4.0 + %med = select i1 %cmp1, float 4.0, float %max + + store float %med, float addrspace(1)* %outgep + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } -- 2.7.4