From b00554886f3ad7cd5a65a8955230bae1ed8c48e4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 21 Jan 2015 18:18:25 +0000 Subject: [PATCH] R600/SI: Custom lower fround This fixes it for SI. It also removes the pattern used previously for Evergreen for f32. I'm not sure if the the new R600 output is better or not, but it uses 1 fewer instructions if BFI is available. llvm-svn: 226682 --- llvm/lib/Target/R600/AMDGPUISelLowering.cpp | 123 +++++++++++++++++++++++--- llvm/lib/Target/R600/AMDGPUISelLowering.h | 4 + llvm/lib/Target/R600/EvergreenInstructions.td | 2 - llvm/lib/Target/R600/R600Instructions.td | 12 --- llvm/test/CodeGen/R600/llvm.round.f64.ll | 74 ++++++++++++++++ llvm/test/CodeGen/R600/llvm.round.ll | 77 ++++++++++------ 6 files changed, 241 insertions(+), 51 deletions(-) create mode 100644 llvm/test/CodeGen/R600/llvm.round.f64.ll diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp index 206050d..7ce0c36 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp @@ -127,9 +127,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FABS, MVT::f32, Legal); setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FROUND, MVT::f32, Custom); + setOperationAction(ISD::FROUND, MVT::f64, Custom); + setOperationAction(ISD::FREM, MVT::f32, Custom); setOperationAction(ISD::FREM, MVT::f64, Custom); @@ -610,6 +612,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); case ISD::FRINT: return LowerFRINT(Op, DAG); case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); + case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); @@ -1917,6 +1920,20 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } +static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { + const unsigned FractBits = 52; + const unsigned ExpBits = 11; + + SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + Hi, + DAG.getConstant(FractBits - 32, MVT::i32), + DAG.getConstant(ExpBits, MVT::i32)); + SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, + DAG.getConstant(1023, MVT::i32)); + + return Exp; +} + SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -1932,16 +1949,9 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { // exponent. SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); - const unsigned FractBits = 52; - const unsigned ExpBits = 11; + SDValue Exp = extractF64Exponent(Hi, SL, DAG); - // Extract the exponent. - SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, - Hi, - DAG.getConstant(FractBits - 32, MVT::i32), - DAG.getConstant(ExpBits, MVT::i32)); - SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, - DAG.getConstant(1023, MVT::i32)); + const unsigned FractBits = 52; // Extract the sign bit. const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32); @@ -2004,6 +2014,99 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); } +// XXX - May require not supporting f32 denormals? +SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); + + SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); + + const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, MVT::f32); + const SDValue Half = DAG.getConstantFP(0.5, MVT::f32); + + SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); + + SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); + + return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); +} + +SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); + + const SDValue Zero = DAG.getConstant(0, MVT::i32); + const SDValue One = DAG.getConstant(1, MVT::i32); + const SDValue NegOne = DAG.getConstant(-1, MVT::i32); + const SDValue FiftyOne = DAG.getConstant(51, MVT::i32); + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); + + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); + + SDValue Exp = extractF64Exponent(Hi, SL, DAG); + + const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64); + + SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); + SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, + DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64), + Exp); + + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); + SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, + DAG.getConstant(0, MVT::i64), Tmp0, + ISD::SETNE); + + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, + D, DAG.getConstant(0, MVT::i64)); + SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); + + K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); + K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); + + SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, + ExpEqNegOne, + DAG.getConstantFP(1.0, MVT::f64), + DAG.getConstantFP(0.0, MVT::f64)); + + SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); + + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); + + return K; +} + +SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFROUND32(Op, DAG); + + if (VT == MVT::f64) + return LowerFROUND64(Op, DAG); + + llvm_unreachable("unhandled type"); +} + SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h index 15f529c..87c40d0 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.h +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h @@ -49,6 +49,10 @@ private: SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; diff --git a/llvm/lib/Target/R600/EvergreenInstructions.td b/llvm/lib/Target/R600/EvergreenInstructions.td index f24f76b..8ecaf5e 100644 --- a/llvm/lib/Target/R600/EvergreenInstructions.td +++ b/llvm/lib/Target/R600/EvergreenInstructions.td @@ -590,8 +590,6 @@ def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; // SHA-256 Patterns def : SHA256MaPattern ; -def : FROUNDPat ; - def EG_ExportSwz : ExportSwzInst { let Word1{19-16} = 0; // BURST_COUNT let Word1{20} = 0; // VALID_PIXEL_MODE diff --git a/llvm/lib/Target/R600/R600Instructions.td b/llvm/lib/Target/R600/R600Instructions.td index b1d3ce2..d004262 100644 --- a/llvm/lib/Target/R600/R600Instructions.td +++ b/llvm/lib/Target/R600/R600Instructions.td @@ -1142,16 +1142,6 @@ class TGSI_LIT_Z_Common ; -// FROUND pattern -class FROUNDPat : Pat < - (AMDGPUround f32:$x), - (CNDGE $x, - (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)), - (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)) - ) ->; - - //===----------------------------------------------------------------------===// // R600 / R700 Instructions //===----------------------------------------------------------------------===// @@ -1195,8 +1185,6 @@ let Predicates = [isR600] in { def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; defm : RsqPat; - def : FROUNDPat ; - def R600_ExportSwz : ExportSwzInst { let Word1{20-17} = 0; // BURST_COUNT let Word1{21} = eop; diff --git a/llvm/test/CodeGen/R600/llvm.round.f64.ll b/llvm/test/CodeGen/R600/llvm.round.f64.ll new file mode 100644 index 0000000..404cb0f --- /dev/null +++ b/llvm/test/CodeGen/R600/llvm.round.f64.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}round_f64: +; SI: s_endpgm +define void @round_f64(double addrspace(1)* %out, double %x) #0 { + %result = call double @llvm.round.f64(double %x) #1 + store double %result, double addrspace(1)* %out + ret void +} + +; This is a pretty large function, so just test a few of the +; instructions that are necessary. + +; FUNC-LABEL: {{^}}v_round_f64: +; SI: buffer_load_dwordx2 +; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 + +; SI: v_not_b32_e32 +; SI: v_not_b32_e32 + +; SI: v_cmp_eq_i32 + +; SI: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff +; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]] + +; SI: v_cmp_lt_i32_e64 +; SI: v_cmp_gt_i32_e64 + + +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep = getelementptr double addrspace(1)* %in, i32 %tid + %out.gep = getelementptr double addrspace(1)* %out, i32 %tid + %x = load double addrspace(1)* %gep + %result = call double @llvm.round.f64(double %x) #1 + store double %result, double addrspace(1)* %out.gep + ret void +} + +; FUNC-LABEL: {{^}}round_v2f64: +; SI: s_endpgm +define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { + %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v4f64: +; SI: s_endpgm +define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { + %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v8f64: +; SI: s_endpgm +define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { + %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 + store <8 x double> %result, <8 x double> addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #1 + +declare double @llvm.round.f64(double) #1 +declare <2 x double> @llvm.round.v2f64(<2 x double>) #1 +declare <4 x double> @llvm.round.v4f64(<4 x double>) #1 +declare <8 x double> @llvm.round.v8f64(<8 x double>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/llvm.round.ll b/llvm/test/CodeGen/R600/llvm.round.ll index bedf4ba..109f4c7 100644 --- a/llvm/test/CodeGen/R600/llvm.round.ll +++ b/llvm/test/CodeGen/R600/llvm.round.ll @@ -1,17 +1,27 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600 --check-prefix=FUNC - -; FUNC-LABEL: {{^}}f32: -; R600: FRACT {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] -; R600-DAG: ADD {{.*}}, -0.5 -; R600-DAG: CEIL {{.*}} [[ARG]] -; R600-DAG: FLOOR {{.*}} [[ARG]] -; R600-DAG: CNDGE -; R600-DAG: CNDGT -; R600: CNDGE {{[^,]+}}, [[ARG]] -define void @f32(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.round.f32(float %in) - store float %0, float addrspace(1)* %out +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}round_f32: +; SI-DAG: s_load_dword [[SX:s[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff +; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] +; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] +; SI: v_cmp_ge_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SUB]]|, 0.5 +; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] +; SI: buffer_store_dword [[RESULT]] + +; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] +; R600-DAG: ADD {{.*}}, +; R600-DAG: BFI_INT +; R600-DAG: SETGE +; R600-DAG: CNDE +; R600-DAG: ADD +define void @round_f32(float addrspace(1)* %out, float %x) #0 { + %result = call float @llvm.round.f32(float %x) #1 + store float %result, float addrspace(1)* %out ret void } @@ -20,24 +30,37 @@ entry: ; a test for the scalar case, so the vector tests just check that the ; compiler doesn't crash. -; FUNC-LABEL: v2f32 +; FUNC-LABEL: {{^}}round_v2f32: +; SI: s_endpgm ; R600: CF_END -define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.round.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out +define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { + %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 + store <2 x float> %result, <2 x float> addrspace(1)* %out ret void } -; FUNC-LABEL: v4f32 +; FUNC-LABEL: {{^}}round_v4f32: +; SI: s_endpgm ; R600: CF_END -define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.round.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out +define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { + %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 + store <4 x float> %result, <4 x float> addrspace(1)* %out ret void } -declare float @llvm.round.f32(float) -declare <2 x float> @llvm.round.v2f32(<2 x float>) -declare <4 x float> @llvm.round.v4f32(<4 x float>) +; FUNC-LABEL: {{^}}round_v8f32: +; SI: s_endpgm +; R600: CF_END +define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { + %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1 + store <8 x float> %result, <8 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.round.f32(float) #1 +declare <2 x float> @llvm.round.v2f32(<2 x float>) #1 +declare <4 x float> @llvm.round.v4f32(<4 x float>) #1 +declare <8 x float> @llvm.round.v8f32(<8 x float>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } -- 2.7.4