From a0050b0961b1fd1cbdd58bc6257bc3f54b25acf4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 19 Jun 2014 01:19:19 +0000 Subject: [PATCH] R600/SI: Add intrinsics for various math instructions. These will be used for custom lowering and for library implementations of various math functions, so it's useful to expose these as builtins. llvm-svn: 211247 --- llvm/include/llvm/IR/IntrinsicsR600.td | 36 ++++++++++++++ llvm/lib/Target/R600/AMDGPUISelLowering.cpp | 32 +++++++++++- llvm/lib/Target/R600/AMDGPUISelLowering.h | 12 +++++ llvm/lib/Target/R600/AMDGPUInstrInfo.td | 29 +++++++++++ llvm/lib/Target/R600/AMDGPUInstructions.td | 10 ++++ llvm/lib/Target/R600/AMDGPUIntrinsics.td | 2 - llvm/lib/Target/R600/R600Instructions.td | 2 +- llvm/lib/Target/R600/SIInsertWaits.cpp | 2 + llvm/lib/Target/R600/SIInstructions.td | 37 +++++++++++--- .../Transforms/InstCombine/InstCombineCalls.cpp | 14 ++++++ llvm/test/CodeGen/R600/big_alu.ll | 12 ++--- llvm/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll | 27 ++++++++++ llvm/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll | 27 ++++++++++ llvm/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll | 23 +++++++++ llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll | 58 ++++++++++++++++++++++ llvm/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll | 29 +++++++++++ llvm/test/CodeGen/R600/pv.ll | 4 +- llvm/test/CodeGen/R600/sgpr-copy.ll | 4 +- llvm/test/CodeGen/R600/si-sgpr-spill.ll | 18 +++---- .../test/Transforms/InstCombine/r600-intrinsics.ll | 47 ++++++++++++++++++ 20 files changed, 393 insertions(+), 32 deletions(-) create mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll create mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll create mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll create mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll create mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll create mode 100644 llvm/test/Transforms/InstCombine/r600-intrinsics.ll diff --git a/llvm/include/llvm/IR/IntrinsicsR600.td 
b/llvm/include/llvm/IR/IntrinsicsR600.td index ecb5668..09607d5 100644 --- a/llvm/include/llvm/IR/IntrinsicsR600.td +++ b/llvm/include/llvm/IR/IntrinsicsR600.td @@ -33,4 +33,40 @@ defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz < "__builtin_r600_read_tgid">; defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz < "__builtin_r600_read_tidig">; + } // End TargetPrefix = "r600" + +let TargetPrefix = "AMDGPU" in { +def int_AMDGPU_div_scale : + Intrinsic<[llvm_anyfloat_ty, llvm_i1_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>, + GCCBuiltin<"__builtin_amdgpu_div_scale">; + +def int_AMDGPU_div_fmas : + Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>, + GCCBuiltin<"__builtin_amdgpu_div_fmas">; + +def int_AMDGPU_div_fixup : + Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>, + GCCBuiltin<"__builtin_amdgpu_div_fixup">; + +def int_AMDGPU_trig_preop : + Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>, + GCCBuiltin<"__builtin_amdgpu_trig_preop">; + +def int_AMDGPU_rcp : + Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>], [IntrNoMem]>, + GCCBuiltin<"__builtin_amdgpu_rcp">; + +def int_AMDGPU_rsq : + Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>], [IntrNoMem]>, + GCCBuiltin<"__builtin_amdgpu_rsq">; + + +} // End TargetPrefix = "AMDGPU" diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp index 34c2b2b..1aa92fa 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp @@ -842,6 +842,28 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::AMDGPU_div_scale: + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case 
Intrinsic::AMDGPU_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::AMDGPU_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_imax: return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -2042,6 +2064,14 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMIN) NODE_NAME_CASE(SMIN) NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(DIV_SCALE) + NODE_NAME_CASE(DIV_FMAS) + NODE_NAME_CASE(DIV_FIXUP) + NODE_NAME_CASE(TRIG_PREOP) + NODE_NAME_CASE(RCP) + NODE_NAME_CASE(RSQ) + NODE_NAME_CASE(DOT4) NODE_NAME_CASE(BFE_U32) NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) @@ -2051,8 +2081,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DOT4) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h index b2bb257..e2000a0 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.h +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h @@ -175,6 +175,9 @@ enum { DWORDADDR, FRACT, CLAMP, + + // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. + // Denormals handled on some parts. 
COS_HW, SIN_HW, FMAX, @@ -184,6 +187,15 @@ enum { SMIN, UMIN, URECIP, + DIV_SCALE, + DIV_FMAS, + DIV_FIXUP, + TRIG_PREOP, // 1 ULP max error for f64 + + // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. + // For f64, max error 2^29 ULP, handles denormals. + RCP, + RSQ, DOT4, BFE_U32, // Extract range of bits with zero extension to 32-bits. BFE_I32, // Extract range of bits with sign extension to 32-bits. diff --git a/llvm/lib/Target/R600/AMDGPUInstrInfo.td b/llvm/lib/Target/R600/AMDGPUInstrInfo.td index 942a9e8..d0ee40a 100644 --- a/llvm/lib/Target/R600/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/R600/AMDGPUInstrInfo.td @@ -19,6 +19,14 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> ]>; +def AMDGPUTrigPreOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPUDivScaleOp : SDTypeProfile<2, 3, + [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] +>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -29,6 +37,12 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; // out = a - floor(a) def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; +// out = 1.0 / a +def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; + // out = max(a, b) a and b are floats def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] @@ -78,6 +92,21 @@ def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", // e is rounding error def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; +// Special case divide preop and flags. +def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; + +// Special case divide FMA with scale and flags (src0 = Quotient, +// src1 = Denominator, src2 = Numerator). 
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>; + +// Single or double precision division fixup. +// Special case divide fixup and flags(src0 = Quotient, src1 = +// Denominator, src2 = Numerator). +def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; + +// Look Up 2.0 / pi src0 with segment select src1[4:0] +def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; + def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, [SDNPHasChain, SDNPMayLoad]>; diff --git a/llvm/lib/Target/R600/AMDGPUInstructions.td b/llvm/lib/Target/R600/AMDGPUInstructions.td index 8bfc11c..14bfd8c 100644 --- a/llvm/lib/Target/R600/AMDGPUInstructions.td +++ b/llvm/lib/Target/R600/AMDGPUInstructions.td @@ -519,6 +519,16 @@ multiclass Expand24UBitOps { >; } +class RcpPat<Instruction RcpInst, ValueType vt> : Pat < + (fdiv FP_ONE, vt:$src), + (RcpInst $src) +>; + +class RsqPat<Instruction RsqInst, ValueType vt> : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) +>; + include "R600Instructions.td" include "R700Instructions.td" include "EvergreenInstructions.td" diff --git a/llvm/lib/Target/R600/AMDGPUIntrinsics.td b/llvm/lib/Target/R600/AMDGPUIntrinsics.td index 6dc7612..538b4cd 100644 --- a/llvm/lib/Target/R600/AMDGPUIntrinsics.td +++ b/llvm/lib/Target/R600/AMDGPUIntrinsics.td @@ -30,8 +30,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, 
llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/R600/R600Instructions.td b/llvm/lib/Target/R600/R600Instructions.td index 58c704d..47b7da0 100644 --- a/llvm/lib/Target/R600/R600Instructions.td +++ b/llvm/lib/Target/R600/R600Instructions.td @@ -1083,7 +1083,7 @@ class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper < } class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper < - inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq > { let Itinerary = TransALU; } diff --git a/llvm/lib/Target/R600/SIInsertWaits.cpp b/llvm/lib/Target/R600/SIInsertWaits.cpp index a17fed7..1733326 100644 --- a/llvm/lib/Target/R600/SIInsertWaits.cpp +++ b/llvm/lib/Target/R600/SIInsertWaits.cpp @@ -341,6 +341,8 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { return Result; } +// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" +// around other non-memory instructions. 
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { bool Changes = false; diff --git a/llvm/lib/Target/R600/SIInstructions.td b/llvm/lib/Target/R600/SIInstructions.td index 60eb8f9..26024dc 100644 --- a/llvm/lib/Target/R600/SIInstructions.td +++ b/llvm/lib/Target/R600/SIInstructions.td @@ -1116,22 +1116,23 @@ defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", [(set f32:$dst, (flog2 f32:$src0))] >; + defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", - [(set f32:$dst, (fdiv FP_ONE, f32:$src0))] + [(set f32:$dst, (AMDGPUrcp f32:$src0))] >; defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; defm V_RSQ_LEGACY_F32 : VOP1_32 < 0x0000002d, "V_RSQ_LEGACY_F32", - [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))] + [(set f32:$dst, (AMDGPUrsq f32:$src0))] >; defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", [(set f32:$dst, (fdiv FP_ONE, (fsqrt f32:$src0)))] >; defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", - [(set f64:$dst, (fdiv FP_ONE, f64:$src0))] + [(set f64:$dst, (AMDGPUrcp f64:$src0))] >; defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", @@ -1417,8 +1418,12 @@ defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; //def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; -defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; -def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; +defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", + [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))] +>; +def V_DIV_FIXUP_F64 : VOP3_64 
<0x00000160, "V_DIV_FIXUP_F64", + [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))] +>; def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64", [(set i64:$dst, (shl i64:$src0, i32:$src1))] @@ -1452,12 +1457,19 @@ defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; -defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; -def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; + +defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", + [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))] +>; +def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", + [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))] +>; //def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; -def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; +def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64", + [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))] +>; //===----------------------------------------------------------------------===// // Pseudo Instructions @@ -1749,6 +1761,15 @@ def : Pat < >; //===----------------------------------------------------------------------===// +// VOP1 Patterns +//===----------------------------------------------------------------------===// + +def : RcpPat<V_RCP_F32_e32, f32>; +def : RcpPat<V_RCP_F64_e32, f64>; +def : RsqPat<V_RSQ_F32_e32, f32>; +def : RsqPat<V_RSQ_F64_e32, f64>; + +//===----------------------------------------------------------------------===// // VOP2 Patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index d4bdd75..ff74564 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ 
b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -922,6 +922,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::AMDGPU_rcp: { + if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) { + const APFloat &ArgVal = C->getValueAPF(); + APFloat Val(ArgVal.getSemantics(), 1.0); + APFloat::opStatus Status = Val.divide(ArgVal, + APFloat::rmNearestTiesToEven); + // Only do this if it was exact and therefore not dependent on the + // rounding mode. + if (Status == APFloat::opOK) + return ReplaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); + } + + break; + } case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the restore. This can // happen when variable allocas are DCE'd. diff --git a/llvm/test/CodeGen/R600/big_alu.ll b/llvm/test/CodeGen/R600/big_alu.ll index 6b68376..511e8ef 100644 --- a/llvm/test/CodeGen/R600/big_alu.ll +++ b/llvm/test/CodeGen/R600/big_alu.ll @@ -101,7 +101,7 @@ IF137: ; preds = %main_body %88 = insertelement <4 x float> %87, float %32, i32 2 %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89) - %91 = call float @llvm.AMDGPU.rsq(float %90) + %91 = call float @llvm.AMDGPU.rsq.f32(float %90) %92 = fmul float %30, %91 %93 = fmul float %31, %91 %94 = fmul float %32, %91 @@ -344,7 +344,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15 %325 = insertelement <4 x float> %324, float %318, i32 2 %326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3 %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326) - %328 = call float @llvm.AMDGPU.rsq(float %327) + %328 = call float @llvm.AMDGPU.rsq.f32(float %327) %329 = fmul float %314, %328 %330 = fmul float %316, %328 %331 = fmul float %318, %328 @@ -377,7 +377,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15 %358 = insertelement <4 x float> %357, float %45, i32 2 %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3 %360 = 
call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359) - %361 = call float @llvm.AMDGPU.rsq(float %360) + %361 = call float @llvm.AMDGPU.rsq.f32(float %360) %362 = fmul float %45, %361 %363 = call float @fabs(float %362) %364 = fmul float %176, 0x3FECCCCCC0000000 @@ -403,7 +403,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15 %384 = insertelement <4 x float> %383, float %45, i32 2 %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3 %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385) - %387 = call float @llvm.AMDGPU.rsq(float %386) + %387 = call float @llvm.AMDGPU.rsq.f32(float %386) %388 = fmul float %45, %387 %389 = call float @fabs(float %388) %390 = fmul float %176, 0x3FF51EB860000000 @@ -1041,7 +1041,7 @@ IF179: ; preds = %ENDIF175 %896 = insertelement <4 x float> %895, float %45, i32 2 %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3 %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897) - %899 = call float @llvm.AMDGPU.rsq(float %898) + %899 = call float @llvm.AMDGPU.rsq.f32(float %898) %900 = fmul float %45, %899 %901 = call float @fabs(float %900) %902 = fmul float %176, 0x3FECCCCCC0000000 @@ -1150,7 +1150,7 @@ ENDIF178: ; preds = %ENDIF175, %IF179 declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq(float) #1 +declare float @llvm.AMDGPU.rsq.f32(float) #1 ; Function Attrs: readnone declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll new file mode 100644 index 0000000..c8c7357 --- /dev/null +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone +declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) 
nounwind readnone + +; SI-LABEL: @test_div_fixup_f32: +; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]] +; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: V_DIV_FIXUP_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] +; SI: BUFFER_STORE_DWORD [[RESULT]], +; SI: S_ENDPGM +define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: @test_div_fixup_f64: +; SI: V_DIV_FIXUP_F64 +define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { + %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll new file mode 100644 index 0000000..4f1e827 --- /dev/null +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.AMDGPU.div.fmas.f32(float, float, float) nounwind readnone +declare double @llvm.AMDGPU.div.fmas.f64(double, double, double) nounwind readnone + +; SI-LABEL: @test_div_fmas_f32: +; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]] +; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: V_DIV_FMAS_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] +; SI: BUFFER_STORE_DWORD 
[[RESULT]], +; SI: S_ENDPGM +define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: @test_div_fmas_f64: +; SI: V_DIV_FMAS_F64 +define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { + %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll new file mode 100644 index 0000000..1bcbe2f --- /dev/null +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll @@ -0,0 +1,23 @@ +; XFAIL: * +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.AMDGPU.div.scale.f32(float, float) nounwind readnone +declare double @llvm.AMDGPU.div.scale.f64(double, double) nounwind readnone + +; SI-LABEL @test_div_scale_f32: +define void @test_div_scale_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind { + %a = load float addrspace(1)* %aptr, align 4 + %b = load float addrspace(1)* %bptr, align 4 + %result = call float @llvm.AMDGPU.div.scale.f32(float %a, float %b) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64: +define void @test_div_scale_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr) nounwind { + %a = load double addrspace(1)* %aptr, align 8 + %b = load double addrspace(1)* %bptr, align 8 + %result = call double @llvm.AMDGPU.div.scale.f64(double %a, double %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git 
a/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll new file mode 100644 index 0000000..ca5260d --- /dev/null +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll @@ -0,0 +1,58 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone + + +declare float @llvm.sqrt.f32(float) nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone + +; FUNC-LABEL: @rcp_f32 +; SI: V_RCP_F32_e32 +define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @rcp_f64 +; SI: V_RCP_F64_e32 +define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @rcp_pat_f32 +; SI: V_RCP_F32_e32 +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = fdiv float 1.0, %src + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @rcp_pat_f64 +; SI: V_RCP_F64_e32 +define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = fdiv double 1.0, %src + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @rsq_rcp_pat_f32 +; SI: V_RSQ_F32_e32 +define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone + %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @rsq_rcp_pat_f64 +; SI: V_RSQ_F64_e32 +define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double 
%src) nounwind { + %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone + %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll new file mode 100644 index 0000000..1c736d4 --- /dev/null +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone + +; SI-LABEL: @test_trig_preop_f64: +; SI-DAG: BUFFER_LOAD_DWORD [[SEG:v[0-9]+]] +; SI-DAG: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] +; SI: BUFFER_STORE_DWORDX2 [[RESULT]], +; SI: S_ENDPGM +define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load double addrspace(1)* %aptr, align 8 + %b = load i32 addrspace(1)* %bptr, align 4 + %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: @test_trig_preop_f64_imm_segment: +; SI: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 +; SI: BUFFER_STORE_DWORDX2 [[RESULT]], +; SI: S_ENDPGM +define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { + %a = load double addrspace(1)* %aptr, align 8 + %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/R600/pv.ll b/llvm/test/CodeGen/R600/pv.ll index f322bc7..55eb56d 100644 --- a/llvm/test/CodeGen/R600/pv.ll +++ 
b/llvm/test/CodeGen/R600/pv.ll @@ -103,7 +103,7 @@ main_body: %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3 %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95) %97 = call float @fabs(float %96) - %98 = call float @llvm.AMDGPU.rsq(float %97) + %98 = call float @llvm.AMDGPU.rsq.f32(float %97) %99 = fmul float %4, %98 %100 = fmul float %5, %98 %101 = fmul float %6, %98 @@ -225,7 +225,7 @@ declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 declare float @fabs(float) #2 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq(float) #1 +declare float @llvm.AMDGPU.rsq.f32(float) #1 ; Function Attrs: readnone declare float @llvm.AMDIL.clamp.(float, float, float) #1 diff --git a/llvm/test/CodeGen/R600/sgpr-copy.ll b/llvm/test/CodeGen/R600/sgpr-copy.ll index c581d86..c7d5bf9 100644 --- a/llvm/test/CodeGen/R600/sgpr-copy.ll +++ b/llvm/test/CodeGen/R600/sgpr-copy.ll @@ -70,7 +70,7 @@ main_body: %55 = fadd float %54, %53 %56 = fmul float %45, %45 %57 = fadd float %55, %56 - %58 = call float @llvm.AMDGPU.rsq(float %57) + %58 = call float @llvm.AMDGPU.rsq.f32(float %57) %59 = fmul float %43, %58 %60 = fmul float %44, %58 %61 = fmul float %45, %58 @@ -212,7 +212,7 @@ declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq(float) #3 +declare float @llvm.AMDGPU.rsq.f32(float) #3 ; Function Attrs: readnone declare float @llvm.AMDIL.exp.(float) #3 diff --git a/llvm/test/CodeGen/R600/si-sgpr-spill.ll b/llvm/test/CodeGen/R600/si-sgpr-spill.ll index b34a757..53a0965 100644 --- a/llvm/test/CodeGen/R600/si-sgpr-spill.ll +++ b/llvm/test/CodeGen/R600/si-sgpr-spill.ll @@ -203,7 +203,7 @@ main_body: %198 = fadd float %197, %196 %199 = fmul float %97, %97 %200 = fadd float %198, %199 - %201 = call float @llvm.AMDGPU.rsq(float %200) + %201 = call float @llvm.AMDGPU.rsq.f32(float %200) %202 = fmul float 
%95, %201 %203 = fmul float %96, %201 %204 = fmul float %202, %29 @@ -384,7 +384,7 @@ IF67: ; preds = %LOOP65 %355 = fadd float %354, %353 %356 = fmul float %352, %352 %357 = fadd float %355, %356 - %358 = call float @llvm.AMDGPU.rsq(float %357) + %358 = call float @llvm.AMDGPU.rsq.f32(float %357) %359 = fmul float %350, %358 %360 = fmul float %351, %358 %361 = fmul float %352, %358 @@ -512,7 +512,7 @@ IF67: ; preds = %LOOP65 %483 = fadd float %482, %481 %484 = fmul float %109, %109 %485 = fadd float %483, %484 - %486 = call float @llvm.AMDGPU.rsq(float %485) + %486 = call float @llvm.AMDGPU.rsq.f32(float %485) %487 = fmul float %107, %486 %488 = fmul float %108, %486 %489 = fmul float %109, %486 @@ -541,7 +541,7 @@ IF67: ; preds = %LOOP65 %512 = fadd float %511, %510 %513 = fmul float %97, %97 %514 = fadd float %512, %513 - %515 = call float @llvm.AMDGPU.rsq(float %514) + %515 = call float @llvm.AMDGPU.rsq.f32(float %514) %516 = fmul float %95, %515 %517 = fmul float %96, %515 %518 = fmul float %97, %515 @@ -658,7 +658,7 @@ declare i32 @llvm.SI.tid() #2 declare float @ceil(float) #3 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq(float) #2 +declare float @llvm.AMDGPU.rsq.f32(float) #2 ; Function Attrs: nounwind readnone declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1 @@ -887,7 +887,7 @@ main_body: %212 = fadd float %211, %210 %213 = fmul float %209, %209 %214 = fadd float %212, %213 - %215 = call float @llvm.AMDGPU.rsq(float %214) + %215 = call float @llvm.AMDGPU.rsq.f32(float %214) %216 = fmul float %205, %215 %217 = fmul float %207, %215 %218 = fmul float %209, %215 @@ -1123,7 +1123,7 @@ IF189: ; preds = %LOOP %434 = fsub float -0.000000e+00, %433 %435 = fadd float 0x3FF00068E0000000, %434 %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00) - %437 = call float @llvm.AMDGPU.rsq(float %436) + %437 = call float @llvm.AMDGPU.rsq.f32(float %436) %438 = fmul float %437, %436 %439 = 
fsub float -0.000000e+00, %436 %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00) @@ -1147,7 +1147,7 @@ IF189: ; preds = %LOOP %458 = fadd float %457, %456 %459 = fmul float %455, %455 %460 = fadd float %458, %459 - %461 = call float @llvm.AMDGPU.rsq(float %460) + %461 = call float @llvm.AMDGPU.rsq.f32(float %460) %462 = fmul float %451, %461 %463 = fmul float %453, %461 %464 = fmul float %455, %461 @@ -1257,7 +1257,7 @@ ENDIF197: ; preds = %IF189, %IF198 %559 = fadd float %558, %557 %560 = fmul float %556, %556 %561 = fadd float %559, %560 - %562 = call float @llvm.AMDGPU.rsq(float %561) + %562 = call float @llvm.AMDGPU.rsq.f32(float %561) %563 = fmul float %562, %561 %564 = fsub float -0.000000e+00, %561 %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00) diff --git a/llvm/test/Transforms/InstCombine/r600-intrinsics.ll b/llvm/test/Transforms/InstCombine/r600-intrinsics.ll new file mode 100644 index 0000000..1db6b0d --- /dev/null +++ b/llvm/test/Transforms/InstCombine/r600-intrinsics.ll @@ -0,0 +1,47 @@ +; RUN: opt -instcombine -S < %s | FileCheck %s + +declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone + +; CHECK-LABEL: @test_constant_fold_rcp_f32_1 +; CHECK-NEXT: ret float 1.000000e+00 +define float @test_constant_fold_rcp_f32_1() nounwind { + %val = call float @llvm.AMDGPU.rcp.f32(float 1.0) nounwind readnone + ret float %val +} + +; CHECK-LABEL: @test_constant_fold_rcp_f64_1 +; CHECK-NEXT: ret double 1.000000e+00 +define double @test_constant_fold_rcp_f64_1() nounwind { + %val = call double @llvm.AMDGPU.rcp.f64(double 1.0) nounwind readnone + ret double %val +} + +; CHECK-LABEL: @test_constant_fold_rcp_f32_half +; CHECK-NEXT: ret float 2.000000e+00 +define float @test_constant_fold_rcp_f32_half() nounwind { + %val = call float @llvm.AMDGPU.rcp.f32(float 0.5) nounwind readnone + ret float %val +} + +; CHECK-LABEL: 
@test_constant_fold_rcp_f64_half +; CHECK-NEXT: ret double 2.000000e+00 +define double @test_constant_fold_rcp_f64_half() nounwind { + %val = call double @llvm.AMDGPU.rcp.f64(double 0.5) nounwind readnone + ret double %val +} + +; CHECK-LABEL: @test_constant_fold_rcp_f32_43 +; CHECK-NEXT: call float @llvm.AMDGPU.rcp.f32(float 4.300000e+01) +define float @test_constant_fold_rcp_f32_43() nounwind { + %val = call float @llvm.AMDGPU.rcp.f32(float 4.300000e+01) nounwind readnone + ret float %val +} + +; CHECK-LABEL: @test_constant_fold_rcp_f64_43 +; CHECK-NEXT: call double @llvm.AMDGPU.rcp.f64(double 4.300000e+01) +define double @test_constant_fold_rcp_f64_43() nounwind { + %val = call double @llvm.AMDGPU.rcp.f64(double 4.300000e+01) nounwind readnone + ret double %val +} + -- 2.7.4