From 257d48d22cadb5677b1be4a756b5bc74f286139b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 24 Jun 2014 22:13:39 +0000 Subject: [PATCH] R600: Fix inconsistency in rsq instructions. R600 was using a clamped version of rsq, but SI was not. Add a new rsq_clamped intrinsic and use the rsq variants consistently. It's unclear to me from the documentation what behavior the R600 instructions have, so I assume they have the legacy behavior described by the SI documents. For R600, use RECIPSQRT_IEEE for both llvm.AMDGPU.legacy.rsq and llvm.AMDGPU.rsq. R600 also has RECIPSQRT_FF, but I'm not sure how it fits in here. llvm-svn: 211637 --- llvm/include/llvm/IR/IntrinsicsR600.td | 3 +++ llvm/lib/Target/R600/AMDGPUISelLowering.cpp | 8 ++++++++ llvm/lib/Target/R600/AMDGPUISelLowering.h | 2 ++ llvm/lib/Target/R600/AMDGPUInstrInfo.td | 6 ++++++ llvm/lib/Target/R600/AMDGPUInstructions.td | 15 +++++++++++---- llvm/lib/Target/R600/AMDGPUIntrinsics.td | 8 ++++++++ llvm/lib/Target/R600/R600ISelLowering.cpp | 3 +++ llvm/lib/Target/R600/R600Instructions.td | 9 ++++++--- llvm/lib/Target/R600/SIInstructions.td | 18 +++++++++++------- llvm/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll | 13 +++++++++++++ llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll | 11 +++++++++++ llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll | 14 ++++++++++++++ llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.ll | 13 +++++++++++++ 13 files changed, 109 insertions(+), 14 deletions(-) create mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll create mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll create mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll create mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.ll diff --git a/llvm/include/llvm/IR/IntrinsicsR600.td b/llvm/include/llvm/IR/IntrinsicsR600.td index 2566cc2..ba69eaa 100644 --- a/llvm/include/llvm/IR/IntrinsicsR600.td +++ b/llvm/include/llvm/IR/IntrinsicsR600.td @@ -66,4 +66,7 @@ def int_AMDGPU_rcp : 
GCCBuiltin<"__builtin_amdgpu_rcp">, def int_AMDGPU_rsq : GCCBuiltin<"__builtin_amdgpu_rsq">, Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; +def int_AMDGPU_rsq_clamped : GCCBuiltin<"__builtin_amdgpu_rsq_clamped">, + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + } // End TargetPrefix = "AMDGPU" diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp index 6b70d4c..f467436 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp @@ -805,6 +805,12 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::AMDGPU_rsq: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_legacy_rsq: + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq_clamped: + return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_imax: return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -2052,6 +2058,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TRIG_PREOP) NODE_NAME_CASE(RCP) NODE_NAME_CASE(RSQ) + NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(RSQ_CLAMPED) NODE_NAME_CASE(DOT4) NODE_NAME_CASE(BFE_U32) NODE_NAME_CASE(BFE_I32) diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h index 7d9dcc9..307921e 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.h +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h @@ -192,6 +192,8 @@ enum { // For f64, max error 2^29 ULP, handles denormals. RCP, RSQ, + RSQ_LEGACY, + RSQ_CLAMPED, DOT4, BFE_U32, // Extract range of bits with zero extension to 32-bits. BFE_I32, // Extract range of bits with sign extension to 32-bits. 
diff --git a/llvm/lib/Target/R600/AMDGPUInstrInfo.td b/llvm/lib/Target/R600/AMDGPUInstrInfo.td index d0ee40a..934d59d 100644 --- a/llvm/lib/Target/R600/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/R600/AMDGPUInstrInfo.td @@ -43,6 +43,12 @@ def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; +// out = 1.0 / sqrt(a) +def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) result clamped to +/- max_float. +def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; + // out = max(a, b) a and b are floats def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] diff --git a/llvm/lib/Target/R600/AMDGPUInstructions.td b/llvm/lib/Target/R600/AMDGPUInstructions.td index 14bfd8c..b86b781 100644 --- a/llvm/lib/Target/R600/AMDGPUInstructions.td +++ b/llvm/lib/Target/R600/AMDGPUInstructions.td @@ -524,10 +524,17 @@ class RcpPat : Pat < (RcpInst $src) >; -class RsqPat : Pat < - (AMDGPUrcp (fsqrt vt:$src)), - (RsqInst $src) ->; +multiclass RsqPat { + def : Pat < + (fdiv FP_ONE, (fsqrt vt:$src)), + (RsqInst $src) + >; + + def : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) + >; +} include "R600Instructions.td" include "R700Instructions.td" diff --git a/llvm/lib/Target/R600/AMDGPUIntrinsics.td b/llvm/lib/Target/R600/AMDGPUIntrinsics.td index 27c0dbe..d934676 100644 --- a/llvm/lib/Target/R600/AMDGPUIntrinsics.td +++ b/llvm/lib/Target/R600/AMDGPUIntrinsics.td @@ -24,6 +24,14 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + + // This is named backwards (instead of rsq_legacy) so we don't have + // to define it 
with the public builtins intrinsics. This is a + // workaround for how intrinsic names are parsed. If the name is + // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant + // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. + def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; def int_AMDGPU_kilp : Intrinsic<[], [], []>; diff --git a/llvm/lib/Target/R600/R600ISelLowering.cpp b/llvm/lib/Target/R600/R600ISelLowering.cpp index 13d555e..996117c 100644 --- a/llvm/lib/Target/R600/R600ISelLowering.cpp +++ b/llvm/lib/Target/R600/R600ISelLowering.cpp @@ -814,6 +814,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T0_Z, VT); + case Intrinsic::AMDGPU_rsq: + // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); } // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) break; diff --git a/llvm/lib/Target/R600/R600Instructions.td b/llvm/lib/Target/R600/R600Instructions.td index 616b3b5..73fa345 100644 --- a/llvm/lib/Target/R600/R600Instructions.td +++ b/llvm/lib/Target/R600/R600Instructions.td @@ -1079,18 +1079,21 @@ class RECIP_UINT_Common inst> : R600_1OP_Helper < let Itinerary = TransALU; } +// Clamped to maximum. 
class RECIPSQRT_CLAMPED_Common inst> : R600_1OP_Helper < - inst, "RECIPSQRT_CLAMPED", AMDGPUrsq + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped > { let Itinerary = TransALU; } -class RECIPSQRT_IEEE_Common inst> : R600_1OP < - inst, "RECIPSQRT_IEEE", [] +class RECIPSQRT_IEEE_Common inst> : R600_1OP_Helper < + inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy > { let Itinerary = TransALU; } +// TODO: There is also RECIPSQRT_FF which clamps to zero. + class SIN_Common inst> : R600_1OP < inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{ let Trig = 1; diff --git a/llvm/lib/Target/R600/SIInstructions.td b/llvm/lib/Target/R600/SIInstructions.td index 72c51a1..507cfe8 100644 --- a/llvm/lib/Target/R600/SIInstructions.td +++ b/llvm/lib/Target/R600/SIInstructions.td @@ -1123,22 +1123,26 @@ defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", [(set f32:$dst, (AMDGPUrcp f32:$src0))] >; defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; -defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; +defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", + [(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))] +>; defm V_RSQ_LEGACY_F32 : VOP1_32 < 0x0000002d, "V_RSQ_LEGACY_F32", - [(set f32:$dst, (AMDGPUrsq f32:$src0))] + [(set f32:$dst, (AMDGPUrsq_legacy f32:$src0))] >; defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", - [(set f32:$dst, (fdiv FP_ONE, (fsqrt f32:$src0)))] + [(set f32:$dst, (AMDGPUrsq f32:$src0))] >; defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", [(set f64:$dst, (AMDGPUrcp f64:$src0))] >; defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", - [(set f64:$dst, (fdiv FP_ONE, (fsqrt f64:$src0)))] + [(set f64:$dst, (AMDGPUrsq f64:$src0))] +>; +defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", + [(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))] >; -defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>; defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", [(set 
f32:$dst, (fsqrt f32:$src0))] >; @@ -1781,8 +1785,8 @@ def : Pat < def : RcpPat; def : RcpPat; -def : RsqPat; -def : RsqPat; +defm : RsqPat; +defm : RsqPat; //===----------------------------------------------------------------------===// // VOP2 Patterns diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll new file mode 100644 index 0000000..51964ee --- /dev/null +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone + +; FUNC-LABEL: @rsq_legacy_f32 +; SI: V_RSQ_LEGACY_F32_e32 +; EG: RECIPSQRT_IEEE +define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll new file mode 100644 index 0000000..100d6ff --- /dev/null +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone + +; FUNC-LABEL: @rsq_clamped_f64 +; SI: V_RSQ_CLAMP_F64_e32 +define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { + %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone + store double %rsq_clamped, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll new file mode 100644 index 
0000000..683df73 --- /dev/null +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone + +; FUNC-LABEL: @rsq_clamped_f32 +; SI: V_RSQ_CLAMP_F32_e32 +; EG: RECIPSQRT_CLAMPED +define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone + store float %rsq_clamped, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.ll new file mode 100644 index 0000000..27cf6b2 --- /dev/null +++ b/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone + +; FUNC-LABEL: @rsq_f32 +; SI: V_RSQ_F32_e32 +; EG: RECIPSQRT_IEEE +define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} -- 2.7.4