From: Matt Arsenault Date: Sat, 25 Jul 2020 20:26:33 +0000 (-0400) Subject: AMDGPU/GlobalISel: Implement expansion for rsq.clamp X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5a503521e7b757bda70325f4c01bdbc0f4e3128e;p=platform%2Fupstream%2Fllvm.git AMDGPU/GlobalISel: Implement expansion for rsq.clamp Not sure why we handle this removed instruction on newer subtargets for this one and no others, but maintain compatibility with the DAG. --- diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 457cb61..fb5bdf8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3160,6 +3160,55 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, return true; } +// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. +// FIXME: Why do we handle this one but not other removed instructions? +// +// Reciprocal square root. The clamp prevents infinite results, clamping +// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to +// +-max_float. +bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) + return true; + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(2).getReg(); + auto Flags = MI.getFlags(); + + LLT Ty = MRI.getType(Dst); + + const fltSemantics *FltSemantics; + if (Ty == LLT::scalar(32)) + FltSemantics = &APFloat::IEEEsingle(); + else if (Ty == LLT::scalar(64)) + FltSemantics = &APFloat::IEEEdouble(); + else + return false; + + auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) + .addUse(Src) + .setMIFlags(Flags); + + // We don't need to concern ourselves with the snan handling difference, since + // the rsq quieted (or not) so use the one which will directly select. 
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + const bool UseIEEE = MFI->getMode().IEEE; + + auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); + auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : + B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); + + auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); + + if (UseIEEE) + B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); + else + B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -4393,6 +4442,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeTrapIntrinsic(MI, MRI, B); case Intrinsic::debugtrap: return legalizeDebugTrapIntrinsic(MI, MRI, B); + case Intrinsic::amdgcn_rsq_clamp: + return legalizeRsqClampIntrinsic(MI, MRI, B); default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 9919148..3e3e1f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -128,6 +128,9 @@ public: bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.rsq.clamp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.rsq.clamp.mir new file mode 100644 index 0000000..890a224 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.rsq.clamp.mir @@ -0,0 +1,63 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py 
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s + +--- +name: test_rsq_clamp_flags_ieee_on_f32 +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: true + +body: | + bb.0: + liveins: $vgpr0 + + ; SI-LABEL: name: test_rsq_clamp_flags_ieee_on_f32 + ; SI: liveins: $vgpr0 + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; SI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), [[COPY]](s32) + ; SI: $vgpr0 = COPY [[INT]](s32) + ; VI-LABEL: name: test_rsq_clamp_flags_ieee_on_f32 + ; VI: liveins: $vgpr0 + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; VI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x47EFFFFFE0000000 + ; VI: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMINNUM_IEEE [[INT]], [[C]] + ; VI: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC7EFFFFFE0000000 + ; VI: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMAXNUM_IEEE [[FMINNUM_IEEE]], [[C1]] + ; VI: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0 + $vgpr0 = COPY %1 +... 
+ +--- +name: test_rsq_clamp_flags_ieee_off_f32 +tracksRegLiveness: true +machineFunctionInfo: + mode: + ieee: false + +body: | + bb.0: + liveins: $vgpr0 + + ; SI-LABEL: name: test_rsq_clamp_flags_ieee_off_f32 + ; SI: liveins: $vgpr0 + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; SI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), [[COPY]](s32) + ; SI: $vgpr0 = COPY [[INT]](s32) + ; VI-LABEL: name: test_rsq_clamp_flags_ieee_off_f32 + ; VI: liveins: $vgpr0 + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; VI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x47EFFFFFE0000000 + ; VI: [[FMINNUM:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMINNUM [[INT]], [[C]] + ; VI: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC7EFFFFFE0000000 + ; VI: [[FMAXNUM:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMAXNUM [[FMINNUM]], [[C1]] + ; VI: $vgpr0 = COPY [[FMAXNUM]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0 + $vgpr0 = COPY %1 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll new file mode 100644 index 0000000..bd570df --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s + +define float @v_rsq_clamp_f32(float %src) #0 { +; SI-LABEL: v_rsq_clamp_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f32_e32 v0, v0 +; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0 +; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src) + ret float %rsq_clamp +} + +define float @v_rsq_clamp_fabs_f32(float %src) #0 { +; SI-LABEL: v_rsq_clamp_fabs_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f32_e64 v0, |v0| +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_fabs_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f32_e64 v0, |v0| +; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0 +; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %fabs.src = call float @llvm.fabs.f32(float %src) + %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %fabs.src) + ret float %rsq_clamp +} + +define double @v_rsq_clamp_f64(double %src) #0 { +; SI-LABEL: v_rsq_clamp_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1] +; 
SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; VI-NEXT: s_mov_b32 s4, -1 +; VI-NEXT: s_mov_b32 s5, 0x7fefffff +; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s5, 0xffefffff +; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src) + ret double %rsq_clamp +} + +define double @v_rsq_clamp_fabs_f64(double %src) #0 { +; SI-LABEL: v_rsq_clamp_fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f64_e64 v[0:1], |v[0:1]| +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_fabs_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]| +; VI-NEXT: s_mov_b32 s4, -1 +; VI-NEXT: s_mov_b32 s5, 0x7fefffff +; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s5, 0xffefffff +; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] + %fabs.src = call double @llvm.fabs.f64(double %src) + %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src) + ret double %rsq_clamp +} + +define float @v_rsq_clamp_undef_f32() #0 { +; SI-LABEL: v_rsq_clamp_undef_f32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f32_e32 v0, s4 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_undef_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f32_e32 v0, s4 +; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0 +; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef) + ret float %rsq_clamp +} + +define double @v_rsq_clamp_undef_f64() #0 { +; SI-LABEL: v_rsq_clamp_undef_f64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], s[4:5] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_undef_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f64_e32 v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s4, -1 +; VI-NEXT: s_mov_b32 s5, 0x7fefffff +; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s5, 0xffefffff +; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef) + ret double %rsq_clamp +} + +define float @v_rsq_clamp_f32_non_ieee(float %src) #2 { +; SI-LABEL: v_rsq_clamp_f32_non_ieee: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_f32_non_ieee: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f32_e32 v0, v0 +; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0 +; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src) + ret float %rsq_clamp +} + +define double @v_rsq_clamp_f64_non_ieee(double %src) #2 { +; SI-LABEL: v_rsq_clamp_f64_non_ieee: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1] +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_rsq_clamp_f64_non_ieee: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; VI-NEXT: s_mov_b32 s4, -1 +; VI-NEXT: s_mov_b32 s5, 0x7fefffff +; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_mov_b32 s5, 0xffefffff +; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] + %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src) + ret double %rsq_clamp +} + +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.amdgcn.rsq.clamp.f32(float) #1 +declare double @llvm.fabs.f64(double) #1 
+declare double @llvm.amdgcn.rsq.clamp.f64(double) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "amdgpu-ieee"="false" }