return true;
}
+// Legalize llvm.amdgcn.rsq.clamp by expanding it on subtargets that do not
+// support the instruction.
+// FIXME: Why do we handle this one but not other removed instructions?
+//
+// Semantics being emulated: reciprocal square root, D.f = 1.0 / sqrt(S0.f),
+// where the clamp prevents infinite results by clamping infinities to
+// +-max_float.
+//
+// Returns true when \p MI is either left untouched (subtargets before
+// VOLCANIC_ISLANDS) or has been replaced by rsq + fmin + fmax; returns false
+// when the result type is neither a 32-bit nor a 64-bit scalar.
+bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
+                                                    MachineRegisterInfo &MRI,
+                                                    MachineIRBuilder &B) const {
+  // Nothing to expand before VOLCANIC_ISLANDS; keep the intrinsic as is.
+  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return true;
+
+  // Operand 0 is the result and operand 2 is the source value.
+  const Register ResultReg = MI.getOperand(0).getReg();
+  const Register InputReg = MI.getOperand(2).getReg();
+  const auto MIFlags = MI.getFlags();
+  const LLT ResultTy = MRI.getType(ResultReg);
+
+  // Only 32-bit and 64-bit scalars are handled; anything else fails.
+  const bool Is32Bit = ResultTy == LLT::scalar(32);
+  if (!Is32Bit && !(ResultTy == LLT::scalar(64)))
+    return false;
+  const fltSemantics &Semantics =
+      Is32Bit ? APFloat::IEEEsingle() : APFloat::IEEEdouble();
+
+  // Unclamped reciprocal square root, carrying over the original flags.
+  auto RsqInstr = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {ResultTy}, false);
+  RsqInstr.addUse(InputReg);
+  RsqInstr.setMIFlags(MIFlags);
+
+  // The snan handling difference between the two min/max flavors does not
+  // matter here, since the rsq already quieted (or not) its result, so emit
+  // whichever flavor will directly select for the function's IEEE mode.
+  const bool IEEEMode =
+      B.getMF().getInfo<SIMachineFunctionInfo>()->getMode().IEEE;
+
+  // Clamp from above: min(rsq, +max_float).
+  auto PosLimit = B.buildFConstant(ResultTy, APFloat::getLargest(Semantics));
+  MachineInstrBuilder UpperClamped;
+  if (IEEEMode)
+    UpperClamped = B.buildFMinNumIEEE(ResultTy, RsqInstr, PosLimit, MIFlags);
+  else
+    UpperClamped = B.buildFMinNum(ResultTy, RsqInstr, PosLimit, MIFlags);
+
+  // Clamp from below: max(..., -max_float), defining the original result.
+  auto NegLimit = B.buildFConstant(
+      ResultTy, APFloat::getLargest(Semantics, /*Negative=*/true));
+  if (IEEEMode)
+    B.buildFMaxNumIEEE(ResultReg, UpperClamped, NegLimit, MIFlags);
+  else
+    B.buildFMaxNum(ResultReg, UpperClamped, NegLimit, MIFlags);
+
+  MI.eraseFromParent();
+  return true;
+}
+
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
return legalizeTrapIntrinsic(MI, MRI, B);
case Intrinsic::debugtrap:
return legalizeDebugTrapIntrinsic(MI, MRI, B);
+ case Intrinsic::amdgcn_rsq_clamp:
+ return legalizeRsqClampIntrinsic(MI, MRI, B);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s
+
+# Legalizer-level check of llvm.amdgcn.rsq.clamp on s32 with IEEE mode
+# enabled. For tahiti the intrinsic is expected to survive unchanged; for fiji
+# it is expected to become amdgcn.rsq followed by G_FMINNUM_IEEE and
+# G_FMAXNUM_IEEE against the two G_FCONSTANT limits, with the nnan ninf nsz
+# flags carried onto every generated instruction.
+---
+name: test_rsq_clamp_flags_ieee_on_f32
+tracksRegLiveness: true
+machineFunctionInfo:
+ mode:
+ ieee: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; SI-LABEL: name: test_rsq_clamp_flags_ieee_on_f32
+ ; SI: liveins: $vgpr0
+ ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; SI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), [[COPY]](s32)
+ ; SI: $vgpr0 = COPY [[INT]](s32)
+ ; VI-LABEL: name: test_rsq_clamp_flags_ieee_on_f32
+ ; VI: liveins: $vgpr0
+ ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; VI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+ ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x47EFFFFFE0000000
+ ; VI: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMINNUM_IEEE [[INT]], [[C]]
+ ; VI: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC7EFFFFFE0000000
+ ; VI: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMAXNUM_IEEE [[FMINNUM_IEEE]], [[C1]]
+ ; VI: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0
+ $vgpr0 = COPY %1
+...
+
+# Legalizer-level check of llvm.amdgcn.rsq.clamp on s32 with IEEE mode
+# disabled. For tahiti the intrinsic is expected to survive unchanged; for fiji
+# it is expected to become amdgcn.rsq followed by the non-IEEE G_FMINNUM and
+# G_FMAXNUM against the two G_FCONSTANT limits, with the nnan ninf nsz flags
+# carried onto every generated instruction.
+---
+name: test_rsq_clamp_flags_ieee_off_f32
+tracksRegLiveness: true
+machineFunctionInfo:
+ mode:
+ ieee: false
+
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; SI-LABEL: name: test_rsq_clamp_flags_ieee_off_f32
+ ; SI: liveins: $vgpr0
+ ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; SI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), [[COPY]](s32)
+ ; SI: $vgpr0 = COPY [[INT]](s32)
+ ; VI-LABEL: name: test_rsq_clamp_flags_ieee_off_f32
+ ; VI: liveins: $vgpr0
+ ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; VI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+ ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x47EFFFFFE0000000
+ ; VI: [[FMINNUM:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMINNUM [[INT]], [[C]]
+ ; VI: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC7EFFFFFE0000000
+ ; VI: [[FMAXNUM:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMAXNUM [[FMINNUM]], [[C1]]
+ ; VI: $vgpr0 = COPY [[FMAXNUM]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0
+ $vgpr0 = COPY %1
+...
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+
+; Basic f32 case. The tahiti run expects the native v_rsq_clamp_f32; the tonga
+; run expects v_rsq_f32 followed by v_min_f32 / v_max_f32 against the literals
+; 0x7f7fffff and 0xff7fffff.
+define float @v_rsq_clamp_f32(float %src) #0 {
+; SI-LABEL: v_rsq_clamp_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f32_e32 v0, v0
+; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
+; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
+ ret float %rsq_clamp
+}
+
+; f32 case with an llvm.fabs source. Both runs expect the fabs to be folded
+; into the rsq instruction as a |v0| source modifier (e64 encoding); the tonga
+; run additionally expects the v_min_f32 / v_max_f32 clamp.
+define float @v_rsq_clamp_fabs_f32(float %src) #0 {
+; SI-LABEL: v_rsq_clamp_fabs_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f32_e64 v0, |v0|
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_fabs_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f32_e64 v0, |v0|
+; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
+; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %fabs.src = call float @llvm.fabs.f32(float %src)
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %fabs.src)
+ ret float %rsq_clamp
+}
+
+; Basic f64 case. The tahiti run expects the native v_rsq_clamp_f64; the tonga
+; run expects v_rsq_f64 followed by v_min_f64 / v_max_f64, with the 64-bit
+; limits materialized in s[4:5] (low half -1, high half 0x7fefffff and then
+; 0xffefffff).
+define double @v_rsq_clamp_f64(double %src) #0 {
+; SI-LABEL: v_rsq_clamp_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1]
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
+; VI-NEXT: s_mov_b32 s4, -1
+; VI-NEXT: s_mov_b32 s5, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_mov_b32 s5, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
+ ret double %rsq_clamp
+}
+
+; f64 case with an llvm.fabs source. Both runs expect the fabs to be folded
+; into the rsq instruction as a |v[0:1]| source modifier (e64 encoding); the
+; tonga run additionally expects the v_min_f64 / v_max_f64 clamp.
+define double @v_rsq_clamp_fabs_f64(double %src) #0 {
+; SI-LABEL: v_rsq_clamp_fabs_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f64_e64 v[0:1], |v[0:1]|
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_fabs_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]|
+; VI-NEXT: s_mov_b32 s4, -1
+; VI-NEXT: s_mov_b32 s5, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_mov_b32 s5, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %fabs.src = call double @llvm.fabs.f64(double %src)
+ %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src)
+ ret double %rsq_clamp
+}
+
+; f32 case with an undef source. The expected instruction sequence has the
+; same shape as for a register source; the checks record s4 as the source
+; operand of the rsq instruction.
+define float @v_rsq_clamp_undef_f32() #0 {
+; SI-LABEL: v_rsq_clamp_undef_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f32_e32 v0, s4
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_undef_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f32_e32 v0, s4
+; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
+; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
+ ret float %rsq_clamp
+}
+
+; f64 case with an undef source. The expected instruction sequence has the
+; same shape as for a register source; the checks record s[4:5] as the source
+; operand of the rsq instruction.
+define double @v_rsq_clamp_undef_f64() #0 {
+; SI-LABEL: v_rsq_clamp_undef_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], s[4:5]
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_undef_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f64_e32 v[0:1], s[4:5]
+; VI-NEXT: s_mov_b32 s4, -1
+; VI-NEXT: s_mov_b32 s5, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_mov_b32 s5, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
+ ret double %rsq_clamp
+}
+
+; f32 case compiled with attribute group #2 ("amdgpu-ieee"="false"). The
+; expected ISA is the same as in the default-mode f32 test: native
+; v_rsq_clamp_f32 for tahiti, v_rsq_f32 + v_min_f32 + v_max_f32 for tonga.
+define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
+; SI-LABEL: v_rsq_clamp_f32_non_ieee:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_f32_non_ieee:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f32_e32 v0, v0
+; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
+; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
+ ret float %rsq_clamp
+}
+
+; f64 case compiled with attribute group #2 ("amdgpu-ieee"="false"). The
+; expected ISA is the same as in the default-mode f64 test: native
+; v_rsq_clamp_f64 for tahiti, v_rsq_f64 + v_min_f64 + v_max_f64 for tonga.
+define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
+; SI-LABEL: v_rsq_clamp_f64_non_ieee:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1]
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_f64_non_ieee:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
+; VI-NEXT: s_mov_b32 s4, -1
+; VI-NEXT: s_mov_b32 s5, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_mov_b32 s5, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
+ ret double %rsq_clamp
+}
+
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.amdgcn.rsq.clamp.f32(float) #1
+declare double @llvm.fabs.f64(double) #1
+declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "amdgpu-ieee"="false" }