Combine V_RCP and V_SQRT into V_RSQ on AMDGPU for GlobalISel.
Change-Id: I93c5dcb412483156a6e8b68c4085cbce83ac9703
return UnaryOp_match<SrcTy, TargetOpcode::COPY>(std::forward<SrcTy>(Src));
}
+/// Build a unary-operand pattern for TargetOpcode::G_FSQRT that wraps the
+/// sub-pattern \p Src.
+template <typename SrcTy>
+inline UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT> m_GFSqrt(const SrcTy &Src) {
+  using FSqrtMatcher = UnaryOp_match<SrcTy, TargetOpcode::G_FSQRT>;
+  return FSqrtMatcher(Src);
+}
+
// General helper for generic MI compares, i.e. G_ICMP and G_FCMP
// TODO: Allow checking a specific predicate.
template <typename Pred_P, typename LHS_P, typename RHS_P, unsigned Opcode>
[{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]),
(apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>;
+
+def rcp_sqrt_to_rsq : GICombineRule< // Rooted at G_INTRINSIC or G_FSQRT; matchRcpSqrtToRsq decides whether the root folds.
+ (defs root:$rcp, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_INTRINSIC, G_FSQRT):$rcp,
+ [{ return PostLegalizerHelper.matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>; // NOTE(review): presumably runs the build function the matcher stored in $matchinfo - confirm against CombinerHelper.
+
+
def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">;
def cvt_f32_ubyteN : GICombineRule<
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPostLegalizerCombinerHelper",
[all_combines, gfx6gfx7_combines,
- uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg]> {
+ uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
+ rcp_sqrt_to_rsq]> {
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
let AdditionalArguments = [];
(RcpInst $src)
>;
-class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
- (AMDGPUrcp (fsqrt vt:$src)),
- (RsqInst $src)
->;
-
// Instructions which select to the same v_min_f*
def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
[(fminnum_ieee node:$src0, node:$src1),
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
bool matchUCharToFloat(MachineInstr &MI);
void applyUCharToFloat(MachineInstr &MI);
+ bool matchRcpSqrtToRsq(MachineInstr &MI,
+ std::function<void(MachineIRBuilder &)> &MatchInfo);
+
// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.
struct CvtF32UByteMatchInfo {
MI.eraseFromParent();
}
+/// Match rcp(sqrt(x)) or sqrt(rcp(x)) rooted at \p MI and, on success, store
+/// in \p MatchInfo a build function that emits amdgcn.rsq(x) into MI's result
+/// operand, carrying over MI's flags.
+///
+/// The source value x is tracked as a register, not through the instruction
+/// that defines it: operand 0 of the defining instruction is not guaranteed
+/// to be x (for example when x is a later result of a multi-def instruction),
+/// and it is not guaranteed to be a register operand at all.
+///
+/// \returns true if \p MatchInfo was populated. The stored function refers to
+/// \p MI by reference, so it has to be invoked while MI is still alive.
+bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
+    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+  // Source register of \p I when it is an amdgcn.rcp call, otherwise an
+  // invalid (default-constructed) Register.
+  auto getRcpSrc = [](const MachineInstr &I) -> Register {
+    if (I.getOpcode() == TargetOpcode::G_INTRINSIC &&
+        I.getIntrinsicID() == Intrinsic::amdgcn_rcp)
+      return I.getOperand(2).getReg();
+    return Register();
+  };
+
+  // Source register of \p I when it is a G_FSQRT, otherwise an invalid
+  // Register. The opcode is checked before any operand is touched.
+  auto getSqrtSrc = [](const MachineInstr &I) -> Register {
+    if (I.getOpcode() == TargetOpcode::G_FSQRT)
+      return I.getOperand(1).getReg();
+    return Register();
+  };
+
+  // Instruction defining \p Reg, or nullptr when \p Reg is invalid or has no
+  // definition known to MRI.
+  auto getDef = [this](Register Reg) -> const MachineInstr * {
+    return Reg.isValid() ? MRI.getVRegDef(Reg) : nullptr;
+  };
+
+  // Record the rewrite: rsq(Src) replaces the value produced by MI.
+  auto recordRsq = [&MatchInfo, &MI](Register Src) {
+    MatchInfo = [Src, &MI](MachineIRBuilder &B) {
+      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+          .addUse(Src)
+          .setMIFlags(MI.getFlags());
+    };
+  };
+
+  // rcp(sqrt(x))
+  if (const MachineInstr *SqrtMI = getDef(getRcpSrc(MI))) {
+    const Register X = getSqrtSrc(*SqrtMI);
+    if (X.isValid()) {
+      recordRsq(X);
+      return true;
+    }
+  }
+
+  // sqrt(rcp(x))
+  if (const MachineInstr *RcpMI = getDef(getSqrtSrc(MI))) {
+    const Register X = getRcpSrc(*RcpMI);
+    if (X.isValid()) {
+      recordRsq(X);
+      return true;
+    }
+  }
+
+  return false;
+}
+
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
Register SrcReg = MI.getOperand(1).getReg();
def COS_cm : COS_Common<0x8E>;
} // End isVector = 1
-def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
-
def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
-def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;
def SIN_eg : SIN_Common<0x8D>;
defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
- def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;
def R600_ExportSwz : ExportSwzInst {
let OtherPredicates = [UnsafeFPMath] in {
-//defm : RsqPat<V_RSQ_F32_e32, f32>;
-
-def : RsqPat<V_RSQ_F32_e32, f32>;
-
// Convert (x - floor(x)) to fract(x)
def : GCNPat <
(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+define amdgpu_cs float @div_sqrt(float inreg %arg1) {
+; GCN-LABEL: div_sqrt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = call float @llvm.sqrt.f32(float %arg1)
+ %b = fdiv afn float 1.000000e+00, %a ; afn 1.0 / sqrt(x); the checks above expect a single v_rsq_f32
+ ret float %b
+}
+
+define amdgpu_cs float @sqrt_div(float inreg %arg1) {
+; GCN-LABEL: sqrt_div:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = fdiv afn float 1.000000e+00, %arg1
+ %b = call float @llvm.sqrt.f32(float %a) ; sqrt(afn 1.0 / x); the checks above expect a single v_rsq_f32
+ ret float %b
+}
+
+define amdgpu_cs float @rcp_sqrt(float inreg %arg1) {
+; GCN-LABEL: rcp_sqrt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = call float @llvm.sqrt.f32(float %arg1)
+ %b = call float @llvm.amdgcn.rcp.f32(float %a) ; rcp(sqrt(x)) with no fast-math flags; the checks above still expect a single v_rsq_f32
+ ret float %b
+}
+
+define amdgpu_cs float @sqrt_rcp(float inreg %arg1) {
+; GCN-LABEL: sqrt_rcp:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = call float @llvm.amdgcn.rcp.f32(float %arg1)
+ %b = call float @llvm.sqrt.f32(float %a) ; sqrt(rcp(x)) with no fast-math flags; the checks above still expect a single v_rsq_f32
+ ret float %b
+}
+
+
+declare float @llvm.sqrt.f32(float)
+declare float @llvm.amdgcn.rcp.f32(float)
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: rcp_sqrt_test
+body: |
+  bb.0:
+    liveins: $sgpr0
+
+    ; rcp(sqrt(x)): the afn flag on the root G_INTRINSIC is expected on the
+    ; resulting rsq.
+    ; GCN-LABEL: name: rcp_sqrt_test
+    ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+    ; GCN: $vgpr0 = COPY [[INT]](s32)
+    ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %2:_(s32) = G_FSQRT %0:_
+    %3:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %2:_(s32)
+    $vgpr0 = COPY %3:_(s32)
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name: sqrt_rcp_test
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GCN-LABEL: name: sqrt_rcp_test
+ ; GCN: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GCN: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+ ; GCN: $vgpr0 = COPY [[INT]](s32)
+ ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:_(s32) = COPY $sgpr0
+ %2:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %0:_(s32)
+ %3:_(s32) = G_FSQRT %2:_ ; root of sqrt(rcp(x)); it carries no afn, and the expected rsq above has none either
+ $vgpr0 = COPY %3:_(s32)
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...