From 5aa6e246a1e44655a66581bc2ca6a20e3051e7e9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Fri, 24 Jan 2020 10:01:15 -0500
Subject: [PATCH] AMDGPU/GlobalISel: Legalize f64 G_FFLOOR for SI

Compared to the DAG version, use cmp ord instead of cmp_class for the
nan check, but otherwise mostly try to match the existing pattern.

I think the sign doesn't matter for fract, so we could do a little
better with the source modifier matching.

I think this is also still broken as in D22898, but I'm leaving it
as-is for now since I don't have an SI system to test on.
---
 .../llvm/CodeGen/GlobalISel/MachineIRBuilder.h     |  24 ++
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp     |  86 +++++-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h       |   3 +
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h           |   4 +
 llvm/lib/Target/AMDGPU/SIInstructions.td           |   5 +
 llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll   | 298 +++++++++++++++++++++
 .../AMDGPU/GlobalISel/inst-select-ffloor.mir       | 150 ----------
 .../AMDGPU/GlobalISel/inst-select-ffloor.s32.mir   |  83 ++++++
 .../AMDGPU/GlobalISel/inst-select-ffloor.s64.mir   |  59 ++++
 .../CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir  |  86 +++++-
 .../CodeGen/AMDGPU/GlobalISel/legalize-fptosi.mir  |  53 ++--
 .../CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir  |  53 ++--
 12 files changed, 716 insertions(+), 188 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s32.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s64.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 1823f28..003903e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1285,6 +1285,30 @@ public:
     return buildInstr(TargetOpcode::G_FMUL, {Dst}, {Src0, Src1}, Flags);
   }
 
+  MachineInstrBuilder buildFMinNum(const DstOp &Dst, const SrcOp &Src0,
+                                   const SrcOp &Src1,
+                                   Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_FMINNUM, {Dst}, {Src0, Src1}, Flags);
+  }
+
+  MachineInstrBuilder buildFMaxNum(const DstOp &Dst, const SrcOp &Src0,
+                                   const SrcOp &Src1,
+                                   Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_FMAXNUM, {Dst}, {Src0, Src1}, Flags);
+  }
+
+  MachineInstrBuilder buildFMinNumIEEE(const DstOp &Dst, const SrcOp &Src0,
+                                       const SrcOp &Src1,
+                                       Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_FMINNUM_IEEE, {Dst}, {Src0, Src1}, Flags);
+  }
+
+  MachineInstrBuilder buildFMaxNumIEEE(const DstOp &Dst, const SrcOp &Src0,
+                                       const SrcOp &Src1,
+                                       Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_FMAXNUM_IEEE, {Dst}, {Src0, Src1}, Flags);
+  }
+
   MachineInstrBuilder buildShl(const DstOp &Dst, const SrcOp &Src0,
                                const SrcOp &Src1,
                                Optional<unsigned> Flags = None) {
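The four builders added above are thin, uniform wrappers over buildInstr for
the G_FMINNUM/G_FMAXNUM opcodes and their IEEE-semantics variants. A minimal
sketch of how they compose in the f64 G_FFLOOR lowering that follows, assuming
a MachineIRBuilder B positioned at the instruction (the helper name here is
hypothetical, not part of the patch):

    // Clamp an amdgcn.fract result to the largest f64 strictly below 1.0,
    // picking the IEEE flavor according to the function's FP mode. This is
    // an illustration of the new build* helpers, not code from the patch.
    static Register buildClampedFract(MachineIRBuilder &B,
                                      MachineRegisterInfo &MRI,
                                      Register Fract, bool IEEE,
                                      unsigned Flags) {
      const LLT S64 = LLT::scalar(64);
      auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
      Register Min = MRI.createGenericVirtualRegister(S64);
      if (IEEE)
        B.buildFMinNumIEEE(Min, Fract, Const, Flags); // snan-quieting variant
      else
        B.buildFMinNum(Min, Fract, Const, Flags);
      return Min;
    }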
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8cca8e4..c3790fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -417,10 +417,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0)
       .clampScalar(0, S16, S64);
   } else {
-    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
+    getActionDefinitionsBuilder(G_FSQRT)
       .legalFor({S32, S64})
       .scalarize(0)
       .clampScalar(0, S32, S64);
+
+    if (ST.hasFractBug()) {
+      getActionDefinitionsBuilder(G_FFLOOR)
+        .customFor({S64})
+        .legalFor({S32, S64})
+        .scalarize(0)
+        .clampScalar(0, S32, S64);
+    } else {
+      getActionDefinitionsBuilder(G_FFLOOR)
+        .legalFor({S32, S64})
+        .scalarize(0)
+        .clampScalar(0, S32, S64);
+    }
   }
 
   getActionDefinitionsBuilder(G_FPTRUNC)
@@ -1249,6 +1262,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
   case TargetOpcode::G_FEXP:
     return legalizeFExp(MI, B);
+  case TargetOpcode::G_FFLOOR:
+    return legalizeFFloor(MI, MRI, B);
   case TargetOpcode::G_BUILD_VECTOR:
     return legalizeBuildVector(MI, MRI, B);
   default:
@@ -1973,6 +1988,75 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
   return true;
 }
 
+// Find a source register, ignoring any possible source modifiers.
+static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
+  Register ModSrc = OrigSrc;
+  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
+    ModSrc = SrcFNeg->getOperand(1).getReg();
+    if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
+      ModSrc = SrcFAbs->getOperand(1).getReg();
+  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
+    ModSrc = SrcFAbs->getOperand(1).getReg();
+  return ModSrc;
+}
+
+bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
+                                         MachineRegisterInfo &MRI,
+                                         MachineIRBuilder &B) const {
+  B.setInstr(MI);
+
+  const LLT S1 = LLT::scalar(1);
+  const LLT S64 = LLT::scalar(64);
+  Register Dst = MI.getOperand(0).getReg();
+  Register OrigSrc = MI.getOperand(1).getReg();
+  unsigned Flags = MI.getFlags();
+  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
+         "this should not have been custom lowered");
+
+  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
+  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
+  // efficient way to implement it is using V_FRACT_F64. The workaround for the
+  // V_FRACT bug is:
+  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
+  //
+  // Convert floor(x) to (x - fract(x))
+
+  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
+                   .addUse(OrigSrc)
+                   .setMIFlags(Flags);
+
+  // Give source modifier matching some assistance before obscuring a foldable
+  // pattern.
+
+  // TODO: We can avoid the neg on the fract? The input sign to fract
+  // shouldn't matter?
+  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
+
+  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
+
+  Register Min = MRI.createGenericVirtualRegister(S64);
+
+  // We don't need to concern ourselves with the snan handling difference, so
+  // use the one which will directly select.
+  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+  if (MFI->getMode().IEEE)
+    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
+  else
+    B.buildFMinNum(Min, Fract, Const, Flags);
+
+  Register CorrectedFract = Min;
+  if (!MI.getFlag(MachineInstr::FmNoNans)) {
+    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
+    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
+  }
+
+  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
+  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 // Turn an illegal packed v2s16 build vector into bit operations.
 // TODO: This should probably be a bitcast action in LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeBuildVector( diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 421641e..184f4bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -82,6 +82,9 @@ public: bool legalizeFlog(MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const; bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 99cb4e9..6f3ca01 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -506,6 +506,10 @@ public: return getGeneration() >= VOLCANIC_ISLANDS; } + bool hasFractBug() const { + return getGeneration() == SOUTHERN_ISLANDS; + } + bool hasBFE() const { return true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d15a9a0..2dbc668 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1990,6 +1990,11 @@ let SubtargetPredicate = isGFX6 in { // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) // Convert floor(x) to (x - fract(x)) + +// Don't bother handling this for GlobalISel, it's handled during +// lowering. +// +// FIXME: DAG should also custom lower this. def : GCNPat < (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), (V_ADD_F64 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll new file mode 100644 index 0000000..a4f5948 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll @@ -0,0 +1,298 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefix=GFX78 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=GFX78 %s + +define double @v_floor_f64_ieee(double %x) { +; GFX6-LABEL: v_floor_f64_ieee: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] +; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX78-LABEL: v_floor_f64_ieee: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_floor_f64_e32 v[0:1], v[0:1] +; GFX78-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.floor.f64(double %x) + ret double %result +} + +define double @v_floor_f64_ieee_nnan(double %x) { +; GFX6-LABEL: v_floor_f64_ieee_nnan: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX78-LABEL: 
v_floor_f64_ieee_nnan: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_floor_f64_e32 v[0:1], v[0:1] +; GFX78-NEXT: s_setpc_b64 s[30:31] + %result = call nnan double @llvm.floor.f64(double %x) + ret double %result +} + +define double @v_floor_f64_ieee_fneg(double %x) { +; GFX6-LABEL: v_floor_f64_ieee_fneg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] +; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX6-NEXT: v_add_f64 v[0:1], -v[0:1], -v[2:3] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX78-LABEL: v_floor_f64_ieee_fneg: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_floor_f64_e64 v[0:1], -v[0:1] +; GFX78-NEXT: s_setpc_b64 s[30:31] + %neg.x = fneg double %x + %result = call double @llvm.floor.f64(double %neg.x) + ret double %result +} + +define double @v_floor_f64_nonieee(double %x) #1 { +; GFX6-LABEL: v_floor_f64_nonieee: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] +; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX78-LABEL: v_floor_f64_nonieee: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_floor_f64_e32 v[0:1], v[0:1] +; GFX78-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.floor.f64(double %x) + ret double %result +} + +define double @v_floor_f64_nonieee_nnan(double %x) #1 { +; GFX6-LABEL: v_floor_f64_nonieee_nnan: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX78-LABEL: v_floor_f64_nonieee_nnan: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_floor_f64_e32 v[0:1], v[0:1] +; GFX78-NEXT: s_setpc_b64 s[30:31] + %result = call nnan double @llvm.floor.f64(double %x) + ret double %result +} + +define double @v_floor_f64_non_ieee_fneg(double %x) #1 { +; GFX6-LABEL: v_floor_f64_non_ieee_fneg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] +; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX6-NEXT: v_add_f64 v[0:1], -v[0:1], -v[2:3] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX78-LABEL: v_floor_f64_non_ieee_fneg: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_floor_f64_e64 v[0:1], -v[0:1] +; GFX78-NEXT: s_setpc_b64 s[30:31] + %neg.x = fneg double %x + %result = call double @llvm.floor.f64(double %neg.x) + ret double %result +} + +define double 
@v_floor_f64_fabs(double %x) {
+; GFX6-LABEL: v_floor_f64_fabs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_fract_f64_e64 v[2:3], |v[0:1]|
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], |v[0:1]|, -v[2:3]
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX78-LABEL: v_floor_f64_fabs:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX78-NEXT:    v_floor_f64_e64 v[0:1], |v[0:1]|
+; GFX78-NEXT:    s_setpc_b64 s[30:31]
+  %abs.x = call double @llvm.fabs.f64(double %x)
+  %result = call double @llvm.floor.f64(double %abs.x)
+  ret double %result
+}
+
+define double @v_floor_f64_fneg_fabs(double %x) {
+; GFX6-LABEL: v_floor_f64_fneg_fabs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -|v[0:1]|
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT:    v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], -|v[0:1]|, -v[2:3]
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX78-LABEL: v_floor_f64_fneg_fabs:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX78-NEXT:    v_floor_f64_e64 v[0:1], -|v[0:1]|
+; GFX78-NEXT:    s_setpc_b64 s[30:31]
+  %abs.x = call double @llvm.fabs.f64(double %x)
+  %neg.abs.x = fneg double %abs.x
+  %result = call double @llvm.floor.f64(double %neg.abs.x)
+  ret double %result
+}
+
+define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
+; GFX6-LABEL: s_floor_f64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_fract_f64_e32 v[0:1], s[2:3]
+; GFX6-NEXT:    s_mov_b32 s0, -1
+; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT:    v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], s[2:3], -v[0:1]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX78-LABEL: s_floor_f64:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    v_floor_f64_e32 v[0:1], s[2:3]
+; GFX78-NEXT:    ; return to shader part epilog
+  %result = call double @llvm.floor.f64(double %x)
+  %cast = bitcast double %result to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
+; GFX6-LABEL: s_floor_f64_fneg:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_fract_f64_e64 v[0:1], -s[2:3]
+; GFX6-NEXT:    s_mov_b32 s0, -1
+; GFX6-NEXT:    s_mov_b32 s1, 0x3fefffff
+; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT:    v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_add_f64 v[0:1], -s[2:3], -v[0:1]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX78-LABEL: s_floor_f64_fneg:
+; GFX78:       ; %bb.0:
+; GFX78-NEXT:    v_floor_f64_e64 v[0:1], -s[2:3]
+; GFX78-NEXT:    ; return to shader part
epilog + %neg.x = fneg double %x + %result = call double @llvm.floor.f64(double %neg.x) + %cast = bitcast double %result to <2 x float> + ret <2 x float> %cast +} + +define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) { +; GFX6-LABEL: s_floor_f64_fabs: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]| +; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] +; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-NEXT: v_add_f64 v[0:1], |s[2:3]|, -v[0:1] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX78-LABEL: s_floor_f64_fabs: +; GFX78: ; %bb.0: +; GFX78-NEXT: v_floor_f64_e64 v[0:1], |s[2:3]| +; GFX78-NEXT: ; return to shader part epilog + %abs.x = call double @llvm.fabs.f64(double %x) + %result = call double @llvm.floor.f64(double %abs.x) + %cast = bitcast double %result to <2 x float> + ret <2 x float> %cast +} + +define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) { +; GFX6-LABEL: s_floor_f64_fneg_fabs: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]| +; GFX6-NEXT: s_mov_b32 s0, -1 +; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] +; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-NEXT: v_add_f64 v[0:1], -|s[2:3]|, -v[0:1] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX78-LABEL: s_floor_f64_fneg_fabs: +; GFX78: ; %bb.0: +; GFX78-NEXT: v_floor_f64_e64 v[0:1], -|s[2:3]| +; GFX78-NEXT: ; return to shader part epilog + %abs.x = call double @llvm.fabs.f64(double %x) + %neg.abs.x = fneg double %abs.x + %result = call double @llvm.floor.f64(double %neg.abs.x) + %cast = bitcast double %result to <2 x float> + ret <2 x float> %cast +} + +declare double @llvm.floor.f64(double) #0 +declare double @llvm.fabs.f64(double) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } +attributes #1 = { "amdgpu-ieee"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.mir deleted file mode 100644 index 76f16eb..0000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.mir +++ /dev/null @@ -1,150 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck %s - ---- -name: ffloor_s32_vv -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0 - - ; CHECK-LABEL: name: ffloor_s32_vv - ; CHECK: liveins: $vgpr0 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 0, [[COPY]], 0, 0, implicit $exec - ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]] - %0:vgpr(s32) = COPY $vgpr0 - %1:vgpr(s32) = G_FFLOOR %0 - $vgpr0 = COPY %1 -... 
- ---- -name: ffloor_s32_vs -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $sgpr0 - - ; CHECK-LABEL: name: ffloor_s32_vs - ; CHECK: liveins: $sgpr0 - ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 0, [[COPY]], 0, 0, implicit $exec - ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]] - %0:sgpr(s32) = COPY $sgpr0 - %1:vgpr(s32) = G_FFLOOR %0 - $vgpr0 = COPY %1 -... - ---- -name: ffloor_s64_vv -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; CHECK-LABEL: name: ffloor_s64_vv - ; CHECK: liveins: $vgpr0_vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK: [[V_CMP_CLASS_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_CLASS_F64_e64 0, [[COPY]], 3, implicit $exec - ; CHECK: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4607182418800017407, implicit $exec - ; CHECK: [[V_FRACT_F64_e64_:%[0-9]+]]:vreg_64 = V_FRACT_F64_e64 0, [[COPY]], 0, 0, implicit $exec - ; CHECK: [[V_MIN_F64_:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[V_FRACT_F64_e64_]], 0, [[V_MOV_B]], 0, 0, implicit $exec - ; CHECK: [[V_CNDMA:%[0-9]+]]:vreg_64 = V_CNDMASK_B64_PSEUDO [[V_MIN_F64_]], [[COPY]], [[V_CMP_CLASS_F64_e64_]], implicit $exec - ; CHECK: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 0, [[COPY]], 1, [[V_CNDMA]], 0, 0, implicit $exec - ; CHECK: $vgpr0_vgpr1 = COPY [[V_ADD_F64_]] - %0:vgpr(s64) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_FFLOOR %0 - $vgpr0_vgpr1 = COPY %1 -... - -# FIXME: Constant bus restriction -# --- -# name: ffloor_s64_vs -# legalized: true -# regBankSelected: true -# tracksRegLiveness: true - -# body: | -# bb.0: -# liveins: $sgpr0_sgpr1 - -# %0:sgpr(s64) = COPY $sgpr0_sgpr1 -# %1:vgpr(s64) = G_FFLOOR %0 -# $vgpr0_vgpr1 = COPY %1 -# ... - ---- -name: ffloor_fneg_s32_vs -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $sgpr0 - - ; CHECK-LABEL: name: ffloor_fneg_s32_vs - ; CHECK: liveins: $sgpr0 - ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 1, [[COPY]], 0, 0, implicit $exec - ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]] - %0:sgpr(s32) = COPY $sgpr0 - %1:sgpr(s32) = G_FNEG %0 - %2:vgpr(s32) = G_FFLOOR %1 - $vgpr0 = COPY %2 -... - ---- -name: ffloor_fneg_s32_vv -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0 - ; CHECK-LABEL: name: ffloor_fneg_s32_vv - ; CHECK: liveins: $vgpr0 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 1, [[COPY]], 0, 0, implicit $exec - ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]] - %0:vgpr(s32) = COPY $vgpr0 - %1:vgpr(s32) = G_FNEG %0 - %2:vgpr(s32) = G_FFLOOR %1 - $vgpr0 = COPY %2 -... 
- ---- -name: ffloor_fneg_s64_vv -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; CHECK-LABEL: name: ffloor_fneg_s64_vv - ; CHECK: liveins: $vgpr0_vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK: [[V_CMP_CLASS_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_CLASS_F64_e64 0, [[COPY]], 3, implicit $exec - ; CHECK: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4607182418800017407, implicit $exec - ; CHECK: [[V_FRACT_F64_e64_:%[0-9]+]]:vreg_64 = V_FRACT_F64_e64 1, [[COPY]], 0, 0, implicit $exec - ; CHECK: [[V_MIN_F64_:%[0-9]+]]:vreg_64 = V_MIN_F64 0, [[V_FRACT_F64_e64_]], 0, [[V_MOV_B]], 0, 0, implicit $exec - ; CHECK: [[V_CNDMA:%[0-9]+]]:vreg_64 = V_CNDMASK_B64_PSEUDO [[V_MIN_F64_]], [[COPY]], [[V_CMP_CLASS_F64_e64_]], implicit $exec - ; CHECK: [[V_ADD_F64_:%[0-9]+]]:vreg_64 = V_ADD_F64 1, [[COPY]], 1, [[V_CNDMA]], 0, 0, implicit $exec - ; CHECK: $vgpr0_vgpr1 = COPY [[V_ADD_F64_]] - %0:vgpr(s64) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_FNEG %0 - %2:vgpr(s64) = G_FFLOOR %1 - $vgpr0_vgpr1 = COPY %2 -... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s32.mir new file mode 100644 index 0000000..e7d27af --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s32.mir @@ -0,0 +1,83 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: ffloor_s32_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: ffloor_s32_vv + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 0, [[COPY]], 0, 0, implicit $exec + ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_FFLOOR %0 + $vgpr0 = COPY %1 +... + +--- +name: ffloor_s32_vs +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: ffloor_s32_vs + ; CHECK: liveins: $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 0, [[COPY]], 0, 0, implicit $exec + ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:vgpr(s32) = G_FFLOOR %0 + $vgpr0 = COPY %1 +... + +--- +name: ffloor_fneg_s32_vs +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: ffloor_fneg_s32_vs + ; CHECK: liveins: $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 1, [[COPY]], 0, 0, implicit $exec + ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_FNEG %0 + %2:vgpr(s32) = G_FFLOOR %1 + $vgpr0 = COPY %2 +... + +--- +name: ffloor_fneg_s32_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: ffloor_fneg_s32_vv + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[V_FLOOR_F32_e64_:%[0-9]+]]:vgpr_32 = V_FLOOR_F32_e64 1, [[COPY]], 0, 0, implicit $exec + ; CHECK: $vgpr0 = COPY [[V_FLOOR_F32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_FNEG %0 + %2:vgpr(s32) = G_FFLOOR %1 + $vgpr0 = COPY %2 +... 
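The s32 selection tests above keep running on tahiti, but the s64 selection
tests below switch to hawaii: with this patch, gfx6 never presents an s64
G_FFLOOR to the instruction selector (it is custom-lowered during
legalization), so direct selection to V_FLOOR_F64 is only exercisable on gfx7
and newer. A one-line summary of that gating, using a hypothetical helper name
(the patch expresses it via customFor({S64}) guarded by ST.hasFractBug()):

    // Sketch: gfx6 (SOUTHERN_ISLANDS) has the V_FRACT bug and no V_FLOOR_F64,
    // so it takes the fract-based expansion instead of direct selection.
    static bool selectsFloorF64Directly(const GCNSubtarget &ST) {
      return !ST.hasFractBug();
    }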
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s64.mir new file mode 100644 index 0000000..1af481c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s64.mir @@ -0,0 +1,59 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: ffloor_s64_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: ffloor_s64_vv + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; CHECK: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = V_FLOOR_F64_e64 0, [[COPY]], 0, 0, implicit $exec + ; CHECK: $vgpr0_vgpr1 = COPY [[V_FLOOR_F64_e64_]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_FFLOOR %0 + $vgpr0_vgpr1 = COPY %1 +... + +# FIXME: Constant bus restriction +# --- +# name: ffloor_s64_vs +# legalized: true +# regBankSelected: true +# tracksRegLiveness: true + +# body: | +# bb.0: +# liveins: $sgpr0_sgpr1 + +# %0:sgpr(s64) = COPY $sgpr0_sgpr1 +# %1:vgpr(s64) = G_FFLOOR %0 +# $vgpr0_vgpr1 = COPY %1 +# ... + +--- +name: ffloor_fneg_s64_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: ffloor_fneg_s64_vv + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; CHECK: [[V_FLOOR_F64_e64_:%[0-9]+]]:vreg_64 = V_FLOOR_F64_e64 1, [[COPY]], 0, 0, implicit $exec + ; CHECK: $vgpr0_vgpr1 = COPY [[V_FLOOR_F64_e64_]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_FNEG %0 + %2:vgpr(s64) = G_FFLOOR %1 + $vgpr0_vgpr1 = COPY %2 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir index 2abe2c9..4140e89 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir @@ -34,8 +34,14 @@ body: | ; SI-LABEL: name: test_ffloor_s64 ; SI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; SI: [[FFLOOR:%[0-9]+]]:_(s64) = G_FFLOOR [[COPY]] - ; SI: $vgpr0_vgpr1 = COPY [[FFLOOR]](s64) + ; SI: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[COPY]](s64) + ; SI: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF + ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT]], [[C]] + ; SI: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[COPY]](s64), [[COPY]] + ; SI: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[COPY]], [[FMINNUM_IEEE]] + ; SI: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[SELECT]] + ; SI: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[COPY]], [[FNEG]] + ; SI: $vgpr0_vgpr1 = COPY [[FADD]](s64) ; VI-LABEL: name: test_ffloor_s64 ; VI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; VI: [[FFLOOR:%[0-9]+]]:_(s64) = G_FFLOOR [[COPY]] @@ -49,6 +55,65 @@ body: | $vgpr0_vgpr1 = COPY %1 ... 
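The 0x3FEFFFFFFFFFFFFF constant pinned down by the SI checks above is the
largest double strictly below 1.0, i.e. 1 - 2^-53, so the min clamps the
fract result below 1.0 regardless of how the buggy V_FRACT misbehaves at
the boundary. A standalone sanity check of that decoding (the memcpy mirrors
what BitsToDouble does):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      const uint64_t Bits = 0x3fefffffffffffffULL;
      double C;
      std::memcpy(&C, &Bits, sizeof(C));
      assert(C == 0x1.fffffffffffffp-1); // 1.0 - 2^-53
      assert(C < 1.0 && 1.0 - C == 0x1.0p-53);
      return 0;
    }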
+ +--- +name: test_ffloor_s64_nnan +body: | + bb.0: + liveins: $vgpr0 + + ; SI-LABEL: name: test_ffloor_s64_nnan + ; SI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; SI: [[INT:%[0-9]+]]:_(s64) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[COPY]](s64) + ; SI: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF + ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = nnan G_FMINNUM_IEEE [[INT]], [[C]] + ; SI: [[FNEG:%[0-9]+]]:_(s64) = nnan G_FNEG [[FMINNUM_IEEE]] + ; SI: [[FADD:%[0-9]+]]:_(s64) = nnan G_FADD [[COPY]], [[FNEG]] + ; SI: $vgpr0_vgpr1 = COPY [[FADD]](s64) + ; VI-LABEL: name: test_ffloor_s64_nnan + ; VI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; VI: [[FFLOOR:%[0-9]+]]:_(s64) = nnan G_FFLOOR [[COPY]] + ; VI: $vgpr0_vgpr1 = COPY [[FFLOOR]](s64) + ; GFX9-LABEL: name: test_ffloor_s64_nnan + ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX9: [[FFLOOR:%[0-9]+]]:_(s64) = nnan G_FFLOOR [[COPY]] + ; GFX9: $vgpr0_vgpr1 = COPY [[FFLOOR]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = nnan G_FFLOOR %0 + $vgpr0_vgpr1 = COPY %1 + +... + +--- +name: test_ffloor_s64_nssaz +body: | + bb.0: + liveins: $vgpr0 + + ; SI-LABEL: name: test_ffloor_s64_nssaz + ; SI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; SI: [[INT:%[0-9]+]]:_(s64) = nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[COPY]](s64) + ; SI: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF + ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = nsz G_FMINNUM_IEEE [[INT]], [[C]] + ; SI: [[FCMP:%[0-9]+]]:_(s1) = nsz G_FCMP floatpred(ord), [[COPY]](s64), [[COPY]] + ; SI: [[SELECT:%[0-9]+]]:_(s64) = nsz G_SELECT [[FCMP]](s1), [[COPY]], [[FMINNUM_IEEE]] + ; SI: [[FNEG:%[0-9]+]]:_(s64) = nsz G_FNEG [[SELECT]] + ; SI: [[FADD:%[0-9]+]]:_(s64) = nsz G_FADD [[COPY]], [[FNEG]] + ; SI: $vgpr0_vgpr1 = COPY [[FADD]](s64) + ; VI-LABEL: name: test_ffloor_s64_nssaz + ; VI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; VI: [[FFLOOR:%[0-9]+]]:_(s64) = nsz G_FFLOOR [[COPY]] + ; VI: $vgpr0_vgpr1 = COPY [[FFLOOR]](s64) + ; GFX9-LABEL: name: test_ffloor_s64_nssaz + ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX9: [[FFLOOR:%[0-9]+]]:_(s64) = nsz G_FFLOOR [[COPY]] + ; GFX9: $vgpr0_vgpr1 = COPY [[FFLOOR]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = nsz G_FFLOOR %0 + $vgpr0_vgpr1 = COPY %1 + +... 
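The two new tests above pin down the flag behavior of the expansion: with nnan
the compare/select guard disappears (it is only built when FmNoNans is absent),
while nsz is simply propagated onto every emitted instruction. A scalar model
of both shapes, following the workaround formula quoted in the legalizeFFloor
comment (a sketch only, with std::floor standing in for the amdgcn.fract
intrinsic):

    #include <cmath>

    // floor(x) = x - fract(x), where per the workaround comment:
    //   fract(x) = isnan(x) ? x : min(V_FRACT(x), 0x1.fffffffffffffp-1)
    // NNan models the nnan fast-math flag, which licenses dropping the guard.
    static double floorViaFract(double X, bool NNan) {
      double Fract = X - std::floor(X);                    // amdgcn.fract stand-in
      double Min = std::fmin(Fract, 0x1.fffffffffffffp-1); // G_FMINNUM(_IEEE)
      double Corrected = (!NNan && std::isnan(X)) ? X : Min;
      return X + (-Corrected);                             // G_FADD of G_FNEG
    }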
+ --- name: test_ffloor_s16 body: | @@ -158,9 +223,20 @@ body: | ; SI-LABEL: name: test_ffloor_v2s64 ; SI: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; SI: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; SI: [[FFLOOR:%[0-9]+]]:_(s64) = G_FFLOOR [[UV]] - ; SI: [[FFLOOR1:%[0-9]+]]:_(s64) = G_FFLOOR [[UV1]] - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FFLOOR]](s64), [[FFLOOR1]](s64) + ; SI: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[UV]](s64) + ; SI: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF + ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT]], [[C]] + ; SI: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[UV]](s64), [[UV]] + ; SI: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[UV]], [[FMINNUM_IEEE]] + ; SI: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[SELECT]] + ; SI: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[UV]], [[FNEG]] + ; SI: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[UV1]](s64) + ; SI: [[FMINNUM_IEEE1:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT1]], [[C]] + ; SI: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[UV1]](s64), [[UV1]] + ; SI: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[FCMP1]](s1), [[UV1]], [[FMINNUM_IEEE1]] + ; SI: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[SELECT1]] + ; SI: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[UV1]], [[FNEG1]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FADD]](s64), [[FADD1]](s64) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; VI-LABEL: name: test_ffloor_v2s64 ; VI: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptosi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptosi.mir index 7bc4ef5..81056ce 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptosi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptosi.mir @@ -192,9 +192,15 @@ body: | ; SI: [[C8:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3DF0000000000000 ; SI: [[C9:%[0-9]+]]:_(s64) = G_FCONSTANT double 0xC1F0000000000000 ; SI: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INTRINSIC_TRUNC]], [[C8]] - ; SI: [[FFLOOR:%[0-9]+]]:_(s64) = G_FFLOOR [[FMUL]] - ; SI: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FFLOOR]], [[C9]], [[INTRINSIC_TRUNC]] - ; SI: [[FPTOSI:%[0-9]+]]:_(s32) = G_FPTOSI [[FFLOOR]](s64) + ; SI: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[FMUL]](s64) + ; SI: [[C10:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF + ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT1]], [[C10]] + ; SI: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[FMUL]](s64), [[FMUL]] + ; SI: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMUL]], [[FMINNUM_IEEE]] + ; SI: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[SELECT1]] + ; SI: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FMUL]], [[FNEG]] + ; SI: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FADD]], [[C9]], [[INTRINSIC_TRUNC]] + ; SI: [[FPTOSI:%[0-9]+]]:_(s32) = G_FPTOSI [[FADD]](s64) ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMA]](s64) ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[FPTOUI]](s32), [[FPTOSI]](s32) ; SI: $vgpr0_vgpr1 = COPY [[MV1]](s64) @@ -247,9 +253,13 @@ body: | ; SI: [[C8:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3DF0000000000000 ; SI: [[C9:%[0-9]+]]:_(s64) = G_FCONSTANT double 0xC1F0000000000000 ; SI: [[FMUL:%[0-9]+]]:_(s64) = nnan G_FMUL [[INTRINSIC_TRUNC]], [[C8]] - ; SI: [[FFLOOR:%[0-9]+]]:_(s64) = nnan G_FFLOOR [[FMUL]] - ; SI: [[FMA:%[0-9]+]]:_(s64) = nnan G_FMA 
[[FFLOOR]], [[C9]], [[INTRINSIC_TRUNC]] - ; SI: [[FPTOSI:%[0-9]+]]:_(s32) = G_FPTOSI [[FFLOOR]](s64) + ; SI: [[INT1:%[0-9]+]]:_(s64) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[FMUL]](s64) + ; SI: [[C10:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF + ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = nnan G_FMINNUM_IEEE [[INT1]], [[C10]] + ; SI: [[FNEG:%[0-9]+]]:_(s64) = nnan G_FNEG [[FMINNUM_IEEE]] + ; SI: [[FADD:%[0-9]+]]:_(s64) = nnan G_FADD [[FMUL]], [[FNEG]] + ; SI: [[FMA:%[0-9]+]]:_(s64) = nnan G_FMA [[FADD]], [[C9]], [[INTRINSIC_TRUNC]] + ; SI: [[FPTOSI:%[0-9]+]]:_(s32) = G_FPTOSI [[FADD]](s64) ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMA]](s64) ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[FPTOUI]](s32), [[FPTOSI]](s32) ; SI: $vgpr0_vgpr1 = COPY [[MV1]](s64) @@ -303,14 +313,20 @@ body: | ; SI: [[C8:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3DF0000000000000 ; SI: [[C9:%[0-9]+]]:_(s64) = G_FCONSTANT double 0xC1F0000000000000 ; SI: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INTRINSIC_TRUNC]], [[C8]] - ; SI: [[FFLOOR:%[0-9]+]]:_(s64) = G_FFLOOR [[FMUL]] - ; SI: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FFLOOR]], [[C9]], [[INTRINSIC_TRUNC]] - ; SI: [[FPTOSI:%[0-9]+]]:_(s32) = G_FPTOSI [[FFLOOR]](s64) + ; SI: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[FMUL]](s64) + ; SI: [[C10:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF + ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT1]], [[C10]] + ; SI: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[FMUL]](s64), [[FMUL]] + ; SI: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMUL]], [[FMINNUM_IEEE]] + ; SI: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[SELECT1]] + ; SI: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FMUL]], [[FNEG]] + ; SI: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FADD]], [[C9]], [[INTRINSIC_TRUNC]] + ; SI: [[FPTOSI:%[0-9]+]]:_(s32) = G_FPTOSI [[FADD]](s64) ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMA]](s64) ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[FPTOUI]](s32), [[FPTOSI]](s32) ; SI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) - ; SI: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ubfe), [[C]](s32), [[C1]](s32) - ; SI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[INT1]], [[C2]] + ; SI: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ubfe), [[C]](s32), [[C1]](s32) + ; SI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[INT2]], [[C2]] ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C3]] ; SI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C5]](s32), [[AND2]](s32) ; SI: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[C4]], [[SUB1]](s32) @@ -318,13 +334,18 @@ body: | ; SI: [[AND3:%[0-9]+]]:_(s64) = G_AND [[UV1]], [[XOR1]] ; SI: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB1]](s32), [[C5]] ; SI: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SUB1]](s32), [[C7]] - ; SI: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP2]](s1), [[MV2]], [[AND3]] - ; SI: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP3]](s1), [[UV1]], [[SELECT1]] + ; SI: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP2]](s1), [[MV2]], [[AND3]] + ; SI: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP3]](s1), [[UV1]], [[SELECT2]] ; SI: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s64) = G_INTRINSIC_TRUNC [[UV1]] ; SI: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[INTRINSIC_TRUNC1]], [[C8]] - ; SI: [[FFLOOR1:%[0-9]+]]:_(s64) = G_FFLOOR [[FMUL1]] - ; SI: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FFLOOR1]], [[C9]], [[INTRINSIC_TRUNC1]] - ; SI: [[FPTOSI1:%[0-9]+]]:_(s32) = G_FPTOSI [[FFLOOR1]](s64) + ; SI: [[INT3:%[0-9]+]]:_(s64) = G_INTRINSIC 
intrinsic(@llvm.amdgcn.fract), [[FMUL1]](s64)
+    ; SI: [[FMINNUM_IEEE1:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT3]], [[C10]]
+    ; SI: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[FMUL1]](s64), [[FMUL1]]
+    ; SI: [[SELECT4:%[0-9]+]]:_(s64) = G_SELECT [[FCMP1]](s1), [[FMUL1]], [[FMINNUM_IEEE1]]
+    ; SI: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[SELECT4]]
+    ; SI: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[FMUL1]], [[FNEG1]]
+    ; SI: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FADD1]], [[C9]], [[INTRINSIC_TRUNC1]]
+    ; SI: [[FPTOSI1:%[0-9]+]]:_(s32) = G_FPTOSI [[FADD1]](s64)
     ; SI: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMA1]](s64)
     ; SI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[FPTOUI1]](s32), [[FPTOSI1]](s32)
     ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV1]](s64), [[MV3]](s64)
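The identical SI check churn repeats in legalize-fptoui.mir below because both
s64 conversions share the same fp64 expansion whose inner G_FFLOOR now gets the
fract lowering: the truncated input is scaled by 2^-32 (the 0x3DF0000000000000
constant), floored to form the high 32-bit word, and the low word is recovered
with an FMA against -2^32 (0xC1F0000000000000). A scalar sketch of the unsigned
case, assuming the input is within the conversion's defined range:

    #include <cmath>
    #include <cstdint>

    static uint64_t fptoui64(double X) {
      double Trunc = std::trunc(X);                // G_INTRINSIC_TRUNC
      double Hi = std::floor(Trunc * 0x1.0p-32);   // G_FMUL by 2^-32, G_FFLOOR
      double Lo = std::fma(Hi, -0x1.0p+32, Trunc); // G_FMA: Trunc - Hi * 2^32
      return ((uint64_t)(uint32_t)Hi << 32) |      // G_MERGE_VALUES(lo, hi)
             (uint64_t)(uint32_t)Lo;
    }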
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir
index 4a17a96..b895a1a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptoui.mir
@@ -192,9 +192,15 @@ body: |
     ; SI: [[C8:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3DF0000000000000
     ; SI: [[C9:%[0-9]+]]:_(s64) = G_FCONSTANT double 0xC1F0000000000000
     ; SI: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INTRINSIC_TRUNC]], [[C8]]
-    ; SI: [[FFLOOR:%[0-9]+]]:_(s64) = G_FFLOOR [[FMUL]]
-    ; SI: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FFLOOR]], [[C9]], [[INTRINSIC_TRUNC]]
-    ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FFLOOR]](s64)
+    ; SI: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[FMUL]](s64)
+    ; SI: [[C10:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF
+    ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT1]], [[C10]]
+    ; SI: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[FMUL]](s64), [[FMUL]]
+    ; SI: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMUL]], [[FMINNUM_IEEE]]
+    ; SI: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[SELECT1]]
+    ; SI: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FMUL]], [[FNEG]]
+    ; SI: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FADD]], [[C9]], [[INTRINSIC_TRUNC]]
+    ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD]](s64)
     ; SI: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMA]](s64)
     ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[FPTOUI1]](s32), [[FPTOUI]](s32)
     ; SI: $vgpr0_vgpr1 = COPY [[MV1]](s64)
@@ -247,9 +253,13 @@ body: |
     ; SI: [[C8:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3DF0000000000000
     ; SI: [[C9:%[0-9]+]]:_(s64) = G_FCONSTANT double 0xC1F0000000000000
     ; SI: [[FMUL:%[0-9]+]]:_(s64) = nnan G_FMUL [[INTRINSIC_TRUNC]], [[C8]]
-    ; SI: [[FFLOOR:%[0-9]+]]:_(s64) = nnan G_FFLOOR [[FMUL]]
-    ; SI: [[FMA:%[0-9]+]]:_(s64) = nnan G_FMA [[FFLOOR]], [[C9]], [[INTRINSIC_TRUNC]]
-    ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FFLOOR]](s64)
+    ; SI: [[INT1:%[0-9]+]]:_(s64) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[FMUL]](s64)
+    ; SI: [[C10:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF
+    ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = nnan G_FMINNUM_IEEE [[INT1]], [[C10]]
+    ; SI: [[FNEG:%[0-9]+]]:_(s64) = nnan G_FNEG [[FMINNUM_IEEE]]
+    ; SI: [[FADD:%[0-9]+]]:_(s64) = nnan G_FADD [[FMUL]], [[FNEG]]
+    ; SI: [[FMA:%[0-9]+]]:_(s64) = nnan G_FMA [[FADD]], [[C9]], [[INTRINSIC_TRUNC]]
+    ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD]](s64)
     ; SI: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMA]](s64)
     ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[FPTOUI1]](s32), [[FPTOUI]](s32)
     ; SI: $vgpr0_vgpr1 = COPY [[MV1]](s64)
@@ -303,14 +313,20 @@ body: |
     ; SI: [[C8:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3DF0000000000000
     ; SI: [[C9:%[0-9]+]]:_(s64) = G_FCONSTANT double 0xC1F0000000000000
     ; SI: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INTRINSIC_TRUNC]], [[C8]]
-    ; SI: [[FFLOOR:%[0-9]+]]:_(s64) = G_FFLOOR [[FMUL]]
-    ; SI: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FFLOOR]], [[C9]], [[INTRINSIC_TRUNC]]
-    ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FFLOOR]](s64)
+    ; SI: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[FMUL]](s64)
+    ; SI: [[C10:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x3FEFFFFFFFFFFFFF
+    ; SI: [[FMINNUM_IEEE:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT1]], [[C10]]
+    ; SI: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[FMUL]](s64), [[FMUL]]
+    ; SI: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[FMUL]], [[FMINNUM_IEEE]]
+    ; SI: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[SELECT1]]
+    ; SI: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FMUL]], [[FNEG]]
+    ; SI: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FADD]], [[C9]], [[INTRINSIC_TRUNC]]
+    ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD]](s64)
     ; SI: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMA]](s64)
     ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[FPTOUI1]](s32), [[FPTOUI]](s32)
     ; SI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
-    ; SI: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ubfe), [[C]](s32), [[C1]](s32)
-    ; SI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[INT1]], [[C2]]
+    ; SI: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ubfe), [[C]](s32), [[C1]](s32)
+    ; SI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[INT2]], [[C2]]
     ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C3]]
     ; SI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C5]](s32), [[AND2]](s32)
     ; SI: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[C4]], [[SUB1]](s32)
@@ -318,13 +334,18 @@ body: |
     ; SI: [[AND3:%[0-9]+]]:_(s64) = G_AND [[UV1]], [[XOR1]]
     ; SI: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB1]](s32), [[C5]]
     ; SI: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SUB1]](s32), [[C7]]
-    ; SI: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP2]](s1), [[MV2]], [[AND3]]
-    ; SI: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP3]](s1), [[UV1]], [[SELECT1]]
+    ; SI: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP2]](s1), [[MV2]], [[AND3]]
+    ; SI: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP3]](s1), [[UV1]], [[SELECT2]]
     ; SI: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s64) = G_INTRINSIC_TRUNC [[UV1]]
     ; SI: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[INTRINSIC_TRUNC1]], [[C8]]
-    ; SI: [[FFLOOR1:%[0-9]+]]:_(s64) = G_FFLOOR [[FMUL1]]
-    ; SI: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FFLOOR1]], [[C9]], [[INTRINSIC_TRUNC1]]
-    ; SI: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI [[FFLOOR1]](s64)
+    ; SI: [[INT3:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.fract), [[FMUL1]](s64)
+    ; SI: [[FMINNUM_IEEE1:%[0-9]+]]:_(s64) = G_FMINNUM_IEEE [[INT3]], [[C10]]
+    ; SI: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ord), [[FMUL1]](s64), [[FMUL1]]
+    ; SI: [[SELECT4:%[0-9]+]]:_(s64) = G_SELECT [[FCMP1]](s1), [[FMUL1]], [[FMINNUM_IEEE1]]
+    ; SI: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[SELECT4]]
+    ; SI: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[FMUL1]], [[FNEG1]]
+    ; SI: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FADD1]], [[C9]], [[INTRINSIC_TRUNC1]]
+    ; SI: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI [[FADD1]](s64)
     ; SI: [[FPTOUI3:%[0-9]+]]:_(s32) = G_FPTOUI [[FMA1]](s64)
     ; SI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[FPTOUI3]](s32), [[FPTOUI2]](s32)
     ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV1]](s64), [[MV3]](s64)
-- 
2.7.4