From eece6ba283bd763e6d7109ae9e155e81cfee0651 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Wed, 26 Apr 2023 22:02:42 -0400
Subject: [PATCH] IR: Add llvm.ldexp and llvm.experimental.constrained.ldexp intrinsics

AMDGPU has native instructions and target intrinsics for this, but
these really should be subject to legalization and generic
optimizations.

This will enable legalization of f16->f32 on targets without f16
support.

Implement a somewhat horrible inline expansion for targets without
libcall support. This could be better if we could introduce control
flow (GlobalISel version not yet implemented).

Support for strictfp legalization is less complete but works for the
simple cases.
---
 clang/lib/CodeGen/CGBuiltin.cpp                    |   9 +-
 clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl     |   2 +-
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl        |   4 +-
 llvm/docs/LangRef.rst                              |  82 +++
 llvm/docs/ReleaseNotes.rst                         |   2 +
 llvm/include/llvm/Analysis/TargetLibraryInfo.h     |   1 +
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h      |   1 +
 .../llvm/CodeGen/GlobalISel/MachineIRBuilder.h     |   7 +
 llvm/include/llvm/CodeGen/ISDOpcodes.h             |   5 +-
 llvm/include/llvm/CodeGen/RuntimeLibcalls.h        |   4 +
 llvm/include/llvm/IR/ConstrainedOps.def            |   1 +
 llvm/include/llvm/IR/Intrinsics.td                 |   9 +
 llvm/include/llvm/IR/RuntimeLibcalls.def           |   5 +
 llvm/include/llvm/Support/TargetOpcodes.def        |   4 +
 llvm/include/llvm/Target/GenericOpcodes.td         |   8 +
 .../llvm/Target/GlobalISel/SelectionDAGCompat.td   |   2 +
 llvm/include/llvm/Target/TargetSelectionDAG.td     |   9 +
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp       |   4 +
 llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp    |  65 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      | 137 ++++
 .../CodeGen/SelectionDAG/LegalizeFloatTypes.cpp    |  29 +-
 .../CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp  |  31 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h      |  19 +-
 .../lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp |   1 +
 .../CodeGen/SelectionDAG/LegalizeVectorTypes.cpp   | 108 +--
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |   3 +-
 .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp   |  12 +
 .../CodeGen/SelectionDAG/SelectionDAGDumper.cpp    |   2 +
 llvm/lib/CodeGen/TargetLoweringBase.cpp            |  15 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp      |   2 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp      |   7 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h        |   1 -
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td          |  10 -
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp     |  23 +-
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   3 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp          |  50 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h            |   1 +
 llvm/lib/Target/AMDGPU/VOP2Instructions.td         |   8 +-
 llvm/lib/Target/AMDGPU/VOP3Instructions.td         |   2 +-
 llvm/lib/Target/Hexagon/HexagonISelLowering.cpp    |   2 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp        |   1 +
 llvm/lib/Target/X86/X86ISelLowering.cpp            |   2 +
 .../GlobalISel/legalizer-info-validation.mir       |   6 +
 .../AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.mir | 134 ----
 .../GlobalISel/inst-select-amdgcn.ldexp.s16.mir    |  76 --
 .../CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir  |  56 +-
 .../CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir  |  48 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll  |  57 +-
 llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll             | 640 +++++++++++++++++
 llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll       | 400 +++++++++++
 llvm/test/CodeGen/AMDGPU/strict_ldexp.f32.ll       | 255 +++++++
 llvm/test/CodeGen/AMDGPU/strict_ldexp.f64.ll       | 180 +++++
 llvm/test/CodeGen/Mips/ldexp.ll                    | 172 +++++
 llvm/test/CodeGen/PowerPC/ldexp-libcall.ll         |  66 ++
 llvm/test/CodeGen/PowerPC/ldexp.ll                 | 223 ++++++
 llvm/test/CodeGen/X86/ldexp-f80.ll                 |  43 ++
 llvm/test/CodeGen/X86/ldexp-libcall.ll             |  77 ++
 llvm/test/CodeGen/X86/ldexp-not-readonly.ll        |  54 ++
 llvm/test/CodeGen/X86/ldexp-strict.ll              |  75 ++
 llvm/test/CodeGen/X86/ldexp-wrong-signature.ll     |  58 ++
 llvm/test/CodeGen/X86/ldexp-wrong-signature2.ll    |  53 ++
 llvm/test/CodeGen/X86/ldexp.ll                     | 784 +++++++++++++++++++++
 llvm/test/MC/AMDGPU/gfx10_asm_vop2.s               |   6 +-
 llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s     |   2 +-
 llvm/test/MC/AMDGPU/gfx8_asm_vop3.s                |   7 +-
 llvm/test/MC/AMDGPU/gfx9_asm_vop3.s                |   7 +-
 llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt    |   4 +-
 .../AMDGPU/gfx11_dasm_vop3_from_vop2.txt           |   2 +-
 llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3.txt     |   4 +-
 llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt     |   4 +-
 .../Transforms/SpeculativeExecution/spec-calls.ll  |  16 +
 71 files changed, 3780 insertions(+), 422 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.mir
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.s16.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/strict_ldexp.f32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/strict_ldexp.f64.ll
 create mode 100644 llvm/test/CodeGen/Mips/ldexp.ll
 create mode 100644 llvm/test/CodeGen/PowerPC/ldexp-libcall.ll
 create mode 100644 llvm/test/CodeGen/PowerPC/ldexp.ll
 create mode 100644 llvm/test/CodeGen/X86/ldexp-f80.ll
 create mode 100644 llvm/test/CodeGen/X86/ldexp-libcall.ll
 create mode 100644 llvm/test/CodeGen/X86/ldexp-not-readonly.ll
 create mode 100644 llvm/test/CodeGen/X86/ldexp-strict.ll
 create mode 100644 llvm/test/CodeGen/X86/ldexp-wrong-signature.ll
 create mode 100644 llvm/test/CodeGen/X86/ldexp-wrong-signature2.ll
 create mode 100644 llvm/test/CodeGen/X86/ldexp.ll

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c09e5b5..ddf28dc 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17146,8 +17146,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
   case AMDGPU::BI__builtin_amdgcn_ldexp:
   case AMDGPU::BI__builtin_amdgcn_ldexpf:
-  case AMDGPU::BI__builtin_amdgcn_ldexph:
-    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_ldexp);
+  case AMDGPU::BI__builtin_amdgcn_ldexph: {
+    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
+    llvm::Function *F =
+        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
+    return Builder.CreateCall(F, {Src0, Src1});
+  }
   case AMDGPU::BI__builtin_amdgcn_frexp_mant:
   case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
   case AMDGPU::BI__builtin_amdgcn_frexp_manth:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 590670b..4f7ac16 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -52,7 +52,7 @@ void test_cos_f16(global half* out, half a)
 }
 
 // CHECK-LABEL: @test_ldexp_f16
-// CHECK: call half @llvm.amdgcn.ldexp.f16
+// CHECK: call half @llvm.ldexp.f16.i32
 void test_ldexp_f16(global half* out, half a, int b)
 {
   *out = __builtin_amdgcn_ldexph(a, b);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index ff13357..5cf965e 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -180,14 +180,14 @@ void test_log_clamp_f32(global float* out, float a)
 }
 
 // CHECK-LABEL: @test_ldexp_f32
-// CHECK: call float @llvm.amdgcn.ldexp.f32
+// CHECK: call float @llvm.ldexp.f32.i32
 void test_ldexp_f32(global float* out, float a, int b)
 {
   *out = __builtin_amdgcn_ldexpf(a, b);
 }
 
 // CHECK-LABEL: @test_ldexp_f64
-// CHECK: call double @llvm.amdgcn.ldexp.f64
+// CHECK: call double @llvm.ldexp.f64.i32
 void test_ldexp_f64(global double* out, double a, int b)
 {
   *out = __builtin_amdgcn_ldexp(a, b);
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index de107d0..8217bee 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -14713,6 +14713,47 @@ trapping or setting ``errno``. When specified with the fast-math-flag
 'afn', the result may be approximated using a less accurate calculation.
 
+'``llvm.ldexp.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.ldexp`` on any
+floating-point type or vector of floating-point type. However, not all
+targets support all types.
+
+::
+
+      declare float @llvm.ldexp.f32.i32(float %Val, i32 %Exp)
+      declare double @llvm.ldexp.f64.i32(double %Val, i32 %Exp)
+      declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80 %Val, i32 %Exp)
+      declare fp128 @llvm.ldexp.f128.i32(fp128 %Val, i32 %Exp)
+      declare ppc_fp128 @llvm.ldexp.ppcf128.i32(ppc_fp128 %Val, i32 %Exp)
+      declare <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %Val, <2 x i32> %Exp)
+
+Overview:
+"""""""""
+
+The '``llvm.ldexp.*``' intrinsics perform the ldexp function.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are :ref:`floating-point
+<t_floating>` or :ref:`vector <t_vector>` of floating-point values of
+the same type. The second argument is an integer with the same number
+of elements.
+
+Semantics:
+""""""""""
+
+This function multiplies the first argument by 2 raised to the second
+argument's power. If the first argument is NaN or infinite, the same
+value is returned. If the result underflows, a zero with the same sign
+is returned. If the result overflows, the result is an infinity with
+the same sign.
+
 '``llvm.log.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -24306,6 +24347,47 @@ This function returns the first value raised to the second power with an
 unspecified sequence of rounding operations.
 
+'``llvm.experimental.constrained.ldexp``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type0>
+      @llvm.experimental.constrained.ldexp(<type0> <op1>, <type1> <op2>,
+                                           metadata <rounding mode>,
+                                           metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.ldexp``' intrinsic performs the ldexp
+function.
+
+
+Arguments:
+""""""""""
+
+The first argument and the return value are :ref:`floating-point
+<t_floating>` or :ref:`vector <t_vector>` of floating-point values of
+the same type. The second argument is an integer with the same number
+of elements.
+
+
+The third and fourth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function multiplies the first argument by 2 raised to the second
+argument's power. If the first argument is NaN or infinite, the same
+value is returned. If the result underflows, a zero with the same sign
+is returned. If the result overflows, the result is an infinity with
+the same sign.
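These semantics match libm's ldexp, on which the intrinsic is modeled. As a
minimal illustration of the cases above, here is a standalone C++ check
against the libm behavior (not part of the patch itself; assumes IEEE-754
floats):

    #include <cassert>
    #include <cmath>
    #include <limits>

    int main() {
      // The result is the first argument times 2 raised to the second.
      assert(std::ldexp(1.5f, 4) == 24.0f);
      // NaN and infinity come back unchanged.
      assert(std::isnan(std::ldexp(std::numeric_limits<float>::quiet_NaN(), 8)));
      assert(std::isinf(std::ldexp(std::numeric_limits<float>::infinity(), -8)));
      // Overflow yields an infinity with the same sign.
      assert(std::ldexp(-1.0f, 1000) == -std::numeric_limits<float>::infinity());
      // Underflow yields a zero with the same sign.
      assert(std::ldexp(-1.0f, -1000) == 0.0f);
      assert(std::signbit(std::ldexp(-1.0f, -1000)));
      return 0;
    }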
+
+
 '``llvm.experimental.constrained.sin``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 1dd2590..6176439 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -61,6 +61,8 @@ Changes to the LLVM IR
 * The ``nofpclass`` attribute was introduced. This allows more
   optimizations around special floating point value comparisons.
 
+* Introduced new ``llvm.ldexp`` and ``llvm.experimental.constrained.ldexp`` intrinsics.
+
 * The constant expression variants of the following instructions have been
   removed:
 
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index 7209616..6f045fa 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -378,6 +378,7 @@ public:
     case LibFunc_trunc: case LibFunc_truncf: case LibFunc_truncl:
     case LibFunc_log2: case LibFunc_log2f: case LibFunc_log2l:
     case LibFunc_exp2: case LibFunc_exp2f: case LibFunc_exp2l:
+    case LibFunc_ldexp: case LibFunc_ldexpf: case LibFunc_ldexpl:
    case LibFunc_memcpy: case LibFunc_memset: case LibFunc_memmove:
    case LibFunc_memcmp: case LibFunc_bcmp: case LibFunc_strcmp:
    case LibFunc_strcpy: case LibFunc_stpcpy: case LibFunc_strlen:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index f43390c..a568edd 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -357,6 +357,7 @@ public:
   LegalizeResult narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
   LegalizeResult narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
   LegalizeResult narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
+  LegalizeResult narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
 
   /// Perform Bitcast legalize action on G_EXTRACT_VECTOR_ELT.
   LegalizeResult bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 071fbe8..905172c 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1825,6 +1825,13 @@ public:
     return buildInstr(TargetOpcode::G_FPOW, {Dst}, {Src0, Src1}, Flags);
   }
 
+  /// Build and insert \p Dst = G_FLDEXP \p Src0, \p Src1
+  MachineInstrBuilder
+  buildFLdexp(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1,
+              std::optional<unsigned> Flags = std::nullopt) {
+    return buildInstr(TargetOpcode::G_FLDEXP, {Dst}, {Src0, Src1}, Flags);
+  }
+
   /// Build and insert \p Res = G_FCOPYSIGN \p Op0, \p Op1
   MachineInstrBuilder buildFCopysign(const DstOp &Dst, const SrcOp &Src0,
                                      const SrcOp &Src1) {
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 09fb76c..0b1d1d7 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -411,6 +411,7 @@ enum NodeType {
   STRICT_FSQRT,
   STRICT_FPOW,
   STRICT_FPOWI,
+  STRICT_FLDEXP,
   STRICT_FSIN,
   STRICT_FCOS,
   STRICT_FEXP,
@@ -926,8 +927,10 @@ enum NodeType {
   FCBRT,
   FSIN,
   FCOS,
-  FPOWI,
   FPOW,
+  FPOWI,
+  /// FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
+ FLDEXP, FLOG, FLOG2, FLOG10, diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcalls.h b/llvm/include/llvm/CodeGen/RuntimeLibcalls.h index d8c6310..088ad1f 100644 --- a/llvm/include/llvm/CodeGen/RuntimeLibcalls.h +++ b/llvm/include/llvm/CodeGen/RuntimeLibcalls.h @@ -70,6 +70,10 @@ namespace RTLIB { /// UNKNOWN_LIBCALL if there is none. Libcall getPOWI(EVT RetVT); + /// getLDEXP - Return the LDEXP_* value for the given types, or + /// UNKNOWN_LIBCALL if there is none. + Libcall getLDEXP(EVT RetVT); + /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or /// UNKNOWN_LIBCALL if there is none. Libcall getSYNC(unsigned Opc, MVT VT); diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def index ecba68f..41aa44d 100644 --- a/llvm/include/llvm/IR/ConstrainedOps.def +++ b/llvm/include/llvm/IR/ConstrainedOps.def @@ -89,6 +89,7 @@ DAG_FUNCTION(minimum, 2, 0, experimental_constrained_minimum, FMINIMU DAG_FUNCTION(nearbyint, 1, 1, experimental_constrained_nearbyint, FNEARBYINT) DAG_FUNCTION(pow, 2, 1, experimental_constrained_pow, FPOW) DAG_FUNCTION(powi, 2, 1, experimental_constrained_powi, FPOWI) +DAG_FUNCTION(ldexp, 2, 1, experimental_constrained_ldexp, FLDEXP) DAG_FUNCTION(rint, 1, 1, experimental_constrained_rint, FRINT) DAG_FUNCTION(round, 1, 0, experimental_constrained_round, FROUND) DAG_FUNCTION(roundeven, 1, 0, experimental_constrained_roundeven, FROUNDEVEN) diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 9bf29f7..ebbd2af 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1037,6 +1037,10 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in { def int_llround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_lrint : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_llrint : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; + + // TODO: int operand should be constrained to same number of elements as the result. 
+ def int_ldexp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, + llvm_anyint_ty]>; } def int_minnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], @@ -1168,6 +1172,11 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn] in { llvm_i32_ty, llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_ldexp : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_anyint_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; def int_experimental_constrained_sin : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], [ LLVMMatchType<0>, llvm_metadata_ty, diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index c9118e7..82e2408 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -279,6 +279,11 @@ HANDLE_LIBCALL(LLRINT_F64, "llrint") HANDLE_LIBCALL(LLRINT_F80, "llrintl") HANDLE_LIBCALL(LLRINT_F128, "llrintl") HANDLE_LIBCALL(LLRINT_PPCF128, "llrintl") +HANDLE_LIBCALL(LDEXP_F32, "ldexpf") +HANDLE_LIBCALL(LDEXP_F64, "ldexp") +HANDLE_LIBCALL(LDEXP_F80, "ldexpl") +HANDLE_LIBCALL(LDEXP_F128, "ldexpl") +HANDLE_LIBCALL(LDEXP_PPCF128, "ldexpl") // Floating point environment HANDLE_LIBCALL(FEGETENV, "fegetenv") diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 5fd6523..afb7e61 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -613,6 +613,9 @@ HANDLE_TARGET_OPCODE(G_FLOG2) /// Floating point base-10 logarithm of a value. HANDLE_TARGET_OPCODE(G_FLOG10) +/// Floating point x * 2^n +HANDLE_TARGET_OPCODE(G_FLDEXP) + /// Generic FP negation. HANDLE_TARGET_OPCODE(G_FNEG) @@ -762,6 +765,7 @@ HANDLE_TARGET_OPCODE(G_STRICT_FDIV) HANDLE_TARGET_OPCODE(G_STRICT_FREM) HANDLE_TARGET_OPCODE(G_STRICT_FMA) HANDLE_TARGET_OPCODE(G_STRICT_FSQRT) +HANDLE_TARGET_OPCODE(G_STRICT_FLDEXP) /// read_register intrinsic HANDLE_TARGET_OPCODE(G_READ_REGISTER) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 3b33477..e661226 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -923,6 +923,13 @@ def G_FLOG10 : GenericInstruction { let hasSideEffects = false; } +// Floating point x * 2^n +def G_FLDEXP : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type1:$src1); + let hasSideEffects = false; +} + // Floating point ceiling of a value. 
def G_FCEIL : GenericInstruction { let OutOperandList = (outs type0:$dst); @@ -1384,6 +1391,7 @@ def G_STRICT_FDIV : ConstrainedInstruction; def G_STRICT_FREM : ConstrainedInstruction; def G_STRICT_FMA : ConstrainedInstruction; def G_STRICT_FSQRT : ConstrainedInstruction; +def G_STRICT_FLDEXP : ConstrainedInstruction; //------------------------------------------------------------------------------ // Memory intrinsics diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index 3ab0d1b..cf78ac7 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -103,6 +103,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; @@ -158,6 +159,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; // Broadly speaking G_LOAD is equivalent to ISD::LOAD but there are some // complications that tablegen must take care of. For example, Predicates such diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 210aa8f..a172d01 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -173,6 +173,9 @@ def SDTFPToIntOp : SDTypeProfile<1, 1, [ // fp_to_[su]int def SDTFPToIntSatOp : SDTypeProfile<1, 2, [ // fp_to_[su]int_sat SDTCisInt<0>, SDTCisFP<1>, SDTCisSameNumEltsAs<0, 1>, SDTCisVT<2, OtherVT> ]>; +def SDTFPExpOp : SDTypeProfile<1, 2, [ // ldexp + SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2> +]>; def SDTExtInreg : SDTypeProfile<1, 2, [ // sext_inreg SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisVT<2, OtherVT>, SDTCisVTSmallerThanOp<2, 1> @@ -499,6 +502,7 @@ def fcos : SDNode<"ISD::FCOS" , SDTFPUnaryOp>; def fexp2 : SDNode<"ISD::FEXP2" , SDTFPUnaryOp>; def fpow : SDNode<"ISD::FPOW" , SDTFPBinOp>; def flog2 : SDNode<"ISD::FLOG2" , SDTFPUnaryOp>; +def fldexp : SDNode<"ISD::FLDEXP" , SDTFPExpOp>; def frint : SDNode<"ISD::FRINT" , SDTFPUnaryOp>; def ftrunc : SDNode<"ISD::FTRUNC" , SDTFPUnaryOp>; def fceil : SDNode<"ISD::FCEIL" , SDTFPUnaryOp>; @@ -549,6 +553,8 @@ def strict_fexp2 : SDNode<"ISD::STRICT_FEXP2", SDTFPUnaryOp, [SDNPHasChain]>; def strict_fpow : SDNode<"ISD::STRICT_FPOW", SDTFPBinOp, [SDNPHasChain]>; +def strict_fldexp : SDNode<"ISD::STRICT_FLDEXP", + SDTFPExpOp, [SDNPHasChain]>; def strict_flog2 : SDNode<"ISD::STRICT_FLOG2", SDTFPUnaryOp, [SDNPHasChain]>; def strict_frint : SDNode<"ISD::STRICT_FRINT", @@ -1449,6 +1455,9 @@ def any_fexp2 : PatFrags<(ops node:$src), def any_fpow : PatFrags<(ops node:$lhs, node:$rhs), [(strict_fpow node:$lhs, node:$rhs), (fpow node:$lhs, node:$rhs)]>; +def any_fldexp : PatFrags<(ops node:$lhs, node:$rhs), + [(strict_fldexp node:$lhs, node:$rhs), + (fldexp node:$lhs, node:$rhs)]>; def any_flog2 : PatFrags<(ops node:$src), [(strict_flog2 node:$src), (flog2 node:$src)]>; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index b2f89a8..ba05f10 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1759,6 +1759,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { return TargetOpcode::G_FLOG2; case Intrinsic::log10: return TargetOpcode::G_FLOG10; + case Intrinsic::ldexp: + return TargetOpcode::G_FLDEXP; case Intrinsic::nearbyint: return 
TargetOpcode::G_FNEARBYINT; case Intrinsic::pow: @@ -1851,6 +1853,8 @@ static unsigned getConstrainedOpcode(Intrinsic::ID ID) { return TargetOpcode::G_STRICT_FMA; case Intrinsic::experimental_constrained_sqrt: return TargetOpcode::G_STRICT_FSQRT; + case Intrinsic::experimental_constrained_ldexp: + return TargetOpcode::G_STRICT_FLDEXP; default: return 0; } diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 5c5f843..c38951a 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -542,6 +542,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(LOG_F); case TargetOpcode::G_FLOG2: RTLIBCASE(LOG2_F); + case TargetOpcode::G_FLDEXP: + RTLIBCASE(LDEXP_F); case TargetOpcode::G_FCEIL: RTLIBCASE(CEIL_F); case TargetOpcode::G_FFLOOR: @@ -826,6 +828,7 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { case TargetOpcode::G_FLOG10: case TargetOpcode::G_FLOG: case TargetOpcode::G_FLOG2: + case TargetOpcode::G_FLDEXP: case TargetOpcode::G_FEXP: case TargetOpcode::G_FEXP2: case TargetOpcode::G_FCEIL: @@ -1413,6 +1416,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT); Observer.changedInstr(MI); return Legalized; + case TargetOpcode::G_FLDEXP: + case TargetOpcode::G_STRICT_FLDEXP: + return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy); } } @@ -2553,14 +2559,30 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); Observer.changedInstr(MI); return Legalized; - case TargetOpcode::G_FPOWI: { - if (TypeIdx != 0) - return UnableToLegalize; - Observer.changingInstr(MI); - widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); - widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); - Observer.changedInstr(MI); - return Legalized; + case TargetOpcode::G_FPOWI: + case TargetOpcode::G_FLDEXP: + case TargetOpcode::G_STRICT_FLDEXP: { + if (TypeIdx == 0) { + if (MI.getOpcode() == TargetOpcode::G_STRICT_FLDEXP) + return UnableToLegalize; + + Observer.changingInstr(MI); + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); + widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); + Observer.changedInstr(MI); + return Legalized; + } + + if (TypeIdx == 1) { + // For some reason SelectionDAG tries to promote to a libcall without + // actually changing the integer type for promotion. 
+      Observer.changingInstr(MI);
+      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
+      Observer.changedInstr(MI);
+      return Legalized;
+    }
+
+    return UnableToLegalize;
  }
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
@@ -4136,6 +4158,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case G_FLOG:
   case G_FLOG2:
   case G_FLOG10:
+  case G_FLDEXP:
   case G_FNEARBYINT:
   case G_FCEIL:
   case G_FFLOOR:
@@ -4211,6 +4234,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case G_STRICT_FSUB:
   case G_STRICT_FMUL:
   case G_STRICT_FMA:
+  case G_STRICT_FLDEXP:
     return fewerElementsVectorMultiEltType(GMI, NumElts);
   case G_ICMP:
   case G_FCMP:
@@ -5593,6 +5617,31 @@ LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
 }
 
 LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
+                                    LLT NarrowTy) {
+  if (TypeIdx != 1)
+    return UnableToLegalize;
+
+  MachineIRBuilder &B = MIRBuilder;
+  Register ExpReg = MI.getOperand(2).getReg();
+  LLT ExpTy = MRI.getType(ExpReg);
+
+  unsigned ClampSize = NarrowTy.getScalarSizeInBits();
+
+  // Clamp the exponent to the range of the target type.
+  auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize));
+  auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp);
+  auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize));
+  auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp);
+
+  auto Trunc = B.buildTrunc(NarrowTy, Clamp);
+  Observer.changingInstr(MI);
+  MI.getOperand(2).setReg(Trunc.getReg(0));
+  Observer.changedInstr(MI);
+  return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
   const auto &TII = MIRBuilder.getTII();
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 8cf5154..d902b35 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -172,6 +172,8 @@ private:
   SDValue ExpandFCOPYSIGN(SDNode *Node) const;
   SDValue ExpandFABS(SDNode *Node) const;
   SDValue ExpandFNEG(SDNode *Node) const;
+  SDValue expandLdexp(SDNode *Node) const;
+
   SDValue ExpandLegalINT_TO_FP(SDNode *Node, SDValue &Chain);
   void PromoteLegalINT_TO_FP(SDNode *N, const SDLoc &dl,
                              SmallVectorImpl<SDValue> &Results);
@@ -2313,6 +2315,118 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
       DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo()));
 }
 
+SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const {
+  SDLoc dl(Node);
+  EVT VT = Node->getValueType(0);
+  SDValue X = Node->getOperand(0);
+  SDValue N = Node->getOperand(1);
+  EVT ExpVT = N.getValueType();
+  EVT AsIntVT = VT.changeTypeToInteger();
+  if (AsIntVT == EVT()) // TODO: How to handle f80?
+    return SDValue();
+
+  if (Node->getOpcode() == ISD::STRICT_FLDEXP) // TODO
+    return SDValue();
+
+  SDNodeFlags NSW;
+  NSW.setNoSignedWrap(true);
+  SDNodeFlags NUW_NSW;
+  NUW_NSW.setNoUnsignedWrap(true);
+  NUW_NSW.setNoSignedWrap(true);
+
+  EVT SetCCVT =
+      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ExpVT);
+  const fltSemantics &FltSem = SelectionDAG::EVTToAPFloatSemantics(VT);
+
+  const APFloat::ExponentType MaxExpVal = APFloat::semanticsMaxExponent(FltSem);
+  const APFloat::ExponentType MinExpVal = APFloat::semanticsMinExponent(FltSem);
+  const int Precision = APFloat::semanticsPrecision(FltSem);
+
+  const SDValue MaxExp = DAG.getConstant(MaxExpVal, dl, ExpVT);
+  const SDValue MinExp = DAG.getConstant(MinExpVal, dl, ExpVT);
+
+  const SDValue DoubleMaxExp = DAG.getConstant(2 * MaxExpVal, dl, ExpVT);
+
+  const APFloat One(FltSem, "1.0");
+  APFloat ScaleUpK = scalbn(One, MaxExpVal, APFloat::rmNearestTiesToEven);
+
+  // Offset by precision to avoid denormal range.
+  APFloat ScaleDownK =
+      scalbn(One, MinExpVal + Precision, APFloat::rmNearestTiesToEven);
+
+  // TODO: Should really introduce control flow and use a block for the >
+  // MaxExp, < MinExp cases
+
+  // First, handle exponents Exp > MaxExp and scale down the exponent.
+  SDValue NGtMaxExp = DAG.getSetCC(dl, SetCCVT, N, MaxExp, ISD::SETGT);
+
+  SDValue DecN0 = DAG.getNode(ISD::SUB, dl, ExpVT, N, MaxExp, NSW);
+  SDValue ClampMaxVal = DAG.getConstant(3 * MaxExpVal, dl, ExpVT);
+  SDValue ClampN_Big = DAG.getNode(ISD::SMIN, dl, ExpVT, N, ClampMaxVal);
+  SDValue DecN1 =
+      DAG.getNode(ISD::SUB, dl, ExpVT, ClampN_Big, DoubleMaxExp, NSW);
+
+  SDValue ScaleUpTwice =
+      DAG.getSetCC(dl, SetCCVT, N, DoubleMaxExp, ISD::SETUGT);
+
+  const SDValue ScaleUpVal = DAG.getConstantFP(ScaleUpK, dl, VT);
+  SDValue ScaleUp0 = DAG.getNode(ISD::FMUL, dl, VT, X, ScaleUpVal);
+  SDValue ScaleUp1 = DAG.getNode(ISD::FMUL, dl, VT, ScaleUp0, ScaleUpVal);
+
+  SDValue SelectN_Big =
+      DAG.getNode(ISD::SELECT, dl, ExpVT, ScaleUpTwice, DecN1, DecN0);
+  SDValue SelectX_Big =
+      DAG.getNode(ISD::SELECT, dl, VT, ScaleUpTwice, ScaleUp1, ScaleUp0);
+
+  // Now handle exponents Exp < MinExp.
+  SDValue NLtMinExp = DAG.getSetCC(dl, SetCCVT, N, MinExp, ISD::SETLT);
+
+  SDValue Increment0 = DAG.getConstant(-(MinExpVal + Precision), dl, ExpVT);
+  SDValue Increment1 = DAG.getConstant(-2 * (MinExpVal + Precision), dl, ExpVT);
+
+  SDValue IncN0 = DAG.getNode(ISD::ADD, dl, ExpVT, N, Increment0, NUW_NSW);
+
+  SDValue ClampMinVal =
+      DAG.getConstant(3 * MinExpVal + 2 * Precision, dl, ExpVT);
+  SDValue ClampN_Small = DAG.getNode(ISD::SMAX, dl, ExpVT, N, ClampMinVal);
+  SDValue IncN1 =
+      DAG.getNode(ISD::ADD, dl, ExpVT, ClampN_Small, Increment1, NSW);
+
+  const SDValue ScaleDownVal = DAG.getConstantFP(ScaleDownK, dl, VT);
+  SDValue ScaleDown0 = DAG.getNode(ISD::FMUL, dl, VT, X, ScaleDownVal);
+  SDValue ScaleDown1 = DAG.getNode(ISD::FMUL, dl, VT, ScaleDown0, ScaleDownVal);
+
+  SDValue ScaleDownTwice = DAG.getSetCC(
+      dl, SetCCVT, N, DAG.getConstant(2 * MinExpVal + Precision, dl, ExpVT),
+      ISD::SETULT);
+
+  SDValue SelectN_Small =
+      DAG.getNode(ISD::SELECT, dl, ExpVT, ScaleDownTwice, IncN1, IncN0);
+  SDValue SelectX_Small =
+      DAG.getNode(ISD::SELECT, dl, VT, ScaleDownTwice, ScaleDown1, ScaleDown0);
+
+  // Now combine the two out of range exponent handling cases with the base
+  // case.
+  SDValue NewX = DAG.getNode(
+      ISD::SELECT, dl, VT, NGtMaxExp, SelectX_Big,
+      DAG.getNode(ISD::SELECT, dl, VT, NLtMinExp, SelectX_Small, X));
+
+  SDValue NewN = DAG.getNode(
+      ISD::SELECT, dl, ExpVT, NGtMaxExp, SelectN_Big,
+      DAG.getNode(ISD::SELECT, dl, ExpVT, NLtMinExp, SelectN_Small, N));
+
+  SDValue BiasedN = DAG.getNode(ISD::ADD, dl, ExpVT, NewN, MaxExp, NSW);
+
+  SDValue ExponentShiftAmt =
+      DAG.getShiftAmountConstant(Precision - 1, ExpVT, dl);
+  SDValue CastExpToValTy = DAG.getZExtOrTrunc(BiasedN, dl, AsIntVT);
+
+  SDValue AsInt = DAG.getNode(ISD::SHL, dl, AsIntVT, CastExpToValTy,
+                              ExponentShiftAmt, NUW_NSW);
+  SDValue AsFP = DAG.getNode(ISD::BITCAST, dl, VT, AsInt);
+  return DAG.getNode(ISD::FMUL, dl, VT, NewX, AsFP);
+}
+
 /// This function is responsible for legalizing a
 /// INT_TO_FP operation of the specified operand when the target requests that
 /// we expand it. At this point, we know that the result and operand types are
@@ -3250,6 +3364,23 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     }
     break;
   }
+  case ISD::FLDEXP:
+  case ISD::STRICT_FLDEXP: {
+    EVT VT = Node->getValueType(0);
+    RTLIB::Libcall LC = RTLIB::getLDEXP(VT);
+    // Use the libcall instead; it is very likely faster.
+    // FIXME: Use separate LibCall action.
+    if (TLI.getLibcallName(LC))
+      break;
+
+    if (SDValue Expanded = expandLdexp(Node)) {
+      Results.push_back(Expanded);
+      if (Node->getOpcode() == ISD::STRICT_FLDEXP)
+        Results.push_back(Expanded.getValue(1));
+    }
+
+    break;
+  }
 
   case ISD::FMAD:
     llvm_unreachable("Illegal fmad should never be formed");
@@ -4142,6 +4273,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
                     RTLIB::ROUNDEVEN_F128, RTLIB::ROUNDEVEN_PPCF128, Results);
     break;
+  case ISD::FLDEXP:
+  case ISD::STRICT_FLDEXP:
+    ExpandFPLibCall(Node, RTLIB::LDEXP_F32, RTLIB::LDEXP_F64, RTLIB::LDEXP_F80,
+                    RTLIB::LDEXP_F128, RTLIB::LDEXP_PPCF128, Results);
+    break;
   case ISD::FPOWI:
   case ISD::STRICT_FPOWI: {
     RTLIB::Libcall LC = RTLIB::getPOWI(Node->getSimpleValueType(0));
@@ -4871,6 +5007,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     Results.push_back(Tmp4.getValue(1));
     break;
   case ISD::FCOPYSIGN:
+  case ISD::FLDEXP:
   case ISD::FPOWI: {
     Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
     Tmp2 = Node->getOperand(1);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 29a1951..366a8d5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -111,7 +111,9 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::STRICT_FPOW:
     case ISD::FPOW: R = SoftenFloatRes_FPOW(N); break;
     case ISD::STRICT_FPOWI:
-    case ISD::FPOWI: R = SoftenFloatRes_FPOWI(N); break;
+    case ISD::FPOWI:
+    case ISD::FLDEXP:
+    case ISD::STRICT_FLDEXP: R = SoftenFloatRes_ExpOp(N); break;
     case ISD::STRICT_FREM:
     case ISD::FREM: R = SoftenFloatRes_FREM(N); break;
     case ISD::STRICT_FRINT:
@@ -603,13 +605,17 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
                                            RTLIB::POW_PPCF128));
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatRes_ExpOp(SDNode *N) {
   bool IsStrict = N->isStrictFPOpcode();
  unsigned Offset = IsStrict ?
1 : 0; assert((N->getOperand(1 + Offset).getValueType() == MVT::i16 || N->getOperand(1 + Offset).getValueType() == MVT::i32) && "Unsupported power type!"); - RTLIB::Libcall LC = RTLIB::getPOWI(N->getValueType(0)); + bool IsPowI = + N->getOpcode() == ISD::FPOWI || N->getOpcode() == ISD::STRICT_FPOWI; + + RTLIB::Libcall LC = IsPowI ? RTLIB::getPOWI(N->getValueType(0)) + : RTLIB::getLDEXP(N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fpowi."); if (!TLI.getLibcallName(LC)) { // Some targets don't have a powi libcall; use pow instead. @@ -1274,6 +1280,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::FPOW: ExpandFloatRes_FPOW(N, Lo, Hi); break; case ISD::STRICT_FPOWI: case ISD::FPOWI: ExpandFloatRes_FPOWI(N, Lo, Hi); break; + case ISD::FLDEXP: + case ISD::STRICT_FLDEXP: ExpandFloatRes_FLDEXP(N, Lo, Hi); break; case ISD::FREEZE: ExpandFloatRes_FREEZE(N, Lo, Hi); break; case ISD::STRICT_FRINT: case ISD::FRINT: ExpandFloatRes_FRINT(N, Lo, Hi); break; @@ -1569,6 +1577,11 @@ void DAGTypeLegalizer::ExpandFloatRes_FPOWI(SDNode *N, ExpandFloatRes_Binary(N, RTLIB::getPOWI(N->getValueType(0)), Lo, Hi); } +void DAGTypeLegalizer::ExpandFloatRes_FLDEXP(SDNode *N, SDValue &Lo, + SDValue &Hi) { + ExpandFloatRes_Binary(N, RTLIB::getLDEXP(N->getValueType(0)), Lo, Hi); +} + void DAGTypeLegalizer::ExpandFloatRes_FREEZE(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(N->getValueType(0) == MVT::ppcf128 && @@ -2310,7 +2323,8 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMA: // FMA is same as FMAD case ISD::FMAD: R = PromoteFloatRes_FMAD(N); break; - case ISD::FPOWI: R = PromoteFloatRes_FPOWI(N); break; + case ISD::FPOWI: + case ISD::FLDEXP: R = PromoteFloatRes_ExpOp(N); break; case ISD::FP_ROUND: R = PromoteFloatRes_FP_ROUND(N); break; case ISD::LOAD: R = PromoteFloatRes_LOAD(N); break; @@ -2479,7 +2493,7 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_FMAD(SDNode *N) { } // Promote the Float (first) operand and retain the Integer (second) operand -SDValue DAGTypeLegalizer::PromoteFloatRes_FPOWI(SDNode *N) { +SDValue DAGTypeLegalizer::PromoteFloatRes_ExpOp(SDNode *N) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Op0 = GetPromotedFloat(N->getOperand(0)); @@ -2676,7 +2690,8 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FMA: // FMA is same as FMAD case ISD::FMAD: R = SoftPromoteHalfRes_FMAD(N); break; - case ISD::FPOWI: R = SoftPromoteHalfRes_FPOWI(N); break; + case ISD::FPOWI: + case ISD::FLDEXP: R = SoftPromoteHalfRes_ExpOp(N); break; case ISD::LOAD: R = SoftPromoteHalfRes_LOAD(N); break; case ISD::SELECT: R = SoftPromoteHalfRes_SELECT(N); break; @@ -2788,7 +2803,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) { return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res); } -SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FPOWI(SDNode *N) { +SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ExpOp(SDNode *N) { EVT OVT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 21e117f..1155c3c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1714,10 +1714,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) 
{ case ISD::SDIVFIXSAT: case ISD::UDIVFIX: case ISD::UDIVFIXSAT: Res = PromoteIntOp_FIX(N); break; - case ISD::FPOWI: - case ISD::STRICT_FPOWI: Res = PromoteIntOp_FPOWI(N); break; - + case ISD::STRICT_FPOWI: + case ISD::FLDEXP: + case ISD::STRICT_FLDEXP: Res = PromoteIntOp_ExpOp(N); break; case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_MUL: case ISD::VECREDUCE_AND: @@ -2201,26 +2201,29 @@ SDValue DAGTypeLegalizer::PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo) { 0); } -SDValue DAGTypeLegalizer::PromoteIntOp_FPOWI(SDNode *N) { +SDValue DAGTypeLegalizer::PromoteIntOp_ExpOp(SDNode *N) { bool IsStrict = N->isStrictFPOpcode(); SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); - // The integer operand is the last operand in FPOWI (so the result and - // floating point operand is already type legalized). + bool IsPowI = + N->getOpcode() == ISD::FPOWI || N->getOpcode() == ISD::STRICT_FPOWI; + + // The integer operand is the last operand in FPOWI (or FLDEXP) (so the result + // and floating point operand is already type legalized). + RTLIB::Libcall LC = IsPowI ? RTLIB::getPOWI(N->getValueType(0)) + : RTLIB::getLDEXP(N->getValueType(0)); + + if (LC == RTLIB::UNKNOWN_LIBCALL || !TLI.getLibcallName(LC)) { + SDValue Op = SExtPromotedInteger(N->getOperand(1)); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op), 0); + } // We can't just promote the exponent type in FPOWI, since we want to lower // the node to a libcall and we if we promote to a type larger than // sizeof(int) the libcall might not be according to the targets ABI. Instead // we rewrite to a libcall here directly, letting makeLibCall handle promotion // if the target accepts it according to shouldSignExtendTypeInLibCall. - RTLIB::Libcall LC = RTLIB::getPOWI(N->getValueType(0)); - assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fpowi."); - if (!TLI.getLibcallName(LC)) { - // Some targets don't have a powi libcall; use pow instead. - // FIXME: Implement this if some target needs it. - DAG.getContext()->emitError("Don't know how to promote fpowi to fpow"); - return DAG.getUNDEF(N->getValueType(0)); - } + unsigned OpOffset = IsStrict ? 1 : 0; // The exponent should fit in a sizeof(int) type for the libcall to be valid. 
assert(DAG.getLibInfo().getIntSize() == diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index e73b6b1..0611cde 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -400,7 +400,7 @@ private: SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N); SDValue PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_FIX(SDNode *N); - SDValue PromoteIntOp_FPOWI(SDNode *N); + SDValue PromoteIntOp_ExpOp(SDNode *N); SDValue PromoteIntOp_VECREDUCE(SDNode *N); SDValue PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SET_ROUNDING(SDNode *N); @@ -563,7 +563,7 @@ private: SDValue SoftenFloatRes_BF16_TO_FP(SDNode *N); SDValue SoftenFloatRes_FP_ROUND(SDNode *N); SDValue SoftenFloatRes_FPOW(SDNode *N); - SDValue SoftenFloatRes_FPOWI(SDNode *N); + SDValue SoftenFloatRes_ExpOp(SDNode *N); SDValue SoftenFloatRes_FREEZE(SDNode *N); SDValue SoftenFloatRes_FREM(SDNode *N); SDValue SoftenFloatRes_FRINT(SDNode *N); @@ -641,6 +641,7 @@ private: void ExpandFloatRes_FP_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FPOW (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FPOWI (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_FLDEXP (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FREM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FRINT (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -690,7 +691,7 @@ private: SDValue PromoteFloatRes_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteFloatRes_FCOPYSIGN(SDNode *N); SDValue PromoteFloatRes_FMAD(SDNode *N); - SDValue PromoteFloatRes_FPOWI(SDNode *N); + SDValue PromoteFloatRes_ExpOp(SDNode *N); SDValue PromoteFloatRes_FP_ROUND(SDNode *N); SDValue PromoteFloatRes_LOAD(SDNode *N); SDValue PromoteFloatRes_SELECT(SDNode *N); @@ -731,7 +732,7 @@ private: SDValue SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SoftPromoteHalfRes_FCOPYSIGN(SDNode *N); SDValue SoftPromoteHalfRes_FMAD(SDNode *N); - SDValue SoftPromoteHalfRes_FPOWI(SDNode *N); + SDValue SoftPromoteHalfRes_ExpOp(SDNode *N); SDValue SoftPromoteHalfRes_FP_ROUND(SDNode *N); SDValue SoftPromoteHalfRes_LOAD(SDNode *N); SDValue SoftPromoteHalfRes_SELECT(SDNode *N); @@ -784,7 +785,7 @@ private: SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N); SDValue ScalarizeVecRes_FP_ROUND(SDNode *N); - SDValue ScalarizeVecRes_FPOWI(SDNode *N); + SDValue ScalarizeVecRes_ExpOp(SDNode *N); SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N); SDValue ScalarizeVecRes_LOAD(LoadSDNode *N); SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); @@ -860,8 +861,7 @@ private: void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); @@ -906,7 +906,7 @@ private: SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N); SDValue SplitVecOp_VSETCC(SDNode *N); SDValue SplitVecOp_FP_ROUND(SDNode 
*N); - SDValue SplitVecOp_FCOPYSIGN(SDNode *N); + SDValue SplitVecOp_FPOpDifferentTypes(SDNode *N); SDValue SplitVecOp_FP_TO_XINT_SAT(SDNode *N); //===--------------------------------------------------------------------===// @@ -982,7 +982,7 @@ private: SDValue WidenVecRes_FP_TO_XINT_SAT(SDNode *N); SDValue WidenVecRes_FCOPYSIGN(SDNode *N); SDValue WidenVecRes_IS_FPCLASS(SDNode *N); - SDValue WidenVecRes_POWI(SDNode *N); + SDValue WidenVecRes_ExpOp(SDNode *N); SDValue WidenVecRes_Unary(SDNode *N); SDValue WidenVecRes_InregOp(SDNode *N); @@ -1012,6 +1012,7 @@ private: SDValue WidenVecOp_VECREDUCE(SDNode *N); SDValue WidenVecOp_VECREDUCE_SEQ(SDNode *N); SDValue WidenVecOp_VP_REDUCE(SDNode *N); + SDValue WidenVecOp_ExpOp(SDNode *N); /// Helper function to generate a set of operations to perform /// a vector operation for a wider type. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 6c03f2d..5980383 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -377,6 +377,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FSQRT: case ISD::FSIN: case ISD::FCOS: + case ISD::FLDEXP: case ISD::FPOWI: case ISD::FPOW: case ISD::FLOG: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index d947e76..e2e1a83 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -57,7 +57,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break; - case ISD::FPOWI: R = ScalarizeVecRes_FPOWI(N); break; + case ISD::FPOWI: R = ScalarizeVecRes_ExpOp(N); break; case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break; case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast(N));break; case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break; @@ -126,6 +126,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::FMAXNUM_IEEE: case ISD::FMINIMUM: case ISD::FMAXIMUM: + case ISD::FLDEXP: case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: @@ -348,10 +349,10 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) { N->getOperand(1)); } -SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) { +SDValue DAGTypeLegalizer::ScalarizeVecRes_ExpOp(SDNode *N) { SDValue Op = GetScalarizedVector(N->getOperand(0)); - return DAG.getNode(ISD::FPOWI, SDLoc(N), - Op.getValueType(), Op, N->getOperand(1)); + return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, + N->getOperand(1)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) { @@ -960,8 +961,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break; case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break; case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break; - case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break; - case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break; + case ISD::FPOWI: + case ISD::FLDEXP: + case ISD::FCOPYSIGN: SplitVecRes_FPOp_MultiType(N, Lo, Hi); break; case ISD::IS_FPCLASS: SplitVecRes_IS_FPCLASS(N, Lo, Hi); 
break; case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break; case ISD::SPLAT_VECTOR: @@ -1463,16 +1465,11 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MPI, SmallestAlign); } -void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, - SDValue &Hi) { - SDLoc dl(N); - GetSplitVector(N->getOperand(0), Lo, Hi); - Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1)); - Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1)); -} - -void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, - SDValue &Hi) { +// Handle splitting an FP where the second operand does not match the first +// type. The second operand may be a scalar, or a vector that has exactly as +// many elements as the first +void DAGTypeLegalizer::SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, + SDValue &Hi) { SDValue LHSLo, LHSHi; GetSplitVector(N->getOperand(0), LHSLo, LHSHi); SDLoc DL(N); @@ -1480,14 +1477,18 @@ void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue RHSLo, RHSHi; SDValue RHS = N->getOperand(1); EVT RHSVT = RHS.getValueType(); - if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector) - GetSplitVector(RHS, RHSLo, RHSHi); - else - std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS)); - + if (RHSVT.isVector()) { + if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector) + GetSplitVector(RHS, RHSLo, RHSHi); + else + std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS)); - Lo = DAG.getNode(ISD::FCOPYSIGN, DL, LHSLo.getValueType(), LHSLo, RHSLo); - Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHi.getValueType(), LHSHi, RHSHi); + Lo = DAG.getNode(N->getOpcode(), DL, LHSLo.getValueType(), LHSLo, RHSLo); + Hi = DAG.getNode(N->getOpcode(), DL, LHSHi.getValueType(), LHSHi, RHSHi); + } else { + Lo = DAG.getNode(N->getOpcode(), DL, LHSLo.getValueType(), LHSLo, RHS); + Hi = DAG.getNode(N->getOpcode(), DL, LHSHi.getValueType(), LHSHi, RHS); + } } void DAGTypeLegalizer::SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, @@ -2846,7 +2847,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::STRICT_FP_ROUND: case ISD::VP_FP_ROUND: case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break; - case ISD::FCOPYSIGN: Res = SplitVecOp_FCOPYSIGN(N); break; + case ISD::FCOPYSIGN: Res = SplitVecOp_FPOpDifferentTypes(N); break; case ISD::STORE: Res = SplitVecOp_STORE(cast(N), OpNo); break; @@ -2900,6 +2901,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::FTRUNC: Res = SplitVecOp_UnaryOp(N); break; + case ISD::FLDEXP: + Res = SplitVecOp_FPOpDifferentTypes(N); + break; case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: @@ -3845,10 +3849,12 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); } -SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) { - // The result (and the first input) has a legal vector type, but the second - // input needs splitting. - +// Split a vector type in an FP binary operation where the second operand has a +// different type from the first. +// +// The result (and the first input) has a legal vector type, but the second +// input needs splitting. 
+SDValue DAGTypeLegalizer::SplitVecOp_FPOpDifferentTypes(SDNode *N) { SDLoc DL(N); EVT LHSLoVT, LHSHiVT; @@ -3864,8 +3870,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) { SDValue RHSLo, RHSHi; std::tie(RHSLo, RHSHi) = DAG.SplitVector(N->getOperand(1), DL); - SDValue Lo = DAG.getNode(ISD::FCOPYSIGN, DL, LHSLoVT, LHSLo, RHSLo); - SDValue Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHiVT, LHSHi, RHSHi); + SDValue Lo = DAG.getNode(N->getOpcode(), DL, LHSLoVT, LHSLo, RHSLo); + SDValue Hi = DAG.getNode(N->getOpcode(), DL, LHSHiVT, LHSHi, RHSHi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Lo, Hi); } @@ -4075,8 +4081,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_IS_FPCLASS(N); break; + case ISD::FLDEXP: case ISD::FPOWI: - Res = WidenVecRes_POWI(N); + Res = WidenVecRes_ExpOp(N); break; case ISD::ANY_EXTEND_VECTOR_INREG: @@ -4433,10 +4440,18 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) { for (unsigned i = 1; i < NumOpers; ++i) { SDValue Oper = N->getOperand(i); - if (Oper.getValueType().isVector()) { - assert(Oper.getValueType() == N->getValueType(0) && - "Invalid operand type to widen!"); - Oper = GetWidenedVector(Oper); + EVT OpVT = Oper.getValueType(); + if (OpVT.isVector()) { + if (getTypeAction(OpVT) == TargetLowering::TypeWidenVector) + Oper = GetWidenedVector(Oper); + else { + EVT WideOpVT = + EVT::getVectorVT(*DAG.getContext(), OpVT.getVectorElementType(), + WidenVT.getVectorElementCount()); + Oper = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + DAG.getUNDEF(WideOpVT), Oper, + DAG.getVectorIdxConstant(0, dl)); + } } InOps.push_back(Oper); @@ -4454,9 +4469,14 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) { for (unsigned i = 0; i < NumOpers; ++i) { SDValue Op = InOps[i]; - if (Op.getValueType().isVector()) - Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op, + EVT OpVT = Op.getValueType(); + if (OpVT.isVector()) { + EVT OpExtractVT = + EVT::getVectorVT(*DAG.getContext(), OpVT.getVectorElementType(), + VT.getVectorElementCount()); + Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpExtractVT, Op, DAG.getVectorIdxConstant(Idx, dl)); + } EOps.push_back(Op); } @@ -4480,8 +4500,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) { for (unsigned i = 0; i < NumOpers; ++i) { SDValue Op = InOps[i]; - if (Op.getValueType().isVector()) - Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, Op, + EVT OpVT = Op.getValueType(); + if (OpVT.isVector()) + Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + OpVT.getVectorElementType(), Op, DAG.getVectorIdxConstant(Idx, dl)); EOps.push_back(Op); @@ -4790,11 +4812,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_IS_FPCLASS(SDNode *N) { N->getFlags()); } -SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) { +SDValue DAGTypeLegalizer::WidenVecRes_ExpOp(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); - SDValue ShOp = N->getOperand(1); - return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp); + SDValue RHS = N->getOperand(1); + SDValue ExpOp = RHS.getValueType().isVector() ? 
GetWidenedVector(RHS) : RHS; + + return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ExpOp); } SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 12d47a0..7a162e2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4977,7 +4977,8 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const case ISD::FROUND: case ISD::FROUNDEVEN: case ISD::FRINT: - case ISD::FNEARBYINT: { + case ISD::FNEARBYINT: + case ISD::FLDEXP: { if (SNaN) return true; return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 109cd12..83aacd6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6447,6 +6447,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), Flags)); return; + case Intrinsic::ldexp: + setValue(&I, DAG.getNode(ISD::FLDEXP, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), Flags)); + return; case Intrinsic::arithmetic_fence: { setValue(&I, DAG.getNode(ISD::ARITH_FENCE, sdl, getValue(I.getArgOperand(0)).getValueType(), @@ -8635,6 +8641,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { if (visitUnaryFloatCall(I, ISD::FEXP2)) return; break; + case LibFunc_ldexp: + case LibFunc_ldexpf: + case LibFunc_ldexpl: + if (visitBinaryFloatCall(I, ISD::FLDEXP)) + return; + break; case LibFunc_memcmp: if (visitMemCmpBCmpCall(I)) return; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 8cc0e9a..0767d8b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -283,6 +283,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::UMIN: return "umin"; case ISD::UMAX: return "umax"; + case ISD::FLDEXP: return "fldexp"; + case ISD::STRICT_FLDEXP: return "strict_fldexp"; case ISD::FPOWI: return "fpowi"; case ISD::STRICT_FPOWI: return "strict_fpowi"; case ISD::SETCC: return "setcc"; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index f5d2b70..c32f861 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -209,6 +209,13 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) { if (TT.isOSOpenBSD()) { setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr); } + + if (TT.isOSWindows() && !TT.isOSCygMing()) { + setLibcallName(RTLIB::LDEXP_F32, nullptr); + setLibcallName(RTLIB::LDEXP_F80, nullptr); + setLibcallName(RTLIB::LDEXP_F128, nullptr); + setLibcallName(RTLIB::LDEXP_PPCF128, nullptr); + } } /// GetFPLibCall - Helper to return the right libcall for the given floating @@ -498,6 +505,11 @@ RTLIB::Libcall RTLIB::getPOWI(EVT RetVT) { POWI_PPCF128); } +RTLIB::Libcall RTLIB::getLDEXP(EVT RetVT) { + return getFPLibCall(RetVT, LDEXP_F32, LDEXP_F64, LDEXP_F80, LDEXP_F128, + LDEXP_PPCF128); +} + RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT) { unsigned ModeN, ModelN; @@ -845,7 +857,8 @@ void TargetLoweringBase::initActions() { 
setOperationAction({ISD::BITREVERSE, ISD::PARITY}, VT, Expand); // These library functions default to expand. - setOperationAction({ISD::FROUND, ISD::FROUNDEVEN, ISD::FPOWI}, VT, Expand); + setOperationAction({ISD::FROUND, ISD::FROUNDEVEN, ISD::FPOWI, ISD::FLDEXP}, + VT, Expand); // These operations default to expand for vector types. if (VT.isVector()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b50687b..86abd0f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -168,6 +168,7 @@ bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const { case ISD::FFLOOR: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FLDEXP: case AMDGPUISD::FRACT: case AMDGPUISD::CLAMP: case AMDGPUISD::COS_HW: @@ -179,7 +180,6 @@ bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const { case AMDGPUISD::RCP: case AMDGPUISD::RSQ: case AMDGPUISD::RCP_IFLAG: - case AMDGPUISD::LDEXP: // On gfx10, all 16-bit instructions preserve the high bits. return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9; case ISD::FP_ROUND: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3def0c4..91c5257 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2613,7 +2613,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, ShAmt); // On GCN, use LDEXP directly. if (Subtarget->isGCN()) - return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt); + return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt); // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent // part directly to emulate the multiplication of 2^ShAmt. That 8-bit @@ -2646,7 +2646,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); - SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, + SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi, DAG.getConstant(32, SL, MVT::i32)); // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); @@ -4637,7 +4637,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RCP_IFLAG) NODE_NAME_CASE(FMUL_LEGACY) NODE_NAME_CASE(RSQ_CLAMP) - NODE_NAME_CASE(LDEXP) NODE_NAME_CASE(FP_CLASS) NODE_NAME_CASE(DOT4) NODE_NAME_CASE(CARRY) @@ -5044,7 +5043,7 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, // TODO: Need is known positive check. 
return false; } - case AMDGPUISD::LDEXP: + case ISD::FLDEXP: case AMDGPUISD::FRACT: { if (SNaN) return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 81a9de8..99bb63b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -438,7 +438,6 @@ enum NodeType : unsigned { RCP_IFLAG, FMUL_LEGACY, RSQ_CLAMP, - LDEXP, FP_CLASS, DOT4, CARRY, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index d1e19e6..8dcccdd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -18,10 +18,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> ]>; -def AMDGPULdExpOp : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] ->; - def AMDGPUFPClassOp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] >; @@ -128,8 +124,6 @@ def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; -def AMDGPUldexp_impl : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; - def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; @@ -389,10 +383,6 @@ def AMDGPUcos : PatFrags<(ops node:$src), [(int_amdgcn_cos node:$src), def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src), (AMDGPUfract_impl node:$src)]>; -def AMDGPUldexp : PatFrags<(ops node:$src0, node:$src1), - [(int_amdgcn_ldexp node:$src0, node:$src1), - (AMDGPUldexp_impl node:$src0, node:$src1)]>; - def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1), [(int_amdgcn_class node:$src0, node:$src1), (AMDGPUfp_class_impl node:$src0, node:$src1)]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index c8485ea..de5778f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -911,6 +911,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor({S32, S64, S16}) .scalarize(0) .clampScalar(0, S16, S64); + + getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) + .legalFor({{S32, S32}, {S64, S32}, {S16, S16}}) + .scalarize(0) + .maxScalarIf(typeIs(0, S16), 1, S16) + .clampScalar(1, S32, S32) + .lower(); } else { getActionDefinitionsBuilder(G_FSQRT) .legalFor({S32, S64}) @@ -929,6 +936,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .clampScalar(0, S32, S64); } + + getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) + .legalFor({{S32, S32}, {S64, S32}}) + .scalarize(0) + .clampScalar(0, S32, S64) + .clampScalar(1, S32, S32) + .lower(); } getActionDefinitionsBuilder(G_FPTRUNC) @@ -2373,9 +2387,7 @@ bool AMDGPULegalizerInfo::legalizeITOFP( : B.buildUITOFP(S64, Unmerge.getReg(1)); auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); - auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) - .addUse(CvtHi.getReg(0)) - .addUse(ThirtyTwo.getReg(0)); + auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo); // TODO: Should this propagate fast-math-flags? 
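+ // The i64 source was unmerged as x = Hi * 2^32 + Lo, with Lo always treated + // as unsigned and Hi carrying the sign in the signed case. Scaling CvtHi by + // 2^32 via ldexp is exact, so the fadd below performs the single rounding + // step; e.g. x = -1 splits into Hi = -1, Lo = 0xffffffff: -2^32 + 4294967295.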
B.buildFAdd(Dst, LdExp, CvtLo); @@ -2406,10 +2418,7 @@ bool AMDGPULegalizerInfo::legalizeITOFP( auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); - B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef{Dst}, - /*HasSideEffects=*/false) - .addUse(FVal.getReg(0)) - .addUse(Scale.getReg(0)); + B.buildFLdexp(Dst, FVal, Scale); MI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index e02dc78..4ffbac0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3740,6 +3740,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FPEXT: case AMDGPU::G_FEXP2: case AMDGPU::G_FLOG2: + case AMDGPU::G_FLDEXP: case AMDGPU::G_FMINNUM: case AMDGPU::G_FMAXNUM: case AMDGPU::G_FMINNUM_IEEE: @@ -3750,6 +3751,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_STRICT_FSUB: case AMDGPU::G_STRICT_FMUL: case AMDGPU::G_STRICT_FMA: + case AMDGPU::G_STRICT_FLDEXP: case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? case AMDGPU::G_FSHR: // TODO: Expand for scalar case AMDGPU::G_AMDGPU_FMIN_LEGACY: @@ -4213,7 +4215,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_rsq_clamp: case Intrinsic::amdgcn_fmul_legacy: case Intrinsic::amdgcn_fma_legacy: - case Intrinsic::amdgcn_ldexp: case Intrinsic::amdgcn_frexp_mant: case Intrinsic::amdgcn_frexp_exp: case Intrinsic::amdgcn_fract: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 7536d3f..f090b0c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -472,6 +472,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, MVT::f64, Custom); setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64}, + Legal); setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); @@ -528,7 +530,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // F16 - VOP2 Actions. setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand); - + setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom); setOperationAction(ISD::FDIV, MVT::f16, Custom); // F16 - VOP3 Actions. @@ -4843,6 +4845,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMINNUM: case ISD::FMAXNUM: return lowerFMINNUM_FMAXNUM(Op, DAG); + case ISD::FLDEXP: + case ISD::STRICT_FLDEXP: + return lowerFLDEXP(Op, DAG); case ISD::FMA: return splitTernaryVectorOp(Op, DAG); case ISD::FP_TO_SINT: @@ -5464,6 +5469,40 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, return Op; } +SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP; + EVT VT = Op.getValueType(); + assert(VT == MVT::f16); + + SDValue Exp = Op.getOperand(IsStrict ? 2 : 1); + EVT ExpVT = Exp.getValueType(); + if (ExpVT == MVT::i16) + return Op; + + SDLoc DL(Op); + + // Correct the exponent type for f16 to i16. + // Clamp the range of the exponent to the instruction's range. 
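+ // Clamping to [minIntN(16), maxIntN(16)] cannot change the result: any + // finite nonzero half is already pinned to +/-0 or +/-inf once |exp| + // exceeds roughly 40 (half spans 2^-24 to just under 2^16), so e.g. + // ldexp(1.0h, 1 << 20) and the clamped ldexp(1.0h, 32767) both yield +inf.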
+ + // TODO: This should be a generic narrowing legalization, and can easily be + // done for GlobalISel. + + SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT); + SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp); + + SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT); + SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp); + + SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp); + + if (IsStrict) { + return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1), TruncExp}); + } + + return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp); +} + SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc SL(Op); @@ -7151,8 +7190,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return emitRemovedIntrinsicError(DAG, DL, VT); } case Intrinsic::amdgcn_ldexp: - return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, - Op.getOperand(1), Op.getOperand(2)); + return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(1), Op.getOperand(2)); case Intrinsic::amdgcn_fract: return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); @@ -10379,6 +10417,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case ISD::FREM: case ISD::FP_ROUND: case ISD::FP_EXTEND: + case ISD::FLDEXP: case AMDGPUISD::FMUL_LEGACY: case AMDGPUISD::FMAD_FTZ: case AMDGPUISD::RCP: @@ -10390,7 +10429,6 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case AMDGPUISD::DIV_FMAS: case AMDGPUISD::DIV_FIXUP: case AMDGPUISD::FRACT: - case AMDGPUISD::LDEXP: case AMDGPUISD::CVT_PKRTZ_F16_F32: case AMDGPUISD::CVT_F32_UBYTE0: case AMDGPUISD::CVT_F32_UBYTE1: @@ -11976,12 +12014,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFCanonicalizeCombine(N, DCI); case AMDGPUISD::RCP: return performRcpCombine(N, DCI); + case ISD::FLDEXP: case AMDGPUISD::FRACT: case AMDGPUISD::RSQ: case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::RCP_IFLAG: - case AMDGPUISD::RSQ_CLAMP: - case AMDGPUISD::LDEXP: { + case AMDGPUISD::RSQ_CLAMP: { // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted SDValue Src = N->getOperand(0); if (Src.isUndef()) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 75215a7..4a18c34 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -141,6 +141,7 @@ private: /// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 9442b1f..35429a4 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -777,7 +777,7 @@ let IsNeverUniform = 1 in { defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>; defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>; } // End IsNeverUniform = 1 -defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>; +defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, any_fldexp>; let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>; @@ -863,7 +863,7 @@ def : divergent_i64_BinOp ; // 16-Bit Operand Instructions //===----------------------------------------------------------------------===// -def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16 { +def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16 { // The ldexp.f16 intrinsic expects an i32 src1 operand, though the hardware // encoding treats src1 as an f16 let Src1RC32 = RegisterOperand; @@ -874,9 +874,9 @@ def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16 { let isReMaterializable = 1 in { let FPDPRounding = 1 in { let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts] in - defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; + defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I16, any_fldexp>; let SubtargetPredicate = HasTrue16BitInsts in - defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16, AMDGPUldexp>; + defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16, any_fldexp>; } // End FPDPRounding = 1 // FIXME VOP3 Only instructions.
NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instructions defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 8216139..b19ae97 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -219,7 +219,7 @@ defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdi let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in { defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile, AMDGPUdiv_fixup>; - defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile, AMDGPUldexp>; + defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile, any_fldexp>; } // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 } // End isReMaterializable = 1 diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index d2b67de..3b05709 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1635,7 +1635,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, ISD::FCOS, ISD::FPOW, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT, ISD::FROUND, ISD::FFLOOR, - ISD::FMINNUM, ISD::FMAXNUM, ISD::FSINCOS, + ISD::FMINNUM, ISD::FMAXNUM, ISD::FSINCOS, ISD::FLDEXP, // Misc: ISD::BR_CC, ISD::SELECT_CC, ISD::ConstantPool, // Vector: diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index dadacf0..012052a 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -849,6 +849,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FLDEXP, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index cd6ba54..c2638b3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -619,6 +619,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FROUND, VT, Action); setOperationAction(ISD::FROUNDEVEN, VT, Action); setOperationAction(ISD::FTRUNC, VT, Action); + setOperationAction(ISD::FLDEXP, VT, Action); }; if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -678,6 +679,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote); setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FLDEXP, MVT::f16, Promote); setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote); setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote); setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index e206ec9..13d4395 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++
b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -470,6 +470,9 @@ # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FLDEXP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FNEG (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. the first uncovered type index: 1, OK @@ -653,6 +656,9 @@ # DEBUG-NEXT: G_STRICT_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_STRICT_FLDEXP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_READ_REGISTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.mir deleted file mode 100644 index f107a4b..0000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.mir +++ /dev/null @@ -1,134 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s - ---- -name: ldexp_s32_vsv -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $sgpr0, $vgpr0 - ; GCN-LABEL: name: ldexp_s32_vsv - ; GCN: liveins: $sgpr0, $vgpr0 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %2:vgpr_32 = nofpexcept V_LDEXP_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit %2 - %0:sgpr(s32) = COPY $sgpr0 - %1:vgpr(s32) = COPY $vgpr0 - %2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %0, %1 - S_ENDPGM 0, implicit %2 -... - ---- -name: ldexp_s32_vvs -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $sgpr0, $vgpr0 - ; GCN-LABEL: name: ldexp_s32_vvs - ; GCN: liveins: $sgpr0, $vgpr0 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: %2:vgpr_32 = nofpexcept V_LDEXP_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit %2 - %0:vgpr(s32) = COPY $vgpr0 - %1:sgpr(s32) = COPY $sgpr0 - %2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %0, %1 - S_ENDPGM 0, implicit %2 -... 
- ---- -name: ldexp_s32_vvv -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0, $vgpr1 - ; GCN-LABEL: name: ldexp_s32_vvv - ; GCN: liveins: $vgpr0, $vgpr1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GCN-NEXT: %2:vgpr_32 = nofpexcept V_LDEXP_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit %2 - %0:vgpr(s32) = COPY $vgpr0 - %1:vgpr(s32) = COPY $vgpr1 - %2:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %0, %1 - S_ENDPGM 0, implicit %2 -... - ---- -name: ldexp_s64_vsv -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1, $vgpr0 - ; GCN-LABEL: name: ldexp_s64_vsv - ; GCN: liveins: $sgpr0_sgpr1, $vgpr0 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %2:vreg_64 = nofpexcept V_LDEXP_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit %2 - %0:sgpr(s64) = COPY $sgpr0_sgpr1 - %1:vgpr(s32) = COPY $vgpr0 - %2:vgpr(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %0, %1 - S_ENDPGM 0, implicit %2 -... - ---- -name: ldexp_s64_vvs -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $sgpr0_sgpr1, $vgpr0 - ; GCN-LABEL: name: ldexp_s64_vvs - ; GCN: liveins: $sgpr0_sgpr1, $vgpr0 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: %2:vreg_64 = nofpexcept V_LDEXP_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit %2 - %0:vgpr(s64) = COPY $vgpr0_vgpr1 - %1:sgpr(s32) = COPY $sgpr0 - %2:vgpr(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %0, %1 - S_ENDPGM 0, implicit %2 -... - ---- -name: ldexp_s64_vvv -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1, $vgpr2 - ; GCN-LABEL: name: ldexp_s64_vvv - ; GCN: liveins: $vgpr0_vgpr1, $vgpr2 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GCN-NEXT: %2:vreg_64 = nofpexcept V_LDEXP_F64_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit %2 - %0:vgpr(s64) = COPY $vgpr0_vgpr1 - %1:vgpr(s32) = COPY $vgpr2 - %2:vgpr(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %0, %1 - S_ENDPGM 0, implicit %2 -... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.s16.mir deleted file mode 100644 index 8cb0885..0000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.ldexp.s16.mir +++ /dev/null @@ -1,76 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o /dev/null 2>&1 | FileCheck -check-prefix=SI-ERR %s - -# SI-ERR: remark: :0:0: cannot select: %3:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %2:sgpr(s16), %1:vgpr(s32) (in function: ldexp_s16_vsv) -# SI-ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %2:vgpr(s16), %1:sgpr(s32) (in function: ldexp_s16_vvs) -# SI-ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %2:vgpr(s16), %1:vgpr(s32) (in function: ldexp_s16_vvv) - ---- -name: ldexp_s16_vsv -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $sgpr0, $vgpr0 - ; GCN-LABEL: name: ldexp_s16_vsv - ; GCN: liveins: $sgpr0, $vgpr0 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[V_LDEXP_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit [[V_LDEXP_F16_e64_]] - %0:sgpr(s32) = COPY $sgpr0 - %1:vgpr(s32) = COPY $vgpr0 - %2:sgpr(s16) = G_TRUNC %0 - %3:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %2, %1 - S_ENDPGM 0, implicit %3 -... - ---- -name: ldexp_s16_vvs -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $sgpr0, $vgpr0 - ; GCN-LABEL: name: ldexp_s16_vvs - ; GCN: liveins: $sgpr0, $vgpr0 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[V_LDEXP_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit [[V_LDEXP_F16_e64_]] - %0:vgpr(s32) = COPY $vgpr0 - %1:sgpr(s32) = COPY $sgpr0 - %2:vgpr(s16) = G_TRUNC %0 - %3:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %2, %1 - S_ENDPGM 0, implicit %3 -... - ---- -name: ldexp_s16_vvv -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0, $vgpr1 - ; GCN-LABEL: name: ldexp_s16_vvv - ; GCN: liveins: $vgpr0, $vgpr1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GCN-NEXT: [[V_LDEXP_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LDEXP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit [[V_LDEXP_F16_e64_]] - %0:vgpr(s32) = COPY $vgpr0 - %1:vgpr(s32) = COPY $vgpr1 - %2:vgpr(s16) = G_TRUNC %0 - %3:vgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), %2, %1 - S_ENDPGM 0, implicit %3 -... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir index ccd4509..4cbdea64 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir @@ -132,8 +132,8 @@ body: | ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) - ; GFX6-NEXT: $vgpr0 = COPY [[INT1]](s32) + ; GFX6-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[SITOFP]], [[SUB1]](s32) + ; GFX6-NEXT: $vgpr0 = COPY [[FLDEXP]](s32) ; GFX8-LABEL: name: test_sitofp_s64_to_s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -154,8 +154,8 @@ body: | ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[INT1]](s32) + ; GFX8-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[SITOFP]], [[SUB1]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[FLDEXP]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_SITOFP %0 $vgpr0 = COPY %1 @@ -175,8 +175,8 @@ body: | ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; GFX6-NEXT: [[SITOFP:%[0-9]+]]:_(s64) = G_SITOFP [[UV1]](s32) ; GFX6-NEXT: [[UITOFP:%[0-9]+]]:_(s64) = G_UITOFP [[UV]](s32) - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s64), [[C]](s32) - ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[INT]], [[UITOFP]] + ; GFX6-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[SITOFP]], [[C]](s32) + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FLDEXP]], [[UITOFP]] ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[FADD]](s64) ; GFX8-LABEL: name: test_sitofp_s64_to_s64 ; GFX8: liveins: $vgpr0_vgpr1 @@ -186,8 +186,8 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; GFX8-NEXT: [[SITOFP:%[0-9]+]]:_(s64) = G_SITOFP [[UV1]](s32) ; GFX8-NEXT: [[UITOFP:%[0-9]+]]:_(s64) = G_UITOFP [[UV]](s32) - ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s64), [[C]](s32) - ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[INT]], [[UITOFP]] + ; GFX8-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[SITOFP]], [[C]](s32) + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FLDEXP]], [[UITOFP]] ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FADD]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_SITOFP %0 @@ -476,8 +476,8 @@ body: | ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) - ; GFX6-NEXT: $vgpr0 = COPY [[INT1]](s32) + ; GFX6-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[SITOFP]], [[SUB1]](s32) + ; GFX6-NEXT: $vgpr0 = COPY [[FLDEXP]](s32) ; GFX8-LABEL: name: test_sitofp_s33_to_s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -499,8 +499,8 @@ body: | ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) 
= G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[INT1]](s32) + ; GFX8-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[SITOFP]], [[SUB1]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[FLDEXP]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s33) = G_TRUNC %0 %2:_(s32) = G_SITOFP %1 @@ -533,8 +533,8 @@ body: | ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) + ; GFX6-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[SITOFP]], [[SUB1]](s32) + ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP]](s32) ; GFX6-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; GFX6-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX8-LABEL: name: test_sitofp_s64_to_s16 @@ -557,8 +557,8 @@ body: | ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) - ; GFX8-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) + ; GFX8-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[SITOFP]], [[SUB1]](s32) + ; GFX8-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP]](s32) ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; GFX8-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 @@ -594,14 +594,14 @@ body: | ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[UMIN1]] ; GFX6-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) + ; GFX6-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[SITOFP]], [[SUB1]](s32) + ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP]](s32) ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV6]], [[UV7]] ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[XOR1]], [[C2]](s32) ; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR1]] - ; GFX6-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV7]](s32) - ; GFX6-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[INT2]], [[C1]] + ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV7]](s32) + ; GFX6-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[INT1]], [[C1]] ; GFX6-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[SUB2]], [[ADD1]] ; GFX6-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN2]](s32) ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL1]](s64) @@ -609,8 +609,8 @@ body: | ; GFX6-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[UMIN3]] ; GFX6-NEXT: [[SITOFP1:%[0-9]+]]:_(s32) = G_SITOFP [[OR1]](s32) ; GFX6-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN2]] - ; GFX6-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP1]](s32), [[SUB3]](s32) - ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT3]](s32) + ; GFX6-NEXT: [[FLDEXP1:%[0-9]+]]:_(s32) = G_FLDEXP [[SITOFP1]], 
[[SUB3]](s32) + ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP1]](s32) ; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX6-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 @@ -639,14 +639,14 @@ body: | ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[UMIN1]] ; GFX8-NEXT: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) - ; GFX8-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) + ; GFX8-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[SITOFP]], [[SUB1]](s32) + ; GFX8-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP]](s32) ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV6]], [[UV7]] ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[XOR1]], [[C2]](s32) ; GFX8-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR1]] - ; GFX8-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV7]](s32) - ; GFX8-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[INT2]], [[C1]] + ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV7]](s32) + ; GFX8-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[INT1]], [[C1]] ; GFX8-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[SUB2]], [[ADD1]] ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN2]](s32) ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL1]](s64) @@ -654,8 +654,8 @@ body: | ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[UMIN3]] ; GFX8-NEXT: [[SITOFP1:%[0-9]+]]:_(s32) = G_SITOFP [[OR1]](s32) ; GFX8-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN2]] - ; GFX8-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP1]](s32), [[SUB3]](s32) - ; GFX8-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT3]](s32) + ; GFX8-NEXT: [[FLDEXP1:%[0-9]+]]:_(s32) = G_FLDEXP [[SITOFP1]], [[SUB3]](s32) + ; GFX8-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP1]](s32) ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX8-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir index c44a7c1..65826d7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir @@ -98,8 +98,8 @@ body: | ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) - ; GFX6-NEXT: $vgpr0 = COPY [[INT]](s32) + ; GFX6-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[UITOFP]], [[SUB]](s32) + ; GFX6-NEXT: $vgpr0 = COPY [[FLDEXP]](s32) ; GFX8-LABEL: name: test_uitofp_s64_to_s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -115,8 +115,8 @@ body: | ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), 
[[UITOFP]](s32), [[SUB]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[INT]](s32) + ; GFX8-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[UITOFP]], [[SUB]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[FLDEXP]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_UITOFP %0 $vgpr0 = COPY %1 @@ -136,8 +136,8 @@ body: | ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; GFX6-NEXT: [[UITOFP:%[0-9]+]]:_(s64) = G_UITOFP [[UV1]](s32) ; GFX6-NEXT: [[UITOFP1:%[0-9]+]]:_(s64) = G_UITOFP [[UV]](s32) - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s64), [[C]](s32) - ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[INT]], [[UITOFP1]] + ; GFX6-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[UITOFP]], [[C]](s32) + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FLDEXP]], [[UITOFP1]] ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[FADD]](s64) ; GFX8-LABEL: name: test_uitofp_s64_to_s64 ; GFX8: liveins: $vgpr0_vgpr1 @@ -147,8 +147,8 @@ body: | ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; GFX8-NEXT: [[UITOFP:%[0-9]+]]:_(s64) = G_UITOFP [[UV1]](s32) ; GFX8-NEXT: [[UITOFP1:%[0-9]+]]:_(s64) = G_UITOFP [[UV]](s32) - ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s64), [[C]](s32) - ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[INT]], [[UITOFP1]] + ; GFX8-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[UITOFP]], [[C]](s32) + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[FLDEXP]], [[UITOFP1]] ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FADD]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_UITOFP %0 @@ -444,8 +444,8 @@ body: | ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[UMIN]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) - ; GFX6-NEXT: $vgpr0 = COPY [[INT]](s32) + ; GFX6-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[UITOFP]], [[SUB]](s32) + ; GFX6-NEXT: $vgpr0 = COPY [[FLDEXP]](s32) ; GFX8-LABEL: name: test_uitofp_s33_to_s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -463,8 +463,8 @@ body: | ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[UMIN]] - ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) - ; GFX8-NEXT: $vgpr0 = COPY [[INT]](s32) + ; GFX8-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[UITOFP]], [[SUB]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[FLDEXP]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s33) = G_TRUNC %0 %2:_(s32) = G_UITOFP %1 @@ -492,8 +492,8 @@ body: | ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) + ; GFX6-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[UITOFP]], [[SUB]](s32) + ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP]](s32) ; GFX6-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; GFX6-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX8-LABEL: name: test_uitofp_s64_to_s16 @@ -511,8 +511,8 @@ body: | ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; 
GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) - ; GFX8-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) + ; GFX8-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[UITOFP]], [[SUB]](s32) + ; GFX8-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP]](s32) ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; GFX8-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 @@ -543,8 +543,8 @@ body: | ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[UMIN1]] ; GFX6-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) - ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) + ; GFX6-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[UITOFP]], [[SUB]](s32) + ; GFX6-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP]](s32) ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV7]](s32) ; GFX6-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]] @@ -554,8 +554,8 @@ body: | ; GFX6-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[UMIN3]] ; GFX6-NEXT: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[OR1]](s32) ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN2]] - ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP1]](s32), [[SUB1]](s32) - ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) + ; GFX6-NEXT: [[FLDEXP1:%[0-9]+]]:_(s32) = G_FLDEXP [[UITOFP1]], [[SUB1]](s32) + ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP1]](s32) ; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX6-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 @@ -579,8 +579,8 @@ body: | ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[UMIN1]] ; GFX8-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] - ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) - ; GFX8-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) + ; GFX8-NEXT: [[FLDEXP:%[0-9]+]]:_(s32) = G_FLDEXP [[UITOFP]], [[SUB]](s32) + ; GFX8-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP]](s32) ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV7]](s32) ; GFX8-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]] @@ -590,8 +590,8 @@ body: | ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[UMIN3]] ; GFX8-NEXT: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[OR1]](s32) ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN2]] - ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP1]](s32), [[SUB1]](s32) - ; GFX8-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) + ; GFX8-NEXT: [[FLDEXP1:%[0-9]+]]:_(s32) = G_FLDEXP [[UITOFP1]], [[SUB1]](s32) + ; GFX8-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FLDEXP1]](s32) ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX8-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) 
= G_CONSTANT i32 16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll index b9934f1..595dffe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll @@ -12,19 +12,23 @@ define amdgpu_kernel void @ldexp_f16( ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s6 ; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_movk_i32 s4, 0x8000 +; VI-NEXT: v_mov_b32_e32 v2, 0x7fff ; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_med3_i32 v0, v0, s4, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; VI-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -35,19 +39,22 @@ define amdgpu_kernel void @ldexp_f16( ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-NEXT: s_mov_b32 s14, s2 -; GFX10-NEXT: s_mov_b32 s15, s3 ; GFX10-NEXT: s_mov_b32 s10, s2 ; GFX10-NEXT: s_mov_b32 s11, s3 +; GFX10-NEXT: s_mov_b32 s14, s2 +; GFX10-NEXT: s_mov_b32 s15, s3 +; GFX10-NEXT: s_movk_i32 s0, 0x8000 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX10-NEXT: s_mov_b32 s13, s7 -; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_med3_i32 v0, v0, s0, 0x7fff +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; @@ -58,19 +65,23 @@ define amdgpu_kernel void @ldexp_f16( ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_mov_b32 s10, -1 ; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_mov_b32 s2, s10 ; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 +; GFX11-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-NEXT: buffer_load_u16 v1, off, s[12:15], 0 ; GFX11-NEXT: s_mov_b32 s8, s4 ; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_med3_i32 v0, v0, s0, 0x7fff ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: 
s_endpgm @@ -97,8 +108,11 @@ define amdgpu_kernel void @ldexp_f16_imm_a( ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_movk_i32 s0, 0x8000 +; VI-NEXT: v_mov_b32_e32 v1, 0x7fff ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_med3_i32 v0, v0, s0, v1 ; VI-NEXT: v_ldexp_f16_e32 v0, 2.0, v0 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -113,10 +127,12 @@ define amdgpu_kernel void @ldexp_f16_imm_a( ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s8, s2 ; GFX10-NEXT: s_mov_b32 s9, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_movk_i32 s2, 0x8000 ; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX10-NEXT: s_mov_b32 s4, s0 ; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_med3_i32 v0, v0, s2, 0x7fff ; GFX10-NEXT: v_ldexp_f16_e32 v0, 2.0, v0 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -131,10 +147,13 @@ define amdgpu_kernel void @ldexp_f16_imm_a( ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s8, s2 ; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_movk_i32 s2, 0x8000 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s4, s0 ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_i32 v0, v0, s2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f16_e32 v0, 2.0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll new file mode 100644 index 0000000..8f95304 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -0,0 +1,640 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s + +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s + +define float @test_ldexp_f32_i32(ptr addrspace(1) %out, float %a, i32 %b) { +; GFX6-LABEL: test_ldexp_f32_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_f32_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f32 v0, v2, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_f32_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f32 v0, v2, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_f32_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.ldexp.f32.i32(float %a, i32 %b) + ret float %result +} + +define <2 x float> @test_ldexp_v2f32_v2i32(ptr addrspace(1) %out, <2 x float> %a, <2 x i32> %b) { +; GFX6-LABEL: test_ldexp_v2f32_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v4 +; GFX6-NEXT: v_ldexp_f32_e32 v1, v3, v5 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_v2f32_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f32 v0, v2, v4 +; GFX8-NEXT: v_ldexp_f32 v1, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_v2f32_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f32 v0, v2, v4 +; GFX9-NEXT: v_ldexp_f32 v1, v3, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_v2f32_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v4 +; GFX11-NEXT: v_ldexp_f32 v1, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %a, <2 x i32> %b) + ret <2 x float> %result +} + +define <3 x float> @test_ldexp_v3f32_v3i32(ptr addrspace(1) %out, <3 x float> %a, <3 x i32> %b) { +; GFX6-LABEL: test_ldexp_v3f32_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v5 +; GFX6-NEXT: v_ldexp_f32_e32 v1, v3, v6 +; GFX6-NEXT: v_ldexp_f32_e32 v2, v4, v7 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_v3f32_v3i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f32 v0, v2, v5 +; GFX8-NEXT: v_ldexp_f32 v1, v3, v6 +; GFX8-NEXT: v_ldexp_f32 v2, v4, v7 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_v3f32_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f32 v0, v2, v5 +; GFX9-NEXT: v_ldexp_f32 v1, v3, v6 +; GFX9-NEXT: v_ldexp_f32 v2, v4, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_v3f32_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v5 +; GFX11-NEXT: v_ldexp_f32 v1, v3, v6 +; GFX11-NEXT: v_ldexp_f32 v2, v4, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float> %a, <3 x i32> %b) + ret <3 x float> %result +} + +define <4 x float> @test_ldexp_v4f32_v4i32(ptr addrspace(1) %out, <4 x float> %a, <4 x i32> %b) { +; GFX6-LABEL: test_ldexp_v4f32_v4i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v6 +; GFX6-NEXT: v_ldexp_f32_e32 v1, v3, v7 +; GFX6-NEXT: v_ldexp_f32_e32 v2, v4, v8 +; GFX6-NEXT: v_ldexp_f32_e32 v3, v5, v9 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_v4f32_v4i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f32 v0, v2, v6 +; GFX8-NEXT: v_ldexp_f32 v1, v3, v7 +; GFX8-NEXT: v_ldexp_f32 v2, v4, v8 +; GFX8-NEXT: v_ldexp_f32 v3, v5, v9 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_v4f32_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f32 v0, v2, v6 +; GFX9-NEXT: v_ldexp_f32 v1, 
v3, v7 +; GFX9-NEXT: v_ldexp_f32 v2, v4, v8 +; GFX9-NEXT: v_ldexp_f32 v3, v5, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_v4f32_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v6 +; GFX11-NEXT: v_ldexp_f32 v1, v3, v7 +; GFX11-NEXT: v_ldexp_f32 v2, v4, v8 +; GFX11-NEXT: v_ldexp_f32 v3, v5, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %a, <4 x i32> %b) + ret <4 x float> %result +} + +define double @test_ldexp_f64_i32(double %a, i32 %b) { +; GFX6-LABEL: test_ldexp_f64_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_f64_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_f64_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_f64_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.ldexp.f64.i32(double %a, i32 %b) + ret double %result +} + +define <2 x double> @test_ldexp_v2f64_v2i32(<2 x double> %a, <2 x i32> %b) { +; GFX6-LABEL: test_ldexp_v2f64_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX6-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_v2f64_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX8-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_v2f64_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX9-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_v2f64_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f64 v[0:1], v[0:1], v4 +; GFX11-NEXT: v_ldexp_f64 v[2:3], v[2:3], v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %a, <2 x i32> %b) + ret <2 x double> %result +} + +; Broken for DAG +; define float @test_ldexp_f32_i16(float %a, i16 %b) { +; %result = call float @llvm.ldexp.f32.i16(float %a, i16 %b) +; ret float %result +; } + +; FIXME: Should be able to truncate to i32 +; define float @test_ldexp_f32_i64(float %a, i64 %b) { +; %result = call float @llvm.ldexp.f32.i64(float %a, i64 %b) +; ret float %result +; } + +; define <2 x float> @test_ldexp_v2f32_v2i16(<2 x float> %a, <2 x i16> %b) { +; %result = call <2 x float> @llvm.ldexp.v2f32.v2i16(<2 x float> %a, <2 x i16> %b) +; ret <2 x float> %result +; } + +; FIXME: Should be able to truncate to i32 +; define <2 x float> @test_ldexp_v2f32_v2i64(<2 x float> %a, <2 x i64> %b) { +; %result = call <2 x float> @llvm.ldexp.v2f32.v2i64(<2 x float> %a, <2 x i64> %b) +; ret <2 x float> %result +; } + +define half @test_ldexp_f16_i8(half %a, i8 
%b) { +; GFX6-SDAG-LABEL: test_ldexp_f16_i8: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: test_ldexp_f16_i8: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: test_ldexp_f16_i8: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_ldexp_f16_i8: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_ldexp_f16_i8: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: test_ldexp_f16_i8: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX8-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; GFX8-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_ldexp_f16_i8: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_ldexp_f16_i8: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.ldexp.f16.i8(half %a, i8 %b) + ret half %result +} + +define half @test_ldexp_f16_i16(half %a, i16 %b) { +; GFX6-SDAG-LABEL: test_ldexp_f16_i16: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_f16_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX8-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_f16_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_f16_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_ldexp_f16_i16: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.ldexp.f16.i16(half %a, i16 %b) + ret half %result +} + +define half @test_ldexp_f16_i32(half %a, i32 %b) { +; GFX6-SDAG-LABEL: test_ldexp_f16_i32: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: test_ldexp_f16_i32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX8-SDAG-NEXT: v_med3_i32 v1, v1, s4, v2 +; GFX8-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: test_ldexp_f16_i32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX9-SDAG-NEXT: v_med3_i32 v1, v1, s4, v2 +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_ldexp_f16_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_med3_i32 v1, v1, s0, 0x7fff +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_ldexp_f16_i32: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: test_ldexp_f16_i32: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX8-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; GFX8-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_ldexp_f16_i32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_ldexp_f16_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.ldexp.f16.i32(half %a, i32 %b) + ret half %result +} + +define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) { +; GFX6-SDAG-LABEL: test_ldexp_v2f16_v2i32: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: test_ldexp_v2f16_v2i32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX8-SDAG-NEXT: v_med3_i32 v2, v2, s4, v3 +; GFX8-SDAG-NEXT: v_med3_i32 v1, v1, s4, v3 +; GFX8-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v3 +; GFX9-SDAG-NEXT: v_med3_i32 v1, v1, s4, v3 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_ldexp_v2f16_v2i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-NEXT: v_med3_i32 v2, v2, s0, 0x7fff +; GFX11-SDAG-NEXT: v_med3_i32 v1, v1, s0, 0x7fff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v2, v3, v2 +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_ldexp_v2f16_v2i32: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i32: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff +; GFX8-GISEL-NEXT: v_med3_i32 v1, v1, v3, v4 +; GFX8-GISEL-NEXT: v_med3_i32 v2, v2, v3, v4 +; GFX8-GISEL-NEXT: v_ldexp_f16_e32 v1, v0, v1 +; GFX8-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_ldexp_v2f16_v2i32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v3, v4 +; GFX9-GISEL-NEXT: v_med3_i32 v2, v2, v3, v4 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v1, v0, v1 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_ldexp_v2f16_v2i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v3 +; GFX11-GISEL-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v4, v2 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> %a, <2 x i32> %b) + ret <2 x half> %result +} + +define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) { +; GFX6-SDAG-LABEL: test_ldexp_v2f16_v2i16: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-SDAG-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-SDAG-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v1, v3 +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: test_ldexp_v2f16_v2i16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i16: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_ldexp_v2f16_v2i16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v2, v3, v2 +; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX11-SDAG-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_ldexp_v2f16_v2i16: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-GISEL-NEXT: v_bfe_i32 v2, v3, 0, 16 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v2 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_ldexp_f16_e32 v2, v0, v1 +; GFX8-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_ldexp_v2f16_v2i16: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v2, v0, v1 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_ldexp_v2f16_v2i16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v2, v3 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> %a, <2 x i16> %b) + ret <2 x half> %result +} + +declare float @llvm.ldexp.f32.i32(float, i32) #0 +declare float @llvm.ldexp.f32.i16(float, i16) #0 +declare float @llvm.ldexp.f32.i64(float, i64) #0 +declare half @llvm.ldexp.f16.i8(half, i8) #0 +declare half @llvm.ldexp.f16.i16(half, i16) #0 +declare half @llvm.ldexp.f16.i32(half, i32) #0 +declare <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half>, <2 x i16>) #0 +declare <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half>, <2 x i32>) #0 +declare <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float>, <2 x i32>) #0 +declare <2 x float> @llvm.ldexp.v2f32.v2i16(<2 x float>, <2 x i16>) #0 +declare <2 x float> @llvm.ldexp.v2f32.v2i64(<2 x float>, <2 x i64>) #0 +declare <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float>, <3 x i32>) #0 +declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>) #0 +declare double @llvm.ldexp.f64.i32(double, i32) #0 +declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>) #0 + +attributes #0 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll new file mode 100644 index 0000000..592917a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll @@ -0,0 +1,400 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; FIXME: Enable f16 promotion +; XUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s + +; XUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s + +; define half @test_ldexp_f16_i16(ptr addrspace(1) %out, half %a, i16 %b) #0 { +; %result = call half @llvm.experimental.constrained.ldexp.f16.i16(half %a, i16 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") +; ret half %result +; } + +define half @test_ldexp_f16_i32(ptr addrspace(1) %out, half %a, i32 %b) #0 { +; GFX8-SDAG-LABEL: test_ldexp_f16_i32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX8-SDAG-NEXT: v_med3_i32 v0, v3, s4, v0 +; GFX8-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: test_ldexp_f16_i32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX9-SDAG-NEXT: v_med3_i32 v0, v3, s4, v0 +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_ldexp_f16_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_med3_i32 v0, v3, s0, 0x7fff +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: test_ldexp_f16_i32: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX8-GISEL-NEXT: v_med3_i32 v0, v3, v0, v1 +; GFX8-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_ldexp_f16_i32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v0, v3, v0, v1 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_ldexp_f16_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v3, v0 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.experimental.constrained.ldexp.f16.i32(half %a, i32 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret half %result +} + +; define <2 x half> @test_ldexp_v2f16_v2i16(ptr addrspace(1) %out, <2 x half> %a, <2 x i16> %b) #0 { +; %result = call <2 x half> @llvm.experimental.constrained.ldexp.v2f16.v2i16(<2 x half> %a, <2 x i16> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") +; ret <2 x half> %result +; } + +define <2 x half> @test_ldexp_v2f16_v2i32(ptr addrspace(1) %out, <2 x half> %a, <2 x i32> %b) #0 { +; GFX8-SDAG-LABEL: test_ldexp_v2f16_v2i32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX8-SDAG-NEXT: v_med3_i32 v1, v3, s4, v0 +; GFX8-SDAG-NEXT: v_med3_i32 v0, v4, s4, v0 +; GFX8-SDAG-NEXT: v_ldexp_f16_e32 v1, v2, v1 +; GFX8-SDAG-NEXT: v_ldexp_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX9-SDAG-NEXT: v_med3_i32 v1, v3, s4, v0 +; GFX9-SDAG-NEXT: v_med3_i32 v0, v4, s4, v0 +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v2, v1 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-SDAG-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_ldexp_v2f16_v2i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_med3_i32 v0, v3, s0, 0x7fff +; GFX11-SDAG-NEXT: v_med3_i32 v1, v4, s0, 0x7fff +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v1, v3, v1 +; GFX11-SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i32: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX8-GISEL-NEXT: v_med3_i32 v3, v3, v0, v1 +; GFX8-GISEL-NEXT: v_med3_i32 v0, v4, v0, v1 +; GFX8-GISEL-NEXT: v_ldexp_f16_e32 v3, v2, v3 +; GFX8-GISEL-NEXT: v_ldexp_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_ldexp_v2f16_v2i32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 
0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v3, v3, v0, v1 +; GFX9-GISEL-NEXT: v_med3_i32 v0, v4, v0, v1 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v3, v2, v3 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_ldexp_v2f16_v2i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v3, v0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v4, v0 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v2, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v3, v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x half> @llvm.experimental.constrained.ldexp.v2f16.v2i32(<2 x half> %a, <2 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x half> %result +} + +define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a, <3 x i32> %b) #0 { +; GFX8-SDAG-LABEL: test_ldexp_v3f16_v3i32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX8-SDAG-NEXT: v_med3_i32 v0, v4, s4, v1 +; GFX8-SDAG-NEXT: v_med3_i32 v4, v5, s4, v1 +; GFX8-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX8-SDAG-NEXT: v_ldexp_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_med3_i32 v1, v6, s4, v1 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_ldexp_f16_e32 v1, v3, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: test_ldexp_v3f16_v3i32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX9-SDAG-NEXT: v_med3_i32 v0, v4, s4, v1 +; GFX9-SDAG-NEXT: v_med3_i32 v4, v5, s4, v1 +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-SDAG-NEXT: s_mov_b32 s5, 0x5040100 +; GFX9-SDAG-NEXT: v_med3_i32 v1, v6, s4, v1 +; GFX9-SDAG-NEXT: v_perm_b32 v0, v2, v0, s5 +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v3, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_ldexp_v3f16_v3i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_med3_i32 v0, v4, s0, 0x7fff +; GFX11-SDAG-NEXT: v_med3_i32 v1, v5, s0, 0x7fff +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-SDAG-NEXT: v_med3_i32 v2, v6, s0, 0x7fff +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: 
v_ldexp_f16_e32 v1, v4, v1 +; GFX11-SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v1, v3, v2 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: test_ldexp_v3f16_v3i32: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX8-GISEL-NEXT: v_med3_i32 v4, v4, v0, v1 +; GFX8-GISEL-NEXT: v_med3_i32 v5, v5, v0, v1 +; GFX8-GISEL-NEXT: v_ldexp_f16_e32 v4, v2, v4 +; GFX8-GISEL-NEXT: v_ldexp_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_med3_i32 v0, v6, v0, v1 +; GFX8-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v0 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v2 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_ldexp_v3f16_v3i32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v4, v4, v0, v1 +; GFX9-GISEL-NEXT: v_med3_i32 v5, v5, v0, v1 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v4, v2, v4 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_med3_i32 v0, v6, v0, v1 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v0 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v4 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_ldexp_v3f16_v3i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v4, v0 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-GISEL-NEXT: v_med3_i32 v5, 0xffff8000, v5, v0 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v2, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v2, v4, v5 +; GFX11-GISEL-NEXT: v_med3_i32 v4, 0xffff8000, v6, v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v4 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x half> @llvm.experimental.constrained.ldexp.v3f16.v3i32(<3 x half> %a, <3 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x half> %result +} + +define <4 x half> @test_ldexp_v4f16_v4i32(ptr addrspace(1) %out, <4 x half> %a, <4 x i32> %b) #0 { +; GFX8-SDAG-LABEL: test_ldexp_v4f16_v4i32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX8-SDAG-NEXT: v_med3_i32 v1, v7, s4, v0 +; GFX8-SDAG-NEXT: v_med3_i32 v6, v6, s4, v0 +; GFX8-SDAG-NEXT: v_med3_i32 v5, v5, s4, v0 +; GFX8-SDAG-NEXT: v_med3_i32 v0, v4, s4, v0 +; GFX8-SDAG-NEXT: v_ldexp_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_ldexp_f16_e32 v3, v3, v6 +; GFX8-SDAG-NEXT: v_ldexp_f16_sdwa v5, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, 
v0 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: test_ldexp_v4f16_v4i32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX9-SDAG-NEXT: v_med3_i32 v1, v6, s4, v0 +; GFX9-SDAG-NEXT: v_med3_i32 v6, v7, s4, v0 +; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v0 +; GFX9-SDAG-NEXT: v_med3_i32 v0, v5, s4, v0 +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v3, v1 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v4, v2, v4 +; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-SDAG-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-SDAG-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: test_ldexp_v4f16_v4i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_med3_i32 v0, v6, s0, 0x7fff +; GFX11-SDAG-NEXT: v_med3_i32 v1, v7, s0, 0x7fff +; GFX11-SDAG-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-NEXT: v_med3_i32 v5, v5, s0, 0x7fff +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v3, v3, v0 +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v2, v6, v5 +; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v1, v7, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SDAG-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: test_ldexp_v4f16_v4i32: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX8-GISEL-NEXT: v_med3_i32 v4, v4, v0, v1 +; GFX8-GISEL-NEXT: v_med3_i32 v5, v5, v0, v1 +; GFX8-GISEL-NEXT: v_ldexp_f16_e32 v4, v2, v4 +; GFX8-GISEL-NEXT: v_ldexp_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_med3_i32 v5, v6, v0, v1 +; GFX8-GISEL-NEXT: v_med3_i32 v0, v7, v0, v1 +; GFX8-GISEL-NEXT: v_ldexp_f16_e32 v5, v3, v5 +; GFX8-GISEL-NEXT: v_ldexp_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v2 +; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_ldexp_v4f16_v4i32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX9-GISEL-NEXT: v_med3_i32 v4, v4, v0, v1 +; GFX9-GISEL-NEXT: v_med3_i32 v5, v5, v0, v1 +; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v4, v2, v4 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_med3_i32 v5, v6, v0, v1 +; GFX9-GISEL-NEXT: v_med3_i32 v0, v7, v0, v1 +; 
GFX9-GISEL-NEXT: v_ldexp_f16_e32 v5, v3, v5 +; GFX9-GISEL-NEXT: v_ldexp_f16_sdwa v1, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v4 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v5 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: test_ldexp_v4f16_v4i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_med3_i32 v4, 0xffff8000, v4, v0 +; GFX11-GISEL-NEXT: v_med3_i32 v6, 0xffff8000, v6, v0 +; GFX11-GISEL-NEXT: v_med3_i32 v5, 0xffff8000, v5, v0 +; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v7, v0 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v2, v2, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v3, v3, v6 +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v1, v5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v4, v8, v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_lshl_or_b32 v1, v4, 16, v2 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x half> @llvm.experimental.constrained.ldexp.v4f16.v4i32(<4 x half> %a, <4 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <4 x half> %result +} + +declare half @llvm.experimental.constrained.ldexp.f16.i16(half, i16, metadata, metadata) #1 +declare half @llvm.experimental.constrained.ldexp.f16.i32(half, i32, metadata, metadata) #1 +declare <2 x half> @llvm.experimental.constrained.ldexp.v2f16.v2i16(<2 x half>, <2 x i16>, metadata, metadata) #1 +declare <2 x half> @llvm.experimental.constrained.ldexp.v2f16.v2i32(<2 x half>, <2 x i32>, metadata, metadata) #1 +declare <3 x half> @llvm.experimental.constrained.ldexp.v3f16.v3i32(<3 x half>, <3 x i32>, metadata, metadata) #1 +declare <4 x half> @llvm.experimental.constrained.ldexp.v4f16.v4i32(<4 x half>, <4 x i32>, metadata, metadata) #1 + +attributes #0 = { strictfp } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} +; GFX11: {{.*}} +; GFX8: {{.*}} +; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f32.ll new file mode 100644 index 0000000..f807467 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f32.ll @@ -0,0 +1,255 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s + +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s + +; define float @test_ldexp_f32_i16(ptr addrspace(1) %out, float %a, i16 %b) #0 { +; %result = call float @llvm.experimental.constrained.ldexp.f32.i16(float %a, i16 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") +; ret float %result +; } + +define float @test_ldexp_f32_i32(ptr addrspace(1) %out, float %a, i32 %b) #0 { +; GFX6-LABEL: test_ldexp_f32_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_f32_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f32 v0, v2, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_f32_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f32 v0, v2, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_f32_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.experimental.constrained.ldexp.f32.i32(float %a, i32 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret float %result +} + +; define <2 x float> @test_ldexp_v2f32_v2i16(ptr addrspace(1) %out, <2 x float> %a, <2 x i16> %b) #0 { +; %result = call <2 x float> @llvm.experimental.constrained.ldexp.v2f32.v2i16(<2 x float> %a, <2 x i16> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") +; ret <2 x float> %result +; } + +define <2 x float> @test_ldexp_v2f32_v2i32(ptr addrspace(1) %out, <2 x float> %a, <2 x i32> %b) #0 { +; GFX6-SDAG-LABEL: test_ldexp_v2f32_v2i32: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v3, v5 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v2, v4 +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: test_ldexp_v2f32_v2i32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_ldexp_f32 v1, v3, v5 +; GFX8-SDAG-NEXT: v_ldexp_f32 v0, v2, v4 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: test_ldexp_v2f32_v2i32: 
+; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_ldexp_f32 v1, v3, v5 +; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v2, v4 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_v2f32_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v4 +; GFX11-NEXT: v_ldexp_f32 v1, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_ldexp_v2f32_v2i32: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v2, v4 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v3, v5 +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: test_ldexp_v2f32_v2i32: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_ldexp_f32 v0, v2, v4 +; GFX8-GISEL-NEXT: v_ldexp_f32 v1, v3, v5 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_ldexp_v2f32_v2i32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v2, v4 +; GFX9-GISEL-NEXT: v_ldexp_f32 v1, v3, v5 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x float> @llvm.experimental.constrained.ldexp.v2f32.v2i32(<2 x float> %a, <2 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x float> %result +} + +define <3 x float> @test_ldexp_v3f32_v3i32(ptr addrspace(1) %out, <3 x float> %a, <3 x i32> %b) #0 { +; GFX6-SDAG-LABEL: test_ldexp_v3f32_v3i32: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v4, v7 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v3, v6 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v2, v5 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: test_ldexp_v3f32_v3i32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_ldexp_f32 v4, v4, v7 +; GFX8-SDAG-NEXT: v_ldexp_f32 v1, v3, v6 +; GFX8-SDAG-NEXT: v_ldexp_f32 v0, v2, v5 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: test_ldexp_v3f32_v3i32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_ldexp_f32 v4, v4, v7 +; GFX9-SDAG-NEXT: v_ldexp_f32 v1, v3, v6 +; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v2, v5 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_v3f32_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v5 +; GFX11-NEXT: v_ldexp_f32 v1, v3, v6 +; GFX11-NEXT: v_ldexp_f32 v2, v4, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_ldexp_v3f32_v3i32: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v2, v5 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v3, v6 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v4, v7 +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: test_ldexp_v3f32_v3i32: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_ldexp_f32 v0, v2, v5 +; GFX8-GISEL-NEXT: v_ldexp_f32 v1, v3, v6 +; GFX8-GISEL-NEXT: v_ldexp_f32 v2, v4, v7 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_ldexp_v3f32_v3i32: +; 
GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v2, v5 +; GFX9-GISEL-NEXT: v_ldexp_f32 v1, v3, v6 +; GFX9-GISEL-NEXT: v_ldexp_f32 v2, v4, v7 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x float> @llvm.experimental.constrained.ldexp.v3f32.v3i32(<3 x float> %a, <3 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x float> %result +} + +define <4 x float> @test_ldexp_v4f32_v4i32(ptr addrspace(1) %out, <4 x float> %a, <4 x i32> %b) #0 { +; GFX6-SDAG-LABEL: test_ldexp_v4f32_v4i32: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v5, v5, v9 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v4, v4, v8 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v1, v3, v7 +; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v2, v6 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: test_ldexp_v4f32_v4i32: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_ldexp_f32 v5, v5, v9 +; GFX8-SDAG-NEXT: v_ldexp_f32 v4, v4, v8 +; GFX8-SDAG-NEXT: v_ldexp_f32 v1, v3, v7 +; GFX8-SDAG-NEXT: v_ldexp_f32 v0, v2, v6 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v3, v5 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: test_ldexp_v4f32_v4i32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_ldexp_f32 v5, v5, v9 +; GFX9-SDAG-NEXT: v_ldexp_f32 v4, v4, v8 +; GFX9-SDAG-NEXT: v_ldexp_f32 v1, v3, v7 +; GFX9-SDAG-NEXT: v_ldexp_f32 v0, v2, v6 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_v4f32_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v6 +; GFX11-NEXT: v_ldexp_f32 v1, v3, v7 +; GFX11-NEXT: v_ldexp_f32 v2, v4, v8 +; GFX11-NEXT: v_ldexp_f32 v3, v5, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_ldexp_v4f32_v4i32: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v2, v6 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v1, v3, v7 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v2, v4, v8 +; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v3, v5, v9 +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: test_ldexp_v4f32_v4i32: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_ldexp_f32 v0, v2, v6 +; GFX8-GISEL-NEXT: v_ldexp_f32 v1, v3, v7 +; GFX8-GISEL-NEXT: v_ldexp_f32 v2, v4, v8 +; GFX8-GISEL-NEXT: v_ldexp_f32 v3, v5, v9 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_ldexp_v4f32_v4i32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_ldexp_f32 v0, v2, v6 +; GFX9-GISEL-NEXT: v_ldexp_f32 v1, v3, v7 +; GFX9-GISEL-NEXT: v_ldexp_f32 v2, v4, v8 +; GFX9-GISEL-NEXT: v_ldexp_f32 v3, v5, v9 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.experimental.constrained.ldexp.v4f32.v4i32(<4 x float> %a, <4 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <4 x float> %result +} + +declare float @llvm.experimental.constrained.ldexp.f32.i16(float, i16, metadata, metadata) #1 +declare float 
@llvm.experimental.constrained.ldexp.f32.i32(float, i32, metadata, metadata) #1 +declare <2 x float> @llvm.experimental.constrained.ldexp.v2f32.v2i16(<2 x float>, <2 x i16>, metadata, metadata) #1 +declare <2 x float> @llvm.experimental.constrained.ldexp.v2f32.v2i32(<2 x float>, <2 x i32>, metadata, metadata) #1 +declare <3 x float> @llvm.experimental.constrained.ldexp.v3f32.v3i32(<3 x float>, <3 x i32>, metadata, metadata) #1 +declare <4 x float> @llvm.experimental.constrained.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>, metadata, metadata) #1 + +attributes #0 = { strictfp } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; GFX11-GISEL: {{.*}} +; GFX11-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f64.ll new file mode 100644 index 0000000..a3b8a8f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f64.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s + +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s + +; define double @test_ldexp_f64_i16(ptr addrspace(1) %out, double %a, i16 %b) #0 { +; %result = call double @llvm.experimental.constrained.ldexp.f64.i16(double %a, i16 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") +; ret double %result +; } + +define double @test_ldexp_f64_i32(ptr addrspace(1) %out, double %a, i32 %b) #0 { +; GFX6-LABEL: test_ldexp_f64_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ldexp_f64 v[0:1], v[2:3], v4 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_f64_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f64 v[0:1], v[2:3], v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_f64_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[2:3], v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_f64_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f64 v[0:1], v[2:3], v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.experimental.constrained.ldexp.f64.i32(double %a, i32 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret double %result +} + +; define <2 x double> @test_ldexp_v2f64_v2i16(ptr addrspace(1) %out, <2 x double> %a, <2 x i16> %b) #0 { +; %result = call <2 x double> 
@llvm.experimental.constrained.ldexp.v2f64.v2i16(<2 x double> %a, <2 x i16> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") +; ret <2 x double> %result +; } + +define <2 x double> @test_ldexp_v2f64_v2i32(ptr addrspace(1) %out, <2 x double> %a, <2 x i32> %b) #0 { +; GFX6-LABEL: test_ldexp_v2f64_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ldexp_f64 v[0:1], v[2:3], v6 +; GFX6-NEXT: v_ldexp_f64 v[2:3], v[4:5], v7 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_v2f64_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f64 v[0:1], v[2:3], v6 +; GFX8-NEXT: v_ldexp_f64 v[2:3], v[4:5], v7 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_v2f64_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[2:3], v6 +; GFX9-NEXT: v_ldexp_f64 v[2:3], v[4:5], v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_v2f64_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f64 v[0:1], v[2:3], v6 +; GFX11-NEXT: v_ldexp_f64 v[2:3], v[4:5], v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x double> @llvm.experimental.constrained.ldexp.v2f64.v2i32(<2 x double> %a, <2 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x double> %result +} + +define <3 x double> @test_ldexp_v3f64_v3i32(ptr addrspace(1) %out, <3 x double> %a, <3 x i32> %b) #0 { +; GFX6-LABEL: test_ldexp_v3f64_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ldexp_f64 v[0:1], v[2:3], v8 +; GFX6-NEXT: v_ldexp_f64 v[2:3], v[4:5], v9 +; GFX6-NEXT: v_ldexp_f64 v[4:5], v[6:7], v10 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_v3f64_v3i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f64 v[0:1], v[2:3], v8 +; GFX8-NEXT: v_ldexp_f64 v[2:3], v[4:5], v9 +; GFX8-NEXT: v_ldexp_f64 v[4:5], v[6:7], v10 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_v3f64_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[2:3], v8 +; GFX9-NEXT: v_ldexp_f64 v[2:3], v[4:5], v9 +; GFX9-NEXT: v_ldexp_f64 v[4:5], v[6:7], v10 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_v3f64_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f64 v[0:1], v[2:3], v8 +; GFX11-NEXT: v_ldexp_f64 v[2:3], v[4:5], v9 +; GFX11-NEXT: v_ldexp_f64 v[4:5], v[6:7], v10 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x double> @llvm.experimental.constrained.ldexp.v3f64.v3i32(<3 x double> %a, <3 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x double> %result +} + +define <4 x double> @test_ldexp_v4f64_v4i32(ptr addrspace(1) %out, <4 x double> %a, <4 x i32> %b) #0 { +; GFX6-LABEL: test_ldexp_v4f64_v4i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ldexp_f64 v[0:1], v[2:3], v10 +; GFX6-NEXT: v_ldexp_f64 v[2:3], v[4:5], v11 +; GFX6-NEXT: v_ldexp_f64 v[4:5], v[6:7], v12 +; GFX6-NEXT: v_ldexp_f64 v[6:7], v[8:9], v13 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_ldexp_v4f64_v4i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_ldexp_f64 v[0:1], v[2:3], v10 
+; GFX8-NEXT: v_ldexp_f64 v[2:3], v[4:5], v11 +; GFX8-NEXT: v_ldexp_f64 v[4:5], v[6:7], v12 +; GFX8-NEXT: v_ldexp_f64 v[6:7], v[8:9], v13 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_ldexp_v4f64_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ldexp_f64 v[0:1], v[2:3], v10 +; GFX9-NEXT: v_ldexp_f64 v[2:3], v[4:5], v11 +; GFX9-NEXT: v_ldexp_f64 v[4:5], v[6:7], v12 +; GFX9-NEXT: v_ldexp_f64 v[6:7], v[8:9], v13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_ldexp_v4f64_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_ldexp_f64 v[0:1], v[2:3], v10 +; GFX11-NEXT: v_ldexp_f64 v[2:3], v[4:5], v11 +; GFX11-NEXT: v_ldexp_f64 v[4:5], v[6:7], v12 +; GFX11-NEXT: v_ldexp_f64 v[6:7], v[8:9], v13 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x double> @llvm.experimental.constrained.ldexp.v4f64.v4i32(<4 x double> %a, <4 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <4 x double> %result +} + +declare double @llvm.experimental.constrained.ldexp.f64.i16(double, i16, metadata, metadata) #1 +declare double @llvm.experimental.constrained.ldexp.f64.i32(double, i32, metadata, metadata) #1 +declare <2 x double> @llvm.experimental.constrained.ldexp.v2f64.v2i16(<2 x double>, <2 x i16>, metadata, metadata) #1 +declare <2 x double> @llvm.experimental.constrained.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>, metadata, metadata) #1 +declare <3 x double> @llvm.experimental.constrained.ldexp.v3f64.v3i32(<3 x double>, <3 x i32>, metadata, metadata) #1 +declare <4 x double> @llvm.experimental.constrained.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>, metadata, metadata) #1 + +attributes #0 = { strictfp } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} +; GFX11-GISEL: {{.*}} +; GFX11-SDAG: {{.*}} +; GFX6-GISEL: {{.*}} +; GFX6-SDAG: {{.*}} +; GFX8-GISEL: {{.*}} +; GFX8-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/Mips/ldexp.ll b/llvm/test/CodeGen/Mips/ldexp.ll new file mode 100644 index 0000000..3753fd5 --- /dev/null +++ b/llvm/test/CodeGen/Mips/ldexp.ll @@ -0,0 +1,172 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=mips-- -mattr=+soft-float < %s | FileCheck -check-prefix=SOFT %s + +define float @ldexp_f32(i8 zeroext %x) { +; SOFT-LABEL: ldexp_f32: +; SOFT: # %bb.0: +; SOFT-NEXT: addiu $sp, $sp, -24 +; SOFT-NEXT: .cfi_def_cfa_offset 24 +; SOFT-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; SOFT-NEXT: .cfi_offset 31, -4 +; SOFT-NEXT: move $5, $4 +; SOFT-NEXT: jal ldexpf +; SOFT-NEXT: lui $4, 16256 +; SOFT-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-NEXT: jr $ra +; SOFT-NEXT: addiu $sp, $sp, 24 + %zext = zext i8 %x to i32 + %ldexp = call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 %zext) + ret float %ldexp +} + +define double @ldexp_f64(i8 zeroext %x) { +; SOFT-LABEL: ldexp_f64: +; SOFT: # %bb.0: +; SOFT-NEXT: addiu $sp, $sp, -24 +; SOFT-NEXT: .cfi_def_cfa_offset 24 +; SOFT-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; SOFT-NEXT: .cfi_offset 31, -4 +; SOFT-NEXT: move $6, $4 +; SOFT-NEXT: lui $4, 16368 +; SOFT-NEXT: jal ldexp +; SOFT-NEXT: addiu $5, $zero, 0 +; SOFT-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-NEXT: jr $ra +; SOFT-NEXT: addiu $sp, $sp, 24 + %zext = zext i8 %x to i32 + %ldexp = call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 %zext) + ret double %ldexp +} + +define <2 x float> @ldexp_v2f32(<2 x float> %val, <2 x i32> %exp) { +; SOFT-LABEL: ldexp_v2f32: +; SOFT: # %bb.0: +; SOFT-NEXT: addiu $sp, $sp, -32 +; SOFT-NEXT: .cfi_def_cfa_offset 32 +; SOFT-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; SOFT-NEXT: sw $17, 24($sp) # 4-byte Folded Spill +; SOFT-NEXT: sw $16, 20($sp) # 4-byte Folded Spill +; SOFT-NEXT: .cfi_offset 31, -4 +; SOFT-NEXT: .cfi_offset 17, -8 +; SOFT-NEXT: .cfi_offset 16, -12 +; SOFT-NEXT: move $16, $6 +; SOFT-NEXT: move $17, $4 +; SOFT-NEXT: lw $5, 52($sp) +; SOFT-NEXT: jal ldexpf +; SOFT-NEXT: move $4, $7 +; SOFT-NEXT: lw $5, 48($sp) +; SOFT-NEXT: sw $2, 4($17) +; SOFT-NEXT: jal ldexpf +; SOFT-NEXT: move $4, $16 +; SOFT-NEXT: sw $2, 0($17) +; SOFT-NEXT: lw $16, 20($sp) # 4-byte Folded Reload +; SOFT-NEXT: lw $17, 24($sp) # 4-byte Folded Reload +; SOFT-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload +; SOFT-NEXT: jr $ra +; SOFT-NEXT: addiu $sp, $sp, 32 + %1 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %val, <2 x i32> %exp) + ret <2 x float> %1 +} + +define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { +; SOFT-LABEL: ldexp_v4f32: +; SOFT: # %bb.0: +; SOFT-NEXT: addiu $sp, $sp, -40 +; SOFT-NEXT: .cfi_def_cfa_offset 40 +; SOFT-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; SOFT-NEXT: sw $20, 32($sp) # 4-byte Folded Spill +; SOFT-NEXT: sw $19, 28($sp) # 4-byte Folded Spill +; SOFT-NEXT: sw $18, 24($sp) # 4-byte Folded Spill +; SOFT-NEXT: sw $17, 20($sp) # 4-byte Folded Spill +; SOFT-NEXT: sw $16, 16($sp) # 4-byte Folded Spill +; SOFT-NEXT: .cfi_offset 31, -4 +; SOFT-NEXT: .cfi_offset 20, -8 +; SOFT-NEXT: .cfi_offset 19, -12 +; SOFT-NEXT: .cfi_offset 18, -16 +; SOFT-NEXT: .cfi_offset 17, -20 +; SOFT-NEXT: .cfi_offset 16, -24 +; SOFT-NEXT: move $16, $7 +; SOFT-NEXT: move $18, $4 +; SOFT-NEXT: lw 
$4, 60($sp) +; SOFT-NEXT: lw $5, 76($sp) +; SOFT-NEXT: jal ldexpf +; SOFT-NEXT: move $17, $6 +; SOFT-NEXT: lw $19, 64($sp) +; SOFT-NEXT: lw $20, 68($sp) +; SOFT-NEXT: lw $5, 72($sp) +; SOFT-NEXT: lw $4, 56($sp) +; SOFT-NEXT: jal ldexpf +; SOFT-NEXT: sw $2, 12($18) +; SOFT-NEXT: sw $2, 8($18) +; SOFT-NEXT: move $4, $16 +; SOFT-NEXT: jal ldexpf +; SOFT-NEXT: move $5, $20 +; SOFT-NEXT: sw $2, 4($18) +; SOFT-NEXT: move $4, $17 +; SOFT-NEXT: jal ldexpf +; SOFT-NEXT: move $5, $19 +; SOFT-NEXT: sw $2, 0($18) +; SOFT-NEXT: lw $16, 16($sp) # 4-byte Folded Reload +; SOFT-NEXT: lw $17, 20($sp) # 4-byte Folded Reload +; SOFT-NEXT: lw $18, 24($sp) # 4-byte Folded Reload +; SOFT-NEXT: lw $19, 28($sp) # 4-byte Folded Reload +; SOFT-NEXT: lw $20, 32($sp) # 4-byte Folded Reload +; SOFT-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; SOFT-NEXT: jr $ra +; SOFT-NEXT: addiu $sp, $sp, 40 + %1 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %val, <4 x i32> %exp) + ret <4 x float> %1 +} + +define half @ldexp_f16(half %arg0, i32 %arg1) { +; SOFT-LABEL: ldexp_f16: +; SOFT: # %bb.0: +; SOFT-NEXT: addiu $sp, $sp, -24 +; SOFT-NEXT: .cfi_def_cfa_offset 24 +; SOFT-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; SOFT-NEXT: sw $16, 16($sp) # 4-byte Folded Spill +; SOFT-NEXT: .cfi_offset 31, -4 +; SOFT-NEXT: .cfi_offset 16, -8 +; SOFT-NEXT: move $16, $5 +; SOFT-NEXT: jal __gnu_h2f_ieee +; SOFT-NEXT: andi $4, $4, 65535 +; SOFT-NEXT: move $4, $2 +; SOFT-NEXT: jal ldexpf +; SOFT-NEXT: move $5, $16 +; SOFT-NEXT: jal __gnu_f2h_ieee +; SOFT-NEXT: move $4, $2 +; SOFT-NEXT: lw $16, 16($sp) # 4-byte Folded Reload +; SOFT-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-NEXT: jr $ra +; SOFT-NEXT: addiu $sp, $sp, 24 + %ldexp = call half @llvm.ldexp.f16.i32(half %arg0, i32 %arg1) + ret half %ldexp +} + +define x86_fp80 @ldexp_f80(x86_fp80 %arg0, i32 %arg1) { +; SOFT-LABEL: ldexp_f80: +; SOFT: # %bb.0: +; SOFT-NEXT: addiu $sp, $sp, -24 +; SOFT-NEXT: .cfi_def_cfa_offset 24 +; SOFT-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; SOFT-NEXT: .cfi_offset 31, -4 +; SOFT-NEXT: jal ldexpl +; SOFT-NEXT: andi $4, $4, 65535 +; SOFT-NEXT: move $4, $2 +; SOFT-NEXT: addiu $2, $zero, 0 +; SOFT-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; SOFT-NEXT: jr $ra +; SOFT-NEXT: addiu $sp, $sp, 24 + %ldexp = call x86_fp80 @llvm.ldexp.f80.i32(x86_fp80 %arg0, i32 %arg1) + ret x86_fp80 %ldexp +} + + +declare double @llvm.ldexp.f64.i32(double, i32) #0 +declare float @llvm.ldexp.f32.i32(float, i32) #0 +declare <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float>, <2 x i32>) #0 +declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>) #0 +declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) +declare half @llvm.ldexp.f16.i32(half, i32) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/PowerPC/ldexp-libcall.ll b/llvm/test/CodeGen/PowerPC/ldexp-libcall.ll new file mode 100644 index 0000000..6144a9d --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/ldexp-libcall.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s + +define float @call_ldexpf(float %a, i32 %b) { +; CHECK-LABEL: call_ldexpf: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: 
clrldi r4, r4, 32 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr + %result = call float @ldexpf(float %a, i32 %b) + ret float %result +} + +define double @call_ldexp(double %a, i32 %b) { +; CHECK-LABEL: call_ldexp: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: bl ldexp +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr + %result = call double @ldexp(double %a, i32 %b) + ret double %result +} + +define ppc_fp128 @call_ldexpl(ppc_fp128 %a, i32 %b) { +; CHECK-LABEL: call_ldexpl: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: clrldi r5, r5, 32 +; CHECK-NEXT: bl ldexpl +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr + %result = call ppc_fp128 @ldexpl(ppc_fp128 %a, i32 %b) + ret ppc_fp128 %result +} + +declare float @ldexpf(float %a, i32 %b) #0 +declare double @ldexp(double %a, i32 %b) #0 +declare ppc_fp128 @ldexpl(ppc_fp128 %a, i32 %b) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/PowerPC/ldexp.ll b/llvm/test/CodeGen/PowerPC/ldexp.ll new file mode 100644 index 0000000..7a6bab8 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/ldexp.ll @@ -0,0 +1,223 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s + +define float @ldexp_f32(i8 zeroext %x) { +; CHECK-LABEL: ldexp_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: vspltisw v2, 1 +; CHECK-NEXT: mr r4, r3 +; CHECK-NEXT: xvcvsxwdp vs1, v2 +; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr + %zext = zext i8 %x to i32 + %ldexp = call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 %zext) + ret float %ldexp +} + +define double @ldexp_f64(i8 zeroext %x) { +; CHECK-LABEL: ldexp_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: vspltisw v2, 1 +; CHECK-NEXT: mr r4, r3 +; CHECK-NEXT: xvcvsxwdp vs1, v2 +; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; CHECK-NEXT: bl ldexp +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr + %zext = zext i8 %x to i32 + %ldexp = call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 %zext) + ret double %ldexp +} + +define <2 x float> @ldexp_v2f32(<2 x float> %val, <2 x i32> %exp) { +; CHECK-LABEL: ldexp_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -96(r1) +; CHECK-NEXT: std r0, 112(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: .cfi_offset v28, -64 +; CHECK-NEXT: .cfi_offset v29, 
-48 +; CHECK-NEXT: .cfi_offset v30, -32 +; CHECK-NEXT: .cfi_offset v31, -16 +; CHECK-NEXT: li r3, 12 +; CHECK-NEXT: xscvspdpn f1, v2 +; CHECK-NEXT: stxv v28, 32(r1) # 16-byte Folded Spill +; CHECK-NEXT: stxv v29, 48(r1) # 16-byte Folded Spill +; CHECK-NEXT: stxv v30, 64(r1) # 16-byte Folded Spill +; CHECK-NEXT: stxv v31, 80(r1) # 16-byte Folded Spill +; CHECK-NEXT: vmr v31, v3 +; CHECK-NEXT: vmr v30, v2 +; CHECK-NEXT: vextuwrx r4, r3, v3 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: nop +; CHECK-NEXT: xxswapd vs0, v30 +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: xscpsgndp v29, f1, f1 +; CHECK-NEXT: xscvspdpn f1, vs0 +; CHECK-NEXT: vextuwrx r4, r3, v31 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; CHECK-NEXT: xxmrghd vs0, v29, vs1 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: vextuwrx r4, r3, v31 +; CHECK-NEXT: xvcvdpsp v28, vs0 +; CHECK-NEXT: xxsldwi vs0, v30, v30, 3 +; CHECK-NEXT: xscvspdpn f1, vs0 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: nop +; CHECK-NEXT: xxsldwi vs0, v30, v30, 1 +; CHECK-NEXT: xscpsgndp v29, f1, f1 +; CHECK-NEXT: mfvsrwz r4, v31 +; CHECK-NEXT: xscvspdpn f1, vs0 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; CHECK-NEXT: xxmrghd vs0, vs1, v29 +; CHECK-NEXT: lxv v31, 80(r1) # 16-byte Folded Reload +; CHECK-NEXT: lxv v30, 64(r1) # 16-byte Folded Reload +; CHECK-NEXT: lxv v29, 48(r1) # 16-byte Folded Reload +; CHECK-NEXT: xvcvdpsp v2, vs0 +; CHECK-NEXT: vmrgew v2, v28, v2 +; CHECK-NEXT: lxv v28, 32(r1) # 16-byte Folded Reload +; CHECK-NEXT: addi r1, r1, 96 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr + %1 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %val, <2 x i32> %exp) + ret <2 x float> %1 +} + +define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { +; CHECK-LABEL: ldexp_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -96(r1) +; CHECK-NEXT: std r0, 112(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: .cfi_offset v28, -64 +; CHECK-NEXT: .cfi_offset v29, -48 +; CHECK-NEXT: .cfi_offset v30, -32 +; CHECK-NEXT: .cfi_offset v31, -16 +; CHECK-NEXT: li r3, 12 +; CHECK-NEXT: xscvspdpn f1, v2 +; CHECK-NEXT: stxv v28, 32(r1) # 16-byte Folded Spill +; CHECK-NEXT: stxv v29, 48(r1) # 16-byte Folded Spill +; CHECK-NEXT: stxv v30, 64(r1) # 16-byte Folded Spill +; CHECK-NEXT: stxv v31, 80(r1) # 16-byte Folded Spill +; CHECK-NEXT: vmr v31, v3 +; CHECK-NEXT: vmr v30, v2 +; CHECK-NEXT: vextuwrx r4, r3, v3 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: nop +; CHECK-NEXT: xxswapd vs0, v30 +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: xscpsgndp v29, f1, f1 +; CHECK-NEXT: xscvspdpn f1, vs0 +; CHECK-NEXT: vextuwrx r4, r3, v31 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; CHECK-NEXT: xxmrghd vs0, v29, vs1 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: vextuwrx r4, r3, v31 +; CHECK-NEXT: xvcvdpsp v28, vs0 +; CHECK-NEXT: xxsldwi vs0, v30, v30, 3 +; CHECK-NEXT: xscvspdpn f1, vs0 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: nop +; CHECK-NEXT: xxsldwi vs0, v30, v30, 1 +; CHECK-NEXT: xscpsgndp v29, f1, f1 +; CHECK-NEXT: mfvsrwz r4, v31 +; CHECK-NEXT: xscvspdpn f1, vs0 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; CHECK-NEXT: xxmrghd vs0, vs1, v29 +; CHECK-NEXT: lxv v31, 80(r1) # 16-byte Folded Reload +; CHECK-NEXT: lxv v30, 64(r1) # 16-byte Folded Reload +; CHECK-NEXT: lxv v29, 48(r1) # 16-byte Folded Reload +; CHECK-NEXT: 
xvcvdpsp v2, vs0 +; CHECK-NEXT: vmrgew v2, v28, v2 +; CHECK-NEXT: lxv v28, 32(r1) # 16-byte Folded Reload +; CHECK-NEXT: addi r1, r1, 96 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr + %1 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %val, <4 x i32> %exp) + ret <4 x float> %1 +} + +define half @ldexp_f16(half %arg0, i32 %arg1) { +; CHECK-LABEL: ldexp_f16: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: xscvdphp f0, f1 +; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: mffprwz r3, f0 +; CHECK-NEXT: clrlwi r3, r3, 16 +; CHECK-NEXT: mtfprwz f0, r3 +; CHECK-NEXT: xscvhpdp f1, f0 +; CHECK-NEXT: bl ldexpf +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr + %ldexp = call half @llvm.ldexp.f16.i32(half %arg0, i32 %arg1) + ret half %ldexp +} + +define ppc_fp128 @ldexp_fp128(ppc_fp128 %arg0, i32 %arg1) { +; CHECK-LABEL: ldexp_fp128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: clrldi r5, r5, 32 +; CHECK-NEXT: bl ldexpl +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr + %ldexp = call ppc_fp128 @llvm.ldexp.ppcf128.i32(ppc_fp128 %arg0, i32 %arg1) + ret ppc_fp128 %ldexp +} + +declare double @llvm.ldexp.f64.i32(double, i32) #0 +declare float @llvm.ldexp.f32.i32(float, i32) #0 +declare <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float>, <2 x i32>) #0 +declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>) #0 +declare half @llvm.ldexp.f16.i32(half, i32) #0 +declare ppc_fp128 @llvm.ldexp.ppcf128.i32(ppc_fp128, i32) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/X86/ldexp-f80.ll b/llvm/test/CodeGen/X86/ldexp-f80.ll new file mode 100644 index 0000000..3a10eab --- /dev/null +++ b/llvm/test/CodeGen/X86/ldexp-f80.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck -check-prefixes=X64 %s +; FIXME: Expansion without libcall +; XUN: llc -mtriple=i386-pc-win32 < %s | FileCheck -check-prefix=WIN32 %s + +define x86_fp80 @ldexp_f80(x86_fp80 %arg0, i32 %arg1) { +; X64-LABEL: ldexp_f80: +; X64: # %bb.0: +; X64-NEXT: subq $24, %rsp +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: callq ldexpl@PLT +; X64-NEXT: addq $24, %rsp +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq + %ldexp = call x86_fp80 @llvm.ldexp.f80.i32(x86_fp80 %arg0, i32 %arg1) + ret x86_fp80 %ldexp +} + +define x86_fp80 @test_strict_ldexp_f80_i32(ptr addrspace(1) %out, x86_fp80 %a, i32 %b) #2 { +; X64-LABEL: test_strict_ldexp_f80_i32: +; X64: # %bb.0: +; X64-NEXT: subq $24, %rsp +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: movl %esi, %edi +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq ldexpl@PLT +; X64-NEXT: addq $24, %rsp +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq + %result = call x86_fp80 @llvm.experimental.constrained.ldexp.f80.i32(x86_fp80 %a, i32 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret x86_fp80 %result +} + +declare x86_fp80 
@llvm.ldexp.f80.i32(x86_fp80, i32) +declare x86_fp80 @llvm.experimental.constrained.ldexp.f80.i32(x86_fp80, i32, metadata, metadata) #1 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } +attributes #2 = { strictfp } diff --git a/llvm/test/CodeGen/X86/ldexp-libcall.ll b/llvm/test/CodeGen/X86/ldexp-libcall.ll new file mode 100644 index 0000000..3aec496 --- /dev/null +++ b/llvm/test/CodeGen/X86/ldexp-libcall.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=i386-pc-win32 | FileCheck %s -check-prefix=CHECK-WIN + +define float @call_ldexpf(float %a, i32 %b) { +; CHECK-LABEL: call_ldexpf: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp ldexpf@PLT # TAILCALL +; +; CHECK-WIN-LABEL: call_ldexpf: +; CHECK-WIN: # %bb.0: +; CHECK-WIN-NEXT: subl $8, %esp +; CHECK-WIN-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-WIN-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: fstps (%esp) +; CHECK-WIN-NEXT: calll _ldexpf +; CHECK-WIN-NEXT: addl $8, %esp +; CHECK-WIN-NEXT: retl + %result = call float @ldexpf(float %a, i32 %b) + ret float %result +} + +define double @call_ldexp(double %a, i32 %b) { +; CHECK-LABEL: call_ldexp: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp ldexp@PLT # TAILCALL +; +; CHECK-WIN-LABEL: call_ldexp: +; CHECK-WIN: # %bb.0: +; CHECK-WIN-NEXT: subl $12, %esp +; CHECK-WIN-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-WIN-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: fstpl (%esp) +; CHECK-WIN-NEXT: calll _ldexp +; CHECK-WIN-NEXT: addl $12, %esp +; CHECK-WIN-NEXT: retl + %result = call double @ldexp(double %a, i32 %b) + ret double %result +} + +define x86_fp80 @call_ldexpl(x86_fp80 %a, i32 %b) { +; CHECK-LABEL: call_ldexpl: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt (%rsp) +; CHECK-NEXT: callq ldexpl@PLT +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; CHECK-WIN-LABEL: call_ldexpl: +; CHECK-WIN: # %bb.0: +; CHECK-WIN-NEXT: pushl %ebp +; CHECK-WIN-NEXT: movl %esp, %ebp +; CHECK-WIN-NEXT: andl $-16, %esp +; CHECK-WIN-NEXT: subl $48, %esp +; CHECK-WIN-NEXT: fldt 8(%ebp) +; CHECK-WIN-NEXT: movl 24(%ebp), %eax +; CHECK-WIN-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: fstpt (%esp) +; CHECK-WIN-NEXT: calll _ldexpl +; CHECK-WIN-NEXT: movl %ebp, %esp +; CHECK-WIN-NEXT: popl %ebp +; CHECK-WIN-NEXT: retl + %result = call x86_fp80 @ldexpl(x86_fp80 %a, i32 %b) + ret x86_fp80 %result +} + +declare float @ldexpf(float %a, i32 %b) #0 +declare double @ldexp(double %a, i32 %b) #0 +declare x86_fp80 @ldexpl(x86_fp80 %a, i32 %b) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/X86/ldexp-not-readonly.ll b/llvm/test/CodeGen/X86/ldexp-not-readonly.ll new file mode 100644 index 0000000..aec7773 --- /dev/null +++ b/llvm/test/CodeGen/X86/ldexp-not-readonly.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=i386-pc-win32 | FileCheck %s -check-prefix=CHECK-WIN + +define float @call_ldexpf(float %a, i32 %b) { +; 
CHECK-LABEL: call_ldexpf: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; CHECK-WIN-LABEL: call_ldexpf: +; CHECK-WIN: # %bb.0: +; CHECK-WIN-NEXT: subl $8, %esp +; CHECK-WIN-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-WIN-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: fstps (%esp) +; CHECK-WIN-NEXT: calll _ldexpf +; CHECK-WIN-NEXT: addl $8, %esp +; CHECK-WIN-NEXT: retl + %result = call float @ldexpf(float %a, i32 %b) + ret float %result +} + +define double @call_ldexp(double %a, i32 %b) { +; CHECK-LABEL: call_ldexp: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; CHECK-WIN-LABEL: call_ldexp: +; CHECK-WIN: # %bb.0: +; CHECK-WIN-NEXT: subl $12, %esp +; CHECK-WIN-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-WIN-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: fstpl (%esp) +; CHECK-WIN-NEXT: calll _ldexp +; CHECK-WIN-NEXT: addl $12, %esp +; CHECK-WIN-NEXT: retl + %result = call double @ldexp(double %a, i32 %b) + ret double %result +} + +declare float @ldexpf(float %a, i32 %b) +declare double @ldexp(double %a, i32 %b) diff --git a/llvm/test/CodeGen/X86/ldexp-strict.ll b/llvm/test/CodeGen/X86/ldexp-strict.ll new file mode 100644 index 0000000..67e348c --- /dev/null +++ b/llvm/test/CodeGen/X86/ldexp-strict.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck -check-prefixes=X64 %s +; XUN: llc -mtriple=i386-pc-win32 < %s | FileCheck -check-prefix=WIN32 %s +; FIXME: Expansion support without libcalls + +; FIXME: Implement f16->f32 promotion for strictfp +; define half @test_strict_ldexp_f16_i32(ptr addrspace(1) %out, half %a, i32 %b) #2 { +; %result = call half @llvm.experimental.constrained.ldexp.f16.i32(half %a, i32 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") +; ret half %result +; } + +define float @test_strict_ldexp_f32_i32(ptr addrspace(1) %out, float %a, i32 %b) #2 { +; X64-LABEL: test_strict_ldexp_f32_i32: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: movl %esi, %edi +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq + %result = call float @llvm.experimental.constrained.ldexp.f32.i32(float %a, i32 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret float %result +} + +define double @test_strict_ldexp_f64_i32(ptr addrspace(1) %out, double %a, i32 %b) #2 { +; X64-LABEL: test_strict_ldexp_f64_i32: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: movl %esi, %edi +; X64-NEXT: callq ldexp@PLT +; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq + %result = call double @llvm.experimental.constrained.ldexp.f64.i32(double %a, i32 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret double %result +} + + +define <2 x float> @test_strict_ldexp_v2f32_v2i32(ptr addrspace(1) %out, <2 x float> %a, <2 x i32> %b) #2 { +; X64-LABEL: test_strict_ldexp_v2f32_v2i32: +; X64: # %bb.0: +; X64-NEXT: subq $56, %rsp +; X64-NEXT: .cfi_def_cfa_offset 64 +; X64-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[1,1,1,1] +; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: addq $56, %rsp +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq + %result = call <2 x float> @llvm.experimental.constrained.ldexp.v2f32.v2i32(<2 x float> %a, <2 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x float> %result +} + +declare half @llvm.experimental.constrained.ldexp.f16.i32(half, i32, metadata, metadata) #1 +declare float @llvm.experimental.constrained.ldexp.f32.i32(float, i32, metadata, metadata) #1 +declare double @llvm.experimental.constrained.ldexp.f64.i32(double, i32, metadata, metadata) #1 +declare x86_fp80 @llvm.experimental.constrained.ldexp.f80.i32(x86_fp80, i32, metadata, metadata) #1 +declare <2 x float> @llvm.experimental.constrained.ldexp.v2f32.v2i32(<2 x float>, <2 x i32>, metadata, metadata) #1 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } +attributes #2 = { strictfp } diff --git a/llvm/test/CodeGen/X86/ldexp-wrong-signature.ll b/llvm/test/CodeGen/X86/ldexp-wrong-signature.ll new file mode 100644 index 0000000..ac58bb5 --- /dev/null +++ b/llvm/test/CodeGen/X86/ldexp-wrong-signature.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=i386-pc-win32 | FileCheck %s -check-prefix=CHECK-WIN + +define float @ldexpf_too_many_args(float %a, i32 %b, i32 %c) { +; CHECK-LABEL: ldexpf_too_many_args: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; CHECK-WIN-LABEL: ldexpf_too_many_args: +; CHECK-WIN: # %bb.0: +; CHECK-WIN-NEXT: subl $12, %esp +; CHECK-WIN-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-WIN-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-WIN-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: fstps (%esp) +; CHECK-WIN-NEXT: calll _ldexpf +; CHECK-WIN-NEXT: addl $12, %esp +; CHECK-WIN-NEXT: retl + %result = call float @ldexpf(float %a, i32 %b, i32 %c) #0 + ret float %result +} + +define float @ldexp_wrong_fp_type(float %a, i32 %b) { +; CHECK-LABEL: ldexp_wrong_fp_type: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; CHECK-WIN-LABEL: ldexp_wrong_fp_type: +; CHECK-WIN: # %bb.0: +; CHECK-WIN-NEXT: subl $8, %esp +; CHECK-WIN-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-WIN-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: fstps (%esp) +; 
CHECK-WIN-NEXT: calll _ldexp +; CHECK-WIN-NEXT: addl $8, %esp +; CHECK-WIN-NEXT: retl + %result = call float @ldexp(float %a, i32 %b) #0 + ret float %result +} + +declare float @ldexpf(float, i32, i32) #0 +declare float @ldexp(float, i32) #0 + +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/X86/ldexp-wrong-signature2.ll b/llvm/test/CodeGen/X86/ldexp-wrong-signature2.ll new file mode 100644 index 0000000..ac79973 --- /dev/null +++ b/llvm/test/CodeGen/X86/ldexp-wrong-signature2.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=i386-pc-win32 | FileCheck %s -check-prefix=CHECK-WIN + +define i32 @ldexpf_not_fp(i32 %a, i32 %b) { +; CHECK-LABEL: ldexpf_not_fp: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq ldexpf@PLT +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; CHECK-WIN-LABEL: ldexpf_not_fp: +; CHECK-WIN: # %bb.0: +; CHECK-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: calll _ldexpf +; CHECK-WIN-NEXT: addl $8, %esp +; CHECK-WIN-NEXT: retl + %result = call i32 @ldexpf(i32 %a, i32 %b) #0 + ret i32 %result +} + +define float @ldexp_not_int(float %a, float %b) { +; CHECK-LABEL: ldexp_not_int: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq ldexp@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; CHECK-WIN-LABEL: ldexp_not_int: +; CHECK-WIN: # %bb.0: +; CHECK-WIN-NEXT: subl $8, %esp +; CHECK-WIN-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-WIN-NEXT: fstps (%esp) +; CHECK-WIN-NEXT: calll _ldexp +; CHECK-WIN-NEXT: addl $8, %esp +; CHECK-WIN-NEXT: retl + %result = call float @ldexp(float %a, float %b) #0 + ret float %result +} + +declare i32 @ldexpf(i32, i32) #0 +declare float @ldexp(float, float) #0 + +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/X86/ldexp.ll b/llvm/test/CodeGen/X86/ldexp.ll new file mode 100644 index 0000000..bbf0f97 --- /dev/null +++ b/llvm/test/CodeGen/X86/ldexp.ll @@ -0,0 +1,784 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck -check-prefixes=X64 %s +; RUN: llc -mtriple=i386-pc-win32 < %s | FileCheck -check-prefix=WIN32 %s + +define float @ldexp_f32(i8 zeroext %x) { +; X64-LABEL: ldexp_f32: +; X64: # %bb.0: +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: jmp ldexpf@PLT # TAILCALL +; +; WIN32-LABEL: ldexp_f32: +; WIN32: # %bb.0: +; WIN32-NEXT: pushl %eax +; WIN32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: cmpl $381, %ecx # imm = 0x17D +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: jl LBB0_2 +; WIN32-NEXT: # %bb.1: +; WIN32-NEXT: movl $381, %eax # imm = 0x17D +; WIN32-NEXT: LBB0_2: +; WIN32-NEXT: addl $-254, %eax +; WIN32-NEXT: leal -127(%ecx), %edx +; WIN32-NEXT: cmpl $255, %ecx +; WIN32-NEXT: jae LBB0_4 +; WIN32-NEXT: # %bb.3: +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: LBB0_4: +; WIN32-NEXT: flds __real@7f800000 +; WIN32-NEXT: flds __real@7f000000 +; WIN32-NEXT: jae LBB0_6 +; WIN32-NEXT: # %bb.5: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: fldz +; WIN32-NEXT: LBB0_6: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: cmpl $-329, 
%ecx # imm = 0xFEB7 +; WIN32-NEXT: movl %ecx, %edx +; WIN32-NEXT: jge LBB0_8 +; WIN32-NEXT: # %bb.7: +; WIN32-NEXT: movl $-330, %edx # imm = 0xFEB6 +; WIN32-NEXT: LBB0_8: +; WIN32-NEXT: cmpl $-228, %ecx +; WIN32-NEXT: fldz +; WIN32-NEXT: flds __real@0c800000 +; WIN32-NEXT: jb LBB0_9 +; WIN32-NEXT: # %bb.10: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: leal 102(%ecx), %edx +; WIN32-NEXT: cmpl $-126, %ecx +; WIN32-NEXT: jge LBB0_12 +; WIN32-NEXT: jmp LBB0_13 +; WIN32-NEXT: LBB0_9: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: addl $204, %edx +; WIN32-NEXT: cmpl $-126, %ecx +; WIN32-NEXT: jl LBB0_13 +; WIN32-NEXT: LBB0_12: +; WIN32-NEXT: movl %ecx, %edx +; WIN32-NEXT: LBB0_13: +; WIN32-NEXT: fld1 +; WIN32-NEXT: jl LBB0_15 +; WIN32-NEXT: # %bb.14: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: fldz +; WIN32-NEXT: LBB0_15: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: cmpl $127, %ecx +; WIN32-NEXT: jg LBB0_17 +; WIN32-NEXT: # %bb.16: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: fldz +; WIN32-NEXT: LBB0_17: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: shll $23, %eax +; WIN32-NEXT: addl $1065353216, %eax # imm = 0x3F800000 +; WIN32-NEXT: movl %eax, (%esp) +; WIN32-NEXT: fmuls (%esp) +; WIN32-NEXT: popl %eax +; WIN32-NEXT: retl + %zext = zext i8 %x to i32 + %ldexp = call float @llvm.ldexp.f32.i32(float 1.000000e+00, i32 %zext) + ret float %ldexp +} + +define double @ldexp_f64(i8 zeroext %x) { +; X64-LABEL: ldexp_f64: +; X64: # %bb.0: +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: jmp ldexp@PLT # TAILCALL +; +; WIN32-LABEL: ldexp_f64: +; WIN32: # %bb.0: +; WIN32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: pushl %eax +; WIN32-NEXT: pushl $1072693248 # imm = 0x3FF00000 +; WIN32-NEXT: pushl $0 +; WIN32-NEXT: calll _ldexp +; WIN32-NEXT: addl $12, %esp +; WIN32-NEXT: retl + %zext = zext i8 %x to i32 + %ldexp = call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 %zext) + ret double %ldexp +} + +define <2 x float> @ldexp_v2f32(<2 x float> %val, <2 x i32> %exp) { +; X64-LABEL: ldexp_v2f32: +; X64: # %bb.0: +; X64-NEXT: subq $72, %rsp +; X64-NEXT: .cfi_def_cfa_offset 80 +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; X64-NEXT: movd %xmm2, %edi +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,2,3] +; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: movd %xmm0, %edi +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[1,1,1,1] +; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = xmm1[0],mem[0] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: addq $72, %rsp +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq +; +; WIN32-LABEL: ldexp_v2f32: +; WIN32: # %bb.0: +; WIN32-NEXT: pushl %edi +; WIN32-NEXT: pushl %esi +; WIN32-NEXT: subl $8, %esp +; WIN32-NEXT: flds {{[0-9]+}}(%esp) +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: cmpl $-329, %eax # imm = 0xFEB7 +; WIN32-NEXT: movl %eax, %edx +; WIN32-NEXT: jge LBB2_2 +; WIN32-NEXT: # %bb.1: +; WIN32-NEXT: movl $-330, %edx # imm = 0xFEB6 +; WIN32-NEXT: LBB2_2: +; WIN32-NEXT: addl $204, %edx +; WIN32-NEXT: leal 102(%eax), %ecx +; WIN32-NEXT: cmpl $-228, %eax +; WIN32-NEXT: jb LBB2_4 +; WIN32-NEXT: # %bb.3: +; WIN32-NEXT: movl %ecx, %edx +; WIN32-NEXT: LBB2_4: +; WIN32-NEXT: flds __real@0c800000 +; WIN32-NEXT: fld %st(1) +; WIN32-NEXT: fmul %st(1), %st +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(2), %st +; WIN32-NEXT: jb LBB2_6 +; WIN32-NEXT: # %bb.5: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: LBB2_6: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: cmpl $-126, %eax +; WIN32-NEXT: jl LBB2_8 +; WIN32-NEXT: # %bb.7: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: fld %st(1) +; WIN32-NEXT: movl %eax, %edx +; WIN32-NEXT: LBB2_8: +; WIN32-NEXT: cmpl $381, %eax # imm = 0x17D +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: jl LBB2_10 +; WIN32-NEXT: # %bb.9: +; WIN32-NEXT: movl $381, %esi # imm = 0x17D +; WIN32-NEXT: LBB2_10: +; WIN32-NEXT: flds __real@7f000000 +; WIN32-NEXT: fmul %st, %st(3) +; WIN32-NEXT: fld %st(3) +; WIN32-NEXT: fmul %st(1), %st +; WIN32-NEXT: leal -127(%eax), %ecx +; WIN32-NEXT: cmpl $255, %eax +; WIN32-NEXT: jae LBB2_11 +; WIN32-NEXT: # %bb.12: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: jmp LBB2_13 +; WIN32-NEXT: LBB2_11: +; WIN32-NEXT: fstp %st(4) +; WIN32-NEXT: addl $-254, %esi +; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: LBB2_13: +; WIN32-NEXT: cmpl $127, %eax +; WIN32-NEXT: flds {{[0-9]+}}(%esp) +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: jg LBB2_15 +; WIN32-NEXT: # %bb.14: +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: LBB2_15: +; WIN32-NEXT: cmpl $381, %esi # imm = 0x17D +; WIN32-NEXT: movl %esi, %edx +; WIN32-NEXT: jl LBB2_17 +; WIN32-NEXT: # %bb.16: +; WIN32-NEXT: movl $381, %edx # imm = 0x17D +; WIN32-NEXT: LBB2_17: +; WIN32-NEXT: addl $-254, %edx +; WIN32-NEXT: leal -127(%esi), %edi +; WIN32-NEXT: cmpl $255, %esi +; WIN32-NEXT: jae LBB2_19 +; WIN32-NEXT: # %bb.18: +; WIN32-NEXT: movl %edi, %edx +; WIN32-NEXT: LBB2_19: +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(2), %st +; WIN32-NEXT: fmul %st, %st(2) +; WIN32-NEXT: jae LBB2_21 +; WIN32-NEXT: # %bb.20: +; WIN32-NEXT: fstp %st(2) +; WIN32-NEXT: fldz +; WIN32-NEXT: LBB2_21: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: cmpl $-329, %esi # imm = 0xFEB7 +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: jge LBB2_23 +; WIN32-NEXT: # %bb.22: +; WIN32-NEXT: movl $-330, %edi # imm = 0xFEB6 +; WIN32-NEXT: LBB2_23: +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(4), %st +; WIN32-NEXT: fmul %st, %st(4) +; WIN32-NEXT: cmpl $-228, %esi +; WIN32-NEXT: jb LBB2_24 +; WIN32-NEXT: # %bb.25: +; WIN32-NEXT: fstp %st(4) +; WIN32-NEXT: leal 102(%esi), %edi +; WIN32-NEXT: cmpl $-126, %esi +; WIN32-NEXT: jge LBB2_27 +; WIN32-NEXT: jmp LBB2_28 +; WIN32-NEXT: LBB2_24: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: addl $204, %edi +; WIN32-NEXT: cmpl $-126, %esi +; WIN32-NEXT: jl 
LBB2_28 +; WIN32-NEXT: LBB2_27: +; WIN32-NEXT: fstp %st(3) +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: fldz +; WIN32-NEXT: LBB2_28: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: cmpl $127, %esi +; WIN32-NEXT: jg LBB2_30 +; WIN32-NEXT: # %bb.29: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: movl %edi, %edx +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(2) +; WIN32-NEXT: LBB2_30: +; WIN32-NEXT: fstp %st(2) +; WIN32-NEXT: cmpl $127, %eax +; WIN32-NEXT: jg LBB2_32 +; WIN32-NEXT: # %bb.31: +; WIN32-NEXT: fstp %st(2) +; WIN32-NEXT: fldz +; WIN32-NEXT: LBB2_32: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: shll $23, %ecx +; WIN32-NEXT: addl $1065353216, %ecx # imm = 0x3F800000 +; WIN32-NEXT: movl %ecx, (%esp) +; WIN32-NEXT: shll $23, %edx +; WIN32-NEXT: addl $1065353216, %edx # imm = 0x3F800000 +; WIN32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: fmuls (%esp) +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: fmuls {{[0-9]+}}(%esp) +; WIN32-NEXT: addl $8, %esp +; WIN32-NEXT: popl %esi +; WIN32-NEXT: popl %edi +; WIN32-NEXT: retl + %1 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %val, <2 x i32> %exp) + ret <2 x float> %1 +} + +define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) { +; X64-LABEL: ldexp_v4f32: +; X64: # %bb.0: +; X64-NEXT: subq $72, %rsp +; X64-NEXT: .cfi_def_cfa_offset 80 +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; X64-NEXT: movd %xmm2, %edi +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,2,3] +; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: movd %xmm0, %edi +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[1,1,1,1] +; X64-NEXT: movd %xmm1, %edi +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = xmm1[0],mem[0] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: addq $72, %rsp +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq +; +; WIN32-LABEL: ldexp_v4f32: +; WIN32: # %bb.0: +; WIN32-NEXT: pushl %ebp +; WIN32-NEXT: pushl %ebx +; WIN32-NEXT: pushl %edi +; WIN32-NEXT: pushl %esi +; WIN32-NEXT: subl $32, %esp +; WIN32-NEXT: flds {{[0-9]+}}(%esp) +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: flds {{[0-9]+}}(%esp) +; WIN32-NEXT: flds __real@7f000000 +; WIN32-NEXT: fld %st(1) +; WIN32-NEXT: fmul %st(1), %st +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(2), %st +; WIN32-NEXT: cmpl $255, %ecx +; WIN32-NEXT: jae 
LBB3_2 +; WIN32-NEXT: # %bb.1: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: LBB3_2: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: cmpl $-329, %ecx # imm = 0xFEB7 +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: jge LBB3_4 +; WIN32-NEXT: # %bb.3: +; WIN32-NEXT: movl $-330, %esi # imm = 0xFEB6 +; WIN32-NEXT: LBB3_4: +; WIN32-NEXT: addl $204, %esi +; WIN32-NEXT: leal 102(%ecx), %eax +; WIN32-NEXT: cmpl $-228, %ecx +; WIN32-NEXT: jb LBB3_6 +; WIN32-NEXT: # %bb.5: +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: LBB3_6: +; WIN32-NEXT: flds __real@0c800000 +; WIN32-NEXT: fld %st(3) +; WIN32-NEXT: fmul %st(1), %st +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(2), %st +; WIN32-NEXT: jb LBB3_8 +; WIN32-NEXT: # %bb.7: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: LBB3_8: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: cmpl $-126, %ecx +; WIN32-NEXT: jl LBB3_10 +; WIN32-NEXT: # %bb.9: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(4) +; WIN32-NEXT: LBB3_10: +; WIN32-NEXT: fstp %st(4) +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %ecx, %edx +; WIN32-NEXT: subl $127, %edx +; WIN32-NEXT: jg LBB3_12 +; WIN32-NEXT: # %bb.11: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(3) +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: LBB3_12: +; WIN32-NEXT: fstp %st(3) +; WIN32-NEXT: fld %st(3) +; WIN32-NEXT: fmul %st(2), %st +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(3), %st +; WIN32-NEXT: cmpl $255, %edi +; WIN32-NEXT: jae LBB3_14 +; WIN32-NEXT: # %bb.13: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: LBB3_14: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; WIN32-NEXT: cmpl $-329, %edi # imm = 0xFEB7 +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: jge LBB3_16 +; WIN32-NEXT: # %bb.15: +; WIN32-NEXT: movl $-330, %eax # imm = 0xFEB6 +; WIN32-NEXT: LBB3_16: +; WIN32-NEXT: fld %st(3) +; WIN32-NEXT: fmul %st(3), %st +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(4), %st +; WIN32-NEXT: cmpl $-228, %edi +; WIN32-NEXT: jb LBB3_17 +; WIN32-NEXT: # %bb.18: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: leal 102(%edi), %eax +; WIN32-NEXT: cmpl $-126, %edi +; WIN32-NEXT: jge LBB3_20 +; WIN32-NEXT: jmp LBB3_21 +; WIN32-NEXT: LBB3_17: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: addl $204, %eax +; WIN32-NEXT: cmpl $-126, %edi +; WIN32-NEXT: jl LBB3_21 +; WIN32-NEXT: LBB3_20: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(4) +; WIN32-NEXT: LBB3_21: +; WIN32-NEXT: fstp %st(4) +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %edi, %ebx +; WIN32-NEXT: subl $127, %ebx +; WIN32-NEXT: jg LBB3_23 +; WIN32-NEXT: # %bb.22: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(3) +; WIN32-NEXT: LBB3_23: +; WIN32-NEXT: fstp %st(3) +; WIN32-NEXT: cmpl $381, %edi # imm = 0x17D +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: jge LBB3_24 +; WIN32-NEXT: # %bb.25: +; WIN32-NEXT: cmpl $255, %edi +; WIN32-NEXT: jae LBB3_26 +; WIN32-NEXT: LBB3_27: +; WIN32-NEXT: cmpl $-126, %ecx +; WIN32-NEXT: jl LBB3_29 +; WIN32-NEXT: LBB3_28: +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: LBB3_29: +; WIN32-NEXT: cmpl $381, %ecx # imm = 0x17D +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: jl LBB3_31 +; WIN32-NEXT: # %bb.30: +; WIN32-NEXT: movl $381, %eax # imm = 0x17D +; WIN32-NEXT: LBB3_31: +; WIN32-NEXT: cmpl $255, %ecx +; 
WIN32-NEXT: flds {{[0-9]+}}(%esp) +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: jb LBB3_33 +; WIN32-NEXT: # %bb.32: +; WIN32-NEXT: addl $-254, %eax +; WIN32-NEXT: movl %eax, %edx +; WIN32-NEXT: LBB3_33: +; WIN32-NEXT: fxch %st(3) +; WIN32-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; WIN32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: cmpl $381, %ebp # imm = 0x17D +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: jl LBB3_35 +; WIN32-NEXT: # %bb.34: +; WIN32-NEXT: movl $381, %eax # imm = 0x17D +; WIN32-NEXT: LBB3_35: +; WIN32-NEXT: fld %st(2) +; WIN32-NEXT: fmul %st(1), %st +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(2), %st +; WIN32-NEXT: leal -127(%ebp), %edi +; WIN32-NEXT: cmpl $255, %ebp +; WIN32-NEXT: jae LBB3_36 +; WIN32-NEXT: # %bb.37: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: jmp LBB3_38 +; WIN32-NEXT: LBB3_24: +; WIN32-NEXT: movl $381, %eax # imm = 0x17D +; WIN32-NEXT: cmpl $255, %edi +; WIN32-NEXT: jb LBB3_27 +; WIN32-NEXT: LBB3_26: +; WIN32-NEXT: addl $-254, %eax +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: cmpl $-126, %ecx +; WIN32-NEXT: jge LBB3_28 +; WIN32-NEXT: jmp LBB3_29 +; WIN32-NEXT: LBB3_36: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: addl $-254, %eax +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: LBB3_38: +; WIN32-NEXT: cmpl $-329, %ebp # imm = 0xFEB7 +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: jge LBB3_40 +; WIN32-NEXT: # %bb.39: +; WIN32-NEXT: movl $-330, %ecx # imm = 0xFEB6 +; WIN32-NEXT: LBB3_40: +; WIN32-NEXT: addl $204, %ecx +; WIN32-NEXT: leal 102(%ebp), %eax +; WIN32-NEXT: cmpl $-228, %ebp +; WIN32-NEXT: jb LBB3_42 +; WIN32-NEXT: # %bb.41: +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: LBB3_42: +; WIN32-NEXT: fld %st(3) +; WIN32-NEXT: fmul %st(3), %st +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(4), %st +; WIN32-NEXT: jb LBB3_44 +; WIN32-NEXT: # %bb.43: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: LBB3_44: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: cmpl $-126, %ebp +; WIN32-NEXT: jl LBB3_46 +; WIN32-NEXT: # %bb.45: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(4) +; WIN32-NEXT: LBB3_46: +; WIN32-NEXT: fstp %st(4) +; WIN32-NEXT: cmpl $127, %ebp +; WIN32-NEXT: flds {{[0-9]+}}(%esp) +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: jg LBB3_48 +; WIN32-NEXT: # %bb.47: +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: LBB3_48: +; WIN32-NEXT: cmpl $381, %esi # imm = 0x17D +; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: jl LBB3_50 +; WIN32-NEXT: # %bb.49: +; WIN32-NEXT: movl $381, %ecx # imm = 0x17D +; WIN32-NEXT: LBB3_50: +; WIN32-NEXT: addl $-254, %ecx +; WIN32-NEXT: leal -127(%esi), %eax +; WIN32-NEXT: cmpl $255, %esi +; WIN32-NEXT: jae LBB3_52 +; WIN32-NEXT: # %bb.51: +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: LBB3_52: +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(3), %st +; WIN32-NEXT: fmul %st, %st(3) +; WIN32-NEXT: jae LBB3_54 +; WIN32-NEXT: # %bb.53: +; WIN32-NEXT: fstp %st(3) +; WIN32-NEXT: fldz +; WIN32-NEXT: LBB3_54: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: cmpl $-329, %esi # imm = 0xFEB7 +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: jge LBB3_56 +; WIN32-NEXT: # %bb.55: +; WIN32-NEXT: movl $-330, %eax # imm = 0xFEB6 +; WIN32-NEXT: LBB3_56: +; WIN32-NEXT: fld %st(0) +; WIN32-NEXT: fmul %st(4), %st +; WIN32-NEXT: fmul %st, %st(4) +; WIN32-NEXT: cmpl $-228, %esi +; WIN32-NEXT: jb LBB3_57 +; WIN32-NEXT: # %bb.58: +; WIN32-NEXT: fstp %st(4) +; WIN32-NEXT: leal 102(%esi), %eax +; WIN32-NEXT: cmpl 
$-126, %esi +; WIN32-NEXT: jge LBB3_60 +; WIN32-NEXT: jmp LBB3_61 +; WIN32-NEXT: LBB3_57: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: addl $204, %eax +; WIN32-NEXT: cmpl $-126, %esi +; WIN32-NEXT: jl LBB3_61 +; WIN32-NEXT: LBB3_60: +; WIN32-NEXT: fstp %st(3) +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: fldz +; WIN32-NEXT: LBB3_61: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: cmpl $127, %esi +; WIN32-NEXT: jg LBB3_63 +; WIN32-NEXT: # %bb.62: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(2) +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: LBB3_63: +; WIN32-NEXT: fstp %st(2) +; WIN32-NEXT: cmpl $127, {{[0-9]+}}(%esp) +; WIN32-NEXT: jg LBB3_65 +; WIN32-NEXT: # %bb.64: +; WIN32-NEXT: movl (%esp), %ebx # 4-byte Reload +; WIN32-NEXT: LBB3_65: +; WIN32-NEXT: cmpl $127, {{[0-9]+}}(%esp) +; WIN32-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; WIN32-NEXT: jg LBB3_67 +; WIN32-NEXT: # %bb.66: +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; WIN32-NEXT: LBB3_67: +; WIN32-NEXT: cmpl $127, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: jg LBB3_69 +; WIN32-NEXT: # %bb.68: +; WIN32-NEXT: fstp %st(2) +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(3) +; WIN32-NEXT: fxch %st(2) +; WIN32-NEXT: LBB3_69: +; WIN32-NEXT: fstp %st(3) +; WIN32-NEXT: shll $23, %edi +; WIN32-NEXT: addl $1065353216, %edi # imm = 0x3F800000 +; WIN32-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN32-NEXT: shll $23, %ecx +; WIN32-NEXT: addl $1065353216, %ecx # imm = 0x3F800000 +; WIN32-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN32-NEXT: shll $23, %ebx +; WIN32-NEXT: addl $1065353216, %ebx # imm = 0x3F800000 +; WIN32-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN32-NEXT: shll $23, %edx +; WIN32-NEXT: addl $1065353216, %edx # imm = 0x3F800000 +; WIN32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: fmuls {{[0-9]+}}(%esp) +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: fmuls {{[0-9]+}}(%esp) +; WIN32-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; WIN32-NEXT: fmuls {{[0-9]+}}(%esp) +; WIN32-NEXT: fxch %st(3) +; WIN32-NEXT: fmuls {{[0-9]+}}(%esp) +; WIN32-NEXT: fstps 12(%eax) +; WIN32-NEXT: fxch %st(2) +; WIN32-NEXT: fstps 8(%eax) +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: fstps 4(%eax) +; WIN32-NEXT: fstps (%eax) +; WIN32-NEXT: addl $32, %esp +; WIN32-NEXT: popl %esi +; WIN32-NEXT: popl %edi +; WIN32-NEXT: popl %ebx +; WIN32-NEXT: popl %ebp +; WIN32-NEXT: retl + %1 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %val, <4 x i32> %exp) + ret <4 x float> %1 +} + +define half @ldexp_f16(half %arg0, i32 %arg1) { +; X64-LABEL: ldexp_f16: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: .cfi_offset %rbx, -16 +; X64-NEXT: movl %edi, %ebx +; X64-NEXT: callq __extendhfsf2@PLT +; X64-NEXT: movl %ebx, %edi +; X64-NEXT: callq ldexpf@PLT +; X64-NEXT: callq __truncsfhf2@PLT +; X64-NEXT: popq %rbx +; X64-NEXT: .cfi_def_cfa_offset 8 +; X64-NEXT: retq +; +; WIN32-LABEL: ldexp_f16: +; WIN32: # %bb.0: +; WIN32-NEXT: pushl %edi +; WIN32-NEXT: pushl %esi +; WIN32-NEXT: subl $8, %esp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl %eax, (%esp) +; WIN32-NEXT: cmpl $381, %edi # imm = 0x17D +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: jl LBB4_2 +; WIN32-NEXT: # %bb.1: +; WIN32-NEXT: movl $381, %esi # imm = 0x17D +; WIN32-NEXT: LBB4_2: +; WIN32-NEXT: addl $-254, %esi +; WIN32-NEXT: calll ___gnu_h2f_ieee +; WIN32-NEXT: leal -127(%edi), %eax +; 
WIN32-NEXT: cmpl $255, %edi +; WIN32-NEXT: jae LBB4_4 +; WIN32-NEXT: # %bb.3: +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: LBB4_4: +; WIN32-NEXT: flds __real@7f000000 +; WIN32-NEXT: fld %st(1) +; WIN32-NEXT: fmul %st(1), %st +; WIN32-NEXT: fmul %st, %st(1) +; WIN32-NEXT: jae LBB4_6 +; WIN32-NEXT: # %bb.5: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: fldz +; WIN32-NEXT: LBB4_6: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: cmpl $-329, %edi # imm = 0xFEB7 +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: jge LBB4_8 +; WIN32-NEXT: # %bb.7: +; WIN32-NEXT: movl $-330, %eax # imm = 0xFEB6 +; WIN32-NEXT: LBB4_8: +; WIN32-NEXT: flds __real@0c800000 +; WIN32-NEXT: fld %st(2) +; WIN32-NEXT: fmul %st(1), %st +; WIN32-NEXT: fmul %st, %st(1) +; WIN32-NEXT: cmpl $-228, %edi +; WIN32-NEXT: jb LBB4_9 +; WIN32-NEXT: # %bb.10: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: leal 102(%edi), %eax +; WIN32-NEXT: cmpl $-126, %edi +; WIN32-NEXT: jge LBB4_12 +; WIN32-NEXT: jmp LBB4_13 +; WIN32-NEXT: LBB4_9: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: addl $204, %eax +; WIN32-NEXT: cmpl $-126, %edi +; WIN32-NEXT: jl LBB4_13 +; WIN32-NEXT: LBB4_12: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(2) +; WIN32-NEXT: LBB4_13: +; WIN32-NEXT: fstp %st(2) +; WIN32-NEXT: cmpl $127, %edi +; WIN32-NEXT: jg LBB4_15 +; WIN32-NEXT: # %bb.14: +; WIN32-NEXT: fstp %st(0) +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: fldz +; WIN32-NEXT: fxch %st(1) +; WIN32-NEXT: LBB4_15: +; WIN32-NEXT: fstp %st(1) +; WIN32-NEXT: shll $23, %esi +; WIN32-NEXT: addl $1065353216, %esi # imm = 0x3F800000 +; WIN32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN32-NEXT: fmuls {{[0-9]+}}(%esp) +; WIN32-NEXT: fstps (%esp) +; WIN32-NEXT: calll ___gnu_f2h_ieee +; WIN32-NEXT: addl $8, %esp +; WIN32-NEXT: popl %esi +; WIN32-NEXT: popl %edi +; WIN32-NEXT: retl + %ldexp = call half @llvm.ldexp.f16.i32(half %arg0, i32 %arg1) + ret half %ldexp +} + +declare double @llvm.ldexp.f64.i32(double, i32) #0 +declare float @llvm.ldexp.f32.i32(float, i32) #0 +declare <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float>, <2 x i32>) #0 +declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>) #0 +declare half @llvm.ldexp.f16.i32(half, i32) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s index b1b5400..6ea135c 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s @@ -12931,11 +12931,11 @@ v_ldexp_f16_e64 v5, v1, 0 v_ldexp_f16_e64 v5, v1, -1 // GFX10: encoding: [0x05,0x00,0x3b,0xd5,0x01,0x83,0x01,0x00] -v_ldexp_f16_e64 v5, v1, 0.5 -// GFX10: encoding: [0x05,0x00,0x3b,0xd5,0x01,0xe1,0x01,0x00] +v_ldexp_f16_e64 v5, v1, 0x3800 +// GFX10: encoding: [0x05,0x00,0x3b,0xd5,0x01,0xff,0x01,0x00,0x00,0x38,0x00,0x00] v_ldexp_f16_e64 v5, v1, -4.0 -// GFX10: encoding: [0x05,0x00,0x3b,0xd5,0x01,0xef,0x01,0x00] +// GFX10: encoding: [0x05,0x00,0x3b,0xd5,0x01,0xff,0x01,0x00,0x00,0xc4,0x00,0x00] v_ldexp_f16_e64 v5, -v1, v2 // GFX10: encoding: [0x05,0x00,0x3b,0xd5,0x01,0x05,0x02,0x20] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s index 43c71617..ffb8310 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vop2.s @@ -734,7 +734,7 @@ v_ldexp_f16_e64 v5, ttmp15, src_scc // GFX11: 
encoding: [0x05,0x00,0x3b,0xd5,0x7b,0xfa,0x01,0x00] v_ldexp_f16_e64 v5, m0, 0.5 -// GFX11: encoding: [0x05,0x00,0x3b,0xd5,0x7d,0xe0,0x01,0x00] +// GFX11: encoding: [0x05,0x00,0x3b,0xd5,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] v_ldexp_f16_e64 v5, exec_lo, -1 // GFX11: encoding: [0x05,0x00,0x3b,0xd5,0x7e,0x82,0x01,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx8_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx8_asm_vop3.s index d4c31f1..d46a97f 100644 --- a/llvm/test/MC/AMDGPU/gfx8_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx8_asm_vop3.s @@ -1,4 +1,5 @@ -// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding -filetype=null 2>&1 %s | FileCheck -check-prefix=ERR --implicit-check-not=error %s v_interp_p1_f32_e64 v5, v2, attr0.x // CHECK: [0x05,0x00,0x70,0xd2,0x00,0x04,0x02,0x00] @@ -12826,10 +12827,10 @@ v_ldexp_f16_e64 v5, v1, -1 // CHECK: [0x05,0x00,0x33,0xd1,0x01,0x83,0x01,0x00] v_ldexp_f16_e64 v5, v1, 0.5 -// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xe1,0x01,0x00] +// ERR: [[@LINE-1]]:25: error: literal operands are not supported v_ldexp_f16_e64 v5, v1, -4.0 -// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xef,0x01,0x00] +// ERR: [[@LINE-1]]:25: error: literal operands are not supported v_ldexp_f16_e64 v5, v1, src_vccz // CHECK: [0x05,0x00,0x33,0xd1,0x01,0xf7,0x01,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop3.s index 8781a01..34d0d77 100644 --- a/llvm/test/MC/AMDGPU/gfx9_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop3.s @@ -1,4 +1,5 @@ -// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding 2>&1 %s | FileCheck -check-prefix=ERR --implicit-check-not=error %s v_interp_p1_f32_e64 v5, v2, attr0.x // CHECK: [0x05,0x00,0x70,0xd2,0x00,0x04,0x02,0x00] @@ -11239,10 +11240,10 @@ v_ldexp_f16_e64 v5, v1, -1 // CHECK: [0x05,0x00,0x33,0xd1,0x01,0x83,0x01,0x00] v_ldexp_f16_e64 v5, v1, 0.5 -// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xe1,0x01,0x00] +// ERR: [[@LINE-1]]:25: error: literal operands are not supported v_ldexp_f16_e64 v5, v1, -4.0 -// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xef,0x01,0x00] +// ERR: [[@LINE-1]]:25: error: literal operands are not supported v_ldexp_f16_e64 v5, v1, src_vccz // CHECK: [0x05,0x00,0x33,0xd1,0x01,0xf7,0x01,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt index 0785ba2..c5cd6f6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt @@ -7520,13 +7520,13 @@ # GFX10: v_ldexp_f16_e64 v5, v1, -1 ; encoding: [0x05,0x00,0x3b,0xd5,0x01,0x83,0x01,0x00] 0x05,0x00,0x3b,0xd5,0x01,0x83,0x01,0x00 -# GFX10: v_ldexp_f16_e64 v5, v1, -4.0 ; encoding: [0x05,0x00,0x3b,0xd5,0x01,0xef,0x01,0x00] +# GFX10: v_ldexp_f16_e64 v5, v1, 0xc400 ; encoding: [0x05,0x00,0x3b,0xd5,0x01,0xff,0x01,0x00,0x00,0xc4,0x00,0x00] 0x05,0x00,0x3b,0xd5,0x01,0xef,0x01,0x00 # GFX10: v_ldexp_f16_e64 v5, v1, 0 ; encoding: [0x05,0x00,0x3b,0xd5,0x01,0x01,0x01,0x00] 0x05,0x00,0x3b,0xd5,0x01,0x01,0x01,0x00 -# GFX10: v_ldexp_f16_e64 v5, v1, 0.5 ; encoding: [0x05,0x00,0x3b,0xd5,0x01,0xe1,0x01,0x00] +# GFX10: v_ldexp_f16_e64 v5, v1, 0x3800 ; encoding: [0x05,0x00,0x3b,0xd5,0x01,0xff,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x3b,0xd5,0x01,0xe1,0x01,0x00 # GFX10: v_ldexp_f16_e64 v5, v1, 
exec_hi ; encoding: [0x05,0x00,0x3b,0xd5,0x01,0xff,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop2.txt index 3141e8f..4e430a8 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop2.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vop2.txt @@ -543,7 +543,7 @@ # GFX11: v_ldexp_f16_e64 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x3b,0xd5,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x3b,0xd5,0x7b,0xfa,0x01,0x00 -# GFX11: v_ldexp_f16_e64 v5, m0, 0.5 ; encoding: [0x05,0x00,0x3b,0xd5,0x7d,0xe0,0x01,0x00] +# GFX11: v_ldexp_f16_e64 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x3b,0xd5,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x3b,0xd5,0x7d,0xe0,0x01,0x00 # GFX11: v_ldexp_f16_e64 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x3b,0xd5,0x7e,0x82,0x01,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3.txt index 2b07d62..a0277c7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx8_vop3.txt @@ -11178,10 +11178,10 @@ # CHECK: v_ldexp_f16_e64 v5, v1, -1 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0x83,0x01,0x00] 0x05,0x00,0x33,0xd1,0x01,0x83,0x01,0x00 -# CHECK: v_ldexp_f16_e64 v5, v1, 0.5 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0xe1,0x01,0x00] +# CHECK: v_ldexp_f16_e64 v5, v1, 0x3800 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0xff,0x01,0x00] 0x05,0x00,0x33,0xd1,0x01,0xe1,0x01,0x00 -# CHECK: v_ldexp_f16_e64 v5, v1, -4.0 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0xef,0x01,0x00] +# CHECK: v_ldexp_f16_e64 v5, v1, 0xc400 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0xff,0x01,0x00] 0x05,0x00,0x33,0xd1,0x01,0xef,0x01,0x00 # CHECK: v_ldexp_f16_e64 v5, -v1, v2 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0x05,0x02,0x20] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt index e3ed977..c2ac84b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt @@ -8814,10 +8814,10 @@ # CHECK: v_ldexp_f16_e64 v5, v1, -1 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0x83,0x01,0x00] 0x05,0x00,0x33,0xd1,0x01,0x83,0x01,0x00 -# CHECK: v_ldexp_f16_e64 v5, v1, 0.5 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0xe1,0x01,0x00] +# CHECK: v_ldexp_f16_e64 v5, v1, 0x3800 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0xff,0x01,0x00] 0x05,0x00,0x33,0xd1,0x01,0xe1,0x01,0x00 -# CHECK: v_ldexp_f16_e64 v5, v1, -4.0 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0xef,0x01,0x00] +# CHECK: v_ldexp_f16_e64 v5, v1, 0xc400 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0xff,0x01,0x00] 0x05,0x00,0x33,0xd1,0x01,0xef,0x01,0x00 # CHECK: v_ldexp_f16_e64 v5, -v1, v2 ; encoding: [0x05,0x00,0x33,0xd1,0x01,0x05,0x02,0x20] diff --git a/llvm/test/Transforms/SpeculativeExecution/spec-calls.ll b/llvm/test/Transforms/SpeculativeExecution/spec-calls.ll index 8b40ac5..f9033e7 100644 --- a/llvm/test/Transforms/SpeculativeExecution/spec-calls.ll +++ b/llvm/test/Transforms/SpeculativeExecution/spec-calls.ll @@ -302,3 +302,19 @@ b: } declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) + +; CHECK-LABEL: @ifThen_ldexp( +; CHECK: %ldexp = call float @llvm.ldexp.f32.i32(float %x, i32 %y) +; CHECK-NEXT: br i1 true +define void @ifThen_ldexp(float %x, i32 %y) { + br i1 true, label %a, label %b + +a: + %ldexp = call float @llvm.ldexp.f32.i32(float %x, i32 %y) + br label %b + +b: + ret void +} + +declare float @llvm.ldexp.f32.i32(float, i32) -- 2.7.4