From 674d52e8ced27bf427b3ea2c763a566ca9b8212a Mon Sep 17 00:00:00 2001
From: Phoebe Wang
Date: Sun, 27 Mar 2022 12:23:21 +0800
Subject: [PATCH] [X86] Refactor X86ScalarSSEf16/32/64 with hasFP16/SSE1/SSE2.
 NFCI

This is used for f16 emulation. We emulate f16 for SSE2 targets and above.
The refactoring makes the future code cleaner.

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D122475
---
 llvm/lib/Target/X86/X86FastISel.cpp     | 113 ++++++++++++++------------------
 llvm/lib/Target/X86/X86ISelLowering.cpp |  36 +++++++---
 llvm/lib/Target/X86/X86ISelLowering.h   |  25 +------
 3 files changed, 78 insertions(+), 96 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 2e40ad1..397c612 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -49,22 +49,11 @@ class X86FastISel final : public FastISel {
   /// make the right decision when generating code for different targets.
   const X86Subtarget *Subtarget;
 
-  /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
-  /// floating point ops.
-  /// When SSE is available, use it for f32 operations.
-  /// When SSE2 is available, use it for f64 operations.
-  bool X86ScalarSSEf64;
-  bool X86ScalarSSEf32;
-  bool X86ScalarSSEf16;
-
 public:
   explicit X86FastISel(FunctionLoweringInfo &funcInfo,
                        const TargetLibraryInfo *libInfo)
       : FastISel(funcInfo, libInfo) {
     Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
-    X86ScalarSSEf64 = Subtarget->hasSSE2();
-    X86ScalarSSEf32 = Subtarget->hasSSE1();
-    X86ScalarSSEf16 = Subtarget->hasFP16();
   }
 
   bool fastSelectInstruction(const Instruction *I) override;
@@ -158,9 +147,9 @@ private:
   /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
   /// computed in an SSE register, not on the X87 floating point stack.
   bool isScalarFPTypeInSSEReg(EVT VT) const {
-    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
-           (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
-           (VT == MVT::f16 && X86ScalarSSEf16);   // f16 is when AVX512FP16
+    return (VT == MVT::f64 && Subtarget->hasSSE2()) ||
+           (VT == MVT::f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::f16 && Subtarget->hasFP16());
   }
 
   bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
@@ -305,9 +294,9 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
   VT = evt.getSimpleVT();
   // For now, require SSE/SSE2 for performing floating-point operations,
   // since x87 requires additional work.
-  if (VT == MVT::f64 && !X86ScalarSSEf64)
+  if (VT == MVT::f64 && !Subtarget->hasSSE2())
     return false;
-  if (VT == MVT::f32 && !X86ScalarSSEf32)
+  if (VT == MVT::f32 && !Subtarget->hasSSE1())
     return false;
   // Similarly, no f80 support yet.
   if (VT == MVT::f80)
@@ -325,6 +314,8 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
 bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
                                   MachineMemOperand *MMO, unsigned &ResultReg,
                                   unsigned Alignment) {
+  bool HasSSE1 = Subtarget->hasSSE1();
+  bool HasSSE2 = Subtarget->hasSSE2();
   bool HasSSE41 = Subtarget->hasSSE41();
   bool HasAVX = Subtarget->hasAVX();
   bool HasAVX2 = Subtarget->hasAVX2();
@@ -354,20 +345,16 @@ bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
     Opc = X86::MOV64rm;
     break;
   case MVT::f32:
-    if (X86ScalarSSEf32)
-      Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
-            HasAVX    ? X86::VMOVSSrm_alt :
-                        X86::MOVSSrm_alt;
-    else
-      Opc = X86::LD_Fp32m;
+    Opc = HasAVX512 ? X86::VMOVSSZrm_alt
+          : HasAVX  ? X86::VMOVSSrm_alt
+          : HasSSE1 ? X86::MOVSSrm_alt
+                    : X86::LD_Fp32m;
     break;
   case MVT::f64:
-    if (X86ScalarSSEf64)
-      Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
-            HasAVX    ? X86::VMOVSDrm_alt :
-                        X86::MOVSDrm_alt;
-    else
-      Opc = X86::LD_Fp64m;
+    Opc = HasAVX512 ? X86::VMOVSDZrm_alt
+          : HasAVX  ? X86::VMOVSDrm_alt
+          : HasSSE2 ? X86::MOVSDrm_alt
+                    : X86::LD_Fp64m;
    break;
   case MVT::f80:
     // No f80 support yet.
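The hunk above folds the old two-step selection (first test X86ScalarSSEf32/f64, then pick the AVX-512/AVX/SSE flavor) into a single feature-keyed conditional chain that ends in the x87 fallback. A minimal standalone sketch of the same shape, using a made-up subtarget struct and opcode names rather than the real X86 definitions:

#include <cstdio>

// Illustrative stand-in for the real X86Subtarget feature queries.
struct SubtargetInfo {
  bool AVX512 = false, AVX = false, SSE1 = true;
  bool hasAVX512() const { return AVX512; }
  bool hasAVX() const { return AVX; }
  bool hasSSE1() const { return SSE1; }
};

// Hypothetical opcode tags standing in for X86::VMOVSSZrm_alt and friends.
enum Opcode { VMOVSSZ_LOAD, VMOVSS_LOAD, MOVSS_LOAD, X87_FLD32 };

// One chained conditional picks the widest available f32 load and falls back
// to the x87 load when no SSE is present -- the same shape as the refactored
// X86FastEmitLoad case above.
static Opcode chooseF32Load(const SubtargetInfo &ST) {
  return ST.hasAVX512() ? VMOVSSZ_LOAD
         : ST.hasAVX()  ? VMOVSS_LOAD
         : ST.hasSSE1() ? MOVSS_LOAD
                        : X87_FLD32;
}

int main() {
  SubtargetInfo ST; // SSE1-only configuration
  std::printf("chosen f32 load: %d\n", chooseF32Load(ST)); // MOVSS_LOAD
}
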
@@ -521,7 +508,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
     Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
     break;
   case MVT::f32:
-    if (X86ScalarSSEf32) {
+    if (HasSSE1) {
       if (IsNonTemporal && HasSSE4A)
         Opc = X86::MOVNTSS;
       else
@@ -531,7 +518,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
       Opc = X86::ST_Fp32m;
     break;
   case MVT::f64:
-    if (X86ScalarSSEf32) {
+    if (HasSSE2) {
       if (IsNonTemporal && HasSSE4A)
         Opc = X86::MOVNTSD;
       else
@@ -1362,8 +1349,8 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) {
 static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
   bool HasAVX512 = Subtarget->hasAVX512();
   bool HasAVX = Subtarget->hasAVX();
-  bool X86ScalarSSEf32 = Subtarget->hasSSE1();
-  bool X86ScalarSSEf64 = Subtarget->hasSSE2();
+  bool HasSSE1 = Subtarget->hasSSE1();
+  bool HasSSE2 = Subtarget->hasSSE2();
   switch (VT.getSimpleVT().SimpleTy) {
   default: return 0;
@@ -1372,15 +1359,15 @@ static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
   case MVT::i32: return X86::CMP32rr;
   case MVT::i64: return X86::CMP64rr;
   case MVT::f32:
-    return X86ScalarSSEf32
-               ? (HasAVX512 ? X86::VUCOMISSZrr
-                            : HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr)
-               : 0;
+    return HasAVX512 ? X86::VUCOMISSZrr
+           : HasAVX  ? X86::VUCOMISSrr
+           : HasSSE1 ? X86::UCOMISSrr
+                     : 0;
   case MVT::f64:
-    return X86ScalarSSEf64
-               ? (HasAVX512 ? X86::VUCOMISDZrr
-                            : HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr)
-               : 0;
+    return HasAVX512 ? X86::VUCOMISDZrr
+           : HasAVX  ? X86::VUCOMISDrr
+           : HasSSE2 ? X86::UCOMISDrr
+                     : 0;
   }
 }
@@ -2495,7 +2482,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
 }
 
 bool X86FastISel::X86SelectFPExt(const Instruction *I) {
-  if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
+  if (Subtarget->hasSSE2() && I->getType()->isDoubleTy() &&
       I->getOperand(0)->getType()->isFloatTy()) {
     bool HasAVX512 = Subtarget->hasAVX512();
     // fpext from float to double.
@@ -2509,7 +2496,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
 }
 
 bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
-  if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
+  if (Subtarget->hasSSE2() && I->getType()->isFloatTy() &&
       I->getOperand(0)->getType()->isDoubleTy()) {
     bool HasAVX512 = Subtarget->hasAVX512();
     // fptrunc from double to float.
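Both fast-path conversions above now key on Subtarget->hasSSE2(), the same check the refactored isScalarFPTypeInSSEReg helper performs per type once the cached flags are gone. A rough standalone restatement of that predicate, with a simplified type tag and feature struct in place of the real EVT/MVT and X86Subtarget:

#include <cassert>

// Simplified stand-ins for the scalar FP value types and feature queries.
enum class FPType { f16, f32, f64, f80 };
struct Features {
  bool SSE1 = true, SSE2 = true, FP16 = false; // FP16 ~ AVX512-FP16
};

// f64 needs SSE2, f32 needs SSE1, f16 needs the FP16 feature; f80 always
// stays on the x87 stack -- the same checks the helper now makes directly
// against the subtarget.
static bool isScalarFPTypeInSSEReg(FPType VT, const Features &F) {
  return (VT == FPType::f64 && F.SSE2) ||
         (VT == FPType::f32 && F.SSE1) ||
         (VT == FPType::f16 && F.FP16);
}

int main() {
  Features F;
  assert(isScalarFPTypeInSSEReg(FPType::f32, F));
  assert(isScalarFPTypeInSSEReg(FPType::f64, F));
  assert(!isScalarFPTypeInSSEReg(FPType::f16, F)); // needs FP16
  assert(!isScalarFPTypeInSSEReg(FPType::f80, F)); // x87 only
  return 0;
}
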
@@ -3733,25 +3720,23 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
 
   // Get opcode and regclass of the output for the given load instruction.
   unsigned Opc = 0;
+  bool HasSSE1 = Subtarget->hasSSE1();
+  bool HasSSE2 = Subtarget->hasSSE2();
   bool HasAVX = Subtarget->hasAVX();
   bool HasAVX512 = Subtarget->hasAVX512();
   switch (VT.SimpleTy) {
   default: return 0;
   case MVT::f32:
-    if (X86ScalarSSEf32)
-      Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
-            HasAVX    ? X86::VMOVSSrm_alt :
-                        X86::MOVSSrm_alt;
-    else
-      Opc = X86::LD_Fp32m;
+    Opc = HasAVX512 ? X86::VMOVSSZrm_alt
+          : HasAVX  ? X86::VMOVSSrm_alt
+          : HasSSE1 ? X86::MOVSSrm_alt
+                    : X86::LD_Fp32m;
     break;
   case MVT::f64:
-    if (X86ScalarSSEf64)
-      Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
-            HasAVX    ? X86::VMOVSDrm_alt :
-                        X86::MOVSDrm_alt;
-    else
-      Opc = X86::LD_Fp64m;
+    Opc = HasAVX512 ? X86::VMOVSDZrm_alt
+          : HasAVX  ? X86::VMOVSDrm_alt
+          : HasSSE2 ? X86::MOVSDrm_alt
+                    : X86::LD_Fp64m;
     break;
   case MVT::f80:
     // No f80 support yet.
@@ -3852,11 +3837,11 @@ unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
   default: break;
   case MVT::f32:
-    if (!X86ScalarSSEf32)
+    if (!Subtarget->hasSSE1())
       Opc = X86::LD_Fp032;
     break;
   case MVT::f64:
-    if (!X86ScalarSSEf64)
+    if (!Subtarget->hasSSE2())
       Opc = X86::LD_Fp064;
     break;
   case MVT::f80:
@@ -3907,21 +3892,21 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
     return 0;
 
   // Get opcode and regclass for the given zero.
+  bool HasSSE1 = Subtarget->hasSSE1();
+  bool HasSSE2 = Subtarget->hasSSE2();
   bool HasAVX512 = Subtarget->hasAVX512();
   unsigned Opc = 0;
   switch (VT.SimpleTy) {
   default: return 0;
   case MVT::f32:
-    if (X86ScalarSSEf32)
-      Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS;
-    else
-      Opc = X86::LD_Fp032;
+    Opc = HasAVX512 ? X86::AVX512_FsFLD0SS
+          : HasSSE1 ? X86::FsFLD0SS
+                    : X86::LD_Fp032;
     break;
   case MVT::f64:
-    if (X86ScalarSSEf64)
-      Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD;
-    else
-      Opc = X86::LD_Fp064;
+    Opc = HasAVX512 ? X86::AVX512_FsFLD0SD
+          : HasSSE2 ? X86::FsFLD0SD
+                    : X86::LD_Fp064;
     break;
   case MVT::f80:
     // No f80 support yet.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e232bd3..be8ec70 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -108,9 +108,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                      const X86Subtarget &STI)
     : TargetLowering(TM), Subtarget(STI) {
   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
-  X86ScalarSSEf64 = Subtarget.hasSSE2();
-  X86ScalarSSEf32 = Subtarget.hasSSE1();
-  X86ScalarSSEf16 = Subtarget.hasFP16();
   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
 
   // Set up the TargetLowering object.
@@ -314,7 +311,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
 
   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
-  if (!X86ScalarSSEf64) {
+  if (!Subtarget.hasSSE2()) {
     setOperationAction(ISD::BITCAST      , MVT::f32  , Expand);
     setOperationAction(ISD::BITCAST      , MVT::i32  , Expand);
     if (Subtarget.is64Bit()) {
@@ -555,7 +552,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
 
-  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
+  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
     // f32 and f64 use SSE.
     // Set up the FP register classes.
     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
@@ -593,7 +590,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
-  } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
+  } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
              (UseX87 || Is64Bit)) {
     // Use SSE for f32, x87 for f64.
     // Set up the FP register classes.
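In the constructor hunks above, the scalar FP register-class choice now reads the subtarget directly. A compressed sketch of that decision tree follows; the struct, helper, and the final fallback cases are illustrative stand-ins summarizing the surrounding code, not the actual LLVM implementation:

#include <cstdio>

// Illustrative snapshot of the queries the constructor now makes directly;
// the real code reads them from the X86Subtarget reference.
struct SubtargetView {
  bool SoftFloat = false, SSE1 = true, SSE2 = true, X87 = true, Is64Bit = true;
};

// Mirrors the branch structure visible in the hunks: SSE2 puts both f32 and
// f64 in SSE registers, SSE1 alone keeps f64 on the x87 stack, and otherwise
// everything falls back to x87 (or soft float).
static const char *scalarFPConfig(const SubtargetView &ST) {
  bool UseX87 = !ST.SoftFloat && ST.X87;
  if (!ST.SoftFloat && ST.SSE2)
    return "f32 and f64 use SSE";
  if (!ST.SoftFloat && ST.SSE1 && (UseX87 || ST.Is64Bit))
    return "SSE for f32, x87 for f64";
  return UseX87 ? "x87 for f32 and f64" : "soft float";
}

int main() { std::printf("%s\n", scalarFPConfig(SubtargetView{})); }
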
@@ -2572,9 +2569,9 @@ EVT X86TargetLowering::getOptimalMemOpType(
 
 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   if (VT == MVT::f32)
-    return X86ScalarSSEf32;
+    return Subtarget.hasSSE1();
   if (VT == MVT::f64)
-    return X86ScalarSSEf64;
+    return Subtarget.hasSSE2();
   return true;
 }
@@ -5669,6 +5666,24 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
   return Subtarget.hasLZCNT();
 }
 
+bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
+  return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
+         (VT == MVT::f16 && Subtarget.hasFP16());
+}
+
+bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
+  // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+  // expensive than a straight movsd. On the other hand, it's important to
+  // shrink long double fp constant since fldt is very slow.
+  return !Subtarget.hasSSE2() || VT == MVT::f80;
+}
+
+bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
+  return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
+         (VT == MVT::f32 && Subtarget.hasSSE1()) ||
+         (VT == MVT::f16 && Subtarget.hasFP16());
+}
+
 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
                                                 const SelectionDAG &DAG,
                                                 const MachineMemOperand &MMO) const {
@@ -21196,9 +21211,10 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
 
   // The transform for i64->f64 isn't correct for 0 when rounding to negative
   // infinity. It produces -0.0, so disable under strictfp.
-  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
+  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
+      !IsStrict)
     return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
-  if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
+  if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80)
     return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
   if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
       (DstVT == MVT::f32 || DstVT == MVT::f64))
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 80ffbf9..6e54806 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1039,10 +1039,7 @@ namespace llvm {
 
     bool isCtlzFast() const override;
 
-    bool hasBitPreservingFPLogic(EVT VT) const override {
-      return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
-             (VT == MVT::f16 && X86ScalarSSEf16);
-    }
+    bool hasBitPreservingFPLogic(EVT VT) const override;
 
     bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
       // If the pair to store is a mixture of float and int values, we will
@@ -1322,12 +1319,7 @@ namespace llvm {
     /// If true, then instruction selection should
     /// seek to shrink the FP constant of the specified type to a smaller type
     /// in order to save space and / or reduce runtime.
-    bool ShouldShrinkFPConstant(EVT VT) const override {
-      // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
-      // expensive than a straight movsd. On the other hand, it's important to
-      // shrink long double fp constant since fldt is very slow.
-      return !X86ScalarSSEf64 || VT == MVT::f80;
-    }
+    bool ShouldShrinkFPConstant(EVT VT) const override;
 
     /// Return true if we believe it is correct and profitable to reduce the
     /// load node to a smaller type.
@@ -1336,11 +1328,7 @@ namespace llvm {
 
     /// Return true if the specified scalar FP type is computed in an SSE
     /// register, not on the X87 floating point stack.
-    bool isScalarFPTypeInSSEReg(EVT VT) const {
-      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
-             (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
-             (VT == MVT::f16 && X86ScalarSSEf16);   // f16 is when AVX512FP16
-    }
+    bool isScalarFPTypeInSSEReg(EVT VT) const;
 
     /// Returns true if it is beneficial to convert a load of a constant
     /// to just the constant itself.
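ShouldShrinkFPConstant and the two hooks beside it move out of line so their bodies can query the subtarget; the policy itself is unchanged. A small self-contained restatement of that policy, with a placeholder enum standing in for EVT:

#include <cassert>

// Placeholder value-type tags; the real hook receives an EVT.
enum class VT { f32, f64, f80 };

// Keep f64 constants whole when SSE2 is available (a cvtss2sd after a
// shrunken f32 load costs more than a plain movsd load), but always shrink
// f80 constants because fldt is very slow.
static bool shouldShrinkFPConstant(VT T, bool HasSSE2) {
  return !HasSSE2 || T == VT::f80;
}

int main() {
  assert(!shouldShrinkFPConstant(VT::f64, /*HasSSE2=*/true));
  assert(shouldShrinkFPConstant(VT::f80, /*HasSSE2=*/true));
  assert(shouldShrinkFPConstant(VT::f64, /*HasSSE2=*/false));
  return 0;
}
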
@@ -1494,13 +1482,6 @@ namespace llvm {
     /// make the right decision when generating code for different targets.
     const X86Subtarget &Subtarget;
 
-    /// Select between SSE or x87 floating point ops.
-    /// When SSE is available, use it for f32 operations.
-    /// When SSE2 is available, use it for f64 operations.
-    bool X86ScalarSSEf32;
-    bool X86ScalarSSEf64;
-    bool X86ScalarSSEf16;
-
     /// A list of legal FP immediates.
     std::vector<APFloat> LegalFPImmediates;
-- 
2.7.4