From cf9daa33a7870c235e0edc176dd40579f376cafc Mon Sep 17 00:00:00 2001
From: Amara Emerson
Date: Tue, 9 May 2017 10:43:25 +0000
Subject: [PATCH] Introduce experimental generic intrinsics for horizontal
 vector reductions.

- This change allows targets to opt in to using them instead of the log2
  shufflevector algorithm.
- The common code the SLP and Loop vectorizers use to emit shuffle reductions
  has been factored out into LoopUtils, and both now share a unified interface
  for generating reductions, regardless of the target's preference. LoopUtils
  now uses TTI to determine what kind of reductions the target wants to
  handle.
- For CodeGen, basic legalization support is added.

Differential Revision: https://reviews.llvm.org/D30086

llvm-svn: 302514
---
 llvm/docs/LangRef.rst                              | 332 +++++++++++++++++++++
 llvm/include/llvm/Analysis/TargetTransformInfo.h   |  19 ++
 .../llvm/Analysis/TargetTransformInfoImpl.h        |   6 +
 llvm/include/llvm/CodeGen/ISDOpcodes.h             |  14 +
 llvm/include/llvm/IR/IRBuilder.h                   |  39 +++
 llvm/include/llvm/IR/Intrinsics.td                 |  44 +++
 llvm/include/llvm/Transforms/Utils/LoopUtils.h     |  26 ++
 llvm/lib/Analysis/TargetTransformInfo.cpp          |   6 +
 llvm/lib/Analysis/VectorUtils.cpp                  |   1 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h      |   1 +
 .../CodeGen/SelectionDAG/LegalizeVectorTypes.cpp   |  58 ++++
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |   2 +-
 .../CodeGen/SelectionDAG/SelectionDAGBuilder.cpp   |  88 ++++++
 .../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h |   2 +
 .../CodeGen/SelectionDAG/SelectionDAGDumper.cpp    |  13 +
 llvm/lib/IR/IRBuilder.cpp                          |  88 ++++++
 llvm/lib/Transforms/Utils/LoopUtils.cpp            | 202 +++++++++++++
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    |  37 +--
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp    |  48 +--
 19 files changed, 960 insertions(+), 66 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index dad99e33..dc6350c 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -11687,6 +11687,338 @@ Examples:
       %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c
 
+
+Experimental Vector Reduction Intrinsics
+----------------------------------------
+
+Horizontal reductions of vectors can be expressed using the following
+intrinsics. Each one takes a vector operand as an input and applies its
+respective operation across all elements of the vector, returning a single
+scalar result of the same element type. The ``fadd`` and ``fmul`` variants
+additionally take a scalar accumulator operand, which is used only for the
+ordered (non-fast-math) form of the reduction.
+
+
+'``llvm.experimental.vector.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a)
+      declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.add.*``' intrinsics do an integer ``ADD``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating point
+``ADD`` reduction of a vector, returning the result as a scalar.
The return type +matches the element-type of the vector input. + +If the intrinsic call has fast-math flags, then the reduction will not preserve +the associativity of an equivalent scalarized counterpart. If it does not have +fast-math flags, then the reduction will be *ordered*, implying that the +operation respects the associativity of a scalarized reduction. + + +Arguments: +"""""""""" +The first argument to this intrinsic is a scalar accumulator value, which is +only used when there are no fast-math flags attached. This argument may be undef +when fast-math flags are used. + +The second argument must be a vector of floating point values. + +Examples: +""""""""" + +.. code-block:: llvm + + %fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction + %ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction + + +'``llvm.experimental.vector.reduce.mul.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a) + declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a) + +Overview: +""""""""" + +The '``llvm.experimental.vector.reduce.mul.*``' intrinsics do an integer ``MUL`` +reduction of a vector, returning the result as a scalar. The return type matches +the element-type of the vector input. + +Arguments: +"""""""""" +The argument to this intrinsic must be a vector of integer values. + +'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a) + declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a) + +Overview: +""""""""" + +The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating point +``MUL`` reduction of a vector, returning the result as a scalar. The return type +matches the element-type of the vector input. + +If the intrinsic call has fast-math flags, then the reduction will not preserve +the associativity of an equivalent scalarized counterpart. If it does not have +fast-math flags, then the reduction will be *ordered*, implying that the +operation respects the associativity of a scalarized reduction. + + +Arguments: +"""""""""" +The first argument to this intrinsic is a scalar accumulator value, which is +only used when there are no fast-math flags attached. This argument may be undef +when fast-math flags are used. + +The second argument must be a vector of floating point values. + +Examples: +""""""""" + +.. code-block:: llvm + + %fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction + %ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction + +'``llvm.experimental.vector.reduce.and.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %a) + +Overview: +""""""""" + +The '``llvm.experimental.vector.reduce.and.*``' intrinsics do a bitwise ``AND`` +reduction of a vector, returning the result as a scalar. The return type matches +the element-type of the vector input. + +Arguments: +"""""""""" +The argument to this intrinsic must be a vector of integer values. 
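+
+Examples:
+"""""""""
+
+A minimal illustrative call, assuming ``%input`` holds the vector being
+reduced:
+
+.. code-block:: llvm
+
+      %r = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %input) ; yields the bitwise AND of all four elements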
+ +'``llvm.experimental.vector.reduce.or.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> %a) + +Overview: +""""""""" + +The '``llvm.experimental.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction +of a vector, returning the result as a scalar. The return type matches the +element-type of the vector input. + +Arguments: +"""""""""" +The argument to this intrinsic must be a vector of integer values. + +'``llvm.experimental.vector.reduce.xor.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> %a) + +Overview: +""""""""" + +The '``llvm.experimental.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR`` +reduction of a vector, returning the result as a scalar. The return type matches +the element-type of the vector input. + +Arguments: +"""""""""" +The argument to this intrinsic must be a vector of integer values. + +'``llvm.experimental.vector.reduce.smax.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %a) + +Overview: +""""""""" + +The '``llvm.experimental.vector.reduce.smax.*``' intrinsics do a signed integer +``MAX`` reduction of a vector, returning the result as a scalar. The return type +matches the element-type of the vector input. + +Arguments: +"""""""""" +The argument to this intrinsic must be a vector of integer values. + +'``llvm.experimental.vector.reduce.smin.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %a) + +Overview: +""""""""" + +The '``llvm.experimental.vector.reduce.smin.*``' intrinsics do a signed integer +``MIN`` reduction of a vector, returning the result as a scalar. The return type +matches the element-type of the vector input. + +Arguments: +"""""""""" +The argument to this intrinsic must be a vector of integer values. + +'``llvm.experimental.vector.reduce.umax.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %a) + +Overview: +""""""""" + +The '``llvm.experimental.vector.reduce.umax.*``' intrinsics do an unsigned +integer ``MAX`` reduction of a vector, returning the result as a scalar. The +return type matches the element-type of the vector input. + +Arguments: +"""""""""" +The argument to this intrinsic must be a vector of integer values. + +'``llvm.experimental.vector.reduce.umin.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %a) + +Overview: +""""""""" + +The '``llvm.experimental.vector.reduce.umin.*``' intrinsics do an unsigned +integer ``MIN`` reduction of a vector, returning the result as a scalar. The +return type matches the element-type of the vector input. + +Arguments: +"""""""""" +The argument to this intrinsic must be a vector of integer values. 
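+
+Examples:
+"""""""""
+
+A minimal illustrative call, assuming ``%input`` holds the vector being
+reduced; the other integer min/max reductions follow the same pattern:
+
+.. code-block:: llvm
+
+      %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %input) ; yields the minimum element, compared as unsigned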
+
+'``llvm.experimental.vector.reduce.fmax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics do a floating point
+``MAX`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has the ``nnan`` fast-math flag, then the operation can
+assume that NaNs are not present in the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating point values.
+
+'``llvm.experimental.vector.reduce.fmin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics do a floating point
+``MIN`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has the ``nnan`` fast-math flag, then the operation can
+assume that NaNs are not present in the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating point values.
+
 Half Precision Floating Point Intrinsics
 ----------------------------------------
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index b9639db..a769d5f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -740,6 +740,19 @@ public:
                                 unsigned ChainSizeInBytes,
                                 VectorType *VecTy) const;
 
+  /// Flags describing the kind of vector reduction.
+  struct ReductionFlags {
+    ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {}
+    bool IsMaxOp;  ///< If the op is a min/max kind, true if it's a max operation.
+    bool IsSigned; ///< Whether the operation is a signed int reduction.
+    bool NoNaN;    ///< If op is an fp min/max, whether NaNs may be present.
+  };
+
+  /// \returns True if the target wants to handle the given reduction idiom in
+  /// the intrinsics form instead of the shuffle form.
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const; + /// @} private: @@ -895,6 +908,8 @@ public: virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; + virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + ReductionFlags) const = 0; }; template @@ -1200,6 +1215,10 @@ public: VectorType *VecTy) const override { return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const override { + return Impl.useReductionIntrinsic(Opcode, Ty, Flags); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index d7fda9e..83b975e 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -456,6 +456,12 @@ public: VectorType *VecTy) const { return VF; } + + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + return false; + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index ca0f3fb..2bc218f 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -783,6 +783,20 @@ namespace ISD { /// known nonzero constant. The only operand here is the chain. GET_DYNAMIC_AREA_OFFSET, + /// Generic reduction nodes. These nodes represent horizontal vector + /// reduction operations, producing a scalar result. + /// The STRICT variants perform reductions in sequential order. The first + /// operand is an initial scalar accumulator value, and the second operand + /// is the vector to reduce. + VECREDUCE_STRICT_FADD, VECREDUCE_STRICT_FMUL, + /// These reductions are non-strict, and have a single vector operand. + VECREDUCE_FADD, VECREDUCE_FMUL, + VECREDUCE_ADD, VECREDUCE_MUL, + VECREDUCE_AND, VECREDUCE_OR, VECREDUCE_XOR, + VECREDUCE_SMAX, VECREDUCE_SMIN, VECREDUCE_UMAX, VECREDUCE_UMIN, + /// FMIN/FMAX nodes can have flags, for NaN/NoNaN variants. + VECREDUCE_FMAX, VECREDUCE_FMIN, + /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. BUILTIN_OP_END diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index bc689f3..9d4c13c 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -454,6 +454,45 @@ public: MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); + /// \brief Create a vector fadd reduction intrinsic of the source vector. + /// The first parameter is a scalar accumulator value for ordered reductions. + CallInst *CreateFAddReduce(Value *Acc, Value *Src); + + /// \brief Create a vector fmul reduction intrinsic of the source vector. + /// The first parameter is a scalar accumulator value for ordered reductions. + CallInst *CreateFMulReduce(Value *Acc, Value *Src); + + /// \brief Create a vector int add reduction intrinsic of the source vector. + CallInst *CreateAddReduce(Value *Src); + + /// \brief Create a vector int mul reduction intrinsic of the source vector. + CallInst *CreateMulReduce(Value *Src); + + /// \brief Create a vector int AND reduction intrinsic of the source vector. 
+ CallInst *CreateAndReduce(Value *Src); + + /// \brief Create a vector int OR reduction intrinsic of the source vector. + CallInst *CreateOrReduce(Value *Src); + + /// \brief Create a vector int XOR reduction intrinsic of the source vector. + CallInst *CreateXorReduce(Value *Src); + + /// \brief Create a vector integer max reduction intrinsic of the source + /// vector. + CallInst *CreateIntMaxReduce(Value *Src, bool IsSigned = false); + + /// \brief Create a vector integer min reduction intrinsic of the source + /// vector. + CallInst *CreateIntMinReduce(Value *Src, bool IsSigned = false); + + /// \brief Create a vector float max reduction intrinsic of the source + /// vector. + CallInst *CreateFPMaxReduce(Value *Src, bool NoNaN = false); + + /// \brief Create a vector float min reduction intrinsic of the source + /// vector. + CallInst *CreateFPMinReduce(Value *Src, bool NoNaN = false); + /// \brief Create a lifetime.start intrinsic. /// /// If the pointer isn't i8* it will be converted. diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 7b78d4d..19f6045 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -812,6 +812,50 @@ def int_memcpy_element_atomic : Intrinsic<[], [IntrArgMemOnly, NoCapture<0>, NoCapture<1>, WriteOnly<0>, ReadOnly<1>]>; +//===------------------------ Reduction Intrinsics ------------------------===// +// +def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyfloat_ty, + llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyfloat_ty, + llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_mul : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_and : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_or : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_xor : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_smax : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_smin : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_umax : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_umin : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_fmax : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_fmin : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; + //===----- Intrinsics that are used to provide predicate information -----===// def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index a1cf41d..94d10c9 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -21,6 +21,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include 
"llvm/IR/InstrTypes.h" @@ -42,6 +43,7 @@ class PredIteratorCache; class ScalarEvolution; class SCEV; class TargetLibraryInfo; +class TargetTransformInfo; /// \brief Captures loop safety information. /// It keep information for loop & its header may throw exception. @@ -489,6 +491,30 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE = nullptr); +/// Create a target reduction of the given vector. The reduction operation +/// is described by the \p Opcode parameter. min/max reductions require +/// additional information supplied in \p Flags. +/// The target is queried to determine if intrinsics or shuffle sequences are +/// required to implement the reduction. +Value * +createSimpleTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI, + unsigned Opcode, Value *Src, + TargetTransformInfo::ReductionFlags Flags = + TargetTransformInfo::ReductionFlags(), + ArrayRef RedOps = ArrayRef()); + +/// Create a generic target reduction using a recurrence descriptor \p Desc +/// The target is queried to determine if intrinsics or shuffle sequences are +/// required to implement the reduction. +Value *createTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI, + RecurrenceDescriptor &Desc, Value *Src, + bool NoNaN = false); + +/// Get the intersection (logical and) of all of the potential IR flags +/// of each scalar operation (VL) that will be converted into a vector (I). +/// Flag set: NSW, NUW, exact, and all of fast-math. +void propagateIRFlags(Value *I, ArrayRef VL); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 26d606c..a73fe65 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -500,6 +500,12 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF, return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } +bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, + Type *Ty, ReductionFlags Flags) const { + return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags); +} + + TargetTransformInfo::Concept::~Concept() {} TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 722f17a..2d2249d 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Value.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" using namespace llvm; using namespace llvm::PatternMatch; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index cde4331..4c3b5148 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -675,6 +675,7 @@ private: // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>. 
bool SplitVectorOperand(SDNode *N, unsigned OpNo); SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo); + SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo); SDValue SplitVecOp_UnaryOp(SDNode *N); SDValue SplitVecOp_TruncateHelper(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 97a7fab..ff0e609 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1513,6 +1513,22 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::ZERO_EXTEND_VECTOR_INREG: Res = SplitVecOp_ExtVecInRegOp(N); break; + + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + Res = SplitVecOp_VECREDUCE(N, OpNo); + break; } } @@ -1565,6 +1581,48 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) { return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect); } +SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) { + EVT ResVT = N->getValueType(0); + SDValue Lo, Hi; + SDLoc dl(N); + + SDValue VecOp = N->getOperand(OpNo); + EVT VecVT = VecOp.getValueType(); + assert(VecVT.isVector() && "Can only split reduce vector operand"); + GetSplitVector(VecOp, Lo, Hi); + EVT LoOpVT, HiOpVT; + std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT); + + bool NoNaN = N->getFlags().hasNoNaNs(); + unsigned CombineOpc = 0; + switch (N->getOpcode()) { + case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break; + case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break; + case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break; + case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break; + case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break; + case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break; + case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break; + case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break; + case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break; + case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break; + case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break; + case ISD::VECREDUCE_FMAX: + CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN; + break; + case ISD::VECREDUCE_FMIN: + CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN; + break; + default: + llvm_unreachable("Unexpected reduce ISD node"); + } + + // Use the appropriate scalar instruction on the split subvectors before + // reducing the now partially reduced smaller vector. + SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi); + return DAG.getNode(N->getOpcode(), dl, ResVT, Partial); +} + SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { // The result has a legal vector type, but the input needs splitting. 
EVT ResVT = N->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d605a1d..ab1eaae 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5970,7 +5970,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, unsigned NumOps = Ops.size(); switch (NumOps) { case 0: return getNode(Opcode, DL, VT); - case 1: return getNode(Opcode, DL, VT, Ops[0]); + case 1: return getNode(Opcode, DL, VT, Ops[0], Flags); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 50313e2..4ccf8c9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5737,6 +5737,24 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::experimental_deoptimize: LowerDeoptimizeCall(&I); return nullptr; + + case Intrinsic::experimental_vector_reduce_fadd: + case Intrinsic::experimental_vector_reduce_fmul: + case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: { + visitVectorReduce(I, Intrinsic); + return nullptr; + } + } } @@ -7616,6 +7634,76 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, FuncInfo.MF->getFrameInfo().setHasPatchPoint(); } +void SelectionDAGBuilder::visitVectorReduce(const CallInst &I, + unsigned Intrinsic) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2; + if (I.getNumArgOperands() > 1) + Op2 = getValue(I.getArgOperand(1)); + SDLoc dl = getCurSDLoc(); + EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + SDValue Res; + FastMathFlags FMF; + if (isa(I)) + FMF = I.getFastMathFlags(); + SDNodeFlags SDFlags; + SDFlags.setNoNaNs(FMF.noNaNs()); + + switch (Intrinsic) { + case Intrinsic::experimental_vector_reduce_fadd: + if (FMF.unsafeAlgebra()) + Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2); + else + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2); + break; + case Intrinsic::experimental_vector_reduce_fmul: + if (FMF.unsafeAlgebra()) + Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2); + else + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2); + break; + case Intrinsic::experimental_vector_reduce_add: + Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_mul: + Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_and: + Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_or: + Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_xor: + Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1); + 
break; + case Intrinsic::experimental_vector_reduce_smax: + Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_smin: + Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_umax: + Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_umin: + Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_fmax: { + Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags); + break; + } + case Intrinsic::experimental_vector_reduce_fmin: { + Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags); + break; + } + default: + llvm_unreachable("Unhandled vector reduce intrinsic"); + } + setValue(&I, Res); +} + /// Returns an AttributeList representing the attributes applied to the return /// value of the given call. static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 9e99890..010104b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -909,6 +909,8 @@ private: void visitGCRelocate(const GCRelocateInst &I); void visitGCResult(const GCResultInst &I); + void visitVectorReduce(const CallInst &I, unsigned Intrinsic); + void visitUserOp1(const Instruction &I) { llvm_unreachable("UserOp1 should not exist at instruction selection time!"); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 26dd45e..c37d708 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -346,6 +346,19 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SETFALSE: return "setfalse"; case ISD::SETFALSE2: return "setfalse2"; } + case ISD::VECREDUCE_FADD: return "vecreduce_fadd"; + case ISD::VECREDUCE_FMUL: return "vecreduce_fmul"; + case ISD::VECREDUCE_ADD: return "vecreduce_add"; + case ISD::VECREDUCE_MUL: return "vecreduce_mul"; + case ISD::VECREDUCE_AND: return "vecreduce_and"; + case ISD::VECREDUCE_OR: return "vecreduce_or"; + case ISD::VECREDUCE_XOR: return "vecreduce_xor"; + case ISD::VECREDUCE_SMAX: return "vecreduce_smax"; + case ISD::VECREDUCE_SMIN: return "vecreduce_smin"; + case ISD::VECREDUCE_UMAX: return "vecreduce_umax"; + case ISD::VECREDUCE_UMIN: return "vecreduce_umin"; + case ISD::VECREDUCE_FMAX: return "vecreduce_fmax"; + case ISD::VECREDUCE_FMIN: return "vecreduce_fmin"; } } diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index e265a82..3477c08 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -161,6 +161,94 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align, return CI; } +static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID, + Value *Src) { + Module *M = Builder->GetInsertBlock()->getParent()->getParent(); + Value *Ops[] = {Src}; + Type *Tys[] = { Src->getType()->getVectorElementType(), Src->getType() }; + auto Decl = Intrinsic::getDeclaration(M, ID, Tys); + return createCallHelper(Decl, Ops, Builder); +} + +CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) { + Module *M = GetInsertBlock()->getParent()->getParent(); + Value *Ops[] = {Acc, Src}; + Type *Tys[] = {Src->getType()->getVectorElementType(), 
Acc->getType(), + Src->getType()}; + auto Decl = Intrinsic::getDeclaration( + M, Intrinsic::experimental_vector_reduce_fadd, Tys); + return createCallHelper(Decl, Ops, this); +} + +CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) { + Module *M = GetInsertBlock()->getParent()->getParent(); + Value *Ops[] = {Acc, Src}; + Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(), + Src->getType()}; + auto Decl = Intrinsic::getDeclaration( + M, Intrinsic::experimental_vector_reduce_fmul, Tys); + return createCallHelper(Decl, Ops, this); +} + +CallInst *IRBuilderBase::CreateAddReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add, + Src); +} + +CallInst *IRBuilderBase::CreateMulReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul, + Src); +} + +CallInst *IRBuilderBase::CreateAndReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and, + Src); +} + +CallInst *IRBuilderBase::CreateOrReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or, + Src); +} + +CallInst *IRBuilderBase::CreateXorReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor, + Src); +} + +CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) { + auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax + : Intrinsic::experimental_vector_reduce_umax; + return getReductionIntrinsic(this, ID, Src); +} + +CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) { + auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smin + : Intrinsic::experimental_vector_reduce_umin; + return getReductionIntrinsic(this, ID, Src); +} + +CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) { + auto Rdx = getReductionIntrinsic( + this, Intrinsic::experimental_vector_reduce_fmax, Src); + if (NoNaN) { + FastMathFlags FMF; + FMF.setNoNaNs(); + Rdx->setFastMathFlags(FMF); + } + return Rdx; +} + +CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) { + auto Rdx = getReductionIntrinsic( + this, Intrinsic::experimental_vector_reduce_fmin, Src); + if (NoNaN) { + FastMathFlags FMF; + FMF.setNoNaNs(); + Rdx->setFastMathFlags(FMF); + } + return Rdx; +} + CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { assert(isa(Ptr->getType()) && "lifetime.start only applies to pointers."); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 175d013..d41fe62 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" @@ -1112,3 +1113,204 @@ Optional llvm::getLoopEstimatedTripCount(Loop *L) { else return (FalseVal + (TrueVal / 2)) / TrueVal; } + +/// \brief Adds a 'fast' flag to floating point operations. +static Value *addFastMathFlag(Value *V) { + if (isa(V)) { + FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + cast(V)->setFastMathFlags(Flags); + } + return V; +} + +// Helper to generate a log2 shuffle reduction. 
+static Value * +getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, + RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = + RecurrenceDescriptor::MRK_Invalid, + ArrayRef RedOps = ArrayRef()) { + unsigned VF = Src->getType()->getVectorNumElements(); + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = Src; + SmallVector ShuffleMask(VF, nullptr); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i / 2; ++j) + ShuffleMask[j] = Builder.getInt32(i / 2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = Builder.CreateShuffleVector( + TmpVec, UndefValue::get(TmpVec->getType()), + ConstantVector::get(ShuffleMask), "rdx.shuf"); + + if (Op != Instruction::ICmp && Op != Instruction::FCmp) { + // Floating point operations had to be 'fast' to enable the reduction. + TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op, + TmpVec, Shuf, "bin.rdx")); + } else { + assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid && + "Invalid min/max"); + TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec, + Shuf); + } + if (!RedOps.empty()) + propagateIRFlags(TmpVec, RedOps); + } + // The result is in the first element of the vector. + return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); +} + +/// Create a simple vector reduction specified by an opcode and some +/// flags (if generating min/max reductions). +Value *llvm::createSimpleTargetReduction( + IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode, + Value *Src, TargetTransformInfo::ReductionFlags Flags, + ArrayRef RedOps) { + assert(isa(Src->getType()) && "Type must be a vector"); + + Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType()); + std::function BuildFunc; + using RD = RecurrenceDescriptor; + RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid; + // TODO: Support creating ordered reductions. + FastMathFlags FMFUnsafe; + FMFUnsafe.setUnsafeAlgebra(); + + switch (Opcode) { + case Instruction::Add: + BuildFunc = [&]() { return Builder.CreateAddReduce(Src); }; + break; + case Instruction::Mul: + BuildFunc = [&]() { return Builder.CreateMulReduce(Src); }; + break; + case Instruction::And: + BuildFunc = [&]() { return Builder.CreateAndReduce(Src); }; + break; + case Instruction::Or: + BuildFunc = [&]() { return Builder.CreateOrReduce(Src); }; + break; + case Instruction::Xor: + BuildFunc = [&]() { return Builder.CreateXorReduce(Src); }; + break; + case Instruction::FAdd: + BuildFunc = [&]() { + auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src); + cast(Rdx)->setFastMathFlags(FMFUnsafe); + return Rdx; + }; + break; + case Instruction::FMul: + BuildFunc = [&]() { + auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src); + cast(Rdx)->setFastMathFlags(FMFUnsafe); + return Rdx; + }; + break; + case Instruction::ICmp: + if (Flags.IsMaxOp) { + MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax; + BuildFunc = [&]() { + return Builder.CreateIntMaxReduce(Src, Flags.IsSigned); + }; + } else { + MinMaxKind = Flags.IsSigned ? 
RD::MRK_SIntMin : RD::MRK_UIntMin; + BuildFunc = [&]() { + return Builder.CreateIntMinReduce(Src, Flags.IsSigned); + }; + } + break; + case Instruction::FCmp: + if (Flags.IsMaxOp) { + MinMaxKind = RD::MRK_FloatMax; + BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); }; + } else { + MinMaxKind = RD::MRK_FloatMin; + BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); }; + } + break; + default: + llvm_unreachable("Unhandled opcode"); + break; + } + if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags)) + return BuildFunc(); + return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps); +} + +/// Create a vector reduction using a given recurrence descriptor. +Value *llvm::createTargetReduction(IRBuilder<> &Builder, + const TargetTransformInfo *TTI, + RecurrenceDescriptor &Desc, Value *Src, + bool NoNaN) { + // TODO: Support in-order reductions based on the recurrence descriptor. + RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind(); + TargetTransformInfo::ReductionFlags Flags; + Flags.NoNaN = NoNaN; + auto getSimpleRdx = [&](unsigned Opc) { + return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags); + }; + switch (RecKind) { + case RecurrenceDescriptor::RK_FloatAdd: + return getSimpleRdx(Instruction::FAdd); + case RecurrenceDescriptor::RK_FloatMult: + return getSimpleRdx(Instruction::FMul); + case RecurrenceDescriptor::RK_IntegerAdd: + return getSimpleRdx(Instruction::Add); + case RecurrenceDescriptor::RK_IntegerMult: + return getSimpleRdx(Instruction::Mul); + case RecurrenceDescriptor::RK_IntegerAnd: + return getSimpleRdx(Instruction::And); + case RecurrenceDescriptor::RK_IntegerOr: + return getSimpleRdx(Instruction::Or); + case RecurrenceDescriptor::RK_IntegerXor: + return getSimpleRdx(Instruction::Xor); + case RecurrenceDescriptor::RK_IntegerMinMax: { + switch (Desc.getMinMaxRecurrenceKind()) { + case RecurrenceDescriptor::MRK_SIntMax: + Flags.IsSigned = true; + Flags.IsMaxOp = true; + break; + case RecurrenceDescriptor::MRK_UIntMax: + Flags.IsMaxOp = true; + break; + case RecurrenceDescriptor::MRK_SIntMin: + Flags.IsSigned = true; + break; + case RecurrenceDescriptor::MRK_UIntMin: + break; + default: + llvm_unreachable("Unhandled MRK"); + } + return getSimpleRdx(Instruction::ICmp); + } + case RecurrenceDescriptor::RK_FloatMinMax: { + Flags.IsMaxOp = + Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax; + return getSimpleRdx(Instruction::FCmp); + } + default: + llvm_unreachable("Unhandled RecKind"); + } +} + +void llvm::propagateIRFlags(Value *I, ArrayRef VL) { + if (auto *VecOp = dyn_cast(I)) { + if (auto *I0 = dyn_cast(VL[0])) { + // VecOVp is initialized to the 0th scalar, so start counting from index + // '1'. + VecOp->copyIRFlags(I0); + for (int i = 1, e = VL.size(); i < e; ++i) { + if (auto *Scalar = dyn_cast(VL[i])) + VecOp->andIRFlags(Scalar); + } + } + } +} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3fde0a4..a9aa483 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1700,6 +1700,9 @@ public: /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + // Returns true if the NoNaN attribute is set on the function. + bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } + private: /// Check if a single basic block loop is vectorizable. 
/// At this point we know that this is a loop with a constant trip count @@ -4258,39 +4261,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } if (VF > 1) { - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles - // and vector ops, reducing the set of values being computed by half each - // round. - assert(isPowerOf2_32(VF) && - "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = ReducedPartRdx; - SmallVector ShuffleMask(VF, nullptr); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i / 2; ++j) - ShuffleMask[j] = Builder.getInt32(i / 2 + j); - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); - - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), - ConstantVector::get(ShuffleMask), "rdx.shuf"); - - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - TmpVec = addFastMathFlag(Builder.CreateBinOp( - (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); - else - TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, - TmpVec, Shuf); - } - - // The result is in the first element of the vector. + bool NoNaN = Legal->hasFunNoNaNAttr(); ReducedPartRdx = - Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); - + createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (Phi->getType() != RdxDesc.getRecurrenceType()) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f112c55..9908444 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -41,6 +41,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Vectorize.h" #include #include @@ -212,23 +213,6 @@ static unsigned getSameOpcode(ArrayRef VL) { return Opcode; } -/// Get the intersection (logical and) of all of the potential IR flags -/// of each scalar operation (VL) that will be converted into a vector (I). -/// Flag set: NSW, NUW, exact, and all of fast-math. -static void propagateIRFlags(Value *I, ArrayRef VL) { - if (auto *VecOp = dyn_cast(I)) { - if (auto *I0 = dyn_cast(VL[0])) { - // VecOVp is initialized to the 0th scalar, so start counting from index - // '1'. - VecOp->copyIRFlags(I0); - for (int i = 1, e = VL.size(); i < e; ++i) { - if (auto *Scalar = dyn_cast(VL[i])) - VecOp->andIRFlags(Scalar); - } - } - } -} - /// \returns true if all of the values in \p VL have the same type or false /// otherwise. static bool allSameType(ArrayRef VL) { @@ -4513,7 +4497,7 @@ public: // Emit a reduction. Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps); + emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI); if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, @@ -4583,33 +4567,31 @@ private: /// \brief Emit a horizontal reduction of the vectorized value. 
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, - unsigned ReduxWidth, ArrayRef RedOps) { + unsigned ReduxWidth, ArrayRef RedOps, + const TargetTransformInfo *TTI) { assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + if (!IsPairwiseReduction) + return createSimpleTargetReduction( + Builder, TTI, ReductionOpcode, VectorizedValue, + TargetTransformInfo::ReductionFlags(), RedOps); + Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { - if (IsPairwiseReduction) { - Value *LeftMask = + Value *LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true, Builder); - Value *RightMask = + Value *RightMask = createRdxShuffleMask(ReduxWidth, i, true, false, Builder); - Value *LeftShuf = Builder.CreateShuffleVector( + Value *LeftShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); - Value *RightShuf = Builder.CreateShuffleVector( + Value *RightShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), "rdx.shuf.r"); - TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, - "bin.rdx"); - } else { - Value *UpperHalf = - createRdxShuffleMask(ReduxWidth, i, false, false, Builder); - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); - TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx"); - } + TmpVec = + Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx"); propagateIRFlags(TmpVec, RedOps); } -- 2.7.4