From da74ed42dadbbf62644fb8fe0ff1e7885c7b2a04 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Wed, 12 Apr 2017 12:41:37 +0000 Subject: [PATCH] [LoopVectorizer, TTI] New method supportsEfficientVectorElementLoadStore() Since SystemZ supports vector element load/store instructions, there is no need for extracts/inserts if a vector load/store gets scalarized. This patch lets Target specify that it supports such instructions by means of a new TTI hook that defaults to false. The use for this is in the LoopVectorizer getScalarizationOverhead() method, which will with this patch produce a smaller sum for a vector load/store on SystemZ. New test: test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll Review: Adam Nemet https://reviews.llvm.org/D30680 llvm-svn: 300056 --- llvm/include/llvm/Analysis/TargetTransformInfo.h | 10 +++++++ .../llvm/Analysis/TargetTransformInfoImpl.h | 2 ++ llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +++ .../Target/SystemZ/SystemZTargetTransformInfo.h | 1 + llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++-- .../SystemZ/load-store-scalarization-cost.ll | 33 ++++++++++++++++++++++ 6 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 3d92208..6719668 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -437,6 +437,11 @@ public: unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, unsigned VF) const; + /// If target has efficient vector element load/store instructions, it can + /// return true here so that insertion/extraction costs are not added to + /// the scalarization cost of a load/store. + bool supportsEfficientVectorElementLoadStore() const; + /// \brief Don't restrict interleaved unrolling to small loops. 
bool enableAggressiveInterleaving(bool LoopHasReductions) const; @@ -790,6 +795,7 @@ public: getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0; virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, unsigned VF) = 0; + virtual bool supportsEfficientVectorElementLoadStore() = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; virtual bool enableInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; @@ -996,6 +1002,10 @@ public: return Impl.getOperandsScalarizationOverhead(Args, VF); } + bool supportsEfficientVectorElementLoadStore() override { + return Impl.supportsEfficientVectorElementLoadStore(); + } + bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 790acbc..9ab6b74 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -262,6 +262,8 @@ public: unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, unsigned VF) { return 0; } + bool supportsEfficientVectorElementLoadStore() { return false; } + bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } bool enableInterleavedAccessVectorization() { return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c8b8740..d73b1a1 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -197,6 +197,10 @@ getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, return TTIImpl->getOperandsScalarizationOverhead(Args, VF); } +bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const { + return TTIImpl->supportsEfficientVectorElementLoadStore(); +} + bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) 
const { return TTIImpl->enableAggressiveInterleaving(LoopHasReductions); } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index d2639cb..3766ed4 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -55,6 +55,7 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); + bool supportsEfficientVectorElementLoadStore() { return true; } bool enableInterleavedAccessVectorization() { return true; } int getArithmeticInstrCost( diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f891cd9..26bcbcb 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3663,13 +3663,17 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, unsigned Cost = 0; Type *RetTy = ToVectorTy(I->getType(), VF); - if (!RetTy->isVoidTy()) + if (!RetTy->isVoidTy() && + (!isa<LoadInst>(I) || + !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead(RetTy, true, false); if (CallInst *CI = dyn_cast<CallInst>(I)) { SmallVector<const Value *, 4> Operands(CI->arg_operands()); Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); - } else { + } + else if (!isa<StoreInst>(I) || + !TTI.supportsEfficientVectorElementLoadStore()) { SmallVector<const Value *, 4> Operands(I->operand_values()); Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); } diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll new file mode 100644 index 0000000..e7096c2 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll @@ -0,0 +1,33 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \ +; RUN: -force-vector-width=4 -debug-only=loop-vectorize \ 
+; RUN: -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \ +; RUN: FileCheck %s +; +; Check that a scalarized load/store does not get a cost for inserts/ +; extracts, since z13 supports element load/store. + +define void @fun(i32* %data, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i + %tmp1 = load i32, i32* %tmp0, align 4 + %tmp2 = add i32 %tmp1, 1 + store i32 %tmp2, i32* %tmp0, align 4 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void + +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4 + +; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4 +; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4 +} + -- 2.7.4