unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) const;
+  /// If the target has efficient vector element load/store instructions, it
+  /// can return true here so that insertion/extraction costs are not added
+  /// to the scalarization cost of a load/store.
+ bool supportsEfficientVectorElementLoadStore() const;
+
/// \brief Don't restrict interleaved unrolling to small loops.
bool enableAggressiveInterleaving(bool LoopHasReductions) const;
getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) = 0;
+ virtual bool supportsEfficientVectorElementLoadStore() = 0;
virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
virtual bool enableInterleavedAccessVectorization() = 0;
virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
return Impl.getOperandsScalarizationOverhead(Args, VF);
}
+ bool supportsEfficientVectorElementLoadStore() override {
+ return Impl.supportsEfficientVectorElementLoadStore();
+ }
+
bool enableAggressiveInterleaving(bool LoopHasReductions) override {
return Impl.enableAggressiveInterleaving(LoopHasReductions);
}
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
unsigned VF) { return 0; }
+ bool supportsEfficientVectorElementLoadStore() { return false; }
+
bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
bool enableInterleavedAccessVectorization() { return false; }
return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
}
+bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
+ return TTIImpl->supportsEfficientVectorElementLoadStore();
+}
+
bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const {
return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
}
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
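+  // z13 has vector element load/store instructions (e.g. vlef/vstef), so a
+  // scalarized memory access needs no separate insert/extract instructions.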
+ bool supportsEfficientVectorElementLoadStore() { return true; }
bool enableInterleavedAccessVectorization() { return true; }
int getArithmeticInstrCost(
unsigned Cost = 0;
Type *RetTy = ToVectorTy(I->getType(), VF);
- if (!RetTy->isVoidTy())
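+  // The results of a scalarized load would normally be inserted back into a
+  // vector; skip that insertion cost if the target can load vector elements
+  // directly.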
+ if (!RetTy->isVoidTy() &&
+ (!isa<LoadInst>(I) ||
+ !TTI.supportsEfficientVectorElementLoadStore()))
Cost += TTI.getScalarizationOverhead(RetTy, true, false);
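+  // A call's arguments must always be extracted from their vectors; the new
+  // element load/store hook only applies to the loads and stores themselves.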
if (CallInst *CI = dyn_cast<CallInst>(I)) {
SmallVector<const Value *, 4> Operands(CI->arg_operands());
Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  } else {
+  } else if (!isa<StoreInst>(I) ||
+             !TTI.supportsEfficientVectorElementLoadStore()) {
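+    // A scalarized store extracts its stored operand from a vector; skip the
+    // extraction cost if the target can store elements directly.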
SmallVector<const Value *, 4> Operands(I->operand_values());
Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
}
--- /dev/null
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN: -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \
+; RUN: FileCheck %s
+;
+; Check that a scalarized load/store does not get a cost for inserts/
+; extracts, since z13 supports vector element load/store instructions.
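+;
+; The stride-2 accesses (with interleaved accesses disabled) force the load
+; and store to be scalarized, so the cost of 4 for VF 4 covers just the four
+; scalar memory operations, with no insert/extract overhead added on top.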
+
+define void @fun(i32* %data, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ %tmp2 = add i32 %tmp1, 1
+ store i32 %tmp2, i32* %tmp0, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+
+; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4
+}
+