/// \return the value of vscale to tune the cost model for.
std::optional<unsigned> getVScaleForTuning() const;
+ /// \return true if vscale is known to be a power of 2
+ bool isVScaleKnownToBeAPowerOfTwo() const;
+
/// \return True if the vectorization factor should be chosen to
/// make the vector of the smallest element type match the size of a
/// vector register. For wider element types, this could result in
virtual unsigned getMinVectorRegisterBitWidth() const = 0;
virtual std::optional<unsigned> getMaxVScale() const = 0;
virtual std::optional<unsigned> getVScaleForTuning() const = 0;
+ virtual bool isVScaleKnownToBeAPowerOfTwo() const = 0;
virtual bool
shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const = 0;
virtual ElementCount getMinimumVF(unsigned ElemWidth,
std::optional<unsigned> getVScaleForTuning() const override {
return Impl.getVScaleForTuning();
}
+ bool isVScaleKnownToBeAPowerOfTwo() const override {
+ return Impl.isVScaleKnownToBeAPowerOfTwo();
+ }
bool shouldMaximizeVectorBandwidth(
TargetTransformInfo::RegisterKind K) const override {
return Impl.shouldMaximizeVectorBandwidth(K);
std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
+ bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
bool
shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const {
std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
+ bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the demanded result elements need to be inserted and/or
return TTIImpl->getVScaleForTuning();
}
+bool TargetTransformInfo::isVScaleKnownToBeAPowerOfTwo() const {
+ return TTIImpl->isVScaleKnownToBeAPowerOfTwo();
+}
+
bool TargetTransformInfo::shouldMaximizeVectorBandwidth(
TargetTransformInfo::RegisterKind K) const {
return TTIImpl->shouldMaximizeVectorBandwidth(K);
return !Subtarget->useSVEForFixedLengthVectors();
}
-bool AArch64TargetLowering::isVScaleKnownToBeAPowerOfTwo() const {
- return true;
-}
-
bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
EVT VT, bool OverrideNEON) const {
if (!VT.isFixedLengthVector() || !VT.isSimple())
SDValue Chain, SDValue InFlag,
SDValue PStateSM, bool Entry) const;
- bool isVScaleKnownToBeAPowerOfTwo() const override;
+ bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
// Normally SVE is only used for byte size vectors that do not fit within a
// NEON vector. This changes when OverrideNEON is true, allowing SVE to be
return ST->getVScaleForTuning();
}
+ bool isVScaleKnownToBeAPowerOfTwo() const { return true; }
+
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;
/// Try to return an estimate cost factor that can be used as a multiplier
return ST->is64Bit() && !ST->hasVInstructionsI64();
}
+ bool isVScaleKnownToBeAPowerOfTwo() const {
+ return TLI->isVScaleKnownToBeAPowerOfTwo();
+ }
+
/// \returns How the target needs this vector-predicated operation to be
/// transformed.
TargetTransformInfo::VPLegalization
}
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
+
// Avoid tail folding if the trip count is known to be a multiple of any VF
- // we chose.
- // FIXME: The condition below pessimises the case for fixed-width vectors,
- // when scalable VFs are also candidates for vectorization.
- if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
- ElementCount MaxFixedVF = MaxFactors.FixedVF;
- assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
+ // we choose.
+ std::optional<unsigned> MaxPowerOf2RuntimeVF =
+ MaxFactors.FixedVF.getFixedValue();
+ if (MaxFactors.ScalableVF) {
+ std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
+ if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
+ MaxPowerOf2RuntimeVF = std::max<unsigned>(
+ *MaxPowerOf2RuntimeVF,
+ *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
+ } else
+ MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
+ }
+
+ if (MaxPowerOf2RuntimeVF) {
+ assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
"MaxFixedVF must be a power of 2");
- unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
- : MaxFixedVF.getFixedValue();
+ unsigned MaxVFtimesIC =
+ UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
ScalarEvolution *SE = PSE.getSE();
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
const SCEV *ExitCount = SE->getAddExpr(
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s 2>&1 | FileCheck %s
-; This test currently fails when the LV calculates a maximums safe
-; distance for scalable vectors, because the code to eliminate the tail is
-; pessimistic when scalable vectors are considered. This will be addressed
-; in a future patch, at which point we should be able to un-XFAIL the
-; test. The expected output is to vectorize this loop without predication
-; (and thus have unpredicated vector store).
-; XFAIL: *
-
-; CHECK: store <4 x i32>
-
target triple = "aarch64"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
define void @f1(ptr %A) #0 {
+; CHECK-LABEL: define void @f1
+; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
entry:
br label %for.body
ret void
}
-attributes #0 = { "target-features"="+sve" }
+attributes #0 = { "target-features"="+sve" vscale_range(1,16) }
define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-LABEL: @simple_memset_trip1024(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = sub i64 1024, [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 1024, [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1024)
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
-; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: store <vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
!3 = distinct !{!3, !4}
!4 = !{!"llvm.loop.vectorize.width", i32 4}
-attributes #0 = { "target-features"="+sve" }
+attributes #0 = { "target-features"="+sve" vscale_range(1,16) }