/// This is currently measured in number of instructions.
unsigned getPrefetchDistance() const;
+ /// Some HW prefetchers can handle accesses up to a certain constant stride.
+ /// \return The minimum stride in bytes where it makes sense to start adding
+ /// SW prefetches. The default is 1, i.e. prefetch with any stride.
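+ /// For example, with a minimum stride of 2048, a loop striding by 2400
+ /// bytes would get SW prefetches, while a unit-stride loop would be left
+ /// to the HW prefetcher.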
+ unsigned getMinPrefetchStride() const;
+
/// \return The maximum interleave factor that any transform should try to
/// perform for this target. This number depends on the level of parallelism
/// and the number of execution units in the CPU.
virtual unsigned getRegisterBitWidth(bool Vector) = 0;
virtual unsigned getCacheLineSize() = 0;
virtual unsigned getPrefetchDistance() = 0;
+ virtual unsigned getMinPrefetchStride() = 0;
virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
virtual unsigned
getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
return Impl.getCacheLineSize();
}
unsigned getPrefetchDistance() override { return Impl.getPrefetchDistance(); }
+ unsigned getMinPrefetchStride() override {
+ return Impl.getMinPrefetchStride();
+ }
unsigned getMaxInterleaveFactor(unsigned VF) override {
return Impl.getMaxInterleaveFactor(VF);
}
unsigned getPrefetchDistance() { return 0; }
+ unsigned getMinPrefetchStride() { return 1; }
+
unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return TTIImpl->getPrefetchDistance();
}
+unsigned TargetTransformInfo::getMinPrefetchStride() const {
+ return TTIImpl->getMinPrefetchStride();
+}
+
unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
cl::desc("Number of instructions to prefetch ahead for Cyclone"),
cl::init(280), cl::Hidden);
+// Cyclone's HW prefetcher handles accesses with strides up to 2KB.
+static cl::opt<unsigned> CycloneMinPrefetchStride(
+ "cyclone-min-prefetch-stride",
+ cl::desc("Min stride to add prefetches for Cyclone"),
+ cl::init(2048), cl::Hidden);
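+// The option is hidden but can still be set explicitly, e.g.
+// -cyclone-min-prefetch-stride=4096, when experimenting with the threshold.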
+
/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
return CyclonePrefetchDistance;
return BaseT::getPrefetchDistance();
}
+
+unsigned AArch64TTIImpl::getMinPrefetchStride() {
+ if (ST->isCyclone())
+ return CycloneMinPrefetchStride;
+ return BaseT::getMinPrefetchStride();
+}
unsigned getCacheLineSize();
unsigned getPrefetchDistance();
+
+ unsigned getMinPrefetchStride();
/// @}
};
bool runOnFunction(Function &F) override;
bool runOnLoop(Loop *L);
+ /// \brief Check if the stride of the accesses is large enough to warrant a
+ /// prefetch.
+ bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+
private:
AssumptionCache *AC;
LoopInfo *LI;
FunctionPass *llvm::createLoopDataPrefetchPass() { return new LoopDataPrefetch(); }
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
+ unsigned TargetMinStride = TTI->getMinPrefetchStride();
+ // If the target accepts any stride, there is nothing to check.
+ if (TargetMinStride <= 1)
+ return true;
+
+ const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ // If a minimum stride is set, don't prefetch unless the stride is a
+ // constant we can prove is at least that large.
+ if (!ConstStride)
+ return false;
+
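+ // A negative constant stride (a loop walking memory backwards) counts by
+ // its magnitude, so compare the absolute value against the target minimum.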
+ unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
+ return TargetMinStride <= AbsStride;
+}
+
bool LoopDataPrefetch::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
if (!LSCEVAddRec)
continue;
+ // Check if the stride of the accesses is large enough to warrant a
+ // prefetch.
+ if (!isStrideLargeEnough(LSCEVAddRec))
+ continue;
+
// We don't want to double prefetch individual cache lines. If this load
// is known to be within one cache line of some other load that has
// already been prefetched, then don't prefetch this one as well.
--- /dev/null
+; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=generic -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+
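+; @small_stride advances 8 bytes per iteration; @large_stride advances 300
+; doubles (2400 bytes), past Cyclone's 2KB HW-prefetcher limit, so only the
+; latter gets SW prefetches, and only when targeting Cyclone.
+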
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+
+; ALL-LABEL: @small_stride(
+define void @small_stride(double* nocapture %a, double* nocapture readonly %b) {
+entry:
+ br label %for.body
+
+; ALL: for.body:
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+; ALL-NOT: call void @llvm.prefetch
+ %0 = load double, double* %arrayidx, align 8
+ %add = fadd double %0, 1.000000e+00
+ %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+ store double %add, double* %arrayidx2, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1600
+ br i1 %exitcond, label %for.end, label %for.body
+
+; ALL: for.end:
+for.end: ; preds = %for.body
+ ret void
+}
+
+; ALL-LABEL: @large_stride(
+define void @large_stride(double* nocapture %a, double* nocapture readonly %b) {
+entry:
+ br label %for.body
+
+; ALL: for.body:
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+; LARGE_PREFETCH: call void @llvm.prefetch
+; NO_LARGE_PREFETCH-NOT: call void @llvm.prefetch
+ %0 = load double, double* %arrayidx, align 8
+ %add = fadd double %0, 1.000000e+00
+ %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+ store double %add, double* %arrayidx2, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 300
+ %exitcond = icmp eq i64 %indvars.iv.next, 159900
+ br i1 %exitcond, label %for.end, label %for.body
+
+; ALL: for.end:
+for.end: ; preds = %for.body
+ ret void
+}
--- /dev/null
+config.suffixes = ['.ll']
+
+if 'AArch64' not in config.root.targets:
+ config.unsupported = True