/// \return Some HW prefetchers can handle accesses up to a certain
/// constant stride. This is the minimum stride in bytes where it
/// makes sense to start adding SW prefetches. The default is 1,
- /// i.e. prefetch with any stride.
- unsigned getMinPrefetchStride() const;
+ /// i.e. prefetch with any stride. Sometimes prefetching is beneficial
+ /// even below the HW prefetcher limit, and the arguments provided are
+ /// meant to serve as a basis for deciding this for a particular loop:
+ /// \param NumMemAccesses Number of memory accesses in the loop.
+  /// \param NumStridedMemAccesses Number of memory accesses for which
+  /// ScalarEvolution could determine a known stride.
+ /// \param NumPrefetches Number of software prefetches that will be emitted
+ /// as determined by the addresses involved and the cache line size.
+ /// \param HasCall True if the loop contains a call.
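+  ///
+  /// A target override might look roughly like the sketch below; the
+  /// thresholds are hypothetical and only illustrate how the arguments can
+  /// be combined (MyTTIImpl is a made-up name):
+  /// \code
+  ///   unsigned MyTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
+  ///                                            unsigned NumStridedMemAccesses,
+  ///                                            unsigned NumPrefetches,
+  ///                                            bool HasCall) const {
+  ///     if (NumPrefetches > 16)   // Too many prefetches: opt out entirely.
+  ///       return UINT_MAX;
+  ///     // Large, call-free loops with only strided accesses may overwhelm
+  ///     // the HW prefetcher; emit SW prefetches for any stride.
+  ///     if (NumStridedMemAccesses > 32 && !HasCall &&
+  ///         NumMemAccesses == NumStridedMemAccesses)
+  ///       return 1;
+  ///     return 2048; // Otherwise, only strides beyond the HW limit.
+  ///   }
+  /// \endcode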
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const;
/// \return The maximum number of iterations to prefetch ahead. If
/// the required number of iterations is more than this number, no
/// prefetching is performed.
unsigned getMaxPrefetchIterationsAhead() const;
+ /// \return True if prefetching should also be done for writes.
+ bool enableWritePrefetching() const;
+
/// \return The maximum interleave factor that any transform should try to
/// perform for this target. This number depends on the level of parallelism
/// and the number of execution units in the CPU.
/// \return Some HW prefetchers can handle accesses up to a certain
/// constant stride. This is the minimum stride in bytes where it
/// makes sense to start adding SW prefetches. The default is 1,
- /// i.e. prefetch with any stride.
- virtual unsigned getMinPrefetchStride() const = 0;
+ /// i.e. prefetch with any stride. Sometimes prefetching is beneficial
+ /// even below the HW prefetcher limit, and the arguments provided are
+ /// meant to serve as a basis for deciding this for a particular loop.
+ virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const = 0;
/// \return The maximum number of iterations to prefetch ahead. If
/// the required number of iterations is more than this number, no
/// prefetching is performed.
virtual unsigned getMaxPrefetchIterationsAhead() const = 0;
+ /// \return True if prefetching should also be done for writes.
+ virtual bool enableWritePrefetching() const = 0;
+
virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
virtual unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
/// Return the minimum stride necessary to trigger software
/// prefetching.
///
- unsigned getMinPrefetchStride() const override {
- return Impl.getMinPrefetchStride();
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const override {
+ return Impl.getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ NumPrefetches, HasCall);
}
/// Return the maximum prefetch distance in terms of loop
return Impl.getMaxPrefetchIterationsAhead();
}
+ /// \return True if prefetching should also be done for writes.
+ bool enableWritePrefetching() const override {
+ return Impl.enableWritePrefetching();
+ }
+
unsigned getMaxInterleaveFactor(unsigned VF) override {
return Impl.getMaxInterleaveFactor(VF);
}
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
/// Loop prefetch implementation class.
class LoopDataPrefetch {
public:
- LoopDataPrefetch(AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE,
- const TargetTransformInfo *TTI,
+ LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, const TargetTransformInfo *TTI,
OptimizationRemarkEmitter *ORE)
- : AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
+ : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
bool run();
/// Check if the stride of the accesses is large enough to
/// warrant a prefetch.
- bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+ bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride);
- unsigned getMinPrefetchStride() {
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) {
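+    // A MinPrefetchStride value given on the command line takes precedence
+    // over what the target returns.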
if (MinPrefetchStride.getNumOccurrences() > 0)
return MinPrefetchStride;
- return TTI->getMinPrefetchStride();
+ return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ NumPrefetches, HasCall);
}
unsigned getPrefetchDistance() {
return TTI->getMaxPrefetchIterationsAhead();
}
+ bool doPrefetchWrites() {
+ if (PrefetchWrites.getNumOccurrences() > 0)
+ return PrefetchWrites;
+ return TTI->enableWritePrefetching();
+ }
+
AssumptionCache *AC;
+ DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
const TargetTransformInfo *TTI;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
return new LoopDataPrefetchLegacyPass();
}
-bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
- unsigned TargetMinStride = getMinPrefetchStride();
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR,
+ unsigned TargetMinStride) {
// No need to check if any stride goes.
if (TargetMinStride <= 1)
return true;
PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
FunctionAnalysisManager &AM) {
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
&AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
- LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
bool Changed = LDP.run();
if (Changed) {
if (skipFunction(F))
return false;
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AssumptionCache *AC =
const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
return LDP.run();
}
return MadeChange;
}
+/// A record for a potential prefetch made during the initial scan of the
+/// loop. This is used to let a single prefetch target multiple memory accesses.
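+///
+/// For example (names hypothetical), a store on the same cache line as an
+/// already-recorded load folds into the existing record:
+/// \code
+///   Prefetch P(LSCEVAddRec, FirstLoad);      // a read prefetch so far
+///   P.addInstruction(StoreSameLine, DT, /*PtrDiff=*/0);
+///   // P.Writes is now true, and P.InsertPt moves to a common dominator
+///   // when the two accesses are in different blocks.
+/// \endcode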
+struct Prefetch {
+ /// The address formula for this prefetch as returned by ScalarEvolution.
+ const SCEVAddRecExpr *LSCEVAddRec;
+ /// The point of insertion for the prefetch instruction.
+ Instruction *InsertPt;
+ /// True if targeting a write memory access.
+ bool Writes;
+ /// The (first seen) prefetched instruction.
+ Instruction *MemI;
+
+  /// Constructor to create a new Prefetch for \p I.
+ Prefetch(const SCEVAddRecExpr *L, Instruction *I)
+ : LSCEVAddRec(L), InsertPt(nullptr), Writes(false), MemI(nullptr) {
+ addInstruction(I);
+  }
+
+  /// Add the instruction \p I to this prefetch. If it's not the first
+ /// one, 'InsertPt' and 'Writes' will be updated as required.
+ /// \param PtrDiff the known constant address difference to the first added
+ /// instruction.
+ void addInstruction(Instruction *I, DominatorTree *DT = nullptr,
+ int64_t PtrDiff = 0) {
+ if (!InsertPt) {
+ MemI = I;
+ InsertPt = I;
+ Writes = isa<StoreInst>(I);
+ } else {
+ BasicBlock *PrefBB = InsertPt->getParent();
+ BasicBlock *InsBB = I->getParent();
+ if (PrefBB != InsBB) {
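+        // Hoist the insertion point so one prefetch dominates both accesses.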
+ BasicBlock *DomBB = DT->findNearestCommonDominator(PrefBB, InsBB);
+ if (DomBB != PrefBB)
+ InsertPt = DomBB->getTerminator();
+ }
+
+ if (isa<StoreInst>(I) && PtrDiff == 0)
+ Writes = true;
+ }
+ }
+};
+
bool LoopDataPrefetch::runOnLoop(Loop *L) {
bool MadeChange = false;
// Calculate the number of iterations ahead to prefetch
CodeMetrics Metrics;
+ bool HasCall = false;
for (const auto BB : L->blocks()) {
// If the loop already has prefetches, then assume that the user knows
// what they are doing and don't add any more.
- for (auto &I : *BB)
- if (CallInst *CI = dyn_cast<CallInst>(&I))
- if (Function *F = CI->getCalledFunction())
+ for (auto &I : *BB) {
+ if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
+ ImmutableCallSite CS(&I);
+ if (const Function *F = CS.getCalledFunction()) {
if (F->getIntrinsicID() == Intrinsic::prefetch)
return MadeChange;
-
+ if (TTI->isLoweredToCall(F))
+ HasCall = true;
+ } else { // indirect call.
+ HasCall = true;
+ }
+ }
+ }
Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
}
unsigned LoopSize = Metrics.NumInsts;
if (ItersAhead > getMaxPrefetchIterationsAhead())
return MadeChange;
- LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
- << " iterations ahead (loop size: " << LoopSize << ") in "
- << L->getHeader()->getParent()->getName() << ": " << *L);
+ unsigned ConstantMaxTripCount = SE->getSmallConstantMaxTripCount(L);
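+  // E.g. with ItersAhead == 8 in a loop known to run at most 8 times, every
+  // prefetched address would belong to an iteration that never executes.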
+ if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1)
+ return MadeChange;
- SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
- for (const auto BB : L->blocks()) {
+ unsigned NumMemAccesses = 0;
+ unsigned NumStridedMemAccesses = 0;
+ SmallVector<Prefetch, 16> Prefetches;
+ for (const auto BB : L->blocks())
for (auto &I : *BB) {
Value *PtrValue;
Instruction *MemI;
MemI = LMemI;
PtrValue = LMemI->getPointerOperand();
} else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
- if (!PrefetchWrites) continue;
+ if (!doPrefetchWrites()) continue;
MemI = SMemI;
PtrValue = SMemI->getPointerOperand();
} else continue;
unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
if (PtrAddrSpace)
continue;
-
+ NumMemAccesses++;
if (L->isLoopInvariant(PtrValue))
continue;
const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
if (!LSCEVAddRec)
continue;
+ NumStridedMemAccesses++;
- // Check if the stride of the accesses is large enough to warrant a
- // prefetch.
- if (!isStrideLargeEnough(LSCEVAddRec))
- continue;
-
- // We don't want to double prefetch individual cache lines. If this load
- // is known to be within one cache line of some other load that has
- // already been prefetched, then don't prefetch this one as well.
+ // We don't want to double prefetch individual cache lines. If this
+ // access is known to be within one cache line of some other one that
+ // has already been prefetched, then don't prefetch this one as well.
bool DupPref = false;
- for (const auto &PrefLoad : PrefLoads) {
- const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second);
+ for (auto &Pref : Prefetches) {
+ const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);
if (const SCEVConstant *ConstPtrDiff =
dyn_cast<SCEVConstant>(PtrDiff)) {
int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
if (PD < (int64_t) TTI->getCacheLineSize()) {
+ Pref.addInstruction(MemI, DT, PD);
DupPref = true;
break;
}
}
}
- if (DupPref)
- continue;
+ if (!DupPref)
+ Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
+ }
- const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
- SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
- LSCEVAddRec->getStepRecurrence(*SE)));
- if (!isSafeToExpand(NextLSCEV, *SE))
- continue;
+ unsigned TargetMinStride =
+ getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ Prefetches.size(), HasCall);
- PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));
-
- Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace);
- SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr");
- Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);
-
- IRBuilder<> Builder(MemI);
- Module *M = BB->getParent()->getParent();
- Type *I32 = Type::getInt32Ty(BB->getContext());
- Function *PrefetchFunc = Intrinsic::getDeclaration(
- M, Intrinsic::prefetch, PrefPtrValue->getType());
- Builder.CreateCall(
- PrefetchFunc,
- {PrefPtrValue,
- ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
- ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
- ++NumPrefetches;
- LLVM_DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
- << "\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
- << "prefetched memory access";
+ LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << L->getHeader()->getParent()->getName() << ": " << *L);
+ LLVM_DEBUG(dbgs() << "Loop has: "
+ << NumMemAccesses << " memory accesses, "
+ << NumStridedMemAccesses << " strided memory accesses, "
+ << Prefetches.size() << " potential prefetch(es), "
+ << "a minimum stride of " << TargetMinStride << ", "
+ << (HasCall ? "calls" : "no calls") << ".\n");
+
+ for (auto &P : Prefetches) {
+ // Check if the stride of the accesses is large enough to warrant a
+ // prefetch.
+ if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))
+ continue;
+
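+    // The address this access will use ItersAhead iterations from now:
+    // AddRec + ItersAhead * Step.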
+ const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
+ SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
+ P.LSCEVAddRec->getStepRecurrence(*SE)));
+ if (!isSafeToExpand(NextLSCEV, *SE))
+ continue;
+
+ BasicBlock *BB = P.InsertPt->getParent();
+ Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), 0/*PtrAddrSpace*/);
+ SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
+ Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);
+
+ IRBuilder<> Builder(P.InsertPt);
+ Module *M = BB->getParent()->getParent();
+ Type *I32 = Type::getInt32Ty(BB->getContext());
+ Function *PrefetchFunc = Intrinsic::getDeclaration(
+ M, Intrinsic::prefetch, PrefPtrValue->getType());
+ Builder.CreateCall(
+ PrefetchFunc,
+ {PrefPtrValue,
+ ConstantInt::get(I32, P.Writes),
+ ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+ ++NumPrefetches;
+ LLVM_DEBUG(dbgs() << " Access: "
+ << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
+ << ", SCEV: " << *P.LSCEVAddRec << "\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI)
+ << "prefetched memory access";
});
- MadeChange = true;
- }
+ MadeChange = true;
}
return MadeChange;