/// \return Some HW prefetchers can handle accesses up to a certain
/// constant stride. This is the minimum stride in bytes where it
/// makes sense to start adding SW prefetches. The default is 1,
- /// i.e. prefetch with any stride.
- unsigned getMinPrefetchStride() const;
+ /// i.e. prefetch with any stride. Sometimes prefetching is beneficial
+ /// even below the HW prefetcher limit, and the arguments provided are
+ /// meant to serve as a basis for deciding this for a particular loop:
+ /// \param NumMemAccesses Number of memory accesses in the loop.
+  /// \param NumStridedMemAccesses Number of memory accesses for which
+  /// ScalarEvolution could determine a known stride.
+ /// \param NumPrefetches Number of software prefetches that will be emitted
+ /// as determined by the addresses involved and the cache line size.
+ /// \param HasCall True if the loop contains a call.
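+  ///
+  /// A target override might look roughly like the sketch below; the
+  /// thresholds are hypothetical and only illustrate how the arguments can
+  /// be combined (MyTTIImpl is a made-up name):
+  /// \code
+  ///   unsigned MyTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
+  ///                                            unsigned NumStridedMemAccesses,
+  ///                                            unsigned NumPrefetches,
+  ///                                            bool HasCall) const {
+  ///     if (NumPrefetches > 16)   // Too many prefetches: opt out entirely.
+  ///       return UINT_MAX;
+  ///     // Large, call-free loops with only strided accesses may overwhelm
+  ///     // the HW prefetcher; emit SW prefetches for any stride.
+  ///     if (NumStridedMemAccesses > 32 && !HasCall &&
+  ///         NumMemAccesses == NumStridedMemAccesses)
+  ///       return 1;
+  ///     return 2048; // Otherwise, only strides beyond the HW limit.
+  ///   }
+  /// \endcode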
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const;
/// \return The maximum number of iterations to prefetch ahead. If
/// the required number of iterations is more than this number, no
/// prefetching is performed.
unsigned getMaxPrefetchIterationsAhead() const;
+ /// \return True if prefetching should also be done for writes.
+ bool enableWritePrefetching() const;
+
/// \return The maximum interleave factor that any transform should try to
/// perform for this target. This number depends on the level of parallelism
/// and the number of execution units in the CPU.
/// \return Some HW prefetchers can handle accesses up to a certain
/// constant stride. This is the minimum stride in bytes where it
/// makes sense to start adding SW prefetches. The default is 1,
- /// i.e. prefetch with any stride.
- virtual unsigned getMinPrefetchStride() const = 0;
+ /// i.e. prefetch with any stride. Sometimes prefetching is beneficial
+ /// even below the HW prefetcher limit, and the arguments provided are
+ /// meant to serve as a basis for deciding this for a particular loop.
+ virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const = 0;
/// \return The maximum number of iterations to prefetch ahead. If
/// the required number of iterations is more than this number, no
/// prefetching is performed.
virtual unsigned getMaxPrefetchIterationsAhead() const = 0;
+ /// \return True if prefetching should also be done for writes.
+ virtual bool enableWritePrefetching() const = 0;
+
virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
virtual unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
/// Return the minimum stride necessary to trigger software
/// prefetching.
///
- unsigned getMinPrefetchStride() const override {
- return Impl.getMinPrefetchStride();
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const override {
+ return Impl.getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ NumPrefetches, HasCall);
}
/// Return the maximum prefetch distance in terms of loop
return Impl.getMaxPrefetchIterationsAhead();
}
+ /// \return True if prefetching should also be done for writes.
+ bool enableWritePrefetching() const override {
+ return Impl.enableWritePrefetching();
+ }
+
unsigned getMaxInterleaveFactor(unsigned VF) override {
return Impl.getMaxInterleaveFactor(VF);
}
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
/// Loop prefetch implementation class.
class LoopDataPrefetch {
public:
- LoopDataPrefetch(AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE,
- const TargetTransformInfo *TTI,
+ LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, const TargetTransformInfo *TTI,
OptimizationRemarkEmitter *ORE)
- : AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
+ : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
bool run();
/// Check if the stride of the accesses is large enough to
/// warrant a prefetch.
- bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+ bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride);
- unsigned getMinPrefetchStride() {
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) {
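+    // A MinPrefetchStride value given on the command line takes precedence
+    // over what the target returns.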
if (MinPrefetchStride.getNumOccurrences() > 0)
return MinPrefetchStride;
- return TTI->getMinPrefetchStride();
+ return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ NumPrefetches, HasCall);
}
unsigned getPrefetchDistance() {
return TTI->getMaxPrefetchIterationsAhead();
}
+ bool doPrefetchWrites() {
+ if (PrefetchWrites.getNumOccurrences() > 0)
+ return PrefetchWrites;
+ return TTI->enableWritePrefetching();
+ }
+
AssumptionCache *AC;
+ DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
const TargetTransformInfo *TTI;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
return new LoopDataPrefetchLegacyPass();
}
-bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
- unsigned TargetMinStride = getMinPrefetchStride();
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR,
+ unsigned TargetMinStride) {
// No need to check if any stride goes.
if (TargetMinStride <= 1)
return true;
PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
FunctionAnalysisManager &AM) {
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
&AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
- LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
bool Changed = LDP.run();
if (Changed) {
if (skipFunction(F))
return false;
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AssumptionCache *AC =
const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
return LDP.run();
}
return MadeChange;
}
+/// A record for a potential prefetch made during the initial scan of the
+/// loop. This is used to let a single prefetch target multiple memory accesses.
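+///
+/// For example (names hypothetical), a store on the same cache line as an
+/// already-recorded load folds into the existing record:
+/// \code
+///   Prefetch P(LSCEVAddRec, FirstLoad);      // a read prefetch so far
+///   P.addInstruction(StoreSameLine, DT, /*PtrDiff=*/0);
+///   // P.Writes is now true, and P.InsertPt moves to a common dominator
+///   // when the two accesses are in different blocks.
+/// \endcode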
+struct Prefetch {
+ /// The address formula for this prefetch as returned by ScalarEvolution.
+ const SCEVAddRecExpr *LSCEVAddRec;
+ /// The point of insertion for the prefetch instruction.
+ Instruction *InsertPt;
+ /// True if targeting a write memory access.
+ bool Writes;
+ /// The (first seen) prefetched instruction.
+ Instruction *MemI;
+
+  /// Constructor to create a new Prefetch for \p I.
+ Prefetch(const SCEVAddRecExpr *L, Instruction *I)
+ : LSCEVAddRec(L), InsertPt(nullptr), Writes(false), MemI(nullptr) {
+ addInstruction(I);
+  }
+
+  /// Add the instruction \p I to this prefetch. If it's not the first
+ /// one, 'InsertPt' and 'Writes' will be updated as required.
+ /// \param PtrDiff the known constant address difference to the first added
+ /// instruction.
+ void addInstruction(Instruction *I, DominatorTree *DT = nullptr,
+ int64_t PtrDiff = 0) {
+ if (!InsertPt) {
+ MemI = I;
+ InsertPt = I;
+ Writes = isa<StoreInst>(I);
+ } else {
+ BasicBlock *PrefBB = InsertPt->getParent();
+ BasicBlock *InsBB = I->getParent();
+ if (PrefBB != InsBB) {
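+        // Hoist the insertion point so one prefetch dominates both accesses.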
+ BasicBlock *DomBB = DT->findNearestCommonDominator(PrefBB, InsBB);
+ if (DomBB != PrefBB)
+ InsertPt = DomBB->getTerminator();
+ }
+
+ if (isa<StoreInst>(I) && PtrDiff == 0)
+ Writes = true;
+ }
+ }
+};
+
bool LoopDataPrefetch::runOnLoop(Loop *L) {
bool MadeChange = false;
// Calculate the number of iterations ahead to prefetch
CodeMetrics Metrics;
+ bool HasCall = false;
for (const auto BB : L->blocks()) {
// If the loop already has prefetches, then assume that the user knows
// what they are doing and don't add any more.
- for (auto &I : *BB)
- if (CallInst *CI = dyn_cast<CallInst>(&I))
- if (Function *F = CI->getCalledFunction())
+ for (auto &I : *BB) {
+ if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
+ ImmutableCallSite CS(&I);
+ if (const Function *F = CS.getCalledFunction()) {
if (F->getIntrinsicID() == Intrinsic::prefetch)
return MadeChange;
-
+ if (TTI->isLoweredToCall(F))
+ HasCall = true;
+ } else { // indirect call.
+ HasCall = true;
+ }
+ }
+ }
Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
}
unsigned LoopSize = Metrics.NumInsts;
if (ItersAhead > getMaxPrefetchIterationsAhead())
return MadeChange;
- LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
- << " iterations ahead (loop size: " << LoopSize << ") in "
- << L->getHeader()->getParent()->getName() << ": " << *L);
+ unsigned ConstantMaxTripCount = SE->getSmallConstantMaxTripCount(L);
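+  // E.g. with ItersAhead == 8 in a loop known to run at most 8 times, every
+  // prefetched address would belong to an iteration that never executes.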
+ if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1)
+ return MadeChange;
- SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
- for (const auto BB : L->blocks()) {
+ unsigned NumMemAccesses = 0;
+ unsigned NumStridedMemAccesses = 0;
+ SmallVector<Prefetch, 16> Prefetches;
+ for (const auto BB : L->blocks())
for (auto &I : *BB) {
Value *PtrValue;
Instruction *MemI;
MemI = LMemI;
PtrValue = LMemI->getPointerOperand();
} else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
- if (!PrefetchWrites) continue;
+ if (!doPrefetchWrites()) continue;
MemI = SMemI;
PtrValue = SMemI->getPointerOperand();
} else continue;
unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
if (PtrAddrSpace)
continue;
-
+ NumMemAccesses++;
if (L->isLoopInvariant(PtrValue))
continue;
const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
if (!LSCEVAddRec)
continue;
+ NumStridedMemAccesses++;
- // Check if the stride of the accesses is large enough to warrant a
- // prefetch.
- if (!isStrideLargeEnough(LSCEVAddRec))
- continue;
-
- // We don't want to double prefetch individual cache lines. If this load
- // is known to be within one cache line of some other load that has
- // already been prefetched, then don't prefetch this one as well.
+ // We don't want to double prefetch individual cache lines. If this
+ // access is known to be within one cache line of some other one that
+ // has already been prefetched, then don't prefetch this one as well.
bool DupPref = false;
- for (const auto &PrefLoad : PrefLoads) {
- const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second);
+ for (auto &Pref : Prefetches) {
+ const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);
if (const SCEVConstant *ConstPtrDiff =
dyn_cast<SCEVConstant>(PtrDiff)) {
int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
if (PD < (int64_t) TTI->getCacheLineSize()) {
+ Pref.addInstruction(MemI, DT, PD);
DupPref = true;
break;
}
}
}
- if (DupPref)
- continue;
+ if (!DupPref)
+ Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
+ }
- const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
- SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
- LSCEVAddRec->getStepRecurrence(*SE)));
- if (!isSafeToExpand(NextLSCEV, *SE))
- continue;
+ unsigned TargetMinStride =
+ getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ Prefetches.size(), HasCall);
- PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));
-
- Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace);
- SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr");
- Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);
-
- IRBuilder<> Builder(MemI);
- Module *M = BB->getParent()->getParent();
- Type *I32 = Type::getInt32Ty(BB->getContext());
- Function *PrefetchFunc = Intrinsic::getDeclaration(
- M, Intrinsic::prefetch, PrefPtrValue->getType());
- Builder.CreateCall(
- PrefetchFunc,
- {PrefPtrValue,
- ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
- ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
- ++NumPrefetches;
- LLVM_DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
- << "\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
- << "prefetched memory access";
+ LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << L->getHeader()->getParent()->getName() << ": " << *L);
+ LLVM_DEBUG(dbgs() << "Loop has: "
+ << NumMemAccesses << " memory accesses, "
+ << NumStridedMemAccesses << " strided memory accesses, "
+ << Prefetches.size() << " potential prefetch(es), "
+ << "a minimum stride of " << TargetMinStride << ", "
+ << (HasCall ? "calls" : "no calls") << ".\n");
+
+ for (auto &P : Prefetches) {
+ // Check if the stride of the accesses is large enough to warrant a
+ // prefetch.
+ if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))
+ continue;
+
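+    // The address this access will use ItersAhead iterations from now:
+    // AddRec + ItersAhead * Step.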
+ const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
+ SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
+ P.LSCEVAddRec->getStepRecurrence(*SE)));
+ if (!isSafeToExpand(NextLSCEV, *SE))
+ continue;
+
+ BasicBlock *BB = P.InsertPt->getParent();
+ Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), 0/*PtrAddrSpace*/);
+ SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
+ Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);
+
+ IRBuilder<> Builder(P.InsertPt);
+ Module *M = BB->getParent()->getParent();
+ Type *I32 = Type::getInt32Ty(BB->getContext());
+ Function *PrefetchFunc = Intrinsic::getDeclaration(
+ M, Intrinsic::prefetch, PrefPtrValue->getType());
+ Builder.CreateCall(
+ PrefetchFunc,
+ {PrefPtrValue,
+ ConstantInt::get(I32, P.Writes),
+ ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+ ++NumPrefetches;
+ LLVM_DEBUG(dbgs() << " Access: "
+ << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
+ << ", SCEV: " << *P.LSCEVAddRec << "\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI)
+ << "prefetched memory access";
});
- MadeChange = true;
- }
+ MadeChange = true;
}
return MadeChange;