class PredicatedScalarEvolution;
class ScalarEvolution;
class SCEV;
+class StoreInst;
/// These are the kinds of recurrences that we support.
enum class RecurKind {
public:
RecurrenceDescriptor() = default;
- RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K,
- FastMathFlags FMF, Instruction *ExactFP, Type *RT,
- bool Signed, bool Ordered,
+ RecurrenceDescriptor(Value *Start, Instruction *Exit, StoreInst *Store,
+ RecurKind K, FastMathFlags FMF, Instruction *ExactFP,
+ Type *RT, bool Signed, bool Ordered,
SmallPtrSetImpl<Instruction *> &CI,
unsigned MinWidthCastToRecurTy)
- : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF),
- ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed),
- IsOrdered(Ordered),
+ : IntermediateStore(Store), StartValue(Start), LoopExitInstr(Exit),
+ Kind(K), FMF(FMF), ExactFPMathInst(ExactFP), RecurrenceType(RT),
+ IsSigned(Signed), IsOrdered(Ordered),
MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) {
CastInsts.insert(CI.begin(), CI.end());
}
/// RecurrenceDescriptor. If either \p DB is non-null or \p AC and \p DT are
/// non-null, the minimal bit width needed to compute the reduction will be
/// computed.
- static bool AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop,
- FastMathFlags FuncFMF,
- RecurrenceDescriptor &RedDes,
- DemandedBits *DB = nullptr,
- AssumptionCache *AC = nullptr,
- DominatorTree *DT = nullptr);
+ static bool
+ AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop,
+ FastMathFlags FuncFMF, RecurrenceDescriptor &RedDes,
+ DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr,
+ DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr);
/// Returns true if Phi is a reduction in TheLoop. The RecurrenceDescriptor
/// is returned in RedDes. If either \p DB is non-null or \p AC and \p DT are
/// non-null, the minimal bit width needed to compute the reduction will be
- /// computed.
- static bool isReductionPHI(PHINode *Phi, Loop *TheLoop,
- RecurrenceDescriptor &RedDes,
- DemandedBits *DB = nullptr,
- AssumptionCache *AC = nullptr,
- DominatorTree *DT = nullptr);
+ /// computed. If \p SE is non-null, store instructions to loop invariant
+ /// addresses are processed.
+ static bool
+ isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes,
+ DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr,
+ DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr);
/// Returns true if Phi is a first-order recurrence. A first-order recurrence
/// is a non-reduction recurrence relation in which the value of the
cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::fmuladd;
}
+ /// Reductions may store a temporary or the final result to an invariant
+ /// address. If there is such a store in the loop then, after a successful
+ /// run of the AddReductionVar method, this field will be assigned the last
+ /// store encountered.
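+ ///
+ /// For example (a sketch; value names are illustrative):
+ ///   for.body:
+ ///     %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
+ ///     ...
+ ///     %sum.next = add i32 %sum, %val
+ ///     store i32 %sum.next, i32* %gep.dst
+ /// Here the store of %sum.next to the loop-invariant address %gep.dst
+ /// would be recorded in this field.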
+ StoreInst *IntermediateStore = nullptr;
+
private:
// The starting value of the recurrence.
// It does not have to be zero!
return HasDependenceInvolvingLoopInvariantAddress;
}
+ /// Return the list of stores to invariant addresses.
+ ArrayRef<StoreInst *> getStoresToInvariantAddresses() const {
+ return StoresToInvariantAddresses;
+ }
+
/// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
/// them to a more usable form. All SCEV expressions during the analysis
/// should be re-written (and therefore simplified) according to PSE.
/// Indicator that there are non-vectorizable stores to a uniform address.
bool HasDependenceInvolvingLoopInvariantAddress = false;
+ /// List of stores to invariant addresses.
+ SmallVector<StoreInst *> StoresToInvariantAddresses;
+
/// The diagnostics report generated for the analysis. E.g. why we
/// couldn't analyze the loop.
std::unique_ptr<OptimizationRemarkAnalysis> Report;
/// Returns the widest induction type.
Type *getWidestInductionType() { return WidestIndTy; }
+ /// Returns true if the given store is the final invariant store of one of
+ /// the reductions found in the loop.
+ bool isInvariantStoreOfReduction(StoreInst *SI);
+
+ /// Returns true if the given address is invariant and is used to store a
+ /// recurrent expression.
+ bool isInvariantAddressOfReduction(Value *V);
+
/// Returns True if V is a Phi node of an induction variable in this loop.
bool isInductionPhi(const Value *V) const;
return true;
}
-bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
- Loop *TheLoop, FastMathFlags FuncFMF,
- RecurrenceDescriptor &RedDes,
- DemandedBits *DB,
- AssumptionCache *AC,
- DominatorTree *DT) {
+bool RecurrenceDescriptor::AddReductionVar(
+ PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF,
+ RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC,
+ DominatorTree *DT, ScalarEvolution *SE) {
if (Phi->getNumIncomingValues() != 2)
return false;
// This includes users of the reduction, variables (which form a cycle
// which ends in the phi node).
Instruction *ExitInstruction = nullptr;
+
+ // Variable to keep track of the last visited store instruction. By the end
+ // of the algorithm it will be either null or will point to the store of an
+ // intermediate reduction value to an invariant address.
+ StoreInst *IntermediateStore = nullptr;
+
// Indicates that we found a reduction operation in our scan.
bool FoundReduxOp = false;
// - By instructions outside of the loop (safe).
// * One value may have several outside users, but all outside
// uses must be of the same value.
+ // - By store instructions with a loop-invariant address (safe with
+ // the following restrictions):
+ // * If there are several stores, all must have the same address.
+ // * The final value must be stored to that loop-invariant address.
// - By an instruction that is not part of the reduction (not safe).
// This is either:
// * An instruction type other than PHI or the reduction operation.
while (!Worklist.empty()) {
Instruction *Cur = Worklist.pop_back_val();
+ // A store instruction is only allowed if it is a store of the reduction
+ // value to the same loop-invariant memory location.
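+ // E.g. (a sketch; value names are illustrative), storing the reduction
+ // value to two different invariant addresses, as in
+ //   store i32 %sum.next, i32* %dst.a
+ //   store i32 %sum.next, i32* %dst.b
+ // is rejected by the checks below.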
+ if (auto *SI = dyn_cast<StoreInst>(Cur)) {
+ if (!SE) {
+ LLVM_DEBUG(dbgs() << "Store instructions are not processed without "
+ << "Scalar Evolution Analysis\n");
+ return false;
+ }
+
+ const SCEV *PtrScev = SE->getSCEV(SI->getPointerOperand());
+ // Check that it is the same address as in previous stores.
+ if (IntermediateStore) {
+ const SCEV *OtherScev =
+ SE->getSCEV(IntermediateStore->getPointerOperand());
+
+ if (OtherScev != PtrScev) {
+ LLVM_DEBUG(dbgs() << "Storing reduction value to different addresses "
+ << "inside the loop: " << *SI->getPointerOperand()
+ << " and "
+ << *IntermediateStore->getPointerOperand() << '\n');
+ return false;
+ }
+ }
+
+ // Check that the pointer is loop-invariant.
+ if (!SE->isLoopInvariant(PtrScev, TheLoop)) {
+ LLVM_DEBUG(dbgs() << "Storing reduction value to non-uniform address "
+ << "inside the loop: " << *SI->getPointerOperand()
+ << '\n');
+ return false;
+ }
+
+ // IntermediateStore is always the last store in the loop.
+ IntermediateStore = SI;
+ continue;
+ }
+
// No Users.
// If the instruction has no users then this is a broken chain and can't be
// a reduction variable.
// reductions which are represented as a cmp followed by a select.
InstDesc IgnoredVal(false, nullptr);
if (VisitedInsts.insert(UI).second) {
- if (isa<PHINode>(UI))
+ if (isa<PHINode>(UI)) {
PHIs.push_back(UI);
- else
+ } else {
+ StoreInst *SI = dyn_cast<StoreInst>(UI);
+ if (SI && SI->getPointerOperand() == Cur) {
+ // The reduction variable chain can only be stored somewhere;
+ // it cannot be used as an address.
+ return false;
+ }
NonPHIs.push_back(UI);
+ }
} else if (!isa<PHINode>(UI) &&
((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
!isa<SelectInst>(UI)) ||
if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1)
return false;
+ if (IntermediateStore) {
+ // Check that the stored value flows back into the phi node. This way we
+ // make sure that the value stored in IntermediateStore is indeed the
+ // final reduction value.
+ if (!is_contained(Phi->operands(), IntermediateStore->getValueOperand())) {
+ LLVM_DEBUG(dbgs() << "Not a final reduction value stored: "
+ << *IntermediateStore << '\n');
+ return false;
+ }
+
+ // If there is an exit instruction, its value should be stored in
+ // IntermediateStore.
+ if (ExitInstruction &&
+ IntermediateStore->getValueOperand() != ExitInstruction) {
+ LLVM_DEBUG(dbgs() << "Last store Instruction of reduction value does not "
+ "store last calculated value of the reduction: "
+ << *IntermediateStore << '\n');
+ return false;
+ }
+
+ // If all uses are inside the loop (intermediate stores), then the
+ // reduction value after the loop will be the one used in the last store.
+ if (!ExitInstruction)
+ ExitInstruction = cast<Instruction>(IntermediateStore->getValueOperand());
+ }
+
if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
return false;
// is saved as part of the RecurrenceDescriptor.
// Save the description of this reduction variable.
- RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst,
- RecurrenceType, IsSigned, IsOrdered, CastInsts,
- MinWidthCastToRecurrenceType);
+ RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind,
+ FMF, ExactFPMathInst, RecurrenceType, IsSigned,
+ IsOrdered, CastInsts, MinWidthCastToRecurrenceType);
RedDes = RD;
return true;
bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
RecurrenceDescriptor &RedDes,
DemandedBits *DB, AssumptionCache *AC,
- DominatorTree *DT) {
+ DominatorTree *DT,
+ ScalarEvolution *SE) {
BasicBlock *Header = TheLoop->getHeader();
Function &F = *Header->getParent();
FastMathFlags FMF;
FMF.setNoSignedZeros(
F.getFnAttribute("no-signed-zeros-fp-math").getValueAsBool());
- if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a SMAX reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a SMIN reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a UMAX reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC,
- DT)) {
+ DT, SE)) {
LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI."
<< *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a float MAX reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC,
- DT)) {
+ DT, SE)) {
LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI."
<< *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC,
- DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n");
return true;
}
for (StoreInst *ST : Stores) {
Value *Ptr = ST->getPointerOperand();
- if (isUniform(Ptr))
+ if (isUniform(Ptr)) {
+ // Record store instructions to loop-invariant addresses.
+ StoresToInvariantAddresses.push_back(ST);
HasDependenceInvolvingLoopInvariantAddress |=
!UniformStores.insert(Ptr).second;
+ }
// If we did *not* see this pointer before, insert it to the read-write
// list. At this phase it is only a 'write' list.
return false;
}
+/// Returns true if A and B have the same pointer operand or the same SCEV
+/// for their addresses.
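+/// For example (illustrative), two distinct GEP instructions with identical
+/// operands are different Values, but SCEV canonicalizes them to the same
+/// (uniqued) expression:
+///   %p = getelementptr inbounds i32, i32* %base, i64 42
+///   %q = getelementptr inbounds i32, i32* %base, i64 42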
+static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
+ StoreInst *B) {
+ // Compare the store instructions themselves.
+ if (A == B)
+ return true;
+
+ // Otherwise compare the pointer operands.
+ Value *APtr = A->getPointerOperand();
+ Value *BPtr = B->getPointerOperand();
+ if (APtr == BPtr)
+ return true;
+
+ // Otherwise compare the address SCEVs.
+ if (SE->getSCEV(APtr) == SE->getSCEV(BPtr))
+ return true;
+
+ return false;
+}
+
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
Value *Ptr) const {
const ValueToValueMap &Strides =
RecurrenceDescriptor RedDes;
if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
- DT)) {
+ DT, PSE.getSE())) {
Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
AllowedExit.insert(RedDes.getLoopExitInstr());
Reductions[Phi] = RedDes;
if (!LAI->canVectorizeMemory())
return false;
- if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
- reportVectorizationFailure("Stores to a uniform address",
- "write to a loop invariant address could not be vectorized",
- "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
- return false;
+ // We can vectorize stores to an invariant address when the final reduction
+ // value is guaranteed to be stored there at the end of the loop. Also, once
+ // the decision to vectorize the loop is made, runtime checks are added to
+ // make sure the invariant address does not alias with any other objects.
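+ //
+ // E.g. (a sketch of the handled pattern, in C):
+ //   for (int i = 0; i < n; i++) { sum += src[i]; dst[42] = sum; }
+ // The store to &dst[42] is sunk out of the loop, and runtime checks
+ // guard against 'dst' aliasing 'src'.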
+ if (!LAI->getStoresToInvariantAddresses().empty()) {
+ // For each invariant address, check that its final reduction value is stored unconditionally.
+ for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+ if (isInvariantStoreOfReduction(SI) &&
+ blockNeedsPredication(SI->getParent())) {
+ reportVectorizationFailure(
+ "We don't allow storing to uniform addresses",
+ "write of conditional recurring variant value to a loop "
+ "invariant address could not be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ }
+
+ if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
+ // For each invariant address, check that its last stored value is the
+ // result of one of our reductions.
+ //
+ // We do not check for dependences with loads because they are currently
+ // rejected earlier in LoopAccessInfo::analyzeLoop. If this behaviour
+ // changes we have to modify this code.
+ ScalarEvolution *SE = PSE.getSE();
+ SmallVector<StoreInst *, 4> UnhandledStores;
+ for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+ if (isInvariantStoreOfReduction(SI)) {
+ // Earlier stores to this address are effectively dead code.
+ // With opaque pointers it is possible for one pointer to be used with
+ // different sizes of stored values:
+ // store i32 0, ptr %x
+ // store i8 0, ptr %x
+ // The latter store doesn't completely overwrite the first one in this
+ // example, which is why we have to make sure that the types of the
+ // stored values are the same.
+ // TODO: Check that the bitwidth of the unhandled store is smaller than
+ // the one that overwrites it, and add a test.
+ erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
+ return storeToSameAddress(SE, SI, I) &&
+ I->getValueOperand()->getType() ==
+ SI->getValueOperand()->getType();
+ });
+ continue;
+ }
+ UnhandledStores.push_back(SI);
+ }
+
+ bool IsOK = UnhandledStores.empty();
+ // TODO: we should also validate against InvariantMemSets.
+ if (!IsOK) {
+ reportVectorizationFailure(
+ "We don't allow storing to uniform addresses",
+ "write to a loop invariant address could not "
+ "be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ }
}
Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
// We can now only vectorize if all reductions with Exact FP math also
// have the isOrdered flag set, which indicates that we can move the
- // reduction operations in-loop.
+ // reduction operations in-loop, and do not have an intermediate store.
return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
const RecurrenceDescriptor &RdxDesc = Reduction.second;
- return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
+ return !RdxDesc.hasExactFPMath() ||
+ (RdxDesc.isOrdered() && !RdxDesc.IntermediateStore);
}));
}
+bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
+ return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RdxDesc.IntermediateStore == SI;
+ });
+}
+
+bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
+ return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ if (!RdxDesc.IntermediateStore)
+ return false;
+
+ ScalarEvolution *SE = PSE.getSE();
+ Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
+ return V == InvariantAddress ||
+ SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
+ });
+}
+
bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
Value *In0 = const_cast<Value *>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
// Set the resume value for this reduction
ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
+ // If there were stores of the reduction value to a uniform memory address
+ // inside the loop, create the final store here.
+ if (StoreInst *SI = RdxDesc.IntermediateStore) {
+ StoreInst *NewSI =
+ Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
+ propagateMetadata(NewSI, SI);
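+
+ // E.g. for an integer add reduction this emits, in the middle block
+ // (a sketch; value names are illustrative):
+ //   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec.phi)
+ //   store i32 %rdx, i32* %gep.dst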
+
+ // If the reduction value is used in other places,
+ // let the code below create PHIs for it.
+ }
+
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+ // Find all stores to invariant addresses. Since they are going to sink
+ // outside the loop we do not need to calculate their cost.
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ if (auto *SI = dyn_cast<StoreInst>(&I))
+ if (Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+ ValuesToIgnore.insert(&I);
+ }
+
// Ignore type-promoting instructions we identified during reduction
// detection.
for (auto &Reduction : Legal->getReductionVars()) {
continue;
}
+ // Invariant stores inside the loop will be deleted, and a single store
+ // with the final reduction value will be added to the exit block.
+ if (auto *SI = dyn_cast<StoreInst>(&I))
+ if (Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+ continue;
+
// Otherwise, if all widening options failed, Instruction is to be
// replicated. This may create a successor for VPBB.
VPBasicBlock *NextVPBB =
getCondOp()->printAsOperand(O, SlotTracker);
}
O << ")";
+ if (RdxDesc->IntermediateStore)
+ O << " (with final reduction value stored in invariant address sunk "
+ "outside of loop)";
}
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
; ADD (with reduction stored in invariant address)
-; CHECK-REMARK: loop not vectorized: value that could not be identified as reduction is used outside the loop
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 2)
define void @invariant_store(i32* %dst, i32* readonly %src) {
; CHECK-LABEL: @invariant_store
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 4 x i32>
+; CHECK: %[[ADD1:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD2]]
+; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD1]]
+; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD2]]
+; CHECK: middle.block:
+; CHECK: %[[ADD:.*]] = add <vscale x 4 x i32> %[[ADD2]], %[[ADD1]]
+; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[ADD]])
+; CHECK-NEXT: store i32 %[[SUM]], i32* %gep.dst, align 4
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
store i32 0, i32* %gep.dst, align 4
; dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: store i32 [[TMP6]], i32* [[GEP_DST:%.*]], align 4
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
define void @reduc_store(i32* %dst, i32* readonly %src) {
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
; dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_fadd_fast
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK: phi <4 x float>
+; CHECK: load <4 x float>
+; CHECK: fadd fast <4 x float>
+; CHECK-NOT: store float %{{[0-9]+}}, float* %gep.dst
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32
+; CHECK-NEXT: store float %{{[0-9]+}}, float* %gep.dst
define void @reduc_store_fadd_fast(float* %dst, float* readonly %src) {
entry:
%gep.dst = getelementptr inbounds float, float* %dst, i64 42
; dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_inside_unrolled
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 2, i64 4, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3
+; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP19]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP21]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP23]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP25]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP27]], i32 1
+; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP28]], i32 2
+; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 3
+; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP33]], [[TMP16]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
+; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP34]])
+; CHECK-NEXT: store i32 [[TMP36]], i32* [[GEP_DST:%.*]], align 4
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 500, 500
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
define void @reduc_store_inside_unrolled(i32* %dst, i32* readonly %src) {
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ]
+ %sum = phi i32 [ 0, %entry ], [ %sum.2, %for.body ]
%gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
%0 = load i32, i32* %gep.src, align 4
%sum.1 = add nsw i32 %0, %sum
ret void
}
+; Check that we cannot vectorize the loop if the stored value is not the
+; final reduction value.
+;
+; int sum = 0;
+; for(int i=0; i < 1000; i++) {
+; sum += src[i];
+; dst[42] = sum + 1;
+; }
+; CHECK-LABEL: @reduc_store_not_final_value
+; CHECK-NOT: vector.body:
+define void @reduc_store_not_final_value(i32* %dst, i32* readonly %src) {
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ store i32 0, i32* %gep.dst, align 4
+ br label %for.body
+
+for.body:
+ %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
+ %0 = load i32, i32* %gep.src, align 4
+ %add = add nsw i32 %sum, %0
+ %sum_plus_one = add i32 %add, 1
+ store i32 %sum_plus_one, i32* %gep.dst, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
; We cannot vectorize if two (or more) invariant stores exist in a loop.
;
; int sum = 0;
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ]
+ %sum = phi i32 [ 0, %entry ], [ %sum.2, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%sum.1 = add nsw i32 %0, %sum
; dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_middle_store_predicated
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.dst
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32
+; CHECK-NEXT: store i32 [[TMP]], i32* %gep.dst
+; CHECK: ret void
define void @reduc_store_middle_store_predicated(i32* %dst, i32* readonly %src) {
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
ret void
}
+; Final reduction value is overwritten inside the loop.
+;
+; for(int i=0; i < 1000; i++) {
+; sum += src[i];
+; dst[42] = sum;
+; dst[42] = 0;
+; }
+; CHECK-LABEL: @reduc_store_final_store_overwritten
+; CHECK-NOT: vector.body:
+define void @reduc_store_final_store_overwritten(i32* %dst, i32* readonly %src) {
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ br label %for.body
+
+for.body:
+ %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
+ %0 = load i32, i32* %gep.src, align 4
+ %add = add nsw i32 %sum, %0
+ store i32 %add, i32* %gep.dst, align 4
+ store i32 0, i32* %gep.dst, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
; Final value used outside of loop does not prevent vectorization
;
; int sum = 0;
; }
; dst[43] = sum;
; CHECK-LABEL: @reduc_store_inoutside
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.src
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32
+; CHECK-NEXT: store i32 [[TMP]], i32* %gep.dst
+; CHECK: exit:
+; CHECK: [[PHI:%.*]] = phi i32 [ [[TMP1:%.*]], %for.body ], [ [[TMP2:%.*]], %middle.block ]
+; CHECK: [[ADDR:%.*]] = getelementptr inbounds i32, i32* %dst, i64 43
+; CHECK: store i32 [[PHI]], i32* [[ADDR]]
+; CHECK: ret void
define void @reduc_store_inoutside(i32* %dst, i32* readonly %src) {
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
ret float %red.next
}
+define void @print_reduction_with_invariant_store(i64 %n, float* noalias %y, float* noalias %dst) {
+; CHECK-LABEL: Checking a loop in 'print_reduction_with_invariant_store'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next>
+; CHECK-NEXT: vp<[[IV:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<0>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, vp<[[IV]]>
+; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx>
+; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) (with final reduction value stored in invariant address sunk outside of loop)
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF +(nuw) vp<[[CAN_IV]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]> vp<[[VEC_TC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+ %red = phi float [ %red.next, %for.body ], [ 0.0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %iv
+ %lv = load float, float* %arrayidx, align 4
+ %red.next = fadd fast float %lv, %red
+ store float %red.next, float* %dst, align 4
+ %iv.next = add i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
define void @print_replicate_predicated_phi(i64 %n, i64* %x) {
; CHECK-LABEL: Checking a loop in 'print_replicate_predicated_phi'
; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {