class PredicatedScalarEvolution;
class ScalarEvolution;
class SCEV;
+class StoreInst;
/// These are the kinds of recurrences that we support.
enum class RecurKind {
public:
RecurrenceDescriptor() = default;
- RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K,
- FastMathFlags FMF, Instruction *ExactFP, Type *RT,
- bool Signed, bool Ordered,
+ RecurrenceDescriptor(Value *Start, Instruction *Exit, StoreInst *Store,
+ RecurKind K, FastMathFlags FMF, Instruction *ExactFP,
+ Type *RT, bool Signed, bool Ordered,
SmallPtrSetImpl<Instruction *> &CI,
unsigned MinWidthCastToRecurTy)
- : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF),
- ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed),
- IsOrdered(Ordered),
+ : IntermediateStore(Store), StartValue(Start), LoopExitInstr(Exit),
+ Kind(K), FMF(FMF), ExactFPMathInst(ExactFP), RecurrenceType(RT),
+ IsSigned(Signed), IsOrdered(Ordered),
MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) {
CastInsts.insert(CI.begin(), CI.end());
}
/// RecurrenceDescriptor. If either \p DB is non-null or \p AC and \p DT are
/// non-null, the minimal bit width needed to compute the reduction will be
/// computed.
- static bool AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop,
- FastMathFlags FuncFMF,
- RecurrenceDescriptor &RedDes,
- DemandedBits *DB = nullptr,
- AssumptionCache *AC = nullptr,
- DominatorTree *DT = nullptr);
+ static bool
+ AddReductionVar(PHINode *Phi, RecurKind Kind, Loop *TheLoop,
+ FastMathFlags FuncFMF, RecurrenceDescriptor &RedDes,
+ DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr,
+ DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr);
/// Returns true if Phi is a reduction in TheLoop. The RecurrenceDescriptor
/// is returned in RedDes. If either \p DB is non-null or \p AC and \p DT are
/// non-null, the minimal bit width needed to compute the reduction will be
- /// computed.
- static bool isReductionPHI(PHINode *Phi, Loop *TheLoop,
- RecurrenceDescriptor &RedDes,
- DemandedBits *DB = nullptr,
- AssumptionCache *AC = nullptr,
- DominatorTree *DT = nullptr);
+ /// computed. If \p SE is non-null, store instructions to loop invariant
+ /// addresses are processed.
+ static bool
+ isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes,
+ DemandedBits *DB = nullptr, AssumptionCache *AC = nullptr,
+ DominatorTree *DT = nullptr, ScalarEvolution *SE = nullptr);
/// Returns true if Phi is a first-order recurrence. A first-order recurrence
/// is a non-reduction recurrence relation in which the value of the
cast<IntrinsicInst>(I)->getIntrinsicID() == Intrinsic::fmuladd;
}
+ /// Reductions may store a temporary or the final result to an invariant
+ /// address. If there is such a store in the loop then, after a successful
+ /// run of the AddReductionVar method, this field will be assigned the last
+ /// store encountered.
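+ ///
+ /// For example (a sketch; value names are illustrative):
+ ///   for.body:
+ ///     %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
+ ///     ...
+ ///     %sum.next = add i32 %sum, %val
+ ///     store i32 %sum.next, i32* %gep.dst
+ /// Here the store of %sum.next to the loop-invariant address %gep.dst
+ /// would be recorded in this field.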
+ StoreInst *IntermediateStore = nullptr;
+
private:
// The starting value of the recurrence.
// It does not have to be zero!
return HasDependenceInvolvingLoopInvariantAddress;
}
+ /// Return the list of stores to invariant addresses.
+ ArrayRef<StoreInst *> getStoresToInvariantAddresses() const {
+ return StoresToInvariantAddresses;
+ }
+
/// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
/// them to a more usable form. All SCEV expressions during the analysis
/// should be re-written (and therefore simplified) according to PSE.
/// Indicator that there are non-vectorizable stores to a uniform address.
bool HasDependenceInvolvingLoopInvariantAddress = false;
+ /// List of stores to invariant addresses.
+ SmallVector<StoreInst *> StoresToInvariantAddresses;
+
/// The diagnostics report generated for the analysis. E.g. why we
/// couldn't analyze the loop.
std::unique_ptr<OptimizationRemarkAnalysis> Report;
/// Returns the widest induction type.
Type *getWidestInductionType() { return WidestIndTy; }
+ /// Returns true if the given store is the final invariant store of one of
+ /// the reductions found in the loop.
+ bool isInvariantStoreOfReduction(StoreInst *SI);
+
+ /// Returns true if the given address is invariant and is used to store a
+ /// recurrent expression.
+ bool isInvariantAddressOfReduction(Value *V);
+
/// Returns True if V is a Phi node of an induction variable in this loop.
bool isInductionPhi(const Value *V) const;
return true;
}
-bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
- Loop *TheLoop, FastMathFlags FuncFMF,
- RecurrenceDescriptor &RedDes,
- DemandedBits *DB,
- AssumptionCache *AC,
- DominatorTree *DT) {
+bool RecurrenceDescriptor::AddReductionVar(
+ PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF,
+ RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC,
+ DominatorTree *DT, ScalarEvolution *SE) {
if (Phi->getNumIncomingValues() != 2)
return false;
// This includes users of the reduction, variables (which form a cycle
// which ends in the phi node).
Instruction *ExitInstruction = nullptr;
+
+ // Variable to keep track of the last visited store instruction. By the end
+ // of the algorithm it will be either null or will point to the store of an
+ // intermediate reduction value to an invariant address.
+ StoreInst *IntermediateStore = nullptr;
+
// Indicates that we found a reduction operation in our scan.
bool FoundReduxOp = false;
// - By instructions outside of the loop (safe).
// * One value may have several outside users, but all outside
// uses must be of the same value.
+ // - By store instructions with a loop-invariant address (safe with
+ // the following restrictions):
+ // * If there are several stores, all must have the same address.
+ // * The final value must be stored to that loop-invariant address.
// - By an instruction that is not part of the reduction (not safe).
// This is either:
// * An instruction type other than PHI or the reduction operation.
while (!Worklist.empty()) {
Instruction *Cur = Worklist.pop_back_val();
+ // A store instruction is only allowed if it is a store of the reduction
+ // value to the same loop-invariant memory location.
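+ // E.g. (a sketch; value names are illustrative), storing the reduction
+ // value to two different invariant addresses, as in
+ //   store i32 %sum.next, i32* %dst.a
+ //   store i32 %sum.next, i32* %dst.b
+ // is rejected by the checks below.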
+ if (auto *SI = dyn_cast<StoreInst>(Cur)) {
+ if (!SE) {
+ LLVM_DEBUG(dbgs() << "Store instructions are not processed without "
+ << "Scalar Evolution Analysis\n");
+ return false;
+ }
+
+ const SCEV *PtrScev = SE->getSCEV(SI->getPointerOperand());
+ // Check that it is the same address as in previous stores.
+ if (IntermediateStore) {
+ const SCEV *OtherScev =
+ SE->getSCEV(IntermediateStore->getPointerOperand());
+
+ if (OtherScev != PtrScev) {
+ LLVM_DEBUG(dbgs() << "Storing reduction value to different addresses "
+ << "inside the loop: " << *SI->getPointerOperand()
+ << " and "
+ << *IntermediateStore->getPointerOperand() << '\n');
+ return false;
+ }
+ }
+
+ // Check that the pointer is loop-invariant.
+ if (!SE->isLoopInvariant(PtrScev, TheLoop)) {
+ LLVM_DEBUG(dbgs() << "Storing reduction value to non-uniform address "
+ << "inside the loop: " << *SI->getPointerOperand()
+ << '\n');
+ return false;
+ }
+
+ // IntermediateStore is always the last store in the loop.
+ IntermediateStore = SI;
+ continue;
+ }
+
// No Users.
// If the instruction has no users then this is a broken chain and can't be
// a reduction variable.
// reductions which are represented as a cmp followed by a select.
InstDesc IgnoredVal(false, nullptr);
if (VisitedInsts.insert(UI).second) {
- if (isa<PHINode>(UI))
+ if (isa<PHINode>(UI)) {
PHIs.push_back(UI);
- else
+ } else {
+ StoreInst *SI = dyn_cast<StoreInst>(UI);
+ if (SI && SI->getPointerOperand() == Cur) {
+ // The reduction variable chain can only be stored somewhere;
+ // it cannot be used as an address.
+ return false;
+ }
NonPHIs.push_back(UI);
+ }
} else if (!isa<PHINode>(UI) &&
((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
!isa<SelectInst>(UI)) ||
if (isSelectCmpRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1)
return false;
+ if (IntermediateStore) {
+ // Check that the stored value flows back into the phi node. This way we
+ // make sure that the value stored in IntermediateStore is indeed the
+ // final reduction value.
+ if (!is_contained(Phi->operands(), IntermediateStore->getValueOperand())) {
+ LLVM_DEBUG(dbgs() << "Not a final reduction value stored: "
+ << *IntermediateStore << '\n');
+ return false;
+ }
+
+ // If there is an exit instruction, its value should be stored in
+ // IntermediateStore.
+ if (ExitInstruction &&
+ IntermediateStore->getValueOperand() != ExitInstruction) {
+ LLVM_DEBUG(dbgs() << "Last store Instruction of reduction value does not "
+ "store last calculated value of the reduction: "
+ << *IntermediateStore << '\n');
+ return false;
+ }
+
+ // If all uses are inside the loop (intermediate stores), then the
+ // reduction value after the loop will be the one used in the last store.
+ if (!ExitInstruction)
+ ExitInstruction = cast<Instruction>(IntermediateStore->getValueOperand());
+ }
+
if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
return false;
// is saved as part of the RecurrenceDescriptor.
// Save the description of this reduction variable.
- RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst,
- RecurrenceType, IsSigned, IsOrdered, CastInsts,
- MinWidthCastToRecurrenceType);
+ RecurrenceDescriptor RD(RdxStart, ExitInstruction, IntermediateStore, Kind,
+ FMF, ExactFPMathInst, RecurrenceType, IsSigned,
+ IsOrdered, CastInsts, MinWidthCastToRecurrenceType);
RedDes = RD;
return true;
bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
RecurrenceDescriptor &RedDes,
DemandedBits *DB, AssumptionCache *AC,
- DominatorTree *DT) {
+ DominatorTree *DT,
+ ScalarEvolution *SE) {
BasicBlock *Header = TheLoop->getHeader();
Function &F = *Header->getParent();
FastMathFlags FMF;
FMF.setNoSignedZeros(
F.getFnAttribute("no-signed-zeros-fp-math").getValueAsBool());
- if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Add, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Mul, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Or, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::And, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::Xor, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a SMAX reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a SMIN reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a UMAX reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RecurKind::SelectICmp, TheLoop, FMF, RedDes, DB, AC,
- DT)) {
+ DT, SE)) {
LLVM_DEBUG(dbgs() << "Found an integer conditional select reduction PHI."
<< *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FAdd, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a float MAX reduction PHI." << *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n");
return true;
}
if (AddReductionVar(Phi, RecurKind::SelectFCmp, TheLoop, FMF, RedDes, DB, AC,
- DT)) {
+ DT, SE)) {
LLVM_DEBUG(dbgs() << "Found a float conditional select reduction PHI."
<< *Phi << "\n");
return true;
}
- if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC,
- DT)) {
+ if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, DT,
+ SE)) {
LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n");
return true;
}
for (StoreInst *ST : Stores) {
Value *Ptr = ST->getPointerOperand();
- if (isUniform(Ptr))
+ if (isUniform(Ptr)) {
+ // Record store instructions to loop-invariant addresses.
+ StoresToInvariantAddresses.push_back(ST);
HasDependenceInvolvingLoopInvariantAddress |=
!UniformStores.insert(Ptr).second;
+ }
// If we did *not* see this pointer before, insert it to the read-write
// list. At this phase it is only a 'write' list.
return false;
}
+/// Returns true if A and B have the same pointer operand or the same SCEV
+/// for their addresses.
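+/// For example (illustrative), two distinct GEP instructions with identical
+/// operands are different Values, but SCEV canonicalizes them to the same
+/// (uniqued) expression:
+///   %p = getelementptr inbounds i32, i32* %base, i64 42
+///   %q = getelementptr inbounds i32, i32* %base, i64 42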
+static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
+ StoreInst *B) {
+ // Compare the store instructions themselves.
+ if (A == B)
+ return true;
+
+ // Otherwise compare the pointer operands.
+ Value *APtr = A->getPointerOperand();
+ Value *BPtr = B->getPointerOperand();
+ if (APtr == BPtr)
+ return true;
+
+ // Otherwise compare the address SCEVs.
+ if (SE->getSCEV(APtr) == SE->getSCEV(BPtr))
+ return true;
+
+ return false;
+}
+
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
Value *Ptr) const {
const ValueToValueMap &Strides =
RecurrenceDescriptor RedDes;
if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
- DT)) {
+ DT, PSE.getSE())) {
Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
AllowedExit.insert(RedDes.getLoopExitInstr());
Reductions[Phi] = RedDes;
if (!LAI->canVectorizeMemory())
return false;
- if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
- reportVectorizationFailure("Stores to a uniform address",
- "write to a loop invariant address could not be vectorized",
- "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
- return false;
+ // We can vectorize stores to an invariant address when the final reduction
+ // value is guaranteed to be stored there at the end of the loop. Also, once
+ // the decision to vectorize the loop is made, runtime checks are added to
+ // make sure the invariant address does not alias with any other objects.
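+ //
+ // E.g. (a sketch of the handled pattern, in C):
+ //   for (int i = 0; i < n; i++) { sum += src[i]; dst[42] = sum; }
+ // The store to &dst[42] is sunk out of the loop, and runtime checks
+ // guard against 'dst' aliasing 'src'.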
+ if (!LAI->getStoresToInvariantAddresses().empty()) {
+ // For each invariant address, check that its final reduction value is stored unconditionally.
+ for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+ if (isInvariantStoreOfReduction(SI) &&
+ blockNeedsPredication(SI->getParent())) {
+ reportVectorizationFailure(
+ "We don't allow storing to uniform addresses",
+ "write of conditional recurring variant value to a loop "
+ "invariant address could not be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ }
+
+ if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
+ // For each invariant address, check that its last stored value is the
+ // result of one of our reductions.
+ //
+ // We do not check for dependences with loads because they are currently
+ // rejected earlier in LoopAccessInfo::analyzeLoop. If this behaviour
+ // changes we have to modify this code.
+ ScalarEvolution *SE = PSE.getSE();
+ SmallVector<StoreInst *, 4> UnhandledStores;
+ for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
+ if (isInvariantStoreOfReduction(SI)) {
+ // Earlier stores to this address are effectively dead code.
+ // With opaque pointers it is possible for one pointer to be used with
+ // different sizes of stored values:
+ // store i32 0, ptr %x
+ // store i8 0, ptr %x
+ // The latter store doesn't completely overwrite the first one in this
+ // example, which is why we have to make sure that the types of the
+ // stored values are the same.
+ // TODO: Check that the bitwidth of the unhandled store is smaller than
+ // the one that overwrites it, and add a test.
+ erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
+ return storeToSameAddress(SE, SI, I) &&
+ I->getValueOperand()->getType() ==
+ SI->getValueOperand()->getType();
+ });
+ continue;
+ }
+ UnhandledStores.push_back(SI);
+ }
+
+ bool IsOK = UnhandledStores.empty();
+ // TODO: we should also validate against InvariantMemSets.
+ if (!IsOK) {
+ reportVectorizationFailure(
+ "We don't allow storing to uniform addresses",
+ "write to a loop invariant address could not "
+ "be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ }
}
Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
// We can now only vectorize if all reductions with Exact FP math also
// have the isOrdered flag set, which indicates that we can move the
- // reduction operations in-loop.
+ // reduction operations in-loop, and do not have an intermediate store.
return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
const RecurrenceDescriptor &RdxDesc = Reduction.second;
- return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
+ return !RdxDesc.hasExactFPMath() ||
+ (RdxDesc.isOrdered() && !RdxDesc.IntermediateStore);
}));
}
+bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
+ return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return RdxDesc.IntermediateStore == SI;
+ });
+}
+
+bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
+ return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ if (!RdxDesc.IntermediateStore)
+ return false;
+
+ ScalarEvolution *SE = PSE.getSE();
+ Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
+ return V == InvariantAddress ||
+ SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
+ });
+}
+
bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
Value *In0 = const_cast<Value *>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
// Set the resume value for this reduction
ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
+ // If there were stores of the reduction value to a uniform memory address
+ // inside the loop, create the final store here.
+ if (StoreInst *SI = RdxDesc.IntermediateStore) {
+ StoreInst *NewSI =
+ Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
+ propagateMetadata(NewSI, SI);
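+
+ // E.g. for an integer add reduction this emits, in the middle block
+ // (a sketch; value names are illustrative):
+ //   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec.phi)
+ //   store i32 %rdx, i32* %gep.dst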
+
+ // If the reduction value is used in other places,
+ // let the code below create PHIs for it.
+ }
+
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+ // Find all stores to invariant addresses. Since they are going to sink
+ // outside the loop we do not need to calculate their cost.
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ if (auto *SI = dyn_cast<StoreInst>(&I))
+ if (Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+ ValuesToIgnore.insert(&I);
+ }
+
// Ignore type-promoting instructions we identified during reduction
// detection.
for (auto &Reduction : Legal->getReductionVars()) {
continue;
}
+ // Invariant stores inside the loop will be deleted, and a single store
+ // with the final reduction value will be added to the exit block.
+ if (auto *SI = dyn_cast<StoreInst>(&I))
+ if (Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
+ continue;
+
// Otherwise, if all widening options failed, Instruction is to be
// replicated. This may create a successor for VPBB.
VPBasicBlock *NextVPBB =
getCondOp()->printAsOperand(O, SlotTracker);
}
O << ")";
+ if (RdxDesc->IntermediateStore)
+ O << " (with final reduction value stored in invariant address sunk "
+ "outside of loop)";
}
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
; ADD (with reduction stored in invariant address)
-; CHECK-REMARK: loop not vectorized: value that could not be identified as reduction is used outside the loop
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 2)
define void @invariant_store(i32* %dst, i32* readonly %src) {
; CHECK-LABEL: @invariant_store
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 4 x i32>
+; CHECK: %[[ADD1:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD2]]
+; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD1]]
+; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[ADD2]]
+; CHECK: middle.block:
+; CHECK: %[[ADD:.*]] = add <vscale x 4 x i32> %[[ADD2]], %[[ADD1]]
+; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[ADD]])
+; CHECK-NEXT: store i32 %[[SUM]], i32* %gep.dst, align 4
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
store i32 0, i32* %gep.dst, align 4
; dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: store i32 [[TMP6]], i32* [[GEP_DST:%.*]], align 4
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
define void @reduc_store(i32* %dst, i32* readonly %src) {
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
; dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_fadd_fast
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK: phi <4 x float>
+; CHECK: load <4 x float>
+; CHECK: fadd fast <4 x float>
+; CHECK-NOT: store float %{{[0-9]+}}, float* %gep.dst
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32
+; CHECK-NEXT: store float %{{[0-9]+}}, float* %gep.dst
define void @reduc_store_fadd_fast(float* %dst, float* readonly %src) {
entry:
%gep.dst = getelementptr inbounds float, float* %dst, i64 42
; dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_inside_unrolled
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 2, i64 4, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3
+; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP19]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP21]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP23]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP25]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP27]], i32 1
+; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP28]], i32 2
+; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 3
+; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP33]], [[TMP16]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
+; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP34]])
+; CHECK-NEXT: store i32 [[TMP36]], i32* [[GEP_DST:%.*]], align 4
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 500, 500
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
define void @reduc_store_inside_unrolled(i32* %dst, i32* readonly %src) {
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ]
+ %sum = phi i32 [ 0, %entry ], [ %sum.2, %for.body ]
%gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
%0 = load i32, i32* %gep.src, align 4
%sum.1 = add nsw i32 %0, %sum
ret void
}
+; Check that we cannot vectorize the loop if the stored value is not the
+; final reduction value.
+;
+; int sum = 0;
+; for(int i=0; i < 1000; i++) {
+; sum += src[i];
+; dst[42] = sum + 1;
+; }
+; CHECK-LABEL: @reduc_store_not_final_value
+; CHECK-NOT: vector.body:
+define void @reduc_store_not_final_value(i32* %dst, i32* readonly %src) {
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ store i32 0, i32* %gep.dst, align 4
+ br label %for.body
+
+for.body:
+ %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
+ %0 = load i32, i32* %gep.src, align 4
+ %add = add nsw i32 %sum, %0
+ %sum_plus_one = add i32 %add, 1
+ store i32 %sum_plus_one, i32* %gep.dst, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
; We cannot vectorize if two (or more) invariant stores exist in a loop.
;
; int sum = 0;
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ]
+ %sum = phi i32 [ 0, %entry ], [ %sum.2, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%sum.1 = add nsw i32 %0, %sum
; dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_middle_store_predicated
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.dst
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32
+; CHECK-NEXT: store i32 [[TMP]], i32* %gep.dst
+; CHECK: ret void
define void @reduc_store_middle_store_predicated(i32* %dst, i32* readonly %src) {
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
ret void
}
+; Final reduction value is overwritten inside the loop.
+;
+; for(int i=0; i < 1000; i++) {
+; sum += src[i];
+; dst[42] = sum;
+; dst[42] = 0;
+; }
+; CHECK-LABEL: @reduc_store_final_store_overwritten
+; CHECK-NOT: vector.body:
+define void @reduc_store_final_store_overwritten(i32* %dst, i32* readonly %src) {
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ br label %for.body
+
+for.body:
+ %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
+ %0 = load i32, i32* %gep.src, align 4
+ %add = add nsw i32 %sum, %0
+ store i32 %add, i32* %gep.dst, align 4
+ store i32 0, i32* %gep.dst, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
; Final value used outside of loop does not prevent vectorization
;
; int sum = 0;
; }
; dst[43] = sum;
; CHECK-LABEL: @reduc_store_inoutside
-; CHECK-NOT: vector.body
+; CHECK: vector.body:
+; CHECK-NOT: store i32 %{{[0-9]+}}, i32* %gep.src
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32
+; CHECK-NEXT: store i32 [[TMP]], i32* %gep.dst
+; CHECK: exit:
+; CHECK: [[PHI:%.*]] = phi i32 [ [[TMP1:%.*]], %for.body ], [ [[TMP2:%.*]], %middle.block ]
+; CHECK: [[ADDR:%.*]] = getelementptr inbounds i32, i32* %dst, i64 43
+; CHECK: store i32 [[PHI]], i32* [[ADDR]]
+; CHECK: ret void
define void @reduc_store_inoutside(i32* %dst, i32* readonly %src) {
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
ret float %red.next
}
+define void @print_reduction_with_invariant_store(i64 %n, float* noalias %y, float* noalias %dst) {
+; CHECK-LABEL: Checking a loop in 'print_reduction_with_invariant_store'
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next>
+; CHECK-NEXT: vp<[[IV:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<0>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, vp<[[IV]]>
+; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx>
+; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) (with final reduction value stored in invariant address sunk outside of loop)
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF +(nuw) vp<[[CAN_IV]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]> vp<[[VEC_TC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+ %red = phi float [ %red.next, %for.body ], [ 0.0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %iv
+ %lv = load float, float* %arrayidx, align 4
+ %red.next = fadd fast float %lv, %red
+ store float %red.next, float* %dst, align 4
+ %iv.next = add i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
define void @print_replicate_predicated_phi(i64 %n, i64* %x) {
; CHECK-LABEL: Checking a loop in 'print_replicate_predicated_phi'
; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {