// that the cost model has chosen to ignore because they will not be
// vectorized.
void vectorize(LoopVectorizationLegality *L,
- const MapVector<Instruction *, uint64_t> &MinimumBitWidths,
- SmallPtrSetImpl<const Value *> &VecValuesToIgnore) {
+ const MapVector<Instruction *, uint64_t> &MinimumBitWidths) {
MinBWs = &MinimumBitWidths;
- ValuesNotWidened = &VecValuesToIgnore;
Legal = L;
// Create a new empty loop. Unlink the old loop and connect the new one.
createEmptyLoop();
/// to this type.
const MapVector<Instruction *, uint64_t> *MinBWs;
- /// A set of values that should not be widened. This is taken from
- /// VecValuesToIgnore in the cost model.
- SmallPtrSetImpl<const Value *> *ValuesNotWidened;
-
LoopVectorizationLegality *Legal;
// Record whether runtime checks are added.
/// Returns true if the value V is uniform within the loop.
bool isUniform(Value *V);
- /// Returns true if this instruction will remain scalar after vectorization.
+ /// Returns true if \p I is known to be uniform after vectorization.
bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }
+ /// Returns true if \p I is known to be scalar after vectorization.
+ bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); }
+
/// Returns the information that we collected about runtime memory check.
const RuntimePointerChecking *getRuntimePointerChecking() const {
return LAI->getRuntimePointerChecking();
/// transformation.
bool canVectorizeWithIfConvert();
- /// Collect the variables that need to stay uniform after vectorization.
+ /// Collect the instructions that are uniform after vectorization. An
+ /// instruction is uniform if we represent it with a single scalar value in
+ /// the vectorized loop corresponding to each vector iteration. Examples of
+ /// uniform instructions include pointer operands of consecutive or
+ /// interleaved memory accesses. Note that although uniformity implies an
+ /// instruction will be scalar, the reverse is not true. In general, a
+ /// scalarized instruction will be represented by VF scalar values in the
+ /// vectorized loop, each corresponding to an iteration of the original
+ /// scalar loop.
void collectLoopUniforms();
+ /// Collect the instructions that are scalar after vectorization. An
+ /// instruction is scalar if it is known to be uniform or will be scalarized
+ /// during vectorization. Non-uniform scalarized instructions will be
+ /// represented by VF values in the vectorized loop, each corresponding to an
+ /// iteration of the original scalar loop.
+ void collectLoopScalars();
+
/// Return true if all of the instructions in the block can be speculatively
/// executed. \p SafePtrs is a list of addresses that are known to be legal
/// and we know that we can read from them without segfault.
/// Allowed outside users. This holds the induction and reduction
/// vars which can be accessed from outside the loop.
SmallPtrSet<Value *, 4> AllowedExit;
- /// This set holds the variables which are known to be uniform after
- /// vectorization.
+
+ /// Holds the instructions known to be uniform after vectorization.
SmallPtrSet<Instruction *, 4> Uniforms;
+ /// Holds the instructions known to be scalar after vectorization.
+ SmallPtrSet<Instruction *, 4> Scalars;
+
/// Can we assume the absence of NaNs.
bool HasFunNoNaNAttr;
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (VF > 1 && IV->getType() == Induction->getType() && Step &&
- !ValuesNotWidened->count(IV))
+ !Legal->isScalarAfterVectorization(IV))
return createVectorIntInductionPHI(ID, Entry, TruncType);
// The scalar value to broadcast. This will be derived from the canonical
// addition of the scalar steps will not increase the number of instructions
// in the loop in the common case prior to InstCombine. We will be trading
// one vector extract for each scalar step.
- if (VF > 1 && ValuesNotWidened->count(IV)) {
+ if (VF > 1 && Legal->isScalarAfterVectorization(IV)) {
auto *EntryVal = Trunc ? cast<Value>(Trunc) : IV;
buildScalarSteps(ScalarIV, Step, EntryVal);
}
return false;
}
- // Collect all of the variables that remain uniform after vectorization.
- collectLoopUniforms();
-
DEBUG(dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerChecking()->Need
? " (with a runtime bound check)"
if (UseInterleaved)
InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
+ // Collect all instructions that are known to be uniform after vectorization.
+ collectLoopUniforms();
+
+ // Collect all instructions that are known to be scalar after vectorization.
+ collectLoopScalars();
+
unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
return true;
}
+void LoopVectorizationLegality::collectLoopScalars() {
+
+ // If an instruction is uniform after vectorization, it will remain scalar.
+ Scalars.insert(Uniforms.begin(), Uniforms.end());
+
+ // Collect the getelementptr instructions that will not be vectorized. A
+ // getelementptr instruction is only vectorized if it is used for a legal
+ // gather or scatter operation.
+ for (auto *BB : TheLoop->blocks())
+ for (auto &I : *BB) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ Scalars.insert(GEP);
+ continue;
+ }
+ auto *Ptr = getPointerOperand(&I);
+ if (!Ptr)
+ continue;
+ auto *GEP = getGEPInstruction(Ptr);
+ if (GEP && isLegalGatherOrScatter(&I))
+ Scalars.erase(GEP);
+ }
+
+ // An induction variable will remain scalar if all users of the induction
+ // variable and induction variable update remain scalar.
+ auto *Latch = TheLoop->getLoopLatch();
+ for (auto &Induction : *getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // Determine if all users of the induction variable are scalar after
+ // vectorization.
+ auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I);
+ });
+ if (!ScalarInd)
+ continue;
+
+ // Determine if all users of the induction variable update instruction are
+ // scalar after vectorization.
+ auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Scalars.count(I);
+ });
+ if (!ScalarIndUpdate)
+ continue;
+
+ // The induction variable and its update instruction will remain scalar.
+ Scalars.insert(Ind);
+ Scalars.insert(IndUpdate);
+ }
+}
+
void LoopVectorizationLegality::collectLoopUniforms() {
// We now know that the loop is vectorizable!
// Collect variables that will remain uniform after vectorization.
DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
}
- // Also add all consecutive pointer values; these values will be uniform
- // after vectorization (and subsequent cleanup).
- for (auto *BB : TheLoop->blocks()) {
+ // Add all consecutive pointer values; these values will be uniform after
+ // vectorization (and subsequent cleanup). Although non-consecutive, we also
+ // add the pointer operands of interleaved accesses since they are treated
+ // like consecutive pointers during vectorization.
+ for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
- if (I.getType()->isPointerTy() && isConsecutivePtr(&I)) {
- Worklist.insert(&I);
- DEBUG(dbgs() << "LV: Found uniform instruction: " << I << "\n");
- }
+ Instruction *Ptr = nullptr;
+ if (I.getType()->isPointerTy() && isConsecutivePtr(&I))
+ Ptr = &I;
+ else if (isAccessInterleaved(&I))
+ Ptr = cast<Instruction>(getPointerOperand(&I));
+ else
+ continue;
+ Worklist.insert(Ptr);
+ DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ptr << "\n");
}
- }
// Expand Worklist in topological order: whenever a new instruction
// is added , its users should be either already inside Worklist, or
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
- // Insert uniform instruction into VecValuesToIgnore.
- // Collect non-gather/scatter and non-consecutive ptr in NonConsecutivePtr.
- SmallPtrSet<Instruction *, 8> NonConsecutivePtr;
- for (auto *BB : TheLoop->getBlocks()) {
- for (auto &I : *BB) {
- if (Legal->isUniformAfterVectorization(&I))
+ // Insert values known to be scalar into VecValuesToIgnore.
+ for (auto *BB : TheLoop->getBlocks())
+ for (auto &I : *BB)
+ if (Legal->isScalarAfterVectorization(&I))
VecValuesToIgnore.insert(&I);
- Instruction *PI = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
- if (PI && !Legal->isConsecutivePtr(PI) &&
- !Legal->isLegalGatherOrScatter(&I))
- NonConsecutivePtr.insert(PI);
- }
- }
-
- // Ignore induction phis that are either used in uniform instructions or
- // NonConsecutivePtr.
- for (auto &Induction : *Legal->getInductionVars()) {
- auto *PN = Induction.first;
- auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
-
- if (std::all_of(PN->user_begin(), PN->user_end(),
- [&](User *U) -> bool {
- Instruction *UI = dyn_cast<Instruction>(U);
- return U == UpdateV || !TheLoop->contains(UI) ||
- Legal->isUniformAfterVectorization(UI) ||
- NonConsecutivePtr.count(UI);
- }) &&
- std::all_of(UpdateV->user_begin(), UpdateV->user_end(),
- [&](User *U) -> bool {
- Instruction *UI = dyn_cast<Instruction>(U);
- return U == PN || !TheLoop->contains(UI) ||
- Legal->isUniformAfterVectorization(UI) ||
- NonConsecutivePtr.count(UI);
- })) {
- VecValuesToIgnore.insert(PN);
- VecValuesToIgnore.insert(UpdateV);
- }
- }
}
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC);
- Unroller.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore);
+ Unroller.vectorize(&LVL, CM.MinBWs);
ORE->emitOptimizationRemark(LV_NAME, L,
Twine("interleaved loop (interleaved count: ") +
} else {
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC);
- LB.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore);
+ LB.vectorize(&LVL, CM.MinBWs);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there are
ret void
}
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors. (Interleaved accesses.)
+;
+; for (int i = 0; i < n; ++i)
+; p[i].f = a[i * 4]
+;
+; INTERLEAVE-LABEL: @scalarize_induction_variable_04(
+; INTERLEAVE: vector.body:
+; INTERLEAVE: %[[i0:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTERLEAVE: %[[i1:.+]] = or i64 %[[i0]], 1
+; INTERLEAVE: %[[i2:.+]] = or i64 %[[i0]], 2
+; INTERLEAVE: %[[i3:.+]] = or i64 %[[i0]], 3
+; INTERLEAVE: %[[i4:.+]] = or i64 %[[i0]], 4
+; INTERLEAVE: %[[i5:.+]] = or i64 %[[i0]], 5
+; INTERLEAVE: %[[i6:.+]] = or i64 %[[i0]], 6
+; INTERLEAVE: %[[i7:.+]] = or i64 %[[i0]], 7
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i0]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i1]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i2]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i3]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i4]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i5]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i6]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i7]], i32 1
+
+define void @scalarize_induction_variable_04(i32* %a, %pair* %p, i32 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry]
+ %0 = shl nsw i64 %i, 2
+ %1 = getelementptr inbounds i32, i32* %a, i64 %0
+ %2 = load i32, i32* %1, align 1
+ %3 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+ store i32 %2, i32* %3, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %4 = trunc i64 %i.next to i32
+ %cond = icmp eq i32 %4, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
; Make sure that the loop exit count computation does not overflow for i8 and
; i16. The exit count of these loops is i8/i16 max + 1. If we don't cast the
; induction variable to a bigger type the exit count computation will overflow