/// induction variable will first be truncated to the corresponding type.
void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+ /// Returns true if an instruction \p I should be scalarized instead of
+ /// vectorized for the chosen vectorization factor.
+ bool shouldScalarizeInstruction(Instruction *I) const;
+
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
return MinBWs;
}
+ /// \returns True if it is more profitable to scalarize instruction \p I for
+ /// vectorization factor \p VF.
+ bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+ auto Scalars = InstsToScalarize.find(VF);
+ assert(Scalars != InstsToScalarize.end() &&
+ "VF not yet analyzed for scalarization profitability");
+ return Scalars->second.count(I);
+ }
+
private:
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// to this type.
MapVector<Instruction *, uint64_t> MinBWs;
+ /// A type representing the costs for instructions if they were to be
+ /// scalarized rather than vectorized. The entries are Instruction-Cost
+ /// pairs.
+ typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
+
+ /// A map holding scalar costs for different vectorization factors. The
+ /// presence of a cost for an instruction in the mapping indicates that the
+ /// instruction will be scalarized when vectorizing with the associated
+  /// vectorization factor. The entries are VF-ScalarCostsTy pairs.
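+  /// For example, isProfitableToScalarize(I, 2) returns true exactly when
+  /// this map contains an entry for instruction I under VF = 2.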
+ DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+
+ /// Returns the expected difference in cost from scalarizing the expression
+ /// feeding a predicated instruction \p PredInst. The instructions to
+ /// scalarize and their scalar costs are collected in \p ScalarCosts. A
+ /// non-negative return value implies the expression will be scalarized.
+ /// Currently, only single-use chains are considered for scalarization.
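+  /// For example, a single-use add feeding a predicated udiv may be
+  /// scalarized and sunk into the udiv's predicated block when the scalar
+  /// add and udiv are estimated to cost less than their vector forms.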
+ int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
+ unsigned VF);
+
+ /// Collects the instructions to scalarize for each predicated instruction in
+ /// the loop.
+ void collectInstsToScalarize(unsigned VF);
+
public:
/// The loop that we evaluate.
Loop *TheLoop;
VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
+bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
+ return Legal->isScalarAfterVectorization(I) ||
+ Cost->isProfitableToScalarize(I, VF);
+}
+
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
- if (Legal->isScalarAfterVectorization(IV))
+ if (shouldScalarizeInstruction(IV))
return true;
auto isScalarInst = [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return (OrigLoop->contains(I) && Legal->isScalarAfterVectorization(I));
+ return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
};
return any_of(IV->users(), isScalarInst);
}
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (VF > 1 && IV->getType() == Induction->getType() && Step &&
- !Legal->isScalarAfterVectorization(EntryVal)) {
+ !shouldScalarizeInstruction(EntryVal)) {
createVectorIntInductionPHI(ID, EntryVal);
VectorizedIV = true;
}
continue;
// Scalarize instructions that should remain scalar after vectorization.
- if (!(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
+ if (VF > 1 &&
+ !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
isa<DbgInfoIntrinsic>(&I)) &&
- Legal->isScalarAfterVectorization(&I)) {
- scalarizeInstruction(&I);
+ shouldScalarizeInstruction(&I)) {
+ scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
continue;
}
DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
Factor.Width = UserVF;
+ collectInstsToScalarize(UserVF);
return Factor;
}
return RUs;
}
+void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
+
+ // If we aren't vectorizing the loop, or if we've already collected the
+ // instructions to scalarize, there's nothing to do. Collection may already
+ // have occurred if we have a user-selected VF and are now computing the
+ // expected cost for interleaving.
+ if (VF < 2 || InstsToScalarize.count(VF))
+ return;
+
+  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
+ // not profitable to scalarize any instructions, the presence of VF in the
+ // map will indicate that we've analyzed it already.
+ ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
+
+  // Find all the instructions in the loop that are scalar with predication and
+  // determine whether it would be better not to if-convert the blocks they are
+  // in. If so, we also record the instructions to scalarize.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (!Legal->blockNeedsPredication(BB))
+ continue;
+ for (Instruction &I : *BB)
+ if (Legal->isScalarWithPredication(&I)) {
+ ScalarCostsTy ScalarCosts;
+ if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+ ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+ }
+ }
+}
+
+int LoopVectorizationCostModel::computePredInstDiscount(
+ Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
+ unsigned VF) {
+
+ assert(!Legal->isUniformAfterVectorization(PredInst) &&
+ "Instruction marked uniform-after-vectorization will be predicated");
+
+ // Initialize the discount to zero, meaning that the scalar version and the
+ // vector version cost the same.
+ int Discount = 0;
+
+ // Holds instructions to analyze. The instructions we visit are mapped in
+ // ScalarCosts. Those instructions are the ones that would be scalarized if
+ // we find that the scalar version costs less.
+ SmallVector<Instruction *, 8> Worklist;
+
+ // Returns true if the given instruction can be scalarized.
+ auto canBeScalarized = [&](Instruction *I) -> bool {
+
+ // We only attempt to scalarize instructions forming a single-use chain
+ // from the original predicated block that would otherwise be vectorized.
+ // Although not strictly necessary, we give up on instructions we know will
+ // already be scalar to avoid traversing chains that are unlikely to be
+ // beneficial.
+ if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
+ Legal->isScalarAfterVectorization(I))
+ return false;
+
+ // If the instruction is scalar with predication, it will be analyzed
+ // separately. We ignore it within the context of PredInst.
+ if (Legal->isScalarWithPredication(I))
+ return false;
+
+ // If any of the instruction's operands are uniform after vectorization,
+ // the instruction cannot be scalarized. This prevents, for example, a
+ // masked load from being scalarized.
+ //
+ // We assume we will only emit a value for lane zero of an instruction
+ // marked uniform after vectorization, rather than VF identical values.
+ // Thus, if we scalarize an instruction that uses a uniform, we would
+ // create uses of values corresponding to the lanes we aren't emitting code
+ // for. This behavior can be changed by allowing getScalarValue to clone
+ // the lane zero values for uniforms rather than asserting.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (Legal->isUniformAfterVectorization(J))
+ return false;
+
+ // Otherwise, we can scalarize the instruction.
+ return true;
+ };
+
+ // Returns true if an operand that cannot be scalarized must be extracted
+ // from a vector. We will account for this scalarization overhead below. Note
+ // that the non-void predicated instructions are placed in their own blocks,
+ // and their return values are inserted into vectors. Thus, an extract would
+ // still be required.
+ auto needsExtract = [&](Instruction *I) -> bool {
+ return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
+ };
+
+ // Compute the expected cost discount from scalarizing the entire expression
+ // feeding the predicated instruction. We currently only consider expressions
+ // that are single-use instruction chains.
+ Worklist.push_back(PredInst);
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+
+ // If we've already analyzed the instruction, there's nothing to do.
+ if (ScalarCosts.count(I))
+ continue;
+
+ // Compute the cost of the vector instruction. Note that this cost already
+ // includes the scalarization overhead of the predicated instruction.
+ unsigned VectorCost = getInstructionCost(I, VF).first;
+
+ // Compute the cost of the scalarized instruction. This cost is the cost of
+ // the instruction as if it wasn't if-converted and instead remained in the
+ // predicated block. We will scale this cost by block probability after
+ // computing the scalarization overhead.
+ unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
+
+ // Compute the scalarization overhead of needed insertelement instructions
+ // and phi nodes.
+ if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true,
+ false, TTI);
+ ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
+ }
+
+ // Compute the scalarization overhead of needed extractelement
+ // instructions. For each of the instruction's operands, if the operand can
+ // be scalarized, add it to the worklist; otherwise, account for the
+ // overhead.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get())) {
+ assert(VectorType::isValidElementType(J->getType()) &&
+ "Instruction has non-scalar type");
+ if (canBeScalarized(J))
+ Worklist.push_back(J);
+ else if (needsExtract(J))
+ ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
+ false, true, TTI);
+ }
+
+ // Scale the total scalar cost by block probability.
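+    // For example, with the currently assumed reciprocal probability of 2
+    // (i.e., the predicated block executes on half the iterations), the
+    // accumulated scalar cost is halved.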
+ ScalarCost /= getReciprocalPredBlockProb();
+
+ // Compute the discount. A non-negative discount means the vector version
+ // of the instruction costs more, and scalarizing would be beneficial.
+ Discount += VectorCost - ScalarCost;
+ ScalarCosts[I] = ScalarCost;
+ }
+
+ return Discount;
+}
+
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy Cost;
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ collectInstsToScalarize(VF);
+
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
VectorizationCostTy BlockCost;
if (Legal->isUniformAfterVectorization(I))
VF = 1;
+ if (VF > 1 && isProfitableToScalarize(I, VF))
+ return VectorizationCostTy(InstsToScalarize[VF][I], false);
+
Type *VectorTy;
unsigned C = getInstructionCost(I, VF, VectorTy);
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
- // Insert values known to be scalar into VecValuesToIgnore.
+ // Insert values known to be scalar into VecValuesToIgnore. This is a
+ // conservative estimation of the values that will later be scalarized.
+ //
+  // FIXME: Even though an instruction is not scalar-after-vectorization, it may
+ // still be scalarized. For example, we may find an instruction to be
+ // more profitable for a given vectorization factor if it were to be
+ // scalarized. But at this point, we haven't yet computed the
+ // vectorization factor.
for (auto *BB : TheLoop->getBlocks())
for (auto &I : *BB)
if (Legal->isScalarAfterVectorization(&I))
--- /dev/null
+; RUN: opt < %s -loop-vectorize -simplifycfg -S | FileCheck %s
+; RUN: opt < %s -force-vector-width=2 -loop-vectorize -simplifycfg -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+;
+; This test checks that we correctly compute the scalarized operands for a
+; user-specified vectorization factor when interleaving is disabled. We use the
+; "optsize" attribute to disable all interleaving calculations.
+;
+; CHECK: vector.body:
+; CHECK: %wide.load = load <2 x i64>, <2 x i64>* {{.*}}, align 4
+; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK: %[[T00:.+]] = extractelement <2 x i64> %wide.load, i32 0
+; CHECK: %[[T01:.+]] = extractelement <2 x i64> %wide.load, i32 0
+; CHECK: %[[T02:.+]] = add nsw i64 %[[T01]], %x
+; CHECK: %[[T03:.+]] = udiv i64 %[[T00]], %[[T02]]
+; CHECK: %[[T04:.+]] = insertelement <2 x i64> undef, i64 %[[T03]], i32 0
+; CHECK: br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK: %[[T05:.+]] = phi <2 x i64> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
+; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK: %[[T06:.+]] = extractelement <2 x i64> %wide.load, i32 1
+; CHECK: %[[T07:.+]] = extractelement <2 x i64> %wide.load, i32 1
+; CHECK: %[[T08:.+]] = add nsw i64 %[[T07]], %x
+; CHECK: %[[T09:.+]] = udiv i64 %[[T06]], %[[T08]]
+; CHECK: %[[T10:.+]] = insertelement <2 x i64> %[[T05]], i64 %[[T09]], i32 1
+; CHECK: br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK: phi <2 x i64> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i64 @predicated_udiv_scalarized_operand(i64* %a, i1 %c, i64 %x) optsize {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
+ %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
+ %tmp2 = load i64, i64* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp3 = add nsw i64 %tmp2, %x
+ %tmp4 = udiv i64 %tmp2, %tmp3
+ br label %for.inc
+
+for.inc:
+ %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+ %tmp6 = add i64 %r, %tmp5
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, 100
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp7 = phi i64 [ %tmp6, %for.inc ]
+ ret i64 %tmp7
+}
; as:
;
; Cost of store:
-; (store(4) + extractelement(6)) / 2 = 5
+; (store(4) + extractelement(3)) / 2 = 3
;
-; CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
;
define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) {
for.end:
ret void
}
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated udiv
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+; (add(2) + extractelement(3)) / 2 = 2
+; Cost of udiv:
+; (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4
+;
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
+;
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp2 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp3 = add nsw i32 %tmp2, %x
+ %tmp4 = udiv i32 %tmp2, %tmp3
+ br label %for.inc
+
+for.inc:
+ %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+ %tmp6 = add i32 %r, %tmp5
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp7 = phi i32 [ %tmp6, %for.inc ]
+ ret i32 %tmp7
+}
+
+; CHECK-LABEL: predicated_store_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated store
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+; (add(2) + extractelement(3)) / 2 = 2
+; Cost of store:
+; store(4) / 2 = 2
+;
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
+;
+define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp2 = add nsw i32 %tmp1, %x
+ store i32 %tmp2, i32* %tmp0, align 4
+ br label %for.inc
+
+for.inc:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: predication_multi_context
+;
+; This test checks that we correctly compute the cost of multiple predicated
+; instructions in the same block. The sdiv, udiv, and store must be scalarized
+; and predicated. The sub feeding the store is scalarized and sunk inside the
+; store's predicated block. However, the add feeding the sdiv and udiv cannot
+; be sunk and is not scalarized. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+; add(1) = 1
+; Cost of sdiv:
+; (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+; Cost of udiv:
+; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+; Cost of sub:
+; (sub(2) + extractelement(3)) / 2 = 2
+; Cost of store:
+; store(4) / 2 = 2
+;
+; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4
+; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4
+;
+define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp2 = add i32 %tmp1, %x
+ %tmp3 = sdiv i32 %tmp1, %tmp2
+ %tmp4 = udiv i32 %tmp3, %tmp2
+ %tmp5 = sub i32 %tmp4, %x
+ store i32 %tmp5, i32* %tmp0, align 4
+ br label %for.inc
+
+for.inc:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
--- /dev/null
+; RUN: opt < %s -mattr=avx -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -simplifycfg -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK-LABEL: predicated_sdiv_masked_load
+;
+; This test ensures that we don't scalarize the predicated load. Since the load
+; can be vectorized with predication, scalarizing it would cause its pointer
+; operand to become non-uniform.
+;
+; CHECK: vector.body:
+; CHECK: %wide.masked.load = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32
+; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK: %[[T0:.+]] = extractelement <2 x i32> %wide.masked.load, i32 0
+; CHECK: %[[T1:.+]] = sdiv i32 %[[T0]], %x
+; CHECK: %[[T2:.+]] = insertelement <2 x i32> undef, i32 %[[T1]], i32 0
+; CHECK: br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK: %[[T3:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T2]], %[[IF0]] ]
+; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK: %[[T4:.+]] = extractelement <2 x i32> %wide.masked.load, i32 1
+; CHECK: %[[T5:.+]] = sdiv i32 %[[T4]], %x
+; CHECK: %[[T6:.+]] = insertelement <2 x i32> %[[T3]], i32 %[[T5]], i32 1
+; CHECK: br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK: phi <2 x i32> [ %[[T3]], %[[CONT0]] ], [ %[[T6]], %[[IF1]] ]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i32 [ 0, %entry ], [ %tmp7, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp2 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp3 = load i32, i32* %tmp2, align 4
+ %tmp4 = sdiv i32 %tmp3, %x
+ %tmp5 = add nsw i32 %tmp4, %tmp1
+ br label %for.inc
+
+for.inc:
+ %tmp6 = phi i32 [ %tmp1, %for.body ], [ %tmp5, %if.then]
+ %tmp7 = add i32 %r, %tmp6
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 10000
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ %tmp8 = phi i32 [ %tmp7, %for.inc ]
+ ret i32 %tmp8
+}
%exitcond = icmp eq i64 %indvars.iv.next, 128
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
+
+
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+; CHECK: vector.body:
+; CHECK: %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4
+; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK: %[[T00:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK: %[[T01:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK: %[[T02:.+]] = add nsw i32 %[[T01]], %x
+; CHECK: %[[T03:.+]] = udiv i32 %[[T00]], %[[T02]]
+; CHECK: %[[T04:.+]] = insertelement <2 x i32> undef, i32 %[[T03]], i32 0
+; CHECK: br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK: %[[T05:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
+; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK: %[[T06:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK: %[[T07:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK: %[[T08:.+]] = add nsw i32 %[[T07]], %x
+; CHECK: %[[T09:.+]] = udiv i32 %[[T06]], %[[T08]]
+; CHECK: %[[T10:.+]] = insertelement <2 x i32> %[[T05]], i32 %[[T09]], i32 1
+; CHECK: br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK: phi <2 x i32> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp2 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp3 = add nsw i32 %tmp2, %x
+ %tmp4 = udiv i32 %tmp2, %tmp3
+ br label %for.inc
+
+for.inc:
+ %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+ %tmp6 = add i32 %r, %tmp5
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp7 = phi i32 [ %tmp6, %for.inc ]
+ ret i32 %tmp7
+}
; VEC-LABEL: test
; VEC: %[[v0:.+]] = add i64 %index, 0
; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
-; VEC: %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20>
; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
; VEC: %[[o1:.+]] = or <2 x i1> zeroinitializer, %[[v10]]
; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[o1]], i32 0
; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]]
;
; VEC: [[cond]]:
-; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
+; VEC: %[[v13:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; VEC: %[[v9a:.+]] = add nsw i32 %[[v13]], 20
; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
-; VEC: store i32 %[[v13]], i32* %[[v2]], align 4
+; VEC: store i32 %[[v9a]], i32* %[[v2]], align 4
; VEC: br label %[[else:.+]]
;
; VEC: [[else]]:
; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]]
;
; VEC: [[cond2]]:
-; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
+; VEC: %[[v17:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; VEC: %[[v9b:.+]] = add nsw i32 %[[v17]], 20
; VEC: %[[v1:.+]] = add i64 %index, 1
; VEC: %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]]
-; VEC: store i32 %[[v17]], i32* %[[v4]], align 4
+; VEC: store i32 %[[v9b]], i32* %[[v4]], align 4
; VEC: br label %[[else2:.+]]
;
; VEC: [[else2]]: