cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
+static cl::opt<bool>
+ PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
+ cl::Hidden,
+ cl::desc("Prefer in-loop vector reductions, "
+ "overriding the targets preference."));
+
cl::opt<bool> EnableVPlanNativePath(
"enable-vplan-native-path", cl::init(false), cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
/// Collect values we want to ignore in the cost model.
void collectValuesToIgnore();
+ /// Split reductions into those that happen in the loop, and those that happen
+ /// outside. In loop reductions are collected into InLoopReductionChains.
+ void collectInLoopReductions();
+
/// \returns The smallest bitwidth each instruction can be represented with.
/// The vector equivalents of these instructions should be truncated to this
/// type.
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}
+ /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
+ /// nodes to the chain of instructions representing the reductions. Uses a
+ /// MapVector to ensure deterministic iteration order.
+ using ReductionChainMap =
+ SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
+
+ /// Return the chain of instructions representing an inloop reduction.
+ const ReductionChainMap &getInLoopReductionChains() const {
+ return InLoopReductionChains;
+ }
+
+ /// Returns true if the Phi is part of an inloop reduction.
+ bool isInLoopReduction(PHINode *Phi) const {
+ return InLoopReductionChains.count(Phi);
+ }
+
/// Estimate cost of an intrinsic call instruction CI if it were vectorized
/// with factor VF. Return the cost of the instruction, including
/// scalarization overhead if it's needed.
/// scalarized.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+ /// PHINodes of the reductions that should be expanded in-loop along with
+ /// their associated chains of reduction operations, in program order from top
+ /// (PHI) to bottom
+ ReductionChainMap InLoopReductionChains;
+
/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
RdxDesc.getMinMaxRecurrenceKind();
setDebugLocFromInst(Builder, ReductionStartValue);
+ bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
// We need to generate a reduction vector from the incoming scalar.
// To do so, we need to generate the 'identity' vector and override
if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
RK == RecurrenceDescriptor::RK_FloatMinMax) {
// MinMax reduction have the start value as their identify.
- if (VF == 1) {
+ if (VF == 1 || IsInLoopReductionPhi) {
VectorStart = Identity = ReductionStartValue;
} else {
VectorStart = Identity =
// Handle other reduction kinds:
Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
RK, VecTy->getScalarType());
- if (VF == 1) {
+ if (VF == 1 || IsInLoopReductionPhi) {
Identity = Iden;
// This vector is the Identity vector where the first element is the
// incoming scalar reduction.
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
+ assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF);
Builder.SetInsertPoint(
LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
RdxPart);
}
- if (VF > 1) {
+ // Create the reduction after the loop. Note that inloop reductions create the
+ // target reduction in the loop using a Reduction recipe.
+ if (VF > 1 && !IsInLoopReductionPhi) {
bool NoNaN = Legal->hasFunNoNaNAttr();
ReducedPartRdx =
createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
for (unsigned Part = 0; Part < UF; ++Part) {
// This is phase one of vectorizing PHIs.
+ bool ScalarPHI = (VF == 1) || Cost->isInLoopReduction(cast<PHINode>(PN));
Type *VecTy =
- (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
+ ScalarPHI ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
Value *EntryPart = PHINode::Create(
VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
}
}
+void LoopVectorizationCostModel::collectInLoopReductions() {
+ // For the moment, without predicated reduction instructions, we do not
+ // support inloop reductions whilst folding the tail, and hence in those cases
+ // all reductions are currently out of the loop.
+ if (!PreferInLoopReductions || foldTailByMasking())
+ return;
+
+ for (auto &Reduction : Legal->getReductionVars()) {
+ PHINode *Phi = Reduction.first;
+ RecurrenceDescriptor &RdxDesc = Reduction.second;
+
+ // We don't collect reductions that are type promoted (yet).
+ if (RdxDesc.getRecurrenceType() != Phi->getType())
+ continue;
+
+ // Check that we can correctly put the reductions into the loop, by
+ // finding the chain of operations that leads from the phi to the loop
+ // exit value.
+ SmallVector<Instruction *, 4> ReductionOperations =
+ RdxDesc.getReductionOpChain(Phi, TheLoop);
+ bool InLoop = !ReductionOperations.empty();
+ if (InLoop)
+ InLoopReductionChains[Phi] = ReductionOperations;
+ LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
+ << " reduction for phi: " << *Phi << "\n");
+ }
+}
+
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
CM.selectUserVectorizationFactor(UserVF);
+ CM.collectInLoopReductions();
buildVPlansWithVPRecipes(UserVF, UserVF);
LLVM_DEBUG(printPlans(dbgs()));
return {{UserVF, 0}};
CM.collectInstsToScalarize(VF);
}
+ CM.collectInLoopReductions();
+
buildVPlansWithVPRecipes(1, MaxVF);
LLVM_DEBUG(printPlans(dbgs()));
if (MaxVF == 1)
RecipeBuilder.recordRecipeOf(Entry.first);
RecipeBuilder.recordRecipeOf(Entry.second);
}
+ for (auto &Reduction : CM.getInLoopReductionChains()) {
+ PHINode *Phi = Reduction.first;
+ RecurrenceDescriptor::RecurrenceKind Kind =
+ Legal->getReductionVars()[Phi].getRecurrenceKind();
+ const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
+
+ RecipeBuilder.recordRecipeOf(Phi);
+ for (auto &R : ReductionOperations) {
+ RecipeBuilder.recordRecipeOf(R);
+ // For min/max reducitons, where we have a pair of icmp/select, we also
+ // need to record the ICmp recipe, so it can be removed later.
+ if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
+ Kind == RecurrenceDescriptor::RK_FloatMinMax) {
+ RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
+ }
+ }
+ }
// For each interleave group which is relevant for this (possibly trimmed)
// Range, add it to the set of groups to be later applied to the VPlan and add
}
}
+ // Adjust the recipes for any inloop reductions.
+ if (Range.Start > 1)
+ adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
+
// Finally, if tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the end of the latch.
if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
Builder.setInsertPoint(VPBB);
auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
for (auto &Reduction : Legal->getReductionVars()) {
+ assert(!CM.isInLoopReduction(Reduction.first) &&
+ "Didn't expect inloop tail folded reduction yet!");
VPValue *Phi = Plan->getVPValue(Reduction.first);
VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
return Plan;
}
+// Adjust the recipes for any inloop reductions. The chain of instructions
+// leading from the loop exit instr to the phi need to be converted to
+// reductions, with one operand being vector and the other being the scalar
+// reduction chain.
+void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
+ VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
+ for (auto &Reduction : CM.getInLoopReductionChains()) {
+ PHINode *Phi = Reduction.first;
+ RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
+ const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
+
+ // ReductionOperations are orders top-down from the phi's use to the
+ // LoopExitValue. We keep a track of the previous item (the Chain) to tell
+ // which of the two operands will remain scalar and which will be reduced.
+ // For minmax the chain will be the select instructions.
+ Instruction *Chain = Phi;
+ for (Instruction *R : ReductionOperations) {
+ VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
+ RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
+
+ VPValue *ChainOp = Plan->getVPValue(Chain);
+ unsigned FirstOpId;
+ if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
+ Kind == RecurrenceDescriptor::RK_FloatMinMax) {
+ assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC &&
+ "Expected to replace a VPWidenSelectSC");
+ FirstOpId = 1;
+ } else {
+ assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
+ "Expected to replace a VPWidenSC");
+ FirstOpId = 0;
+ }
+ unsigned VecOpId =
+ R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
+ VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
+
+ VPReductionRecipe *RedRecipe = new VPReductionRecipe(
+ &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI);
+ WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
+ WidenRecipe->eraseFromParent();
+
+ if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
+ Kind == RecurrenceDescriptor::RK_FloatMinMax) {
+ VPRecipeBase *CompareRecipe =
+ RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
+ assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
+ "Expected to replace a VPWidenSC");
+ CompareRecipe->eraseFromParent();
+ }
+ Chain = R;
+ }
+ }
+}
+
Value* LoopVectorizationPlanner::VPCallbackILV::
getOrCreateVectorValues(Value *V, unsigned Part) {
return ILV.getOrCreateVectorValue(V, Part);
State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
}
+void VPReductionRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Reduction being replicated.");
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ unsigned Kind = RdxDesc->getRecurrenceKind();
+ Value *NewVecOp = State.get(VecOp, Part);
+ Value *NewRed =
+ createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
+ Value *PrevInChain = State.get(ChainOp, Part);
+ Value *NextInChain;
+ if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
+ Kind == RecurrenceDescriptor::RK_FloatMinMax) {
+ NextInChain =
+ createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
+ NewRed, PrevInChain);
+ } else {
+ NextInChain = State.Builder.CreateBinOp(
+ (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain);
+ }
+ State.ValueMap.setVectorValue(I, Part, NextInChain);
+ }
+}
+
void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.
State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !2
; CHECK: ._crit_edge:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !5
; CHECK: ._crit_edge:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[TMP2]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], 12
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !7
; CHECK: ._crit_edge:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
-; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP6]] = mul <4 x i32> [[TMP5]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT: [[TMP9]] = mul i32 [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !9
; CHECK: ._crit_edge:
-; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
-; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP8]] = add i32 [[TMP7]], [[TMP6]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 19, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 19, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = mul <4 x i32> [[TMP4]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT: [[TMP7]] = mul i32 [[TMP6]], [[TMP5]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 120, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 120, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !15
; CHECK: for.end:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ -1, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = and <4 x i32> [[TMP4]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT: [[TMP7]] = and i32 [[TMP6]], [[TMP5]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = or <4 x i32> [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6]] = or i32 [[TMP5]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !18
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !18
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !19
; CHECK: for.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = xor <4 x i32> [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6]] = xor i32 [[TMP5]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !20
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !20
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !21
; CHECK: for.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = fadd fast <4 x float> [[TMP4]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD1]])
+; CHECK-NEXT: [[TMP7]] = fadd float [[TMP6]], [[TMP5]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !22
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !22
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = fmul fast <4 x float> [[TMP4]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD1]])
+; CHECK-NEXT: [[TMP7]] = fmul float [[TMP6]], [[TMP5]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !24
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !24
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt i32 [[TMP2]], [[VEC_PHI]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP2]], i32 [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !26
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !26
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !27
; CHECK: for.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt i32 [[TMP2]], [[VEC_PHI]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP2]], i32 [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !28
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !28
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !29
; CHECK: for.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !34
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !34
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !35
; CHECK: ._crit_edge:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 255, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
-; CHECK-NEXT: [[TMP4]] = and <4 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 255, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[VEC_PHI]], 255
+; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6]] = and i32 [[TMP5]], [[TMP0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !38
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !38
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]