/// Print the information about the memory accesses in the loop.
void print(raw_ostream &OS, unsigned Depth = 0) const;
- /// Checks existence of store to invariant address inside loop.
- /// If the loop has any store to invariant address, then it returns true,
- /// else returns false.
- bool hasStoreToLoopInvariantAddress() const {
- return StoreToLoopInvariantAddress;
+ /// If the loop has any store of a variant value to an invariant address, then
+ /// return true, else return false.
+ bool hasVariantStoreToLoopInvariantAddress() const {
+ return HasVariantStoreToLoopInvariantAddress;
}
/// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
/// Cache the result of analyzeLoop.
bool CanVecMem;
- /// Indicator for storing to uniform addresses.
- /// If a loop has write to a loop invariant address then it should be true.
- bool StoreToLoopInvariantAddress;
+ /// Indicator that there is a store of a variant value to a uniform address.
+ bool HasVariantStoreToLoopInvariantAddress;
/// The diagnostics report generated for the analysis. E.g. why we
/// couldn't analyze the loop.
// writes and between reads and writes, but not between reads and reads.
ValueSet Seen;
+ // Record uniform store addresses to identify if we have multiple stores
+ // to the same address.
+ ValueSet UniformStores;
+
for (StoreInst *ST : Stores) {
Value *Ptr = ST->getPointerOperand();
- // Check for store to loop invariant address.
- StoreToLoopInvariantAddress |= isUniform(Ptr);
+
+ if (isUniform(Ptr)) {
+ // Consider multiple stores to the same uniform address as a store of a
+ // variant value.
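+ // For example, two conditional stores of different loop-invariant values
+ // to the same invariant address (e.g. "if (b[i] == k) a = ntrunc; else a = k;"
+ // from the added tests) are conservatively treated as a variant store.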
+ bool MultipleStoresToUniformPtr = !UniformStores.insert(Ptr).second;
+ HasVariantStoreToLoopInvariantAddress |=
+ (!isUniform(ST->getValueOperand()) || MultipleStoresToUniformPtr);
+ }
+
// If we did *not* see this pointer before, insert it to the read-write
// list. At this phase it is only a 'write' list.
if (Seen.insert(Ptr).second) {
PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
- StoreToLoopInvariantAddress(false) {
+ HasVariantStoreToLoopInvariantAddress(false) {
if (canAnalyzeLoop())
analyzeLoop(AA, LI, TLI, DT);
}
PtrRtChecking->print(OS, Depth);
OS << "\n";
- OS.indent(Depth) << "Store to invariant address was "
- << (StoreToLoopInvariantAddress ? "" : "not ")
+ OS.indent(Depth) << "Variant Store to invariant address was "
+ << (HasVariantStoreToLoopInvariantAddress ? "" : "not ")
<< "found in loop.\n";
OS.indent(Depth) << "SCEV assumptions:\n";
if (!LAI->canVectorizeMemory())
return false;
- if (LAI->hasStoreToLoopInvariantAddress()) {
+ if (LAI->hasVariantStoreToLoopInvariantAddress()) {
ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
- << "write to a loop invariant address could not be vectorized");
+ << "write of variant value to a loop invariant address could not "
+ "be vectorized");
LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
return false;
}
/// memory access.
unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
- /// The cost calculation for Load instruction \p I with uniform pointer -
- /// scalar load + broadcast.
+ /// The cost calculation for Load/Store instruction \p I with uniform pointer -
+ /// Load: scalar load + broadcast.
+ /// Store: scalar store + (loop invariant value stored? 0 : extract of last
+ /// element)
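+ /// For example (illustrative only), with VF = 4 and a loop-variant stored
+ /// value, the cost is roughly: address computation + one scalar store + the
+ /// cost of extracting lane 3.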
+ /// TODO: Test the extra cost of the extract when a loop-variant value is stored.
unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
/// Returns whether the instruction is a load or store and will be emitted
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
unsigned VF) {
- LoadInst *LI = cast<LoadInst>(I);
- Type *ValTy = LI->getType();
+ Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
- unsigned Alignment = LI->getAlignment();
- unsigned AS = LI->getPointerAddressSpace();
+ unsigned Alignment = getLoadStoreAlignment(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
+ if (isa<LoadInst>(I)) {
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+ }
+ StoreInst *SI = cast<StoreInst>(I);
+ bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
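+  // Only the last vector lane's value needs to reach memory for a store to a
+  // uniform address (later scalar stores would overwrite earlier ones), so a
+  // loop-variant stored value adds the cost of extracting the last element
+  // (lane VF - 1) on top of the scalar store.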
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+ TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
+ (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
+ Instruction::ExtractElement,
+ VectorTy, VF - 1));
}
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
if (!Ptr)
continue;
+ // TODO: We should generate better code and update the cost model for
+ // predicated uniform stores. Today they are treated as any other
+ // predicated store (see added test cases in
+ // invariant-store-vectorization.ll).
if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
NumPredStores++;
- if (isa<LoadInst>(&I) && Legal->isUniform(Ptr) &&
- // Conditional loads should be scalarized and predicated.
+ if (Legal->isUniform(Ptr) &&
+ // Conditional loads and stores should be scalarized and predicated.
// isScalarWithPredication cannot be used here since masked
// gather/scatters are not considered scalar with predication.
!Legal->blockNeedsPredication(I.getParent())) {
- // Scalar load + broadcast
+ // TODO: Avoid replicating loads and stores instead of
+ // relying on instcombine to remove them.
+ // Load: Scalar load + broadcast
+ // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
unsigned Cost = getUniformMemOpCost(&I, VF);
setWideningDecision(&I, VF, CM_Scalarize, Cost);
continue;
; CHECK-NEXT: Group
; CHECK-NEXT: (Low: %b High: ((4 * (1 umax %x)) + %b))
; CHECK-NEXT: Member: {%b,+,4}<%for.body>
-; CHECK: Store to invariant address was not found in loop.
+; CHECK: Variant Store to invariant address was not found in loop.
; CHECK-NEXT: SCEV assumptions:
; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: <nusw>
; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: <nusw>
; The LAA with the new PM is a loop pass so we go from inner to outer loops.
; OLDPM: for.cond1.preheader:
-; OLDPM: Store to invariant address was not found in loop.
+; OLDPM: Variant Store to invariant address was not found in loop.
; OLDPM: for.body3:
-; OLDPM: Store to invariant address was found in loop.
+; OLDPM: Variant Store to invariant address was found in loop.
; NEWPM: for.body3:
-; NEWPM: Store to invariant address was found in loop.
+; NEWPM: Variant Store to invariant address was found in loop.
; NEWPM: for.cond1.preheader:
-; NEWPM: Store to invariant address was not found in loop.
+; NEWPM: Variant Store to invariant address was not found in loop.
define i32 @foo(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
entry:
; }
; }
-; CHECK: Store to invariant address was not found in loop.
-; CHECK-NOT: Store to invariant address was found in loop.
+; CHECK: Variant Store to invariant address was not found in loop.
+; CHECK-NOT: Variant Store to invariant address was found in loop.
define i32 @foo(i32* nocapture readonly %var1, i32* nocapture %var2, i32 %itr) #0 {
; }
; }
-; CHECK: Store to invariant address was found in loop.
+; CHECK: Variant Store to invariant address was found in loop.
define void @foo(i32* nocapture %var1, i32* nocapture %var2, i32 %itr) #0 {
entry:
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The first test checks that a loop with a reduction and a uniform store gets
+; vectorized.
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction
+; CHECK-LABEL: vector.memcheck:
+; CHECK: found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.phi = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: %wide.load = load <16 x i32>
+; CHECK: [[ADD]] = add <16 x i32> %vec.phi, %wide.load
+; CHECK: store i32 %ntrunc, i32* %a
+; CHECK-NOT: store i32 %ntrunc, i32* %a
+; CHECK: %index.next = add i64 %index, 64
+
+; CHECK-LABEL: middle.block:
+; CHECK: %rdx.shuf = shufflevector <16 x i32>
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %tmp3 = add i32 %tmp0, %tmp2
+ store i32 %ntrunc, i32* %a
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %tmp4 = phi i32 [ %tmp3, %for.body ]
+ ret i32 %tmp4
+}
+
+; Conditional store
+; if (b[i] == k) a = ntrunc
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+; CHECK-LABEL: @inv_val_store_to_inv_address_conditional(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT5]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT9]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !8, !noalias !11
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP5]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT8]], <16 x i32*> [[BROADCAST_SPLAT10]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !11
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !13
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
+; CHECK: cond_store:
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !14
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %cmp = icmp eq i32 %tmp2, %k
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+ store i32 %ntrunc, i32* %a
+ br label %latch
+
+latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
--- /dev/null
+; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+; The first licm pass is there to hoist/sink invariant stores if possible. Today
+; LICM does not hoist/sink these invariant stores. Even if that changes, we
+; should still vectorize this loop in case licm is not run.
+
+; The next licm pass after vectorization is to hoist/sink loop invariant
+; instructions.
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; All tests check that it is legal to vectorize the stores to an invariant
+; address.
+
+
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction(
+; memory check is found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b)
+; CHECK: vector.memcheck:
+; CHECK: found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: %wide.load = load <4 x i32>
+; CHECK: [[ADD]] = add <4 x i32> %vec.phi, %wide.load
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+
+; CHECK-LABEL: middle.block:
+; CHECK: %rdx.shuf = shufflevector <4 x i32>
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %tmp3 = add i32 %tmp0, %tmp2
+ store i32 %ntrunc, i32* %a
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %tmp4 = phi i32 [ %tmp3, %for.body ]
+ ret i32 %tmp4
+}
+
+; CHECK-LABEL: inv_val_store_to_inv_address(
+; CHECK-LABEL: vector.body:
+; CHECK: store i32 %ntrunc, i32* %a
+; CHECK: store <4 x i32>
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ store i32 %ntrunc, i32* %a
+ store i32 %ntrunc, i32* %tmp1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+; Both of the tests below are handled as predicated stores.
+
+; Conditional store
+; if (b[i] == k) a = ntrunc
+; TODO: We could generate better code for the first test and emit just one
+; scalar store when vector.or.reduce(vector_cmp(b[i] == k)) is 1.
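+; A hypothetical shape for that (illustrative pseudo-IR, not what the
+; vectorizer emits today):
+;   %cmp.vec = icmp eq <4 x i32> %wide.load, %k.splat
+;   %any     = <or-reduction of %cmp.vec to a single i1>
+;   br i1 %any, label %store.block, label %continue
+; with a single "store i32 %ntrunc, i32* %a" in %store.block.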
+
+; CHECK-LABEL:inv_val_store_to_inv_address_conditional(
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>*
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
+; CHECK: store <4 x i32>
+; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0
+; CHECK-NEXT: br i1 [[EE]], label %pred.store.if, label %pred.store.continue
+
+; CHECK-LABEL: pred.store.if:
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: br label %pred.store.continue
+
+; CHECK-LABEL: pred.store.continue:
+; CHECK-NEXT: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 1
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %cmp = icmp eq i32 %tmp2, %k
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+ store i32 %ntrunc, i32* %a
+ br label %latch
+
+latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; if (b[i] == k)
+; a = ntrunc
+; else a = k;
+; TODO: We could vectorize this once we support multiple uniform stores to the
+; same address.
+; CHECK-LABEL:inv_val_store_to_inv_address_conditional_diff_values(
+; CHECK-NOT: load <4 x i32>
+define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %cmp = icmp eq i32 %tmp2, %k
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+ store i32 %ntrunc, i32* %a
+ br label %latch
+
+cond_store_k:
+ store i32 %k, i32 * %a
+ br label %latch
+
+latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; Instcombine'd version of the above test. Now the store is no longer of an
+; invariant value.
+; TODO: We should be able to vectorize this loop once we support vectorizing
+; stores of variant values to invariant addresses.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic
+; CHECK-NOT: <4 x
+define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %cmp = icmp eq i32 %tmp2, %k
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+ br label %latch
+
+cond_store_k:
+ br label %latch
+
+latch:
+ %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+ store i32 %storeval, i32* %a
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; Invariant value stored to an invariant address, predicated on an invariant
+; condition.
+; This is not treated as a predicated store since the block the store belongs to
+; is the latch block (which doesn't need to be predicated).
+; TODO: We should vectorize this loop once we relax the check for
+; variant/invariant values being stored to an invariant address.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
+; CHECK-NOT: <4 x
+define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ %cmp = icmp eq i32 %ntrunc, %k
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+ br label %latch
+
+cond_store_k:
+ br label %latch
+
+latch:
+ %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+ store i32 %storeval, i32* %a
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; TODO: This loop can be vectorized once we support a variant value being
+; stored to an invariant address.
+; CHECK-LABEL: variant_val_store_to_inv_address
+; CHECK-NOT: <4 x i32>
+define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ %cmp = icmp eq i32 %ntrunc, %k
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ store i32 %tmp2, i32* %a
+ %tmp3 = add i32 %tmp0, %tmp2
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %rdx.lcssa = phi i32 [ %tmp0, %for.body ]
+ ret i32 %rdx.lcssa
+}
@a = external global i32, align 4
@b = external global [1 x i32], align 4
-; CHECK: LV: Not vectorizing: Cannot prove legality.
+; We can vectorize this loop because we are storing an invariant value into an
+; invariant address.
+
+; CHECK: LV: We can vectorize this loop!
; CHECK-LABEL: @test
define void @test() {
entry: