// through scalar predication or masked load/store or masked gather/scatter.
// \p VF is the vectorization factor that will be used to vectorize \p I.
// Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I, ElementCount VF,
- bool IsKnownUniform = false) {
- // When we know the load is uniform and the original scalar loop was not
- // predicated we don't need to mark it as a predicated instruction. Any
- // vectorised blocks created when tail-folding are something artificial we
- // have introduced and we know there is always at least one active lane.
- // That's why we call Legal->blockNeedsPredication here because it doesn't
- // query tail-folding.
- if (IsKnownUniform && isa<LoadInst>(I) &&
+ bool isPredicatedInst(Instruction *I, ElementCount VF) {
+ // When we know the load's address is loop invariant and the instruction
+ // in the original scalar loop was unconditionally executed, we don't
+ // need to mark it as a predicated instruction. Tail folding may
+ // introduce additional predication, but we're guaranteed to always have
+ // at least one active lane. We call Legal->blockNeedsPredication here
+ // because it doesn't query tail-folding.
+ if (Legal->isUniformMemOp(*I) && isa<LoadInst>(I) &&
!Legal->blockNeedsPredication(I->getParent()))
return false;
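// Illustrative sketch (editorial, not part of this patch): with tail
// folding, a scalar loop such as
//   for (int i = 0; i < n; ++i)
//     sum += *p;   // p is loop-invariant; load executes unconditionally
// gets a masked vector body, yet the uniform load of *p may stay
// unpredicated: at least one lane is active in every vector iteration,
// and the original scalar loop already dereferenced p on every iteration.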
if (!blockNeedsPredicationForAnyReason(I->getParent()))
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// A limited number of Masked Store/Scatter emulations was allowed.
- assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
+ assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) &&
+ "Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
NumPredStores > NumberOfStoresToPredicate);
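// Editorial note: since uniform loads are now exempt from predication
// above, such a load can reach this hack with isPredicatedInst(I, VF)
// returning false, which is why the assert also admits
// Legal->isUniformMemOp(*I).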
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
+ [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); },
Range);
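// Editorial note: getDecisionAndClampRange applies the predicate at
// Range.Start and clamps Range.End to the subrange over which the
// decision is unchanged, so IsPredicated holds for the whole clamped
// range; the uniformity check now lives inside isPredicatedInst itself.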
// Even if the instruction is not marked as uniform, there are certain
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%.pn> = phi ir<0>, vp<[[P_L:%.+]]>
+; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%.pn> = phi ir<0>, ir<%l>
; CHECK-NEXT: EMIT vp<[[WIDE_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]>
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp ule vp<[[WIDE_IV]]> vp<[[BTC]]>
-; CHECK-NEXT: Successor(s): loop.0
-; CHECK-EMPTY:
-; CHECK-NEXT: loop.0:
-; CHECK-NEXT: Successor(s): pred.load
-; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.load: {
-; CHECK-NEXT: pred.load.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<[[CMP]]>
-; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.if:
-; CHECK-NEXT: REPLICATE ir<%l> = load ir<%src> (S->V)
-; CHECK-NEXT: Successor(s): pred.load.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.continue:
-; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[P_L]]> = ir<%l>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): pred.load.succ
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.succ:
-; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%.pn> vp<[[P_L]]>
+; CHECK-NEXT: Successor(s): loop.0
+; CHECK-EMPTY:
+; CHECK-NEXT: loop.0:
+; CHECK-NEXT: REPLICATE ir<%l> = load ir<%src>
+; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%.pn> ir<%l>
; CHECK-NEXT: Successor(s): pred.sdiv
; CHECK-EMPTY:
; CHECK-NEXT: <xVFxUF> pred.sdiv: {
; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[P_VAL:%.+]]> = ir<%val>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.1
+; CHECK-NEXT: Successor(s): loop.0.split
; CHECK-EMPTY:
-; CHECK-NEXT: loop.1:
+; CHECK-NEXT: loop.0.split:
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + vp<[[CAN_IV]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]> vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
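; Editorial sketch of the scalar loop being planned above, inferred from
; the CHECK lines (types and the sdiv's second operand are illustrative):
;   loop:
;     %.pn = phi i32 [ 0, %entry ], [ %l, %loop ] ; first-order recurrence
;     %l   = load i32, ptr %src  ; invariant address: no longer predicated
;     %val = sdiv i32 %splice, ... ; may trap: kept in a predicated region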
; VF1UF4-NEXT: store i64 [[TMP5]], i64* [[B:%.*]], align 8
; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE]]
; VF1UF4: pred.store.continue:
+; VF1UF4-NEXT: [[TMP6:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_STORE_IF]] ]
; VF1UF4-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
; VF1UF4: pred.store.if7:
; VF1UF4-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; VF1UF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION1]]
-; VF1UF4-NEXT: [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
-; VF1UF4-NEXT: store i64 [[TMP7]], i64* [[B]], align 8
+; VF1UF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION1]]
+; VF1UF4-NEXT: [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8
+; VF1UF4-NEXT: store i64 [[TMP8]], i64* [[B]], align 8
; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE8]]
; VF1UF4: pred.store.continue8:
+; VF1UF4-NEXT: [[TMP9:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP8]], [[PRED_STORE_IF7]] ]
; VF1UF4-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
; VF1UF4: pred.store.if9:
; VF1UF4-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
-; VF1UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION2]]
-; VF1UF4-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
-; VF1UF4-NEXT: store i64 [[TMP9]], i64* [[B]], align 8
+; VF1UF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION2]]
+; VF1UF4-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
+; VF1UF4-NEXT: store i64 [[TMP11]], i64* [[B]], align 8
; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE10]]
; VF1UF4: pred.store.continue10:
+; VF1UF4-NEXT: [[TMP12:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE8]] ], [ [[TMP11]], [[PRED_STORE_IF9]] ]
; VF1UF4-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
; VF1UF4: pred.store.if11:
; VF1UF4-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
-; VF1UF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION3]]
-; VF1UF4-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
-; VF1UF4-NEXT: store i64 [[TMP11]], i64* [[B]], align 8
+; VF1UF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION3]]
+; VF1UF4-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP13]], align 8
+; VF1UF4-NEXT: store i64 [[TMP14]], i64* [[B]], align 8
; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE12]]
; VF1UF4: pred.store.continue12:
+; VF1UF4-NEXT: [[TMP15:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE10]] ], [ [[TMP14]], [[PRED_STORE_IF11]] ]
; VF1UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; VF1UF4-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF1UF4-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF1UF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; VF1UF4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VF1UF4: middle.block:
; VF1UF4-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VF1UF4: scalar.ph:
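; Editorial note on the hunk above: each pred.store.continue block now
; merges the conditionally loaded value out through a phi, with poison on
; the masked-off path; the four new phis shift the later %TMP numbers.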
; CHECK-NEXT: EMIT vp<[[WIDE_CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]>
; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDE_CAN_IV]]> vp<[[BTC]]>
; CHECK-NEXT: CLONE ir<%gep.A.uniform> = getelementptr ir<%A>, ir<0>
-; CHECK-NEXT: Successor(s): pred.load
-; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.load: {
-; CHECK-NEXT: pred.load.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]>
-; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.if:
-; CHECK-NEXT: REPLICATE ir<%lv> = load ir<%gep.A.uniform>
-; CHECK-NEXT: Successor(s): pred.load.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.load.continue:
-; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%lv>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.0
-; CHECK-EMPTY:
-; CHECK-NEXT: loop.0:
+; CHECK-NEXT: REPLICATE ir<%lv> = load ir<%gep.A.uniform>
; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%iv>, ir<%k>
; CHECK-NEXT: Successor(s): loop.then
; CHECK-EMPTY:
; CHECK-EMPTY:
; CHECK-NEXT: pred.store.if:
; CHECK-NEXT: REPLICATE ir<%gep.B> = getelementptr ir<%B>, vp<[[STEPS]]>
-; CHECK-NEXT: REPLICATE store vp<[[PRED]]>, ir<%gep.B>
+; CHECK-NEXT: REPLICATE store ir<%lv>, ir<%gep.B>
; CHECK-NEXT: Successor(s): pred.store.continue
; CHECK-EMPTY:
; CHECK-NEXT: pred.store.continue:
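; Editorial note: with the uniform load kept outside any predicated
; region, the replicated store uses ir<%lv> directly; the former
; PHI-PREDICATED-INSTRUCTION vp<[[PRED]]> no longer exists in this plan.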