SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
}
+/// Returns estimate for max latch taken count of the loop of the narrowest
+/// available type. If the latch block has such estimate, it is returned.
+/// Otherwise, we use max exit count of whole loop (that is potentially of wider
+/// type than latch check itself), which is still better than no estimate.
+static const SCEV *getNarrowestLatchMaxTakenCountEstimate(ScalarEvolution &SE,
+ const Loop &L) {
+ const SCEV *FromBlock =
+ SE.getExitCount(&L, L.getLoopLatch(), ScalarEvolution::SymbolicMaximum);
+ if (isa<SCEVCouldNotCompute>(FromBlock))
+ return SE.getSymbolicMaxBackedgeTakenCount(&L);
+ return FromBlock;
+}
+
std::optional<LoopStructure>
LoopStructure::parseLoopStructure(ScalarEvolution &SE, Loop &L,
const char *&FailureReason) {
return std::nullopt;
}
- const SCEV *LatchCount = SE.getExitCount(&L, Latch);
- if (isa<SCEVCouldNotCompute>(LatchCount)) {
+ const SCEV *MaxBETakenCount = getNarrowestLatchMaxTakenCountEstimate(SE, L);
+ if (isa<SCEVCouldNotCompute>(MaxBETakenCount)) {
FailureReason = "could not compute latch count";
return std::nullopt;
}
- assert(SE.getLoopDisposition(LatchCount, &L) ==
+ assert(SE.getLoopDisposition(MaxBETakenCount, &L) ==
ScalarEvolution::LoopInvariant &&
"loop variant exit count doesn't make sense!");
bool LoopConstrainer::run() {
BasicBlock *Preheader = nullptr;
- const SCEV *LatchTakenCount =
- SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
+ const SCEV *MaxBETakenCount =
+ getNarrowestLatchMaxTakenCountEstimate(SE, OriginalLoop);
Preheader = OriginalLoop.getLoopPreheader();
- assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
+ assert(!isa<SCEVCouldNotCompute>(MaxBETakenCount) && Preheader != nullptr &&
"preconditions!");
- ExitCountTy = cast<IntegerType>(LatchTakenCount->getType());
+ ExitCountTy = cast<IntegerType>(MaxBETakenCount->getType());
OriginalPreheader = Preheader;
MainLoopPreheader = Preheader;
ret void
}
-; TODO: IRCE is legal here.
+; IRCE is legal here.
; Here how it is done if the step was 1: https://godbolt.org/z/jEqWaseWc
; It is also legal for step 4. Proof:
; - Capacity check ensures that iv < limit <= SINT_MAX - 3, meaning that
; CHECK-NEXT: [[CAPACITY:%.*]] = load i32, ptr [[CAPACITY_P]], align 4, !range [[RNG16:![0-9]+]]
; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = load i32, ptr [[NUM_ELEMENTS_P]], align 4, !range [[RNG16]]
; CHECK-NEXT: [[LIMIT:%.*]] = sub i32 [[CAPACITY]], 3
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[CAPACITY]], -3
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[CAPACITY]], 2147483646
+; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP0]], [[SMAX]]
+; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[LIMIT]], i32 0)
+; CHECK-NEXT: [[SMAX2:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN]], i32 -1)
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw i32 [[SMAX2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[SMIN3:%.*]] = call i32 @llvm.smin.i32(i32 [[NUM_ELEMENTS]], i32 [[TMP4]])
+; CHECK-NEXT: [[EXIT_MAINLOOP_AT:%.*]] = call i32 @llvm.smax.i32(i32 [[SMIN3]], i32 0)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp slt i32 0, [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[LOOP_PREHEADER:%.*]], label [[MAIN_PSEUDO_EXIT:%.*]]
+; CHECK: loop.preheader:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[LOOP_PREHEADER]] ]
; CHECK-NEXT: [[CAPACITY_CHECK:%.*]] = icmp slt i32 [[IV]], [[LIMIT]]
-; CHECK-NEXT: br i1 [[CAPACITY_CHECK]], label [[BACKEDGE]], label [[OUT_OF_BOUNDS:%.*]], !prof [[PROF17:![0-9]+]]
+; CHECK-NEXT: br i1 true, label [[BACKEDGE]], label [[OUT_OF_BOUNDS_LOOPEXIT5:%.*]], !prof [[PROF17:![0-9]+]]
; CHECK: backedge:
; CHECK-NEXT: [[IV_WIDE:%.*]] = zext i32 [[IV]] to i64
; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i64 [[IV_WIDE]]
; CHECK-NEXT: store i32 1, ptr [[EL_PTR]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 4
; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[NUM_ELEMENTS]]
-; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i32 [[IV_NEXT]], [[EXIT_MAINLOOP_AT]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[LOOP]], label [[MAIN_EXIT_SELECTOR:%.*]]
+; CHECK: main.exit.selector:
+; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_NEXT]], [[BACKEDGE]] ]
+; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], [[BACKEDGE]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp slt i32 [[IV_NEXT_LCSSA]], [[NUM_ELEMENTS]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MAIN_PSEUDO_EXIT]], label [[EXIT:%.*]]
+; CHECK: main.pseudo.exit:
+; CHECK-NEXT: [[IV_COPY:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT: [[INDVAR_END:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT_LCSSA]], [[MAIN_EXIT_SELECTOR]] ]
+; CHECK-NEXT: br label [[POSTLOOP:%.*]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: [[IV_LCSSA1_PH:%.*]] = phi i32 [ [[IV_POSTLOOP:%.*]], [[BACKEDGE_POSTLOOP:%.*]] ]
+; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
-; CHECK-NEXT: [[IV_LCSSA1:%.*]] = phi i32 [ [[IV]], [[BACKEDGE]] ]
+; CHECK-NEXT: [[IV_LCSSA1:%.*]] = phi i32 [ [[IV_LCSSA]], [[MAIN_EXIT_SELECTOR]] ], [ [[IV_LCSSA1_PH]], [[EXIT_LOOPEXIT:%.*]] ]
; CHECK-NEXT: ret i32 [[IV_LCSSA1]]
+; CHECK: out_of_bounds.loopexit:
+; CHECK-NEXT: br label [[OUT_OF_BOUNDS:%.*]]
+; CHECK: out_of_bounds.loopexit5:
+; CHECK-NEXT: br label [[OUT_OF_BOUNDS]]
; CHECK: out_of_bounds:
; CHECK-NEXT: ret i32 -1
+; CHECK: postloop:
+; CHECK-NEXT: br label [[LOOP_POSTLOOP:%.*]]
+; CHECK: loop.postloop:
+; CHECK-NEXT: [[IV_POSTLOOP]] = phi i32 [ [[IV_COPY]], [[POSTLOOP]] ], [ [[IV_NEXT_POSTLOOP:%.*]], [[BACKEDGE_POSTLOOP]] ]
+; CHECK-NEXT: [[CAPACITY_CHECK_POSTLOOP:%.*]] = icmp slt i32 [[IV_POSTLOOP]], [[LIMIT]]
+; CHECK-NEXT: br i1 [[CAPACITY_CHECK_POSTLOOP]], label [[BACKEDGE_POSTLOOP]], label [[OUT_OF_BOUNDS_LOOPEXIT:%.*]], !prof [[PROF17]]
+; CHECK: backedge.postloop:
+; CHECK-NEXT: [[IV_WIDE_POSTLOOP:%.*]] = zext i32 [[IV_POSTLOOP]] to i64
+; CHECK-NEXT: [[EL_PTR_POSTLOOP:%.*]] = getelementptr i32, ptr [[P]], i64 [[IV_WIDE_POSTLOOP]]
+; CHECK-NEXT: store i32 1, ptr [[EL_PTR_POSTLOOP]], align 4
+; CHECK-NEXT: [[IV_NEXT_POSTLOOP]] = add nuw nsw i32 [[IV_POSTLOOP]], 4
+; CHECK-NEXT: [[LOOP_COND_POSTLOOP:%.*]] = icmp slt i32 [[IV_NEXT_POSTLOOP]], [[NUM_ELEMENTS]]
+; CHECK-NEXT: br i1 [[LOOP_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP18:![0-9]+]], !irce.loop.clone !6
;
entry:
%capacity = load i32, ptr %capacity_p, !range !4