"The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
- "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+ "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
cl::desc("Enable the use of the block frequency analysis to access PGO "
"heuristics minimizing code growth in cold regions and being more "
"aggressive in hot regions."));
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
- unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
- bool HasExpectedTC = (ExpectedTC > 0);
-
+ // Prefer constant trip counts over profile data, over upper bound estimate.
+ unsigned ExpectedTC = 0;
+ bool HasExpectedTC = false;
+ if (const SCEVConstant *ConstExits =
+ dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
+ const APInt &ExitsCount = ConstExits->getAPInt();
+ // We are interested in small values for ExpectedTC. Skip over those that
+ // can't fit an unsigned.
+ if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
+ ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
+ HasExpectedTC = true;
+ }
+ }
+ // ExpectedTC may be large because it's bound by a variable. Check
+ // profiling information to validate we should vectorize.
if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
auto EstimatedTC = getLoopEstimatedTripCount(L);
    if (EstimatedTC) {
      ExpectedTC = *EstimatedTC;
      HasExpectedTC = true;
    }
}
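+  // Otherwise, fall back on the constant upper bound SCEV can prove for the
+  // trip count; getSmallConstantMaxTripCount returns 0 when no bound is known.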
+ if (!HasExpectedTC) {
+ ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+ HasExpectedTC = (ExpectedTC > 0);
+ }
if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                 << "This loop is worth vectorizing only if no scalar "
                 << "iteration overheads are incurred.");
}
define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
-; The loop has low invocation count compare to the function invocation count,
+; The loop has a low invocation count compared to the function invocation count,
; but has a high trip count per invocation. Vectorize it.
; CHECK-LABEL: @foo_low_trip_count3(
ret i32 0
}
+define i32 @foo_low_trip_count_icmp_sgt(i32 %bound) {
+; Simple loop with a low trip count and an inequality test for the exit.
+; Should not be vectorized.
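+; The !prof metadata on the latch (!1) marks the backedge as almost never taken,
+; so the profile-based trip count estimate is low.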
+
+; CHECK-LABEL: @foo_low_trip_count_icmp_sgt(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp sgt i32 %i.08, %bound
+ br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @const_low_trip_count() {
+; Simple loop with constant, small trip count and no profiling info.
+
+; CHECK-LABEL: @const_low_trip_count
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp slt i32 %i.08, 2
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @const_large_trip_count() {
+; Simple loop with constant large trip count and no profiling info.
+
+; CHECK-LABEL: @const_large_trip_count
+; CHECK: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp slt i32 %i.08, 1000
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @const_small_trip_count_step() {
+; Simple loop with static, small trip count and no profiling info.
+
+; CHECK-LABEL: @const_small_trip_count_step
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 5
+ %exitcond = icmp slt i32 %i.08, 10
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @const_trip_over_profile() {
+; Constant trip count takes precedence over profile data.
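+; The latch branch carries !prof !1, but the constant trip count of 1001 is what
+; the vectorizer uses, so the loop is vectorized.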
+
+; CHECK-LABEL: @const_trip_over_profile
+; CHECK: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp slt i32 %i.08, 1000
+ br i1 %exitcond, label %for.body, label %for.end, !prof !1
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
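+; Profile metadata shared by the tests above: !0 attaches a function entry count
+; of 100, and !1 attaches branch weights of 100 and 0 to the branch that uses it.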
!0 = !{!"function_entry_count", i64 100}
!1 = !{!"branch_weights", i32 100, i32 0}