MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
cl::desc("Maximum depth of the lookup for consecutive stores."));
-/// Limits the size of scheduling regions in a block.
-/// It avoid long compile times for _very_ large blocks where vector
-/// instructions are spread over a wide range.
-/// This limit is way higher than needed by real-world functions.
-static cl::opt<int>
-ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
- cl::desc("Limit the size of the SLP scheduling region per block"));
-
/// Command-line override for the smallest vector register width (in bits)
/// the SLP vectorizer will target; defaults to 128 (one NEON/SSE register).
/// Hidden: intended for testing/tuning, not end users.
static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));
// This limit is useful for very large basic blocks.
// NOTE(review): presumably caps the instruction distance over which memory
// dependencies are checked during scheduling — confirm against the uses of
// MaxMemDepDistance in the full file before relying on this description.
static const unsigned MaxMemDepDistance = 160;
-/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
-/// regions to be handled.
-static const int MinScheduleRegionSize = 16;
-
/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
FirstLoadStoreInRegion = nullptr;
LastLoadStoreInRegion = nullptr;
- // Reduce the maximum schedule region size by the size of the
- // previous scheduling run.
- ScheduleRegionSizeLimit -= ScheduleRegionSize;
- if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
- ScheduleRegionSizeLimit = MinScheduleRegionSize;
- ScheduleRegionSize = 0;
-
// Make a new scheduling region, i.e. all existing ScheduleData is not
// in the new region yet.
++SchedulingRegionID;
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
- bool extendSchedulingRegion(Value *V, const InstructionsState &S);
+ void extendSchedulingRegion(Value *V, const InstructionsState &S);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
/// (can be null).
ScheduleData *LastLoadStoreInRegion = nullptr;
- /// The current size of the scheduling region.
- int ScheduleRegionSize = 0;
-
- /// The maximum size allowed for the scheduling region.
- int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
-
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
/// Make sure that the initial SchedulingRegionID is greater than the
doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
ReSchedule = true;
}
- if (Bundle) {
- LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
- << " in block " << BB->getName() << "\n");
- calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
- }
+ LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
+ << " in block " << BB->getName() << "\n");
+ calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
if (ReSchedule) {
resetSchedule();
// dependencies. As soon as the bundle is "ready" it means that there are no
// cyclic dependencies and we can schedule it. Note that's important that we
// don't "schedule" the bundle yet (see cancelScheduling).
- while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
- !ReadyInsts.empty()) {
+ while (!Bundle->isReady() && !ReadyInsts.empty()) {
ScheduleData *Picked = ReadyInsts.pop_back_val();
assert(Picked->isSchedulingEntity() && Picked->isReady() &&
"must be ready to schedule");
// Make sure that the scheduling region contains all
// instructions of the bundle.
- for (Value *V : VL) {
- if (!extendSchedulingRegion(V, S)) {
- // If the scheduling region got new instructions at the lower end (or it
- // is a new region for the first bundle). This makes it necessary to
- // recalculate all dependencies.
- // Otherwise the compiler may crash trying to incorrectly calculate
- // dependencies and emit instruction in the wrong order at the actual
- // scheduling.
- TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
- return None;
- }
- }
+ for (Value *V : VL)
+ extendSchedulingRegion(V, S);
bool ReSchedule = false;
for (Value *V : VL) {
return &(ScheduleDataChunks.back()[ChunkPos++]);
}
-bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
- const InstructionsState &S) {
+void
+BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
+ const InstructionsState &S) {
if (getScheduleData(V, isOneOf(S, V)))
- return true;
+ return;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
return true;
};
if (CheckSheduleForI(I))
- return true;
+ return;
if (!ScheduleStart) {
// It's the first instruction in the new region.
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
- return true;
+ return;
}
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
BasicBlock::iterator LowerEnd = BB->end();
while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
&*DownIter != I) {
- if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
- LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
- return false;
- }
-
++UpIter;
++DownIter;
}
CheckSheduleForI(I);
LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
<< "\n");
- return true;
+ return;
}
assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
"Expected to reach top of the basic block or instruction down the "
CheckSheduleForI(I);
assert(ScheduleEnd && "tried to vectorize a terminator?");
LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
- return true;
+ return;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT
-; RUN: opt < %s -slp-schedule-budget=0 -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER
-; RUN: opt < %s -slp-schedule-budget=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST
+; RUN: opt < %s -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER
+; RUN: opt < %s -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
;
; MAX-COST-LABEL: @PR28330(
; MAX-COST-NEXT: entry:
-; MAX-COST-NEXT: [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
-; MAX-COST-NEXT: [[P1:%.*]] = icmp eq i8 [[P0]], 0
-; MAX-COST-NEXT: [[P2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
-; MAX-COST-NEXT: [[P3:%.*]] = icmp eq i8 [[P2]], 0
-; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
-; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0
-; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
-; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0
-; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
-; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0
-; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
-; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0
-; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
-; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0
-; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
-; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0
+; MAX-COST-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
; MAX-COST: for.body:
-; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT: [[P19:%.*]] = select i1 [[P1]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[P20:%.*]] = add i32 [[P17]], [[P19]]
-; MAX-COST-NEXT: [[P21:%.*]] = select i1 [[P3]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[P22:%.*]] = add i32 [[P20]], [[P21]]
-; MAX-COST-NEXT: [[P23:%.*]] = select i1 [[P5]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[P24:%.*]] = add i32 [[P22]], [[P23]]
-; MAX-COST-NEXT: [[P25:%.*]] = select i1 [[P7]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[P26:%.*]] = add i32 [[P24]], [[P25]]
-; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[P28:%.*]] = add i32 [[P26]], [[P27]]
-; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[P30:%.*]] = add i32 [[P28]], [[P29]]
-; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[P30]], [[P31]]
-; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[P34]] = add i32 [[P32]], [[P33]]
+; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
; MAX-COST-NEXT: br label [[FOR_BODY]]
;
entry:
;
; MAX-COST-LABEL: @PR32038(
; MAX-COST-NEXT: entry:
-; MAX-COST-NEXT: [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <4 x i8>*), align 1
-; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer
-; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
-; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0
-; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
-; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0
-; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
-; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0
-; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
-; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0
+; MAX-COST-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
; MAX-COST: for.body:
-; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
-; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
-; MAX-COST-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[P27]]
-; MAX-COST-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[P29]]
-; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP5]], -5
-; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]]
-; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
-; MAX-COST-NEXT: [[P34]] = add i32 [[P32]], [[P33]]
+; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5
; MAX-COST-NEXT: br label [[FOR_BODY]]
;
entry:
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -basic-aa -slp-vectorizer -S -slp-schedule-budget=16 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9.0"
; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2
; CHECK-NEXT: [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3
+; CHECK-NEXT: [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; CHECK-NEXT: call void @unknown()
; CHECK-NEXT: call void @unknown()
; CHECK-NEXT: call void @unknown()
; CHECK-NEXT: call void @unknown()
-; CHECK-NEXT: [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
-; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B]] to <4 x float>*
; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
; CHECK-NEXT: [[C1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 1