MF = &mf;
MLI = &getAnalysis<MachineLoopInfo>();
MDT = &getAnalysis<MachineDominatorTree>();
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
TII = MF->getSubtarget().getInstrInfo();
RegClassInfo.runOnMachineFunction(*MF);
setPragmaPipelineOptions(L);
if (!canPipelineLoop(L)) {
LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Failed to pipeline loop";
+ });
+
return Changed;
}
/// restricted to loops with a single basic block. Make sure that the
/// branch in the loop can be analyzed.
bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
- if (L.getNumBlocks() != 1)
+ if (L.getNumBlocks() != 1) {
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Not a single basic block: "
+ << ore::NV("NumBlocks", L.getNumBlocks());
+ });
return false;
+ }
- if (disabledByPragma)
+ if (disabledByPragma) {
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Disabled by Pragma.";
+ });
return false;
+ }
// Check if the branch can't be understood because we can't do pipelining
// if that's the case.
LI.FBB = nullptr;
LI.BrCond.clear();
if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
- LLVM_DEBUG(
- dbgs() << "Unable to analyzeBranch, can NOT pipeline current Loop\n");
+ LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
NumFailBranch++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "The branch can't be understood";
+ });
return false;
}
LI.LoopInductionVar = nullptr;
LI.LoopCompare = nullptr;
if (!TII->analyzeLoopForPipelining(L.getTopBlock())) {
- LLVM_DEBUG(
- dbgs() << "Unable to analyzeLoop, can NOT pipeline current Loop\n");
+ LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
NumFailLoop++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "The loop structure is not supported";
+ });
return false;
}
if (!L.getLoopPreheader()) {
- LLVM_DEBUG(
- dbgs() << "Preheader not found, can NOT pipeline current Loop\n");
+ LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
NumFailPreheader++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "No loop preheader found";
+ });
return false;
}
// Can't schedule a loop without a valid MII.
if (MII == 0) {
- LLVM_DEBUG(
- dbgs()
- << "0 is not a valid Minimal Initiation Interval, can NOT schedule\n");
+ LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
NumFailZeroMII++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Invalid Minimal Initiation Interval: 0";
+ });
return;
}
LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
<< ", we don't pipleline large loops\n");
NumFailLargeMaxMII++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Minimal Initiation Interval too large: "
+ << ore::NV("MII", (int)MII) << " > "
+ << ore::NV("SwpMaxMii", SwpMaxMii) << "."
+ << "Refer to -pipeliner-max-mii.";
+ });
return;
}
if (!Scheduled){
LLVM_DEBUG(dbgs() << "No schedule found, return\n");
NumFailNoSchedule++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Unable to find schedule";
+ });
return;
}
unsigned numStages = Schedule.getMaxStageCount();
// No need to generate pipeline if there are no overlapped iterations.
if (numStages == 0) {
- LLVM_DEBUG(
- dbgs() << "No overlapped iterations, no need to generate pipeline\n");
+ LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
NumFailZeroStage++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "No need to pipeline - no overlapped iterations in schedule.";
+ });
return;
}
// Check that the maximum stage count is less than user-defined limit.
LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
<< " : too many stages, abort\n");
NumFailLargeMaxStage++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Too many stages in schedule: "
+ << ore::NV("numStages", (int)numStages) << " > "
+ << ore::NV("SwpMaxStages", SwpMaxStages)
+ << ". Refer to -pipeliner-max-stages.";
+ });
return;
}
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
+ Loop.getHeader())
+ << "Pipelined succesfully!";
+ });
+
// Generate the schedule as a ModuloSchedule.
DenseMap<MachineInstr *, int> Cycles, Stages;
std::vector<MachineInstr *> OrderedInsts;
}
}
int Resmii = Resources.size();
- LLVM_DEBUG(dbgs() << "Retrun Res MII:" << Resmii << "\n");
+ LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
// Delete the memory for each of the DFAs that were created earlier.
for (ResourceManager *RI : Resources) {
ResourceManager *D = RI;
LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << " (II=" << II
<< ")\n");
- if (scheduleFound)
+ if (scheduleFound) {
Schedule.finalizeSchedule(this);
- else
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Schedule found with Initiation Interval: " << ore::NV("II", II)
+ << ", MaxStageCount: "
+ << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
+ });
+ } else
Schedule.reset();
return scheduleFound && Schedule.getMaxStageCount() > 0;
--- /dev/null
+; RUN: llc < %s -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr9 --ppc-enable-pipeliner \
+; RUN: -pass-remarks-analysis=pipeliner -pass-remarks=pipeliner -o /dev/null 2>&1 \
+; RUN: | FileCheck %s
+
+@x = dso_local local_unnamed_addr global <{ i32, i32, i32, i32, [1020 x i32] }> <{ i32 1, i32 2, i32 3, i32 4, [1020 x i32] zeroinitializer }>, align 4
+@y = dso_local global [1024 x i32] zeroinitializer, align 4
+
+define dso_local i32* @foo() local_unnamed_addr {
+;CHECK: Schedule found with Initiation Interval
+;CHECK: Pipelined succesfully!
+entry:
+ %.pre = load i32, i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0), align 4
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0)
+
+for.body: ; preds = %for.body, %entry
+ %0 = phi i32 [ %.pre, %entry ], [ %add.2, %for.body ]
+ %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next.2, %for.body ]
+ %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %mul = mul nsw i32 %1, %1
+ %add = add nsw i32 %mul, %0
+ %arrayidx6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx6, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx2.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next
+ %2 = load i32, i32* %arrayidx2.1, align 4
+ %mul.1 = mul nsw i32 %2, %2
+ %add.1 = add nsw i32 %mul.1, %add
+ %arrayidx6.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next
+ store i32 %add.1, i32* %arrayidx6.1, align 4
+ %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2
+ %arrayidx2.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next.1
+ %3 = load i32, i32* %arrayidx2.2, align 4
+ %mul.2 = mul nsw i32 %3, %3
+ %add.2 = add nsw i32 %mul.2, %add.1
+ %arrayidx6.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next.1
+ store i32 %add.2, i32* %arrayidx6.2, align 4
+ %indvars.iv.next.2 = add nuw nsw i64 %indvars.iv, 3
+ %exitcond.2 = icmp eq i64 %indvars.iv.next.2, 1024
+ br i1 %exitcond.2, label %for.cond.cleanup, label %for.body
+}