printDepMatrix(DependencyMatrix);
#endif
- // Since we currently do not handle LCSSA PHI's any failure in loop
- // condition will now branch to LoopNestExit.
- // TODO: This should be removed once we handle LCSSA PHI nodes.
-
// Get the Outermost loop exit.
BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
if (!LoopNestExit) {
return false;
}
- if (isa<PHINode>(LoopNestExit->begin())) {
- DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
- "since on failure all loops branch to loop nest exit.\n");
- return false;
- }
-
unsigned SelecLoopId = selectLoopForInterchange(LoopList);
// Move the selected loop outwards to the best possible position.
for (unsigned i = SelecLoopId; i > 0; i--) {
}
// TODO: We only handle LCSSA PHI's corresponding to reduction for now.
- BasicBlock *OuterExit = OuterLoop->getExitBlock();
- if (!containsSafePHI(OuterExit, true)) {
- DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with LCSSA PHIs can be interchange "
- "currently.";
- });
- return true;
- }
-
BasicBlock *InnerExit = InnerLoop->getExitBlock();
if (!containsSafePHI(InnerExit, false)) {
DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
return false;
}
+// We currently support LCSSA PHI nodes in the outer loop exit, if their
+// incoming values do not come from the outer loop latch or if the
+// outer loop latch has a single predecessor. In that case, the value will
+// be available if both the inner and outer loop conditions are true, which
+// will still be true after interchanging. If we have multiple predecessor,
+// that may not be the case, e.g. because the outer loop latch may be executed
+// if the inner loop is not executed.
+static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
+ for (PHINode &PHI : LoopNestExit->phis()) {
+ // FIXME: We currently are not able to detect floating point reductions
+ // and have to use floating point PHIs as a proxy to prevent
+ // interchanging in the presence of floating point reductions.
+ if (PHI.getType()->isFloatingPointTy())
+ return false;
+ for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
+ Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
+ if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
+ continue;
+
+ // The incoming value is defined in the outer loop latch. Currently we
+ // only support that in case the outer loop latch has a single predecessor.
+ // This guarantees that the outer loop latch is executed if and only if
+ // the inner loop is executed (because tightlyNested() guarantees that the
+ // outer loop header only branches to the inner loop or the outer loop
+ // latch).
+ // FIXME: We could weaken this logic and allow multiple predecessors,
+ // if the values are produced outside the loop latch. We would need
+ // additional logic to update the PHI nodes in the exit block as
+ // well.
+ if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
+ return false;
+ }
+ }
+ return true;
+}
+
bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
unsigned OuterLoopId,
CharMatrix &DepMatrix) {
return false;
}
+ if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
+ DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Found unsupported PHI node in loop exit.";
+ });
+ return false;
+ }
+
return true;
}
+++ /dev/null
-; REQUIRES: asserts
-; RUN: opt < %s -basicaa -loop-interchange -S -debug 2>&1 | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@A = common global [100 x [100 x i32]] zeroinitializer
-@C = common global [100 x [100 x i32]] zeroinitializer
-
-;; FIXME:
-;; Test for interchange when we have an lcssa phi. This should ideally be interchanged but it is currently not supported.
-;; for(gi=1;gi<N;gi++)
-;; for(gj=1;gj<M;gj++)
-;; A[gj][gi] = A[gj - 1][gi] + C[gj][gi];
-
-; CHECK: PHI Nodes in loop nest exit is not handled for now since on failure all loops branch to loop nest exit.
-
-@gi = common global i32 0
-@gj = common global i32 0
-
-define void @interchange_07(i32 %N, i32 %M){
-entry:
- store i32 1, i32* @gi
- %cmp21 = icmp sgt i32 %N, 1
- br i1 %cmp21, label %for.cond1.preheader.lr.ph, label %for.end16
-
-for.cond1.preheader.lr.ph:
- %cmp218 = icmp sgt i32 %M, 1
- %gi.promoted = load i32, i32* @gi
- %0 = add i32 %M, -1
- %1 = sext i32 %gi.promoted to i64
- %2 = sext i32 %N to i64
- %3 = add i32 %gi.promoted, 1
- %4 = icmp slt i32 %3, %N
- %smax = select i1 %4, i32 %N, i32 %3
- br label %for.cond1.preheader
-
-for.cond1.preheader:
- %indvars.iv25 = phi i64 [ %1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next26, %for.inc14 ]
- br i1 %cmp218, label %for.body3, label %for.inc14
-
-for.body3:
- %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 1, %for.cond1.preheader ]
- %5 = add nsw i64 %indvars.iv, -1
- %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %5, i64 %indvars.iv25
- %6 = load i32, i32* %arrayidx5
- %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %indvars.iv, i64 %indvars.iv25
- %7 = load i32, i32* %arrayidx9
- %add = add nsw i32 %7, %6
- %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv25
- store i32 %add, i32* %arrayidx13
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %lftr.wideiv = trunc i64 %indvars.iv to i32
- %exitcond = icmp eq i32 %lftr.wideiv, %0
- br i1 %exitcond, label %for.inc14, label %for.body3
-
-for.inc14:
- %inc.lcssa23 = phi i32 [ 1, %for.cond1.preheader ], [ %M, %for.body3 ]
- %indvars.iv.next26 = add nsw i64 %indvars.iv25, 1
- %cmp = icmp slt i64 %indvars.iv.next26, %2
- br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end16_crit_edge
-
-for.cond.for.end16_crit_edge:
- store i32 %inc.lcssa23, i32* @gj
- store i32 %smax, i32* @gi
- br label %for.end16
-
-for.end16:
- ret void
-}
--- /dev/null
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t
+; RUN: cat %t | FileCheck --check-prefix REMARK %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [100 x [100 x i32]] zeroinitializer
+@C = common global [100 x [100 x i32]] zeroinitializer
+@X = common global i32 0
+@Y = common global i64 0
+@F = common global float 0.0
+
+; We cannot interchange this loop at the moment, because iv.outer.next is
+; produced in the outer loop latch and used in the loop exit block. If the inner
+; loop body is not executed, the outer loop latch won't be executed either
+; after interchanging.
+; REMARK: UnsupportedExitPHI
+; REMARK-NEXT: lcssa_01
+
+define void @lcssa_01(){
+entry:
+ %cmp21 = icmp sgt i64 100, 1
+ br i1 %cmp21, label %outer.ph, label %for.end16
+
+outer.ph:
+ %cmp218 = icmp sgt i64 100, 1
+ br label %outer.header
+
+outer.header:
+ %iv.outer= phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ]
+ br i1 %cmp218, label %for.body3, label %outer.inc
+
+for.body3:
+ %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
+ %vA = load i32, i32* %arrayidx5
+ %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
+ %vC = load i32, i32* %arrayidx9
+ %add = add nsw i32 %vA, %vC
+ store i32 %add, i32* %arrayidx5
+ %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+ %exitcond = icmp eq i64 %iv.inner.next, 100
+ br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:
+ %iv.outer.next = add nsw i64 %iv.outer, 1
+ %cmp = icmp eq i64 %iv.outer.next, 100
+ br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:
+ store i64 %iv.outer.next, i64 * @Y
+ br label %for.end16
+
+for.end16:
+ ret void
+}
+
+; REMARK: UnsupportedExitPHI
+; REMARK-NEXT: lcssa_02
+define void @lcssa_02(){
+entry:
+ %cmp21 = icmp sgt i64 100, 1
+ br i1 %cmp21, label %outer.ph, label %for.end16
+
+outer.ph:
+ %cmp218 = icmp sgt i64 100, 1
+ br label %outer.header
+
+outer.header:
+ %iv.outer= phi i64 [ 1, %outer.ph ], [ %iv.outer.next, %outer.inc ]
+ br i1 %cmp218, label %for.body3, label %outer.inc
+
+for.body3:
+ %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
+ %vA = load i32, i32* %arrayidx5
+ %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
+ %vC = load i32, i32* %arrayidx9
+ %add = add nsw i32 %vA, %vC
+ store i32 %add, i32* %arrayidx5
+ %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+ %exitcond = icmp eq i64 %iv.inner.next, 100
+ br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:
+ %iv.inner.end = phi i64 [ 0, %outer.header ], [ %iv.inner.next, %for.body3 ]
+ %iv.outer.next = add nsw i64 %iv.outer, 1
+ %cmp = icmp eq i64 %iv.outer.next, 100
+ br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:
+ store i64 %iv.inner.end, i64 * @Y
+ br label %for.end16
+
+for.end16:
+ ret void
+}
+
+
+; REMARK: Interchanged
+; REMARK-NEXT: lcssa_03
+define void @lcssa_03(){
+entry:
+ br label %outer.header
+
+outer.header:
+ %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+ br label %for.body3
+
+for.body3:
+ %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
+ %vA = load i32, i32* %arrayidx5
+ %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
+ %vC = load i32, i32* %arrayidx9
+ %add = add nsw i32 %vA, %vC
+ store i32 %add, i32* %arrayidx5
+ %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+ %exitcond = icmp eq i64 %iv.inner.next, 100
+ br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:
+ %iv.outer.next = add nsw i64 %iv.outer, 1
+ %cmp = icmp eq i64 %iv.outer.next, 100
+ br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:
+ store i64 %iv.inner, i64 * @Y
+ br label %for.end16
+
+for.end16:
+ ret void
+}
+
+; FIXME: We currently do not support LCSSA phi nodes involving floating point
+; types, as we fail to detect floating point reductions for now.
+; REMARK: UnsupportedPHIOuter
+; REMARK-NEXT: lcssa_04
+define void @lcssa_04(){
+entry:
+ br label %outer.header
+
+outer.header:
+ %iv.outer= phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+ %float.outer= phi float [ 1.0, %entry ], [ 2.0, %outer.inc ]
+ br label %for.body3
+
+for.body3:
+ %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
+ %vA = load i32, i32* %arrayidx5
+ %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
+ %vC = load i32, i32* %arrayidx9
+ %add = add nsw i32 %vA, %vC
+ store i32 %add, i32* %arrayidx5
+ %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+ %exitcond = icmp eq i64 %iv.inner.next, 100
+ br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:
+ %iv.outer.next = add nsw i64 %iv.outer, 1
+ %cmp = icmp eq i64 %iv.outer.next, 100
+ br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:
+ store float %float.outer, float* @F
+ br label %for.end16
+
+for.end16:
+ ret void
+}
for.end15: ; preds = %for.inc13, %entry
ret void
}
+
+;; for( int i=1;i<N;i++)
+;; for( int j=1;j<N;j++)
+;; X+=A[j][i];
+;; Y = X
+; CHECK: Loops interchanged.
+define void @reduction_05(i32 %N) {
+entry:
+ %cmp16 = icmp sgt i32 %N, 1
+ br i1 %cmp16, label %for.body7.lr.ph, label %for.end8
+
+for.body7.lr.ph: ; preds = %entry, %for.cond1.for.inc6_crit_edge
+ %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ]
+ %X.promoted = load i32, i32* @X
+ br label %for.body7
+
+for.body7: ; preds = %for.body7, %for.body7.lr.ph
+ %indvars.iv = phi i64 [ 1, %for.body7.lr.ph ], [ %indvars.iv.next, %for.body7 ]
+ %add15 = phi i32 [ %X.promoted, %for.body7.lr.ph ], [ %add, %for.body7 ]
+ %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18
+ %0 = load i32, i32* %arrayidx5
+ %add = add nsw i32 %add15, %0
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body7
+
+for.cond1.for.inc6_crit_edge: ; preds = %for.body7
+ store i32 %add, i32* @X
+ %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
+ %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
+ %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
+ br i1 %exitcond21, label %for.end8, label %for.body7.lr.ph
+
+for.end8: ; preds = %for.cond1.for.inc6_crit_edge, %entry
+ %add.res = phi i32 [ %add, %for.cond1.for.inc6_crit_edge], [ 0, %entry ]
+ store i32 %add.res, i32* @Y
+
+ ret void
+}