bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
unsigned InnerLoopId, unsigned OuterLoopId,
CharMatrix &DepMatrix,
- const DenseMap<const Loop *, unsigned> &CostMap);
+ const DenseMap<const Loop *, unsigned> &CostMap,
+ std::unique_ptr<CacheCost> &CC);
private:
int getInstrOrderCost();
-
+ std::optional<bool> isProfitablePerLoopCacheAnalysis(
+ const DenseMap<const Loop *, unsigned> &CostMap,
+ std::unique_ptr<CacheCost> &CC);
+ std::optional<bool> isProfitablePerInstrOrderCost();
+ std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
Loop *OuterLoop;
Loop *InnerLoop;
LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
- DependencyMatrix, CostMap)) {
+ DependencyMatrix, CostMap, CC)) {
LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
return false;
}
return GoodOrder - BadOrder;
}
-static bool isProfitableForVectorization(unsigned InnerLoopId,
- unsigned OuterLoopId,
- CharMatrix &DepMatrix) {
- // TODO: Improve this heuristic to catch more cases.
- // If the inner loop is loop independent or doesn't carry any dependency it is
- // profitable to move this to outer position.
- for (auto &Row : DepMatrix) {
- if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
- return false;
- // TODO: We need to improve this heuristic.
- if (Row[OuterLoopId] != '=')
- return false;
- }
- // If outer loop has dependence and inner loop is loop independent then it is
- // profitable to interchange to enable parallelism.
- // If there are no dependences, interchanging will not improve anything.
- return !DepMatrix.empty();
-}
-
-bool LoopInterchangeProfitability::isProfitable(
- const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
- unsigned OuterLoopId, CharMatrix &DepMatrix,
- const DenseMap<const Loop *, unsigned> &CostMap) {
- // TODO: Remove the legacy cost model.
-
+std::optional<bool>
+LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
+ const DenseMap<const Loop *, unsigned> &CostMap,
+ std::unique_ptr<CacheCost> &CC) {
// This is the new cost model returned from loop cache analysis.
// A smaller index means the loop should be placed an outer loop, and vice
// versa.
LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
<< ", OuterIndex = " << OuterIndex << "\n");
if (InnerIndex < OuterIndex)
- return true;
- } else {
- // Legacy cost model: this is rough cost estimation algorithm. It counts the
- // good and bad order of induction variables in the instruction and allows
- // reordering if number of bad orders is more than good.
- int Cost = getInstrOrderCost();
- LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
- if (Cost < -LoopInterchangeCostThreshold)
- return true;
+ return std::optional<bool>(true);
+ assert(InnerIndex != OuterIndex && "CostMap should assign unique "
+ "numbers to each loop");
+ if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
+ return std::nullopt;
+ return std::optional<bool>(false);
}
+ return std::nullopt;
+}
- // It is not profitable as per current cache profitability model. But check if
- // we can move this loop outside to improve parallelism.
- if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
- return true;
+std::optional<bool>
+LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
+ // Legacy cost model: this is rough cost estimation algorithm. It counts the
+ // good and bad order of induction variables in the instruction and allows
+ // reordering if number of bad orders is more than good.
+ int Cost = getInstrOrderCost();
+ LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
+ if (Cost < 0 && Cost < LoopInterchangeCostThreshold)
+ return std::optional<bool>(true);
+
+ return std::nullopt;
+}
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Interchanging loops is too costly and it does not improve "
- "parallelism.";
- });
- return false;
+std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
+ unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
+ for (auto &Row : DepMatrix) {
+ // If the inner loop is loop independent or doesn't carry any dependency
+ // it is not profitable to move this to outer position, since we are
+ // likely able to do inner loop vectorization already.
+ if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')
+ return std::optional<bool>(false);
+
+ // If the outer loop is not loop independent it is not profitable to move
+ // this to inner position, since doing so would not enable inner loop
+ // parallelism.
+ if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=')
+ return std::optional<bool>(false);
+ }
+ // If inner loop has dependence and outer loop is loop independent then it
+ // is/ profitable to interchange to enable inner loop parallelism.
+ // If there are no dependences, interchanging will not improve anything.
+ return std::optional<bool>(!DepMatrix.empty());
+}
+
+bool LoopInterchangeProfitability::isProfitable(
+ const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
+ unsigned OuterLoopId, CharMatrix &DepMatrix,
+ const DenseMap<const Loop *, unsigned> &CostMap,
+ std::unique_ptr<CacheCost> &CC) {
+ // isProfitable() is structured to avoid endless loop interchange.
+ // If loop cache analysis could decide the profitability then,
+ // profitability check will stop and return the analysis result.
+ // If cache analysis failed to analyze the loopnest (e.g.,
+ // due to delinearization issues) then only check whether it is
+ // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
+ // analysis the profitability then only, isProfitableForVectorization
+ // will decide.
+ std::optional<bool> shouldInterchange =
+ isProfitablePerLoopCacheAnalysis(CostMap, CC);
+ if (!shouldInterchange.has_value()) {
+ shouldInterchange = isProfitablePerInstrOrderCost();
+ if (!shouldInterchange.has_value())
+ shouldInterchange =
+ isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+ }
+ if (!shouldInterchange.has_value()) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Insufficient information to calculate the cost of loop for "
+ "interchange.";
+ });
+ return false;
+ } else if (!shouldInterchange.value()) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Interchanging loops is not considered to improve cache "
+ "locality nor vectorization.";
+ });
+ return false;
+ }
+ return true;
}
void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
; Make sure we do not crash for loops without reachable exits.
define void @no_reachable_exits() {
-; Check we interchanged.
-; CHECK-LABEL: @no_reachable_exits() {
+; Check we do not crash.
+; CHECK-LABEL: @no_reachable_exits(
; CHECK-NEXT: bb:
-; CHECK-NEXT: br label %inner.ph
-; CHECK-LABEL: outer.ph:
-; CHECK-NEXT: br label %outer.header
-; CHECK-LABEL: inner.ph:
-; CHECK-NEXT: br label %inner.body
-; CHECK-LABEL: inner.body:
-; CHECK-NEXT: %tmp31 = phi i32 [ 0, %inner.ph ], [ %[[IVNEXT:[0-9]]], %inner.body.split ]
-; CHECK-NEXT: br label %outer.ph
-; CHECK-LABEL: inner.body.split:
-; CHECK-NEXT: %[[IVNEXT]] = add nsw i32 %tmp31, 1
-; CHECK-NEXT: br i1 false, label %inner.body, label %exit
+; CHECK-NEXT: br label [[OUTER_PH:%.*]]
+; CHECK: outer.ph:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[OUTER_PH]] ], [ [[TMP8:%.*]], [[OUTER_LATCH:%.*]] ]
+; CHECK-NEXT: br i1 undef, label [[INNER_PH:%.*]], label [[OUTER_LATCH]]
+; CHECK: inner.ph:
+; CHECK-NEXT: br label [[INNER_BODY:%.*]]
+; CHECK: inner.body:
+; CHECK-NEXT: [[TMP31:%.*]] = phi i32 [ 0, [[INNER_PH]] ], [ [[TMP6:%.*]], [[INNER_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr undef, align 8
+; CHECK-NEXT: [[TMP6]] = add nsw i32 [[TMP31]], 1
+; CHECK-NEXT: br i1 false, label [[INNER_BODY]], label [[OUTER_LATCH_LOOPEXIT:%.*]]
+; CHECK: outer.latch.loopexit:
+; CHECK-NEXT: br label [[OUTER_LATCH]]
+; CHECK: outer.latch:
+; CHECK-NEXT: [[TMP8]] = add nsw i32 [[TMP2]], 1
+; CHECK-NEXT: br i1 false, label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: unreachable
bb:
; RUN: -pass-remarks=loop-interchange -pass-remarks-missed=loop-interchange
; RUN: FileCheck -input-file %t %s
+; RUN: opt < %s -passes=loop-interchange,loop-interchange -cache-line-size=64 \
+; RUN: -pass-remarks-output=%t -pass-remarks='loop-interchange' -S
+; RUN: cat %t | FileCheck --check-prefix=PROFIT %s
+
;; We test profitability model in these test cases.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@B = common global [100 x [100 x i32]] zeroinitializer
;;---------------------------------------Test case 01---------------------------------
-;; Loops interchange will result in code vectorization and hence profitable. Check for interchange.
+;; Loops interchange will result in better cache locality and hence profitable. Check for interchange.
;; for(int i=1;i<100;i++)
;; for(int j=1;j<100;j++)
;; A[j][i] = A[j - 1][i] + B[j][i];
for.end12:
ret void
}
+
+;;---------------------------------------Test case 05---------------------------------
+;; This test is to make sure, that multiple invocations of loop interchange will not
+;; undo previous interchange and will converge to a particular order determined by the
+;; profitability analysis.
+;; for(int i=1;i<100;i++)
+;; for(int j=1;j<100;j++)
+;; A[j][0] = A[j][0] + B[j][i];
+
+; CHECK: Name: Interchanged
+; CHECK-NEXT: Function: interchange_05
+
+; PROFIT-LABEL: --- !Passed
+; PROFIT-NEXT: Pass: loop-interchange
+; PROFIT-NEXT: Name: Interchanged
+; PROFIT-LABEL: Function: interchange_05
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT: - String: Loop interchanged with enclosing loop.
+; PROFIT-NEXT: ...
+; PROFIT: --- !Missed
+; PROFIT-NEXT: Pass: loop-interchange
+; PROFIT-NEXT: Name: InterchangeNotProfitable
+; PROFIT-NEXT: Function: interchange_05
+; PROFIT-NEXT: Args:
+; PROFIT-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
+; PROFIT-NEXT: ...
+define void @interchange_05() {
+entry:
+ br label %for2.preheader
+
+for2.preheader:
+ %i30 = phi i64 [ 1, %entry ], [ %i.next31, %for1.inc14 ]
+ br label %for2
+
+for2:
+ %j = phi i64 [ %i.next, %for2 ], [ 1, %for2.preheader ]
+ %j.prev = add nsw i64 %j, -1
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 0
+ %lv1 = load i32, i32* %arrayidx5
+ %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @B, i64 0, i64 %j, i64 %i30
+ %lv2 = load i32, i32* %arrayidx9
+ %add = add nsw i32 %lv1, %lv2
+ %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %j, i64 0
+ store i32 %add, i32* %arrayidx13
+ %i.next = add nuw nsw i64 %j, 1
+ %exitcond = icmp eq i64 %j, 99
+ br i1 %exitcond, label %for1.inc14, label %for2
+
+for1.inc14:
+ %i.next31 = add nuw nsw i64 %i30, 1
+ %exitcond33 = icmp eq i64 %i30, 99
+ br i1 %exitcond33, label %for.end16, label %for2.preheader
+
+for.end16:
+ ret void
+}
+