#define DEBUG_TYPE "loop-fusion"
-STATISTIC(FuseCounter, "Count number of loop fusions performed");
+STATISTIC(FuseCounter, "Loops fused");
STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
STATISTIC(InvalidPreheader, "Loop has invalid preheader");
STATISTIC(InvalidHeader, "Loop has invalid header");
STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
-STATISTIC(InvalidTripCount,
- "Loop does not have invariant backedge taken count");
+STATISTIC(UnknownTripCount, "Loop has unknown trip count");
STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
-STATISTIC(NonEqualTripCount, "Candidate trip counts are not the same");
-STATISTIC(NonAdjacent, "Candidates are not adjacent");
-STATISTIC(NonEmptyPreheader, "Candidate has a non-empty preheader");
+STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
+STATISTIC(NonAdjacent, "Loops are not adjacent");
+STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader");
+STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
enum FusionDependenceAnalysisChoice {
FUSION_DEPENDENCE_ANALYSIS_SCEV,
const DominatorTree *DT;
const PostDominatorTree *PDT;
+ OptimizationRemarkEmitter &ORE;
+
FusionCandidate(Loop *L, const DominatorTree *DT,
- const PostDominatorTree *PDT)
+ const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE)
: Preheader(L->getLoopPreheader()), Header(L->getHeader()),
ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
- Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT) {
+ Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT),
+ ORE(ORE) {
// Walk over all blocks in the loop and check for conditions that may
// prevent fusion. For each block, walk over all instructions and collect
// found, invalidate this object and return.
for (BasicBlock *BB : L->blocks()) {
if (BB->hasAddressTaken()) {
- AddressTakenBB++;
invalidate();
+ reportInvalidCandidate(AddressTakenBB);
return;
}
for (Instruction &I : *BB) {
if (I.mayThrow()) {
- MayThrowException++;
invalidate();
+ reportInvalidCandidate(MayThrowException);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
if (SI->isVolatile()) {
- ContainsVolatileAccess++;
invalidate();
+ reportInvalidCandidate(ContainsVolatileAccess);
return;
}
}
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (LI->isVolatile()) {
- ContainsVolatileAccess++;
invalidate();
+ reportInvalidCandidate(ContainsVolatileAccess);
return;
}
}
}
#endif
+  /// Decide whether the loop wrapped by this candidate can take part in fusion
+  /// at all. This is a per-loop check only: it says nothing about whether it
+  /// is *legal* to fuse this loop with some other particular loop.
+  ///
+  /// Returns true when isValid() holds, ScalarEvolution reports a
+  /// loop-invariant backedge-taken count for the loop, and the loop is in
+  /// loop-simplify form. Otherwise the matching statistic(s) are incremented
+  /// and false is returned.
+  bool isEligibleForFusion(ScalarEvolution &SE) const {
+    if (isValid()) {
+      // Require ScalarEvolution to be able to determine a trip count.
+      if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
+        LLVM_DEBUG(dbgs() << "Loop " << L->getName()
+                          << " trip count not computable!\n");
+        return reportInvalidCandidate(UnknownTripCount);
+      }
+
+      if (!L->isLoopSimplifyForm()) {
+        LLVM_DEBUG(dbgs() << "Loop " << L->getName()
+                          << " is not in simplified form!\n");
+        return reportInvalidCandidate(NotSimplifiedForm);
+      }
+
+      return true;
+    }
+
+    // The candidate is not valid. Several reasons may apply at once, so each
+    // one is counted independently. No remark is emitted on this path.
+    LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
+    auto CountIf = [](bool Applies, llvm::Statistic &Counter) {
+      if (Applies)
+        ++Counter;
+    };
+    CountIf(!Preheader, InvalidPreheader);
+    CountIf(!Header, InvalidHeader);
+    CountIf(!ExitingBlock, InvalidExitingBlock);
+    CountIf(!ExitBlock, InvalidExitBlock);
+    CountIf(!Latch, InvalidLatch);
+    CountIf(L->isInvalid(), InvalidLoop);
+    return false;
+  }
+
+
private:
// This is only used internally for now, to clear the MemWrites and MemReads
// list and setting Valid to false. I can't envision other uses of this right
MemReads.clear();
Valid = false;
}
+
+  /// Record that this loop was rejected as a fusion candidate.
+  ///
+  /// Increments \p Stat and emits an OptimizationRemarkAnalysis at the loop's
+  /// start location, using \p Stat's name as the remark name. The message is:
+  ///   [<function name>]: Loop is not a candidate for fusion: <Stat description>
+  ///
+  /// Always returns false, so callers can write
+  /// `return reportInvalidCandidate(SomeStat);`.
+  bool reportInvalidCandidate(llvm::Statistic &Stat) const {
+    using namespace ore;
+    // Both the loop and its preheader are dereferenced below.
+    assert(L && Preheader && "Fusion candidate not initialized properly!");
+    ++Stat;
+
+    const StringRef FnName = Preheader->getParent()->getName();
+    OptimizationRemarkAnalysis Remark(DEBUG_TYPE, Stat.getName(),
+                                      L->getStartLoc(), Preheader);
+    Remark << "[" << FnName << "]: "
+           << "Loop is not a candidate for fusion: " << Stat.getDesc();
+    ORE.emit(Remark);
+    return false;
+  }
};
inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
}
#endif
-static void reportLoopFusion(const FusionCandidate &FC0,
- const FusionCandidate &FC1,
- OptimizationRemarkEmitter &ORE) {
- using namespace ore;
- ORE.emit(
- OptimizationRemark(DEBUG_TYPE, "LoopFusion", FC0.Preheader->getParent())
- << "Fused " << NV("Cand1", StringRef(FC0.Preheader->getName()))
- << " with " << NV("Cand2", StringRef(FC1.Preheader->getName())));
-}
-
struct LoopFuser {
private:
// Sets of control flow equivalent fusion candidates for a given nest level.
return false;
}
- /// Determine if a fusion candidate (representing a loop) is eligible for
- /// fusion. Note that this only checks whether a single loop can be fused - it
- /// does not check whether it is *legal* to fuse two loops together.
- bool eligibleForFusion(const FusionCandidate &FC) const {
- if (!FC.isValid()) {
- LLVM_DEBUG(dbgs() << "FC " << FC << " has invalid CFG requirements!\n");
- if (!FC.Preheader)
- InvalidPreheader++;
- if (!FC.Header)
- InvalidHeader++;
- if (!FC.ExitingBlock)
- InvalidExitingBlock++;
- if (!FC.ExitBlock)
- InvalidExitBlock++;
- if (!FC.Latch)
- InvalidLatch++;
- if (FC.L->isInvalid())
- InvalidLoop++;
-
- return false;
- }
-
- // Require ScalarEvolution to be able to determine a trip count.
- if (!SE.hasLoopInvariantBackedgeTakenCount(FC.L)) {
- LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName()
- << " trip count not computable!\n");
- InvalidTripCount++;
- return false;
- }
-
- if (!FC.L->isLoopSimplifyForm()) {
- LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName()
- << " is not in simplified form!\n");
- NotSimplifiedForm++;
- return false;
- }
-
- return true;
- }
-
/// Iterate over all loops in the given loop set and identify the loops that
/// are eligible for fusion. Place all eligible fusion candidates into Control
/// Flow Equivalent sets, sorted by dominance.
void collectFusionCandidates(const LoopVector &LV) {
for (Loop *L : LV) {
- FusionCandidate CurrCand(L, &DT, &PDT);
- if (!eligibleForFusion(CurrCand))
+ FusionCandidate CurrCand(L, &DT, &PDT, ORE);
+ if (!CurrCand.isEligibleForFusion(SE))
continue;
// Go through each list in FusionCandidates and determine if L is control
if (!identicalTripCounts(*FC0, *FC1)) {
LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
"counts. Not fusing.\n");
- NonEqualTripCount++;
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEqualTripCount);
continue;
}
if (!isAdjacent(*FC0, *FC1)) {
LLVM_DEBUG(dbgs()
<< "Fusion candidates are not adjacent. Not fusing.\n");
- NonAdjacent++;
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
continue;
}
if (!isEmptyPreheader(*FC1)) {
LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
"preheader. Not fusing.\n");
- NonEmptyPreheader++;
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyPreheader);
continue;
}
if (!dependencesAllowFusion(*FC0, *FC1)) {
LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ InvalidDependencies);
continue;
}
LLVM_DEBUG(dbgs()
<< "\tFusion appears to be "
<< (BeneficialToFuse ? "" : "un") << "profitable!\n");
- if (!BeneficialToFuse)
+ if (!BeneficialToFuse) {
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ FusionNotBeneficial);
continue;
-
+ }
// All analysis has completed and has determined that fusion is legal
// and profitable. At this point, start transforming the code and
// perform fusion.
// Note this needs to be done *before* performFusion because
// performFusion will change the original loops, making it not
// possible to identify them after fusion is complete.
- reportLoopFusion(*FC0, *FC1, ORE);
+ reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter);
- FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT);
+ FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE);
FusedCand.verify();
- assert(eligibleForFusion(FusedCand) &&
+ assert(FusedCand.isEligibleForFusion(SE) &&
"Fused candidate should be eligible for fusion!");
// Notify the loop-depth-tree that these loops are not valid objects
- // anymore.
LDT.removeLoop(FC1->L);
CandidateSet.erase(FC0);
return FC0.L;
}
+
+  /// Emit a remark describing the outcome of a fusion attempt on a pair of
+  /// candidates, and increment the statistic associated with that outcome.
+  ///
+  /// \tparam RemarkKind selects which kind of remark is produced:
+  ///   - OptimizationRemarkMissed when two valid fusion candidates could not
+  ///     be fused.
+  ///   - OptimizationRemark when the two candidates are fused successfully.
+  /// \param FC0, FC1 the candidates; both must have a preheader.
+  /// \param Stat statistic to increment; its name becomes the remark name and
+  ///        its description is appended to the message.
+  ///
+  /// The remarks will be printed using the form:
+  ///    <path/filename>:<line number>:<column number>: [<function name>]:
+  ///       <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
+  template <typename RemarkKind>
+  void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
+                        llvm::Statistic &Stat) {
+    assert(FC0.Preheader && FC1.Preheader &&
+           "Expecting valid fusion candidates");
+    using namespace ore;
+    ++Stat;
+
+    const StringRef FirstName = FC0.Preheader->getName();
+    const StringRef SecondName = FC1.Preheader->getName();
+
+    // The remark is anchored at the first candidate: its start location and
+    // its preheader are passed to the remark constructor.
+    RemarkKind Remark(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
+                      FC0.Preheader);
+    Remark << "[" << FC0.Preheader->getParent()->getName() << "]: ";
+    Remark << NV("Cand1", FirstName) << " and " << NV("Cand2", SecondName);
+    Remark << ": " << Stat.getDesc();
+    ORE.emit(Remark);
+  }
};
struct LoopFuseLegacy : public FunctionPass {
--- /dev/null
+; RUN: opt -S -loop-fusion -pass-remarks-analysis=loop-fusion -disable-output < %s 2>&1 | FileCheck %s
+
+@B = common global [1024 x i32] zeroinitializer, align 16
+
+; Two back-to-back loops in unoptimized form: the induction variables live in
+; the allocas %i and %i1 and are reloaded and stored back on every iteration.
+; The body of the first loop (source line 6) performs a volatile store, which
+; corresponds to the first expected analysis remark. The second loop (source
+; line 10) is expected to be rejected for an unknown trip count - presumably
+; because its induction variable is kept in memory; confirm against
+; ScalarEvolution.
+; CHECK: remark: diagnostics_analysis.c:6:3: [test]: Loop is not a candidate for fusion: Loop contains a volatile access
+; CHECK: remark: diagnostics_analysis.c:10:3: [test]: Loop is not a candidate for fusion: Loop has unknown trip count
+define void @test(i32* %A, i32 %n) !dbg !15 {
+entry:
+ %A.addr = alloca i32*, align 8
+ %n.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ %i1 = alloca i32, align 4
+ store i32* %A, i32** %A.addr, align 8
+ store i32 %n, i32* %n.addr, align 4
+ %0 = bitcast i32* %i to i8*
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+; First loop: header %for.cond, body %for.body, latch %for.inc.
+for.cond: ; preds = %for.inc, %entry
+ %1 = load i32, i32* %i, align 4
+ %2 = load i32, i32* %n.addr, align 4
+ %cmp = icmp slt i32 %1, %2
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+ %3 = bitcast i32* %i to i8*, !dbg !42
+ br label %for.end
+
+for.body: ; preds = %for.cond
+ %4 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 %4, 3
+ %5 = load i32, i32* %i, align 4
+ %add = add nsw i32 %5, 3
+ %mul = mul nsw i32 %sub, %add
+ %6 = load i32, i32* %i, align 4
+ %rem = srem i32 %mul, %6
+ %7 = load i32*, i32** %A.addr, align 8
+ %8 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %8 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %7, i64 %idxprom
+ ; The volatile store below is the access named by the first expected remark.
+ store volatile i32 %rem, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %9 = load i32, i32* %i, align 4, !dbg !49
+ %inc = add nsw i32 %9, 1, !dbg !49
+ store i32 %inc, i32* %i, align 4, !dbg !49
+ br label %for.cond, !dbg !42, !llvm.loop !50
+
+for.end: ; preds = %for.cond.cleanup
+ %10 = bitcast i32* %i1 to i8*
+ store i32 0, i32* %i1, align 4
+ br label %for.cond2
+
+; Second loop: header %for.cond2, body %for.body5, latch %for.inc12.
+for.cond2: ; preds = %for.inc12, %for.end
+ %11 = load i32, i32* %i1, align 4
+ %12 = load i32, i32* %n.addr, align 4
+ %cmp3 = icmp slt i32 %11, %12
+ br i1 %cmp3, label %for.body5, label %for.cond.cleanup4
+
+for.cond.cleanup4: ; preds = %for.cond2
+ %13 = bitcast i32* %i1 to i8*
+ br label %for.end14
+
+for.body5: ; preds = %for.cond2
+ %14 = load i32, i32* %i1, align 4
+ %sub6 = sub nsw i32 %14, 3
+ %15 = load i32, i32* %i1, align 4
+ %add7 = add nsw i32 %15, 3
+ %mul8 = mul nsw i32 %sub6, %add7
+ %16 = load i32, i32* %i1, align 4
+ %rem9 = srem i32 %mul8, %16
+ %17 = load i32, i32* %i1, align 4
+ %idxprom10 = sext i32 %17 to i64
+ %arrayidx11 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %idxprom10
+ store i32 %rem9, i32* %arrayidx11, align 4
+ br label %for.inc12
+
+for.inc12: ; preds = %for.body5
+ %18 = load i32, i32* %i1, align 4
+ %inc13 = add nsw i32 %18, 1
+ store i32 %inc13, i32* %i1, align 4
+ br label %for.cond2, !dbg !59, !llvm.loop !67
+
+for.end14: ; preds = %for.cond.cleanup4
+ ret void
+}
+
+; The compile unit !2 is reachable from the subprogram !15 (unit: !2), so it
+; has to be listed in !llvm.dbg.cu. Without this entry the debug info is
+; treated as invalid and dropped, and the expected remarks could not carry the
+; diagnostics_analysis.c:<line>:<column> locations.
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!10, !11, !13}
+!llvm.ident = !{!14}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "B", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 9.0.0 (git@github.ibm.com:compiler/llvm-project.git c019c32c5a2b0ed4487a738337d35fd3f630ac0a)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: GNU)
+!3 = !DIFile(filename: "diagnostics_analysis.c", directory: "/tmp")
+!4 = !{}
+!5 = !{!0}
+!6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 32768, elements: !8)
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!8 = !{!9}
+!9 = !DISubrange(count: 1024)
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 2, !"Debug Info Version", i32 3}
+!13 = !{i32 7, !"PIC Level", i32 2}
+!14 = !{!"clang version 9.0.0 (git@github.ibm.com:compiler/llvm-project.git c019c32c5a2b0ed4487a738337d35fd3f630ac0a)"}
+!15 = distinct !DISubprogram(name: "test", scope: !3, file: !3, line: 5, type: !16, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !20)
+!16 = !DISubroutineType(types: !17)
+!17 = !{null, !18, !7}
+!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
+!19 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !7)
+!20 = !{!21, !22, !23, !25}
+!21 = !DILocalVariable(name: "A", arg: 1, scope: !15, file: !3, line: 5, type: !18)
+!22 = !DILocalVariable(name: "n", arg: 2, scope: !15, file: !3, line: 5, type: !7)
+!23 = !DILocalVariable(name: "i", scope: !24, file: !3, line: 6, type: !7)
+!24 = distinct !DILexicalBlock(scope: !15, file: !3, line: 6, column: 3)
+!25 = !DILocalVariable(name: "i", scope: !26, file: !3, line: 10, type: !7)
+!26 = distinct !DILexicalBlock(scope: !15, file: !3, line: 10, column: 3)
+!38 = distinct !DILexicalBlock(scope: !24, file: !3, line: 6, column: 3)
+!41 = !DILocation(line: 6, column: 3, scope: !24)
+!42 = !DILocation(line: 6, column: 3, scope: !38)
+!44 = distinct !DILexicalBlock(scope: !38, file: !3, line: 6, column: 31)
+!49 = !DILocation(line: 6, column: 27, scope: !38)
+!50 = distinct !{!50, !41, !51}
+!51 = !DILocation(line: 8, column: 3, scope: !24)
+!55 = distinct !DILexicalBlock(scope: !26, file: !3, line: 10, column: 3)
+!58 = !DILocation(line: 10, column: 3, scope: !26)
+!59 = !DILocation(line: 10, column: 3, scope: !55)
+!67 = distinct !{!67, !58, !68}
+!68 = !DILocation(line: 12, column: 3, scope: !26)
+!69 = !DILocation(line: 13, column: 1, scope: !15)
--- /dev/null
+; RUN: opt -S -loop-fusion -pass-remarks-missed=loop-fusion -disable-output < %s 2>&1 | FileCheck %s
+;
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@B = common global [1024 x i32] zeroinitializer, align 16, !dbg !0
+
+; Two loops that both iterate while their induction variable differs from 100.
+; The first loop leaves through %for.cond.cleanup, which only branches to
+; %for.end; %for.end is the block that branches into the second loop's header
+; (%for.cond2). The expected missed remark names %entry and %for.end as the two
+; candidates and reports that the loops are not adjacent.
+; CHECK: remark: diagnostics_missed.c:18:3: [non_adjacent]: entry and for.end: Loops are not adjacent
+define void @non_adjacent(i32* noalias %A) !dbg !67 {
+entry:
+ br label %for.cond
+
+; First loop: header %for.cond, body %for.body, latch %for.inc. Stores to %A.
+for.cond: ; preds = %for.inc, %entry
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ]
+ %exitcond1 = icmp ne i64 %i.0, 100
+ br i1 %exitcond1, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+ br label %for.end
+
+for.body: ; preds = %for.cond
+ %sub = add nsw i64 %i.0, -3
+ %add = add nuw nsw i64 %i.0, 3
+ %mul = mul nsw i64 %sub, %add
+ %rem = srem i64 %mul, %i.0
+ %conv = trunc i64 %rem to i32
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.0
+ store i32 %conv, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add nuw nsw i64 %i.0, 1, !dbg !86
+ br label %for.cond, !dbg !87, !llvm.loop !88
+
+for.end: ; preds = %for.cond.cleanup
+ br label %for.cond2
+
+; Second loop: header %for.cond2, body %for.body6, latch %for.inc13. Stores to @B.
+for.cond2: ; preds = %for.inc13, %for.end
+ %i1.0 = phi i64 [ 0, %for.end ], [ %inc14, %for.inc13 ]
+ %exitcond = icmp ne i64 %i1.0, 100
+ br i1 %exitcond, label %for.body6, label %for.cond.cleanup5
+
+for.cond.cleanup5: ; preds = %for.cond2
+ br label %for.end15
+
+for.body6: ; preds = %for.cond2
+ %sub7 = add nsw i64 %i1.0, -3
+ %add8 = add nuw nsw i64 %i1.0, 3
+ %mul9 = mul nsw i64 %sub7, %add8
+ %rem10 = srem i64 %mul9, %i1.0
+ %conv11 = trunc i64 %rem10 to i32
+ %arrayidx12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %i1.0
+ store i32 %conv11, i32* %arrayidx12, align 4
+ br label %for.inc13
+
+for.inc13: ; preds = %for.body6
+ %inc14 = add nuw nsw i64 %i1.0, 1, !dbg !100
+ br label %for.cond2, !dbg !101, !llvm.loop !102
+
+for.end15: ; preds = %for.cond.cleanup5
+ ret void
+}
+
+
+; Two loops with different exit bounds: the first loop's exit compare is
+; against 100 (%exitcond1), the second loop's is against 200 (%exitcond).
+; The expected missed remark names %entry and %for.end as the two candidates
+; and reports that the trip counts are not the same.
+; CHECK: remark: diagnostics_missed.c:28:3: [different_bounds]: entry and for.end: Loop trip counts are not the same
+define void @different_bounds(i32* noalias %A) !dbg !105 {
+entry:
+ br label %for.cond
+
+; First loop: runs while %i.0 != 100 and stores to %A.
+for.cond: ; preds = %for.inc, %entry
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ]
+ %exitcond1 = icmp ne i64 %i.0, 100
+ br i1 %exitcond1, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+ br label %for.end
+
+for.body: ; preds = %for.cond
+ %sub = add nsw i64 %i.0, -3
+ %add = add nuw nsw i64 %i.0, 3
+ %mul = mul nsw i64 %sub, %add
+ %rem = srem i64 %mul, %i.0
+ %conv = trunc i64 %rem to i32
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.0
+ store i32 %conv, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add nuw nsw i64 %i.0, 1, !dbg !123
+ br label %for.cond, !dbg !124, !llvm.loop !125
+
+for.end: ; preds = %for.cond.cleanup
+ br label %for.cond2
+
+; Second loop: runs while %i1.0 != 200 and stores to @B.
+for.cond2: ; preds = %for.inc13, %for.end
+ %i1.0 = phi i64 [ 0, %for.end ], [ %inc14, %for.inc13 ]
+ %exitcond = icmp ne i64 %i1.0, 200
+ br i1 %exitcond, label %for.body6, label %for.cond.cleanup5
+
+for.cond.cleanup5: ; preds = %for.cond2
+ br label %for.end15
+
+for.body6: ; preds = %for.cond2
+ %sub7 = add nsw i64 %i1.0, -3
+ %add8 = add nuw nsw i64 %i1.0, 3
+ %mul9 = mul nsw i64 %sub7, %add8
+ %rem10 = srem i64 %mul9, %i1.0
+ %conv11 = trunc i64 %rem10 to i32
+ %arrayidx12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %i1.0
+ store i32 %conv11, i32* %arrayidx12, align 4
+ br label %for.inc13
+
+for.inc13: ; preds = %for.body6
+ %inc14 = add nuw nsw i64 %i1.0, 1
+ br label %for.cond2, !dbg !138, !llvm.loop !139
+
+for.end15: ; preds = %for.cond.cleanup5
+ ret void
+}
+
+; Despite the function name, the expected missed remark here is about the
+; preheader rather than a dependence: %for.end, the block that branches into
+; the second loop's header, also contains a call to @llvm.dbg.value, so it is
+; not empty. Both loops iterate while their induction variable differs from 100.
+; CHECK: remark: diagnostics_missed.c:38:3: [negative_dependence]: entry and for.end: Loop has a non-empty preheader
+define void @negative_dependence(i32* noalias %A) !dbg !142 {
+entry:
+ br label %for.cond
+
+; First loop: stores the truncated induction variable to %A[%indvars.iv1].
+for.cond: ; preds = %for.inc, %entry
+ %indvars.iv1 = phi i64 [ %indvars.iv.next2, %for.inc ], [ 0, %entry ]
+ %exitcond3 = icmp ne i64 %indvars.iv1, 100
+ br i1 %exitcond3, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv1
+ %tmp = trunc i64 %indvars.iv1 to i32
+ store i32 %tmp, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %indvars.iv.next2 = add nuw nsw i64 %indvars.iv1, 1
+ br label %for.cond, !dbg !160, !llvm.loop !161
+
+for.end: ; preds = %for.cond
+ ; This debug intrinsic call is the extra instruction in the preheader.
+ call void @llvm.dbg.value(metadata i32 0, metadata !147, metadata !DIExpression()), !dbg !163
+ br label %for.cond2, !dbg !164
+
+; Second loop: loads %A[%indvars.iv + 1] and stores twice that value to @B.
+for.cond2: ; preds = %for.inc10, %for.end
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc10 ], [ 0, %for.end ]
+ %exitcond = icmp ne i64 %indvars.iv, 100
+ br i1 %exitcond, label %for.body5, label %for.end12
+
+for.body5: ; preds = %for.cond2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx7 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+ %tmp4 = load i32, i32* %arrayidx7, align 4
+ %mul = shl nsw i32 %tmp4, 1
+ %arrayidx9 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+ store i32 %mul, i32* %arrayidx9, align 4
+ br label %for.inc10
+
+for.inc10: ; preds = %for.body5
+ br label %for.cond2
+
+for.end12: ; preds = %for.cond2
+ ret void, !dbg !178
+}
+
+; The first loop accumulates the elements of %A into %sum.0. The second loop
+; divides each element of %A by %sum.0.lcssa, a phi that takes %sum.0 on entry
+; from the first loop, and stores the quotient to @B. The expected missed
+; remark reports that dependencies prevent fusion.
+; NOTE(review): no block named for.cond2.preheader exists in this input;
+; presumably it is created before loop fusion runs (e.g. by loop
+; simplification, since %for.cond branches straight into %for.cond2) - confirm.
+; CHECK: remark: diagnostics_missed.c:51:3: [sumTest]: entry and for.cond2.preheader: Dependencies prevent fusion
+define i32 @sumTest(i32* noalias %A) !dbg !179 {
+entry:
+ br label %for.cond
+
+; First loop: runs while %indvars.iv1 != 100; the work is done in the latch.
+for.cond: ; preds = %for.inc, %entry
+ %indvars.iv1 = phi i64 [ %indvars.iv.next2, %for.inc ], [ 0, %entry ]
+ %sum.0 = phi i32 [ 0, %entry ], [ %add, %for.inc ]
+ %exitcond3 = icmp ne i64 %indvars.iv1, 100
+ br i1 %exitcond3, label %for.body, label %for.cond2
+
+for.body: ; preds = %for.cond
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv1
+ %tmp = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %sum.0, %tmp
+ %indvars.iv.next2 = add nuw nsw i64 %indvars.iv1, 1
+ br label %for.cond, !dbg !199, !llvm.loop !200
+
+; Second loop: runs while %indvars.iv != 100 and consumes the first loop's sum.
+for.cond2: ; preds = %for.inc10, %for.cond
+ %sum.0.lcssa = phi i32 [ %sum.0, %for.cond ], [ %sum.0.lcssa, %for.inc10 ]
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc10 ], [ 0, %for.cond ]
+ %exitcond = icmp ne i64 %indvars.iv, 100
+ br i1 %exitcond, label %for.body5, label %for.end12
+
+for.body5: ; preds = %for.cond2
+ %arrayidx7 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp4 = load i32, i32* %arrayidx7, align 4
+ %div = sdiv i32 %tmp4, %sum.0.lcssa
+ %arrayidx9 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+ store i32 %div, i32* %arrayidx9, align 4
+ br label %for.inc10
+
+for.inc10: ; preds = %for.body5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ br label %for.cond2
+
+for.end12: ; preds = %for.cond2
+ ret i32 %sum.0.lcssa, !dbg !215
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!11, !12, !13, !14}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "B", scope: !2, file: !6, line: 46, type: !7, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 9.0.0 (git@github.ibm.com:compiler/llvm-project.git 23c4baaa9f5b33d2d52eda981d376c6b0a7a3180)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: GNU)
+!3 = !DIFile(filename: "diagnostics_missed.c", directory: "/tmp")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIFile(filename: "diagnostics_missed.c", directory: "/tmp")
+!7 = !DICompositeType(tag: DW_TAG_array_type, baseType: !8, size: 32768, elements: !9)
+!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!9 = !{!10}
+!10 = !DISubrange(count: 1024)
+!11 = !{i32 2, !"Dwarf Version", i32 4}
+!12 = !{i32 2, !"Debug Info Version", i32 3}
+!13 = !{i32 1, !"wchar_size", i32 4}
+!14 = !{i32 7, !"PIC Level", i32 2}
+!17 = !DISubroutineType(types: !18)
+!18 = !{null, !19}
+!19 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !20)
+!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64)
+!67 = distinct !DISubprogram(name: "non_adjacent", scope: !6, file: !6, line: 17, type: !17, scopeLine: 17, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !68)
+!68 = !{!69, !70, !73}
+!69 = !DILocalVariable(name: "A", arg: 1, scope: !67, file: !6, line: 17, type: !19)
+!70 = !DILocalVariable(name: "i", scope: !71, file: !6, line: 18, type: !72)
+!71 = distinct !DILexicalBlock(scope: !67, file: !6, line: 18, column: 3)
+!72 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+!73 = !DILocalVariable(name: "i", scope: !74, file: !6, line: 22, type: !72)
+!74 = distinct !DILexicalBlock(scope: !67, file: !6, line: 22, column: 3)
+!79 = distinct !DILexicalBlock(scope: !71, file: !6, line: 18, column: 3)
+!80 = !DILocation(line: 18, column: 3, scope: !71)
+!86 = !DILocation(line: 18, column: 30, scope: !79)
+!87 = !DILocation(line: 18, column: 3, scope: !79)
+!88 = distinct !{!88, !80, !89}
+!89 = !DILocation(line: 20, column: 3, scope: !71)
+!93 = distinct !DILexicalBlock(scope: !74, file: !6, line: 22, column: 3)
+!94 = !DILocation(line: 22, column: 3, scope: !74)
+!100 = !DILocation(line: 22, column: 30, scope: !93)
+!101 = !DILocation(line: 22, column: 3, scope: !93)
+!102 = distinct !{!102, !94, !103}
+!103 = !DILocation(line: 24, column: 3, scope: !74)
+!105 = distinct !DISubprogram(name: "different_bounds", scope: !6, file: !6, line: 27, type: !17, scopeLine: 27, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !106)
+!106 = !{!107, !108, !110}
+!107 = !DILocalVariable(name: "A", arg: 1, scope: !105, file: !6, line: 27, type: !19)
+!108 = !DILocalVariable(name: "i", scope: !109, file: !6, line: 28, type: !72)
+!109 = distinct !DILexicalBlock(scope: !105, file: !6, line: 28, column: 3)
+!110 = !DILocalVariable(name: "i", scope: !111, file: !6, line: 32, type: !72)
+!111 = distinct !DILexicalBlock(scope: !105, file: !6, line: 32, column: 3)
+!116 = distinct !DILexicalBlock(scope: !109, file: !6, line: 28, column: 3)
+!117 = !DILocation(line: 28, column: 3, scope: !109)
+!123 = !DILocation(line: 28, column: 30, scope: !116)
+!124 = !DILocation(line: 28, column: 3, scope: !116)
+!125 = distinct !{!125, !117, !126}
+!126 = !DILocation(line: 30, column: 3, scope: !109)
+!130 = distinct !DILexicalBlock(scope: !111, file: !6, line: 32, column: 3)
+!131 = !DILocation(line: 32, column: 3, scope: !111)
+!138 = !DILocation(line: 32, column: 3, scope: !130)
+!139 = distinct !{!139, !131, !140}
+!140 = !DILocation(line: 34, column: 3, scope: !111)
+!142 = distinct !DISubprogram(name: "negative_dependence", scope: !6, file: !6, line: 37, type: !17, scopeLine: 37, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !143)
+!143 = !{!144, !145, !147}
+!144 = !DILocalVariable(name: "A", arg: 1, scope: !142, file: !6, line: 37, type: !19)
+!145 = !DILocalVariable(name: "i", scope: !146, file: !6, line: 38, type: !8)
+!146 = distinct !DILexicalBlock(scope: !142, file: !6, line: 38, column: 3)
+!147 = !DILocalVariable(name: "i", scope: !148, file: !6, line: 42, type: !8)
+!148 = distinct !DILexicalBlock(scope: !142, file: !6, line: 42, column: 3)
+!153 = distinct !DILexicalBlock(scope: !146, file: !6, line: 38, column: 3)
+!154 = !DILocation(line: 38, column: 3, scope: !146)
+!160 = !DILocation(line: 38, column: 3, scope: !153)
+!161 = distinct !{!161, !154, !162}
+!162 = !DILocation(line: 40, column: 3, scope: !146)
+!163 = !DILocation(line: 0, scope: !148)
+!164 = !DILocation(line: 42, column: 8, scope: !148)
+!178 = !DILocation(line: 45, column: 1, scope: !142)
+!179 = distinct !DISubprogram(name: "sumTest", scope: !6, file: !6, line: 48, type: !180, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !182)
+!180 = !DISubroutineType(types: !181)
+!181 = !{!8, !19}
+!182 = !{!183, !184, !185, !187}
+!183 = !DILocalVariable(name: "A", arg: 1, scope: !179, file: !6, line: 48, type: !19)
+!184 = !DILocalVariable(name: "sum", scope: !179, file: !6, line: 49, type: !8)
+!185 = !DILocalVariable(name: "i", scope: !186, file: !6, line: 51, type: !8)
+!186 = distinct !DILexicalBlock(scope: !179, file: !6, line: 51, column: 3)
+!187 = !DILocalVariable(name: "i", scope: !188, file: !6, line: 54, type: !8)
+!188 = distinct !DILexicalBlock(scope: !179, file: !6, line: 54, column: 3)
+!193 = distinct !DILexicalBlock(scope: !186, file: !6, line: 51, column: 3)
+!194 = !DILocation(line: 51, column: 3, scope: !186)
+!199 = !DILocation(line: 51, column: 3, scope: !193)
+!200 = distinct !{!200, !194, !201}
+!201 = !DILocation(line: 52, column: 15, scope: !186)
+!215 = !DILocation(line: 57, column: 3, scope: !179)