void initializeLoopInfoWrapperPassPass(PassRegistry&);
void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&);
void initializeLoopInterchangePass(PassRegistry&);
+void initializeLoopFlattenLegacyPassPass(PassRegistry&);
void initializeLoopLoadEliminationPass(PassRegistry&);
void initializeLoopPassPass(PassRegistry&);
void initializeLoopPredicationLegacyPassPass(PassRegistry&);
(void) llvm::createLazyValueInfoPass();
(void) llvm::createLoopExtractorPass();
(void) llvm::createLoopInterchangePass();
+ (void) llvm::createLoopFlattenPass();
(void) llvm::createLoopPredicationPass();
(void) llvm::createLoopSimplifyPass();
(void) llvm::createLoopSimplifyCFGPass();
//===----------------------------------------------------------------------===//
//
+// LoopFlatten - This pass flattens nested loops into a single loop.
+//
+Pass *createLoopFlattenPass();
+
+//===----------------------------------------------------------------------===//
+//
// LoopStrengthReduce - This pass is strength reduces GEP instructions that use
// a loop's canonical induction variable as one of their indices.
//
--- /dev/null
+//===- LoopFlatten.h - Loop Flatten ---------------- -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface for the Loop Flatten Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H
+#define LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H
+
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+
+namespace llvm {
+
+/// Loop-flattening pass for the new pass manager. Merges a two-deep loop
+/// nest into a single loop when the induction variables are only used in
+/// the linear form "outer * limit + inner" and the combined trip count
+/// provably cannot overflow.
+class LoopFlattenPass : public PassInfoMixin<LoopFlattenPass> {
+public:
+  LoopFlattenPass() = default;
+
+  /// Run over loop \p L. Only acts when \p L has exactly one sub-loop;
+  /// otherwise all analyses are preserved.
+  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_LOOPFLATTEN_H
#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
#include "llvm/Transforms/Scalar/LoopDeletion.h"
#include "llvm/Transforms/Scalar/LoopDistribute.h"
+#include "llvm/Transforms/Scalar/LoopFlatten.h"
#include "llvm/Transforms/Scalar/LoopFuse.h"
#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
"enable-npm-unroll-and-jam", cl::init(false), cl::Hidden,
cl::desc("Enable the Unroll and Jam pass for the new PM (default = off)"));
+static cl::opt<bool> EnableLoopFlatten(
+ "enable-npm-loop-flatten", cl::init(false), cl::Hidden,
+ cl::desc("Enable the Loop flattening pass for the new PM (default = off)"));
+
static cl::opt<bool> EnableSyntheticCounts(
"enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
cl::desc("Run synthetic function entry count generation "
C(LPM2, Level);
LPM2.addPass(LoopDeletionPass());
+ if (EnableLoopFlatten)
+ LPM2.addPass(LoopFlattenPass());
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
// because it changes IR to makes profile annotation in back compile
// inaccurate. The normal unroller doesn't pay attention to forced full unroll
LOOP_PASS("no-op-loop", NoOpLoopPass())
LOOP_PASS("print", PrintLoopPass(dbgs()))
LOOP_PASS("loop-deletion", LoopDeletionPass())
+LOOP_PASS("loop-flatten", LoopFlattenPass())
LOOP_PASS("loop-simplifycfg", LoopSimplifyCFGPass())
LOOP_PASS("loop-reduce", LoopStrengthReducePass())
LOOP_PASS("indvars", IndVarSimplifyPass())
cl::init(false), cl::Hidden,
cl::desc("Enable Unroll And Jam Pass"));
+static cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false),
+ cl::Hidden,
+ cl::desc("Enable the LoopFlatten Pass"));
+
static cl::opt<bool>
EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
cl::desc("Enable preparation for ThinLTO."));
if (EnableLoopInterchange)
MPM.add(createLoopInterchangePass()); // Interchange loops
+ if (EnableLoopFlatten) {
+ MPM.add(createLoopFlattenPass()); // Flatten loops
+ MPM.add(createLoopSimplifyCFGPass());
+ }
// Unroll small loops
MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
PM.add(createLoopDeletionPass());
if (EnableLoopInterchange)
PM.add(createLoopInterchangePass());
+ if (EnableLoopFlatten)
+ PM.add(createLoopFlattenPass());
// Unroll small loops
PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
LoopIdiomRecognize.cpp
LoopInstSimplify.cpp
LoopInterchange.cpp
+ LoopFlatten.cpp
LoopLoadElimination.cpp
LoopPassManager.cpp
LoopPredication.cpp
--- /dev/null
+//===- LoopFlatten.cpp - Loop flattening pass------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass flattens pairs of nested loops into a single loop.
+//
+// The intention is to optimise loop nests like this, which together access an
+// array linearly:
+// for (int i = 0; i < N; ++i)
+// for (int j = 0; j < M; ++j)
+// f(A[i*M+j]);
+// into one loop:
+// for (int i = 0; i < (N*M); ++i)
+// f(A[i]);
+//
+// It can also flatten loops where the induction variables are not used in the
+// loop. This is only worth doing if the induction variables are only used in an
+// expression like i*M+j. If they had any other uses, we would have to insert a
+// div/mod to reconstruct the original values, so this wouldn't be profitable.
+//
+// We also need to prove that N*M will not overflow.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopFlatten.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-flatten"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<unsigned> RepeatedInstructionThreshold(
+ "loop-flatten-cost-threshold", cl::Hidden, cl::init(2),
+ cl::desc("Limit on the cost of instructions that can be repeated due to "
+ "loop flattening"));
+
+static cl::opt<bool>
+ AssumeNoOverflow("loop-flatten-assume-no-overflow", cl::Hidden,
+ cl::init(false),
+ cl::desc("Assume that the product of the two iteration "
+ "limits will never overflow"));
+
+// Finds the induction variable, increment and limit for a simple loop that we
+// can flatten. On success fills in InductionPHI, Limit, Increment and
+// BackBranch, and records the increment/compare/branch in
+// IterationInstructions. Returns false (outputs unspecified) if the loop
+// does not match the expected shape.
+static bool findLoopComponents(
+    Loop *L, SmallPtrSetImpl<Instruction *> &IterationInstructions,
+    PHINode *&InductionPHI, Value *&Limit, BinaryOperator *&Increment,
+    BranchInst *&BackBranch, ScalarEvolution *SE) {
+  LLVM_DEBUG(dbgs() << "Finding components of loop: " << L->getName() << "\n");
+
+  if (!L->isLoopSimplifyForm()) {
+    LLVM_DEBUG(dbgs() << "Loop is not in normal form\n");
+    return false;
+  }
+
+  // There must be exactly one exiting block, and it must be the same as the
+  // latch.
+  BasicBlock *Latch = L->getLoopLatch();
+  if (L->getExitingBlock() != Latch) {
+    LLVM_DEBUG(dbgs() << "Exiting and latch block are different\n");
+    return false;
+  }
+  // Latch block must end in a conditional branch.
+  BackBranch = dyn_cast<BranchInst>(Latch->getTerminator());
+  if (!BackBranch || !BackBranch->isConditional()) {
+    LLVM_DEBUG(dbgs() << "Could not find back-branch\n");
+    return false;
+  }
+  IterationInstructions.insert(BackBranch);
+  LLVM_DEBUG(dbgs() << "Found back branch: "; BackBranch->dump());
+  bool ContinueOnTrue = L->contains(BackBranch->getSuccessor(0));
+
+  // Find the induction PHI. If there is no induction PHI, we can't do the
+  // transformation. TODO: could other variables trigger this? Do we have to
+  // search for the best one?
+  InductionPHI = nullptr;
+  for (PHINode &PHI : L->getHeader()->phis()) {
+    InductionDescriptor ID;
+    if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) {
+      InductionPHI = &PHI;
+      LLVM_DEBUG(dbgs() << "Found induction PHI: "; InductionPHI->dump());
+      break;
+    }
+  }
+  if (!InductionPHI) {
+    LLVM_DEBUG(dbgs() << "Could not find induction PHI\n");
+    return false;
+  }
+
+  // When the branch continues looping on true, accept NE/ULT ("keep going
+  // while not done"); when it exits on true, only EQ ("stop when done").
+  auto IsValidPredicate = [&](ICmpInst::Predicate Pred) {
+    if (ContinueOnTrue)
+      return Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT;
+    else
+      return Pred == CmpInst::ICMP_EQ;
+  };
+
+  // Find Compare and make sure it is valid. The compare must feed only the
+  // back-branch, otherwise rewriting the trip count would affect other users.
+  ICmpInst *Compare = dyn_cast<ICmpInst>(BackBranch->getCondition());
+  if (!Compare || !IsValidPredicate(Compare->getUnsignedPredicate()) ||
+      Compare->hasNUsesOrMore(2)) {
+    LLVM_DEBUG(dbgs() << "Could not find valid comparison\n");
+    return false;
+  }
+  IterationInstructions.insert(Compare);
+  LLVM_DEBUG(dbgs() << "Found comparison: "; Compare->dump());
+
+  // Find increment and limit from the compare. The increment must be
+  // "IV + 1" on either side of the compare; the swapped operand order is
+  // only accepted for ICMP_NE, where it does not change the meaning.
+  Increment = nullptr;
+  if (match(Compare->getOperand(0),
+            m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) {
+    Increment = dyn_cast<BinaryOperator>(Compare->getOperand(0));
+    Limit = Compare->getOperand(1);
+  } else if (Compare->getUnsignedPredicate() == CmpInst::ICMP_NE &&
+             match(Compare->getOperand(1),
+                   m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) {
+    Increment = dyn_cast<BinaryOperator>(Compare->getOperand(1));
+    Limit = Compare->getOperand(0);
+  }
+  // The increment may only be used by the PHI (back-edge) and the compare;
+  // any other use would observe the per-iteration value we are removing.
+  if (!Increment || Increment->hasNUsesOrMore(3)) {
+    LLVM_DEBUG(dbgs() << "Could not find valid increment\n");
+    return false;
+  }
+  IterationInstructions.insert(Increment);
+  LLVM_DEBUG(dbgs() << "Found increment: "; Increment->dump());
+  LLVM_DEBUG(dbgs() << "Found limit: "; Limit->dump());
+
+  assert(InductionPHI->getNumIncomingValues() == 2);
+  assert(InductionPHI->getIncomingValueForBlock(Latch) == Increment &&
+         "PHI value is not increment inst");
+
+  // The IV must start at zero, otherwise the linearised index would need an
+  // offset. Dump the incoming value rather than CI: CI is null whenever the
+  // start value is not a ConstantInt at all, and dereferencing it in the
+  // debug output would crash.
+  Value *Start = InductionPHI->getIncomingValueForBlock(L->getLoopPreheader());
+  auto *CI = dyn_cast<ConstantInt>(Start);
+  if (!CI || !CI->isZero()) {
+    LLVM_DEBUG(dbgs() << "PHI value is not zero: "; Start->dump());
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Successfully found all loop components\n");
+  return true;
+}
+
+// Checks that every PHI node in the inner and outer loop headers is safe
+// under flattening, and fills InnerPHIsToTransform with the inner-header
+// PHIs whose latch incoming value must be dropped by the transformation.
+// NOTE: TTI is not used by the checks below.
+static bool checkPHIs(Loop *OuterLoop, Loop *InnerLoop,
+                      SmallPtrSetImpl<PHINode *> &InnerPHIsToTransform,
+                      PHINode *InnerInductionPHI, PHINode *OuterInductionPHI,
+                      TargetTransformInfo *TTI) {
+  // All PHIs in the inner and outer headers must either be:
+  // - The induction PHI, which we are going to rewrite as one induction in
+  //   the new loop. This is already checked by findLoopComponents.
+  // - An outer header PHI with all incoming values from outside the loop.
+  //   LoopSimplify guarantees we have a pre-header, so we don't need to
+  //   worry about that here.
+  // - Pairs of PHIs in the inner and outer headers, which implement a
+  //   loop-carried dependency that will still be valid in the new loop. To
+  //   be valid, this variable must be modified only in the inner loop.
+
+  // The set of PHI nodes in the outer loop header that we know will still be
+  // valid after the transformation. These will not need to be modified (with
+  // the exception of the induction variable), but we do need to check that
+  // there are no unsafe PHI nodes.
+  SmallPtrSet<PHINode *, 4> SafeOuterPHIs;
+  SafeOuterPHIs.insert(OuterInductionPHI);
+
+  // Check that all PHI nodes in the inner loop header match one of the valid
+  // patterns.
+  for (PHINode &InnerPHI : InnerLoop->getHeader()->phis()) {
+    // The induction PHIs break these rules, and that's OK because we treat
+    // them specially when doing the transformation.
+    if (&InnerPHI == InnerInductionPHI)
+      continue;
+
+    // Each inner loop PHI node must have two incoming values/blocks - one
+    // from the pre-header, and one from the latch.
+    assert(InnerPHI.getNumIncomingValues() == 2);
+    Value *PreHeaderValue =
+        InnerPHI.getIncomingValueForBlock(InnerLoop->getLoopPreheader());
+    Value *LatchValue =
+        InnerPHI.getIncomingValueForBlock(InnerLoop->getLoopLatch());
+
+    // The incoming value from the outer loop must be the PHI node in the
+    // outer loop header, with no modifications made in the top of the outer
+    // loop.
+    PHINode *OuterPHI = dyn_cast<PHINode>(PreHeaderValue);
+    if (!OuterPHI || OuterPHI->getParent() != OuterLoop->getHeader()) {
+      LLVM_DEBUG(dbgs() << "value modified in top of outer loop\n");
+      return false;
+    }
+
+    // The other incoming value must come from the inner loop, without any
+    // modifications in the tail end of the outer loop. We are in LCSSA form,
+    // so this will actually be a PHI in the inner loop's exit block, which
+    // only uses values from inside the inner loop.
+    PHINode *LCSSAPHI = dyn_cast<PHINode>(
+        OuterPHI->getIncomingValueForBlock(OuterLoop->getLoopLatch()));
+    if (!LCSSAPHI) {
+      LLVM_DEBUG(dbgs() << "could not find LCSSA PHI\n");
+      return false;
+    }
+
+    // The value used by the LCSSA PHI must be the same one that the inner
+    // loop's PHI uses. (hasConstantValue() presumably yields the unique
+    // incoming value when all incoming values agree, and null otherwise —
+    // verify against PHINode::hasConstantValue.)
+    if (LCSSAPHI->hasConstantValue() != LatchValue) {
+      LLVM_DEBUG(
+          dbgs() << "LCSSA PHI incoming value does not match latch value\n");
+      return false;
+    }
+
+    LLVM_DEBUG(dbgs() << "PHI pair is safe:\n");
+    LLVM_DEBUG(dbgs() << "  Inner: "; InnerPHI.dump());
+    LLVM_DEBUG(dbgs() << "  Outer: "; OuterPHI->dump());
+    SafeOuterPHIs.insert(OuterPHI);
+    InnerPHIsToTransform.insert(&InnerPHI);
+  }
+
+  // Finally, any outer-header PHI not proven safe above makes the whole
+  // transformation invalid.
+  for (PHINode &OuterPHI : OuterLoop->getHeader()->phis()) {
+    if (!SafeOuterPHIs.count(&OuterPHI)) {
+      LLVM_DEBUG(dbgs() << "found unsafe PHI in outer loop: "; OuterPHI.dump());
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Legality and profitability check for the instructions that live in the
+// outer loop but not the inner loop: they must be speculatable (they will
+// execute more often after flattening) and their total repeated cost must
+// stay under RepeatedInstructionThreshold.
+static bool
+checkOuterLoopInsts(Loop *OuterLoop, Loop *InnerLoop,
+                    SmallPtrSetImpl<Instruction *> &IterationInstructions,
+                    Value *InnerLimit, PHINode *OuterPHI,
+                    TargetTransformInfo *TTI) {
+  // Check for instructions in the outer but not inner loop. If any of these
+  // have side-effects then this transformation is not legal, and if there is
+  // a significant amount of code here which can't be optimised out then it's
+  // not profitable (as these instructions would get executed for each
+  // iteration of the inner loop).
+  unsigned RepeatedInstrCost = 0;
+  for (auto *B : OuterLoop->getBlocks()) {
+    if (InnerLoop->contains(B))
+      continue;
+
+    for (auto &I : *B) {
+      if (!isa<PHINode>(&I) && !I.isTerminator() &&
+          !isSafeToSpeculativelyExecute(&I)) {
+        LLVM_DEBUG(dbgs() << "Cannot flatten because instruction may have "
+                             "side effects: ";
+                   I.dump());
+        return false;
+      }
+      // The execution count of the outer loop's iteration instructions
+      // (increment, compare and branch) will be increased, but the
+      // equivalent instructions will be removed from the inner loop, so
+      // they make a net difference of zero.
+      if (IterationInstructions.count(&I))
+        continue;
+      // The unconditional branch to the inner loop's header will turn into
+      // a fall-through, so adds no cost.
+      BranchInst *Br = dyn_cast<BranchInst>(&I);
+      if (Br && Br->isUnconditional() &&
+          Br->getSuccessor(0) == InnerLoop->getHeader())
+        continue;
+      // Multiplies of the outer iteration variable and inner iteration
+      // count will be optimised out.
+      if (match(&I, m_c_Mul(m_Specific(OuterPHI), m_Specific(InnerLimit))))
+        continue;
+      // Everything else is charged its size/latency cost.
+      int Cost = TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
+      LLVM_DEBUG(dbgs() << "Cost " << Cost << ": "; I.dump());
+      RepeatedInstrCost += Cost;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "Cost of instructions that will be repeated: "
+                    << RepeatedInstrCost << "\n");
+  // Bail out if flattening the loops would cause instructions in the outer
+  // loop but not in the inner loop to be executed extra times.
+  if (RepeatedInstrCost > RepeatedInstructionThreshold)
+    return false;
+
+  return true;
+}
+
+// Checks that every use of both induction variables matches the linear form
+// (OuterPHI * InnerLimit) + InnerPHI, collecting those add instructions in
+// LinearIVUses so they can later be replaced by the flattened IV.
+static bool checkIVUsers(PHINode *InnerPHI, PHINode *OuterPHI,
+                         BinaryOperator *InnerIncrement,
+                         BinaryOperator *OuterIncrement, Value *InnerLimit,
+                         SmallPtrSetImpl<Value *> &LinearIVUses) {
+  // We require all uses of both induction variables to match this pattern:
+  //
+  //   (OuterPHI * InnerLimit) + InnerPHI
+  //
+  // Any uses of the induction variables not matching that pattern would
+  // require a div/mod to reconstruct in the flattened loop, so the
+  // transformation wouldn't be profitable.
+
+  // Check that all uses of the inner loop's induction variable match the
+  // expected pattern, recording the uses of the outer IV.
+  SmallPtrSet<Value *, 4> ValidOuterPHIUses;
+  for (User *U : InnerPHI->users()) {
+    // The increment feeding the PHI's back-edge is expected; skip it.
+    if (U == InnerIncrement)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
+
+    Value *MatchedMul, *MatchedItCount;
+    if (match(U, m_c_Add(m_Specific(InnerPHI), m_Value(MatchedMul))) &&
+        match(MatchedMul,
+              m_c_Mul(m_Specific(OuterPHI), m_Value(MatchedItCount))) &&
+        MatchedItCount == InnerLimit) {
+      LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+      ValidOuterPHIUses.insert(MatchedMul);
+      LinearIVUses.insert(U);
+    } else {
+      LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+      return false;
+    }
+  }
+
+  // Check that there are no uses of the outer IV other than the ones found
+  // as part of the pattern above.
+  for (User *U : OuterPHI->users()) {
+    if (U == OuterIncrement)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
+
+    if (!ValidOuterPHIUses.count(U)) {
+      LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+      return false;
+    } else {
+      LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "Found " << LinearIVUses.size()
+                    << " value(s) that can be replaced:\n";
+             for (Value *V : LinearIVUses) {
+               dbgs() << "  ";
+               V->dump();
+             });
+
+  return true;
+}
+
+// Return an OverflowResult dependent on whether overflow of the
+// multiplication of InnerLimit and OuterLimit can be assumed not to happen.
+static OverflowResult checkOverflow(Loop *OuterLoop, Value *InnerLimit,
+                                    Value *OuterLimit,
+                                    SmallPtrSetImpl<Value *> &LinearIVUses,
+                                    DominatorTree *DT, AssumptionCache *AC) {
+  Function *F = OuterLoop->getHeader()->getParent();
+  const DataLayout &DL = F->getParent()->getDataLayout();
+
+  // For debugging/testing: -loop-flatten-assume-no-overflow skips the proof.
+  if (AssumeNoOverflow)
+    return OverflowResult::NeverOverflows;
+
+  // Check if the multiply could not overflow due to known ranges of the
+  // input values.
+  OverflowResult OR = computeOverflowForUnsignedMul(
+      InnerLimit, OuterLimit, DL, AC,
+      OuterLoop->getLoopPreheader()->getTerminator(), DT);
+  if (OR != OverflowResult::MayOverflow)
+    return OR;
+
+  // Value-range analysis was inconclusive; fall back to a UB argument based
+  // on how the linearised IV is consumed.
+  for (Value *V : LinearIVUses) {
+    for (Value *U : V->users()) {
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
+        // The IV is used as the operand of a GEP, and the IV is at least as
+        // wide as the address space of the GEP. In this case, the GEP would
+        // wrap around the address space before the IV increment wraps, which
+        // would be UB.
+        if (GEP->isInBounds() &&
+            V->getType()->getIntegerBitWidth() >=
+                DL.getPointerTypeSizeInBits(GEP->getType())) {
+          LLVM_DEBUG(
+              dbgs() << "use of linear IV would be UB if overflow occurred: ";
+              GEP->dump());
+          return OverflowResult::NeverOverflows;
+        }
+      }
+    }
+  }
+
+  return OverflowResult::MayOverflow;
+}
+
+// Attempts to flatten the (OuterLoop, InnerLoop) pair into a single loop.
+// Runs all legality/profitability checks first; if any fails, returns false
+// without modifying the IR. On success, rewrites the outer loop's trip
+// count, removes the inner back-edge, replaces the linear IV uses, reports
+// the inner loop as deleted via markLoopAsDeleted, and returns true.
+static bool FlattenLoopPair(Loop *OuterLoop, Loop *InnerLoop, DominatorTree *DT,
+                            LoopInfo *LI, ScalarEvolution *SE,
+                            AssumptionCache *AC, TargetTransformInfo *TTI,
+                            std::function<void(Loop *)> markLoopAsDeleted) {
+  Function *F = OuterLoop->getHeader()->getParent();
+
+  LLVM_DEBUG(dbgs() << "Loop flattening running on outer loop "
+                    << OuterLoop->getHeader()->getName() << " and inner loop "
+                    << InnerLoop->getHeader()->getName() << " in "
+                    << F->getName() << "\n");
+
+  SmallPtrSet<Instruction *, 8> IterationInstructions;
+
+  PHINode *InnerInductionPHI, *OuterInductionPHI;
+  Value *InnerLimit, *OuterLimit;
+  BinaryOperator *InnerIncrement, *OuterIncrement;
+  BranchInst *InnerBranch, *OuterBranch;
+
+  // Both loops must have the canonical shape (simplify form, single
+  // exiting latch, IV starting at zero with step one).
+  if (!findLoopComponents(InnerLoop, IterationInstructions, InnerInductionPHI,
+                          InnerLimit, InnerIncrement, InnerBranch, SE))
+    return false;
+  if (!findLoopComponents(OuterLoop, IterationInstructions, OuterInductionPHI,
+                          OuterLimit, OuterIncrement, OuterBranch, SE))
+    return false;
+
+  // Both of the loop limit values must be invariant in the outer loop
+  // (non-instructions are all inherently invariant).
+  if (!OuterLoop->isLoopInvariant(InnerLimit)) {
+    LLVM_DEBUG(dbgs() << "inner loop limit not invariant\n");
+    return false;
+  }
+  if (!OuterLoop->isLoopInvariant(OuterLimit)) {
+    LLVM_DEBUG(dbgs() << "outer loop limit not invariant\n");
+    return false;
+  }
+
+  SmallPtrSet<PHINode *, 4> InnerPHIsToTransform;
+  if (!checkPHIs(OuterLoop, InnerLoop, InnerPHIsToTransform, InnerInductionPHI,
+                 OuterInductionPHI, TTI))
+    return false;
+
+  // FIXME: it should be possible to handle different types correctly.
+  if (InnerInductionPHI->getType() != OuterInductionPHI->getType())
+    return false;
+
+  if (!checkOuterLoopInsts(OuterLoop, InnerLoop, IterationInstructions,
+                           InnerLimit, OuterInductionPHI, TTI))
+    return false;
+
+  // Find the values in the loop that can be replaced with the linearized
+  // induction variable, and check that there are no other uses of the inner
+  // or outer induction variable. If there were, we could still do this
+  // transformation, but we'd have to insert a div/mod to calculate the
+  // original IVs, so it wouldn't be profitable.
+  SmallPtrSet<Value *, 4> LinearIVUses;
+  if (!checkIVUsers(InnerInductionPHI, OuterInductionPHI, InnerIncrement,
+                    OuterIncrement, InnerLimit, LinearIVUses))
+    return false;
+
+  // Check if the new iteration variable might overflow. In this case, we
+  // need to version the loop, and select the original version at runtime if
+  // the iteration space is too large.
+  // TODO: We currently don't version the loop.
+  // TODO: it might be worth using a wider iteration variable rather than
+  // versioning the loop, if a wide enough type is legal.
+  bool MustVersionLoop = true;
+  OverflowResult OR =
+      checkOverflow(OuterLoop, InnerLimit, OuterLimit, LinearIVUses, DT, AC);
+  if (OR == OverflowResult::AlwaysOverflowsHigh ||
+      OR == OverflowResult::AlwaysOverflowsLow) {
+    LLVM_DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n");
+    return false;
+  } else if (OR == OverflowResult::MayOverflow) {
+    LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
+  } else {
+    LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
+    MustVersionLoop = false;
+  }
+
+  // We cannot safely flatten the loop. Exit now.
+  if (MustVersionLoop)
+    return false;
+
+  // Do the actual transformation.
+  LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n");
+
+  // Emit an optimisation remark so -Rpass users can see what happened.
+  {
+    using namespace ore;
+    OptimizationRemark Remark(DEBUG_TYPE, "Flattened", InnerLoop->getStartLoc(),
+                              InnerLoop->getHeader());
+    OptimizationRemarkEmitter ORE(F);
+    Remark << "Flattened into outer loop";
+    ORE.emit(Remark);
+  }
+
+  // The flattened loop iterates InnerLimit * OuterLimit times; materialise
+  // that product in the preheader.
+  Value *NewTripCount =
+      BinaryOperator::CreateMul(InnerLimit, OuterLimit, "flatten.tripcount",
+                                OuterLoop->getLoopPreheader()->getTerminator());
+  LLVM_DEBUG(dbgs() << "Created new trip count in preheader: ";
+             NewTripCount->dump());
+
+  // Fix up PHI nodes that take values from the inner loop back-edge, which
+  // we are about to remove.
+  InnerInductionPHI->removeIncomingValue(InnerLoop->getLoopLatch());
+  for (PHINode *PHI : InnerPHIsToTransform)
+    PHI->removeIncomingValue(InnerLoop->getLoopLatch());
+
+  // Modify the trip count of the outer loop to be the product of the two
+  // trip counts.
+  cast<User>(OuterBranch->getCondition())->setOperand(1, NewTripCount);
+
+  // Replace the inner loop backedge with an unconditional branch to the exit.
+  BasicBlock *InnerExitBlock = InnerLoop->getExitBlock();
+  BasicBlock *InnerExitingBlock = InnerLoop->getExitingBlock();
+  InnerExitingBlock->getTerminator()->eraseFromParent();
+  BranchInst::Create(InnerExitBlock, InnerExitingBlock);
+  DT->deleteEdge(InnerExitingBlock, InnerLoop->getHeader());
+
+  // Replace all uses of the polynomial calculated from the two induction
+  // variables with the one new one.
+  for (Value *V : LinearIVUses)
+    V->replaceAllUsesWith(OuterInductionPHI);
+
+  // Tell LoopInfo, SCEV and the pass manager that the inner loop has been
+  // deleted, and that any information they have about the outer loop is
+  // invalidated.
+  markLoopAsDeleted(InnerLoop);
+  SE->forgetLoop(OuterLoop);
+  SE->forgetLoop(InnerLoop);
+  LI->erase(InnerLoop);
+
+  return true;
+}
+
+// New-PM entry point: flatten \p L with its single sub-loop, if possible.
+PreservedAnalyses LoopFlattenPass::run(Loop &L, LoopAnalysisManager &AM,
+                                       LoopStandardAnalysisResults &AR,
+                                       LPMUpdater &Updater) {
+  // Only a loop with exactly one sub-loop is a candidate pair.
+  if (L.getSubLoops().size() != 1)
+    return PreservedAnalyses::all();
+
+  // Capture the inner loop's name up front: the transformation erases the
+  // inner loop, and the updater needs the name afterwards.
+  Loop *InnerLoop = *L.begin();
+  std::string LoopName(InnerLoop->getName());
+  if (!FlattenLoopPair(
+          &L, InnerLoop, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI,
+          [&](Loop *L) { Updater.markLoopAsDeleted(*L, LoopName); }))
+    return PreservedAnalyses::all();
+  return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+// Legacy pass manager wrapper around the loop-flattening transformation.
+class LoopFlattenLegacyPass : public LoopPass {
+public:
+  static char ID; // Pass ID, replacement for typeid
+  LoopFlattenLegacyPass() : LoopPass(ID) {
+    initializeLoopFlattenLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  // Possibly flatten loop L into its child.
+  bool runOnLoop(Loop *L, LPPassManager &) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // Standard loop-pass analyses, plus TTI (cost model) and the
+    // assumption cache (overflow check); both are preserved.
+    getLoopAnalysisUsage(AU);
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addPreserved<TargetTransformInfoWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addPreserved<AssumptionCacheTracker>();
+  }
+};
+} // namespace
+
+char LoopFlattenLegacyPass::ID = 0;
+// Register the pass and its analysis dependencies with the legacy PM.
+INITIALIZE_PASS_BEGIN(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
+                    false, false)
+
+// Factory used by the pass-builder and the C API binding.
+Pass *llvm::createLoopFlattenPass() { return new LoopFlattenLegacyPass(); }
+
+// Legacy-PM entry point: flatten \p L with its single sub-loop, if possible.
+bool LoopFlattenLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipLoop(L))
+    return false;
+
+  // Only a loop with exactly one sub-loop is a candidate pair.
+  if (L->getSubLoops().size() != 1)
+    return false;
+
+  ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  // NOTE(review): the dominator tree is fetched with getAnalysisIfAvailable,
+  // so DT may be null here, yet FlattenLoopPair calls DT->deleteEdge
+  // unconditionally when it rewrites the CFG — confirm the loop pass
+  // manager always schedules DominatorTreeWrapperPass, or make it required.
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  auto &TTIP = getAnalysis<TargetTransformInfoWrapperPass>();
+  TargetTransformInfo *TTI = &TTIP.getTTI(*L->getHeader()->getParent());
+  AssumptionCache *AC =
+      &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+          *L->getHeader()->getParent());
+
+  Loop *InnerLoop = *L->begin();
+  return FlattenLoopPair(L, InnerLoop, DT, LI, SE, AC, TTI,
+                         [&](Loop *L) { LPM.markLoopAsDeleted(*L); });
+}
initializeLoopAccessLegacyAnalysisPass(Registry);
initializeLoopInstSimplifyLegacyPassPass(Registry);
initializeLoopInterchangePass(Registry);
+ initializeLoopFlattenLegacyPassPass(Registry);
initializeLoopPredicationLegacyPassPass(Registry);
initializeLoopRotateLegacyPassPass(Registry);
initializeLoopStrengthReducePass(Registry);
unwrap(PM)->add(createLoopDeletionPass());
}
+/// C API binding: appends the legacy LoopFlatten pass to the pass manager.
+void LLVMAddLoopFlattenPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopFlattenPass());
+}
+
void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopIdiomPass());
}
--- /dev/null
+; RUN: opt < %s -S -loop-flatten -debug-only=loop-flatten 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Every function in this file has a reason that it can't be transformed.
+
+; CHECK-NOT: Checks all passed, doing the transformation
+
+; Outer loop does not start at zero
+define void @test_1(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+ %cmp25 = icmp sgt i32 %N, 0
+ br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup
+
+for.body4.lr.ph:
+ %i.026 = phi i32 [ %inc10, %for.cond.cleanup3 ], [ 1, %entry ]
+ %mul = mul nsw i32 %i.026, %N
+ br label %for.body4
+
+for.body4:
+ %j.024 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ]
+ %add = add nsw i32 %j.024, %mul
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+ %0 = load i32, i32* %arrayidx, align 4
+ %mul5 = mul nsw i32 %0, %scale
+ %arrayidx8 = getelementptr inbounds i32, i32* %C, i32 %add
+ store i32 %mul5, i32* %arrayidx8, align 4
+ %inc = add nuw nsw i32 %j.024, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
+
+for.cond.cleanup3:
+ %inc10 = add nuw nsw i32 %i.026, 1
+ %exitcond27 = icmp eq i32 %inc10, %N
+ br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph
+
+for.cond.cleanup:
+ ret void
+}
+
+; Inner loop does not start at zero
+define void @test_2(i32 %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+ %cmp25 = icmp sgt i32 %N, 0
+ br i1 %cmp25, label %for.body4.lr.ph, label %for.cond.cleanup
+
+for.body4.lr.ph:
+ %i.026 = phi i32 [ %inc10, %for.cond.cleanup3 ], [ 0, %entry ]
+ %mul = mul nsw i32 %i.026, %N
+ br label %for.body4
+
+for.body4:
+ %j.024 = phi i32 [ 1, %for.body4.lr.ph ], [ %inc, %for.body4 ]
+ %add = add nsw i32 %j.024, %mul
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+ %0 = load i32, i32* %arrayidx, align 4
+ %mul5 = mul nsw i32 %0, %scale
+ %arrayidx8 = getelementptr inbounds i32, i32* %C, i32 %add
+ store i32 %mul5, i32* %arrayidx8, align 4
+ %inc = add nuw nsw i32 %j.024, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
+
+for.cond.cleanup3:
+ %inc10 = add nuw nsw i32 %i.026, 1
+ %exitcond27 = icmp eq i32 %inc10, %N
+ br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph
+
+for.cond.cleanup:
+ ret void
+}
+
+; Outer IV used directly
+define hidden void @test_3(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+ %conv = zext i16 %N to i32
+ %cmp25 = icmp eq i16 %N, 0
+ br i1 %cmp25, label %for.cond.cleanup, label %for.body.lr.ph.split.us
+
+for.body.lr.ph.split.us: ; preds = %entry
+ br label %for.body.us
+
+for.body.us: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us
+ %i.026.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.cond.cleanup6_crit_edge.us ]
+ %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i.026.us
+ %mul9.us = mul nuw nsw i32 %i.026.us, %conv
+ br label %for.body7.us
+
+for.body7.us: ; preds = %for.body.us, %for.body7.us
+ %j.024.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ]
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %mul.us = mul nsw i32 %0, %scale
+ %add.us = add nuw nsw i32 %j.024.us, %mul9.us
+ %arrayidx10.us = getelementptr inbounds i32, i32* %C, i32 %add.us
+ store i32 %mul.us, i32* %arrayidx10.us, align 4
+ %inc.us = add nuw nsw i32 %j.024.us, 1
+ %exitcond = icmp ne i32 %inc.us, %conv
+ br i1 %exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us
+
+for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us
+ %inc12.us = add nuw nsw i32 %i.026.us, 1
+ %exitcond27 = icmp ne i32 %inc12.us, %conv
+ br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit
+
+for.cond.cleanup.loopexit: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+}
+
+; Inner IV used directly
+define hidden void @test_4(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+ %conv = zext i16 %N to i32
+ %cmp25 = icmp eq i16 %N, 0
+ br i1 %cmp25, label %for.cond.cleanup, label %for.body.lr.ph.split.us
+
+for.body.lr.ph.split.us: ; preds = %entry
+ br label %for.body.us
+
+for.body.us: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us
+ %i.026.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.cond.cleanup6_crit_edge.us ]
+ %mul9.us = mul nuw nsw i32 %i.026.us, %conv
+ br label %for.body7.us
+
+for.body7.us: ; preds = %for.body.us, %for.body7.us
+ %j.024.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ]
+ %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %j.024.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %mul.us = mul nsw i32 %0, %scale
+ %add.us = add nuw nsw i32 %j.024.us, %mul9.us
+ %arrayidx10.us = getelementptr inbounds i32, i32* %C, i32 %add.us
+ store i32 %mul.us, i32* %arrayidx10.us, align 4
+ %inc.us = add nuw nsw i32 %j.024.us, 1
+ %exitcond = icmp ne i32 %inc.us, %conv
+ br i1 %exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us
+
+for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us
+ %inc12.us = add nuw nsw i32 %i.026.us, 1
+ %exitcond27 = icmp ne i32 %inc12.us, %conv
+ br i1 %exitcond27, label %for.body.us, label %for.cond.cleanup.loopexit
+
+for.cond.cleanup.loopexit: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+}
+
+; Inner iteration count not invariant in outer loop
+declare i32 @get_int() readonly
+define void @test_5(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+ %conv = zext i16 %N to i32
+ %cmp27 = icmp eq i16 %N, 0
+ br i1 %cmp27, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup5
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body.lr.ph, %for.cond.cleanup5
+ %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %inc12, %for.cond.cleanup5 ]
+ %call = tail call i32 @get_int()
+ %cmp325 = icmp sgt i32 %call, 0
+ br i1 %cmp325, label %for.body6.lr.ph, label %for.cond.cleanup5
+
+for.body6.lr.ph: ; preds = %for.body
+ %mul = mul nsw i32 %call, %i.028
+ br label %for.body6
+
+for.cond.cleanup5.loopexit: ; preds = %for.body6
+ br label %for.cond.cleanup5
+
+for.cond.cleanup5: ; preds = %for.cond.cleanup5.loopexit, %for.body
+ %inc12 = add nuw nsw i32 %i.028, 1
+ %exitcond29 = icmp ne i32 %inc12, %conv
+ br i1 %exitcond29, label %for.body, label %for.cond.cleanup.loopexit
+
+for.body6: ; preds = %for.body6.lr.ph, %for.body6
+ %j.026 = phi i32 [ 0, %for.body6.lr.ph ], [ %inc, %for.body6 ]
+ %add = add nsw i32 %j.026, %mul
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+ %0 = load i32, i32* %arrayidx, align 4
+ %mul7 = mul nsw i32 %0, %scale
+ %arrayidx10 = getelementptr inbounds i32, i32* %C, i32 %add
+ store i32 %mul7, i32* %arrayidx10, align 4
+ %inc = add nuw nsw i32 %j.026, 1
+ %exitcond = icmp ne i32 %inc, %call
+ br i1 %exitcond, label %for.body6, label %for.cond.cleanup5.loopexit
+}
+
+; Inner loop has an early exit
+define hidden void @test_6(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+ %conv = zext i16 %N to i32
+ %cmp39 = icmp eq i16 %N, 0
+ br i1 %cmp39, label %for.cond.cleanup, label %for.body.us.preheader
+
+for.body.us.preheader: ; preds = %entry
+ br label %for.body.us
+
+for.body.us: ; preds = %for.body.us.preheader, %cleanup.us
+ %i.040.us = phi i32 [ %inc19.us, %cleanup.us ], [ 0, %for.body.us.preheader ]
+ %mul.us = mul nuw nsw i32 %i.040.us, %conv
+ br label %for.body7.us
+
+for.body7.us: ; preds = %for.body.us, %if.end.us
+ %j.038.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %if.end.us ]
+ %add.us = add nuw nsw i32 %j.038.us, %mul.us
+ %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %tobool.us = icmp eq i32 %0, 0
+ br i1 %tobool.us, label %if.end.us, label %cleanup.us
+
+cleanup.us: ; preds = %if.end.us, %for.body7.us
+ %inc19.us = add nuw nsw i32 %i.040.us, 1
+ %exitcond = icmp eq i32 %inc19.us, %conv
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body.us
+
+if.end.us: ; preds = %for.body7.us
+ %arrayidx17.us = getelementptr inbounds i32, i32* %C, i32 %add.us
+ store i32 0, i32* %arrayidx17.us, align 4
+ %inc.us = add nuw nsw i32 %j.038.us, 1
+ %cmp4.us = icmp ult i32 %inc.us, %conv
+ br i1 %cmp4.us, label %for.body7.us, label %cleanup.us
+
+for.cond.cleanup: ; preds = %cleanup.us, %entry
+ ret void
+}
+
+; Outer loop body has a conditional early exit (based on the call result),
+; so the nest cannot be flattened
+define hidden void @test_7(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+ %conv = zext i16 %N to i32
+ %cmp30 = icmp eq i16 %N, 0
+ br i1 %cmp30, label %cleanup, label %for.body.us.preheader
+
+for.body.us.preheader: ; preds = %entry
+ br label %for.body.us
+
+for.body.us: ; preds = %for.body.us.preheader, %for.cond2.for.cond.cleanup6_crit_edge.us
+ %i.031.us = phi i32 [ %inc15.us, %for.cond2.for.cond.cleanup6_crit_edge.us ], [ 0, %for.body.us.preheader ]
+ %call.us = tail call i32 @get_int() #2
+ %tobool.us = icmp eq i32 %call.us, 0
+ br i1 %tobool.us, label %for.body7.lr.ph.us, label %cleanup
+
+for.body7.us: ; preds = %for.body7.us, %for.body7.lr.ph.us
+ %j.029.us = phi i32 [ 0, %for.body7.lr.ph.us ], [ %inc.us, %for.body7.us ]
+ %add.us = add nuw nsw i32 %j.029.us, %mul.us
+ %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %mul9.us = mul nsw i32 %0, %scale
+ %arrayidx13.us = getelementptr inbounds i32, i32* %C, i32 %add.us
+ store i32 %mul9.us, i32* %arrayidx13.us, align 4
+ %inc.us = add nuw nsw i32 %j.029.us, 1
+ %exitcond = icmp eq i32 %inc.us, %conv
+ br i1 %exitcond, label %for.cond2.for.cond.cleanup6_crit_edge.us, label %for.body7.us
+
+for.body7.lr.ph.us: ; preds = %for.body.us
+ %mul.us = mul nuw nsw i32 %i.031.us, %conv
+ br label %for.body7.us
+
+for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us
+ %inc15.us = add nuw nsw i32 %i.031.us, 1
+ %cmp.us = icmp ult i32 %inc15.us, %conv
+ br i1 %cmp.us, label %for.body.us, label %cleanup
+
+cleanup: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.us, %entry
+ ret void
+}
+
+; Step is not 1
+define i32 @test_8(i32 %val, i16* nocapture %A) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.inc6
+ %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ]
+ %mul = mul nuw nsw i32 %i.018, 20
+ br label %for.body3
+
+for.body3: ; preds = %for.body, %for.body3
+ %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ]
+ %add = add nuw nsw i32 %j.017, %mul
+ %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
+ %0 = load i16, i16* %arrayidx, align 2
+ %conv16 = zext i16 %0 to i32
+ %add4 = add i32 %conv16, %val
+ %conv5 = trunc i32 %add4 to i16
+ store i16 %conv5, i16* %arrayidx, align 2
+ %inc = add nuw nsw i32 %j.017, 1
+ %exitcond = icmp ne i32 %inc, 20
+ br i1 %exitcond, label %for.body3, label %for.inc6
+
+for.inc6: ; preds = %for.body3
+ %inc7 = add nuw nsw i32 %i.018, 2
+ %exitcond19 = icmp ne i32 %inc7, 10
+ br i1 %exitcond19, label %for.body, label %for.end8
+
+for.end8: ; preds = %for.inc6
+ ret i32 10
+}
+
+
+; Step is not 1
+define i32 @test_9(i32 %val, i16* nocapture %A) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.inc6
+ %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ]
+ %mul = mul nuw nsw i32 %i.018, 20
+ br label %for.body3
+
+for.body3: ; preds = %for.body, %for.body3
+ %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ]
+ %add = add nuw nsw i32 %j.017, %mul
+ %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
+ %0 = load i16, i16* %arrayidx, align 2
+ %conv16 = zext i16 %0 to i32
+ %add4 = add i32 %conv16, %val
+ %conv5 = trunc i32 %add4 to i16
+ store i16 %conv5, i16* %arrayidx, align 2
+ %inc = add nuw nsw i32 %j.017, 2
+ %exitcond = icmp ne i32 %inc, 20
+ br i1 %exitcond, label %for.body3, label %for.inc6
+
+for.inc6: ; preds = %for.body3
+ %inc7 = add nuw nsw i32 %i.018, 1
+ %exitcond19 = icmp ne i32 %inc7, 10
+ br i1 %exitcond19, label %for.body, label %for.end8
+
+for.end8: ; preds = %for.inc6
+ ret i32 10
+}
+
+
+; Outer loop conditional phi
+define i32 @e() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.end16
+ %f.033 = phi i32 [ 0, %entry ], [ %inc18, %for.end16 ]
+ %g.032 = phi i32 [ undef, %entry ], [ %g.3.lcssa, %for.end16 ]
+ %.pr = add i32 10, 10
+ %tobool29 = icmp eq i32 %.pr, 0
+ br i1 %tobool29, label %for.end, label %for.body2.lr.ph
+
+for.body2.lr.ph: ; preds = %for.body
+ br label %for.cond1.for.end_crit_edge
+
+for.cond1.for.end_crit_edge: ; preds = %for.body2.lr.ph
+ br label %for.end
+
+for.end: ; preds = %for.cond1.for.end_crit_edge, %for.body
+ %g.1.lcssa = phi i32 [ 0, %for.cond1.for.end_crit_edge ], [ %g.032, %for.body ]
+ br label %for.body5
+
+for.body5: ; preds = %for.end, %lor.end
+ %i.031 = phi i32 [ 0, %for.end ], [ %inc15, %lor.end ]
+ %g.230 = phi i32 [ %g.1.lcssa, %for.end ], [ %g.3, %lor.end ]
+ %0 = add i32 10, 10
+ %1 = add i32 10, 10
+ %tobool9 = icmp eq i32 %1, 0
+ br i1 %tobool9, label %lor.rhs, label %lor.end
+
+lor.rhs: ; preds = %for.body5
+ %2 = add i32 10, 10
+ %call11 = add i32 10, 10
+ %tobool12 = icmp ne i32 %call11, 0
+ br label %lor.end
+
+lor.end: ; preds = %for.body5, %lor.rhs
+ %g.3 = phi i32 [ %g.230, %for.body5 ], [ %call11, %lor.rhs ]
+ %3 = phi i1 [ true, %for.body5 ], [ %tobool12, %lor.rhs ]
+ %lor.ext = zext i1 %3 to i32
+ %inc15 = add nuw nsw i32 %i.031, 1
+ %exitcond = icmp ne i32 %inc15, 9
+ br i1 %exitcond, label %for.body5, label %for.end16
+
+for.end16: ; preds = %lor.end
+ %g.3.lcssa = phi i32 [ %g.3, %lor.end ]
+ %inc18 = add nuw nsw i32 %f.033, 1
+ %exitcond34 = icmp ne i32 %inc18, 7
+ br i1 %exitcond34, label %for.body, label %for.end19
+
+for.end19: ; preds = %for.end16
+ ret i32 undef
+}
--- /dev/null
+; RUN: opt < %s -S -loop-flatten -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; CHECK-LABEL: test1
+; Simple loop where the IVs' trip counts are constant
+define i32 @test1(i32 %val, i16* nocapture %A) {
+entry:
+ br label %for.body
+; CHECK: entry:
+; CHECK: %flatten.tripcount = mul i32 20, 10
+; CHECK: br label %for.body
+
+for.body: ; preds = %entry, %for.inc6
+ %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ]
+ %mul = mul nuw nsw i32 %i.018, 20
+ br label %for.body3
+; CHECK: for.body:
+; CHECK: %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ]
+; CHECK: %mul = mul nuw nsw i32 %i.018, 20
+; CHECK: br label %for.body3
+
+for.body3: ; preds = %for.body, %for.body3
+ %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ]
+ %add = add nuw nsw i32 %j.017, %mul
+ %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
+ %0 = load i16, i16* %arrayidx, align 2
+ %conv16 = zext i16 %0 to i32
+ %add4 = add i32 %conv16, %val
+ %conv5 = trunc i32 %add4 to i16
+ store i16 %conv5, i16* %arrayidx, align 2
+ %inc = add nuw nsw i32 %j.017, 1
+ %exitcond = icmp ne i32 %inc, 20
+ br i1 %exitcond, label %for.body3, label %for.inc6
+; CHECK: for.body3:
+; CHECK: %j.017 = phi i32 [ 0, %for.body ]
+; CHECK: %add = add nuw nsw i32 %j.017, %mul
+; CHECK: %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018
+; CHECK: %0 = load i16, i16* %arrayidx, align 2
+; CHECK: %conv16 = zext i16 %0 to i32
+; CHECK: %add4 = add i32 %conv16, %val
+; CHECK: %conv5 = trunc i32 %add4 to i16
+; CHECK: store i16 %conv5, i16* %arrayidx, align 2
+; CHECK: %inc = add nuw nsw i32 %j.017, 1
+; CHECK: %exitcond = icmp ne i32 %inc, 20
+; CHECK: br label %for.inc6
+
+for.inc6: ; preds = %for.body3
+ %inc7 = add nuw nsw i32 %i.018, 1
+ %exitcond19 = icmp ne i32 %inc7, 10
+ br i1 %exitcond19, label %for.body, label %for.end8
+; CHECK: for.inc6:
+; CHECK: %inc7 = add nuw nsw i32 %i.018, 1
+; CHECK: %exitcond19 = icmp ne i32 %inc7, %flatten.tripcount
+; CHECK: br i1 %exitcond19, label %for.body, label %for.end8
+
+for.end8: ; preds = %for.inc6
+ ret i32 10
+}
+
+
+; CHECK-LABEL: test2
+; Same as above but with a non-constant IV (which still cannot overflow)
+define i32 @test2(i8 zeroext %I, i32 %val, i16* nocapture %A) {
+entry:
+ %conv = zext i8 %I to i32
+ %cmp26 = icmp eq i8 %I, 0
+ br i1 %cmp26, label %for.end13, label %for.body.lr.ph.split.us
+
+for.body.lr.ph.split.us: ; preds = %entry
+ br label %for.body.us
+; CHECK: for.body.lr.ph.split.us:
+; CHECK: %flatten.tripcount = mul i32 %conv, %conv
+; CHECK: br label %for.body.us
+
+for.body.us: ; preds = %for.cond2.for.inc11_crit_edge.us, %for.body.lr.ph.split.us
+ %i.027.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.inc11_crit_edge.us ]
+ %mul.us = mul nuw nsw i32 %i.027.us, %conv
+ br label %for.body6.us
+; CHECK: for.body.us:
+; CHECK: %i.027.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc12.us, %for.cond2.for.inc11_crit_edge.us ]
+; CHECK: %mul.us = mul nuw nsw i32 %i.027.us, %conv
+; CHECK: br label %for.body6.us
+
+for.body6.us: ; preds = %for.body.us, %for.body6.us
+ %j.025.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body6.us ]
+ %add.us = add nuw nsw i32 %j.025.us, %mul.us
+ %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %add.us
+ %0 = load i16, i16* %arrayidx.us, align 2
+ %conv823.us = zext i16 %0 to i32
+ %add9.us = add i32 %conv823.us, %val
+ %conv10.us = trunc i32 %add9.us to i16
+ store i16 %conv10.us, i16* %arrayidx.us, align 2
+ %inc.us = add nuw nsw i32 %j.025.us, 1
+ %exitcond = icmp ne i32 %inc.us, %conv
+ br i1 %exitcond, label %for.body6.us, label %for.cond2.for.inc11_crit_edge.us
+; CHECK: for.body6.us:
+; CHECK: %j.025.us = phi i32 [ 0, %for.body.us ]
+; CHECK: %add.us = add nuw nsw i32 %j.025.us, %mul.us
+; CHECK: %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.027.us
+; CHECK: %0 = load i16, i16* %arrayidx.us, align 2
+; CHECK: %conv823.us = zext i16 %0 to i32
+; CHECK: %add9.us = add i32 %conv823.us, %val
+; CHECK: %conv10.us = trunc i32 %add9.us to i16
+; CHECK: store i16 %conv10.us, i16* %arrayidx.us, align 2
+; CHECK: %inc.us = add nuw nsw i32 %j.025.us, 1
+; CHECK: %exitcond = icmp ne i32 %inc.us, %conv
+; CHECK: br label %for.cond2.for.inc11_crit_edge.us
+
+for.cond2.for.inc11_crit_edge.us: ; preds = %for.body6.us
+ %inc12.us = add nuw nsw i32 %i.027.us, 1
+ %exitcond28 = icmp ne i32 %inc12.us, %conv
+ br i1 %exitcond28, label %for.body.us, label %for.end13.loopexit
+; CHECK: for.cond2.for.inc11_crit_edge.us: ; preds = %for.body6.us
+; CHECK: %inc12.us = add nuw nsw i32 %i.027.us, 1
+; CHECK: %exitcond28 = icmp ne i32 %inc12.us, %flatten.tripcount
+; CHECK: br i1 %exitcond28, label %for.body.us, label %for.end13.loopexit
+
+for.end13.loopexit: ; preds = %for.cond2.for.inc11_crit_edge.us
+ br label %for.end13
+
+for.end13: ; preds = %for.end13.loopexit, %entry
+ %i.0.lcssa = phi i32 [ 0, %entry ], [ %conv, %for.end13.loopexit ]
+ ret i32 %i.0.lcssa
+}
+
+
+; CHECK-LABEL: test3
+; Same as above, uses load to determine it can't overflow
+define i32 @test3(i32 %N, i32 %val, i16* nocapture %A) local_unnamed_addr #0 {
+entry:
+ %cmp21 = icmp eq i32 %N, 0
+ br i1 %cmp21, label %for.end8, label %for.body.lr.ph.split.us
+
+for.body.lr.ph.split.us: ; preds = %entry
+ br label %for.body.us
+; CHECK: for.body.lr.ph.split.us:
+; CHECK: %flatten.tripcount = mul i32 %N, %N
+; CHECK: br label %for.body.us
+
+for.body.us: ; preds = %for.cond1.for.inc6_crit_edge.us, %for.body.lr.ph.split.us
+ %i.022.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc7.us, %for.cond1.for.inc6_crit_edge.us ]
+ %mul.us = mul i32 %i.022.us, %N
+ br label %for.body3.us
+; CHECK: for.body.us:
+; CHECK: %i.022.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc7.us, %for.cond1.for.inc6_crit_edge.us ]
+; CHECK: %mul.us = mul i32 %i.022.us, %N
+; CHECK: br label %for.body3.us
+
+for.body3.us: ; preds = %for.body.us, %for.body3.us
+ %j.020.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body3.us ]
+ %add.us = add i32 %j.020.us, %mul.us
+ %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %add.us
+ %0 = load i16, i16* %arrayidx.us, align 2
+ %conv18.us = zext i16 %0 to i32
+ %add4.us = add i32 %conv18.us, %val
+ %conv5.us = trunc i32 %add4.us to i16
+ store i16 %conv5.us, i16* %arrayidx.us, align 2
+ %inc.us = add nuw i32 %j.020.us, 1
+ %exitcond = icmp ne i32 %inc.us, %N
+ br i1 %exitcond, label %for.body3.us, label %for.cond1.for.inc6_crit_edge.us
+; CHECK: for.body3.us:
+; CHECK: %j.020.us = phi i32 [ 0, %for.body.us ]
+; CHECK: %add.us = add i32 %j.020.us, %mul.us
+; CHECK: %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.022.us
+; CHECK: %0 = load i16, i16* %arrayidx.us, align 2
+; CHECK: %conv18.us = zext i16 %0 to i32
+; CHECK: %add4.us = add i32 %conv18.us, %val
+; CHECK: %conv5.us = trunc i32 %add4.us to i16
+; CHECK: store i16 %conv5.us, i16* %arrayidx.us, align 2
+; CHECK: %inc.us = add nuw i32 %j.020.us, 1
+; CHECK: %exitcond = icmp ne i32 %inc.us, %N
+; CHECK: br label %for.cond1.for.inc6_crit_edge.us
+
+for.cond1.for.inc6_crit_edge.us: ; preds = %for.body3.us
+ %inc7.us = add nuw i32 %i.022.us, 1
+ %exitcond23 = icmp ne i32 %inc7.us, %N
+ br i1 %exitcond23, label %for.body.us, label %for.end8.loopexit
+; CHECK: for.cond1.for.inc6_crit_edge.us:
+; CHECK: %inc7.us = add nuw i32 %i.022.us, 1
+; CHECK: %exitcond23 = icmp ne i32 %inc7.us, %flatten.tripcount
+; CHECK: br i1 %exitcond23, label %for.body.us, label %for.end8.loopexit
+
+for.end8.loopexit: ; preds = %for.cond1.for.inc6_crit_edge.us
+ br label %for.end8
+
+for.end8: ; preds = %for.end8.loopexit, %entry
+ %i.0.lcssa = phi i32 [ 0, %entry ], [ %N, %for.end8.loopexit ]
+ ret i32 %i.0.lcssa
+}
+
+
+; CHECK-LABEL: test4
+; Multiplication cannot overflow, so we can replace the original loop.
+define void @test4(i16 zeroext %N, i32* nocapture %C, i32* nocapture readonly %A, i32 %scale) {
+entry:
+ %conv = zext i16 %N to i32
+ %cmp30 = icmp eq i16 %N, 0
+ br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph.split.us
+; CHECK: entry:
+; CHECK: %[[LIMIT:.*]] = zext i16 %N to i32
+; CHECK: br i1 %{{.*}} label %for.cond.cleanup, label %for.body.lr.ph.split.us
+
+for.body.lr.ph.split.us: ; preds = %entry
+ br label %for.body.us
+; CHECK: for.body.lr.ph.split.us:
+; CHECK: %[[TRIPCOUNT:.*]] = mul i32 %[[LIMIT]], %[[LIMIT]]
+; CHECK: br label %for.body.us
+
+for.body.us: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us, %for.body.lr.ph.split.us
+ %i.031.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc15.us, %for.cond2.for.cond.cleanup6_crit_edge.us ]
+ %mul.us = mul nuw nsw i32 %i.031.us, %conv
+ br label %for.body7.us
+; CHECK: for.body.us:
+; CHECK: %[[OUTER_IV:.*]] = phi i32
+; CHECK: br label %for.body7.us
+
+for.body7.us: ; preds = %for.body.us, %for.body7.us
+; CHECK: for.body7.us:
+ %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body7.us ]
+ %add.us = add nuw nsw i32 %j.029.us, %mul.us
+ %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us
+; CHECK: getelementptr inbounds i32, i32* %A, i32 %[[OUTER_IV]]
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %mul9.us = mul nsw i32 %0, %scale
+; CHECK: getelementptr inbounds i32, i32* %C, i32 %[[OUTER_IV]]
+ %arrayidx13.us = getelementptr inbounds i32, i32* %C, i32 %add.us
+ store i32 %mul9.us, i32* %arrayidx13.us, align 4
+ %inc.us = add nuw nsw i32 %j.029.us, 1
+ %exitcond = icmp ne i32 %inc.us, %conv
+ br i1 %exitcond, label %for.body7.us, label %for.cond2.for.cond.cleanup6_crit_edge.us
+; CHECK: br label %for.cond2.for.cond.cleanup6_crit_edge.us
+
+for.cond2.for.cond.cleanup6_crit_edge.us: ; preds = %for.body7.us
+ %inc15.us = add nuw nsw i32 %i.031.us, 1
+ %exitcond32 = icmp ne i32 %inc15.us, %conv
+ br i1 %exitcond32, label %for.body.us, label %for.cond.cleanup.loopexit
+; CHECK: for.cond2.for.cond.cleanup6_crit_edge.us:
+; CHECK: br i1 %exitcond32, label %for.body.us, label %for.cond.cleanup.loopexit
+
+for.cond.cleanup.loopexit: ; preds = %for.cond2.for.cond.cleanup6_crit_edge.us
+ br label %for.cond.cleanup
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK: br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+; CHECK: for.cond.cleanup:
+; CHECK: ret void
+}
+
+
+; CHECK-LABEL: test5
+define i32 @test5(i8 zeroext %I, i16 zeroext %J) {
+entry:
+ %0 = lshr i8 %I, 1
+ %div = zext i8 %0 to i32
+ %cmp30 = icmp eq i8 %0, 0
+ br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %1 = lshr i16 %J, 1
+ %div5 = zext i16 %1 to i32
+ %cmp627 = icmp eq i16 %1, 0
+ br i1 %cmp627, label %for.body.lr.ph.split, label %for.body.lr.ph.split.us
+
+for.body.lr.ph.split.us: ; preds = %for.body.lr.ph
+ br label %for.body.us
+; CHECK: for.body.lr.ph.split.us:
+; CHECK: %flatten.tripcount = mul i32 %div5, %div
+; CHECK: br label %for.body.us
+
+for.body.us: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us, %for.body.lr.ph.split.us
+ %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ]
+ %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ]
+ br label %for.body9.us
+; CHECK: for.body.us:
+; CHECK: %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ]
+; CHECK: %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ]
+; CHECK: br label %for.body9.us
+
+for.body9.us: ; preds = %for.body.us, %for.body9.us
+ %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body9.us ]
+ %x.128.us = phi i32 [ %x.031.us, %for.body.us ], [ %xor.us, %for.body9.us ]
+ %call.us = tail call i32 @func(i32 1)
+ %sub.us = sub nsw i32 %call.us, %x.128.us
+ %xor.us = xor i32 %sub.us, %x.128.us
+ %inc.us = add nuw nsw i32 %j.029.us, 1
+ %cmp6.us = icmp ult i32 %inc.us, %div5
+ br i1 %cmp6.us, label %for.body9.us, label %for.cond3.for.cond.cleanup8_crit_edge.us
+; CHECK: for.body9.us:
+; CHECK: %j.029.us = phi i32 [ 0, %for.body.us ]
+; CHECK: %x.128.us = phi i32 [ %x.031.us, %for.body.us ]
+; CHECK: %call.us = tail call i32 @func(i32 1)
+; CHECK: %sub.us = sub nsw i32 %call.us, %x.128.us
+; CHECK: %xor.us = xor i32 %sub.us, %x.128.us
+; CHECK: %inc.us = add nuw nsw i32 %j.029.us, 1
+; CHECK: %cmp6.us = icmp ult i32 %inc.us, %div5
+; CHECK: br label %for.cond3.for.cond.cleanup8_crit_edge.us
+
+for.cond3.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us
+ %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ]
+ %inc13.us = add nuw nsw i32 %i.032.us, 1
+ %cmp.us = icmp ult i32 %inc13.us, %div
+ br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit
+; CHECK: for.cond3.for.cond.cleanup8_crit_edge.us:
+; CHECK: %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ]
+; CHECK: %inc13.us = add nuw nsw i32 %i.032.us, 1
+; CHECK: %cmp.us = icmp ult i32 %inc13.us, %flatten.tripcount
+; CHECK: br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit
+
+for.body.lr.ph.split: ; preds = %for.body.lr.ph
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us
+ %xor.us.lcssa.lcssa = phi i32 [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup.loopexit34: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit34, %for.cond.cleanup.loopexit, %entry
+ %x.0.lcssa = phi i32 [ 1, %entry ], [ %xor.us.lcssa.lcssa, %for.cond.cleanup.loopexit ], [ 1, %for.cond.cleanup.loopexit34 ]
+ ret i32 %x.0.lcssa
+
+for.body: ; preds = %for.body.lr.ph.split, %for.body
+ %i.032 = phi i32 [ 0, %for.body.lr.ph.split ], [ %inc13, %for.body ]
+ %inc13 = add nuw nsw i32 %i.032, 1
+ %cmp = icmp ult i32 %inc13, %div
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit34
+}
+
+
+; CHECK-LABEL: test6
+define i32 @test6(i8 zeroext %I, i16 zeroext %J) {
+entry:
+ %0 = lshr i8 %I, 1
+ %div = zext i8 %0 to i32
+ %cmp30 = icmp eq i8 %0, 0
+ br i1 %cmp30, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %1 = lshr i16 %J, 1
+ %div5 = zext i16 %1 to i32
+ %cmp627 = icmp eq i16 %1, 0
+ br i1 %cmp627, label %for.body.lr.ph.split, label %for.body.lr.ph.split.us
+
+for.body.lr.ph.split.us: ; preds = %for.body.lr.ph
+ br label %for.body.us
+; CHECK: for.body.lr.ph.split.us:
+; CHECK: %flatten.tripcount = mul i32 %div5, %div
+; CHECK: br label %for.body.us
+
+for.body.us: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us, %for.body.lr.ph.split.us
+ %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ]
+ %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ]
+ %mul.us = mul nuw nsw i32 %i.032.us, %div5
+ br label %for.body9.us
+; CHECK: for.body.us:
+; CHECK: %i.032.us = phi i32 [ 0, %for.body.lr.ph.split.us ], [ %inc13.us, %for.cond3.for.cond.cleanup8_crit_edge.us ]
+; CHECK: %x.031.us = phi i32 [ 1, %for.body.lr.ph.split.us ], [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ]
+; CHECK: %mul.us = mul nuw nsw i32 %i.032.us, %div5
+; CHECK: br label %for.body9.us
+
+for.body9.us: ; preds = %for.body.us, %for.body9.us
+ %j.029.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %for.body9.us ]
+ %x.128.us = phi i32 [ %x.031.us, %for.body.us ], [ %xor.us, %for.body9.us ]
+ %add.us = add nuw nsw i32 %j.029.us, %mul.us
+ %call.us = tail call i32 @func(i32 %add.us)
+ %sub.us = sub nsw i32 %call.us, %x.128.us
+ %xor.us = xor i32 %sub.us, %x.128.us
+ %inc.us = add nuw nsw i32 %j.029.us, 1
+ %cmp6.us = icmp ult i32 %inc.us, %div5
+ br i1 %cmp6.us, label %for.body9.us, label %for.cond3.for.cond.cleanup8_crit_edge.us
+; CHECK: for.body9.us:
+; CHECK: %j.029.us = phi i32 [ 0, %for.body.us ]
+; CHECK: %x.128.us = phi i32 [ %x.031.us, %for.body.us ]
+; CHECK: %add.us = add nuw nsw i32 %j.029.us, %mul.us
+; CHECK: %call.us = tail call i32 @func(i32 %i.032.us)
+; CHECK: %sub.us = sub nsw i32 %call.us, %x.128.us
+; CHECK: %xor.us = xor i32 %sub.us, %x.128.us
+; CHECK: %inc.us = add nuw nsw i32 %j.029.us, 1
+; CHECK: %cmp6.us = icmp ult i32 %inc.us, %div5
+; CHECK: br label %for.cond3.for.cond.cleanup8_crit_edge.us
+
+for.cond3.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us
+ %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ]
+ %inc13.us = add nuw nsw i32 %i.032.us, 1
+ %cmp.us = icmp ult i32 %inc13.us, %div
+ br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit
+; CHECK: for.cond3.for.cond.cleanup8_crit_edge.us:
+; CHECK: %xor.us.lcssa = phi i32 [ %xor.us, %for.body9.us ]
+; CHECK: %inc13.us = add nuw nsw i32 %i.032.us, 1
+; CHECK: %cmp.us = icmp ult i32 %inc13.us, %flatten.tripcount
+; CHECK: br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit
+
+for.body.lr.ph.split: ; preds = %for.body.lr.ph
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.cond3.for.cond.cleanup8_crit_edge.us
+ %xor.us.lcssa.lcssa = phi i32 [ %xor.us.lcssa, %for.cond3.for.cond.cleanup8_crit_edge.us ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup.loopexit34: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit34, %for.cond.cleanup.loopexit, %entry
+ %x.0.lcssa = phi i32 [ 1, %entry ], [ %xor.us.lcssa.lcssa, %for.cond.cleanup.loopexit ], [ 1, %for.cond.cleanup.loopexit34 ]
+ ret i32 %x.0.lcssa
+
+for.body: ; preds = %for.body.lr.ph.split, %for.body
+ %i.032 = phi i32 [ 0, %for.body.lr.ph.split ], [ %inc13, %for.body ]
+ %inc13 = add nuw nsw i32 %i.032, 1
+ %cmp = icmp ult i32 %inc13, %div
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit34
+}
+
+; CHECK-LABEL: test7
+; Various inner phis and conditions which we can still work with
+; The inner loop carries several phis (%ret, %prev, %tmp) and an if/else
+; diamond, but flattening still applies: the preheader gains a
+; flatten.tripcount multiply (mul i32 %J, %J), the outer latch exit test is
+; rewritten to compare against it, and the inner backedge branch becomes
+; unconditional (falls straight through to the outer latch).
+define signext i16 @test7(i32 %I, i32 %J, i32* nocapture readonly %C, i16 signext %limit) {
+entry:
+ %cmp43 = icmp eq i32 %J, 0
+ br i1 %cmp43, label %for.end17, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %conv = sext i16 %limit to i32
+ br label %for.body.us
+; CHECK: for.body.lr.ph:
+; CHECK: %conv = sext i16 %limit to i32
+; CHECK: %flatten.tripcount = mul i32 %J, %J
+; CHECK: br label %for.body.us
+
+for.body.us: ; preds = %for.cond1.for.inc15_crit_edge.us, %for.body.lr.ph
+ %i.047.us = phi i32 [ 0, %for.body.lr.ph ], [ %inc16.us, %for.cond1.for.inc15_crit_edge.us ]
+ %ret.046.us = phi i16 [ 0, %for.body.lr.ph ], [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ]
+ %prev.045.us = phi i32 [ 0, %for.body.lr.ph ], [ %.lcssa, %for.cond1.for.inc15_crit_edge.us ]
+ %tmp.044.us = phi i32 [ 0, %for.body.lr.ph ], [ %tmp.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ]
+ %mul.us = mul i32 %i.047.us, %J
+ br label %for.body3.us
+; CHECK: for.body.us:
+; CHECK: %i.047.us = phi i32 [ 0, %for.body.lr.ph ], [ %inc16.us, %for.cond1.for.inc15_crit_edge.us ]
+; CHECK: %ret.046.us = phi i16 [ 0, %for.body.lr.ph ], [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ]
+; CHECK: %prev.045.us = phi i32 [ 0, %for.body.lr.ph ], [ %.lcssa, %for.cond1.for.inc15_crit_edge.us ]
+; CHECK: %tmp.044.us = phi i32 [ 0, %for.body.lr.ph ], [ %tmp.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ]
+; CHECK: %mul.us = mul i32 %i.047.us, %J
+; CHECK: br label %for.body3.us
+
+; After flattening the inner header has a single predecessor, so its phis are
+; expected with only the preheader incoming value, and the linear index GEP
+; uses the outer induction variable %i.047.us directly.
+for.body3.us: ; preds = %for.body.us, %if.end.us
+ %j.040.us = phi i32 [ 0, %for.body.us ], [ %inc.us, %if.end.us ]
+ %ret.139.us = phi i16 [ %ret.046.us, %for.body.us ], [ %ret.2.us, %if.end.us ]
+ %prev.138.us = phi i32 [ %prev.045.us, %for.body.us ], [ %0, %if.end.us ]
+ %tmp.137.us = phi i32 [ %tmp.044.us, %for.body.us ], [ %tmp.2.us, %if.end.us ]
+ %add.us = add i32 %j.040.us, %mul.us
+ %arrayidx.us = getelementptr inbounds i32, i32* %C, i32 %add.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %add4.us = add nsw i32 %0, %tmp.137.us
+ %cmp5.us = icmp sgt i32 %add4.us, %conv
+ br i1 %cmp5.us, label %if.then.us, label %if.else.us
+; CHECK: for.body3.us:
+; CHECK: %j.040.us = phi i32 [ 0, %for.body.us ]
+; CHECK: %ret.139.us = phi i16 [ %ret.046.us, %for.body.us ]
+; CHECK: %prev.138.us = phi i32 [ %prev.045.us, %for.body.us ]
+; CHECK: %tmp.137.us = phi i32 [ %tmp.044.us, %for.body.us ]
+; CHECK: %add.us = add i32 %j.040.us, %mul.us
+; CHECK: %arrayidx.us = getelementptr inbounds i32, i32* %C, i32 %i.047.us
+; CHECK: %0 = load i32, i32* %arrayidx.us, align 4
+; CHECK: %add4.us = add nsw i32 %0, %tmp.137.us
+; CHECK: %cmp5.us = icmp sgt i32 %add4.us, %conv
+; CHECK: br i1 %cmp5.us, label %if.then.us, label %if.else.us
+
+if.else.us: ; preds = %for.body3.us
+ %cmp10.us = icmp sgt i32 %0, %prev.138.us
+ %cond.us = zext i1 %cmp10.us to i32
+ %conv1235.us = zext i16 %ret.139.us to i32
+ %add13.us = add nuw nsw i32 %cond.us, %conv1235.us
+ br label %if.end.us
+; CHECK: if.else.us:
+; CHECK: %cmp10.us = icmp sgt i32 %0, %prev.138.us
+; CHECK: %cond.us = zext i1 %cmp10.us to i32
+; CHECK: %conv1235.us = zext i16 %ret.139.us to i32
+; CHECK: %add13.us = add nuw nsw i32 %cond.us, %conv1235.us
+; CHECK: br label %if.end.us
+
+if.then.us: ; preds = %for.body3.us
+ %conv7.us = sext i16 %ret.139.us to i32
+ %add8.us = add nsw i32 %conv7.us, 10
+ br label %if.end.us
+; CHECK: if.then.us:
+; CHECK: %conv7.us = sext i16 %ret.139.us to i32
+; CHECK: %add8.us = add nsw i32 %conv7.us, 10
+; CHECK: br label %if.end.us
+
+; Flattening removes the inner backedge: the conditional branch at the bottom
+; of the inner body is expected to become an unconditional branch to the
+; outer latch.
+if.end.us: ; preds = %if.then.us, %if.else.us
+ %tmp.2.us = phi i32 [ 0, %if.then.us ], [ %add4.us, %if.else.us ]
+ %ret.2.in.us = phi i32 [ %add8.us, %if.then.us ], [ %add13.us, %if.else.us ]
+ %ret.2.us = trunc i32 %ret.2.in.us to i16
+ %inc.us = add nuw i32 %j.040.us, 1
+ %exitcond = icmp ne i32 %inc.us, %J
+ br i1 %exitcond, label %for.body3.us, label %for.cond1.for.inc15_crit_edge.us
+; CHECK: if.end.us:
+; CHECK: %tmp.2.us = phi i32 [ 0, %if.then.us ], [ %add4.us, %if.else.us ]
+; CHECK: %ret.2.in.us = phi i32 [ %add8.us, %if.then.us ], [ %add13.us, %if.else.us ]
+; CHECK: %ret.2.us = trunc i32 %ret.2.in.us to i16
+; CHECK: %inc.us = add nuw i32 %j.040.us, 1
+; CHECK: %exitcond = icmp ne i32 %inc.us, %J
+; CHECK: br label %for.cond1.for.inc15_crit_edge.us
+
+; The outer latch now iterates over the combined (flattened) trip count.
+for.cond1.for.inc15_crit_edge.us: ; preds = %if.end.us
+ %tmp.2.us.lcssa = phi i32 [ %tmp.2.us, %if.end.us ]
+ %ret.2.us.lcssa = phi i16 [ %ret.2.us, %if.end.us ]
+ %.lcssa = phi i32 [ %0, %if.end.us ]
+ %inc16.us = add nuw i32 %i.047.us, 1
+ %exitcond49 = icmp ne i32 %inc16.us, %J
+ br i1 %exitcond49, label %for.body.us, label %for.end17.loopexit
+; CHECK: for.cond1.for.inc15_crit_edge.us:
+; CHECK: %tmp.2.us.lcssa = phi i32 [ %tmp.2.us, %if.end.us ]
+; CHECK: %ret.2.us.lcssa = phi i16 [ %ret.2.us, %if.end.us ]
+; CHECK: %.lcssa = phi i32 [ %0, %if.end.us ]
+; CHECK: %inc16.us = add nuw i32 %i.047.us, 1
+; CHECK: %exitcond49 = icmp ne i32 %inc16.us, %flatten.tripcount
+; CHECK: br i1 %exitcond49, label %for.body.us, label %for.end17.loopexit
+
+for.end17.loopexit: ; preds = %for.cond1.for.inc15_crit_edge.us
+ %ret.2.us.lcssa.lcssa = phi i16 [ %ret.2.us.lcssa, %for.cond1.for.inc15_crit_edge.us ]
+ br label %for.end17
+
+for.end17: ; preds = %for.end17.loopexit, %entry
+ %ret.0.lcssa = phi i16 [ 0, %entry ], [ %ret.2.us.lcssa.lcssa, %for.end17.loopexit ]
+ ret i16 %ret.0.lcssa
+}
+
+; CHECK-LABEL: test8
+; Same as test1, but with different continue block order
+; (uses icmp eq and loops on false)
+; The inner exit test is icmp eq with the loop-continue edge on false; the
+; pass still rewrites the outer latch compare to use flatten.tripcount
+; (mul i32 20, 10) and turns the inner branch into an unconditional branch
+; to the outer latch %for.inc6.
+define i32 @test8(i32 %val, i16* nocapture %A) {
+entry:
+ br label %for.body
+; CHECK: entry:
+; CHECK: %flatten.tripcount = mul i32 20, 10
+; CHECK: br label %for.body
+
+for.body: ; preds = %entry, %for.inc6
+ %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ]
+ %mul = mul nuw nsw i32 %i.018, 20
+ br label %for.body3
+; CHECK: for.body:
+; CHECK: %i.018 = phi i32 [ 0, %entry ], [ %inc7, %for.inc6 ]
+; CHECK: %mul = mul nuw nsw i32 %i.018, 20
+; CHECK: br label %for.body3
+
+; After flattening, %j.017 is expected with only the preheader incoming value
+; and the element address is computed from the outer induction variable.
+for.body3: ; preds = %for.body, %for.body3
+ %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ]
+ %add = add nuw nsw i32 %j.017, %mul
+ %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
+ %0 = load i16, i16* %arrayidx, align 2
+ %conv16 = zext i16 %0 to i32
+ %add4 = add i32 %conv16, %val
+ %conv5 = trunc i32 %add4 to i16
+ store i16 %conv5, i16* %arrayidx, align 2
+ %inc = add nuw nsw i32 %j.017, 1
+ %exitcond = icmp eq i32 %inc, 20
+ br i1 %exitcond, label %for.inc6, label %for.body3
+; CHECK: for.body3:
+; CHECK: %j.017 = phi i32 [ 0, %for.body ]
+; CHECK: %add = add nuw nsw i32 %j.017, %mul
+; CHECK: %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018
+; CHECK: %0 = load i16, i16* %arrayidx, align 2
+; CHECK: %conv16 = zext i16 %0 to i32
+; CHECK: %add4 = add i32 %conv16, %val
+; CHECK: %conv5 = trunc i32 %add4 to i16
+; CHECK: store i16 %conv5, i16* %arrayidx, align 2
+; CHECK: %inc = add nuw nsw i32 %j.017, 1
+; CHECK: %exitcond = icmp eq i32 %inc, 20
+; CHECK: br label %for.inc6
+
+for.inc6: ; preds = %for.body3
+ %inc7 = add nuw nsw i32 %i.018, 1
+ %exitcond19 = icmp eq i32 %inc7, 10
+ br i1 %exitcond19, label %for.end8, label %for.body
+; CHECK: for.inc6:
+; CHECK: %inc7 = add nuw nsw i32 %i.018, 1
+; CHECK: %exitcond19 = icmp eq i32 %inc7, %flatten.tripcount
+; CHECK: br i1 %exitcond19, label %for.end8, label %for.body
+
+for.end8: ; preds = %for.inc6
+ ret i32 10
+}
+
+
+declare i32 @func(i32)
+
--- /dev/null
+; RUN: opt < %s -S -loop-flatten -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; Test case and IR corresponding to this code:
+;
+; int k = 0;
+; for(int i = 0; i < n; i++)
+; for(int j = 0; j < n; j++) {
+; A[k] = B[k];
+; k++;
+; }
+;
+; TODO: LoopFlatten does not handle this version yet: the linear index k is
+; an accumulating variable carried across outer iterations (k++) rather than
+; being recomputed from i and j, so no flattening occurs.
+;
+define dso_local void @v0(i32 %n, i32* nocapture %A, i32* nocapture readonly %B) local_unnamed_addr #0 {
+;
+; CHECK-LABEL: @v0
+; CHECK-NOT: %flatten.tripcount = mul i32 %n, %n
+;
+; Negative test: the linear index %k is an outer-loop phi (%k.022.us) carried
+; across outer iterations instead of being recomputed from %i and %j, and the
+; CHECK-NOT above verifies that no flatten.tripcount multiply is inserted.
+;
+entry:
+ %cmp21 = icmp sgt i32 %n, 0
+ br i1 %cmp21, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.us.preheader:
+ br label %for.cond1.preheader.us
+
+for.cond1.preheader.us:
+ %i.023.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+ %k.022.us = phi i32 [ %inc.us.lcssa, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+ %0 = add i32 %n, %k.022.us
+ br label %for.body4.us
+
+for.body4.us:
+ %k.119.us = phi i32 [ %k.022.us, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ]
+ %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %k.119.us
+ %1 = load i32, i32* %arrayidx.us, align 4
+ %arrayidx5.us = getelementptr inbounds i32, i32* %A, i32 %k.119.us
+ store i32 %1, i32* %arrayidx5.us, align 4
+ %inc.us = add i32 %k.119.us, 1
+ %exitcond = icmp ne i32 %inc.us, %0
+ br i1 %exitcond, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:
+ %inc.us.lcssa = phi i32 [ %inc.us, %for.body4.us ]
+ %inc8.us = add nuw nsw i32 %i.023.us, 1
+ %cmp.us = icmp slt i32 %inc8.us, %n
+ br i1 %cmp.us, label %for.cond1.preheader.us, label %for.cond.cleanup.loopexit
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+; Test case and IR corresponding to this code:
+;
+; for(int i = 0; i < n; i++)
+; for(int j = 0; j < n; j++) {
+; int k = i*n+j;
+; A[k] = B[k];
+; k++;
+; }
+;
+define dso_local void @v1(i32 %n, i32* nocapture %A, i32* nocapture readonly %B) local_unnamed_addr #0 {
+;
+; CHECK-LABEL: @v1
+; CHECK: for.cond1.preheader.us.preheader:
+; CHECK: %flatten.tripcount = mul i32 %n, %n
+; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us:
+; CHECK: %inc8.us = add nuw nsw i32 %i.024.us, 1
+; CHECK: %cmp.us = icmp slt i32 %inc8.us, %flatten.tripcount
+;
+; Positive test: the linear index is recomputed inside the inner loop as
+; i*n+j (%mul.us + %j.022.us), so the loops are flattened and the outer
+; latch compares against the combined trip count n*n.
+;
+entry:
+ %cmp23 = icmp sgt i32 %n, 0
+ br i1 %cmp23, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.us.preheader:
+ br label %for.cond1.preheader.us
+
+for.cond1.preheader.us:
+ %i.024.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+ %mul.us = mul nsw i32 %i.024.us, %n
+ br label %for.body4.us
+
+for.body4.us:
+ %j.022.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc6.us, %for.body4.us ]
+ %add.us = add nsw i32 %j.022.us, %mul.us
+ %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %add.us
+ %0 = load i32, i32* %arrayidx.us, align 4
+ %arrayidx5.us = getelementptr inbounds i32, i32* %A, i32 %add.us
+ store i32 %0, i32* %arrayidx5.us, align 4
+ %inc6.us = add nuw nsw i32 %j.022.us, 1
+ %exitcond = icmp ne i32 %inc6.us, %n
+ br i1 %exitcond, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:
+ %inc8.us = add nuw nsw i32 %i.024.us, 1
+ %cmp.us = icmp slt i32 %inc8.us, %n
+ br i1 %cmp.us, label %for.cond1.preheader.us, label %for.cond.cleanup.loopexit
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}