enum ID : unsigned;
}
+class AssumptionCache;
+class BranchInst;
class Function;
class GlobalValue;
class IntrinsicInst;
class ScalarEvolution;
class StoreInst;
class SwitchInst;
+class TargetLibraryInfo;
class Type;
class User;
class Value;
void getUnrollingPreferences(Loop *L, ScalarEvolution &,
UnrollingPreferences &UP) const;
+ /// Attributes of a target dependent hardware loop. Here, the term 'element'
+ /// describes the work performed by an IR loop that has not been vectorized
+ /// by the compiler.
+ struct HardwareLoopInfo {
+ HardwareLoopInfo() = delete; // A descriptor must always name a loop.
+ HardwareLoopInfo(Loop *L) : L(L) { }
+ Loop *L = nullptr; // The loop being described.
+ BasicBlock *ExitBlock = nullptr; // Exiting block chosen to control the loop.
+ BranchInst *ExitBranch = nullptr; // Conditional branch that ends ExitBlock.
+ const SCEV *ExitCount = nullptr; // Exit count computed for ExitBlock.
+ IntegerType *CountType = nullptr; // Integer type of the loop counter.
+ Value *LoopDecrement = nullptr; // The maximum number of elements
+ // processed in the loop body.
+ bool IsNestingLegal = false; // Can a hardware loop be a parent to
+ // another hardware loop.
+ bool CounterInReg = false; // Should loop counter be updated in
+ // the loop via a phi?
+ };
+
+ /// Query the target whether it would be profitable to convert the given loop
+ /// into a hardware loop.
+ bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo,
+ HardwareLoopInfo &HWLoopInfo) const;
+
/// @}
/// \name Scalar Target Information
virtual bool isLoweredToCall(const Function *F) = 0;
virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
UnrollingPreferences &UP) = 0;
+ virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo,
+ HardwareLoopInfo &HWLoopInfo) = 0;
virtual bool isLegalAddImmediate(int64_t Imm) = 0;
virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
UnrollingPreferences &UP) override {
return Impl.getUnrollingPreferences(L, SE, UP);
}
+ bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo,
+ HardwareLoopInfo &HWLoopInfo) override { // Forwards unchanged to Impl.
+ return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
+ }
bool isLegalAddImmediate(int64_t Imm) override {
return Impl.isLegalAddImmediate(Imm);
}
return true;
}
+ bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo,
+ TTI::HardwareLoopInfo &HWLoopInfo) {
+ return false; // Default answer: never profitable; HWLoopInfo is left untouched.
+ }
+
void getUnrollingPreferences(Loop *, ScalarEvolution &,
TTI::UnrollingPreferences &) {}
UP.BEInsns = 2;
}
+ bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo,
+ TTI::HardwareLoopInfo &HWLoopInfo) {
+ return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); // No extra heuristic at this level; defer to BaseT.
+ }
+
int getInstructionLatency(const Instruction *I) {
if (isa<LoadInst>(I))
return getST()->getSchedModel().DefaultLoadLatency;
/// Creates CFI Instruction Inserter pass. \see CFIInstrInserter.cpp
FunctionPass *createCFIInstrInserter();
+ /// Create Hardware Loop pass. \see HardwareLoops.cpp
+ FunctionPass *createHardwareLoopsPass();
+
} // End llvm namespace
#endif
[llvm_anyvector_ty],
[IntrNoMem]>;
+//===---------- Intrinsics to control hardware supported loops ----------===//
+
+// Specify that the value given is the number of iterations that the next loop
+// will execute.
+def int_set_loop_iterations :
+ Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;
+
+// Decrement loop counter by the given argument. Return false if the loop
+// should exit.
+def int_loop_decrement :
+ Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrNoDuplicate]>;
+
+// Decrement the first operand (the loop counter) by the second operand (the
+// maximum number of elements processed in an iteration). Return the remaining
+// number of iterations still to be executed. This is effectively a sub which
+// can be used with a phi, icmp and br to control the number of iterations
+// executed, as usual.
+def int_loop_decrement_reg :
+ Intrinsic<[llvm_anyint_ty],
+ [llvm_anyint_ty, llvm_anyint_ty], [IntrNoDuplicate]>;
+
//===----- Intrinsics that are used to provide predicate information -----===//
def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
void initializeGlobalSplitPass(PassRegistry&);
void initializeGlobalsAAWrapperPassPass(PassRegistry&);
void initializeGuardWideningLegacyPassPass(PassRegistry&);
+void initializeHardwareLoopsPass(PassRegistry&);
void initializeHotColdSplittingLegacyPassPass(PassRegistry&);
void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &);
void initializeIPCPPass(PassRegistry&);
(void) llvm::createEliminateAvailableExternallyPass();
(void) llvm::createScalarizeMaskedMemIntrinPass();
(void) llvm::createWarnMissedTransformationsPass();
+ (void) llvm::createHardwareLoopsPass();
(void)new llvm::IntervalPartition();
(void)new llvm::ScalarEvolutionWrapperPass();
return TTIImpl->isLoweredToCall(F);
}
+bool TargetTransformInfo::isHardwareLoopProfitable(
+ Loop *L, ScalarEvolution &SE, AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const {
+ return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); // Dispatch to the implementation held in TTIImpl.
+}
+
void TargetTransformInfo::getUnrollingPreferences(
Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
return TTIImpl->getUnrollingPreferences(L, SE, UP);
GCRootLowering.cpp
GCStrategy.cpp
GlobalMerge.cpp
+ HardwareLoops.cpp
IfConversion.cpp
ImplicitNullChecks.cpp
IndirectBrExpandPass.cpp
initializeFuncletLayoutPass(Registry);
initializeGCMachineCodeAnalysisPass(Registry);
initializeGCModuleInfoPass(Registry);
+ initializeHardwareLoopsPass(Registry);
initializeIfConverterPass(Registry);
initializeImplicitNullChecksPass(Registry);
initializeIndirectBrExpandPassPass(Registry);
--- /dev/null
+//===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Insert hardware loop intrinsics into loops which are deemed profitable by
+/// the target, by querying TargetTransformInfo. A hardware loop comprises
+/// two intrinsics: one, outside the loop, to set the loop iteration count and
+/// another, in the exit block, to decrement the counter. The decremented value
+/// can either be carried through the loop via a phi or handled in some opaque
+/// way by the target.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "hardware-loops"
+
+#define HW_LOOPS_NAME "Hardware Loop Insertion"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
+ cl::desc("Force hardware loops intrinsics to be inserted")); // Attempt conversion even if the target says it is unprofitable.
+
+static cl::opt<bool>
+ForceHardwareLoopPHI(
+ "force-hardware-loop-phi", cl::Hidden, cl::init(false),
+ cl::desc("Force hardware loop counter to be updated through a phi")); // Takes the loop_decrement_reg path regardless of CounterInReg.
+
+static cl::opt<bool>
+ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
+ cl::desc("Force allowance of nested hardware loops")); // Overrides HardwareLoopInfo::IsNestingLegal.
+
+static cl::opt<unsigned>
+LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
+ cl::desc("Set the loop decrement value")); // Replaces the target's value only when given on the command line.
+
+static cl::opt<unsigned>
+CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
+ cl::desc("Set the loop counter bitwidth")); // Replaces the target's counter type only when given on the command line.
+
+STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
+
+namespace {
+
+ using TTI = TargetTransformInfo;
+
+ class HardwareLoops : public FunctionPass { // Function pass that asks the target which loops to convert and drives the conversion.
+ public:
+ static char ID;
+
+ HardwareLoops() : FunctionPass(ID) {
+ initializeHardwareLoopsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ // Try to convert the given Loop into a hardware loop. Sub-loops are visited
+ bool TryConvertLoop(Loop *L); // first; returns true when the search should stop.
+
+ // Given that the target believes the loop to be profitable, try to
+ // convert it. Returns true if the loop was converted.
+ bool TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo);
+
+ private:
+ ScalarEvolution *SE = nullptr;
+ LoopInfo *LI = nullptr;
+ const DataLayout *DL = nullptr; // Handed to each HardwareLoop for SCEV expansion.
+ const TargetTransformInfo *TTI = nullptr;
+ DominatorTree *DT = nullptr;
+ bool PreserveLCSSA = false; // Forwarded to InsertPreheaderForLoop.
+ AssumptionCache *AC = nullptr;
+ TargetLibraryInfo *LibInfo = nullptr; // Stays null when the analysis is not available.
+ Module *M = nullptr;
+ bool MadeChange = false; // Set once any loop has been converted; returned by runOnFunction.
+ };
+
+ class HardwareLoop { // Emits the hardware-loop intrinsics for one loop described by a HardwareLoopInfo.
+ // Expand the trip count scev into a value that we can use.
+ Value *InitLoopCount(BasicBlock *BB);
+
+ // Insert the set_loop_iterations intrinsic.
+ void InsertIterationSetup(Value *LoopCountInit, BasicBlock *BB);
+
+ // Insert the loop_decrement intrinsic.
+ void InsertLoopDec();
+
+ // Insert the loop_decrement_reg intrinsic.
+ Instruction *InsertLoopRegDec(Value *EltsRem);
+
+ // If the target requires the counter value to be updated in the loop,
+ // insert a phi to hold the value. The intended purpose is for use by
+ // loop_decrement_reg.
+ PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);
+
+ // Create a new cmp, that checks the returned value of loop_decrement*,
+ // and update the exit branch to use it.
+ void UpdateBranch(Value *EltsRem);
+
+ public:
+ HardwareLoop(TTI::HardwareLoopInfo &Info, ScalarEvolution &SE,
+ const DataLayout &DL) :
+ SE(SE), DL(DL), L(Info.L), M(L->getHeader()->getModule()),
+ ExitCount(Info.ExitCount),
+ CountType(Info.CountType),
+ ExitBranch(Info.ExitBranch),
+ LoopDecrement(Info.LoopDecrement),
+ UsePHICounter(Info.CounterInReg) { }
+
+ void Create(); // Performs the conversion; returns early if the trip count cannot be expanded.
+
+ private:
+ ScalarEvolution &SE;
+ const DataLayout &DL;
+ Loop *L = nullptr;
+ Module *M = nullptr; // Module of the loop header; used to declare the intrinsics.
+ const SCEV *ExitCount = nullptr; // Rewritten by InitLoopCount to the exit count plus one.
+ Type *CountType = nullptr;
+ BranchInst *ExitBranch = nullptr; // Branch whose condition is replaced.
+ Value *LoopDecrement = nullptr;
+ bool UsePHICounter = false; // Copied from HardwareLoopInfo::CounterInReg.
+ };
+}
+
+char HardwareLoops::ID = 0;
+
+// Pass entry point: collect the analyses the conversion relies on, then try to
+// convert every top-level loop of F (sub-loops are visited recursively by
+// TryConvertLoop). Returns true if at least one loop in F was converted.
+bool HardwareLoops::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");
+
+  // A function pass instance is run on many functions, so the result of a
+  // previous run must not leak into this one.
+  MadeChange = false;
+
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  DL = &F.getParent()->getDataLayout();
+  // TargetLibraryInfo is optional; LibInfo stays null when it is unavailable.
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
+  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  M = F.getParent();
+
+  // The return value of TryConvertLoop only steers the recursion; the overall
+  // result is tracked in MadeChange.
+  for (Loop *L : *LI)
+    if (!L->getParentLoop())
+      TryConvertLoop(L);
+
+  return MadeChange;
+}
+
+// Try to convert L into a hardware loop, visiting its sub-loops first.
+//
+// Return true if the search should stop, which will be when an inner loop is
+// converted and the parent loop doesn't support containing a hardware loop.
+bool HardwareLoops::TryConvertLoop(Loop *L) {
+  // Process nested loops first.
+  for (Loop *SubLoop : *L)
+    if (TryConvertLoop(SubLoop))
+      return true; // Stop search.
+
+  // Bail out if the loop has irreducible control flow.
+  LoopBlocksRPO RPOT(L);
+  RPOT.perform(LI);
+  if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
+    return false;
+
+  // The target is queried even when conversion is forced, so that it still
+  // gets the chance to fill in HWLoopInfo.
+  TTI::HardwareLoopInfo HWLoopInfo(L);
+  if (!TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo) &&
+      !ForceHardwareLoops)
+    return false;
+
+  // Allow overriding of the counter width and loop decrement value. The
+  // option defaults are also used when conversion is forced and the target
+  // left a field unset; otherwise the null type/value would be dereferenced
+  // further down.
+  if (CounterBitWidth.getNumOccurrences() || !HWLoopInfo.CountType)
+    HWLoopInfo.CountType =
+        IntegerType::get(M->getContext(), CounterBitWidth);
+
+  if (LoopDecrement.getNumOccurrences() || !HWLoopInfo.LoopDecrement)
+    HWLoopInfo.LoopDecrement =
+        ConstantInt::get(HWLoopInfo.CountType, LoopDecrement);
+
+  // Decide whether to stop based on *this* loop only. MadeChange accumulates
+  // over every loop visited so far, so using it here would block parents of
+  // loops that were not converted.
+  const bool Converted = TryConvertLoop(HWLoopInfo);
+  MadeChange |= Converted;
+  return Converted && !HWLoopInfo.IsNestingLegal && !ForceNestedLoop;
+}
+
+// Second stage of the conversion, reached once the target (or a forcing
+// option) has approved the loop. Picks an exiting block whose exit count can
+// drive the hardware counter, makes sure the loop has a preheader and then
+// emits the intrinsics. Returns false if no exiting block qualifies or no
+// preheader can be obtained.
+bool HardwareLoops::TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo) {
+  Loop *L = HWLoopInfo.L;
+  LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
+
+  BasicBlock *Header = L->getHeader();
+  const bool NestingAllowed = HWLoopInfo.IsNestingLegal || ForceNestedLoop;
+
+  SmallVector<BasicBlock *, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  for (BasicBlock *Exiting : ExitingBlocks) {
+    const SCEV *EC = SE->getExitCount(L, Exiting);
+    if (isa<SCEVCouldNotCompute>(EC))
+      continue;
+
+    // A constant count must be non-zero; any other count must be invariant
+    // in the loop.
+    if (const auto *ConstEC = dyn_cast<SCEVConstant>(EC)) {
+      if (ConstEC->getValue()->isZero())
+        continue;
+    } else if (!SE->isLoopInvariant(EC, L)) {
+      continue;
+    }
+
+    // The count has to fit into the counter type.
+    if (SE->getTypeSizeInBits(EC->getType()) >
+        HWLoopInfo.CountType->getBitWidth())
+      continue;
+
+    // An exiting block that lives in a nested loop is not eligible for the
+    // branch-and-decrement, because the inner loop would end up clobbering
+    // the counter value.
+    if (!NestingAllowed && LI->getLoopFor(Exiting) != L)
+      continue;
+
+    // At this point we hold a loop-invariant iteration count (not the
+    // constant zero) for an exit through this exiting block.
+
+    // The block has to run on every iteration. That holds when it dominates
+    // every block with a backedge, i.e. every in-loop predecessor of the
+    // header.
+    bool DominatesBackedges = true;
+    for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+         PI != PE && DominatesBackedges; ++PI)
+      if (L->contains(*PI) && !DT->dominates(Exiting, *PI))
+        DominatesBackedges = false;
+
+    if (!DominatesBackedges)
+      continue;
+
+    // Make sure this block ends with a conditional branch.
+    Instruction *Term = Exiting->getTerminator();
+    auto *BI = Term ? dyn_cast<BranchInst>(Term) : nullptr;
+    if (!BI || !BI->isConditional())
+      continue;
+
+    // Note that this block may not be the loop latch block, even if the loop
+    // has a latch block.
+    HWLoopInfo.ExitBranch = BI;
+    HWLoopInfo.ExitBlock = Exiting;
+    HWLoopInfo.ExitCount = EC;
+    break;
+  }
+
+  if (!HWLoopInfo.ExitBlock)
+    return false;
+
+  // If we don't have a preheader, then insert one; give up if that fails.
+  if (!L->getLoopPreheader() &&
+      !InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA))
+    return false;
+
+  HardwareLoop HWLoop(HWLoopInfo, *SE, *DL);
+  HWLoop.Create();
+  ++NumHWLoops;
+  return true;
+}
+
+// Emit the hardware loop: expand the iteration count in the preheader, insert
+// the setup intrinsic, then insert the decrement in whichever form was
+// requested. Does nothing further if the count cannot be expanded.
+void HardwareLoop::Create() {
+  LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  Value *InitialCount = InitLoopCount(Preheader);
+  if (InitialCount == nullptr)
+    return;
+
+  InsertIterationSetup(InitialCount, Preheader);
+
+  if (!UsePHICounter && !ForceHardwareLoopPHI) {
+    InsertLoopDec();
+  } else {
+    // The decrement is created first with the initial count as a placeholder
+    // operand; it is pointed at the phi once the phi exists.
+    Instruction *Decrement = InsertLoopRegDec(InitialCount);
+    Value *Remaining = InsertPHICounter(InitialCount, Decrement);
+    Decrement->setOperand(0, Remaining);
+    UpdateBranch(Decrement);
+  }
+
+  // Replacing the exit condition may have left dead PHIs behind; sweep every
+  // block of the loop for them.
+  for (BasicBlock *BB : L->blocks())
+    DeleteDeadPHIs(BB);
+}
+
+// Turn the stored exit count into the value the counter is initialised with
+// (exit count + 1, in CountType) and expand it before the terminator of BB.
+// Returns nullptr if the expression is not safe to expand there.
+Value *HardwareLoop::InitLoopCount(BasicBlock *BB) {
+  SCEVExpander SCEVE(SE, DL, "loopcnt");
+
+  // Widen a non-pointer count that is not already of the counter type.
+  Type *ExitTy = ExitCount->getType();
+  if (ExitTy != CountType && !ExitTy->isPointerTy())
+    ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
+
+  ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
+
+  Instruction *InsertPt = BB->getTerminator();
+  if (!isSafeToExpandAt(ExitCount, InsertPt, SE)) {
+    LLVM_DEBUG(dbgs() << "HWLoops: Bailing, unsafe to expand ExitCount "
+               << *ExitCount << "\n");
+    return nullptr;
+  }
+
+  Value *Count = SCEVE.expandCodeFor(ExitCount, CountType, InsertPt);
+  LLVM_DEBUG(dbgs() << "HWLoops: Loop Count: " << *Count << "\n");
+  return Count;
+}
+
+// Insert a call to set_loop_iterations, overloaded on the type of
+// LoopCountInit, immediately before the terminator of BB.
+void HardwareLoop::InsertIterationSetup(Value *LoopCountInit,
+                                        BasicBlock *BB) {
+  Function *SetIterations = Intrinsic::getDeclaration(
+      M, Intrinsic::set_loop_iterations, LoopCountInit->getType());
+
+  IRBuilder<> Builder(BB->getTerminator());
+  Value *Call = Builder.CreateCall(SetIterations, LoopCountInit);
+  LLVM_DEBUG(dbgs() << "HWLoops: Iteration set: " << *Call << "\n");
+}
+
+// Insert a call to loop_decrement in front of the exit branch and make its
+// result the new branch condition.
+void HardwareLoop::InsertLoopDec() {
+  Function *DecFunc = Intrinsic::getDeclaration(
+      M, Intrinsic::loop_decrement, LoopDecrement->getType());
+
+  IRBuilder<> CondBuilder(ExitBranch);
+  Value *Args[] = { LoopDecrement };
+  Value *NewCond = CondBuilder.CreateCall(DecFunc, Args);
+
+  Value *OldCond = ExitBranch->getCondition();
+  ExitBranch->setCondition(NewCond);
+
+  // The false branch must exit the loop, so swap the successors if the true
+  // branch is currently the one leaving it.
+  BasicBlock *TrueDest = ExitBranch->getSuccessor(0);
+  if (!L->contains(TrueDest))
+    ExitBranch->swapSuccessors();
+
+  // The previous condition may now be unused, and with it possibly a PHI (the
+  // original induction variable); clean both up.
+  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+
+  LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
+}
+
+// Insert a call to loop_decrement_reg(EltsRem, LoopDecrement) in front of the
+// exit branch and return it. The result and the first operand both use the
+// type of EltsRem.
+Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
+  Type *CounterTy = EltsRem->getType();
+  Type *OverloadTys[] = { CounterTy, CounterTy, LoopDecrement->getType() };
+  Function *DecFunc =
+      Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg, OverloadTys);
+
+  IRBuilder<> CondBuilder(ExitBranch);
+  Value *Args[] = { EltsRem, LoopDecrement };
+  Value *Call = CondBuilder.CreateCall(DecFunc, Args);
+
+  LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n");
+  return cast<Instruction>(Call);
+}
+
+// Create the counter phi in the loop header: NumElts flows in from the
+// preheader and EltsRem from the block that holds the exit branch.
+PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
+  BasicBlock *Header = L->getHeader();
+  IRBuilder<> Builder(Header->getFirstNonPHI());
+
+  PHINode *Counter = Builder.CreatePHI(NumElts->getType(), 2);
+  Counter->addIncoming(NumElts, L->getLoopPreheader());
+  // NOTE(review): the exit branch's block is used as the in-loop predecessor
+  // here - confirm it is always the latch.
+  Counter->addIncoming(EltsRem, ExitBranch->getParent());
+
+  LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Counter << "\n");
+  return Counter;
+}
+
+// Make the exit branch test "EltsRem != 0": the compare is inserted in front
+// of the branch and replaces its previous condition.
+void HardwareLoop::UpdateBranch(Value *EltsRem) {
+  IRBuilder<> CondBuilder(ExitBranch);
+  Value *Zero = ConstantInt::get(EltsRem->getType(), 0);
+  Value *NewCond = CondBuilder.CreateICmpNE(EltsRem, Zero);
+
+  Value *OldCond = ExitBranch->getCondition();
+  ExitBranch->setCondition(NewCond);
+
+  // The false branch must exit the loop, so swap the successors if the true
+  // branch is currently the one leaving it.
+  BasicBlock *TrueDest = ExitBranch->getSuccessor(0);
+  if (!L->contains(TrueDest))
+    ExitBranch->swapSuccessors();
+
+  // The previous condition may now be unused, and with it possibly a PHI (the
+  // original induction variable); clean both up.
+  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+}
+
+// Register the pass together with every analysis that getAnalysisUsage marks
+// as required, so each of them is initialised before this pass is.
+INITIALIZE_PASS_BEGIN(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
+
+// Factory used by targets to add the pass to their pipeline.
+FunctionPass *llvm::createHardwareLoopsPass() { return new HardwareLoops(); }
static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
#endif
-// The latency of mtctr is only justified if there are more than 4
-// comparisons that will be removed as a result.
-static cl::opt<unsigned>
-SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
- cl::desc("Loops with a constant trip count smaller than "
- "this value will not use the count register."));
-
-STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
-
namespace {
- struct PPCCTRLoops : public FunctionPass {
-
-#ifndef NDEBUG
- static int Counter;
-#endif
-
- public:
- static char ID;
-
- PPCCTRLoops() : FunctionPass(ID) {
- initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- private:
- bool mightUseCTR(BasicBlock *BB);
- bool convertToCTRLoop(Loop *L);
-
- private:
- const PPCTargetMachine *TM;
- const PPCSubtarget *STI;
- const PPCTargetLowering *TLI;
- const DataLayout *DL;
- const TargetLibraryInfo *LibInfo;
- const TargetTransformInfo *TTI;
- LoopInfo *LI;
- ScalarEvolution *SE;
- DominatorTree *DT;
- bool PreserveLCSSA;
- TargetSchedModel SchedModel;
- };
-
- char PPCCTRLoops::ID = 0;
-#ifndef NDEBUG
- int PPCCTRLoops::Counter = 0;
-#endif
#ifndef NDEBUG
struct PPCCTRLoopsVerify : public MachineFunctionPass {
#endif // NDEBUG
} // end anonymous namespace
-INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
- false, false)
-
-FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); }
-
#ifndef NDEBUG
INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
"PowerPC CTR Loops Verify", false, false)
}
#endif // NDEBUG
-bool PPCCTRLoops::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
- if (!TPC)
- return false;
-
- TM = &TPC->getTM<PPCTargetMachine>();
- STI = TM->getSubtargetImpl(F);
- TLI = STI->getTargetLowering();
-
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- DL = &F.getParent()->getDataLayout();
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
- PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
- SchedModel.init(STI);
-
- bool MadeChange = false;
-
- for (LoopInfo::iterator I = LI->begin(), E = LI->end();
- I != E; ++I) {
- Loop *L = *I;
- if (!L->getParentLoop())
- MadeChange |= convertToCTRLoop(L);
- }
-
- return MadeChange;
-}
-
-static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
- if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
- return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
-
- return false;
-}
-
-// Determining the address of a TLS variable results in a function call in
-// certain TLS models.
-static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) {
- const auto *GV = dyn_cast<GlobalValue>(MemAddr);
- if (!GV) {
- // Recurse to check for constants that refer to TLS global variables.
- if (const auto *CV = dyn_cast<Constant>(MemAddr))
- for (const auto &CO : CV->operands())
- if (memAddrUsesCTR(TM, CO))
- return true;
-
- return false;
- }
-
- if (!GV->isThreadLocal())
- return false;
- TLSModel::Model Model = TM.getTLSModel(GV);
- return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
-}
-
-// Loop through the inline asm constraints and look for something that clobbers
-// ctr.
-static bool asmClobbersCTR(InlineAsm *IA) {
- InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
- for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
- InlineAsm::ConstraintInfo &C = CIV[i];
- if (C.Type != InlineAsm::isInput)
- for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
- if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
- return true;
- }
- return false;
-}
-
-bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
- for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
- J != JE; ++J) {
- if (CallInst *CI = dyn_cast<CallInst>(J)) {
- // Inline ASM is okay, unless it clobbers the ctr register.
- if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
- if (asmClobbersCTR(IA))
- return true;
- continue;
- }
-
- if (Function *F = CI->getCalledFunction()) {
- // Most intrinsics don't become function calls, but some might.
- // sin, cos, exp and log are always calls.
- unsigned Opcode = 0;
- if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
- switch (F->getIntrinsicID()) {
- default: continue;
- // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
- // we're definitely using CTR.
- case Intrinsic::ppc_is_decremented_ctr_nonzero:
- case Intrinsic::ppc_mtctr:
- return true;
-
-// VisualStudio defines setjmp as _setjmp
-#if defined(_MSC_VER) && defined(setjmp) && \
- !defined(setjmp_undefined_for_msvc)
-# pragma push_macro("setjmp")
-# undef setjmp
-# define setjmp_undefined_for_msvc
-#endif
-
- case Intrinsic::setjmp:
-
-#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
- // let's return it to _setjmp state
-# pragma pop_macro("setjmp")
-# undef setjmp_undefined_for_msvc
-#endif
-
- case Intrinsic::longjmp:
-
- // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
- // because, although it does clobber the counter register, the
- // control can't then return to inside the loop unless there is also
- // an eh_sjlj_setjmp.
- case Intrinsic::eh_sjlj_setjmp:
-
- case Intrinsic::memcpy:
- case Intrinsic::memmove:
- case Intrinsic::memset:
- case Intrinsic::powi:
- case Intrinsic::log:
- case Intrinsic::log2:
- case Intrinsic::log10:
- case Intrinsic::exp:
- case Intrinsic::exp2:
- case Intrinsic::pow:
- case Intrinsic::sin:
- case Intrinsic::cos:
- return true;
- case Intrinsic::copysign:
- if (CI->getArgOperand(0)->getType()->getScalarType()->
- isPPC_FP128Ty())
- return true;
- else
- continue; // ISD::FCOPYSIGN is never a library call.
- case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
- case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
- case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
- case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
- case Intrinsic::rint: Opcode = ISD::FRINT; break;
- case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
- case Intrinsic::round: Opcode = ISD::FROUND; break;
- case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
- case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
- case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
- case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
- }
- }
-
- // PowerPC does not use [US]DIVREM or other library calls for
- // operations on regular types which are not otherwise library calls
- // (i.e. soft float or atomics). If adapting for targets that do,
- // additional care is required here.
-
- LibFunc Func;
- if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
- LibInfo->getLibFunc(F->getName(), Func) &&
- LibInfo->hasOptimizedCodeGen(Func)) {
- // Non-read-only functions are never treated as intrinsics.
- if (!CI->onlyReadsMemory())
- return true;
-
- // Conversion happens only for FP calls.
- if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
- return true;
-
- switch (Func) {
- default: return true;
- case LibFunc_copysign:
- case LibFunc_copysignf:
- continue; // ISD::FCOPYSIGN is never a library call.
- case LibFunc_copysignl:
- return true;
- case LibFunc_fabs:
- case LibFunc_fabsf:
- case LibFunc_fabsl:
- continue; // ISD::FABS is never a library call.
- case LibFunc_sqrt:
- case LibFunc_sqrtf:
- case LibFunc_sqrtl:
- Opcode = ISD::FSQRT; break;
- case LibFunc_floor:
- case LibFunc_floorf:
- case LibFunc_floorl:
- Opcode = ISD::FFLOOR; break;
- case LibFunc_nearbyint:
- case LibFunc_nearbyintf:
- case LibFunc_nearbyintl:
- Opcode = ISD::FNEARBYINT; break;
- case LibFunc_ceil:
- case LibFunc_ceilf:
- case LibFunc_ceill:
- Opcode = ISD::FCEIL; break;
- case LibFunc_rint:
- case LibFunc_rintf:
- case LibFunc_rintl:
- Opcode = ISD::FRINT; break;
- case LibFunc_round:
- case LibFunc_roundf:
- case LibFunc_roundl:
- Opcode = ISD::FROUND; break;
- case LibFunc_trunc:
- case LibFunc_truncf:
- case LibFunc_truncl:
- Opcode = ISD::FTRUNC; break;
- case LibFunc_fmin:
- case LibFunc_fminf:
- case LibFunc_fminl:
- Opcode = ISD::FMINNUM; break;
- case LibFunc_fmax:
- case LibFunc_fmaxf:
- case LibFunc_fmaxl:
- Opcode = ISD::FMAXNUM; break;
- }
- }
-
- if (Opcode) {
- EVT EVTy =
- TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true);
-
- if (EVTy == MVT::Other)
- return true;
-
- if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
- continue;
- else if (EVTy.isVector() &&
- TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
- continue;
-
- return true;
- }
- }
-
- return true;
- } else if (isa<BinaryOperator>(J) &&
- J->getType()->getScalarType()->isPPC_FP128Ty()) {
- // Most operations on ppc_f128 values become calls.
- return true;
- } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
- isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
- CastInst *CI = cast<CastInst>(J);
- if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
- CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
- isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) ||
- isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType()))
- return true;
- } else if (isLargeIntegerTy(!TM->isPPC64(),
- J->getType()->getScalarType()) &&
- (J->getOpcode() == Instruction::UDiv ||
- J->getOpcode() == Instruction::SDiv ||
- J->getOpcode() == Instruction::URem ||
- J->getOpcode() == Instruction::SRem)) {
- return true;
- } else if (!TM->isPPC64() &&
- isLargeIntegerTy(false, J->getType()->getScalarType()) &&
- (J->getOpcode() == Instruction::Shl ||
- J->getOpcode() == Instruction::AShr ||
- J->getOpcode() == Instruction::LShr)) {
- // Only on PPC32, for 128-bit integers (specifically not 64-bit
- // integers), these might be runtime calls.
- return true;
- } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
- // On PowerPC, indirect jumps use the counter register.
- return true;
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
- if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
- return true;
- }
-
- // FREM is always a call.
- if (J->getOpcode() == Instruction::FRem)
- return true;
-
- if (STI->useSoftFloat()) {
- switch(J->getOpcode()) {
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FCmp:
- return true;
- }
- }
-
- for (Value *Operand : J->operands())
- if (memAddrUsesCTR(*TM, Operand))
- return true;
- }
-
- return false;
-}
-bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
- bool MadeChange = false;
-
- // Do not convert small short loops to CTR loop.
- unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
- if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
- SmallPtrSet<const Value *, 32> EphValues;
- auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- *L->getHeader()->getParent());
- CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
- CodeMetrics Metrics;
- for (BasicBlock *BB : L->blocks())
- Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
- // 6 is an approximate latency for the mtctr instruction.
- if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
- return false;
- }
-
- // Process nested loops first.
- for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
- MadeChange |= convertToCTRLoop(*I);
- LLVM_DEBUG(dbgs() << "Nested loop converted\n");
- }
-
- // If a nested loop has been converted, then we can't convert this loop.
- if (MadeChange)
- return MadeChange;
-
- // Bail out if the loop has irreducible control flow.
- LoopBlocksRPO RPOT(L);
- RPOT.perform(LI);
- if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
- return false;
-
-#ifndef NDEBUG
- // Stop trying after reaching the limit (if any).
- int Limit = CTRLoopLimit;
- if (Limit >= 0) {
- if (Counter >= CTRLoopLimit)
- return false;
- Counter++;
- }
-#endif
-
- // We don't want to spill/restore the counter register, and so we don't
- // want to use the counter register if the loop contains calls.
- for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
- I != IE; ++I)
- if (mightUseCTR(*I))
- return MadeChange;
-
- SmallVector<BasicBlock*, 4> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
- // If there is an exit edge known to be frequently taken,
- // we should not transform this loop.
- for (auto &BB : ExitingBlocks) {
- Instruction *TI = BB->getTerminator();
- if (!TI) continue;
-
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- uint64_t TrueWeight = 0, FalseWeight = 0;
- if (!BI->isConditional() ||
- !BI->extractProfMetadata(TrueWeight, FalseWeight))
- continue;
-
- // If the exit path is more frequent than the loop path,
- // we return here without further analysis for this loop.
- bool TrueIsExit = !L->contains(BI->getSuccessor(0));
- if (( TrueIsExit && FalseWeight < TrueWeight) ||
- (!TrueIsExit && FalseWeight > TrueWeight))
- return MadeChange;
- }
- }
-
- BasicBlock *CountedExitBlock = nullptr;
- const SCEV *ExitCount = nullptr;
- BranchInst *CountedExitBranch = nullptr;
- for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
- IE = ExitingBlocks.end(); I != IE; ++I) {
- const SCEV *EC = SE->getExitCount(L, *I);
- LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block "
- << (*I)->getName() << ": " << *EC << "\n");
- if (isa<SCEVCouldNotCompute>(EC))
- continue;
- if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
- if (ConstEC->getValue()->isZero())
- continue;
- } else if (!SE->isLoopInvariant(EC, L))
- continue;
-
- if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32))
- continue;
-
- // If this exiting block is contained in a nested loop, it is not eligible
- // for insertion of the branch-and-decrement since the inner loop would
- // end up messing up the value in the CTR.
- if (LI->getLoopFor(*I) != L)
- continue;
-
- // We now have a loop-invariant count of loop iterations (which is not the
- // constant zero) for which we know that this loop will not exit via this
- // existing block.
-
- // We need to make sure that this block will run on every loop iteration.
- // For this to be true, we must dominate all blocks with backedges. Such
- // blocks are in-loop predecessors to the header block.
- bool NotAlways = false;
- for (pred_iterator PI = pred_begin(L->getHeader()),
- PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
- if (!L->contains(*PI))
- continue;
-
- if (!DT->dominates(*I, *PI)) {
- NotAlways = true;
- break;
- }
- }
-
- if (NotAlways)
- continue;
-
- // Make sure this blocks ends with a conditional branch.
- Instruction *TI = (*I)->getTerminator();
- if (!TI)
- continue;
-
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (!BI->isConditional())
- continue;
-
- CountedExitBranch = BI;
- } else
- continue;
-
- // Note that this block may not be the loop latch block, even if the loop
- // has a latch block.
- CountedExitBlock = *I;
- ExitCount = EC;
- break;
- }
-
- if (!CountedExitBlock)
- return MadeChange;
-
- BasicBlock *Preheader = L->getLoopPreheader();
-
- // If we don't have a preheader, then insert one. If we already have a
- // preheader, then we can use it (except if the preheader contains a use of
- // the CTR register because some such uses might be reordered by the
- // selection DAG after the mtctr instruction).
- if (!Preheader || mightUseCTR(Preheader))
- Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
- if (!Preheader)
- return MadeChange;
-
- LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName()
- << "\n");
-
- // Insert the count into the preheader and replace the condition used by the
- // selected branch.
- MadeChange = true;
-
- SCEVExpander SCEVE(*SE, *DL, "loopcnt");
- LLVMContext &C = SE->getContext();
- Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
- if (!ExitCount->getType()->isPointerTy() &&
- ExitCount->getType() != CountType)
- ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
- ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
- Value *ECValue =
- SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());
-
- IRBuilder<> CountBuilder(Preheader->getTerminator());
- Module *M = Preheader->getParent()->getParent();
- Function *MTCTRFunc =
- Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, CountType);
- CountBuilder.CreateCall(MTCTRFunc, ECValue);
-
- IRBuilder<> CondBuilder(CountedExitBranch);
- Function *DecFunc =
- Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
- Value *NewCond = CondBuilder.CreateCall(DecFunc, {});
- Value *OldCond = CountedExitBranch->getCondition();
- CountedExitBranch->setCondition(NewCond);
-
- // The false branch must exit the loop.
- if (!L->contains(CountedExitBranch->getSuccessor(0)))
- CountedExitBranch->swapSuccessors();
-
- // The old condition may be dead now, and may have even created a dead PHI
- // (the original induction variable).
- RecursivelyDeleteTriviallyDeadInstructions(OldCond);
- // Run through the basic blocks of the loop and see if any of them have dead
- // PHIs that can be removed.
- for (auto I : L->blocks())
- DeleteDeadPHIs(I);
-
- ++NumCTRLoops;
- return MadeChange;
-}
-
#ifndef NDEBUG
static bool clobbersCTR(const MachineInstr &MI) {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
}
case ISD::INTRINSIC_W_CHAIN: {
if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
- Intrinsic::ppc_is_decremented_ctr_nonzero)
+ Intrinsic::loop_decrement)
break;
assert(N->getValueType(0) == MVT::i1 &&
if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
- Intrinsic::ppc_is_decremented_ctr_nonzero) {
+ Intrinsic::loop_decrement) {
// We now need to make the intrinsic dead (it cannot be instruction
// selected).
if (LHS.getOpcode() == ISD::AND &&
LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
- Intrinsic::ppc_is_decremented_ctr_nonzero &&
+ Intrinsic::loop_decrement &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
!isNullConstant(LHS.getOperand(1)))
LHS = LHS.getOperand(0);
if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
- Intrinsic::ppc_is_decremented_ctr_nonzero &&
+ Intrinsic::loop_decrement &&
isa<ConstantSDNode>(RHS)) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
"Counter decrement comparison is not EQ or NE");
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let hasSideEffects = 1, Defs = [CTR8] in {
-let Pattern = [(int_ppc_mtctr i64:$rS)] in
+let Pattern = [(int_set_loop_iterations i64:$rS)] in
def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
"mtctr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
-let Pattern = [(int_ppc_mtctr i32:$rS)] in
+let Pattern = [(int_set_loop_iterations i32:$rS)] in
def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
"mtctr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;
RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());
PassRegistry &PR = *PassRegistry::getPassRegistry();
- initializePPCCTRLoopsPass(PR);
#ifndef NDEBUG
initializePPCCTRLoopsVerifyPass(PR);
#endif
addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
- addPass(createPPCCTRLoops());
+ addPass(createHardwareLoopsPass());
return false;
}
//===----------------------------------------------------------------------===//
#include "PPCTargetTransformInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
cl::desc("Enable using coldcc calling conv for cold "
"internal functions"));
+// The latency of mtctr is only justified if at least 4 comparisons will be
+// removed as a result.
+static cl::opt<unsigned>
+SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+ cl::desc("Loops with a constant trip count smaller than "
+ "this value will not use the count register."));
+
//===----------------------------------------------------------------------===//
//
// PPC cost model.
return BaseT::getUserCost(U, Operands);
}
+/// Conservatively determine whether any instruction in \p BB might use the
+/// PowerPC counter register (CTR), which would make the enclosing loop
+/// unsuitable for a CTR-based hardware loop.
+///
+/// The block is reported as using CTR when it contains: inline asm that
+/// clobbers "{ctr}", a call that is not known to be lowered inline, an
+/// operation that may be expanded to a runtime call (ppc_fp128 arithmetic,
+/// wide-integer division/remainder/shift, frem, soft-float operations), an
+/// indirect branch or invoke, a switch large enough to reach the minimum
+/// jump-table size, or an operand referring to a general-dynamic or
+/// local-dynamic TLS variable.
+///
+/// \param BB      The basic block to scan.
+/// \param LibInfo Library information used to recognise library calls that
+///                have optimized code generation; may be null, in which case
+///                no call is recognised that way.
+/// \returns true if the block might use CTR, false otherwise.
+bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
+                             TargetLibraryInfo *LibInfo) {
+  const PPCTargetMachine &TM = ST->getTargetMachine();
+
+  // Loop through the inline asm constraints and look for something that
+  // clobbers ctr.
+  auto asmClobbersCTR = [](InlineAsm *IA) {
+    InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+    for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+      InlineAsm::ConstraintInfo &C = CIV[i];
+      if (C.Type != InlineAsm::isInput)
+        for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+          if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+            return true;
+    }
+    return false;
+  };
+
+  // Determining the address of a TLS variable results in a function call in
+  // certain TLS models.
+  std::function<bool(const Value*)> memAddrUsesCTR =
+    [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool {
+    const auto *GV = dyn_cast<GlobalValue>(MemAddr);
+    if (!GV) {
+      // Recurse to check for constants that refer to TLS global variables.
+      if (const auto *CV = dyn_cast<Constant>(MemAddr))
+        for (const auto &CO : CV->operands())
+          if (memAddrUsesCTR(CO))
+            return true;
+
+      return false;
+    }
+
+    if (!GV->isThreadLocal())
+      return false;
+    TLSModel::Model Model = TM.getTLSModel(GV);
+    return Model == TLSModel::GeneralDynamic ||
+      Model == TLSModel::LocalDynamic;
+  };
+
+  // An integer type is "large" when it is wider than 32 bits (Is32Bit) or
+  // wider than 64 bits (otherwise).
+  auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
+    if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+      return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
+
+    return false;
+  };
+
+  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+       J != JE; ++J) {
+    if (CallInst *CI = dyn_cast<CallInst>(J)) {
+      // Inline ASM is okay, unless it clobbers the ctr register.
+      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+        if (asmClobbersCTR(IA))
+          return true;
+        continue;
+      }
+
+      if (Function *F = CI->getCalledFunction()) {
+        // Most intrinsics don't become function calls, but some might.
+        // sin, cos, exp and log are always calls.
+        unsigned Opcode = 0;
+        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+          switch (F->getIntrinsicID()) {
+          default: continue;
+          // If we have a call to set_loop_iterations or loop_decrement,
+          // we're definitely using CTR.
+          case Intrinsic::set_loop_iterations:
+          case Intrinsic::loop_decrement:
+            return true;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                       !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+ // let's return it to _setjmp state
+#  pragma pop_macro("setjmp")
+#  undef setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::longjmp:
+
+          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+          // because, although it does clobber the counter register, the
+          // control can't then return to inside the loop unless there is also
+          // an eh_sjlj_setjmp.
+          case Intrinsic::eh_sjlj_setjmp:
+
+          case Intrinsic::memcpy:
+          case Intrinsic::memmove:
+          case Intrinsic::memset:
+          case Intrinsic::powi:
+          case Intrinsic::log:
+          case Intrinsic::log2:
+          case Intrinsic::log10:
+          case Intrinsic::exp:
+          case Intrinsic::exp2:
+          case Intrinsic::pow:
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+            return true;
+          case Intrinsic::copysign:
+            if (CI->getArgOperand(0)->getType()->getScalarType()->
+                isPPC_FP128Ty())
+              return true;
+            else
+              continue; // ISD::FCOPYSIGN is never a library call.
+          case Intrinsic::sqrt:               Opcode = ISD::FSQRT;      break;
+          case Intrinsic::floor:              Opcode = ISD::FFLOOR;     break;
+          case Intrinsic::ceil:               Opcode = ISD::FCEIL;      break;
+          case Intrinsic::trunc:              Opcode = ISD::FTRUNC;     break;
+          case Intrinsic::rint:               Opcode = ISD::FRINT;      break;
+          case Intrinsic::nearbyint:          Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round:              Opcode = ISD::FROUND;     break;
+          case Intrinsic::minnum:             Opcode = ISD::FMINNUM;    break;
+          case Intrinsic::maxnum:             Opcode = ISD::FMAXNUM;    break;
+          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO;      break;
+          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO;      break;
+          }
+        }
+
+        // PowerPC does not use [US]DIVREM or other library calls for
+        // operations on regular types which are not otherwise library calls
+        // (i.e. soft float or atomics). If adapting for targets that do,
+        // additional care is required here.
+
+        LibFunc Func;
+        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+            LibInfo->getLibFunc(F->getName(), Func) &&
+            LibInfo->hasOptimizedCodeGen(Func)) {
+          // Non-read-only functions are never treated as intrinsics.
+          if (!CI->onlyReadsMemory())
+            return true;
+
+          // Conversion happens only for FP calls. The library function was
+          // looked up by name only, so the call may have no arguments at
+          // all; treat that conservatively as a real call instead of reading
+          // a non-existent operand.
+          if (CI->getNumArgOperands() == 0 ||
+              !CI->getArgOperand(0)->getType()->isFloatingPointTy())
+            return true;
+
+          switch (Func) {
+          default: return true;
+          case LibFunc_copysign:
+          case LibFunc_copysignf:
+            continue; // ISD::FCOPYSIGN is never a library call.
+          case LibFunc_copysignl:
+            return true;
+          case LibFunc_fabs:
+          case LibFunc_fabsf:
+          case LibFunc_fabsl:
+            continue; // ISD::FABS is never a library call.
+          case LibFunc_sqrt:
+          case LibFunc_sqrtf:
+          case LibFunc_sqrtl:
+            Opcode = ISD::FSQRT; break;
+          case LibFunc_floor:
+          case LibFunc_floorf:
+          case LibFunc_floorl:
+            Opcode = ISD::FFLOOR; break;
+          case LibFunc_nearbyint:
+          case LibFunc_nearbyintf:
+          case LibFunc_nearbyintl:
+            Opcode = ISD::FNEARBYINT; break;
+          case LibFunc_ceil:
+          case LibFunc_ceilf:
+          case LibFunc_ceill:
+            Opcode = ISD::FCEIL; break;
+          case LibFunc_rint:
+          case LibFunc_rintf:
+          case LibFunc_rintl:
+            Opcode = ISD::FRINT; break;
+          case LibFunc_round:
+          case LibFunc_roundf:
+          case LibFunc_roundl:
+            Opcode = ISD::FROUND; break;
+          case LibFunc_trunc:
+          case LibFunc_truncf:
+          case LibFunc_truncl:
+            Opcode = ISD::FTRUNC; break;
+          case LibFunc_fmin:
+          case LibFunc_fminf:
+          case LibFunc_fminl:
+            Opcode = ISD::FMINNUM; break;
+          case LibFunc_fmax:
+          case LibFunc_fmaxf:
+          case LibFunc_fmaxl:
+            Opcode = ISD::FMAXNUM; break;
+          }
+        }
+
+        // The call maps onto a single ISD opcode: it stays inline only if
+        // that opcode is legal or custom for the argument's value type (or,
+        // for vectors, for its scalar element type).
+        if (Opcode) {
+          EVT EVTy =
+              TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
+
+          if (EVTy == MVT::Other)
+            return true;
+
+          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
+            continue;
+          else if (EVTy.isVector() &&
+                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
+            continue;
+
+          return true;
+        }
+      }
+
+      // Any other call (including an indirect one) is assumed to use CTR.
+      return true;
+    } else if (isa<BinaryOperator>(J) &&
+               J->getType()->getScalarType()->isPPC_FP128Ty()) {
+      // Most operations on ppc_f128 values become calls.
+      return true;
+    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+      CastInst *CI = cast<CastInst>(J);
+      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
+        return true;
+    } else if (isLargeIntegerTy(!TM.isPPC64(),
+                                J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::UDiv ||
+                J->getOpcode() == Instruction::SDiv ||
+                J->getOpcode() == Instruction::URem ||
+                J->getOpcode() == Instruction::SRem)) {
+      return true;
+    } else if (!TM.isPPC64() &&
+               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::Shl ||
+                J->getOpcode() == Instruction::AShr ||
+                J->getOpcode() == Instruction::LShr)) {
+      // Only on PPC32, for 128-bit integers (specifically not 64-bit
+      // integers), these might be runtime calls.
+      return true;
+    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+      // On PowerPC, indirect jumps use the counter register.
+      return true;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
+        return true;
+    }
+
+    // FREM is always a call.
+    if (J->getOpcode() == Instruction::FRem)
+      return true;
+
+    if (ST->useSoftFloat()) {
+      switch(J->getOpcode()) {
+      case Instruction::FAdd:
+      case Instruction::FSub:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FPTrunc:
+      case Instruction::FPExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::UIToFP:
+      case Instruction::SIToFP:
+      case Instruction::FCmp:
+        return true;
+      }
+    }
+
+    for (Value *Operand : J->operands())
+      if (memAddrUsesCTR(Operand))
+        return true;
+  }
+
+  return false;
+}
+
+/// Decide whether \p L should be converted into a hardware (CTR) loop.
+///
+/// Returns false when the loop has a small constant trip count and only a few
+/// instructions, when any of its blocks might use the counter register, or
+/// when branch weights show that an exit edge is taken more often than the
+/// corresponding in-loop edge. Otherwise sets HWLoopInfo.CountType (i64 on
+/// PPC64, i32 otherwise) and HWLoopInfo.LoopDecrement (the constant 1) and
+/// returns true.
+bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          TTI::HardwareLoopInfo &HWLoopInfo) {
+  const PPCTargetMachine &TM = ST->getTargetMachine();
+  TargetSchedModel SchedModel;
+  SchedModel.init(ST);
+
+  // Small, short loops are not worth the cost of setting up CTR.
+  unsigned TripCount = SE.getSmallConstantTripCount(L);
+  if (TripCount != 0 && TripCount < SmallCTRLoopThreshold) {
+    SmallPtrSet<const Value *, 32> Ephemerals;
+    CodeMetrics::collectEphemeralValues(L, &AC, Ephemerals);
+
+    CodeMetrics LoopMetrics;
+    for (BasicBlock *Block : L->blocks())
+      LoopMetrics.analyzeBasicBlock(Block, *this, Ephemerals);
+
+    // 6 is an approximate latency for the mtctr instruction.
+    if (LoopMetrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+      return false;
+  }
+
+  // The counter register must not be spilled/restored, so give up as soon as
+  // any block in the loop might use it (for example, through a call).
+  for (BasicBlock *Block : L->blocks())
+    if (mightUseCTR(Block, LibInfo))
+      return false;
+
+  SmallVector<BasicBlock*, 4> Exiting;
+  L->getExitingBlocks(Exiting);
+
+  // Do not transform the loop if profile data says an exit edge is taken
+  // more often than the edge that stays in the loop.
+  for (BasicBlock *Block : Exiting) {
+    auto *Branch = dyn_cast_or_null<BranchInst>(Block->getTerminator());
+    if (!Branch || !Branch->isConditional())
+      continue;
+
+    uint64_t TrueWeight = 0, FalseWeight = 0;
+    if (!Branch->extractProfMetadata(TrueWeight, FalseWeight))
+      continue;
+
+    // Work out which successor leaves the loop and compare the two weights.
+    bool TrueIsExit = !L->contains(Branch->getSuccessor(0));
+    uint64_t ExitWeight = TrueIsExit ? TrueWeight : FalseWeight;
+    uint64_t StayWeight = TrueIsExit ? FalseWeight : TrueWeight;
+    if (ExitWeight > StayWeight)
+      return false;
+  }
+
+  // The counter is as wide as the target's registers and decrements by one.
+  LLVMContext &Ctx = L->getHeader()->getContext();
+  if (TM.isPPC64())
+    HWLoopInfo.CountType = Type::getInt64Ty(Ctx);
+  else
+    HWLoopInfo.CountType = Type::getInt32Ty(Ctx);
+  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+  return true;
+}
+
void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
if (ST->getDarwinDirective() == PPC::DIR_A2) {
const PPCSubtarget *getST() const { return ST; }
const PPCTargetLowering *getTLI() const { return TLI; }
+ bool mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo);
public:
explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+ bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo,
+ TTI::HardwareLoopInfo &HWLoopInfo);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
%8 = sub i64 0, %int_part_ptr.02534
%scevgep5 = getelementptr i8, i8* %call109, i64 %8
%scevgep56 = ptrtoint i8* %scevgep5 to i64
- call void @llvm.ppc.mtctr.i64(i64 %scevgep56)
+ call void @llvm.set.loop.iterations.i64(i64 %scevgep56)
br label %for.body.116
for.cond.cleanup: ; preds = %if.end.138, %if.end.105
%conv134 = trunc i32 %add133 to i8
%scevgep = getelementptr i8, i8* inttoptr (i64 -1 to i8*), i64 %call109.pn2
store i8 %conv134, i8* %scevgep, align 1, !tbaa !10
- %12 = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
- br i1 %12, label %for.body.116, label %for.cond.cleanup.115
+ %12 = call i64 @llvm.loop.dec(i64 %scevgep56, i64 1)
+ %dec.cmp = icmp ne i64 %12, 0
+ br i1 %dec.cmp, label %for.body.116, label %for.cond.cleanup.115
if.then.136: ; preds = %for.cond.cleanup.115
%incdec.ptr137 = getelementptr inbounds i8, i8* %int_part_ptr.0253, i64 -1
declare i8* @memcpy(i8*, i8* nocapture readonly, i64) #1
; Function Attrs: nounwind
-declare void @llvm.ppc.mtctr.i64(i64) #0
+declare void @llvm.set.loop.iterations.i64(i64) #0
; Function Attrs: nounwind
-declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #0
+declare i64 @llvm.loop.dec(i64, i64) #0
attributes #0 = { nounwind }
attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-; Test pass name: ppc-ctr-loops.
-; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-CTR-LOOPS
-; STOP-BEFORE-CTR-LOOPS-NOT: -ppc-ctr-loops
-; STOP-BEFORE-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered.
-; STOP-BEFORE-CTR-LOOPS-NOT: PowerPC CTR Loops
-
-; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-CTR-LOOPS
-; STOP-AFTER-CTR-LOOPS: -ppc-ctr-loops
-; STOP-AFTER-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered.
-; STOP-AFTER-CTR-LOOPS: PowerPC CTR Loops
-
-
; Test pass name: ppc-loop-preinc-prep.
; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-loop-preinc-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-LOOP-PREINC-PREP
; STOP-BEFORE-LOOP-PREINC-PREP-NOT: -ppc-loop-preinc-prep
--- /dev/null
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-REGDEC
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC --check-prefix=CHECK-NESTED
+
+; CHECK-LABEL: while_lt
+; Up-counting loop: %i.addr.05 starts at %i and is incremented by 1 until the
+; incremented value equals %N. The entry block only enters the loop when
+; %i is unsigned-less-than %N.
+define void @while_lt(i32 %i, i32 %N, i32* nocapture %A) {
+entry:
+ %cmp4 = icmp ult i32 %i, %N
+ br i1 %cmp4, label %while.body, label %while.end
+
+; CHECK: while.body.preheader:
+; CHECK: [[COUNT:%[^ ]+]] = sub i32 %N, %i
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: br label %while.body
+
+; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
+
+; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
+
+while.body:
+ %i.addr.05 = phi i32 [ %inc, %while.body ], [ %i, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+ store i32 %i.addr.05, i32* %arrayidx, align 4
+ %inc = add nuw i32 %i.addr.05, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+ ret void
+}
+
+; CHECK-LABEL: while_gt
+; CHECK: while.body.preheader:
+; CHECK: [[COUNT:%[^ ]+]] = sub i32 %i, %N
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: br label %while.body
+
+; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
+
+; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
+
+; Down-counting loop: %i.addr.05 starts at %i and is decremented by 1; the loop
+; continues while the decremented value is signed-greater-than %N. The entry
+; block only enters the loop when %i is signed-greater-than %N.
+define void @while_gt(i32 %i, i32 %N, i32* nocapture %A) {
+entry:
+ %cmp4 = icmp sgt i32 %i, %N
+ br i1 %cmp4, label %while.body, label %while.end
+
+while.body:
+ %i.addr.05 = phi i32 [ %dec, %while.body ], [ %i, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+ store i32 %i.addr.05, i32* %arrayidx, align 4
+ %dec = add nsw i32 %i.addr.05, -1
+ %cmp = icmp sgt i32 %dec, %N
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+ ret void
+}
+
+; CHECK-LABEL: while_gte
+; CHECK: while.body.preheader:
+; CHECK: [[ADD:%[^ ]+]] = add i32 %i, 1
+; CHECK: [[SEL:%[^ ]+]] = icmp slt i32 %N, %i
+; CHECK: [[MIN:%[^ ]+]] = select i1 [[SEL]], i32 %N, i32 %i
+; CHECK: [[COUNT:%[^ ]+]] = sub i32 [[ADD]], [[MIN]]
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: br label %while.body
+
+; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
+
+; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
+
+; Down-counting loop whose exit test uses the value before the decrement: the
+; loop continues while %i.addr.05 is signed-greater-than %N. The entry block
+; skips the loop when %i is signed-less-than %N.
+define void @while_gte(i32 %i, i32 %N, i32* nocapture %A) {
+entry:
+ %cmp4 = icmp slt i32 %i, %N
+ br i1 %cmp4, label %while.end, label %while.body
+
+while.body:
+ %i.addr.05 = phi i32 [ %dec, %while.body ], [ %i, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+ store i32 %i.addr.05, i32* %arrayidx, align 4
+ %dec = add nsw i32 %i.addr.05, -1
+ %cmp = icmp sgt i32 %i.addr.05, %N
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+ ret void
+}
+
+; CHECK-LABEL: nested
+; CHECK-NESTED: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK-NESTED: br label %while.cond1.preheader.us
+
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: br label %while.body3.us
+
+; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
+
+; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ %N, %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ]
+; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-REGDEC: br i1 [[CMP]], label %while.body3.us, label %while.cond1.while.end_crit_edge.us
+
+; CHECK-NESTED: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NESTED: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7
+
+; Two-level loop nest. The inner loop (while.body3.us) counts %j.019.us from 0
+; up to %N, and the outer loop (headed by while.cond1.preheader.us) counts
+; %i.021.us from 0 up to %N. The entry block skips the nest when %N is 0.
+define void @nested(i32* nocapture %A, i32 %N) {
+entry:
+ %cmp20 = icmp eq i32 %N, 0
+ br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
+
+while.cond1.preheader.us:
+ %i.021.us = phi i32 [ %inc6.us, %while.cond1.while.end_crit_edge.us ], [ 0, %entry ]
+ %mul.us = mul i32 %i.021.us, %N
+ br label %while.body3.us
+
+while.body3.us:
+ %j.019.us = phi i32 [ 0, %while.cond1.preheader.us ], [ %inc.us, %while.body3.us ]
+ %add.us = add i32 %j.019.us, %mul.us
+ %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us
+ store i32 %add.us, i32* %arrayidx.us, align 4
+ %inc.us = add nuw i32 %j.019.us, 1
+ %exitcond = icmp eq i32 %inc.us, %N
+ br i1 %exitcond, label %while.cond1.while.end_crit_edge.us, label %while.body3.us
+
+while.cond1.while.end_crit_edge.us:
+ %inc6.us = add nuw i32 %i.021.us, 1
+ %exitcond23 = icmp eq i32 %inc6.us, %N
+ br i1 %exitcond23, label %while.end7, label %while.cond1.preheader.us
+
+while.end7:
+ ret void
+}
--- /dev/null
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: float_counter
+; CHECK-NOT: set.loop.iterations
+; CHECK-NOT: loop.decrement
+; The loop exit condition is a floating-point comparison: the integer
+; induction variable is converted with uitofp and compared against the float
+; argument %N.
+define void @float_counter(i32* nocapture %A, float %N) {
+entry:
+ %cmp6 = fcmp ogt float %N, 0.000000e+00
+ br i1 %cmp6, label %while.body, label %while.end
+
+while.body:
+ %i.07 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.07
+ store i32 %i.07, i32* %arrayidx, align 4
+ %inc = add i32 %i.07, 1
+ %conv = uitofp i32 %inc to float
+ %cmp = fcmp olt float %conv, %N
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+ ret void
+}
+
+; CHECK-LABEL: variant_counter
+; CHECK-NOT: set.loop.iterations
+; CHECK-NOT: loop.decrement
+; The bound that the induction variable is compared against (%1) is loaded
+; from %B inside the loop body on every iteration, at an index that changes
+; with the induction variable.
+define void @variant_counter(i32* nocapture %A, i32* nocapture readonly %B) {
+entry:
+ %0 = load i32, i32* %B, align 4
+ %cmp7 = icmp eq i32 %0, 0
+ br i1 %cmp7, label %while.end, label %while.body
+
+while.body:
+ %i.08 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.08
+ store i32 %i.08, i32* %arrayidx1, align 4
+ %inc = add nuw i32 %i.08, 1
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %inc
+ %1 = load i32, i32* %arrayidx, align 4
+ %cmp = icmp ult i32 %inc, %1
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+ ret void
+}
initializeVectorization(*Registry);
initializeScalarizeMaskedMemIntrinPass(*Registry);
initializeExpandReductionsPass(*Registry);
+ initializeHardwareLoopsPass(*Registry);
// Initialize debugging passes.
initializeScavengerTestPass(*Registry);
initializeExpandReductionsPass(Registry);
initializeWasmEHPreparePass(Registry);
initializeWriteBitcodePassPass(Registry);
+ initializeHardwareLoopsPass(Registry);
#ifdef LINK_POLLY_INTO_TOOLS
polly::initializePollyPasses(Registry);