[CodeGen] Generic Hardware Loop Support

author Sam Parker <sam.parker@arm.com>

Fri, 7 Jun 2019 07:35:30 +0000 (07:35 +0000)

committer Sam Parker <sam.parker@arm.com>

Fri, 7 Jun 2019 07:35:30 +0000 (07:35 +0000)
author Sam Parker <sam.parker@arm.com>
Fri, 7 Jun 2019 07:35:30 +0000 (07:35 +0000)
committer Sam Parker <sam.parker@arm.com>
Fri, 7 Jun 2019 07:35:30 +0000 (07:35 +0000)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h

index 5a94bfa..0ef8c08 100644 (file)
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -35,6 +35,8 @@ namespace Intrinsic {
  enum ID : unsigned;
  }
  
+class AssumptionCache;
+class BranchInst;
  class Function;
  class GlobalValue;
  class IntrinsicInst;
@@ -44,6 +46,7 @@ class SCEV;
  class ScalarEvolution;
  class StoreInst;
  class SwitchInst;
+class TargetLibraryInfo;
  class Type;
  class User;
  class Value;
@@ -445,6 +448,32 @@ public:
    void getUnrollingPreferences(Loop *L, ScalarEvolution &,
                                 UnrollingPreferences &UP) const;
  
+  /// Attributes of a target dependent hardware loop. Here, the term 'element'
+  /// describes the work performed by an IR loop that has not been vectorized
+  /// by the compiler.
+  struct HardwareLoopInfo {
+    HardwareLoopInfo()        = delete;
+    HardwareLoopInfo(Loop *L) : L(L) { }
+    Loop *L                   = nullptr;
+    BasicBlock *ExitBlock     = nullptr;
+    BranchInst *ExitBranch    = nullptr;
+    const SCEV *ExitCount     = nullptr;
+    IntegerType *CountType    = nullptr;
+    Value *LoopDecrement      = nullptr;  // The maximum number of elements
+                                          // processed in the loop body.
+    bool IsNestingLegal       = false;    // Can a hardware loop be a parent to
+                                          // another hardware loop.
+    bool CounterInReg         = false;    // Should loop counter be updated in
+                                          // the loop via a phi?
+  };
+
+  /// Query the target whether it would be profitable to convert the given loop
+  /// into a hardware loop.
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                HardwareLoopInfo &HWLoopInfo) const;
+
    /// @}
  
    /// \name Scalar Target Information
@@ -1073,6 +1102,10 @@ public:
    virtual bool isLoweredToCall(const Function *F) = 0;
    virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
                                         UnrollingPreferences &UP) = 0;
+  virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                        AssumptionCache &AC,
+                                        TargetLibraryInfo *LibInfo,
+                                        HardwareLoopInfo &HWLoopInfo) = 0;
    virtual bool isLegalAddImmediate(int64_t Imm) = 0;
    virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
    virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1304,6 +1337,12 @@ public:
                                 UnrollingPreferences &UP) override {
      return Impl.getUnrollingPreferences(L, SE, UP);
    }
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                HardwareLoopInfo &HWLoopInfo) override {
+    return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
+  }
    bool isLegalAddImmediate(int64_t Imm) override {
      return Impl.isLegalAddImmediate(Imm);
    }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

index bb290ec..2527495 100644 (file)
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -190,6 +190,13 @@ public:
      return true;
    }
  
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo) {
+    return false;
+  }
+
    void getUnrollingPreferences(Loop *, ScalarEvolution &,
                                 TTI::UnrollingPreferences &) {}
  
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h

index 9a3be5c..e4c17d7 100644 (file)
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -491,6 +491,13 @@ public:
      UP.BEInsns = 2;
    }
  
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo) {
+    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
+  }
+
    int getInstructionLatency(const Instruction *I) {
      if (isa<LoadInst>(I))
        return getST()->getSchedModel().DefaultLoadLatency;
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h

index c59473a..7d17694 100644 (file)
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -446,6 +446,9 @@ namespace llvm {
    /// Creates CFI Instruction Inserter pass. \see CFIInstrInserter.cpp
    FunctionPass *createCFIInstrInserter();
  
+  /// Create Hardware Loop pass. \see HardwareLoops.cpp
+  FunctionPass *createHardwareLoopsPass();
+
  } // End llvm namespace
  
  #endif
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td

index 06620fe..dd8bab1 100644 (file)
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1182,6 +1182,27 @@ def int_experimental_vector_reduce_fmin : Intrinsic<[llvm_anyfloat_ty],
                                                      [llvm_anyvector_ty],
                                                      [IntrNoMem]>;
  
+//===---------- Intrinsics to control hardware supported loops ----------===//
+
+// Specify that the value given is the number of iterations that the next loop
+// will execute.
+def int_set_loop_iterations :
+  Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;
+
+// Decrement loop counter by the given argument. Return false if the loop
+// should exit.
+def int_loop_decrement :
+  Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrNoDuplicate]>;
+
+// Decrement the first operand (the loop counter) by the second operand (the
+// maximum number of elements processed in an iteration). Return the remaining
+// number of iterations still to be executed. This is effectively a sub which
+// can be used with a phi, icmp and br to control the number of iterations
+// executed, as usual.
+def int_loop_decrement_reg :
+  Intrinsic<[llvm_anyint_ty],
+            [llvm_anyint_ty, llvm_anyint_ty], [IntrNoDuplicate]>;
+
  //===----- Intrinsics that are used to provide predicate information -----===//
  
  def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h

index aa61bf9..6cb3593 100644 (file)
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -163,6 +163,7 @@ void initializeGlobalOptLegacyPassPass(PassRegistry&);
  void initializeGlobalSplitPass(PassRegistry&);
  void initializeGlobalsAAWrapperPassPass(PassRegistry&);
  void initializeGuardWideningLegacyPassPass(PassRegistry&);
+void initializeHardwareLoopsPass(PassRegistry&);
  void initializeHotColdSplittingLegacyPassPass(PassRegistry&);
  void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &);
  void initializeIPCPPass(PassRegistry&);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h

index 472e320..675d179 100644 (file)
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -223,6 +223,7 @@ namespace {
        (void) llvm::createEliminateAvailableExternallyPass();
        (void) llvm::createScalarizeMaskedMemIntrinPass();
        (void) llvm::createWarnMissedTransformationsPass();
+      (void) llvm::createHardwareLoopsPass();
  
        (void)new llvm::IntervalPartition();
        (void)new llvm::ScalarEvolutionWrapperPass();
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp

index 53dd2bf..763b684 100644 (file)
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -130,6 +130,12 @@ bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
    return TTIImpl->isLoweredToCall(F);
  }
  
+bool TargetTransformInfo::isHardwareLoopProfitable(
+  Loop *L, ScalarEvolution &SE, AssumptionCache &AC,
+  TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const {
+  return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
+}
+
  void TargetTransformInfo::getUnrollingPreferences(
      Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
    return TTIImpl->getUnrollingPreferences(L, SE, UP);
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt

index fedf042..dcbd4e5 100644 (file)
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -33,6 +33,7 @@ add_llvm_library(LLVMCodeGen
    GCRootLowering.cpp
    GCStrategy.cpp
    GlobalMerge.cpp
+  HardwareLoops.cpp
    IfConversion.cpp
    ImplicitNullChecks.cpp
    IndirectBrExpandPass.cpp
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp

index 748db03..31b6bb7 100644 (file)
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -38,6 +38,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
    initializeFuncletLayoutPass(Registry);
    initializeGCMachineCodeAnalysisPass(Registry);
    initializeGCModuleInfoPass(Registry);
+  initializeHardwareLoopsPass(Registry);
    initializeIfConverterPass(Registry);
    initializeImplicitNullChecksPass(Registry);
    initializeIndirectBrExpandPassPass(Registry);
diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp

new file mode 100644 (file)

index 0000000..1d48219
--- /dev/null
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -0,0 +1,441 @@
+//===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Insert hardware loop intrinsics into loops which are deemed profitable by
+/// the target, by querying TargetTransformInfo. A hardware loop comprises of
+/// two intrinsics: one, outside the loop, to set the loop iteration count and
+/// another, in the exit block, to decrement the counter. The decremented value
+/// can either be carried through the loop via a phi or handled in some opaque
+/// way by the target.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "hardware-loops"
+
+#define HW_LOOPS_NAME "Hardware Loop Insertion"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
+                   cl::desc("Force hardware loops intrinsics to be inserted"));
+
+static cl::opt<bool>
+ForceHardwareLoopPHI(
+  "force-hardware-loop-phi", cl::Hidden, cl::init(false),
+  cl::desc("Force hardware loop counter to be updated through a phi"));
+
+static cl::opt<bool>
+ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
+                cl::desc("Force allowance of nested hardware loops"));
+
+static cl::opt<unsigned>
+LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
+            cl::desc("Set the loop decrement value"));
+
+static cl::opt<unsigned>
+CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
+                cl::desc("Set the loop counter bitwidth"));
+
+STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
+
+namespace {
+
+  using TTI = TargetTransformInfo;
+
+  class HardwareLoops : public FunctionPass {
+  public:
+    static char ID;
+
+    HardwareLoops() : FunctionPass(ID) {
+      initializeHardwareLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
+    }
+
+    // Try to convert the given Loop into a hardware loop.
+    bool TryConvertLoop(Loop *L);
+
+    // Given that the target believes the loop to be profitable, try to
+    // convert it.
+    bool TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo);
+
+  private:
+    ScalarEvolution *SE = nullptr;
+    LoopInfo *LI = nullptr;
+    const DataLayout *DL = nullptr;
+    const TargetTransformInfo *TTI = nullptr;
+    DominatorTree *DT = nullptr;
+    bool PreserveLCSSA = false;
+    AssumptionCache *AC = nullptr;
+    TargetLibraryInfo *LibInfo = nullptr;
+    Module *M = nullptr;
+    bool MadeChange = false;
+  };
+
+  class HardwareLoop {
+    // Expand the trip count scev into a value that we can use.
+    Value *InitLoopCount(BasicBlock *BB);
+
+    // Insert the set_loop_iteration intrinsic.
+    void InsertIterationSetup(Value *LoopCountInit, BasicBlock *BB);
+
+    // Insert the loop_decrement intrinsic.
+    void InsertLoopDec();
+
+    // Insert the loop_decrement_reg intrinsic.
+    Instruction *InsertLoopRegDec(Value *EltsRem);
+
+    // If the target requires the counter value to be updated in the loop,
+    // insert a phi to hold the value. The intended purpose is for use by
+    // loop_decrement_reg.
+    PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);
+
+    // Create a new cmp, that checks the returned value of loop_decrement*,
+    // and update the exit branch to use it.
+    void UpdateBranch(Value *EltsRem);
+
+  public:
+    HardwareLoop(TTI::HardwareLoopInfo &Info, ScalarEvolution &SE,
+                 const DataLayout &DL) :
+      SE(SE), DL(DL), L(Info.L), M(L->getHeader()->getModule()),
+      ExitCount(Info.ExitCount),
+      CountType(Info.CountType),
+      ExitBranch(Info.ExitBranch),
+      LoopDecrement(Info.LoopDecrement),
+      UsePHICounter(Info.CounterInReg) { }
+
+    void Create();
+
+  private:
+    ScalarEvolution &SE;
+    const DataLayout &DL;
+    Loop *L                 = nullptr;
+    Module *M               = nullptr;
+    const SCEV *ExitCount   = nullptr;
+    Type *CountType         = nullptr;
+    BranchInst *ExitBranch  = nullptr;
+    Value *LoopDecrement      = nullptr;
+    bool UsePHICounter      = false;
+  };
+}
+
+char HardwareLoops::ID = 0;
+
+bool HardwareLoops::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");
+
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  DL = &F.getParent()->getDataLayout();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
+  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  M = F.getParent();
+
+  for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
+    Loop *L = *I;
+    if (!L->getParentLoop())
+      TryConvertLoop(L);
+  }
+
+  return MadeChange;
+}
+
+// Return true if the search should stop, which will be when an inner loop is
+// converted and the parent loop doesn't support containing a hardware loop.
+bool HardwareLoops::TryConvertLoop(Loop *L) {
+  // Process nested loops first.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    if (TryConvertLoop(*I))
+      return true; // Stop search.
+
+  // Bail out if the loop has irreducible control flow.
+  LoopBlocksRPO RPOT(L);
+  RPOT.perform(LI);
+  if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
+    return false;
+
+  TTI::HardwareLoopInfo HWLoopInfo(L);
+  if (TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo) ||
+      ForceHardwareLoops) {
+
+    // Allow overriding of the counter width and loop decrement value.
+    if (CounterBitWidth.getNumOccurrences())
+      HWLoopInfo.CountType =
+        IntegerType::get(M->getContext(), CounterBitWidth);
+
+    if (LoopDecrement.getNumOccurrences())
+      HWLoopInfo.LoopDecrement =
+        ConstantInt::get(HWLoopInfo.CountType, LoopDecrement);
+
+    MadeChange |= TryConvertLoop(HWLoopInfo);
+    return MadeChange && (!HWLoopInfo.IsNestingLegal && !ForceNestedLoop);
+  }
+
+  return false;
+}
+
+bool HardwareLoops::TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo) {
+
+  Loop *L = HWLoopInfo.L;
+  LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
+
+  SmallVector<BasicBlock*, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
+       IE = ExitingBlocks.end(); I != IE; ++I) {
+    const SCEV *EC = SE->getExitCount(L, *I);
+    if (isa<SCEVCouldNotCompute>(EC))
+      continue;
+    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
+      if (ConstEC->getValue()->isZero())
+        continue;
+    } else if (!SE->isLoopInvariant(EC, L))
+      continue;
+
+    if (SE->getTypeSizeInBits(EC->getType()) >
+        HWLoopInfo.CountType->getBitWidth())
+      continue;
+
+    // If this exiting block is contained in a nested loop, it is not eligible
+    // for insertion of the branch-and-decrement since the inner loop would
+    // end up messing up the value in the CTR.
+    if (!HWLoopInfo.IsNestingLegal && LI->getLoopFor(*I) != L &&
+        !ForceNestedLoop)
+      continue;
+
+    // We now have a loop-invariant count of loop iterations (which is not the
+    // constant zero) for which we know that this loop will not exit via this
+    // existing block.
+
+    // We need to make sure that this block will run on every loop iteration.
+    // For this to be true, we must dominate all blocks with backedges. Such
+    // blocks are in-loop predecessors to the header block.
+    bool NotAlways = false;
+    for (pred_iterator PI = pred_begin(L->getHeader()),
+         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
+      if (!L->contains(*PI))
+        continue;
+
+      if (!DT->dominates(*I, *PI)) {
+        NotAlways = true;
+        break;
+      }
+    }
+
+    if (NotAlways)
+      continue;
+
+    // Make sure this blocks ends with a conditional branch.
+    Instruction *TI = (*I)->getTerminator();
+    if (!TI)
+      continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (!BI->isConditional())
+        continue;
+
+      HWLoopInfo.ExitBranch = BI;
+    } else
+      continue;
+
+    // Note that this block may not be the loop latch block, even if the loop
+    // has a latch block.
+    HWLoopInfo.ExitBlock = *I;
+    HWLoopInfo.ExitCount = EC;
+    break;
+  }
+
+  if (!HWLoopInfo.ExitBlock)
+    return false;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+
+  // If we don't have a preheader, then insert one.
+  if (!Preheader)
+    Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
+  if (!Preheader)
+    return false;
+
+  HardwareLoop HWLoop(HWLoopInfo, *SE, *DL);
+  HWLoop.Create();
+  ++NumHWLoops;
+  return true;
+}
+
+void HardwareLoop::Create() {
+  LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
+  BasicBlock *BeginBB = L->getLoopPreheader();
+  Value *LoopCountInit = InitLoopCount(BeginBB);
+  if (!LoopCountInit)
+    return;
+
+  InsertIterationSetup(LoopCountInit, BeginBB);
+
+  if (UsePHICounter || ForceHardwareLoopPHI) {
+    Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
+    Value *EltsRem = InsertPHICounter(LoopCountInit, LoopDec);
+    LoopDec->setOperand(0, EltsRem);
+    UpdateBranch(LoopDec);
+  } else
+    InsertLoopDec();
+
+  // Run through the basic blocks of the loop and see if any of them have dead
+  // PHIs that can be removed.
+  for (auto I : L->blocks())
+    DeleteDeadPHIs(I);
+}
+
+Value *HardwareLoop::InitLoopCount(BasicBlock *BB) {
+  SCEVExpander SCEVE(SE, DL, "loopcnt");
+  if (!ExitCount->getType()->isPointerTy() &&
+      ExitCount->getType() != CountType)
+    ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
+
+  ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
+
+  if (!isSafeToExpandAt(ExitCount, BB->getTerminator(), SE)) {
+    LLVM_DEBUG(dbgs() << "HWLoops: Bailing, unsafe to expand ExitCount "
+               << *ExitCount << "\n");
+    return nullptr;
+  }
+
+  Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
+                                     BB->getTerminator());
+  LLVM_DEBUG(dbgs() << "HWLoops: Loop Count: " << *Count << "\n");
+  return Count;
+}
+
+void HardwareLoop::InsertIterationSetup(Value *LoopCountInit,
+                                        BasicBlock *BB) {
+  IRBuilder<> Builder(BB->getTerminator());
+  Type *Ty = LoopCountInit->getType();
+  Function *LoopIter =
+    Intrinsic::getDeclaration(M, Intrinsic::set_loop_iterations, Ty);
+  Value *Call = Builder.CreateCall(LoopIter, LoopCountInit);
+  LLVM_DEBUG(dbgs() << "HWLoops: Iteration set: " << *Call << "\n");
+}
+
+void HardwareLoop::InsertLoopDec() {
+  IRBuilder<> CondBuilder(ExitBranch);
+
+  Function *DecFunc =
+    Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
+                              LoopDecrement->getType());
+  Value *Ops[] = { LoopDecrement };
+  Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
+  Value *OldCond = ExitBranch->getCondition();
+  ExitBranch->setCondition(NewCond);
+
+  // The false branch must exit the loop.
+  if (!L->contains(ExitBranch->getSuccessor(0)))
+    ExitBranch->swapSuccessors();
+
+  // The old condition may be dead now, and may have even created a dead PHI
+  // (the original induction variable).
+  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+
+  LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
+}
+
+Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
+  IRBuilder<> CondBuilder(ExitBranch);
+
+  Function *DecFunc =
+      Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
+                                { EltsRem->getType(), EltsRem->getType(),
+                                  LoopDecrement->getType()
+                                });
+  Value *Ops[] = { EltsRem, LoopDecrement };
+  Value *Call = CondBuilder.CreateCall(DecFunc, Ops);
+
+  LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n");
+  return cast<Instruction>(Call);
+}
+
+PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = ExitBranch->getParent();
+  IRBuilder<> Builder(Header->getFirstNonPHI());
+  PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
+  Index->addIncoming(NumElts, Preheader);
+  Index->addIncoming(EltsRem, Latch);
+  LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n");
+  return Index;
+}
+
+void HardwareLoop::UpdateBranch(Value *EltsRem) {
+  IRBuilder<> CondBuilder(ExitBranch);
+  Value *NewCond =
+    CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->getType(), 0));
+  Value *OldCond = ExitBranch->getCondition();
+  ExitBranch->setCondition(NewCond);
+
+  // The false branch must exit the loop.
+  if (!L->contains(ExitBranch->getSuccessor(0)))
+    ExitBranch->swapSuccessors();
+
+  // The old condition may be dead now, and may have even created a dead PHI
+  // (the original induction variable).
+  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+}
+
+INITIALIZE_PASS_BEGIN(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
+
+FunctionPass *llvm::createHardwareLoopsPass() { return new HardwareLoops(); }
diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp

index 559d660..2b8d9b8 100644 (file)
--- a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -71,63 +71,7 @@ using namespace llvm;
  static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
  #endif
  
-// The latency of mtctr is only justified if there are more than 4
-// comparisons that will be removed as a result.
-static cl::opt<unsigned>
-SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
-                      cl::desc("Loops with a constant trip count smaller than "
-                               "this value will not use the count register."));
-
-STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
-
  namespace {
-  struct PPCCTRLoops : public FunctionPass {
-
-#ifndef NDEBUG
-    static int Counter;
-#endif
-
-  public:
-    static char ID;
-
-    PPCCTRLoops() : FunctionPass(ID) {
-      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
-    }
-
-    bool runOnFunction(Function &F) override;
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<LoopInfoWrapperPass>();
-      AU.addPreserved<LoopInfoWrapperPass>();
-      AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addRequired<ScalarEvolutionWrapperPass>();
-      AU.addRequired<AssumptionCacheTracker>();
-      AU.addRequired<TargetTransformInfoWrapperPass>();
-    }
-
-  private:
-    bool mightUseCTR(BasicBlock *BB);
-    bool convertToCTRLoop(Loop *L);
-
-  private:
-    const PPCTargetMachine *TM;
-    const PPCSubtarget *STI;
-    const PPCTargetLowering *TLI;
-    const DataLayout *DL;
-    const TargetLibraryInfo *LibInfo;
-    const TargetTransformInfo *TTI;
-    LoopInfo *LI;
-    ScalarEvolution *SE;
-    DominatorTree *DT;
-    bool PreserveLCSSA;
-    TargetSchedModel SchedModel;
-  };
-
-  char PPCCTRLoops::ID = 0;
-#ifndef NDEBUG
-  int PPCCTRLoops::Counter = 0;
-#endif
  
  #ifndef NDEBUG
    struct PPCCTRLoopsVerify : public MachineFunctionPass {
@@ -153,16 +97,6 @@ namespace {
  #endif // NDEBUG
  } // end anonymous namespace
  
-INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
-                      false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
-                    false, false)
-
-FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); }
-
  #ifndef NDEBUG
  INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
                        "PowerPC CTR Loops Verify", false, false)
@@ -175,512 +109,6 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() {
  }
  #endif // NDEBUG
  
-bool PPCCTRLoops::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
-
-  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-  if (!TPC)
-    return false;
-
-  TM = &TPC->getTM<PPCTargetMachine>();
-  STI = TM->getSubtargetImpl(F);
-  TLI = STI->getTargetLowering();
-
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-  DL = &F.getParent()->getDataLayout();
-  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
-  LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
-  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
-  SchedModel.init(STI);
-
-  bool MadeChange = false;
-
-  for (LoopInfo::iterator I = LI->begin(), E = LI->end();
-       I != E; ++I) {
-    Loop *L = *I;
-    if (!L->getParentLoop())
-      MadeChange |= convertToCTRLoop(L);
-  }
-
-  return MadeChange;
-}
-
-static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
-  if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
-    return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
-
-  return false;
-}
-
-// Determining the address of a TLS variable results in a function call in
-// certain TLS models.
-static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) {
-  const auto *GV = dyn_cast<GlobalValue>(MemAddr);
-  if (!GV) {
-    // Recurse to check for constants that refer to TLS global variables.
-    if (const auto *CV = dyn_cast<Constant>(MemAddr))
-      for (const auto &CO : CV->operands())
-        if (memAddrUsesCTR(TM, CO))
-          return true;
-
-    return false;
-  }
-
-  if (!GV->isThreadLocal())
-    return false;
-  TLSModel::Model Model = TM.getTLSModel(GV);
-  return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
-}
-
-// Loop through the inline asm constraints and look for something that clobbers
-// ctr.
-static bool asmClobbersCTR(InlineAsm *IA) {
-  InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
-  for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
-    InlineAsm::ConstraintInfo &C = CIV[i];
-    if (C.Type != InlineAsm::isInput)
-      for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
-        if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
-          return true;
-  }
-  return false;
-}
-
-bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
-  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
-       J != JE; ++J) {
-    if (CallInst *CI = dyn_cast<CallInst>(J)) {
-      // Inline ASM is okay, unless it clobbers the ctr register.
-      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
-        if (asmClobbersCTR(IA))
-          return true;
-        continue;
-      }
-
-      if (Function *F = CI->getCalledFunction()) {
-        // Most intrinsics don't become function calls, but some might.
-        // sin, cos, exp and log are always calls.
-        unsigned Opcode = 0;
-        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
-          switch (F->getIntrinsicID()) {
-          default: continue;
-          // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
-          // we're definitely using CTR.
-          case Intrinsic::ppc_is_decremented_ctr_nonzero:
-          case Intrinsic::ppc_mtctr:
-            return true;
-
-// VisualStudio defines setjmp as _setjmp
-#if defined(_MSC_VER) && defined(setjmp) && \
-                       !defined(setjmp_undefined_for_msvc)
-#  pragma push_macro("setjmp")
-#  undef setjmp
-#  define setjmp_undefined_for_msvc
-#endif
-
-          case Intrinsic::setjmp:
-
-#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
- // let's return it to _setjmp state
-#  pragma pop_macro("setjmp")
-#  undef setjmp_undefined_for_msvc
-#endif
-
-          case Intrinsic::longjmp:
-
-          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
-          // because, although it does clobber the counter register, the
-          // control can't then return to inside the loop unless there is also
-          // an eh_sjlj_setjmp.
-          case Intrinsic::eh_sjlj_setjmp:
-
-          case Intrinsic::memcpy:
-          case Intrinsic::memmove:
-          case Intrinsic::memset:
-          case Intrinsic::powi:
-          case Intrinsic::log:
-          case Intrinsic::log2:
-          case Intrinsic::log10:
-          case Intrinsic::exp:
-          case Intrinsic::exp2:
-          case Intrinsic::pow:
-          case Intrinsic::sin:
-          case Intrinsic::cos:
-            return true;
-          case Intrinsic::copysign:
-            if (CI->getArgOperand(0)->getType()->getScalarType()->
-                isPPC_FP128Ty())
-              return true;
-            else
-              continue; // ISD::FCOPYSIGN is never a library call.
-          case Intrinsic::sqrt:               Opcode = ISD::FSQRT;      break;
-          case Intrinsic::floor:              Opcode = ISD::FFLOOR;     break;
-          case Intrinsic::ceil:               Opcode = ISD::FCEIL;      break;
-          case Intrinsic::trunc:              Opcode = ISD::FTRUNC;     break;
-          case Intrinsic::rint:               Opcode = ISD::FRINT;      break;
-          case Intrinsic::nearbyint:          Opcode = ISD::FNEARBYINT; break;
-          case Intrinsic::round:              Opcode = ISD::FROUND;     break;
-          case Intrinsic::minnum:             Opcode = ISD::FMINNUM;    break;
-          case Intrinsic::maxnum:             Opcode = ISD::FMAXNUM;    break;
-          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO;      break;
-          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO;      break;
-          }
-        }
-
-        // PowerPC does not use [US]DIVREM or other library calls for
-        // operations on regular types which are not otherwise library calls
-        // (i.e. soft float or atomics). If adapting for targets that do,
-        // additional care is required here.
-
-        LibFunc Func;
-        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
-            LibInfo->getLibFunc(F->getName(), Func) &&
-            LibInfo->hasOptimizedCodeGen(Func)) {
-          // Non-read-only functions are never treated as intrinsics.
-          if (!CI->onlyReadsMemory())
-            return true;
-
-          // Conversion happens only for FP calls.
-          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
-            return true;
-
-          switch (Func) {
-          default: return true;
-          case LibFunc_copysign:
-          case LibFunc_copysignf:
-            continue; // ISD::FCOPYSIGN is never a library call.
-          case LibFunc_copysignl:
-            return true;
-          case LibFunc_fabs:
-          case LibFunc_fabsf:
-          case LibFunc_fabsl:
-            continue; // ISD::FABS is never a library call.
-          case LibFunc_sqrt:
-          case LibFunc_sqrtf:
-          case LibFunc_sqrtl:
-            Opcode = ISD::FSQRT; break;
-          case LibFunc_floor:
-          case LibFunc_floorf:
-          case LibFunc_floorl:
-            Opcode = ISD::FFLOOR; break;
-          case LibFunc_nearbyint:
-          case LibFunc_nearbyintf:
-          case LibFunc_nearbyintl:
-            Opcode = ISD::FNEARBYINT; break;
-          case LibFunc_ceil:
-          case LibFunc_ceilf:
-          case LibFunc_ceill:
-            Opcode = ISD::FCEIL; break;
-          case LibFunc_rint:
-          case LibFunc_rintf:
-          case LibFunc_rintl:
-            Opcode = ISD::FRINT; break;
-          case LibFunc_round:
-          case LibFunc_roundf:
-          case LibFunc_roundl:
-            Opcode = ISD::FROUND; break;
-          case LibFunc_trunc:
-          case LibFunc_truncf:
-          case LibFunc_truncl:
-            Opcode = ISD::FTRUNC; break;
-          case LibFunc_fmin:
-          case LibFunc_fminf:
-          case LibFunc_fminl:
-            Opcode = ISD::FMINNUM; break;
-          case LibFunc_fmax:
-          case LibFunc_fmaxf:
-          case LibFunc_fmaxl:
-            Opcode = ISD::FMAXNUM; break;
-          }
-        }
-
-        if (Opcode) {
-          EVT EVTy =
-              TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true);
-
-          if (EVTy == MVT::Other)
-            return true;
-
-          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
-            continue;
-          else if (EVTy.isVector() &&
-                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
-            continue;
-
-          return true;
-        }
-      }
-
-      return true;
-    } else if (isa<BinaryOperator>(J) &&
-               J->getType()->getScalarType()->isPPC_FP128Ty()) {
-      // Most operations on ppc_f128 values become calls.
-      return true;
-    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
-               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
-      CastInst *CI = cast<CastInst>(J);
-      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
-          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
-          isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) ||
-          isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType()))
-        return true;
-    } else if (isLargeIntegerTy(!TM->isPPC64(),
-                                J->getType()->getScalarType()) &&
-               (J->getOpcode() == Instruction::UDiv ||
-                J->getOpcode() == Instruction::SDiv ||
-                J->getOpcode() == Instruction::URem ||
-                J->getOpcode() == Instruction::SRem)) {
-      return true;
-    } else if (!TM->isPPC64() &&
-               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
-               (J->getOpcode() == Instruction::Shl ||
-                J->getOpcode() == Instruction::AShr ||
-                J->getOpcode() == Instruction::LShr)) {
-      // Only on PPC32, for 128-bit integers (specifically not 64-bit
-      // integers), these might be runtime calls.
-      return true;
-    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
-      // On PowerPC, indirect jumps use the counter register.
-      return true;
-    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
-      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
-        return true;
-    }
-
-    // FREM is always a call.
-    if (J->getOpcode() == Instruction::FRem)
-      return true;
-
-    if (STI->useSoftFloat()) {
-      switch(J->getOpcode()) {
-      case Instruction::FAdd:
-      case Instruction::FSub:
-      case Instruction::FMul:
-      case Instruction::FDiv:
-      case Instruction::FPTrunc:
-      case Instruction::FPExt:
-      case Instruction::FPToUI:
-      case Instruction::FPToSI:
-      case Instruction::UIToFP:
-      case Instruction::SIToFP:
-      case Instruction::FCmp:
-        return true;
-      }
-    }
-
-    for (Value *Operand : J->operands())
-      if (memAddrUsesCTR(*TM, Operand))
-        return true;
-  }
-
-  return false;
-}
-bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
-  bool MadeChange = false;
-
-  // Do not convert small short loops to CTR loop.
-  unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
-  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
-    SmallPtrSet<const Value *, 32> EphValues;
-    auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
-        *L->getHeader()->getParent());
-    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
-    CodeMetrics Metrics;
-    for (BasicBlock *BB : L->blocks())
-      Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
-    // 6 is an approximate latency for the mtctr instruction.
-    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
-      return false;
-  }
-
-  // Process nested loops first.
-  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
-    MadeChange |= convertToCTRLoop(*I);
-    LLVM_DEBUG(dbgs() << "Nested loop converted\n");
-  }
-
-  // If a nested loop has been converted, then we can't convert this loop.
-  if (MadeChange)
-    return MadeChange;
-
-  // Bail out if the loop has irreducible control flow.
-  LoopBlocksRPO RPOT(L);
-  RPOT.perform(LI);
-  if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
-    return false;
-
-#ifndef NDEBUG
-  // Stop trying after reaching the limit (if any).
-  int Limit = CTRLoopLimit;
-  if (Limit >= 0) {
-    if (Counter >= CTRLoopLimit)
-      return false;
-    Counter++;
-  }
-#endif
-
-  // We don't want to spill/restore the counter register, and so we don't
-  // want to use the counter register if the loop contains calls.
-  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
-       I != IE; ++I)
-    if (mightUseCTR(*I))
-      return MadeChange;
-
-  SmallVector<BasicBlock*, 4> ExitingBlocks;
-  L->getExitingBlocks(ExitingBlocks);
-
-  // If there is an exit edge known to be frequently taken,
-  // we should not transform this loop.
-  for (auto &BB : ExitingBlocks) {
-    Instruction *TI = BB->getTerminator();
-    if (!TI) continue;
-
-    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
-      uint64_t TrueWeight = 0, FalseWeight = 0;
-      if (!BI->isConditional() ||
-          !BI->extractProfMetadata(TrueWeight, FalseWeight))
-        continue;
-
-      // If the exit path is more frequent than the loop path,
-      // we return here without further analysis for this loop.
-      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
-      if (( TrueIsExit && FalseWeight < TrueWeight) ||
-          (!TrueIsExit && FalseWeight > TrueWeight))
-        return MadeChange;
-    }
-  }
-
-  BasicBlock *CountedExitBlock = nullptr;
-  const SCEV *ExitCount = nullptr;
-  BranchInst *CountedExitBranch = nullptr;
-  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
-       IE = ExitingBlocks.end(); I != IE; ++I) {
-    const SCEV *EC = SE->getExitCount(L, *I);
-    LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block "
-                      << (*I)->getName() << ": " << *EC << "\n");
-    if (isa<SCEVCouldNotCompute>(EC))
-      continue;
-    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
-      if (ConstEC->getValue()->isZero())
-        continue;
-    } else if (!SE->isLoopInvariant(EC, L))
-      continue;
-
-    if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32))
-      continue;
-
-    // If this exiting block is contained in a nested loop, it is not eligible
-    // for insertion of the branch-and-decrement since the inner loop would
-    // end up messing up the value in the CTR.
-    if (LI->getLoopFor(*I) != L)
-      continue;
-
-    // We now have a loop-invariant count of loop iterations (which is not the
-    // constant zero) for which we know that this loop will not exit via this
-    // existing block.
-
-    // We need to make sure that this block will run on every loop iteration.
-    // For this to be true, we must dominate all blocks with backedges. Such
-    // blocks are in-loop predecessors to the header block.
-    bool NotAlways = false;
-    for (pred_iterator PI = pred_begin(L->getHeader()),
-         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
-      if (!L->contains(*PI))
-        continue;
-
-      if (!DT->dominates(*I, *PI)) {
-        NotAlways = true;
-        break;
-      }
-    }
-
-    if (NotAlways)
-      continue;
-
-    // Make sure this blocks ends with a conditional branch.
-    Instruction *TI = (*I)->getTerminator();
-    if (!TI)
-      continue;
-
-    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
-      if (!BI->isConditional())
-        continue;
-
-      CountedExitBranch = BI;
-    } else
-      continue;
-
-    // Note that this block may not be the loop latch block, even if the loop
-    // has a latch block.
-    CountedExitBlock = *I;
-    ExitCount = EC;
-    break;
-  }
-
-  if (!CountedExitBlock)
-    return MadeChange;
-
-  BasicBlock *Preheader = L->getLoopPreheader();
-
-  // If we don't have a preheader, then insert one. If we already have a
-  // preheader, then we can use it (except if the preheader contains a use of
-  // the CTR register because some such uses might be reordered by the
-  // selection DAG after the mtctr instruction).
-  if (!Preheader || mightUseCTR(Preheader))
-    Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
-  if (!Preheader)
-    return MadeChange;
-
-  LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName()
-                    << "\n");
-
-  // Insert the count into the preheader and replace the condition used by the
-  // selected branch.
-  MadeChange = true;
-
-  SCEVExpander SCEVE(*SE, *DL, "loopcnt");
-  LLVMContext &C = SE->getContext();
-  Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
-  if (!ExitCount->getType()->isPointerTy() &&
-      ExitCount->getType() != CountType)
-    ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
-  ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
-  Value *ECValue =
-      SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());
-
-  IRBuilder<> CountBuilder(Preheader->getTerminator());
-  Module *M = Preheader->getParent()->getParent();
-  Function *MTCTRFunc =
-      Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, CountType);
-  CountBuilder.CreateCall(MTCTRFunc, ECValue);
-
-  IRBuilder<> CondBuilder(CountedExitBranch);
-  Function *DecFunc =
-      Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
-  Value *NewCond = CondBuilder.CreateCall(DecFunc, {});
-  Value *OldCond = CountedExitBranch->getCondition();
-  CountedExitBranch->setCondition(NewCond);
-
-  // The false branch must exit the loop.
-  if (!L->contains(CountedExitBranch->getSuccessor(0)))
-    CountedExitBranch->swapSuccessors();
-
-  // The old condition may be dead now, and may have even created a dead PHI
-  // (the original induction variable).
-  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
-  // Run through the basic blocks of the loop and see if any of them have dead
-  // PHIs that can be removed.
-  for (auto I : L->blocks())
-    DeleteDeadPHIs(I);
-
-  ++NumCTRLoops;
-  return MadeChange;
-}
-
  #ifndef NDEBUG
  static bool clobbersCTR(const MachineInstr &MI) {
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp

index ee8fda9..116916f 100644 (file)
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9944,7 +9944,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
    }
    case ISD::INTRINSIC_W_CHAIN: {
      if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
-        Intrinsic::ppc_is_decremented_ctr_nonzero)
+        Intrinsic::loop_decrement)
        break;
  
      assert(N->getValueType(0) == MVT::i1 &&
@@ -13636,7 +13636,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
  
      if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
          cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero) {
+          Intrinsic::loop_decrement) {
  
        // We now need to make the intrinsic dead (it cannot be instruction
        // selected).
@@ -13662,14 +13662,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
      if (LHS.getOpcode() == ISD::AND &&
          LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
          cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+          Intrinsic::loop_decrement &&
          isa<ConstantSDNode>(LHS.getOperand(1)) &&
          !isNullConstant(LHS.getOperand(1)))
        LHS = LHS.getOperand(0);
  
      if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
          cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
-          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+          Intrinsic::loop_decrement &&
          isa<ConstantSDNode>(RHS)) {
        assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
               "Counter decrement comparison is not EQ or NE");
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td

index e17bc25..6b0e3a2 100644 (file)
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -388,7 +388,7 @@ def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
               PPC970_DGroup_First, PPC970_Unit_FXU;
  }
  let hasSideEffects = 1, Defs = [CTR8] in {
-let Pattern = [(int_ppc_mtctr i64:$rS)] in
+let Pattern = [(int_set_loop_iterations i64:$rS)] in
  def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
                                 "mtctr $rS", IIC_SprMTSPR>,
                   PPC970_DGroup_First, PPC970_Unit_FXU;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td

index 2ec6d6b..6b6787f 100644 (file)
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -2605,7 +2605,7 @@ def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
              PPC970_DGroup_First, PPC970_Unit_FXU;
  }
  let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
-let Pattern = [(int_ppc_mtctr i32:$rS)] in
+let Pattern = [(int_set_loop_iterations i32:$rS)] in
  def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
                                "mtctr $rS", IIC_SprMTSPR>,
                  PPC970_DGroup_First, PPC970_Unit_FXU;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp

index 915b89a..da1121b 100644 (file)
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -101,7 +101,6 @@ extern "C" void LLVMInitializePowerPCTarget() {
    RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());
  
    PassRegistry &PR = *PassRegistry::getPassRegistry();
-  initializePPCCTRLoopsPass(PR);
  #ifndef NDEBUG
    initializePPCCTRLoopsVerifyPass(PR);
  #endif
@@ -422,7 +421,7 @@ bool PPCPassConfig::addPreISel() {
      addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
  
    if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
-    addPass(createPPCCTRLoops());
+    addPass(createHardwareLoopsPass());
  
    return false;
  }
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp

index fdb2a38..bc29838 100644 (file)
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -7,10 +7,12 @@
  //===----------------------------------------------------------------------===//
  
  #include "PPCTargetTransformInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/CodeGen/BasicTTIImpl.h"
  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSchedule.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
  using namespace llvm;
@@ -31,6 +33,13 @@ EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                  cl::desc("Enable using coldcc calling conv for cold "
                           "internal functions"));
  
+// The latency of mtctr is only justified if there are more than 4
+// comparisons that will be removed as a result.
+static cl::opt<unsigned>
+SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+                      cl::desc("Loops with a constant trip count smaller than "
+                               "this value will not use the count register."));
+
  //===----------------------------------------------------------------------===//
  //
  // PPC cost model.
@@ -204,6 +213,341 @@ unsigned PPCTTIImpl::getUserCost(const User *U,
    return BaseT::getUserCost(U, Operands);
  }
  
+bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
+                             TargetLibraryInfo *LibInfo) {
+  const PPCTargetMachine &TM = ST->getTargetMachine();
+
+  // Loop through the inline asm constraints and look for something that
+  // clobbers ctr.
+  auto asmClobbersCTR = [](InlineAsm *IA) {
+    InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+    for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+      InlineAsm::ConstraintInfo &C = CIV[i];
+      if (C.Type != InlineAsm::isInput)
+        for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+          if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+            return true;
+    }
+    return false;
+  };
+
+  // Determining the address of a TLS variable results in a function call in
+  // certain TLS models.
+  std::function<bool(const Value*)> memAddrUsesCTR =
+    [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool {
+    const auto *GV = dyn_cast<GlobalValue>(MemAddr);
+    if (!GV) {
+      // Recurse to check for constants that refer to TLS global variables.
+      if (const auto *CV = dyn_cast<Constant>(MemAddr))
+        for (const auto &CO : CV->operands())
+          if (memAddrUsesCTR(CO))
+            return true;
+
+      return false;
+    }
+
+    if (!GV->isThreadLocal())
+      return false;
+    TLSModel::Model Model = TM.getTLSModel(GV);
+    return Model == TLSModel::GeneralDynamic ||
+      Model == TLSModel::LocalDynamic;
+  };
+
+  auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
+    if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+      return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
+
+    return false;
+  };
+
+  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+       J != JE; ++J) {
+    if (CallInst *CI = dyn_cast<CallInst>(J)) {
+      // Inline ASM is okay, unless it clobbers the ctr register.
+      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+        if (asmClobbersCTR(IA))
+          return true;
+        continue;
+      }
+
+      if (Function *F = CI->getCalledFunction()) {
+        // Most intrinsics don't become function calls, but some might.
+        // sin, cos, exp and log are always calls.
+        unsigned Opcode = 0;
+        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+          switch (F->getIntrinsicID()) {
+          default: continue;
+          // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
+          // we're definitely using CTR.
+          case Intrinsic::set_loop_iterations:
+          case Intrinsic::loop_decrement:
+            return true;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                       !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+ // let's return it to _setjmp state
+#  pragma pop_macro("setjmp")
+#  undef setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::longjmp:
+
+          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+          // because, although it does clobber the counter register, the
+          // control can't then return to inside the loop unless there is also
+          // an eh_sjlj_setjmp.
+          case Intrinsic::eh_sjlj_setjmp:
+
+          case Intrinsic::memcpy:
+          case Intrinsic::memmove:
+          case Intrinsic::memset:
+          case Intrinsic::powi:
+          case Intrinsic::log:
+          case Intrinsic::log2:
+          case Intrinsic::log10:
+          case Intrinsic::exp:
+          case Intrinsic::exp2:
+          case Intrinsic::pow:
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+            return true;
+          case Intrinsic::copysign:
+            if (CI->getArgOperand(0)->getType()->getScalarType()->
+                isPPC_FP128Ty())
+              return true;
+            else
+              continue; // ISD::FCOPYSIGN is never a library call.
+          case Intrinsic::sqrt:               Opcode = ISD::FSQRT;      break;
+          case Intrinsic::floor:              Opcode = ISD::FFLOOR;     break;
+          case Intrinsic::ceil:               Opcode = ISD::FCEIL;      break;
+          case Intrinsic::trunc:              Opcode = ISD::FTRUNC;     break;
+          case Intrinsic::rint:               Opcode = ISD::FRINT;      break;
+          case Intrinsic::nearbyint:          Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round:              Opcode = ISD::FROUND;     break;
+          case Intrinsic::minnum:             Opcode = ISD::FMINNUM;    break;
+          case Intrinsic::maxnum:             Opcode = ISD::FMAXNUM;    break;
+          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO;      break;
+          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO;      break;
+          }
+        }
+
+        // PowerPC does not use [US]DIVREM or other library calls for
+        // operations on regular types which are not otherwise library calls
+        // (i.e. soft float or atomics). If adapting for targets that do,
+        // additional care is required here.
+
+        LibFunc Func;
+        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+            LibInfo->getLibFunc(F->getName(), Func) &&
+            LibInfo->hasOptimizedCodeGen(Func)) {
+          // Non-read-only functions are never treated as intrinsics.
+          if (!CI->onlyReadsMemory())
+            return true;
+
+          // Conversion happens only for FP calls.
+          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+            return true;
+
+          switch (Func) {
+          default: return true;
+          case LibFunc_copysign:
+          case LibFunc_copysignf:
+            continue; // ISD::FCOPYSIGN is never a library call.
+          case LibFunc_copysignl:
+            return true;
+          case LibFunc_fabs:
+          case LibFunc_fabsf:
+          case LibFunc_fabsl:
+            continue; // ISD::FABS is never a library call.
+          case LibFunc_sqrt:
+          case LibFunc_sqrtf:
+          case LibFunc_sqrtl:
+            Opcode = ISD::FSQRT; break;
+          case LibFunc_floor:
+          case LibFunc_floorf:
+          case LibFunc_floorl:
+            Opcode = ISD::FFLOOR; break;
+          case LibFunc_nearbyint:
+          case LibFunc_nearbyintf:
+          case LibFunc_nearbyintl:
+            Opcode = ISD::FNEARBYINT; break;
+          case LibFunc_ceil:
+          case LibFunc_ceilf:
+          case LibFunc_ceill:
+            Opcode = ISD::FCEIL; break;
+          case LibFunc_rint:
+          case LibFunc_rintf:
+          case LibFunc_rintl:
+            Opcode = ISD::FRINT; break;
+          case LibFunc_round:
+          case LibFunc_roundf:
+          case LibFunc_roundl:
+            Opcode = ISD::FROUND; break;
+          case LibFunc_trunc:
+          case LibFunc_truncf:
+          case LibFunc_truncl:
+            Opcode = ISD::FTRUNC; break;
+          case LibFunc_fmin:
+          case LibFunc_fminf:
+          case LibFunc_fminl:
+            Opcode = ISD::FMINNUM; break;
+          case LibFunc_fmax:
+          case LibFunc_fmaxf:
+          case LibFunc_fmaxl:
+            Opcode = ISD::FMAXNUM; break;
+          }
+        }
+
+        if (Opcode) {
+          EVT EVTy =
+              TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
+
+          if (EVTy == MVT::Other)
+            return true;
+
+          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
+            continue;
+          else if (EVTy.isVector() &&
+                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
+            continue;
+
+          return true;
+        }
+      }
+
+      return true;
+    } else if (isa<BinaryOperator>(J) &&
+               J->getType()->getScalarType()->isPPC_FP128Ty()) {
+      // Most operations on ppc_f128 values become calls.
+      return true;
+    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+      CastInst *CI = cast<CastInst>(J);
+      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
+          isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
+        return true;
+    } else if (isLargeIntegerTy(!TM.isPPC64(),
+                                J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::UDiv ||
+                J->getOpcode() == Instruction::SDiv ||
+                J->getOpcode() == Instruction::URem ||
+                J->getOpcode() == Instruction::SRem)) {
+      return true;
+    } else if (!TM.isPPC64() &&
+               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::Shl ||
+                J->getOpcode() == Instruction::AShr ||
+                J->getOpcode() == Instruction::LShr)) {
+      // Only on PPC32, for 128-bit integers (specifically not 64-bit
+      // integers), these might be runtime calls.
+      return true;
+    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+      // On PowerPC, indirect jumps use the counter register.
+      return true;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
+        return true;
+    }
+
+    // FREM is always a call.
+    if (J->getOpcode() == Instruction::FRem)
+      return true;
+
+    if (ST->useSoftFloat()) {
+      switch(J->getOpcode()) {
+      case Instruction::FAdd:
+      case Instruction::FSub:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FPTrunc:
+      case Instruction::FPExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::UIToFP:
+      case Instruction::SIToFP:
+      case Instruction::FCmp:
+        return true;
+      }
+    }
+
+    for (Value *Operand : J->operands())
+      if (memAddrUsesCTR(Operand))
+        return true;
+  }
+
+  return false;
+}
+
+bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                          AssumptionCache &AC,
+                                          TargetLibraryInfo *LibInfo,
+                                          TTI::HardwareLoopInfo &HWLoopInfo) {
+  const PPCTargetMachine &TM = ST->getTargetMachine();
+  TargetSchedModel SchedModel;
+  SchedModel.init(ST);
+
+  // Do not convert small short loops to CTR loop.
+  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
+  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+    SmallPtrSet<const Value *, 32> EphValues;
+    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+    CodeMetrics Metrics;
+    for (BasicBlock *BB : L->blocks())
+      Metrics.analyzeBasicBlock(BB, *this, EphValues);
+    // 6 is an approximate latency for the mtctr instruction.
+    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+      return false;
+  }
+
+  // We don't want to spill/restore the counter register, and so we don't
+  // want to use the counter register if the loop contains calls.
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I)
+    if (mightUseCTR(*I, LibInfo))
+      return false;
+
+  SmallVector<BasicBlock*, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  // If there is an exit edge known to be frequently taken,
+  // we should not transform this loop.
+  for (auto &BB : ExitingBlocks) {
+    Instruction *TI = BB->getTerminator();
+    if (!TI) continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      uint64_t TrueWeight = 0, FalseWeight = 0;
+      if (!BI->isConditional() ||
+          !BI->extractProfMetadata(TrueWeight, FalseWeight))
+        continue;
+
+      // If the exit path is more frequent than the loop path,
+      // we return here without further analysis for this loop.
+      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
+      if (( TrueIsExit && FalseWeight < TrueWeight) ||
+          (!TrueIsExit && FalseWeight > TrueWeight))
+        return false;
+    }
+  }
+
+  LLVMContext &C = L->getHeader()->getContext();
+  HWLoopInfo.CountType = TM.isPPC64() ?
+    Type::getInt64Ty(C) : Type::getInt32Ty(C);
+  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
+  return true;
+}
+
  void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP) {
    if (ST->getDarwinDirective() == PPC::DIR_A2) {
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h

index 993b297..93a9968 100644 (file)
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -33,6 +33,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
  
    const PPCSubtarget *getST() const { return ST; }
    const PPCTargetLowering *getTLI() const { return TLI; }
+  bool mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo);
  
  public:
    explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
@@ -52,6 +53,10 @@ public:
    unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
  
    TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
+                                AssumptionCache &AC,
+                                TargetLibraryInfo *LibInfo,
+                                TTI::HardwareLoopInfo &HWLoopInfo);
    void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                 TTI::UnrollingPreferences &UP);
  
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-intrin.ll b/llvm/test/CodeGen/PowerPC/ctrloop-intrin.ll

index 6ae5d33..9e18280 100644 (file)
--- a/llvm/test/CodeGen/PowerPC/ctrloop-intrin.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-intrin.ll
@@ -263,7 +263,7 @@ for.body.116.preheader:                           ; preds = %for.cond.112.prehea
    %8 = sub i64 0, %int_part_ptr.02534
    %scevgep5 = getelementptr i8, i8* %call109, i64 %8
    %scevgep56 = ptrtoint i8* %scevgep5 to i64
-  call void @llvm.ppc.mtctr.i64(i64 %scevgep56)
+  call void @llvm.set.loop.iterations.i64(i64 %scevgep56)
    br label %for.body.116
  
  for.cond.cleanup:                                 ; preds = %if.end.138, %if.end.105
@@ -298,8 +298,9 @@ for.body.116:                                     ; preds = %for.body.116, %for.
    %conv134 = trunc i32 %add133 to i8
    %scevgep = getelementptr i8, i8* inttoptr (i64 -1 to i8*), i64 %call109.pn2
    store i8 %conv134, i8* %scevgep, align 1, !tbaa !10
-  %12 = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
-  br i1 %12, label %for.body.116, label %for.cond.cleanup.115
+  %12 = call i64 @llvm.loop.dec(i64 %scevgep56, i64 1)
+  %dec.cmp = icmp ne i64 %12, 0
+  br i1 %dec.cmp, label %for.body.116, label %for.cond.cleanup.115
  
  if.then.136:                                      ; preds = %for.cond.cleanup.115
    %incdec.ptr137 = getelementptr inbounds i8, i8* %int_part_ptr.0253, i64 -1
@@ -323,10 +324,10 @@ cleanup.148:                                      ; preds = %for.cond.cleanup, %
  declare i8* @memcpy(i8*, i8* nocapture readonly, i64) #1
  
  ; Function Attrs: nounwind
-declare void @llvm.ppc.mtctr.i64(i64) #0
+declare void @llvm.set.loop.iterations.i64(i64) #0
  
  ; Function Attrs: nounwind
-declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #0
+declare i64 @llvm.loop.dec(i64, i64) #0
  
  attributes #0 = { nounwind }
  attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/PowerPC/ppc-passname.ll b/llvm/test/CodeGen/PowerPC/ppc-passname.ll

index 38bf60c..005f0a2 100644 (file)
--- a/llvm/test/CodeGen/PowerPC/ppc-passname.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc-passname.ll
@@ -1,15 +1,3 @@
-; Test pass name: ppc-ctr-loops.
-; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-CTR-LOOPS
-; STOP-BEFORE-CTR-LOOPS-NOT: -ppc-ctr-loops
-; STOP-BEFORE-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered.
-; STOP-BEFORE-CTR-LOOPS-NOT: PowerPC CTR Loops 
-
-; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-CTR-LOOPS
-; STOP-AFTER-CTR-LOOPS: -ppc-ctr-loops
-; STOP-AFTER-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered.
-; STOP-AFTER-CTR-LOOPS: PowerPC CTR Loops 
-
-
  ; Test pass name: ppc-loop-preinc-prep.
  ; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-loop-preinc-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-LOOP-PREINC-PREP
  ; STOP-BEFORE-LOOP-PREINC-PREP-NOT: -ppc-loop-preinc-prep
diff --git a/llvm/test/Transforms/HardwareLoops/scalar-while.ll b/llvm/test/Transforms/HardwareLoops/scalar-while.ll

new file mode 100644 (file)

index 0000000..d1fe3e0
--- /dev/null
+++ b/llvm/test/Transforms/HardwareLoops/scalar-while.ll
@@ -0,0 +1,144 @@
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-REGDEC
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC --check-prefix=CHECK-NESTED
+
+; CHECK-LABEL: while_lt
+define void @while_lt(i32 %i, i32 %N, i32* nocapture %A) {
+entry:
+  %cmp4 = icmp ult i32 %i, %N
+  br i1 %cmp4, label %while.body, label %while.end
+
+; CHECK: while.body.preheader:
+; CHECK: [[COUNT:%[^ ]+]] = sub i32 %N, %i
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: br label %while.body
+
+; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
+
+; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
+
+while.body:
+  %i.addr.05 = phi i32 [ %inc, %while.body ], [ %i, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+  store i32 %i.addr.05, i32* %arrayidx, align 4
+  %inc = add nuw i32 %i.addr.05, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+; CHECK-LABEL: while_gt
+; CHECK: while.body.preheader:
+; CHECK: [[COUNT:%[^ ]+]] = sub i32 %i, %N
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: br label %while.body
+
+; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
+
+; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
+
+define void @while_gt(i32 %i, i32 %N, i32* nocapture %A) {
+entry:
+  %cmp4 = icmp sgt i32 %i, %N
+  br i1 %cmp4, label %while.body, label %while.end
+
+while.body:
+  %i.addr.05 = phi i32 [ %dec, %while.body ], [ %i, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+  store i32 %i.addr.05, i32* %arrayidx, align 4
+  %dec = add nsw i32 %i.addr.05, -1
+  %cmp = icmp sgt i32 %dec, %N
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+  ret void
+}
+
+; CHECK-LABEL: while_gte
+; CHECK: while.body.preheader:
+; CHECK: [[ADD:%[^ ]+]] = add i32 %i, 1
+; CHECK: [[SEL:%[^ ]+]] = icmp slt i32 %N, %i
+; CHECK: [[MIN:%[^ ]+]] = select i1 [[SEL]], i32 %N, i32 %i
+; CHECK: [[COUNT:%[^ ]+]] = sub i32 [[ADD]], [[MIN]]
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: br label %while.body
+
+; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
+
+; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
+
+define void @while_gte(i32 %i, i32 %N, i32* nocapture %A) {
+entry:
+  %cmp4 = icmp slt i32 %i, %N
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:
+  %i.addr.05 = phi i32 [ %dec, %while.body ], [ %i, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+  store i32 %i.addr.05, i32* %arrayidx, align 4
+  %dec = add nsw i32 %i.addr.05, -1
+  %cmp = icmp sgt i32 %i.addr.05, %N
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+  ret void
+}
+
+; CHECK-LABEL: nested
+; CHECK-NESTED: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK-NESTED: br label %while.cond1.preheader.us
+
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: br label %while.body3.us
+
+; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
+
+; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ %N, %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ]
+; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK-REGDEC: br i1 [[CMP]], label %while.body3.us, label %while.cond1.while.end_crit_edge.us
+
+; CHECK-NESTED: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NESTED: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7
+
+define void @nested(i32* nocapture %A, i32 %N) {
+entry:
+  %cmp20 = icmp eq i32 %N, 0
+  br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
+
+while.cond1.preheader.us:
+  %i.021.us = phi i32 [ %inc6.us, %while.cond1.while.end_crit_edge.us ], [ 0, %entry ]
+  %mul.us = mul i32 %i.021.us, %N
+  br label %while.body3.us
+
+while.body3.us:
+  %j.019.us = phi i32 [ 0, %while.cond1.preheader.us ], [ %inc.us, %while.body3.us ]
+  %add.us = add i32 %j.019.us, %mul.us
+  %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us
+  store i32 %add.us, i32* %arrayidx.us, align 4
+  %inc.us = add nuw i32 %j.019.us, 1
+  %exitcond = icmp eq i32 %inc.us, %N
+  br i1 %exitcond, label %while.cond1.while.end_crit_edge.us, label %while.body3.us
+
+while.cond1.while.end_crit_edge.us:
+  %inc6.us = add nuw i32 %i.021.us, 1
+  %exitcond23 = icmp eq i32 %inc6.us, %N
+  br i1 %exitcond23, label %while.end7, label %while.cond1.preheader.us
+
+while.end7:
+  ret void
+}
diff --git a/llvm/test/Transforms/HardwareLoops/unscevable.ll b/llvm/test/Transforms/HardwareLoops/unscevable.ll

new file mode 100644 (file)

index 0000000..63932c5
--- /dev/null
+++ b/llvm/test/Transforms/HardwareLoops/unscevable.ll
@@ -0,0 +1,47 @@
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s
+
+; CHECK-LABEL: float_counter
+; CHECK-NOT: set.loop.iterations
+; CHECK-NOT: loop.decrement
+define void @float_counter(i32* nocapture %A, float %N) {
+entry:
+  %cmp6 = fcmp ogt float %N, 0.000000e+00
+  br i1 %cmp6, label %while.body, label %while.end
+
+while.body:
+  %i.07 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.07
+  store i32 %i.07, i32* %arrayidx, align 4
+  %inc = add i32 %i.07, 1
+  %conv = uitofp i32 %inc to float
+  %cmp = fcmp olt float %conv, %N
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+  ret void
+}
+
+; CHECK-LABEL: variant_counter
+; CHECK-NOT: set.loop.iterations
+; CHECK-NOT: loop.decrement
+define void @variant_counter(i32* nocapture %A, i32* nocapture readonly %B) {
+entry:
+  %0 = load i32, i32* %B, align 4
+  %cmp7 = icmp eq i32 %0, 0
+  br i1 %cmp7, label %while.end, label %while.body
+
+while.body:
+  %i.08 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.08
+  store i32 %i.08, i32* %arrayidx1, align 4
+  %inc = add nuw i32 %i.08, 1
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %inc
+  %1 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp ult i32 %inc, %1
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+  ret void
+}
diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp

index 66c5cd0..a84e2b9 100644 (file)
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -308,6 +308,7 @@ int main(int argc, char **argv) {
    initializeVectorization(*Registry);
    initializeScalarizeMaskedMemIntrinPass(*Registry);
    initializeExpandReductionsPass(*Registry);
+  initializeHardwareLoopsPass(*Registry);
  
    // Initialize debugging passes.
    initializeScavengerTestPass(*Registry);
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp

index 7053c2d..4871eb2 100644 (file)
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -528,6 +528,7 @@ int main(int argc, char **argv) {
    initializeExpandReductionsPass(Registry);
    initializeWasmEHPreparePass(Registry);
    initializeWriteBitcodePassPass(Registry);
+  initializeHardwareLoopsPass(Registry);
  
  #ifdef LINK_POLLY_INTO_TOOLS
    polly::initializePollyPasses(Registry);
author	Sam Parker <sam.parker@arm.com>
	Fri, 7 Jun 2019 07:35:30 +0000 (07:35 +0000)
committer	Sam Parker <sam.parker@arm.com>
	Fri, 7 Jun 2019 07:35:30 +0000 (07:35 +0000)
llvm/include/llvm/Analysis/TargetTransformInfo.h		patch \| blob \| history
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h		patch \| blob \| history
llvm/include/llvm/CodeGen/BasicTTIImpl.h		patch \| blob \| history
llvm/include/llvm/CodeGen/Passes.h		patch \| blob \| history
llvm/include/llvm/IR/Intrinsics.td		patch \| blob \| history
llvm/include/llvm/InitializePasses.h		patch \| blob \| history
llvm/include/llvm/LinkAllPasses.h		patch \| blob \| history
llvm/lib/Analysis/TargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/CodeGen/CMakeLists.txt		patch \| blob \| history
llvm/lib/CodeGen/CodeGen.cpp		patch \| blob \| history
llvm/lib/CodeGen/HardwareLoops.cpp	[new file with mode: 0644]	patch \| blob
llvm/lib/Target/PowerPC/PPCCTRLoops.cpp		patch \| blob \| history
llvm/lib/Target/PowerPC/PPCISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/PowerPC/PPCInstr64Bit.td		patch \| blob \| history
llvm/lib/Target/PowerPC/PPCInstrInfo.td		patch \| blob \| history
llvm/lib/Target/PowerPC/PPCTargetMachine.cpp		patch \| blob \| history
llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h		patch \| blob \| history
llvm/test/CodeGen/PowerPC/ctrloop-intrin.ll		patch \| blob \| history
llvm/test/CodeGen/PowerPC/ppc-passname.ll		patch \| blob \| history
llvm/test/Transforms/HardwareLoops/scalar-while.ll	[new file with mode: 0644]	patch \| blob
llvm/test/Transforms/HardwareLoops/unscevable.ll	[new file with mode: 0644]	patch \| blob
llvm/tools/llc/llc.cpp		patch \| blob \| history
llvm/tools/opt/opt.cpp		patch \| blob \| history