[AMDGPU][NewPM] Port amdgpu-promote-alloca(-to-vector)

author Arthur Eubanks <aeubanks@google.com>
Mon, 28 Dec 2020 20:14:51 +0000 (12:14 -0800)
committer Arthur Eubanks <aeubanks@google.com>
Tue, 29 Dec 2020 01:52:31 +0000 (17:52 -0800)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h

index 22d264e2880b3e67e5da993c82889072569bc914..6a0ba20e8026be838d666f80682eaaa1731aaf06 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -208,6 +208,23 @@ FunctionPass *createAMDGPUPromoteAllocaToVector();
  void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&);
  extern char &AMDGPUPromoteAllocaToVectorID;
  
+struct AMDGPUPromoteAllocaPass : PassInfoMixin<AMDGPUPromoteAllocaPass> {
+  AMDGPUPromoteAllocaPass(TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  TargetMachine &TM;
+};
+
+struct AMDGPUPromoteAllocaToVectorPass
+    : PassInfoMixin<AMDGPUPromoteAllocaToVectorPass> {
+  AMDGPUPromoteAllocaToVectorPass(TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  TargetMachine &TM;
+};
+
  Pass *createAMDGPUStructurizeCFGPass();
  FunctionPass *createAMDGPUISelDag(
    TargetMachine *TM = nullptr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

index 4cfe0edfc5330eeea5e8af836cfd1e06cec5ed5d..3dc7b16430813603eafe3338692990d781b8ee2b 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -42,6 +42,7 @@
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Metadata.h"
  #include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
  #include "llvm/IR/Type.h"
  #include "llvm/IR/User.h"
  #include "llvm/IR/Value.h"
@@ -83,8 +84,26 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
  
  // FIXME: This can create globals so should be a module pass.
  class AMDGPUPromoteAlloca : public FunctionPass {
+public:
+  static char ID;
+
+  AMDGPUPromoteAlloca() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
+
+  bool handleAlloca(AllocaInst &I, bool SufficientLDS);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+class AMDGPUPromoteAllocaImpl {
  private:
-  const TargetMachine *TM;
+  const TargetMachine &TM;
    Module *Mod = nullptr;
    const DataLayout *DL = nullptr;
  
@@ -116,28 +135,14 @@ private:
    /// Check whether we have enough local memory for promotion.
    bool hasSufficientLocalMem(const Function &F);
  
-public:
-  static char ID;
-
-  AMDGPUPromoteAlloca() : FunctionPass(ID) {}
-
-  bool doInitialization(Module &M) override;
-  bool runOnFunction(Function &F) override;
-
-  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
-
    bool handleAlloca(AllocaInst &I, bool SufficientLDS);
  
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    FunctionPass::getAnalysisUsage(AU);
-  }
+public:
+  AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {}
+  bool run(Function &F);
  };
  
  class AMDGPUPromoteAllocaToVector : public FunctionPass {
-private:
-  unsigned MaxVGPRs;
-
  public:
    static char ID;
  
@@ -149,8 +154,6 @@ public:
      return "AMDGPU Promote Alloca to vector";
    }
  
-  bool handleAlloca(AllocaInst &I);
-
    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.setPreservesCFG();
      FunctionPass::getAnalysisUsage(AU);
@@ -171,32 +174,41 @@ INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
  char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
  char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
  
-bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
-  Mod = &M;
-  DL = &Mod->getDataLayout();
+bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
  
+  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
+    return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F);
+  }
    return false;
  }
  
-bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
+PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
+                                               FunctionAnalysisManager &AM) {
+  bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F);
+  if (Changed) {
+    PreservedAnalyses PA;
+    PA.preserveSet<CFGAnalyses>();
+    return PA;
+  }
+  return PreservedAnalyses::all();
+}
  
-  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
-    TM = &TPC->getTM<TargetMachine>();
-  else
-    return false;
+bool AMDGPUPromoteAllocaImpl::run(Function &F) {
+  Mod = F.getParent();
+  DL = &Mod->getDataLayout();
  
-  const Triple &TT = TM->getTargetTriple();
+  const Triple &TT = TM.getTargetTriple();
    IsAMDGCN = TT.getArch() == Triple::amdgcn;
    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
  
-  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
    if (!ST.isPromoteAllocaEnabled())
      return false;
  
    if (IsAMDGCN) {
-    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
    } else {
      MaxVGPRs = 128;
@@ -221,9 +233,9 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  }
  
  std::pair<Value *, Value *>
-AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
+AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
    const Function &F = *Builder.GetInsertBlock()->getParent();
-  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
  
    if (!IsAMDHSA) {
      Function *LocalSizeYFn
@@ -308,9 +320,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
    return std::make_pair(Y, LoadZU);
  }
  
-Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
+                                              unsigned N) {
    const AMDGPUSubtarget &ST =
-      AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
+      AMDGPUSubtarget::get(TM, *Builder.GetInsertBlock()->getParent());
    Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
  
    switch (N) {
@@ -592,11 +605,9 @@ static bool isCallPromotable(CallInst *CI) {
    }
  }
  
-bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
-                                                          Value *Val,
-                                                          Instruction *Inst,
-                                                          int OpIdx0,
-                                                          int OpIdx1) const {
+bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
+    Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
+    int OpIdx1) const {
    // Figure out which operand is the one we might not be promoting.
    Value *OtherOp = Inst->getOperand(OpIdx0);
    if (Val == OtherOp)
@@ -624,10 +635,8 @@ bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
    return true;
  }
  
-bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
-  Value *BaseAlloca,
-  Value *Val,
-  std::vector<Value*> &WorkList) const {
+bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
+    Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
  
    for (User *User : Val->users()) {
      if (is_contained(WorkList, User))
@@ -727,10 +736,10 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
    return true;
  }
  
-bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
+bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
  
    FunctionType *FTy = F.getFunctionType();
-  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
  
    // If the function has any arguments in the local address space, then it's
    // possible these arguments require the entire local memory space, so
@@ -863,7 +872,7 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
  }
  
  // FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
+bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
    // Array allocations are probably not worth handling, since an allocation of
    // the array type is the canonical form.
    if (!I.isStaticAlloca() || I.isArrayAllocation())
@@ -904,7 +913,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
    if (!SufficientLDS)
      return false;
  
-  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
    unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
  
    Align Alignment =
@@ -1083,22 +1092,29 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
    return true;
  }
  
-bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
-  if (skipFunction(F) || DisablePromoteAllocaToVector)
+bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) {
+  // Array allocations are probably not worth handling, since an allocation of
+  // the array type is the canonical form.
+  if (!I.isStaticAlloca() || I.isArrayAllocation())
      return false;
  
-  const TargetMachine *TM;
-  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
-    TM = &TPC->getTM<TargetMachine>();
-  else
+  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+  Module *Mod = I.getParent()->getParent()->getParent();
+  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
+}
+
+bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
+  if (DisablePromoteAllocaToVector)
      return false;
  
-  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
    if (!ST.isPromoteAllocaEnabled())
      return false;
  
-  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
-    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+  unsigned MaxVGPRs;
+  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
    } else {
      MaxVGPRs = 128;
@@ -1114,23 +1130,31 @@ bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
    }
  
    for (AllocaInst *AI : Allocas) {
-    if (handleAlloca(*AI))
+    if (handlePromoteAllocaToVector(*AI, MaxVGPRs))
        Changed = true;
    }
  
    return Changed;
  }
  
-bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
-  // Array allocations are probably not worth handling, since an allocation of
-  // the array type is the canonical form.
-  if (!I.isStaticAlloca() || I.isArrayAllocation())
+bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
+  if (skipFunction(F))
      return false;
+  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
+    return promoteAllocasToVector(F, TPC->getTM<TargetMachine>());
+  }
+  return false;
+}
  
-  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
-
-  Module *Mod = I.getParent()->getParent()->getParent();
-  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
+PreservedAnalyses
+AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
+  bool Changed = promoteAllocasToVector(F, TM);
+  if (Changed) {
+    PreservedAnalyses PA;
+    PA.preserveSet<CFGAnalyses>();
+    return PA;
+  }
+  return PreservedAnalyses::all();
  }
  
  FunctionPass *llvm::createAMDGPUPromoteAlloca() {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

index 6ea99bdf9206d90aff8b69ec9c2c39d0a53675ea..89ae9d8029e038ea71c05563f57d691face70bad 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -29,6 +29,7 @@
  #include "SIMachineFunctionInfo.h"
  #include "SIMachineScheduler.h"
  #include "TargetInfo/AMDGPUTargetInfo.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
  #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
  #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
  #include "llvm/CodeGen/GlobalISel/Legalizer.h"
@@ -488,8 +489,8 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
                                                         bool DebugPassManager) {
    PB.registerPipelineParsingCallback(
-      [](StringRef PassName, FunctionPassManager &PM,
-         ArrayRef<PassBuilder::PipelineElement>) {
+      [this](StringRef PassName, FunctionPassManager &PM,
+             ArrayRef<PassBuilder::PipelineElement>) {
          if (PassName == "amdgpu-simplifylib") {
            PM.addPass(AMDGPUSimplifyLibCallsPass());
            return true;
@@ -498,6 +499,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
            PM.addPass(AMDGPUUseNativeCallsPass());
            return true;
          }
+        if (PassName == "amdgpu-promote-alloca") {
+          PM.addPass(AMDGPUPromoteAllocaPass(*this));
+          return true;
+        }
+        if (PassName == "amdgpu-promote-alloca-to-vector") {
+          PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
+          return true;
+        }
          return false;
        });
  
@@ -510,6 +519,18 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
        FPM.addPass(AMDGPUSimplifyLibCallsPass());
      PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
    });
+
+  PB.registerCGSCCOptimizerLateEPCallback(
+      [this, DebugPassManager](CGSCCPassManager &PM,
+                               PassBuilder::OptimizationLevel Level) {
+        if (Level != PassBuilder::OptimizationLevel::O0) {
+          FunctionPassManager FPM(DebugPassManager);
+          // Promote alloca to vector before SROA and loop unroll. If we manage
+          // to eliminate allocas before unroll we may choose to unroll less.
+          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
+          PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
+        }
+      });
  }
  
  //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll

index b71454c720b1a4377f9a087640b4898f657f5cad..47839dcb934426e42b87b29590707cd7240bb283 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
@@ -1,5 +1,7 @@
  ; RUN: opt -mtriple=amdgcn-- -O1 -S < %s | FileCheck %s --check-prefixes=FUNC,LOOP
+; RUN: opt -mtriple=amdgcn-- -passes='default<O1>' -S < %s | FileCheck %s --check-prefixes=FUNC,LOOP
  ; RUN: opt -mtriple=amdgcn-- -O1 -S -disable-promote-alloca-to-vector < %s | FileCheck %s --check-prefixes=FUNC,FULL-UNROLL
+; RUN: opt -mtriple=amdgcn-- -passes='default<O1>' -S -disable-promote-alloca-to-vector < %s | FileCheck %s --check-prefixes=FUNC,FULL-UNROLL
  
  target datalayout = "A5"
  
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll

index 3b6df750ff07ee413554eb8d486a631a61ab5b03..0d37ed60c83b880f660acca82202a751fc6ea869 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -4,6 +4,7 @@
  ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
  ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
  ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-promote-alloca,sroa,instcombine < %s | FileCheck -check-prefix=OPT %s
  target datalayout = "A5"
  
  ; OPT-LABEL: @vector_read(
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp

index d7a39c9118112081f362cfe08ef30897f400bd2b..6a2c21e800349dbca35ef66ce6f7f33aaf1afd7e 100644 (file)
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -462,6 +462,13 @@ struct TimeTracerRAII {
  // TODO: use a codegen version of PassRegistry.def/PassBuilder::is*Pass() once
  // it exists.
  static bool shouldPinPassToLegacyPM(StringRef Pass) {
+  std::vector<StringRef> PassNameExactToIgnore = {
+      "amdgpu-simplifylib", "amdgpu-usenative", "amdgpu-promote-alloca",
+      "amdgpu-promote-alloca-to-vector"};
+  for (const auto &P : PassNameExactToIgnore)
+    if (Pass == P)
+      return false;
+
    std::vector<StringRef> PassNamePrefix = {
        "x86-",  "xcore-", "wasm-",    "systemz-", "ppc-",   "nvvm-",   "nvptx-",
        "mips-", "lanai-", "hexagon-", "bpf-",     "avr-",   "thumb2-", "arm-",
author	Arthur Eubanks <aeubanks@google.com>
	Mon, 28 Dec 2020 20:14:51 +0000 (12:14 -0800)
committer	Arthur Eubanks <aeubanks@google.com>
	Tue, 29 Dec 2020 01:52:31 +0000 (17:52 -0800)
llvm/lib/Target/AMDGPU/AMDGPU.h		patch | blob | history
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp		patch | blob | history
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp		patch | blob | history
llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/vector-alloca.ll		patch | blob | history
llvm/tools/opt/opt.cpp		patch | blob | history