From 9cf995be6bb7096747710876f2f2239b4d8367a8 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Fri, 8 Oct 2021 13:04:32 -0700
Subject: [PATCH] [AMDGPU] Promote generic pointer kernel arguments into global

The new pass walks a kernel's pointer arguments, then the loads from
them. If a loaded value is itself a pointer and that pointer is not
modified in the kernel before the load, the loaded pointer is promoted
to the global address space. The pass then continues recursively on the
promoted pointers.

Differential Revision: https://reviews.llvm.org/D111464
---
 llvm/lib/Target/AMDGPU/AMDGPU.h                     |   9 +
 .../Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp  | 195 +++++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp      |  28 +-
 llvm/lib/Target/AMDGPU/CMakeLists.txt               |   1 +
 llvm/test/CodeGen/AMDGPU/opt-pipeline.ll            |  10 +
 .../CodeGen/AMDGPU/promote-kernel-arguments.ll      | 317 +++++++++++++++++++++
 6 files changed, 559 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index cc69e0b..958e8c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -102,6 +102,15 @@ FunctionPass *createAMDGPULowerKernelArgumentsPass();
 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
 extern char &AMDGPULowerKernelArgumentsID;
 
+FunctionPass *createAMDGPUPromoteKernelArgumentsPass();
+void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &);
+extern char &AMDGPUPromoteKernelArgumentsID;
+
+struct AMDGPUPromoteKernelArgumentsPass
+    : PassInfoMixin<AMDGPUPromoteKernelArgumentsPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
 ModulePass *createAMDGPULowerKernelAttributesPass();
 void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
 extern char &AMDGPULowerKernelAttributesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
new file mode 100644
index 0000000..01d03d1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -0,0 +1,195 @@
+//===-- AMDGPUPromoteKernelArguments.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass recursively promotes generic pointer arguments of a kernel
+/// into the global address space.
+///
+/// The pass walks the kernel's pointer arguments, then the loads from them.
+/// If a loaded value is itself a pointer and that pointer is not modified in
+/// the kernel before the load, the loaded pointer is promoted to global. The
+/// process then continues recursively.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "amdgpu-promote-kernel-arguments"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUPromoteKernelArguments : public FunctionPass {
+  MemorySSA *MSSA;
+
+  Instruction *ArgCastInsertPt;
+
+  SmallVector<Value *> Ptrs;
+
+  void enqueueUsers(Value *Ptr);
+
+  bool promotePointer(Value *Ptr);
+
+public:
+  static char ID;
+
+  AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}
+
+  bool run(Function &F, MemorySSA &MSSA);
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MemorySSAWrapperPass>();
+    AU.setPreservesAll();
+  }
+};
+
+} // end anonymous namespace
+
+void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
+  SmallVector<User *> PtrUsers(Ptr->users());
+
+  while (!PtrUsers.empty()) {
+    Instruction *U = dyn_cast<Instruction>(PtrUsers.pop_back_val());
+    if (!U)
+      continue;
+
+    switch (U->getOpcode()) {
+    default:
+      break;
+    case Instruction::Load: {
+      LoadInst *LD = cast<LoadInst>(U);
+      PointerType *PT = dyn_cast<PointerType>(LD->getType());
+      if (!PT ||
+          (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
+           PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
+           PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) ||
+          LD->getPointerOperand()->stripInBoundsOffsets() != Ptr)
+        break;
+      const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(LD);
+      // TODO: This load probably can be promoted to constant address space.
+      if (MSSA->isLiveOnEntryDef(MA))
+        Ptrs.push_back(LD);
+      break;
+    }
+    case Instruction::GetElementPtr:
+    case Instruction::AddrSpaceCast:
+    case Instruction::BitCast:
+      if (U->getOperand(0)->stripInBoundsOffsets() == Ptr)
+        PtrUsers.append(U->user_begin(), U->user_end());
+      break;
+    }
+  }
+}
+
+bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
+  enqueueUsers(Ptr);
+
+  PointerType *PT = cast<PointerType>(Ptr->getType());
+  if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return false;
+
+  bool IsArg = isa<Argument>(Ptr);
+  IRBuilder<> B(IsArg ? ArgCastInsertPt
+                      : &*std::next(cast<Instruction>(Ptr)->getIterator()));
+
+  // Cast the pointer to the global address space and back to flat, then let
+  // the Infer Address Spaces pass do all the necessary rewriting.
+  PointerType *NewPT =
+      PointerType::getWithSamePointeeType(PT, AMDGPUAS::GLOBAL_ADDRESS);
+  Value *Cast =
+      B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global"));
+  Value *CastBack =
+      B.CreateAddrSpaceCast(Cast, PT, Twine(Ptr->getName(), ".flat"));
+  Ptr->replaceUsesWithIf(CastBack,
+                         [Cast](Use &U) { return U.getUser() != Cast; });
+
+  return true;
+}
+
+// Skip static allocas when selecting the insertion point for the argument
+// casts.
+static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
+  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
+  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
+    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
+
+    // If this is a dynamic alloca, the value may depend on the loaded kernargs,
+    // so loads will need to be inserted before it.
+    if (!AI || !AI->isStaticAlloca())
+      break;
+  }
+
+  return InsPt;
+}
+
+bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
+  if (skipFunction(F))
+    return false;
+
+  CallingConv::ID CC = F.getCallingConv();
+  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
+    return false;
+
+  ArgCastInsertPt = &*getInsertPt(*F.begin());
+  this->MSSA = &MSSA;
+
+  for (Argument &Arg : F.args()) {
+    if (Arg.use_empty())
+      continue;
+
+    PointerType *PT = dyn_cast<PointerType>(Arg.getType());
+    if (!PT || (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
+                PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
+                PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS))
+      continue;
+
+    Ptrs.push_back(&Arg);
+  }
+
+  bool Changed = false;
+  while (!Ptrs.empty()) {
+    Value *Ptr = Ptrs.pop_back_val();
+    Changed |= promotePointer(Ptr);
+  }
+
+  return Changed;
+}
+
+bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
+  MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+  return run(F, MSSA);
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
+                      "AMDGPU Promote Kernel Arguments", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
+                    "AMDGPU Promote Kernel Arguments", false, false)
+
+char AMDGPUPromoteKernelArguments::ID = 0;
+
+FunctionPass *llvm::createAMDGPUPromoteKernelArgumentsPass() {
+  return new AMDGPUPromoteKernelArguments();
+}
+
+PreservedAnalyses
+AMDGPUPromoteKernelArgumentsPass::run(Function &F,
+                                      FunctionAnalysisManager &AM) {
+  MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+  if (AMDGPUPromoteKernelArguments().run(F, MSSA)) {
+    PreservedAnalyses PA;
+    PA.preserveSet<CFGAnalyses>();
+    PA.preserve<MemorySSAAnalysis>();
+    return PA;
+  }
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b090246..54a1f85 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -306,6 +306,11 @@ static cl::opt<bool> EnablePreRAOptimizations(
     cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool> EnablePromoteKernelArguments(
+    "amdgpu-enable-promote-kernel-arguments",
+    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
+    cl::Hidden, cl::init(true));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -339,6 +344,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUArgumentUsageInfoPass(*PR);
   initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
+  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPULowerIntrinsicsPass(*PR);
   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
@@ -533,6 +539,8 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
   bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
   bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
   bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
+  bool PromoteKernelArguments =
+      EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;
 
   if (EnableFunctionCalls) {
     delete Builder.Inliner;
@@ -574,7 +582,14 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
   Builder.addExtension(
       PassManagerBuilder::EP_CGSCCOptimizerLate,
-      [EnableOpt](const PassManagerBuilder &,
legacy::PassManagerBase &PM) { + [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { + // Add promote kernel arguments pass to the opt pipeline right before + // infer address spaces which is needed to do actual address space + // rewriting. + if (PromoteKernelArguments) + PM.add(createAMDGPUPromoteKernelArgumentsPass()); + // Add infer address spaces pass to the opt pipeline after inlining // but before SROA to increase SROA opportunities. PM.add(createInferAddressSpacesPass()); @@ -651,6 +666,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); return true; } + if (PassName == "amdgpu-promote-kernel-arguments") { + PM.addPass(AMDGPUPromoteKernelArgumentsPass()); + return true; + } return false; }); @@ -702,6 +721,13 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FunctionPassManager FPM; + // Add promote kernel arguments pass to the opt pipeline right before + // infer address spaces which is needed to do actual address space + // rewriting. + if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() && + EnablePromoteKernelArguments) + FPM.addPass(AMDGPUPromoteKernelArgumentsPass()); + // Add infer address spaces pass to the opt pipeline after inlining // but before SROA to increase SROA opportunities. FPM.addPass(InferAddressSpacesPass()); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 86218a3..6dd10af 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -83,6 +83,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUPrintfRuntimeBinding.cpp AMDGPUPromoteAlloca.cpp AMDGPUPropagateAttributes.cpp + AMDGPUPromoteKernelArguments.cpp AMDGPURegBankCombiner.cpp AMDGPURegisterBankInfo.cpp AMDGPUReplaceLDSUseWithPointer.cpp diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll index c188981..c42d864 100644 --- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -408,6 +408,11 @@ ; GCN-O2-NEXT: OpenMP specific optimizations ; GCN-O2-NEXT: Deduce function attributes ; GCN-O2-NEXT: FunctionPass Manager +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O2-NEXT: Function Alias Analysis Results +; GCN-O2-NEXT: Memory SSA +; GCN-O2-NEXT: AMDGPU Promote Kernel Arguments ; GCN-O2-NEXT: Infer address spaces ; GCN-O2-NEXT: AMDGPU Kernel Attributes ; GCN-O2-NEXT: FunctionPass Manager @@ -766,6 +771,11 @@ ; GCN-O3-NEXT: Deduce function attributes ; GCN-O3-NEXT: Promote 'by reference' arguments to scalars ; GCN-O3-NEXT: FunctionPass Manager +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O3-NEXT: Function Alias Analysis Results +; GCN-O3-NEXT: Memory SSA +; GCN-O3-NEXT: AMDGPU Promote Kernel Arguments ; GCN-O3-NEXT: Infer address spaces ; GCN-O3-NEXT: AMDGPU Kernel Attributes ; GCN-O3-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll new file mode 100644 index 0000000..b7eb47a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll @@ -0,0 +1,317 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | FileCheck %s +; RUN: opt -S 
-mtriple=amdgcn-amd-amdhsa < %s -passes=amdgpu-promote-kernel-arguments,infer-address-spaces | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: ptr_nest_3: +; GCN-COUNT-2: global_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @ptr_nest_3(float** addrspace(1)* nocapture readonly %Arg) { +; CHECK-LABEL: @ptr_nest_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]] +; CHECK-NEXT: [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8 +; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)* +; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8 +; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)* +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i + %p2 = load float**, float** addrspace(1)* %p1, align 8 + %p3 = load float*, float** %p2, align 8 + store float 0.000000e+00, float* %p3, align 4 + ret void +} + +; GCN-LABEL: ptr_bitcast: +; GCN: global_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @ptr_bitcast(float** nocapture readonly %Arg) { +; CHECK-LABEL: @ptr_bitcast( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)* +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I]] +; CHECK-NEXT: [[P1_CAST:%.*]] = bitcast float* addrspace(1)* [[P1]] to i32* addrspace(1)* +; CHECK-NEXT: [[P2:%.*]] = load i32*, i32* addrspace(1)* [[P1_CAST]], align 8 +; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast i32* [[P2]] to i32 addrspace(1)* +; CHECK-NEXT: store i32 0, i32 addrspace(1)* [[P2_GLOBAL]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %p1 = getelementptr inbounds float*, float** %Arg, i32 %i + %p1.cast = bitcast float** %p1 to i32** + %p2 = load i32*, i32** %p1.cast, align 8 + store i32 0, i32* %p2, align 4 + ret void +} + +%struct.S = type { float* } + +; GCN-LABEL: ptr_in_struct: +; GCN: s_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @ptr_in_struct(%struct.S addrspace(1)* nocapture readonly %Arg) { +; CHECK-LABEL: @ptr_in_struct( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], [[STRUCT_S]] addrspace(1)* [[ARG:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[P1:%.*]] = load float*, float* addrspace(1)* [[P]], align 8 +; CHECK-NEXT: [[P1_GLOBAL:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)* +; CHECK-NEXT: [[ID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float addrspace(1)* [[P1_GLOBAL]], i32 [[ID]] +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[ARRAYIDX]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p = getelementptr inbounds %struct.S, %struct.S addrspace(1)* %Arg, i64 0, i32 0 + %p1 = load float*, float* addrspace(1)* %p, align 8 + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + 
%arrayidx = getelementptr inbounds float, float* %p1, i32 %id + store float 0.000000e+00, float* %arrayidx, align 4 + ret void +} + +@LDS = internal unnamed_addr addrspace(3) global [4 x float] undef, align 16 + +; GCN-LABEL: flat_ptr_arg: +; GCN-COUNT-2: global_load_dwordx2 +; GCN: global_load_dwordx4 +; GCN: global_store_dword +define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, float** nocapture noalias %Out, i32 %X) { +; CHECK-LABEL: @flat_ptr_arg( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[OUT_GLOBAL:%.*]] = addrspacecast float** [[OUT:%.*]] to float* addrspace(1)* +; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)* +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i64 [[IDXPROM]] +; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* +; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4 +; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]] +; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1 +; CHECK-NEXT: [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1 +; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]] +; CHECK-NEXT: store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2 +; CHECK-NEXT: [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4 +; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2 +; CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]] +; CHECK-NEXT: store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3 +; CHECK-NEXT: [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4 +; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3 +; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]] +; CHECK-NEXT: store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1 +; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]] +; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[OUT_GLOBAL]], i64 [[IDXPROM]] +; CHECK-NEXT: [[I7:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX11]], align 8 +; CHECK-NEXT: [[I7_GLOBAL:%.*]] = addrspacecast float* [[I7]] to float addrspace(1)* +; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I7_GLOBAL]], i64 [[IDXPROM8]] +; CHECK-NEXT: store float [[I6]], float addrspace(1)* 
[[ARRAYIDX9]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %i to i64 + %arrayidx10 = getelementptr inbounds float*, float** %Arg, i64 %idxprom + %i1 = load float*, float** %arrayidx10, align 8 + %i2 = load float, float* %i1, align 4 + %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X + store float %i2, float addrspace(3)* %arrayidx512, align 4 + %arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1 + %i3 = load float, float* %arrayidx3.1, align 4 + %add.1 = add nsw i32 %X, 1 + %arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1 + store float %i3, float addrspace(3)* %arrayidx512.1, align 4 + %arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2 + %i4 = load float, float* %arrayidx3.2, align 4 + %add.2 = add nsw i32 %X, 2 + %arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2 + store float %i4, float addrspace(3)* %arrayidx512.2, align 4 + %arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3 + %i5 = load float, float* %arrayidx3.3, align 4 + %add.3 = add nsw i32 %X, 3 + %arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3 + store float %i5, float addrspace(3)* %arrayidx512.3, align 4 + %sub = add nsw i32 %X, -1 + %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub + %i6 = load float, float addrspace(3)* %arrayidx711, align 4 + %arrayidx11 = getelementptr inbounds float*, float** %Out, i64 %idxprom + %i7 = load float*, float** %arrayidx11, align 8 + %idxprom8 = sext i32 %X to i64 + %arrayidx9 = getelementptr inbounds float, float* %i7, i64 %idxprom8 + store float %i6, float* %arrayidx9, align 4 + ret void +} + +; GCN-LABEL: global_ptr_arg: +; GCN: global_load_dwordx2 +; GCN: global_load_dwordx4 +; GCN: global_store_dword +define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonly %Arg, i32 %X) { +; CHECK-LABEL: @global_ptr_arg( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* +; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4 +; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]] +; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1 +; CHECK-NEXT: [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1 +; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]] +; CHECK-NEXT: store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2 +; CHECK-NEXT: [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4 +; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2 +; 
CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]] +; CHECK-NEXT: store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3 +; CHECK-NEXT: [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4 +; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3 +; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]] +; CHECK-NEXT: store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1 +; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]] +; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4 +; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]] +; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %i to i64 + %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom + %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8 + %i2 = load float, float* %i1, align 4 + %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X + store float %i2, float addrspace(3)* %arrayidx512, align 4 + %arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1 + %i3 = load float, float* %arrayidx3.1, align 4 + %add.1 = add nsw i32 %X, 1 + %arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1 + store float %i3, float addrspace(3)* %arrayidx512.1, align 4 + %arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2 + %i4 = load float, float* %arrayidx3.2, align 4 + %add.2 = add nsw i32 %X, 2 + %arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2 + store float %i4, float addrspace(3)* %arrayidx512.2, align 4 + %arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3 + %i5 = load float, float* %arrayidx3.3, align 4 + %add.3 = add nsw i32 %X, 3 + %arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3 + store float %i5, float addrspace(3)* %arrayidx512.3, align 4 + %sub = add nsw i32 %X, -1 + %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub + %i6 = load float, float addrspace(3)* %arrayidx711, align 4 + %idxprom8 = sext i32 %X to i64 + %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8 + store float %i6, float* %arrayidx9, align 4 + ret void +} + +; GCN-LABEL: global_ptr_arg_clobbered: +; GCN: global_store_dwordx2 +; GCN: global_load_dwordx2 +; GCN: flat_load_dword +; GCN: flat_store_dword +define amdgpu_kernel void @global_ptr_arg_clobbered(float* addrspace(1)* nocapture readonly %Arg, i32 %X) { +; CHECK-LABEL: @global_ptr_arg_clobbered( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds 
float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]] +; CHECK-NEXT: store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[I2:%.*]] = load float, float* [[I1]], align 4 +; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]] +; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1 +; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]] +; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4 +; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[I1]], i64 [[IDXPROM8]] +; CHECK-NEXT: store float [[I6]], float* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %i to i64 + %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom + %arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X + store float* null, float* addrspace(1)* %arrayidx11, align 4 + %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8 + %i2 = load float, float* %i1, align 4 + %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X + store float %i2, float addrspace(3)* %arrayidx512, align 4 + %sub = add nsw i32 %X, -1 + %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub + %i6 = load float, float addrspace(3)* %arrayidx711, align 4 + %idxprom8 = sext i32 %X to i64 + %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8 + store float %i6, float* %arrayidx9, align 4 + ret void +} + +; GCN-LABEL: global_ptr_arg_clobbered_after_load: +; GCN: global_load_dwordx2 +; GCN: global_store_dwordx2 +; GCN: global_load_dword +; GCN: global_store_dword +define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(float* addrspace(1)* nocapture readonly %Arg, i32 %X) { +; CHECK-LABEL: @global_ptr_arg_clobbered_after_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]] +; CHECK-NEXT: store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4 +; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]] +; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1 +; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]] +; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4 +; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = 
getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]] +; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %i to i64 + %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom + %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8 + %arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X + store float* null, float* addrspace(1)* %arrayidx11, align 4 + %i2 = load float, float* %i1, align 4 + %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X + store float %i2, float addrspace(3)* %arrayidx512, align 4 + %sub = add nsw i32 %X, -1 + %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub + %i6 = load float, float addrspace(3)* %arrayidx711, align 4 + %idxprom8 = sext i32 %X to i64 + %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8 + store float %i6, float* %arrayidx9, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() -- 2.7.4
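
Illustrative note, not part of the patch: a minimal sketch of how the new pass
is intended to compose with -infer-address-spaces on a reduced variant of the
tests above. The kernel name @promote_example and the file name example.ll are
invented for this sketch; the opt invocation mirrors the RUN lines in
promote-kernel-arguments.ll.

  $ opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-promote-kernel-arguments \
        -infer-address-spaces example.ll

  define amdgpu_kernel void @promote_example(float* addrspace(1)* %Arg) {
  entry:
    ; %p1 is loaded from a kernel argument and nothing writes memory between
    ; function entry and this load (MemorySSA reports the live-on-entry def as
    ; its clobber), so the pass casts %p1 to addrspace(1) and back to flat;
    ; infer-address-spaces then folds the casts, turning the store below into
    ; a global (addrspace(1)) store.
    %p1 = load float*, float* addrspace(1)* %Arg, align 8
    store float 0.000000e+00, float* %p1, align 4
    ret void
  }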