From: Stanislav Mekhanoshin
Date: Wed, 19 May 2021 20:39:55 +0000 (-0700)
Subject: [AMDGPU] Lower kernel LDS into a sorted structure
X-Git-Tag: llvmorg-14-init~5658
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8de4db697f2841748a5489d18d9fbcd130ae09bb;p=platform%2Fupstream%2Fllvm.git

[AMDGPU] Lower kernel LDS into a sorted structure

Differential Revision: https://reviews.llvm.org/D102954
---

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index b1950bf..d189542 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -73,6 +73,10 @@ class AMDGPULowerModuleLDS : public ModulePass {
 
     GV->eraseFromParent();
 
+    for (Constant *C : ToRemove) {
+      C->removeDeadConstantUsers();
+    }
+
     if (!Init.empty()) {
       ArrayType *ATy =
           ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
@@ -129,6 +133,9 @@ class AMDGPULowerModuleLDS : public ModulePass {
                        "");
   }
 
+private:
+  SmallPtrSet<GlobalValue *, 32> UsedList;
+
 public:
   static char ID;
 
@@ -137,13 +144,28 @@ public:
   }
 
   bool runOnModule(Module &M) override {
+    UsedList = AMDGPU::getUsedList(M);
+
+    bool Changed = processUsedLDS(M);
+
+    for (Function &F : M.functions()) {
+      if (!AMDGPU::isKernelCC(&F))
+        continue;
+      Changed |= processUsedLDS(M, &F);
+    }
+
+    UsedList.clear();
+    return Changed;
+  }
+
+private:
+  bool processUsedLDS(Module &M, Function *F = nullptr) {
     LLVMContext &Ctx = M.getContext();
     const DataLayout &DL = M.getDataLayout();
-    SmallPtrSet<GlobalValue *, 32> UsedList = AMDGPU::getUsedList(M);
 
     // Find variables to move into new struct instance
     std::vector<GlobalVariable *> FoundLocalVars =
-        AMDGPU::findVariablesToLower(M, UsedList);
+        AMDGPU::findVariablesToLower(M, UsedList, F);
 
     if (FoundLocalVars.empty()) {
       // No variables to rewrite, no changes made.
@@ -207,21 +229,25 @@ public:
         LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
         [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
 
-    StructType *LDSTy = StructType::create(
-        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.module.lds.t"));
+    std::string VarName(
+        F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
+          : "llvm.amdgcn.module.lds");
+    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
 
     Align MaxAlign = AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment
 
     GlobalVariable *SGV = new GlobalVariable(
         M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
-        "llvm.amdgcn.module.lds", nullptr, GlobalValue::NotThreadLocal,
-        AMDGPUAS::LOCAL_ADDRESS, false);
+        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+        false);
     SGV->setAlignment(MaxAlign);
-    appendToCompilerUsed(
-        M, {static_cast<GlobalValue *>(
-               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+    if (!F) {
+      appendToCompilerUsed(
+          M, {static_cast<GlobalValue *>(
+                 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                     cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+    }
 
     // The verifier rejects used lists containing an inttoptr of a constant
     // so remove the variables from these lists before replaceAllUsesWith
@@ -233,16 +259,25 @@ public:
     for (size_t I = 0; I < LocalVars.size(); I++) {
       GlobalVariable *GV = LocalVars[I];
       Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
-      GV->replaceAllUsesWith(
-          ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx));
-      GV->eraseFromParent();
+      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
+      if (F) {
+        GV->replaceUsesWithIf(GEP, [F](Use &U) {
+          return AMDGPU::isUsedOnlyFromFunction(U.getUser(), F);
+        });
+      } else {
+        GV->replaceAllUsesWith(GEP);
+      }
+      if (GV->use_empty()) {
+        UsedList.erase(GV);
+        GV->eraseFromParent();
+      }
     }
 
     // Mark kernels with asm that reads the address of the allocated structure
     // This is not necessary for lowering. This lets other passes, specifically
     // PromoteAlloca, accurately calculate how much LDS will be used by the
     // kernel after lowering.
-    {
+    if (!F) {
       IRBuilder<> Builder(Ctx);
       SmallPtrSet<Function *, 8> Kernels;
       for (auto &I : M.functions()) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
index e5cee6f..d720b2d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -20,7 +20,7 @@ namespace llvm {
 
 namespace AMDGPU {
 
-bool isKernelCC(Function *Func) {
+bool isKernelCC(const Function *Func) {
   return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
 }
 
@@ -29,18 +29,33 @@ Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
                                        GV->getValueType());
 }
 
-bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                          User *InitialUser) {
+bool isUsedOnlyFromFunction(const User *U, const Function *F) {
+  if (auto *I = dyn_cast<Instruction>(U)) {
+    return I->getFunction() == F;
+  }
+
+  if (auto *C = dyn_cast<ConstantExpr>(U)) {
+    return all_of(U->users(),
+                  [F](const User *U) { return isUsedOnlyFromFunction(U, F); });
+  }
+
+  return false;
+}
+
+bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                            const GlobalVariable &GV, const Function *F) {
   // Any LDS variable can be lowered by moving into the created struct
   // Each variable so lowered is allocated in every kernel, so variables
   // whose users are all known to be safe to lower without the transform
   // are left unchanged.
-  SmallPtrSet<User *, 8> Visited;
-  SmallVector<User *, 16> Stack;
-  Stack.push_back(InitialUser);
+  bool Ret = false;
+  SmallPtrSet<const User *, 8> Visited;
+  SmallVector<const User *, 16> Stack(GV.users());
+
+  assert(!F || isKernelCC(F));
 
   while (!Stack.empty()) {
-    User *V = Stack.pop_back_val();
+    const User *V = Stack.pop_back_val();
     Visited.insert(V);
 
     if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
@@ -50,31 +65,44 @@ bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
     }
 
     if (auto *I = dyn_cast<Instruction>(V)) {
-      if (isKernelCC(I->getFunction())) {
-        continue;
+      const Function *UF = I->getFunction();
+      if (UF == F) {
+        // Used from this kernel, we want to put it into the structure.
+        Ret = true;
+      } else if (!F) {
+        Ret |= !isKernelCC(UF);
       }
+      continue;
     }
 
     if (auto *E = dyn_cast<ConstantExpr>(V)) {
-      for (Value::user_iterator EU = E->user_begin(); EU != E->user_end();
-           ++EU) {
-        if (Visited.insert(*EU).second) {
-          Stack.push_back(*EU);
+      if (F) {
+        // Any use which does not end up an instruction disqualifies a
+        // variable to be put into a kernel's LDS structure because later
+        // we will need to replace only this kernel's uses for which we
+        // need to identify a using function.
+        return isUsedOnlyFromFunction(E, F);
+      }
+      for (const User *U : E->users()) {
+        if (Visited.insert(U).second) {
+          Stack.push_back(U);
         }
       }
       continue;
     }
 
-    // Unknown user, conservatively lower the variable
-    return true;
+    // Unknown user, conservatively lower the variable.
+    // For module LDS conservatively means place it into the module LDS struct.
+    // For kernel LDS it means lower as a standalone variable.
+    return !F;
   }
 
-  return false;
+  return Ret;
 }
 
 std::vector<GlobalVariable *>
-findVariablesToLower(Module &M,
-                     const SmallPtrSetImpl<GlobalValue *> &UsedList) {
+findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                     const Function *F) {
   std::vector<GlobalVariable *> LocalVars;
   for (auto &GV : M.globals()) {
     if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
@@ -98,9 +126,7 @@ findVariablesToLower(Module &M,
       // dropped by the back end if not. This pass skips over it.
       continue;
     }
-    if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
-          return userRequiresLowering(UsedList, U);
-        })) {
+    if (!shouldLowerLDSToStruct(UsedList, GV, F)) {
       continue;
    }
    LocalVars.push_back(&GV);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
index f2c781a..adcaa34 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -19,18 +19,29 @@ namespace llvm {
 
 namespace AMDGPU {
 
-bool isKernelCC(Function *Func);
+bool isKernelCC(const Function *Func);
 
 Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
 
-bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                          User *InitialUser);
+/// \returns true if an LDS global requires lowering to a module LDS structure
+/// if \p F is not given. If \p F is given it must be a kernel, and the
+/// function returns true if an LDS global is directly used from that kernel
+/// and it is safe to replace its uses with a kernel LDS structure member.
+/// \p UsedList contains a union of llvm.used and llvm.compiler.used variables
+/// which do not count as a use.
+bool shouldLowerLDSToStruct(const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                            const GlobalVariable &GV,
+                            const Function *F = nullptr);
 
 std::vector<GlobalVariable *>
-findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList);
+findVariablesToLower(Module &M, const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                     const Function *F = nullptr);
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
 
+/// \returns true if all uses of \p U end up in a function \p F.
+bool isUsedOnlyFromFunction(const User *U, const Function *F); + } // end namespace AMDGPU } // end namespace llvm diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll index 2f01b48..90ca343 100644 --- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll @@ -83,7 +83,7 @@ define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] a @g_lds = addrspace(3) global float undef, align 4 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset: -; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds@abs32@lo +; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}} ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) { %val = load float, float addrspace(3)* @g_lds diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index 7b5d677..bfff751 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -9,9 +9,9 @@ define amdgpu_kernel void @use_lds_globals(i32 addrspace(1)* %out, i32 addrspace ; CHECK-LABEL: use_lds_globals: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 4 +; CHECK-NEXT: v_mov_b32_e32 v0, 8 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v3, v0 offset:4 +; CHECK-NEXT: ds_read_b32 v3, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 9 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll index a3cd21b..1f76655 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll @@ -1,4 +1,4 @@ -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s ; FIXME: Merge with DAG test @lds.external = external unnamed_addr addrspace(3) global [0 x i32] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 5f28f31..6efd94a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s @@ -28,7 +28,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 -; 
GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -57,7 +57,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_mul_i32 s2, s2, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -87,7 +87,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -115,7 +115,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_cbranch_execz BB0_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mul_i32 s2, s2, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -145,7 +145,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_cbranch_execz BB0_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -187,7 +187,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -219,7 +219,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s1, s0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -251,7 +251,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s3, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -281,7 +281,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX1064-NEXT: s_cbranch_execz BB1_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s3, s2, s3 ; GFX1064-NEXT: v_mov_b32_e32 v2, s3 @@ -315,7 +315,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive ; GFX1032-NEXT: s_cbranch_execz BB1_2 ; GFX1032-NEXT: ; 
%bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 -; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 @@ -347,7 +347,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX7LESS-LABEL: add_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 @@ -390,7 +390,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB2_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -441,7 +441,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB2_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 @@ -500,7 +500,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB2_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -551,7 +551,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB2_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -578,7 +578,7 @@ entry: define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX7LESS-LABEL: add_i32_varying_nouse: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_u32 v1, v0 @@ -612,7 +612,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB3_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -648,7 +648,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB3_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_u32 v0, v2 @@ -682,7 +682,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB3_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 
local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -713,7 +713,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB3_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v3, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_u32 v3, v0 @@ -742,13 +742,12 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_cbranch_execz BB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] +; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] @@ -780,10 +779,9 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] +; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -814,9 +812,8 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] +; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -846,11 +843,10 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] +; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB4_2: @@ -878,11 +874,10 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] +; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB4_2: @@ -917,7 +912,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX7LESS-NEXT: s_cbranch_execz BB5_2 ; GFX7LESS-NEXT: 
; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 @@ -965,7 +960,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 ; GFX8-NEXT: s_mul_i32 s7, s3, s6 ; GFX8-NEXT: s_mul_i32 s6, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: s_mov_b32 m0, -1 @@ -1010,7 +1005,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1045,7 +1040,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1064-NEXT: s_cbranch_execz BB5_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -1086,7 +1081,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1032-NEXT: s_cbranch_execz BB5_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -1128,10 +1123,9 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] +; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -1141,11 +1135,10 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { ; GFX8-LABEL: add_i64_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] +; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -1155,10 +1148,9 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { ; GFX9-LABEL: add_i64_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -1168,13 +1160,12 @@ define amdgpu_kernel 
void @add_i64_varying(i64 addrspace(1)* %out) { ; GFX10-LABEL: add_i64_varying: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] +; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1203,7 +1194,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1233,7 +1224,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_mul_i32 s2, s2, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1264,7 +1255,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1293,7 +1284,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_cbranch_execz BB7_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mul_i32 s2, s2, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1324,7 +1315,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_cbranch_execz BB7_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1367,7 +1358,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1399,7 +1390,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s1, s0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1431,7 +1422,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX9-NEXT: s_bcnt1_i32_b64 s3, 
s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s3, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1461,7 +1452,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1064-NEXT: s_cbranch_execz BB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s3, s2, s3 ; GFX1064-NEXT: v_mov_b32_e32 v2, s3 @@ -1495,7 +1486,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1032-NEXT: s_cbranch_execz BB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 -; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: v_mov_b32_e32 v2, s1 @@ -1527,7 +1518,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX7LESS-LABEL: sub_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 @@ -1570,7 +1561,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB9_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1621,7 +1612,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 @@ -1680,7 +1671,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1731,7 +1722,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1758,7 +1749,7 @@ entry: define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX7LESS-LABEL: sub_i32_varying_nouse: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_u32 v1, v0 @@ -1792,7 +1783,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8-NEXT: 
s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB10_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1828,7 +1819,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB10_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_u32 v0, v2 @@ -1862,7 +1853,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB10_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1893,7 +1884,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB10_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v3, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_u32 v3, v0 @@ -1922,13 +1913,12 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_cbranch_execz BB11_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB11_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1960,10 +1950,9 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] +; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1995,9 +1984,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] +; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -2028,11 +2016,10 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] +; 
GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB11_2: @@ -2063,11 +2050,10 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] +; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB11_2: @@ -2105,7 +2091,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX7LESS-NEXT: s_cbranch_execz BB12_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 @@ -2153,7 +2139,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 ; GFX8-NEXT: s_mul_i32 s7, s3, s6 ; GFX8-NEXT: s_mul_i32 s6, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: s_mov_b32 m0, -1 @@ -2198,7 +2184,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2233,7 +2219,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1064-NEXT: s_cbranch_execz BB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -2274,7 +2260,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1032-NEXT: s_cbranch_execz BB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -2316,10 +2302,9 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -2329,11 +2314,10 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { ; GFX8-LABEL: sub_i64_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, 
local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] +; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -2343,10 +2327,9 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { ; GFX9-LABEL: sub_i64_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -2356,13 +2339,12 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { ; GFX10-LABEL: sub_i64_varying: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] +; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2381,7 +2363,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX7LESS-LABEL: and_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 @@ -2424,7 +2406,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB14_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2475,7 +2457,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB14_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 @@ -2534,7 +2516,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB14_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2585,7 +2567,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB14_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2615,7 +2597,7 @@ define 
amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX7LESS-LABEL: or_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 @@ -2658,7 +2640,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB15_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2709,7 +2691,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB15_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 @@ -2768,7 +2750,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB15_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2819,7 +2801,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB15_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2849,7 +2831,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX7LESS-LABEL: xor_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 @@ -2892,7 +2874,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB16_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2943,7 +2925,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB16_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 @@ -3002,7 +2984,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB16_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) @@ -3053,7 +3035,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB16_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3083,7 +3065,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX7LESS-LABEL: max_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 @@ -3126,7 +3108,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB17_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3177,7 +3159,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB17_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 @@ -3238,7 +3220,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB17_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3291,7 +3273,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB17_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3328,7 +3310,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB18_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 @@ -3364,7 +3346,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_cbranch_execz BB18_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 -; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3399,7 +3381,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_cbranch_execz BB18_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 -; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] @@ -3433,7 +3415,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: s_cbranch_execz BB18_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 -; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3466,7 +3448,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: s_cbranch_execz BB18_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3500,7 +3482,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX7LESS-LABEL: min_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 @@ -3543,7 +3525,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB19_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3594,7 +3576,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB19_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 @@ -3655,7 +3637,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB19_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3708,7 +3690,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB19_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3745,7 +3727,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB20_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 @@ -3781,7 +3763,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_cbranch_execz BB20_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 -; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX8-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3816,7 +3798,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_cbranch_execz BB20_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 -; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] @@ -3850,7 +3832,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: s_cbranch_execz BB20_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 -; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3883,7 +3865,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: s_cbranch_execz BB20_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3917,7 +3899,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX7LESS-LABEL: umax_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 @@ -3960,7 +3942,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB21_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4011,7 +3993,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB21_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 @@ -4070,7 +4052,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB21_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4121,7 +4103,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB21_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4158,7 +4140,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB22_2 ; GFX7LESS-NEXT: ; %bb.1: -; 
GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 @@ -4193,7 +4175,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_cbranch_execz BB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 -; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4227,7 +4209,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_cbranch_execz BB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 -; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] @@ -4260,7 +4242,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: s_cbranch_execz BB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 -; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4293,7 +4275,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: s_cbranch_execz BB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4327,7 +4309,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX7LESS-LABEL: umin_i32_varying: ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 @@ -4370,7 +4352,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB23_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4421,7 +4403,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB23_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 @@ -4480,7 +4462,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB23_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4531,7 +4513,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB23_2 ; 
GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4568,7 +4550,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB24_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 @@ -4603,7 +4585,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_cbranch_execz BB24_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 -; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4637,7 +4619,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_cbranch_execz BB24_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 -; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] @@ -4670,7 +4652,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: s_cbranch_execz BB24_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 -; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4703,7 +4685,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: s_cbranch_execz BB24_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 -; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index c82d042..26dc977 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1096,27 +1096,33 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CI-NEXT: s_lshl_b32 s0, s2, 2 -; CI-NEXT: s_add_i32 s1, s0, 0xc20 -; CI-NEXT: s_addk_i32 s0, 0xc60 +; CI-NEXT: s_add_i32 s1, s0, 0x8c40 +; CI-NEXT: s_add_i32 s0, s0, 0x8c80 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1 +; CI-NEXT: s_mov_b32 s0, 0x8020 +; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v8 +; CI-NEXT: s_mov_b32 s0, 0x80a0 +; CI-NEXT: v_add_i32_e32 v6, vcc, s0, v8 ; CI-NEXT: v_mov_b32_e32 v0, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_read2_b32 v[2:3], v0 offset1:1 +; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 ; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 -; CI-NEXT: v_lshlrev_b32_e32 v8, 2, v1 -; CI-NEXT: ds_read2_b32 v[0:1], v8 offset1:1 -; CI-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 -; CI-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:1 +; 
CI-NEXT: v_add_i32_e32 v8, vcc, 0x8120, v8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_f32_e32 v2, v2, v3 -; CI-NEXT: v_add_f32_e32 v2, v2, v4 -; CI-NEXT: v_add_f32_e32 v2, v2, v5 -; CI-NEXT: v_add_f32_e32 v0, v2, v0 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_add_f32_e32 v0, v0, v3 +; CI-NEXT: ds_read2_b32 v[8:9], v8 offset1:1 +; CI-NEXT: v_add_f32_e32 v0, v0, v4 +; CI-NEXT: v_add_f32_e32 v0, v0, v5 ; CI-NEXT: v_add_f32_e32 v0, v0, v6 ; CI-NEXT: v_add_f32_e32 v0, v0, v7 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_f32_e32 v0, v0, v8 +; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_add_f32_e32 v0, v0, v9 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -1125,23 +1131,29 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* % ; GFX9-LABEL: sgemm_inner_loop_read2_sequence: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshl_b32 s2, s2, 2 -; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 -; GFX9-NEXT: s_addk_i32 s2, 0xc60 +; GFX9-NEXT: s_add_i32 s3, s2, 0x8c40 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v1 +; GFX9-NEXT: s_add_i32 s2, s2, 0x8c80 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: ds_read2_b32 v[2:3], v0 offset1:1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_add_u32_e32 v4, 0x8020, v8 +; GFX9-NEXT: v_add_u32_e32 v6, 0x80a0, v8 +; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 +; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 ; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v1 -; GFX9-NEXT: ds_read2_b32 v[0:1], v8 offset1:1 -; GFX9-NEXT: ds_read2_b32 v[6:7], v8 offset0:32 offset1:33 -; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset0:64 offset1:65 +; GFX9-NEXT: ds_read2_b32 v[6:7], v6 offset1:1 +; GFX9-NEXT: v_add_u32_e32 v8, 0x8120, v8 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: ds_read2_b32 v[8:9], v8 offset1:1 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 @@ -1462,11 +1474,11 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* ; CI: ; %bb.0: ; %entry ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_read_u8 v1, v0 offset:72 -; CI-NEXT: ds_read_u8 v2, v0 offset:71 -; CI-NEXT: ds_read_u8 v3, v0 offset:70 -; CI-NEXT: ds_read_u8 v4, v0 offset:69 -; CI-NEXT: ds_read_u8 v5, v0 offset:68 +; CI-NEXT: ds_read_u8 v1, v0 offset:37032 +; CI-NEXT: ds_read_u8 v2, v0 offset:37031 +; CI-NEXT: ds_read_u8 v3, v0 offset:37030 +; CI-NEXT: ds_read_u8 v4, v0 offset:37029 +; CI-NEXT: ds_read_u8 v5, v0 offset:37028 ; CI-NEXT: s_waitcnt lgkmcnt(4) ; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; CI-NEXT: s_waitcnt lgkmcnt(3) @@ -1477,9 +1489,9 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* ; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v1, v1, v3 -; CI-NEXT: ds_read_u8 v2, v0 offset:67 -; CI-NEXT: ds_read_u8 v3, v0 
offset:66 -; CI-NEXT: ds_read_u8 v0, v0 offset:65 +; CI-NEXT: ds_read_u8 v2, v0 offset:37027 +; CI-NEXT: ds_read_u8 v3, v0 offset:37026 +; CI-NEXT: ds_read_u8 v0, v0 offset:37025 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 @@ -1496,14 +1508,14 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* ; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-ALIGNED: ; %bb.0: ; %entry ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65 -; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66 -; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67 -; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68 -; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69 -; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:70 -; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:71 -; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:72 +; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:37025 +; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:37026 +; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:37027 +; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:37028 +; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:37029 +; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:37030 +; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:37031 +; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:37032 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 @@ -1521,7 +1533,7 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* ; ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: ; GFX9-UNALIGNED: ; %bb.0: ; %entry -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41 +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x90a1 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 674837a..3375de9 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s @@ -889,9 +889,9 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_lshl_b32 s2, s2, 2 -; CI-NEXT: s_add_i32 s3, s2, 0xc20 +; CI-NEXT: s_add_i32 s3, s2, 0x8c40 ; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: s_addk_i32 s2, 0xc60 +; CI-NEXT: s_add_i32 s2, s2, 0x8c80 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s0, s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 @@ -902,17 +902,20 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 ; CI-NEXT: 
v_lshlrev_b32_e32 v0, 2, v1 +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x8020, v0 +; CI-NEXT: ds_write2_b32 v1, v2, v3 offset1:1 +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x80a0, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 0x8120, v0 +; CI-NEXT: ds_write2_b32 v1, v2, v3 offset1:1 ; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:32 offset1:33 -; CI-NEXT: ds_write2_b32 v0, v2, v3 offset0:64 offset1:65 ; CI-NEXT: s_endpgm ; ; GFX9-LABEL: write2_sgemm_sequence: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_lshl_b32 s2, s2, 2 -; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 -; GFX9-NEXT: s_addk_i32 s2, 0xc60 +; GFX9-NEXT: s_add_i32 s3, s2, 0x8c40 +; GFX9-NEXT: s_add_i32 s2, s2, 0x8c80 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 @@ -923,9 +926,12 @@ define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %ld ; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 ; GFX9-NEXT: ds_write2_b32 v2, v3, v4 offset1:1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x8020, v0 +; GFX9-NEXT: ds_write2_b32 v1, v3, v4 offset1:1 +; GFX9-NEXT: v_add_u32_e32 v1, 0x80a0, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x8120, v0 +; GFX9-NEXT: ds_write2_b32 v1, v3, v4 offset1:1 ; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 -; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:32 offset1:33 -; GFX9-NEXT: ds_write2_b32 v0, v3, v4 offset0:64 offset1:65 ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 @@ -1026,37 +1032,37 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() { ; CI-NEXT: v_mov_b32_e32 v0, 0x7b ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_write_b8 v1, v0 offset:65 +; CI-NEXT: ds_write_b8 v1, v0 offset:37025 ; CI-NEXT: v_mov_b32_e32 v0, 1 -; CI-NEXT: ds_write_b8 v1, v0 offset:70 +; CI-NEXT: ds_write_b8 v1, v0 offset:37030 ; CI-NEXT: v_mov_b32_e32 v0, 0xc8 -; CI-NEXT: ds_write_b8 v1, v0 offset:69 -; CI-NEXT: ds_write_b8 v1, v1 offset:68 -; CI-NEXT: ds_write_b8 v1, v1 offset:67 -; CI-NEXT: ds_write_b8 v1, v1 offset:66 -; CI-NEXT: ds_write_b8 v1, v1 offset:72 -; CI-NEXT: ds_write_b8 v1, v1 offset:71 +; CI-NEXT: ds_write_b8 v1, v0 offset:37029 +; CI-NEXT: ds_write_b8 v1, v1 offset:37028 +; CI-NEXT: ds_write_b8 v1, v1 offset:37027 +; CI-NEXT: ds_write_b8 v1, v1 offset:37026 +; CI-NEXT: ds_write_b8 v1, v1 offset:37032 +; CI-NEXT: ds_write_b8 v1, v1 offset:37031 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset: ; GFX9-ALIGNED: ; %bb.0: ; %entry ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65 +; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:37025 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70 +; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:37030 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71 +; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:37029 +; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37028 +; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37027 +; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37026 +; 
GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37032 +; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:37031 ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset: ; GFX9-UNALIGNED: ; %bb.0: ; %entry -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41 +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x90a1 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x1c8 ; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 diff --git a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll index c8e2c5b..7205fb4 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll @@ -46,9 +46,9 @@ define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace ret void } -; 38 + (10 pad) + 38 +; 38 + (2 pad) + 38 ; HSA-LABEL: {{^}}test_round_size_2_align_8: -; HSA: workgroup_group_segment_byte_size = 86 +; HSA: workgroup_group_segment_byte_size = 78 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* @@ -94,9 +94,10 @@ define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 add ret void } -; (7 * 8) + (39 * 4) = 212 +; FIXME: handling of missing alignment can be improved. +; (39 * 4) + (4 pad) + (7 * 8) = 216 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0: -; HSA: workgroup_group_segment_byte_size = 212 +; HSA: workgroup_group_segment_byte_size = 216 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* @@ -125,22 +126,11 @@ define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* ret void } -; Test how the size needed for padding changes based on when the -; global is encountered during lowering. There should be a consistent -; order to minimize padding waste. -; -; The way global addresses are lowered now, this is in inverse of -; first use order which isn't great. -; -; This should be the optimal order for these globals.
If sorted to -; minimize padding, the minimum possible size is: align 32, align 8, -; align 16 - ; align 32, 16, 8 -; 38 + (10 pad) + 38 + (10 pad) + 38 = 134 +; 38 + (10 pad) + 38 + (2 pad) + 38 = 126 ; HSA-LABEL: {{^}}test_round_size_3_order0: -; HSA: workgroup_group_segment_byte_size = 134 +; HSA: workgroup_group_segment_byte_size = 126 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* @@ -159,9 +149,9 @@ define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 ad } ; align 32, 8, 16 -; 38 (+ 2 pad) + 38 + (18 pad) + 38 = 134 +; 38 (+ 10 pad) + 38 + (2 pad) + 38 = 126 ; HSA-LABEL: {{^}}test_round_size_3_order1: -; HSA: workgroup_group_segment_byte_size = 134 +; HSA: workgroup_group_segment_byte_size = 126 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* @@ -180,9 +170,9 @@ define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 ad } ; align 16, 32, 8 -; 38 + (26 pad) + 38 + (10 pad) + 38 = 150 +; 38 + (10 pad) + 38 + (2 pad) + 38 = 126 ; HSA-LABEL: {{^}}test_round_size_3_order2: -; HSA: workgroup_group_segment_byte_size = 150 +; HSA: workgroup_group_segment_byte_size = 126 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* @@ -200,10 +190,11 @@ define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 ad ret void } +; FIXME: Improve alignment ; align 16, 8, 32 -; 38 + (2 pad) + 38 + (2 pad) + 38 +; 38 + (10 pad) + 38 + (2 pad) + 38 ; HSA-LABEL: {{^}}test_round_size_3_order3: -; HSA: workgroup_group_segment_byte_size = 118 +; HSA: workgroup_group_segment_byte_size = 126 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* @@ -222,9 +213,9 @@ define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 ad } ; align 8, 32, 16 -; 38 + (26 pad) + 38 + (2 pad) + 38 = 142 +; 38 + (10 pad) + 38 + (2 pad) + 38 = 126 ; HSA-LABEL: {{^}}test_round_size_3_order4: -; HSA: workgroup_group_segment_byte_size = 142 +; HSA: workgroup_group_segment_byte_size = 126 ; HSA: group_segment_alignment = 4 define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll index 1426f80..a4a8e078 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s | llvm-readobj -r -t - | FileCheck -check-prefixes=ELF %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc 
-mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -amdgpu-enable-lower-module-lds=0 -filetype=obj < %s | llvm-readobj -r -t - | FileCheck -check-prefixes=ELF %s @lds.external = external unnamed_addr addrspace(3) global [0 x i32] @lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index 130ce9e..fcd3346 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -275,11 +275,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0 ; CIVI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}} -; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]] -; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo -; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]] - +; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -428,11 +424,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa ; CIVI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}} -; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]] -; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo -; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]] - +; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index 6b66070..417901e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -136,10 +136,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa @lds0 = addrspace(3) global [512 x i32] undef, align 4 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32: -; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}} -; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]] -; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo -; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]] +; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -337,10 +334,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 @lds1 = addrspace(3) global [512 x i64] undef, align 8 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64: -; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}} -; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]] -; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo -; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]] +; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} ; 
GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index d8a8285..690feae 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -38,17 +38,15 @@ entry: ; GCN-LABEL: {{^}}local_memory_two_objects: ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0 -; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]] -; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 -; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 -; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]] +; CI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 0, [[ADDRW]] +; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 12, [[ADDRW]] +; GCN-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 16, [[ADDRW]] +; GCN-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4 ; GCN: s_barrier -; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]] -; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 +; GCN-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]] +; GCN-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]] define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.ll b/llvm/test/CodeGen/AMDGPU/local-memory.ll index ca3a836..78e76f7 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.ll @@ -10,7 +10,7 @@ ; not an immediate. 
; FUNC-LABEL: {{^}}load_i32_local_const_ptr: -; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo +; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], 0{{$}} ; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4 ; R600: LDS_READ_RET diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index 4b3fb1a..7cf7963 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -1,5 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s ; RUN: llc -march=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck -check-prefix=GCN %s @@ -49,7 +47,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) #0 { ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.2: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec @@ -128,7 +126,7 @@ define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_cbranch_scc1 BB1_3 ; GCN-NEXT: ; %bb.2: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec @@ -206,24 +204,20 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s3, s[0:1], 0x9 ; GCN-NEXT: s_mov_b64 s[0:1], 0 -; GCN-NEXT: s_mov_b32 s2, lds@abs32@lo -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-NEXT: ; implicit-def: $sgpr3 +; GCN-NEXT: ; implicit-def: $sgpr6 ; GCN-NEXT: BB2_1: ; %bb1 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_cmp_ne_u32_e64 s[8:9], s2, 4 -; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-NEXT: s_cmp_gt_i32 s3, -1 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cmp_gt_i32 s6, -1 ; GCN-NEXT: s_cbranch_scc1 BB2_3 ; GCN-NEXT: ; %bb.2: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB2_1 Depth=1 -; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec @@ -231,7 +225,7 @@ define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GCN-NEXT: BB2_3: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB2_1 Depth=1 -; GCN-NEXT: s_add_i32 s3, s3, 1 +; GCN-NEXT: s_add_i32 s6, s6, 1 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] ; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] @@ -311,7 +305,7 @@ define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_cbranch_scc1 BB3_3 ; GCN-NEXT: ; %bb.2: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB3_1 Depth=1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v1, off, 
s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec @@ -399,7 +393,7 @@ define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_cbranch_scc1 BB4_3 ; GCN-NEXT: ; %bb.2: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec @@ -491,7 +485,7 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_cbranch_scc1 BB5_3 ; GCN-NEXT: ; %bb.2: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll new file mode 100644 index 0000000..ee16e88 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll @@ -0,0 +1,119 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 +@lds.size.2.align.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2 +@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4 +@lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8 +@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16 + +; CHECK: %llvm.amdgcn.module.lds.t = type { [8 x i8], [1 x i8] } +; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8] } +; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8] } +; CHECK: %llvm.amdgcn.kernel..lds.t = type { [2 x i8] } +; CHECK: %llvm.amdgcn.kernel..lds.t.0 = type { [4 x i8] } + +;. +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata" +; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16 +; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16 +; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2 +; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4 +;. 
+define amdgpu_kernel void @k0() { +; CHECK-LABEL: @k0( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)* +; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 +; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)* +; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 +; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)* +; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 +; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)* +; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 +; CHECK-NEXT: ret void +; + %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 + + %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 + + %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 + + %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)* + store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 + + ret void +} + +define amdgpu_kernel void @k1() { +; CHECK-LABEL: @k1( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)* +; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 +; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)* +; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 +; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)* +; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 +; CHECK-NEXT: ret void +; + %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 + + %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* 
%lds.size.4.align.4.bc, align 4 + + %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)* + store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 + + ret void +} + +define amdgpu_kernel void @0() { +; CHECK-LABEL: @0( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)* +; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 +; CHECK-NEXT: ret void +; + %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 + + ret void +} + +define amdgpu_kernel void @1() { +; CHECK-LABEL: @1( +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)* +; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 +; CHECK-NEXT: ret void +; + %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 + + ret void +} + +define void @f0() { +; CHECK-LABEL: @f0( +; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)* +; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 +; CHECK-NEXT: %lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)* +; CHECK-NEXT: store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4 +; CHECK-NEXT: ret void +; + %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 + + %lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @lds.size.8.align.8 to i8 addrspace(3)* + store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4 + + ret void +} +;. +; CHECK: attributes #0 = { nofree nosync nounwind readnone willreturn } +;. diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll new file mode 100644 index 0000000..ac7aa00 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll @@ -0,0 +1,68 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i32 } + +@lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1 + +; Use constant from different kernels +;. +; CHECK: @lds.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1 +; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4 +;. 
+define amdgpu_kernel void @k0(i64 %x) { +; CHECK-LABEL: @k0( +; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x +; CHECK-NEXT: store i8 1, i8* %ptr, align 1 +; CHECK-NEXT: ret void +; + %ptr = getelementptr inbounds i8, i8* addrspacecast ([2 x i8] addrspace(3)* @lds.1 to i8*), i64 %x + store i8 1, i8 addrspace(0)* %ptr, align 1 + ret void +} + +define amdgpu_kernel void @k1(i64 %x) { +; CHECK-LABEL: @k1( +; CHECK-NEXT: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(3)* @lds.1, i32 0, i32 0) to i8*), i64 %x +; CHECK-NEXT: store i8 1, i8* %ptr, align 1 +; CHECK-NEXT: ret void +; + %ptr = getelementptr inbounds i8, i8* addrspacecast ([2 x i8] addrspace(3)* @lds.1 to i8*), i64 %x + store i8 1, i8 addrspace(0)* %ptr, align 1 + ret void +} + +@lds.2 = internal unnamed_addr addrspace(3) global i32 undef, align 4 + +; Use constant twice from the same kernel +define amdgpu_kernel void @k2(i64 %x) { +; CHECK-LABEL: @k2( +; CHECK-NEXT: %ptr1 = bitcast i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0) to i8 addrspace(3)* +; CHECK-NEXT: store i8 1, i8 addrspace(3)* %ptr1, align 4 +; CHECK-NEXT: %ptr2 = bitcast i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0) to i8 addrspace(3)* +; CHECK-NEXT: store i8 2, i8 addrspace(3)* %ptr2, align 4 +; CHECK-NEXT: ret void +; + %ptr1 = bitcast i32 addrspace(3)* @lds.2 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %ptr1, align 4 + %ptr2 = bitcast i32 addrspace(3)* @lds.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %ptr2, align 4 + ret void +} + +@lds.3 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1 + +; Use the same variable twice from the same kernel, each time via a different constant expression.
+define amdgpu_kernel void @k3(i64 %x) { +; CHECK-LABEL: @k3( +; CHECK-NEXT: %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0, i32 16) to i64 addrspace(3)*) to i64* +; CHECK-NEXT: store i64 1, i64* %ptr1, align 1 +; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0, i32 24) to i64 addrspace(3)*) to i64* +; CHECK-NEXT: store i64 2, i64* %ptr2, align 1 +; + %ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64* + store i64 1, i64* %ptr1, align 1 + %ptr2 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 24) to i64 addrspace(3)*) to i64* + store i64 2, i64* %ptr2, align 1 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll new file mode 100644 index 0000000..49b912c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll @@ -0,0 +1,65 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 +@lds.size.2.align.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2 +@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4 +@lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8 +@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16 + +; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [4 x i8], [2 x i8], [1 x i8] } +; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [16 x i8], [4 x i8], [2 x i8] } + +;. +; CHECK: @lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8 +; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16 +; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16 +;. 
+define amdgpu_kernel void @k0() { +; CHECK-LABEL: @k0( +; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3) to i8 addrspace(3)* +; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 +; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)* +; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 +; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)* +; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 +; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)* +; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 +; CHECK-NEXT: ret void +; + %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)* + store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 + + %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 + + %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 + + %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)* + store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 + + ret void +} + +define amdgpu_kernel void @k1() { +; CHECK-LABEL: @k1( +; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)* +; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 +; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)* +; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 +; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)* +; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 +; CHECK-NEXT: ret void +; + %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)* + store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 + + %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)* + store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 + + %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)* + store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 + + ret void +} diff --git 
a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll index 7d56a67..43ba68a 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll @@ -5,8 +5,8 @@ @func = addrspace(3) global float undef, align 4 -; @kern is only used from a kernel so it is left unchanged -; CHECK: @kern = addrspace(3) global float undef, align 4 +; CHECK: %llvm.amdgcn.kernel.timestwo.lds.t = type { float } + @kern = addrspace(3) global float undef, align 4 ; @func is only used from a non-kernel function so is rewritten @@ -17,6 +17,7 @@ @both = addrspace(3) global float undef, align 4 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4 +; CHECK: @llvm.amdgcn.kernel.timestwo.lds = internal addrspace(3) global %llvm.amdgcn.kernel.timestwo.lds.t undef, align 4 ; CHECK-LABEL: @get_func() ; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 @@ -36,9 +37,9 @@ entry: ; CHECK-LABEL: @timestwo() ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] -; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 +; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 ; CHECK: %mul = mul i32 %ld, 2 -; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 +; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 define amdgpu_kernel void @timestwo() { %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast 
(float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 %mul = mul i32 %ld, 2 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll index f241ab6..1cb7309 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll @@ -7,7 +7,7 @@ ; CHECK-NOT: llvm.amdgcn.module.lds.t ; var1, var2 would be transformed were they used from a non-kernel function -; CHECK: @var1 = addrspace(3) global i32 undef +; CHECK-NOT: @var1 = ; CHECK: @var2 = addrspace(3) global float undef @var1 = addrspace(3) global i32 undef @var2 = addrspace(3) global float undef @@ -36,7 +36,7 @@ @toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8 ; Use by .used lists doesn't trigger lowering -; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +; CHECK-NOT: @llvm.used = @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata" ; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata" @@ -58,9 +58,8 @@ define void @use_variables() { ret void } -; Use by kernel doesn't trigger lowering ; CHECK-LABEL: @kern_use() -; CHECK: %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic +; CHECK: %inc = atomicrmw add i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.kern_use.lds.t, %llvm.amdgcn.kernel.kern_use.lds.t addrspace(3)* @llvm.amdgcn.kernel.kern_use.lds, i32 0, i32 0), i32 1 monotonic, align 4 define amdgpu_kernel void @kern_use() { %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic call void @use_variables() diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll index ad00d96..92b26d3 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll @@ -20,7 +20,7 @@ define amdgpu_kernel void @k0() { ; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ] ; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)* ; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1 -; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)* +; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)* ; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16 ; OPT-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll index 4fd9c01..fc91960 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll @@ -8,8 +8,8 @@ ; IR-LABEL: define 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
index 4fd9c01..fc91960 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -8,8 +8,8 @@
 ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 ; IR: alloca [10 x i32]
 ; ASM-LABEL: {{^}}promote_alloca_size_256:
-; ASM: .amdgpu_lds global_array0, 30000, 4
-; ASM: .amdgpu_lds global_array1, 30000, 4
+; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 4
+; ASM-NOT: .amdgpu_lds
 
 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
index bd496a8..d63bd45 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -disable-promote-alloca-to-vector -amdgpu-enable-lower-module-lds=0 < %s | FileCheck -check-prefix=GCN %s
 
 ; This shows that the amount LDS size estimate should try to not be
 ; sensitive to the order of the LDS globals. This should try to
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
index 8b9942c..0782f17 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -34,11 +34,7 @@ define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 add
 
 ; GCN-LABEL: {{^}}load_shl_base_lds_1:
 ; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
-
-; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
-; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
-
-; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
+; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[OFS]] offset:8
 ; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
 ; GCN-DAG: buffer_store_dword [[RESULT]]
 ; GCN-DAG: buffer_store_dword [[ADDUSE]]
@@ -72,18 +68,10 @@ define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i
 
 ; The two globals are placed adjacent in memory, so the same base
 ; pointer can be used with an offset into the second one.
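; A worked example of why ds_read2st64_b32 becomes legal here (derived from
; the checks that follow; the 2048-byte size of lds0 is implied by offset1:9
; rather than stated in this diff). ds_read2st64_b32 encodes two offsets in
; units of 64 dwords, i.e. 256 bytes, off a single base register:
;
;   base           = 4*tid             ; v_lshlrev_b32_e32 [[PTR]], 2, tid
;   lds0[tid + 64] = base + 256        ;  256 = 1*256  -> offset0:1
;   lds1[tid + 64] = base + 2048 + 256 ; 2304 = 9*256  -> offset1:9
;
; Before the merge each global had its own relocated address, so no common
; base register existed and two separate ds_read_b32 were required.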
-; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints
-
 ; GCN-LABEL: {{^}}load_shl_base_lds_2:
-; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
-; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
-; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
-; GCN-DAG: s_mov_b32 m0, -1
-
-; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
-; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
-; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
-
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: s_mov_b32 m0, -1
+; GCN: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
 ; GCN: s_endpgm
 define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index a9736c4..b10e96c 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -10,7 +10,7 @@
 ; CHECK: machineFunctionInfo:
 ; CHECK-NEXT: explicitKernArgSize: 128
 ; CHECK-NEXT: maxKernArgAlign: 64
-; CHECK-NEXT: ldsSize: 0
+; CHECK-NEXT: ldsSize: 2048
 ; CHECK-NEXT: dynLDSAlign: 1
 ; CHECK-NEXT: isEntryFunction: true
 ; CHECK-NEXT: noSignedZerosFPMath: false
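; Note on the ldsSize change above: the merged LDS struct is now allocated
; for the kernel up front, so the MIR-level machine function info reports
; its static size instead of 0. As a hypothetical illustration (this
; variable is not from the test), a kernel whose only LDS use is
;   @lds = addrspace(3) global [512 x i32] undef, align 4
; would report ldsSize: 2048 (512 dwords x 4 bytes) once the variable is
; lowered into @llvm.amdgcn.kernel.<kernel-name>.lds.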