From 3ae4c3589ec7336d363fc1779c4a99360164c8f4 Mon Sep 17 00:00:00 2001
From: Changpeng Fang
Date: Tue, 20 Sep 2022 17:25:52 -0700
Subject: [PATCH] AMDGPU: Implicit kernel arguments related optimization when
 uniform-workgroup-size=true

Summary:
Under code object version 5, __ockl_get_local_size returns the value computed
by the expression:

  workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder

For functions with the attribute uniform-work-group-size=true, we can evaluate
workgroup_id < hidden_block_count as true, and thus hidden_group_size is
returned for __ockl_get_local_size. With uniform-work-group-size=true, this
patch also sets all remainders to zero, and if reqd_work_group_size metadata
is present, it sets the work-group size to the required value from the
metadata.
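For illustration, here is a sketch of the kind of IR this folds, distilled from
the new test implicit-arg-v5-opt.ll added below (all names are taken from that
test):

  ; Before: __ockl_get_local_size-style sequence under code object v5.
  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %bc.block.count.x = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %block.count.x = load i32, i32 addrspace(4)* %bc.block.count.x, align 4
  %cmp.id.count = icmp ult i32 %group.id, %block.count.x
  %local.size.offset = select i1 %cmp.id.count, i64 12, i64 18
  %gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
  %bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
  %local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2

With uniform-work-group-size=true the icmp folds to true, the select collapses
to offset 12 (HIDDEN_GROUP_SIZE_X), and after instcombine only the direct load
of the group size remains (see the GCN check lines in the test).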
Reviewers: arsenm and bcahoon

Differential Revision: https://reviews.llvm.org/D131276
---
 .../Target/AMDGPU/AMDGPULowerKernelAttributes.cpp  | 325 ++++++++++++++-------
 llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll    | 224 ++++++++++++++
 2 files changed, 441 insertions(+), 108 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index f5903b3..0d2b2a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -41,6 +42,21 @@ enum DispatchPackedOffsets {
   GRID_SIZE_Z = 20
 };
 
+// Field offsets to implicit kernel argument pointer.
+enum ImplicitArgOffsets {
+  HIDDEN_BLOCK_COUNT_X = 0,
+  HIDDEN_BLOCK_COUNT_Y = 4,
+  HIDDEN_BLOCK_COUNT_Z = 8,
+
+  HIDDEN_GROUP_SIZE_X = 12,
+  HIDDEN_GROUP_SIZE_Y = 14,
+  HIDDEN_GROUP_SIZE_Z = 16,
+
+  HIDDEN_REMAINDER_X = 18,
+  HIDDEN_REMAINDER_Y = 20,
+  HIDDEN_REMAINDER_Z = 22,
+};
+
 class AMDGPULowerKernelAttributes : public ModulePass {
 public:
   static char ID;
@@ -60,7 +76,7 @@ public:
 
 } // end anonymous namespace
 
-static bool processUse(CallInst *CI) {
+static bool processUse(CallInst *CI, bool IsV5OrAbove) {
   Function *F = CI->getParent()->getParent();
 
   auto MD = F->getMetadata("reqd_work_group_size");
@@ -72,13 +88,10 @@
   if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
     return false;
 
-  Value *WorkGroupSizeX = nullptr;
-  Value *WorkGroupSizeY = nullptr;
-  Value *WorkGroupSizeZ = nullptr;
-
-  Value *GridSizeX = nullptr;
-  Value *GridSizeY = nullptr;
-  Value *GridSizeZ = nullptr;
+  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
+  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
+  Value *Remainders[3] = {nullptr, nullptr, nullptr};
+  Value *GridSizes[3] = {nullptr, nullptr, nullptr};
 
   const DataLayout &DL = F->getParent()->getDataLayout();
 
@@ -89,10 +102,13 @@
       continue;
 
     int64_t Offset = 0;
-    if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
-      continue;
+    BitCastInst *BCI = dyn_cast<BitCastInst>(U);
+    if (!BCI) {
+      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+        continue;
+      BCI = dyn_cast<BitCastInst>(*U->user_begin());
+    }
 
-    auto *BCI = dyn_cast<BitCastInst>(*U->user_begin());
     if (!BCI || !BCI->hasOneUse())
       continue;
 
@@ -103,134 +119,218 @@
     unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
 
     // TODO: Handle merged loads.
-    switch (Offset) {
-    case WORKGROUP_SIZE_X:
-      if (LoadSize == 2)
-        WorkGroupSizeX = Load;
-      break;
-    case WORKGROUP_SIZE_Y:
-      if (LoadSize == 2)
-        WorkGroupSizeY = Load;
-      break;
-    case WORKGROUP_SIZE_Z:
-      if (LoadSize == 2)
-        WorkGroupSizeZ = Load;
-      break;
-    case GRID_SIZE_X:
-      if (LoadSize == 4)
-        GridSizeX = Load;
-      break;
-    case GRID_SIZE_Y:
-      if (LoadSize == 4)
-        GridSizeY = Load;
-      break;
-    case GRID_SIZE_Z:
-      if (LoadSize == 4)
-        GridSizeZ = Load;
-      break;
-    default:
-      break;
+    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
+      switch (Offset) {
+      case HIDDEN_BLOCK_COUNT_X:
+        if (LoadSize == 4)
+          BlockCounts[0] = Load;
+        break;
+      case HIDDEN_BLOCK_COUNT_Y:
+        if (LoadSize == 4)
+          BlockCounts[1] = Load;
+        break;
+      case HIDDEN_BLOCK_COUNT_Z:
+        if (LoadSize == 4)
+          BlockCounts[2] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_X:
+        if (LoadSize == 2)
+          GroupSizes[0] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_Y:
+        if (LoadSize == 2)
+          GroupSizes[1] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_Z:
+        if (LoadSize == 2)
+          GroupSizes[2] = Load;
+        break;
+      case HIDDEN_REMAINDER_X:
+        if (LoadSize == 2)
+          Remainders[0] = Load;
+        break;
+      case HIDDEN_REMAINDER_Y:
+        if (LoadSize == 2)
+          Remainders[1] = Load;
+        break;
+      case HIDDEN_REMAINDER_Z:
+        if (LoadSize == 2)
+          Remainders[2] = Load;
+        break;
+      default:
+        break;
+      }
+    } else { // Base is DispatchPtr.
+      switch (Offset) {
+      case WORKGROUP_SIZE_X:
+        if (LoadSize == 2)
+          GroupSizes[0] = Load;
+        break;
+      case WORKGROUP_SIZE_Y:
+        if (LoadSize == 2)
+          GroupSizes[1] = Load;
+        break;
+      case WORKGROUP_SIZE_Z:
+        if (LoadSize == 2)
+          GroupSizes[2] = Load;
+        break;
+      case GRID_SIZE_X:
+        if (LoadSize == 4)
+          GridSizes[0] = Load;
+        break;
+      case GRID_SIZE_Y:
+        if (LoadSize == 4)
+          GridSizes[1] = Load;
+        break;
+      case GRID_SIZE_Z:
+        if (LoadSize == 4)
+          GridSizes[2] = Load;
+        break;
+      default:
+        break;
+      }
     }
   }
 
-  // Pattern match the code used to handle partial workgroup dispatches in the
-  // library implementation of get_local_size, so the entire function can be
-  // constant folded with a known group size.
-  //
-  //   uint r = grid_size - group_id * group_size;
-  //   get_local_size = (r < group_size) ? r : group_size;
-  //
-  // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
-  // the grid_size is required to be a multiple of group_size). In this case:
-  //
-  //   grid_size - (group_id * group_size) < group_size
-  //   ->
-  //   grid_size < group_size + (group_id * group_size)
-  //
-  //   (grid_size / group_size) < 1 + group_id
-  //
-  // grid_size / group_size is at least 1, so we can conclude the select
-  // condition is false (except for group_id == 0, where the select result is
-  // the same).
   bool MadeChange = false;
-  Value *WorkGroupSizes[3] = { WorkGroupSizeX, WorkGroupSizeY, WorkGroupSizeZ };
-  Value *GridSizes[3] = { GridSizeX, GridSizeY, GridSizeZ };
-
-  for (int I = 0; HasUniformWorkGroupSize && I < 3; ++I) {
-    Value *GroupSize = WorkGroupSizes[I];
-    Value *GridSize = GridSizes[I];
-    if (!GroupSize || !GridSize)
-      continue;
+  if (IsV5OrAbove && HasUniformWorkGroupSize) {
+    // Under v5 __ockl_get_local_size returns the value computed by the expression:
+    //
+    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
+    //
+    // For functions with the attribute uniform-work-group-size=true. we can evaluate
+    // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned
+    // for __ockl_get_local_size.
+    for (int I = 0; I < 3; ++I) {
+      Value *BlockCount = BlockCounts[I];
+      if (!BlockCount)
+        continue;
 
-    using namespace llvm::PatternMatch;
-    auto GroupIDIntrin =
-        I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
-               : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
-                         : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+      using namespace llvm::PatternMatch;
+      auto GroupIDIntrin =
+          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+      for (User *ICmp : BlockCount->users()) {
+        ICmpInst::Predicate Pred;
+        if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
+          if (Pred != ICmpInst::ICMP_ULT)
+            continue;
+          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
+          MadeChange = true;
+        }
+      }
+    }
 
-    for (User *U : GroupSize->users()) {
-      auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
-      if (!ZextGroupSize)
+    // All remainders should be 0 with uniform work group size.
+    for (Value *Remainder : Remainders) {
+      if (!Remainder)
+        continue;
+      Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
+      MadeChange = true;
+    }
+  } else if (HasUniformWorkGroupSize) { // Pre-V5.
+    // Pattern match the code used to handle partial workgroup dispatches in the
+    // library implementation of get_local_size, so the entire function can be
+    // constant folded with a known group size.
+    //
+    //   uint r = grid_size - group_id * group_size;
+    //   get_local_size = (r < group_size) ? r : group_size;
+    //
+    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
+    // the grid_size is required to be a multiple of group_size). In this case:
+    //
+    //   grid_size - (group_id * group_size) < group_size
+    //   ->
+    //   grid_size < group_size + (group_id * group_size)
+    //
+    //   (grid_size / group_size) < 1 + group_id
+    //
+    // grid_size / group_size is at least 1, so we can conclude the select
+    // condition is false (except for group_id == 0, where the select result is
+    // the same).
+    for (int I = 0; I < 3; ++I) {
+      Value *GroupSize = GroupSizes[I];
+      Value *GridSize = GridSizes[I];
+      if (!GroupSize || !GridSize)
        continue;
 
-      for (User *UMin : ZextGroupSize->users()) {
-        if (match(UMin,
-                  m_UMin(m_Sub(m_Specific(GridSize),
-                               m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
-                         m_Specific(ZextGroupSize)))) {
-          if (HasReqdWorkGroupSize) {
-            ConstantInt *KnownSize
-              = mdconst::extract<ConstantInt>(MD->getOperand(I));
-            UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
-                KnownSize, UMin->getType(), false));
-          } else {
-            UMin->replaceAllUsesWith(ZextGroupSize);
+      using namespace llvm::PatternMatch;
+      auto GroupIDIntrin =
+          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+      for (User *U : GroupSize->users()) {
+        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
+        if (!ZextGroupSize)
+          continue;
+
+        for (User *UMin : ZextGroupSize->users()) {
+          if (match(UMin,
+                    m_UMin(m_Sub(m_Specific(GridSize),
+                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
+                           m_Specific(ZextGroupSize)))) {
+            if (HasReqdWorkGroupSize) {
+              ConstantInt *KnownSize
+                = mdconst::extract<ConstantInt>(MD->getOperand(I));
+              UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
+                  KnownSize, UMin->getType(), false));
+            } else {
+              UMin->replaceAllUsesWith(ZextGroupSize);
+            }
+
+            MadeChange = true;
           }
-
-          MadeChange = true;
         }
       }
     }
   }
 
+  // If reqd_work_group_size is set, we can replace work group size with it.
   if (!HasReqdWorkGroupSize)
     return MadeChange;
 
-  // Eliminate any other loads we can from the dispatch packet.
-  for (int I = 0; I < 3; ++I) {
-    Value *GroupSize = WorkGroupSizes[I];
+  for (int I = 0; I < 3; I++) {
+    Value *GroupSize = GroupSizes[I];
     if (!GroupSize)
       continue;
 
     ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
     GroupSize->replaceAllUsesWith(
-        ConstantExpr::getIntegerCast(KnownSize,
-                                     GroupSize->getType(),
-                                     false));
+        ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
     MadeChange = true;
   }
 
   return MadeChange;
 }
 
+
 // TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
 // TargetPassConfig for subtarget.
 bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
-  StringRef DispatchPtrName
-    = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
+  bool MadeChange = false;
+  Function *BasePtr = nullptr;
+  bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
+  if (IsV5OrAbove) {
+    StringRef ImplicitArgPtrName =
+        Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
+    BasePtr = M.getFunction(ImplicitArgPtrName);
+  } else { // Pre-V5.
+    StringRef DispatchPtrName =
+        Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
+    BasePtr = M.getFunction(DispatchPtrName);
+  }
 
-  Function *DispatchPtr = M.getFunction(DispatchPtrName);
-  if (!DispatchPtr) // Dispatch ptr not used.
+  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
     return false;
 
-  bool MadeChange = false;
   SmallPtrSet<Instruction *, 4> HandledUses;
-  for (auto *U : DispatchPtr->users()) {
+  for (auto *U : BasePtr->users()) {
     CallInst *CI = cast<CallInst>(U);
     if (HandledUses.insert(CI).second) {
-      if (processUse(CI))
+      if (processUse(CI, IsV5OrAbove))
         MadeChange = true;
     }
   }
@@ -238,6 +338,7 @@ bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
   return MadeChange;
 }
 
+
 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                       "AMDGPU Kernel Attributes", false, false)
 INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
@@ -251,17 +352,25 @@ ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
 
 PreservedAnalyses
 AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
-  StringRef DispatchPtrName =
-      Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
+  Function *BasePtr = nullptr;
+  bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
+  if (IsV5OrAbove) {
+    StringRef ImplicitArgPtrName =
+        Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
+    BasePtr = F.getParent()->getFunction(ImplicitArgPtrName);
+  } else { // Pre-V5.
+    StringRef DispatchPtrName =
+        Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
+    BasePtr = F.getParent()->getFunction(DispatchPtrName);
+  }
 
-  Function *DispatchPtr = F.getParent()->getFunction(DispatchPtrName);
-  if (!DispatchPtr) // Dispatch ptr not used.
+  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
     return PreservedAnalyses::all();
 
   for (Instruction &I : instructions(F)) {
     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
-      if (CI->getCalledFunction() == DispatchPtr)
-        processUse(CI);
+      if (CI->getCalledFunction() == BasePtr)
+        processUse(CI, IsV5OrAbove);
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
new file mode 100644
index 0000000..0221933
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_local_size_x(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_local_size_x(
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 12
+; GCN-NEXT:    [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
+; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 4
+; GCN-NEXT:    store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %bc.block.count.x = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+  %block.count.x = load i32, i32 addrspace(4)* %bc.block.count.x, align 4
+  %cmp.id.count = icmp ult i32 %group.id, %block.count.x
+  %local.size.offset = select i1 %cmp.id.count, i64 12, i64 18
+  %gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
+  %bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
+  %local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
+  store i16 %local.size, i16 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_local_size_y(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_local_size_y(
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 14
+; GCN-NEXT:    [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
+; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 2
+; GCN-NEXT:    store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.block.count.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 4
+  %bc.block.count.y = bitcast i8 addrspace(4)* %gep.block.count.y to i32 addrspace(4)*
+  %block.count.y = load i32, i32 addrspace(4)* %bc.block.count.y, align 4
+  %cmp.id.count = icmp ult i32 %group.id, %block.count.y
+  %local.size.offset = select i1 %cmp.id.count, i64 14, i64 20
+  %gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
+  %bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
+  %local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
+  store i16 %local.size, i16 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_local_size_z(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_local_size_z(
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 16
+; GCN-NEXT:    [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
+; GCN-NEXT:    [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 4
+; GCN-NEXT:    store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.block.count.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 8
+  %bc.block.count.z = bitcast i8 addrspace(4)* %gep.block.count.z to i32 addrspace(4)*
+  %block.count.z = load i32, i32 addrspace(4)* %bc.block.count.z, align 4
+  %cmp.id.count = icmp ult i32 %group.id, %block.count.z
+  %local.size.offset = select i1 %cmp.id.count, i64 16, i64 22
+  %gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
+  %bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
+  %local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
+  store i16 %local.size, i16 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_remainder_x(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_remainder_x(
+; GCN-NEXT:    store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
+  %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
+  %remainder.x = load i16, i16 addrspace(4)* %bc.x, align 2
+  store i16 %remainder.x, i16 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_remainder_y(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_remainder_y(
+; GCN-NEXT:    store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
+  %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
+  %remainder.y = load i16, i16 addrspace(4)* %bc.y, align 2
+  store i16 %remainder.y, i16 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_remainder_z(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_remainder_z(
+; GCN-NEXT:    store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
+  %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
+  %remainder.z = load i16, i16 addrspace(4)* %bc.z, align 2
+  store i16 %remainder.z, i16 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_work_group_size_x(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_work_group_size_x(
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_X:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 12
+; GCN-NEXT:    [[BC_X:%.*]] = bitcast i8 addrspace(4)* [[GEP_X]] to i16 addrspace(4)*
+; GCN-NEXT:    [[GROUP_SIZE_X:%.*]] = load i16, i16 addrspace(4)* [[BC_X]], align 4
+; GCN-NEXT:    store i16 [[GROUP_SIZE_X]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 12
+  %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
+  %group.size.x = load i16, i16 addrspace(4)* %bc.x, align 2
+  store i16 %group.size.x, i16 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_work_group_size_y(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_work_group_size_y(
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_Y:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 14
+; GCN-NEXT:    [[BC_Y:%.*]] = bitcast i8 addrspace(4)* [[GEP_Y]] to i16 addrspace(4)*
+; GCN-NEXT:    [[GROUP_SIZE_Y:%.*]] = load i16, i16 addrspace(4)* [[BC_Y]], align 2
+; GCN-NEXT:    store i16 [[GROUP_SIZE_Y]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 14
+  %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
+  %group.size.y = load i16, i16 addrspace(4)* %bc.y, align 2
+  store i16 %group.size.y, i16 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_work_group_size_z(i16 addrspace(1)* %out) #0 {
+; GCN-LABEL: @get_work_group_size_z(
+; GCN-NEXT:    [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT:    [[GEP_Z:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 16
+; GCN-NEXT:    [[BC_Z:%.*]] = bitcast i8 addrspace(4)* [[GEP_Z]] to i16 addrspace(4)*
+; GCN-NEXT:    [[GROUP_SIZE_Z:%.*]] = load i16, i16 addrspace(4)* [[BC_Z]], align 4
+; GCN-NEXT:    store i16 [[GROUP_SIZE_Z]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 16
+  %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
+  %group.size.z = load i16, i16 addrspace(4)* %bc.z, align 2
+  store i16 %group.size.z, i16 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_work_group_size_x_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
+; GCN-LABEL: @get_work_group_size_x_reqd(
+; GCN-NEXT:    store i16 8, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 12
+  %bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
+  %group.size.x = load i16, i16 addrspace(4)* %bc.x, align 2
+  store i16 %group.size.x, i16 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_work_group_size_y_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
+; GCN-LABEL: @get_work_group_size_y_reqd(
+; GCN-NEXT:    store i16 16, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 14
+  %bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
+  %group.size.y = load i16, i16 addrspace(4)* %bc.y, align 2
+  store i16 %group.size.y, i16 addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
+define amdgpu_kernel void @get_work_group_size_z_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
+; GCN-LABEL: @get_work_group_size_z_reqd(
+; GCN-NEXT:    store i16 2, i16 addrspace(1)* [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+  %implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 16
+  %bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
+  %group.size.z = load i16, i16 addrspace(4)* %bc.z, align 2
+  store i16 %group.size.z, i16 addrspace(1)* %out
+  ret void
+}
+
+
+declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.z() #1
+
+!llvm.module.flags = !{!1}
+
+attributes #0 = { nounwind "uniform-work-group-size"="true" }
+attributes #1 = { nounwind readnone speculatable }
+!0 = !{i32 8, i32 16, i32 2}
+!1 = !{i32 1, !"amdgpu_code_object_version", i32 500}
-- 
2.7.4