From e88bbc34c6c13b1ac1a76183afa04b327fc1abc0 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 23 Sep 2016 01:33:26 +0000 Subject: [PATCH] AMDGPU/SI: Include implicit arguments in kernarg_segment_byte_size Reviewers: arsenm Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, llvm-commits, tony-tye Differential Revision: https://reviews.llvm.org/D24835 llvm-svn: 282223 --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 3 ++- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 9 +++++++++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 14 ++++++++++++++ .../AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll | 6 +++++- 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index f2b9fc8477d9..7f0d5141f365 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -730,7 +730,8 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; // FIXME: Should use getKernArgSize - header.kernarg_segment_byte_size = MFI->getABIArgOffset(); + header.kernarg_segment_byte_size = + STM.getKernArgSegmentSize(MFI->getABIArgOffset()); header.wavefront_sgpr_count = KernelInfo.NumSGPR; header.workitem_vgpr_count = KernelInfo.NumVGPR; header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 3c4b5e72761d..586d0478a5d1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -297,6 +297,15 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); } +unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const { + unsigned ImplicitBytes = getImplicitArgNumBytes(); + if (ImplicitBytes == 0) + return ExplicitArgBytes; + + 
unsigned Alignment = getAlignmentForImplicitArgPtr(); + return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; +} + unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 1e52e1c80633..c278cc55a028 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -142,6 +142,10 @@ public: return TargetTriple.getOS() == Triple::Mesa3D; } + bool isOpenCLEnv() const { + return TargetTriple.getEnvironment() == Triple::OpenCL; + } + Generation getGeneration() const { return Gen; } @@ -288,6 +292,14 @@ public: return isAmdHsaOS() ? 8 : 4; } + unsigned getImplicitArgNumBytes() const { + if (isMesa3DOS()) + return 16; + if (isAmdHsaOS() && isOpenCLEnv()) + return 32; + return 0; + } + unsigned getStackAlignment() const { // Scratch is allocated in 256 dword per wave blocks. 
return 4 * 256 / getWavefrontSize(); @@ -521,6 +533,8 @@ public: return SGPRInitBug; } + unsigned getKernArgSegmentSize(unsigned ExplicitArgBytes) const; + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index c108be5cd8aa..5d4d4cd7ee46 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL,HSA-NOENV %s +; RUN: llc -mtriple=amdgcn--amdhsa-opencl -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL,HSA-OPENCL %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,OS-MESA3D,MESA,ALL %s ; RUN: llc -mtriple=amdgcn-mesa-unknown -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,MESA,ALL %s @@ -29,6 +30,9 @@ define void @test_implicit(i32 addrspace(1)* %out) #1 { } ; ALL-LABEL: {{^}}test_implicit_alignment +; HSA-NOENV: kernarg_segment_byte_size = 10 +; HSA-OPENCL: kernarg_segment_byte_size = 48 +; OS-MESA3D: kernarg_segment_byte_size = 28 ; OS-UNKNOWN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xc ; HSA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4 ; OS-MESA3D: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x3 -- 2.34.1