kernel argument that holds the completion action pointer. If this
attribute is absent, then the amdgpu-no-implicitarg-ptr is also removed.
+ "amdgpu-lds-size" The number of bytes that will be allocated in the Local Data Store at
+ address zero. Variables are allocated within this frame using absolute
+ symbol metadata, primarily by the AMDGPULowerModuleLDS pass. This is an
+ internal detail of how LDS variables are lowered; language front ends
+ should not set this.
+
======================================= ==========================================================
Calling Conventions
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getParent()->getDataLayout();
- Info->allocateKnownAddressLDSGlobal(F);
-
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
- Info->allocateKnownAddressLDSGlobal(F);
-
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
return KernelToCreatedDynamicLDS;
}
+ // This attribute is no longer used by the backend. TODO: Delete it in favour
+ // of pass-local state and update the tests to remove the string.
// Reports whether F carries the "amdgpu-elide-module-lds" string function
// attribute. Only the presence of the attribute is tested, not its value.
static bool canElideModuleLDS(const Function &F) {
return F.hasFnAttribute("amdgpu-elide-module-lds");
}
// All kernel frames have been allocated. Calculate and record the
// addresses.
-
{
const DataLayout &DL = M.getDataLayout();
continue;
// All three of these are optional. The first variable is allocated at
- // zero. They are allocated by allocateKnownAddressLDSGlobal in the
- // following order:
+ // zero. They are allocated by AMDGPUMachineFunction as one block.
+ // Layout:
//{
// module.lds
// alignment padding
if (AllocateKernelScopeStruct) {
GlobalVariable *KernelStruct = Replacement->second.SGV;
-
Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
-
recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
-
Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
-
}
+ // If there is dynamic allocation, the alignment needed is included in
+ // the static frame size. There may be no reference to the dynamic
+ // variable in the kernel itself, so without including it here, that
+ // alignment padding could be missed.
if (AllocateDynamicVariable) {
GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
-
Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
-
recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
}
+
+ if (Offset != 0)
+ Func.addFnAttr("amdgpu-lds-size", std::to_string(Offset));
}
}
// Assume the attribute allocates before any known GDS globals.
StaticGDSSize = GDSSize;
+ // The two separate variables are only profitable when the LDS module lowering
+ // pass is disabled. If graphics does not use dynamic LDS, this is never
+ // profitable. Leaving cleanup for a later change.
+ LDSSize = F.getFnAttributeAsParsedInteger("amdgpu-lds-size", 0);
+ StaticLDSSize = LDSSize;
+
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
unsigned Offset;
if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+
+ std::optional<uint32_t> MaybeAbs = getLDSAbsoluteAddress(GV);
+ if (MaybeAbs) {
+ // Absolute address LDS variables that exist prior to the LDS lowering
+ // pass raise a fatal error in that pass. These failure modes are only
+ // reachable if that lowering pass is disabled or broken. If/when adding
+ // support for absolute addresses on user specified variables, the
+ // alignment check moves to the lowering pass and the frame calculation
+ // needs to take the user variables into consideration.
+
+ uint32_t ObjectStart = *MaybeAbs;
+
+ if (ObjectStart != alignTo(ObjectStart, Alignment)) {
+ report_fatal_error("Absolute address LDS variable inconsistent with "
+ "variable alignment");
+ }
+
+ if (isModuleEntryFunction()) {
+ // If this is a module entry function, we can also sanity check against
+ // the static frame. Strictly it would be better to check against the
+ // attribute, i.e. that the variable is within the always-allocated
+ // section, and not within some other non-absolute-address object
+ // allocated here, but the extra error detection is minimal and we would
+ // have to pass the Function around or cache the attribute value.
+ uint32_t ObjectEnd =
+ ObjectStart + DL.getTypeAllocSize(GV.getValueType());
+ if (ObjectEnd > StaticLDSSize) {
+ report_fatal_error(
+ "Absolute address LDS variable outside of static frame");
+ }
+ }
+
+ Entry.first->second = ObjectStart;
+ return ObjectStart;
+ }
+
/// TODO: We should sort these to minimize wasted space due to alignment
/// padding. Currently the padding is decided by the first encountered use
/// during lowering.
return Offset;
}
-static constexpr StringLiteral ModuleLDSName = "llvm.amdgcn.module.lds";
-
-static const GlobalVariable *getKernelLDSGlobalFromFunction(const Function &F) {
- const Module *M = F.getParent();
- std::string KernelLDSName = "llvm.amdgcn.kernel.";
- KernelLDSName += F.getName();
- KernelLDSName += ".lds";
- return M->getNamedGlobal(KernelLDSName);
-}
-
static const GlobalVariable *
getKernelDynLDSGlobalFromFunction(const Function &F) {
const Module *M = F.getParent();
return M->getNamedGlobal(KernelDynLDSName);
}
-// This kernel calls no functions that require the module lds struct
-static bool canElideModuleLDS(const Function &F) {
- return F.hasFnAttribute("amdgpu-elide-module-lds");
-}
-
-void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
- const Module *M = F.getParent();
- // This function is called before allocating any other LDS so that it can
- // reliably put values at known addresses. Consequently, dynamic LDS, if
- // present, will not yet have been allocated
-
- assert(getDynLDSAlign() == Align() && "dynamic LDS not yet allocated");
-
- if (isModuleEntryFunction()) {
-
- // Pointer values start from zero, memory allocated per-kernel-launch
- // Variables can be grouped into a module level struct and a struct per
- // kernel function by AMDGPULowerModuleLDSPass. If that is done, they
- // are allocated at statically computable addresses here.
- //
- // Address 0
- // {
- // llvm.amdgcn.module.lds
- // }
- // alignment padding
- // {
- // llvm.amdgcn.kernel.some-name.lds
- // }
- // other variables, e.g. dynamic lds, allocated after this call
-
- const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName);
- const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
- const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
-
- if (GV && !canElideModuleLDS(F)) {
- unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align());
- std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*GV);
- if (!Expect || (Offset != *Expect)) {
- report_fatal_error("Inconsistent metadata on module LDS variable");
- }
- }
-
- if (KV) {
- // The per-kernel offset is deterministic because it is allocated
- // before any other non-module LDS variables.
- unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align());
- std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*KV);
- if (!Expect || (Offset != *Expect)) {
- report_fatal_error("Inconsistent metadata on kernel LDS variable");
- }
- }
-
- if (Dyn) {
- // The dynamic LDS is deterministic because the per-kernel one has the
- // maximum alignment of any reachable and all remaining LDS variables,
- // if this is present, are themselves dynamic LDS and will be allocated
- // at the same address.
- setDynLDSAlign(F, *Dyn);
- unsigned Offset = LDSSize;
- std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
- if (!Expect || (Offset != *Expect)) {
- report_fatal_error("Inconsistent metadata on dynamic LDS variable");
- }
- }
- }
-}
-
std::optional<uint32_t>
AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
// TODO: Would be more consistent with the abs symbols to use a range
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
Align Trailing);
- void allocateKnownAddressLDSGlobal(const Function &F);
-
static std::optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
static std::optional<uint32_t> getLDSAbsoluteAddress(const GlobalValue &GV);
return DAG.getEntryNode();
}
- Info->allocateKnownAddressLDSGlobal(Fn);
-
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;
BitVector Skipped(Ins.size());
; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 4, !absolute_symbol !0
;.
define amdgpu_kernel void @k0() #0 {
-; CHECK-LABEL: @k0(
+; CHECK-LABEL: @k0() #0
; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3), align 2, !alias.scope !1, !noalias !4
; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2), align 4, !alias.scope !8, !noalias !9
; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 16, !alias.scope !10, !noalias !11
}
define amdgpu_kernel void @k1() #0 {
-; CHECK-LABEL: @k1(
+; CHECK-LABEL: @k1() #1
; CHECK-NEXT: store i8 2, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2), align 4, !alias.scope !14, !noalias !17
; CHECK-NEXT: store i8 4, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1), align 16, !alias.scope !20, !noalias !21
; CHECK-NEXT: store i8 16, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 16, !alias.scope !22, !noalias !23
}
define amdgpu_kernel void @k2() #0 {
-; CHECK-LABEL: @k2(
+; CHECK-LABEL: @k2() #2
; CHECK-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k2.lds, align 2
; CHECK-NEXT: ret void
;
}
define amdgpu_kernel void @k3() #0 {
-; CHECK-LABEL: @k3(
+; CHECK-LABEL: @k3() #3
; CHECK-NEXT: store i8 4, ptr addrspace(3) @llvm.amdgcn.kernel.k3.lds, align 4
; CHECK-NEXT: ret void
;
ret void
}
-
+; CHECK-LABEL: @calls_f0() #4
define amdgpu_kernel void @calls_f0() {
call void @f0()
ret void
}
define void @f0() {
-; CHECK-LABEL: define void @f0(
+; CHECK-LABEL: define void @f0()
; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.module.lds.t, ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 8, !noalias !24
; CHECK-NEXT: store i8 8, ptr addrspace(3) @llvm.amdgcn.module.lds, align 8, !noalias !24
; CHECK-NEXT: ret void
ret void
}
-attributes #0 = { "amdgpu-elide-module-lds" }
-; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="23" }
+; CHECK: attributes #1 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="22" }
+; CHECK: attributes #2 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="2" }
+; CHECK: attributes #3 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="4" }
+; CHECK: attributes #4 = { "amdgpu-lds-size"="9" }
; CHECK: !0 = !{i64 0, i64 1}
@B = external addrspace(3) global [0 x i32]
define amdgpu_kernel void @kernel_0() {
-; CHECK-LABEL: define amdgpu_kernel void @kernel_0() !llvm.amdgcn.lds.kernel.id !1 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_0() #0 !llvm.amdgcn.lds.kernel.id !1 {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_0.lds) ]
; CHECK-NEXT: call void @call_store_A()
; CHECK-NEXT: ret void
}
define amdgpu_kernel void @kernel_2() {
-; CHECK-LABEL: define amdgpu_kernel void @kernel_2() !llvm.amdgcn.lds.kernel.id !3 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_2() #0 !llvm.amdgcn.lds.kernel.id !3 {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_2.lds) ]
; CHECK-NEXT: call void @store_A()
; CHECK-NEXT: ret void
;
ret ptr addrspacecast (ptr addrspace(3) @B to ptr)
}
+
+; CHECK: attributes #0 = { "amdgpu-lds-size"="64" }
ret void
}
-; CHECK-LABEL: @timestwo() #0
+; CHECK-LABEL: @timestwo() #1
; CHECK-NOT: call void @llvm.donothing()
; CHECK: %1 = addrspacecast ptr addrspace(3) @llvm.amdgcn.kernel.timestwo.lds to ptr
; CHECK: %12 = inttoptr i64 %11 to ptr
; CHECK: store i32 %mul, ptr %12, align 4
; CHECK: ret void
-define amdgpu_kernel void @timestwo() {
+define amdgpu_kernel void @timestwo() #1 {
%ld = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @kern to ptr) to i64)) to ptr), align 4
%mul = mul i32 %ld, 2
store i32 %mul, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @b_both to ptr) to i64)) to ptr), align 4
ret void
}
-; CHECK-LABEL: @through_functions()
+; CHECK-LABEL: @through_functions() #2
define amdgpu_kernel void @through_functions() {
%ld = call i32 @get_func()
%mul = mul i32 %ld, 4
attributes #0 = { "amdgpu-elide-module-lds" }
; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }
+; CHECK: attributes #1 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="8" }
+; CHECK: attributes #2 = { "amdgpu-lds-size"="8" }
}
define amdgpu_kernel void @expect_align4() {
-; CHECK-LABEL: @expect_align4() !llvm.amdgcn.lds.kernel.id !4 {
+; CHECK-LABEL: @expect_align4() #2 !llvm.amdgcn.lds.kernel.id !4 {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds) ]
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: call void @use_shared4()
; Note: use_shared4 uses module.lds so this will allocate at offset 4
define amdgpu_kernel void @expect_max_of_2_and_4() {
-; CHECK-LABEL: @expect_max_of_2_and_4() !llvm.amdgcn.lds.kernel.id !6 {
+; CHECK-LABEL: @expect_max_of_2_and_4() #2 !llvm.amdgcn.lds.kernel.id !6 {
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds) ]
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: call void @use_shared2()
attributes #0 = { noinline }
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
-; CHECK: declare void @llvm.donothing() #2
+; CHECK: declare void @llvm.donothing() #3
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-; CHECK: declare i32 @llvm.amdgcn.lds.kernel.id() #3
+; CHECK: declare i32 @llvm.amdgcn.lds.kernel.id() #4
; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }
; CHECK: attributes #1 = { noinline }
-; CHECK: attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #2 = { "amdgpu-lds-size"="4" }
+; CHECK: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: !0 = !{i64 0, i64 1}
; CHECK: !1 = !{i64 4, i64 5}
!2 = !{i32 1}
-; OPT: attributes #0 = { "amdgpu-elide-module-lds" }
-; OPT: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
-; OPT: attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; OPT: attributes #0 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="8" }
+; OPT: attributes #1 = { "amdgpu-lds-size"="8" }
+; OPT: attributes #2 = { "amdgpu-elide-module-lds" "amdgpu-lds-size"="12" }
+; OPT: attributes #3 = { "amdgpu-lds-size"="20" }
+; OPT: attributes #4 = { nocallback nofree nosync nounwind willreturn memory(none) }
+; OPT: attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; OPT: !0 = !{i64 0, i64 1}
; OPT: !1 = !{i64 4, i64 5}
; Doesn't access any via a function, won't be in the lookup table
define amdgpu_kernel void @kernel_no_table() {
-; OPT-LABEL: @kernel_no_table() {
+; OPT-LABEL: @kernel_no_table() #0 {
; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 8
; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
; Access two variables, will allocate those two
define amdgpu_kernel void @k01() {
-; OPT-LABEL: @k01() !llvm.amdgcn.lds.kernel.id !1 {
+; OPT-LABEL: @k01() #0 !llvm.amdgcn.lds.kernel.id !1 {
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ]
; OPT-NEXT: call void @f0()
; OPT-NEXT: call void @f1()
}
define amdgpu_kernel void @k23() {
-; OPT-LABEL: @k23() !llvm.amdgcn.lds.kernel.id !7 {
+; OPT-LABEL: @k23() #1 !llvm.amdgcn.lds.kernel.id !7 {
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ]
; OPT-NEXT: call void @f2()
; OPT-NEXT: call void @f3()
; Access and allocate three variables
define amdgpu_kernel void @k123() {
-; OPT-LABEL: @k123() !llvm.amdgcn.lds.kernel.id !13 {
+; OPT-LABEL: @k123() #2 !llvm.amdgcn.lds.kernel.id !13 {
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ]
; OPT-NEXT: call void @f1()
; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21
; OPT: declare i32 @llvm.amdgcn.lds.kernel.id()
+; OPT: attributes #0 = { "amdgpu-lds-size"="8" }
+; OPT: attributes #1 = { "amdgpu-lds-size"="12" }
+; OPT: attributes #2 = { "amdgpu-lds-size"="16" }
+
!0 = !{i64 0, i64 1}
!1 = !{i32 0}
!2 = !{i32 2}
}
; This kernel calls a function that uses LDS so needs the block
-; CHECK-LABEL: @kern_call()
+; CHECK-LABEL: @kern_call() #0
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
; CHECK: call void @func()
; CHECK: %dec = atomicrmw fsub ptr addrspace(3) @llvm.amdgcn.module.lds, float 2.000000e+00 monotonic, align 8
}
; This kernel does not alloc the LDS block as it makes no calls
-; CHECK-LABEL: @kern_empty()
+; CHECK-LABEL: @kern_empty() #1
; CHECK-NOT: call void @llvm.donothing()
define spir_kernel void @kern_empty() #0{
ret void
declare amdgpu_kernel void @kernel_declaration()
attributes #0 = { "amdgpu-elide-module-lds" }
-; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }
+
+; CHECK: attributes #0 = { "amdgpu-lds-size"="12" }
+; CHECK: attributes #1 = { "amdgpu-elide-module-lds" }