Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
WaveLimiter = WaveLimitAttr.getValueAsBool();
+  // FIXME: How is this attribute supposed to interact with statically known
+  // global sizes?
+  StringRef S = F.getFnAttribute("amdgpu-gds-size").getValueAsString();
+  if (!S.empty())
+    S.consumeInteger(0, GDSSize);
+
+  // Assume the attribute allocates before any known GDS globals.
+  StaticGDSSize = GDSSize;
+
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
Align Alignment =
DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
-  /// TODO: We should sort these to minimize wasted space due to alignment
-  /// padding. Currently the padding is decided by the first encountered use
-  /// during lowering.
-  unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
+  unsigned Offset;
+  if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    /// TODO: We should sort these to minimize wasted space due to alignment
+    /// padding. Currently the padding is decided by the first encountered use
+    /// during lowering.
+    Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
-  Entry.first->second = Offset;
-  StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
+    StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
-  // Update the LDS size considering the padding to align the dynamic shared
-  // memory.
-  LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+    // Update the LDS size considering the padding to align the dynamic shared
+    // memory.
+    LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+  } else {
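+    // Region address space (GDS) variables are laid out with the same scheme
+    // as LDS above, tracked by the separate GDS size counters.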
+    Offset = StaticGDSSize = alignTo(StaticGDSSize, Alignment);
+    StaticGDSSize += DL.getTypeAllocSize(GV.getValueType());
+    // FIXME: Apply alignment of dynamic GDS
+    GDSSize = StaticGDSSize;
+  }
+
+  Entry.first->second = Offset;
return Offset;
}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+@gds0 = internal addrspace(2) global [4 x i32] undef, align 4
+@lds0 = internal addrspace(3) global [4 x i32] undef, align 128
+@lds1 = internal addrspace(3) global [4 x i32] undef, align 256
+
+; These two objects should be allocated at the same constant offset from
+; their respective GDS and LDS base pointers, so both atomics use offset:12.
+define amdgpu_kernel void @alloc_lds_gds(i32 addrspace(1)* %out) #1 {
+; GCN-LABEL: alloc_lds_gds:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, 5
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_mov_b32 m0, 16
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_wbinvl1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_add_u32 v1, v0 offset:12
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_endpgm
+ %gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3
+ %val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel
+ %gep.lds = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3
+ %val1 = atomicrmw add i32 addrspace(3)* %gep.lds, i32 5 acq_rel
+ ret void
+}
+
+; The LDS alignment shouldn't change the offset of the GDS object.
+define amdgpu_kernel void @alloc_lds_gds_align(i32 addrspace(1)* %out) #1 {
+; GCN-LABEL: alloc_lds_gds_align:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, 5
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_mov_b32 m0, 16
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_wbinvl1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_add_u32 v1, v0 offset:140
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_add_u32 v1, v0 offset:12
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_endpgm
+ %gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3
+ %val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel
+
+ %gep.lds0 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3
+ %val1 = atomicrmw add i32 addrspace(3)* %gep.lds0, i32 5 acq_rel
+
+ %gep.lds1 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds1, i32 0, i32 3
+ %val2 = atomicrmw add i32 addrspace(3)* %gep.lds1, i32 5 acq_rel
+ ret void
+}
+
+@gds_align8 = internal addrspace(2) global [4 x i32] undef, align 8
+@gds_align32 = internal addrspace(2) global [4 x i32] undef, align 32
+
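+; Check the allocated offsets of two GDS globals with different alignments.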
+define amdgpu_kernel void @gds_global_align(i32 addrspace(1)* %out) {
+; GCN-LABEL: gds_global_align:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, 5
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_mov_b32 m0, 32
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: ds_add_u32 v1, v0 offset:28 gds
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_wbinvl1
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_wbinvl1
+; GCN-NEXT: s_endpgm
+ %gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3
+ %val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
+ %gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3
+ %val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel
+ ret void
+}
+
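+; With "amdgpu-gds-size"="1024", the globals are allocated after the 1024
+; bytes reserved by the attribute (offset:1036 and offset:1052, m0 = 0x420).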
+define amdgpu_kernel void @gds_global_align_plus_attr(i32 addrspace(1)* %out) #0 {
+; GCN-LABEL: gds_global_align_plus_attr:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, 5
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_movk_i32 m0, 0x420
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: ds_add_u32 v1, v0 offset:1052 gds
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_wbinvl1
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: ds_add_u32 v1, v0 offset:1036 gds
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_wbinvl1
+; GCN-NEXT: s_endpgm
+ %gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3
+ %val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
+ %gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3
+ %val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel
+ ret void
+}
+
+@small.gds = internal addrspace(2) global i8 undef, align 1
+@gds.external = external unnamed_addr addrspace(2) global [0 x i32], align 4
+
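+; @small.gds should be allocated after the 1024 bytes reserved by the
+; attribute (s1 = 0x400), making the total GDS size 0x401.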
+define amdgpu_kernel void @gds_extern_align(i32 addrspace(1)* %out, [4 x i32] addrspace(2)* %gds.arg) #0 {
+; GCN-LABEL: gds_extern_align:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x8
+; GCN-NEXT: v_mov_b32_e32 v0, 5
+; GCN-NEXT: s_movk_i32 m0, 0x401
+; GCN-NEXT: s_movk_i32 s1, 0x400
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use s1
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_wbinvl1
+; GCN-NEXT: s_endpgm
+ call void asm sideeffect "; use $0","s"(i8 addrspace(2)* @small.gds)
+ %gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* %gds.arg, i32 0, i32 3
+ %val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
+ ret void
+}
+
+attributes #0 = { "amdgpu-gds-size"="1024" }
+attributes #1 = { nounwind }