From d9bf8aba2371c8deadfabe90d52431d4bb7ab6cc Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Mon, 13 Mar 2023 16:37:44 +0100 Subject: [PATCH] [AMDGPU] Add MMOs for GFX11 Streamout Instructions The GFX11 NGG Streamout Instructions perform atomic operations on dedicated registers. At the moment, they lack machine memory operands, which causes the si-memory-legalizer pass to treat them conservatively and introduce several unnecessary waits and cache invalidations. This patch introduces a new address space to represent these special registers and teaches instruction selection to add memory operands with this new address space to DS_ADD/SUB_GS_REG_RTN. Since this address space is meant to be compiler-internal, we move it up a bit from the other address spaces and give it the number 128. According to the LLVM Language Reference, address space numbers can go all the way up to 2^24, but I'm not sure how well this is supported in practice [1], so using a smaller number seems safer. [1] https://github.com/llvm/llvm-project/blob/0107513fe79da7670e37c29c0862794a2213a89c/llvm/utils/TableGen/IntrinsicEmitter.cpp#L401 Differential Revision: https://reviews.llvm.org/D146031 --- llvm/docs/AMDGPUUsage.rst | 8 ++++++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 ++++-- llvm/lib/Target/AMDGPU/AMDGPU.h | 4 ++++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 +++++++++ .../AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll | 22 ---------------------- .../AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll | 22 ---------------------- 6 files changed, 25 insertions(+), 46 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index c61e070..cb774a0 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -675,6 +675,7 @@ supported for the ``amdgcn`` target. 
Private 5 private scratch 32 0xFFFFFFFF Constant 32-bit 6 *TODO* 0x00000000 Buffer Fat Pointer (experimental) 7 *TODO* + Streamout Registers 128 N/A GS_REGS ================================= =============== =========== ================ ======= ============================ **Generic** @@ -783,6 +784,13 @@ supported for the ``amdgcn`` target. model the buffer descriptors used heavily in graphics workloads targeting the backend. +**Streamout Registers** + Dedicated registers used by the GS NGG Streamout Instructions. The register + file is modelled as a memory in a distinct address space because it is indexed + by an address-like offset in place of named registers, and because register + accesses affect LGKMcnt. This is an internal address space used only by the + compiler. Do not use this address space for IR pointers. + .. _amdgpu-memory-scopes: Memory Scopes diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 03d35e4..48960ee 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2000,12 +2000,14 @@ def int_amdgcn_permlane64 : def int_amdgcn_ds_add_gs_reg_rtn : ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">, Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], - [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand]>; def int_amdgcn_ds_sub_gs_reg_rtn : ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">, Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], - [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; + [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], + "", [SDNPMemOperand]>; def int_amdgcn_ds_bvh_stack_rtn : Intrinsic< diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 458c8b6..10a7f6c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ 
b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -376,6 +376,10 @@ enum : unsigned { BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers. + /// Internal address spaces. Can be freely renumbered. + STREAMOUT_REGISTER = 128, ///< Address space for GS NGG Streamout registers. + /// end Internal address spaces. + /// Address space for direct addressable parameter memory (CONST0). PARAM_D_ADDRESS = 6, /// Address space for indirect addressable parameter memory (VTX1). diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9a48328..a020069 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1096,6 +1096,15 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } + case Intrinsic::amdgcn_ds_add_gs_reg_rtn: + case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); + Info.ptrVal = nullptr; + Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: { Info.opc = ISD::INTRINSIC_W_CHAIN; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll index 9c0bacd..70fa1af 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll @@ -8,13 +8,7 @@ declare i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32, i32 immarg) define amdgpu_gs void @test_add_32(i32 %arg) { ; CHECK-LABEL: test_add_32: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:16 gds -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: buffer_gl0_inv -; CHECK-NEXT: 
buffer_gl1_inv ; CHECK-NEXT: s_endpgm %unused = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16) ret void @@ -23,13 +17,8 @@ define amdgpu_gs void @test_add_32(i32 %arg) { define amdgpu_gs void @test_add_32_use(i32 %arg, ptr addrspace(1) %out) { ; CHECK-LABEL: test_add_32_use: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: ds_add_gs_reg_rtn v[3:4], v0 offset:16 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: buffer_gl0_inv -; CHECK-NEXT: buffer_gl1_inv ; CHECK-NEXT: global_store_b32 v[1:2], v3, off ; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm @@ -41,13 +30,7 @@ define amdgpu_gs void @test_add_32_use(i32 %arg, ptr addrspace(1) %out) { define amdgpu_gs void @test_add_64(i32 %arg) { ; CHECK-LABEL: test_add_64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:32 gds -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: buffer_gl0_inv -; CHECK-NEXT: buffer_gl1_inv ; CHECK-NEXT: s_endpgm %unused = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32) ret void @@ -56,13 +39,8 @@ define amdgpu_gs void @test_add_64(i32 %arg) { define amdgpu_gs void @test_add_64_use(i32 %arg, ptr addrspace(1) %out) { ; CHECK-LABEL: test_add_64_use: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: ds_add_gs_reg_rtn v[3:4], v0 offset:32 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: buffer_gl0_inv -; CHECK-NEXT: buffer_gl1_inv ; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off ; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll index 7f0f2c3..f92dd5e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll @@ -8,13 +8,7 @@ declare i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32, i32 immarg) define amdgpu_gs void @test_sub_32(i32 %arg) { ; CHECK-LABEL: test_sub_32: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:16 gds -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: buffer_gl0_inv -; CHECK-NEXT: buffer_gl1_inv ; CHECK-NEXT: s_endpgm %unused = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16) ret void @@ -23,13 +17,8 @@ define amdgpu_gs void @test_sub_32(i32 %arg) { define amdgpu_gs void @test_sub_32_use(i32 %arg, ptr addrspace(1) %out) { ; CHECK-LABEL: test_sub_32_use: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: ds_sub_gs_reg_rtn v[3:4], v0 offset:16 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: buffer_gl0_inv -; CHECK-NEXT: buffer_gl1_inv ; CHECK-NEXT: global_store_b32 v[1:2], v3, off ; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm @@ -41,13 +30,7 @@ define amdgpu_gs void @test_sub_32_use(i32 %arg, ptr addrspace(1) %out) { define amdgpu_gs void @test_sub_64(i32 %arg) { ; CHECK-LABEL: test_sub_64: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:32 gds -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: buffer_gl0_inv -; CHECK-NEXT: buffer_gl1_inv ; CHECK-NEXT: s_endpgm %unused = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32) ret void @@ -56,13 +39,8 @@ define amdgpu_gs void @test_sub_64(i32 %arg) 
{ define amdgpu_gs void @test_sub_64_use(i32 %arg, ptr addrspace(1) %out) { ; CHECK-LABEL: test_sub_64_use: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: ds_sub_gs_reg_rtn v[3:4], v0 offset:32 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: buffer_gl0_inv -; CHECK-NEXT: buffer_gl1_inv ; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off ; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm -- 2.7.4