The GFX11 NGG Streamout Instructions perform atomic operations on
dedicated registers. At the moment, they lack machine memory operands,
which causes the si-memory-legalizer pass to treat them conservatively
and introduce several unnecessary waits and cache invalidations.
This patch introduces a new address space to represent these special
registers and teaches instruction selection to add memory operands with
this new address space to DS_ADD/SUB_GS_REG_RTN.
Since this address space is meant to be compiler-internal, we place it
well above the other address spaces and give it the number 128.
According to the LLVM Language Reference, address space numbers can go
up to 2^24 - 1, but I'm not sure how well numbers that large are
supported in practice [1], so using a smaller one seems safer.
[1] https://github.com/llvm/llvm-project/blob/
0107513fe79da7670e37c29c0862794a2213a89c/llvm/utils/TableGen/IntrinsicEmitter.cpp#L401
Differential Revision: https://reviews.llvm.org/D146031
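
For illustration, the change boils down to attaching a MachineMemOperand
in the new address space to the selected instruction. The sketch below is
not code from this patch; addStreamoutMMO is a hypothetical helper, and
MF/MI stand for the current MachineFunction and the selected
DS_ADD/SUB_GS_REG_RTN instruction.

#include "AMDGPU.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/Support/Alignment.h"

// Hypothetical helper: attach a memory operand in the streamout address
// space so later passes know exactly which "memory" the instruction touches.
static void addStreamoutMMO(llvm::MachineFunction &MF,
                            llvm::MachineInstr &MI) {
  using namespace llvm;
  MachinePointerInfo PtrInfo(AMDGPUAS::STREAMOUT_REGISTER);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
      /*Size=*/4, Align(4));
  MI.addMemOperand(MF, MMO);
}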
Private                           5               private     scratch          32      0xFFFFFFFF
Constant 32-bit                   6               *TODO*                               0x00000000
Buffer Fat Pointer (experimental) 7               *TODO*
+Streamout Registers              128             N/A         GS_REGS
================================= =============== =========== ================ ======= ============================
**Generic**
model the buffer descriptors used heavily in graphics workloads targeting
the backend.
+**Streamout Registers**
+ Dedicated registers used by the GS NGG Streamout Instructions. The register
+ file is modelled as a memory in a distinct address space because it is indexed
+ by an address-like offset in place of named registers, and because register
+ accesses affect LGKMcnt. This is an internal address space used only by the
+ compiler. Do not use this address space for IR pointers.
+
.. _amdgpu-memory-scopes:
Memory Scopes
def int_amdgcn_ds_add_gs_reg_rtn :
ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
- [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+ [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]>;
def int_amdgcn_ds_sub_gs_reg_rtn :
ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
- [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+ [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]>;
def int_amdgcn_ds_bvh_stack_rtn :
Intrinsic<
BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers.
+ /// Internal address spaces. Can be freely renumbered.
+ STREAMOUT_REGISTER = 128, ///< Address space for GS NGG Streamout registers.
+ /// end Internal address spaces.
+
/// Address space for direct addressable parameter memory (CONST0).
PARAM_D_ADDRESS = 6,
/// Address space for indirect addressable parameter memory (VTX1).
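
For what it's worth, the 24-bit limit mentioned in the commit message
could be documented with an assertion along these lines (illustrative
only, not part of this patch):

#include "AMDGPU.h"

// Address space numbers in LLVM IR are 24-bit values, so the
// compiler-internal 128 is comfortably in range.
static_assert(llvm::AMDGPUAS::STREAMOUT_REGISTER < (1u << 24),
              "address space numbers must fit in 24 bits");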
return true;
}
+ case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
+ case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
+ Info.ptrVal = nullptr;
+ Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
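
Once the memory operand is attached, downstream passes such as
si-memory-legalizer can recognize these accesses by address space instead
of assuming the worst. A minimal sketch of such a query (the helper name
is hypothetical, not code from this patch):

#include "AMDGPU.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineMemOperand.h"

// Hypothetical query: does this instruction only touch the GS NGG
// streamout register file?
static bool isStreamoutRegisterAccess(const llvm::MachineInstr &MI) {
  for (const llvm::MachineMemOperand *MMO : MI.memoperands())
    if (MMO->getAddrSpace() == llvm::AMDGPUAS::STREAMOUT_REGISTER)
      return true;
  return false;
}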
define amdgpu_gs void @test_add_32(i32 %arg) {
; CHECK-LABEL: test_add_32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:16 gds
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT: buffer_gl0_inv
-; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: s_endpgm
%unused = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
ret void
define amdgpu_gs void @test_add_32_use(i32 %arg, ptr addrspace(1) %out) {
; CHECK-LABEL: test_add_32_use:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: ds_add_gs_reg_rtn v[3:4], v0 offset:16 gds
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT: buffer_gl0_inv
-; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: global_store_b32 v[1:2], v3, off
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
define amdgpu_gs void @test_add_64(i32 %arg) {
; CHECK-LABEL: test_add_64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:32 gds
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT: buffer_gl0_inv
-; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: s_endpgm
%unused = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
ret void
define amdgpu_gs void @test_add_64_use(i32 %arg, ptr addrspace(1) %out) {
; CHECK-LABEL: test_add_64_use:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: ds_add_gs_reg_rtn v[3:4], v0 offset:32 gds
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT: buffer_gl0_inv
-; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
define amdgpu_gs void @test_sub_32(i32 %arg) {
; CHECK-LABEL: test_sub_32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:16 gds
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT: buffer_gl0_inv
-; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: s_endpgm
%unused = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
ret void
define amdgpu_gs void @test_sub_32_use(i32 %arg, ptr addrspace(1) %out) {
; CHECK-LABEL: test_sub_32_use:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: ds_sub_gs_reg_rtn v[3:4], v0 offset:16 gds
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT: buffer_gl0_inv
-; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: global_store_b32 v[1:2], v3, off
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm
define amdgpu_gs void @test_sub_64(i32 %arg) {
; CHECK-LABEL: test_sub_64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:32 gds
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT: buffer_gl0_inv
-; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: s_endpgm
%unused = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
ret void
define amdgpu_gs void @test_sub_64_use(i32 %arg, ptr addrspace(1) %out) {
; CHECK-LABEL: test_sub_64_use:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: ds_sub_gs_reg_rtn v[3:4], v0 offset:32 gds
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
-; CHECK-NEXT: buffer_gl0_inv
-; CHECK-NEXT: buffer_gl1_inv
; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; CHECK-NEXT: s_endpgm