From 85f38901266a6e5ec9771a82efdcc16dcd364022 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 19 Jul 2019 19:47:30 +0000 Subject: [PATCH] AMDGPU: Force s_waitcnt after GWS instructions This is apparently required to be the immediately following instruction, so force it into a bundle with a waitcnt. llvm-svn: 366607 --- llvm/lib/Target/AMDGPU/DSInstructions.td | 5 ++++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 23 ++++++++++++++++++--- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 24 ++++++++++++++++++++-- .../test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll | 6 ++++-- 6 files changed, 52 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index c52eaaa..0cc21a6 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -317,13 +317,16 @@ class DS_GWS class DS_GWS_0D : DS_GWS; + (ins offset:$offset, gds:$gds), "$offset gds"> { + let hasSideEffects = 1; +} class DS_GWS_1D : DS_GWS { let has_gws_data0 = 1; + let hasSideEffects = 1; } class DS_VOID : DS_PseudogetInstrInfo(); + auto I = MI.getIterator(); + auto E = std::next(I); + + BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + MIBundleBuilder Bundler(*MBB, I, E); + finalizeBundle(*MBB, Bundler.begin()); +} + MachineBasicBlock * SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -3108,8 +3122,7 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MRI.setSimpleHint(Data0, Src->getReg()); } - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); + bundleInstWithWaitcnt(MI); unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -3828,8 +3841,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: case AMDGPU::DS_GWS_BARRIER: - if (getSubtarget()->hasGWSAutoReplay()) + // A s_waitcnt 0 is required to be the instruction immediately following. + if (getSubtarget()->hasGWSAutoReplay()) { + bundleInstWithWaitcnt(MI); return BB; + } + return emitGWSMemViolTestLoop(MI, BB); default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 1f8cf4f..27c6445 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -315,6 +315,7 @@ public: MachineBasicBlock *splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const; + void bundleInstWithWaitcnt(MachineInstr &MI) const; MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ba8ed69..1fce2db 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1531,7 +1531,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; } case TargetOpcode::BUNDLE: { - if (!MI.mayLoad()) + if (!MI.mayLoad() || MI.hasUnmodeledSideEffects()) return false; // If it is a load it must be a memory clause diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index a1affcc..756e86c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -4,6 +4,11 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s +; Make sure the op is emitted bundled with a waitcnt with and without the retry loop, and the bundle is not removed by ExpandPostRAPseudos. +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s + + ; Minimum offset ; GCN-LABEL: {{^}}gws_barrier_offset0: ; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] @@ -18,11 +23,19 @@ ; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1) ; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0 ; LOOP-NEXT: s_cbranch_scc1 [[LOOP]] + +; MIR-LABEL: name: gws_barrier_offset0{{$}} +; MIR: BUNDLE implicit{{( killed)?}} $vgpr0, implicit $m0, implicit $exec { +; MIR-NEXT: DS_GWS_BARRIER $vgpr0, 1, -1, implicit $m0, implicit $exec :: (load 4 from custom GWSResource) +; MIR-NEXT: S_WAITCNT 0 +; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0) ret void } +; MIR-LABEL: name: gws_barrier_offset63{{$}} + ; Maximum offset ; GCN-LABEL: {{^}}gws_barrier_offset63: ; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] @@ -103,7 +116,7 @@ define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) ; Make sure this increments lgkmcnt ; GCN-LABEL: {{^}}gws_barrier_lgkmcnt: ; NOLOOP: ds_gws_barrier v0 offset:1 gds{{$}} -; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOLOOP-NEXT: s_setpc_b64 define void @gws_barrier_lgkmcnt(i32 %val) { call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0) @@ -122,7 +135,7 @@ define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* % ; GCN-LABEL: {{^}}gws_barrier_wait_after: ; NOLOOP: ds_gws_barrier v0 offset:8 gds -; NOLOOP-NEXT: s_waitcnt expcnt(0){{$}} +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOLOOP-NEXT: load_dword define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 { call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) @@ -135,6 +148,7 @@ define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %p ; NOLOOP: store_dword ; NOLOOP: s_waitcnt vmcnt(0) lgkmcnt(0) ; NOLOOP: ds_gws_barrier v0 offset:8 gds +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 { store i32 0, i32 addrspace(1)* %ptr fence release @@ -142,9 +156,11 @@ define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* ret void } +; FIXME: Extra waitcnt ; GCN-LABEL: {{^}}gws_barrier_fence_after: ; NOLOOP: ds_gws_barrier v0 offset:8 gds ; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; NOLOOP-NEXT: load_dword define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 { @@ -158,7 +174,9 @@ define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* % ; GCN-LABEL: {{^}}gws_init_barrier: ; NOLOOP: s_mov_b32 m0, -1 ; NOLOOP: ds_gws_init v0 offset:8 gds +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 { call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7) call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) @@ -169,9 +187,11 @@ define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 { ; GCN-LABEL: {{^}}gws_init_fence_barrier: ; NOLOOP: s_mov_b32 m0, -1 ; NOLOOP: ds_gws_init v0 offset:8 gds +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 { call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7) fence release diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll index 075ec50..11f9264 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll @@ -111,7 +111,7 @@ define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 { ; GCN-LABEL: {{^}}gws_init_lgkmcnt: ; NOLOOP: ds_gws_init v0 offset:1 gds{{$}} -; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; NOLOOP-NEXT: s_setpc_b64 define void @gws_init_lgkmcnt(i32 %val) { call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0) @@ -120,8 +120,10 @@ define void @gws_init_lgkmcnt(i32 %val) { ; Does not imply memory fence on its own ; GCN-LABEL: {{^}}gws_init_wait_before: -; NOLOOP: s_waitcnt +; NOLOOP: s_waitcnt lgkmcnt(0) ; NOLOOP-NOT: s_waitcnt +; NOLOOP: ds_gws_init +; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 { store i32 0, i32 addrspace(1)* %ptr call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7) -- 2.7.4