"Hardware automatically inserts waitcnt before barrier"
>;
+def FeatureBackOffBarrier : SubtargetFeature <"back-off-barrier", // -mattr name
+ "BackOffBarrier", // subtarget field set by this feature
+ "true", // value stored in that field
+ "Hardware supports backing off s_barrier if an exception occurs"
+>;
+
def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
"HasTrigReducedRange",
"true",
FeatureMadMacF32Insts,
FeatureSupportsSRAMECC,
FeaturePackedTID,
- FullRate64Ops]>;
+ FullRate64Ops,
+ FeatureBackOffBarrier]>;
def FeatureISAVersion9_0_C : FeatureSet<
[FeatureGFX9,
FeatureSupportsSRAMECC,
FeaturePackedTID,
FeatureArchitectedFlatScratch,
- FullRate64Ops]>;
+ FullRate64Ops,
+ FeatureBackOffBarrier]>;
// TODO: Organize more features into groups.
def FeatureGroup {
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
- FeatureSupportsXNACK])>;
+ FeatureSupportsXNACK,
+ FeatureBackOffBarrier])>;
def FeatureISAVersion10_1_1 : FeatureSet<
!listconcat(FeatureGroup.GFX10_1_Bugs,
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
- FeatureSupportsXNACK])>;
+ FeatureSupportsXNACK,
+ FeatureBackOffBarrier])>;
def FeatureISAVersion10_1_2 : FeatureSet<
!listconcat(FeatureGroup.GFX10_1_Bugs,
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
- FeatureSupportsXNACK])>;
+ FeatureSupportsXNACK,
+ FeatureBackOffBarrier])>;
def FeatureISAVersion10_1_3 : FeatureSet<
!listconcat(FeatureGroup.GFX10_1_Bugs,
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
- FeatureSupportsXNACK])>;
+ FeatureSupportsXNACK,
+ FeatureBackOffBarrier])>;
def FeatureISAVersion10_3_0 : FeatureSet<
[FeatureGFX10,
FeatureNSAEncoding,
FeatureNSAMaxSize13,
FeatureWavefrontSize32,
- FeatureShaderCyclesRegister]>;
+ FeatureShaderCyclesRegister,
+ FeatureBackOffBarrier]>;
//===----------------------------------------------------------------------===//
// Dynamically set bits that enable features.
bool FlatForGlobal;
bool AutoWaitcntBeforeBarrier;
+ bool BackOffBarrier;
bool UnalignedScratchAccess;
bool UnalignedAccessMode;
bool HasApertureRegs;
return AutoWaitcntBeforeBarrier;
}
+ /// \returns true if the subtarget can back off of an s_barrier instruction
+ /// when an exception is raised (the BackOffBarrier subtarget feature).
+ bool supportsBackOffBarrier() const {
+ return BackOffBarrier;
+ }
+
bool hasUnalignedBufferAccess() const {
return UnalignedBufferAccess;
}
}
}
- // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
- // occurs before the instruction. Doing it here prevents any additional
- // S_WAITCNTs from being emitted if the instruction was marked as
- // requiring a WAITCNT beforehand.
+ // If the subtarget neither inserts an implicit S_WAITCNT 0 before barriers
+ // nor can back off of a barrier when an outstanding memory operation raises
+ // an exception, all outstanding memory operations must complete before the
+ // S_BARRIER executes, so insert an explicit S_WAITCNT 0 here.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
- !ST->hasAutoWaitcntBeforeBarrier()) {
+ !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s
+
+; Subtargets that cannot back off of a barrier must wait for all outstanding
+; memory instructions to complete before the barrier.
+
+define void @back_off_barrier_no_fence(i32* %in, i32* %out) #0 {
+; GFX9-NO-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX9-NO-BACKOFF: ; %bb.0:
+; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
+; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT: s_barrier
+; GFX9-NO-BACKOFF-NEXT: flat_store_dword v[2:3], v0
+; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX9-BACKOFF: ; %bb.0:
+; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
+; GFX9-BACKOFF-NEXT: s_barrier
+; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT: flat_store_dword v[2:3], v0
+; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX10-BACKOFF: ; %bb.0:
+; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
+; GFX10-BACKOFF-NEXT: s_barrier
+; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT: flat_store_dword v[2:3], v0
+; GFX10-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT: s_setpc_b64 s[30:31]
+ %load = load i32, i32* %in
+ call void @llvm.amdgcn.s.barrier()
+ store i32 %load, i32* %out
+ ret void
+}
+
+define void @back_off_barrier_with_fence(i32* %in, i32* %out) #0 {
+; GFX9-NO-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX9-NO-BACKOFF: ; %bb.0:
+; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
+; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT: s_barrier
+; GFX9-NO-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT: flat_store_dword v[2:3], v0
+; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX9-BACKOFF: ; %bb.0:
+; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
+; GFX9-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-BACKOFF-NEXT: s_barrier
+; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT: flat_store_dword v[2:3], v0
+; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX10-BACKOFF: ; %bb.0:
+; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
+; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT: s_barrier
+; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT: buffer_gl0_inv
+; GFX10-BACKOFF-NEXT: flat_store_dword v[2:3], v0
+; GFX10-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT: s_setpc_b64 s[30:31]
+ %load = load i32, i32* %in
+ fence syncscope("workgroup") release
+ call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ store i32 %load, i32* %out
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier()
+
+attributes #0 = { nounwind }
; GFX10: S_WAITCNT 0
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
- ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
; GFX10: S_BARRIER
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_WAITCNT 112
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
; GFX10: S_WAITCNT 0
- ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
; GFX10: S_BARRIER
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_WAITCNT 112
; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
; GCN-LABEL: barrier_vmcnt_global:
; GFX8: flat_load_dword
%tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
store i32 0, i32 addrspace(1)* %tmp5, align 4
fence syncscope("singlethread") release
- tail call void @llvm.amdgcn.s.barrier() #3
+ tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp6 = add nuw nsw i64 %tmp2, 4294967296
%tmp7 = lshr exact i64 %tmp6, 32
%tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
store i32 0, i32* %tmp5, align 4
fence syncscope("singlethread") release
- tail call void @llvm.amdgcn.s.barrier() #3
+ tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp6 = add nuw nsw i64 %tmp2, 4294967296
%tmp7 = lshr exact i64 %tmp6, 32