[AMDGPU] Omit unnecessary waitcnt before barriers
author	Austin Kerbow <Austin.Kerbow@amd.com>
Fri, 25 Feb 2022 07:26:51 +0000 (23:26 -0800)
committer	Austin Kerbow <Austin.Kerbow@amd.com>
Mon, 7 Mar 2022 16:23:53 +0000 (08:23 -0800)
It is not necessary to wait for all outstanding memory operations before
a barrier on hardware that can back off of the barrier if an exception
occurs while traps are enabled. Add a new subtarget feature,
back-off-barrier, that tracks which hardware has this ability.
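
As a rough sketch of the effect (mirroring the new
back-off-barrier-subtarget-feature.ll test added below; the function name
here is illustrative only), a load followed by a workgroup barrier no longer
needs an explicit s_waitcnt covering the load ahead of s_barrier on targets
with the back-off-barrier feature, e.g. gfx90a, while gfx900 still gets the
full wait:

  ; llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a example.ll
  ;   -> s_barrier may be emitted before the s_waitcnt for the load
  ;      (back-off-barrier is set)
  ; llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 example.ll
  ;   -> s_waitcnt vmcnt(0) lgkmcnt(0) is still inserted before s_barrier
  ;      (feature unset)
  define void @load_then_barrier(i32* %in, i32* %out) {
    %v = load i32, i32* %in
    call void @llvm.amdgcn.s.barrier()
    store i32 %v, i32* %out
    ret void
  }

  declare void @llvm.amdgcn.s.barrier()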

Reviewed By: #amdgpu, rampitec

Differential Revision: https://reviews.llvm.org/D120544

llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll [new file with mode: 0644]
llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll

index 0127626..5dc6874 100644 (file)
@@ -737,6 +737,12 @@ def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature <
   "Hardware automatically inserts waitcnt before barrier"
 >;
 
+def FeatureBackOffBarrier : SubtargetFeature <"back-off-barrier",
+  "BackOffBarrier",
+  "true",
+  "Hardware supports backing off s_barrier if an exception occurs"
+>;
+
 def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
   "HasTrigReducedRange",
   "true",
@@ -1025,7 +1031,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
    FeatureMadMacF32Insts,
    FeatureSupportsSRAMECC,
    FeaturePackedTID,
-   FullRate64Ops]>;
+   FullRate64Ops,
+   FeatureBackOffBarrier]>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
   [FeatureGFX9,
@@ -1059,7 +1066,8 @@ def FeatureISAVersion9_4_0 : FeatureSet<
    FeatureSupportsSRAMECC,
    FeaturePackedTID,
    FeatureArchitectedFlatScratch,
-   FullRate64Ops]>;
+   FullRate64Ops,
+   FeatureBackOffBarrier]>;
 
 // TODO: Organize more features into groups.
 def FeatureGroup {
@@ -1094,7 +1102,8 @@ def FeatureISAVersion10_1_0 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_1_1 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1116,7 +1125,8 @@ def FeatureISAVersion10_1_1 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_1_2 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1138,7 +1148,8 @@ def FeatureISAVersion10_1_2 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_1_3 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1156,7 +1167,8 @@ def FeatureISAVersion10_1_3 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_3_0 : FeatureSet<
   [FeatureGFX10,
@@ -1173,7 +1185,8 @@ def FeatureISAVersion10_3_0 : FeatureSet<
    FeatureNSAEncoding,
    FeatureNSAMaxSize13,
    FeatureWavefrontSize32,
-   FeatureShaderCyclesRegister]>;
+   FeatureShaderCyclesRegister,
+   FeatureBackOffBarrier]>;
 
 //===----------------------------------------------------------------------===//
 
index 687026a..fc9e424 100644 (file)
@@ -72,6 +72,7 @@ protected:
   // Dynamically set bits that enable features.
   bool FlatForGlobal;
   bool AutoWaitcntBeforeBarrier;
+  bool BackOffBarrier;
   bool UnalignedScratchAccess;
   bool UnalignedAccessMode;
   bool HasApertureRegs;
@@ -493,6 +494,12 @@ public:
     return AutoWaitcntBeforeBarrier;
   }
 
+  /// \returns true if the target supports backing off of s_barrier instructions
+  /// when an exception is raised.
+  bool supportsBackOffBarrier() const {
+    return BackOffBarrier;
+  }
+
   bool hasUnalignedBufferAccess() const {
     return UnalignedBufferAccess;
   }
index d6ea4c0..89820e3 100644 (file)
@@ -1135,12 +1135,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
     }
   }
 
-  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
-  // occurs before the instruction. Doing it here prevents any additional
-  // S_WAITCNTs from being emitted if the instruction was marked as
-  // requiring a WAITCNT beforehand.
+  // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
+  // not, we need to ensure the subtarget is capable of backing off barrier
+  // instructions in case there are any outstanding memory operations that may
+  // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
-      !ST->hasAutoWaitcntBeforeBarrier()) {
+      !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
new file mode 100644 (file)
index 0000000..337dcfc
--- /dev/null
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s
+
+; Subtargets must wait for outstanding memory instructions before a barrier if
+; they cannot back off of the barrier.
+
+define void @back_off_barrier_no_fence(i32* %in, i32* %out) #0 {
+; GFX9-NO-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX9-NO-BACKOFF:       ; %bb.0:
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_barrier
+; GFX9-NO-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX9-BACKOFF:       ; %bb.0:
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-BACKOFF-NEXT:    s_barrier
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX10-BACKOFF:       ; %bb.0:
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX10-BACKOFF-NEXT:    s_barrier
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+  %load = load i32, i32* %in
+  call void @llvm.amdgcn.s.barrier()
+  store i32 %load, i32* %out
+  ret void
+}
+
+define void @back_off_barrier_with_fence(i32* %in, i32* %out) #0 {
+; GFX9-NO-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX9-NO-BACKOFF:       ; %bb.0:
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_barrier
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX9-BACKOFF:       ; %bb.0:
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    s_barrier
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX10-BACKOFF:       ; %bb.0:
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    s_barrier
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    buffer_gl0_inv
+; GFX10-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+  %load = load i32, i32* %in
+  fence syncscope("workgroup") release
+  call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  store i32 %load, i32* %out
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier()
+
+attributes #0 = { nounwind }
index 5601d69..c273630 100644 (file)
@@ -35,7 +35,7 @@ body:             |
     ; GFX10: S_WAITCNT 0
     ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
-    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
     ; GFX10: S_BARRIER
     ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX10: S_WAITCNT 112
@@ -112,7 +112,7 @@ body:             |
     ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX10: S_WAITCNT 0
-    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
     ; GFX10: S_BARRIER
     ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX10: S_WAITCNT 112
index f1c5c5b..e78b535 100644 (file)
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx802  -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900  -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
 
 ; GCN-LABEL: barrier_vmcnt_global:
 ; GFX8:         flat_load_dword
@@ -42,7 +42,7 @@ bb:
   %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
   store i32 0, i32 addrspace(1)* %tmp5, align 4
   fence syncscope("singlethread") release
-  tail call void @llvm.amdgcn.s.barrier() #3
+  tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("singlethread") acquire
   %tmp6 = add nuw nsw i64 %tmp2, 4294967296
   %tmp7 = lshr exact i64 %tmp6, 32
@@ -116,7 +116,7 @@ bb:
   %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
   store i32 0, i32* %tmp5, align 4
   fence syncscope("singlethread") release
-  tail call void @llvm.amdgcn.s.barrier() #3
+  tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("singlethread") acquire
   %tmp6 = add nuw nsw i64 %tmp2, 4294967296
   %tmp7 = lshr exact i64 %tmp6, 32