}
}
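+/* Emit a single memory-fence (or interlock) message to the shared function
+ * identified by @sfid and return the register that receives the fence
+ * completion response, so the caller can stall on it if needed.
+ */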
+static fs_reg
+emit_fence(const fs_builder &bld, enum opcode opcode,
+ uint8_t sfid, bool commit_enable, uint8_t bti)
+{
+ assert(opcode == SHADER_OPCODE_INTERLOCK ||
+ opcode == SHADER_OPCODE_MEMORY_FENCE);
+
+ fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
+ fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0),
+ brw_imm_ud(commit_enable),
+ brw_imm_ud(bti));
+ fence->sfid = sfid;
+ return dst;
+}
+
void
fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
{
case nir_intrinsic_memory_barrier:
case nir_intrinsic_begin_invocation_interlock:
case nir_intrinsic_end_invocation_interlock: {
- bool l3_fence, slm_fence, tgm_fence = false;
+ bool ugm_fence, slm_fence, tgm_fence, urb_fence;
const enum opcode opcode =
instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
switch (instr->intrinsic) {
case nir_intrinsic_scoped_barrier: {
nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
- l3_fence = modes & (nir_var_shader_out |
- nir_var_mem_ssbo |
- nir_var_mem_global);
+ ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
slm_fence = modes & nir_var_mem_shared;
-
- /* NIR currently doesn't have an image mode */
- if (devinfo->has_lsc)
- tgm_fence = modes & nir_var_mem_ssbo;
+ tgm_fence = modes & nir_var_mem_ssbo;
+ urb_fence = modes & nir_var_shader_out;
break;
}
* Handling them here will allow the logic for IVB render cache (see
* below) to be reused.
*/
- l3_fence = true;
- slm_fence = false;
+ assert(stage == MESA_SHADER_FRAGMENT);
+ ugm_fence = tgm_fence = true;
+ slm_fence = urb_fence = false;
break;
default:
- l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
+ ugm_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared &&
+ instr->intrinsic != nir_intrinsic_memory_barrier_image;
slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier ||
instr->intrinsic == nir_intrinsic_memory_barrier_shared;
- tgm_fence = instr->intrinsic == nir_intrinsic_memory_barrier_image;
+ tgm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
+ instr->intrinsic == nir_intrinsic_memory_barrier ||
+ instr->intrinsic == nir_intrinsic_memory_barrier_image;
+ urb_fence = instr->intrinsic == nir_intrinsic_memory_barrier;
break;
}
slm_fence && workgroup_size() <= dispatch_width)
slm_fence = false;
- /* Prior to Gfx11, there's only L3 fence, so emit that instead. */
- if (slm_fence && devinfo->ver < 11) {
- slm_fence = false;
- l3_fence = true;
- }
-
- /* IVB does typed surface access through the render cache, so we need
- * to flush it too.
- */
- const bool needs_render_fence =
- devinfo->verx10 == 70;
-
- /* Be conservative in Gfx11+ and always stall in a fence. Since there
- * are two different fences, and shader might want to synchronize
- * between them.
- *
- * TODO: Use scope and visibility information for the barriers from NIR
- * to make a better decision on whether we need to stall.
- */
- const bool stall = devinfo->ver >= 11 || needs_render_fence ||
- instr->intrinsic == nir_intrinsic_end_invocation_interlock;
-
- const bool commit_enable = stall ||
- devinfo->ver >= 10; /* HSD ES # 1404612949 */
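+ /* Tessellation control is the only stage where one invocation can read
+ * outputs written by another invocation, so a URB fence is only
+ * meaningful there.
+ */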
+ if (stage != MESA_SHADER_TESS_CTRL)
+ urb_fence = false;
unsigned fence_regs_count = 0;
fs_reg fence_regs[3] = {};
const fs_builder ubld = bld.group(8, 0);
- if (l3_fence) {
- fs_inst *fence =
- ubld.emit(opcode,
- ubld.vgrf(BRW_REGISTER_TYPE_UD),
- brw_vec8_grf(0, 0),
- brw_imm_ud(commit_enable),
- brw_imm_ud(0 /* BTI; ignored for LSC */));
-
- fence->sfid = devinfo->has_lsc ?
- GFX12_SFID_UGM :
- GFX7_SFID_DATAPORT_DATA_CACHE;
-
- fence_regs[fence_regs_count++] = fence->dst;
-
- if (needs_render_fence) {
- fs_inst *render_fence =
- ubld.emit(opcode,
- ubld.vgrf(BRW_REGISTER_TYPE_UD),
- brw_vec8_grf(0, 0),
- brw_imm_ud(commit_enable),
- brw_imm_ud(/* bti */ 0));
- render_fence->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
-
- fence_regs[fence_regs_count++] = render_fence->dst;
+ if (devinfo->has_lsc) {
+ assert(devinfo->verx10 >= 125);
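+ /* With LSC, untyped global (UGM), typed/image (TGM), and shared local
+ * (SLM) memory are fenced separately, so emit one fence message per
+ * kind of traffic the barrier covers.
+ */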
+ if (ugm_fence) {
+ fence_regs[fence_regs_count++] =
+ emit_fence(ubld, opcode, GFX12_SFID_UGM,
+ true /* commit_enable */,
+ 0 /* bti; ignored for LSC */);
}
- /* Translate l3_fence into untyped and typed fence on XeHP */
- if (devinfo->has_lsc && tgm_fence) {
- fs_inst *fence =
- ubld.emit(opcode,
- ubld.vgrf(BRW_REGISTER_TYPE_UD),
- brw_vec8_grf(0, 0),
- brw_imm_ud(commit_enable),
- brw_imm_ud(/* ignored */0));
-
- fence->sfid = GFX12_SFID_TGM;
- fence_regs[fence_regs_count++] = fence->dst;
+ if (tgm_fence) {
+ fence_regs[fence_regs_count++] =
+ emit_fence(ubld, opcode, GFX12_SFID_TGM,
+ true /* commit_enable */,
+ 0 /* bti; ignored for LSC */);
}
- }
- if (slm_fence) {
- assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
- fs_inst *fence =
- ubld.emit(opcode,
- ubld.vgrf(BRW_REGISTER_TYPE_UD),
- brw_vec8_grf(0, 0),
- brw_imm_ud(commit_enable),
- brw_imm_ud(GFX7_BTI_SLM /* ignored for LSC */));
- if (devinfo->has_lsc)
- fence->sfid = GFX12_SFID_SLM;
- else
- fence->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
-
- fence_regs[fence_regs_count++] = fence->dst;
+ if (slm_fence) {
+ assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
+ fence_regs[fence_regs_count++] =
+ emit_fence(ubld, opcode, GFX12_SFID_SLM,
+ true /* commit_enable */,
+ 0 /* BTI; ignored for LSC */);
+ }
+
+ if (urb_fence) {
+ unreachable("TODO: Emit a URB barrier message");
+ }
+ } else if (devinfo->ver >= 11) {
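+ /* Pre-LSC Gfx11+: a single data-cache fence (BTI 0) covers typed,
+ * untyped, and URB traffic, while SLM is fenced by targeting the SLM
+ * BTI on the same shared function.
+ */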
+ if (tgm_fence || ugm_fence || urb_fence) {
+ fence_regs[fence_regs_count++] =
+ emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE,
+ true /* commit_enable HSD ES # 1404612949 */,
+ 0 /* BTI = 0 means data cache */);
+ }
+
+ if (slm_fence) {
+ assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
+ fence_regs[fence_regs_count++] =
+ emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE,
+ true /* commit_enable HSD ES # 1404612949 */,
+ GFX7_BTI_SLM);
+ }
+ } else {
+ /* Prior to Icelake, all of these memory types are lumped into a single
+ * cache, except on Ivy Bridge and Bay Trail where typed messages actually
+ * go through the render cache. There, we need both fences because we may
+ * access storage images as either typed or untyped.
+ */
+ const bool render_fence = tgm_fence && devinfo->verx10 == 70;
+
+ const bool commit_enable = render_fence ||
+ instr->intrinsic == nir_intrinsic_end_invocation_interlock;
+
+ if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
+ fence_regs[fence_regs_count++] =
+ emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE,
+ commit_enable, 0 /* BTI */);
+ }
+
+ if (render_fence) {
+ fence_regs[fence_regs_count++] =
+ emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE,
+ commit_enable, 0 /* BTI */);
+ }
}
- assert(fence_regs_count <= 3);
+ assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
- if (stall || fence_regs_count == 0) {
+ /* There are three cases where we want to insert a stall:
+ *
+ * 1. If we're a nir_intrinsic_end_invocation_interlock. This is
+ * required to ensure that the shader EOT doesn't happen until
+ * after the fence returns. Otherwise, we might end up with the
+ * next shader invocation for that pixel not respecting our fence
+ * because it may happen on a different HW thread.
+ *
+ * 2. If we have multiple fences. This is required to ensure that
+ * they all complete and nothing gets weirdly out-of-order.
+ *
+ * 3. If we have no fences. In this case, we need at least a
+ * scheduling barrier to keep the compiler from moving things
+ * around in an invalid way.
+ */
+ if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
+ fence_regs_count != 1) {
ubld.exec_all().group(1, 0).emit(
FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
fence_regs, fence_regs_count);