anv, iris: Disable prefetching the binding table entries on DG2
author Rohan Garg <rohan.garg@intel.com>
Tue, 6 Sep 2022 15:31:51 +0000 (17:31 +0200)
committer Rohan Garg <rohan.garg@intel.com>
Tue, 11 Oct 2022 13:16:09 +0000 (15:16 +0200)
On DG2, the HW fetches the binding table entries into the cache
for every single thread when a compute walker is dispatched,
wiping out the advantages of the cache prefetch.

The spec also advises against doing a cache prefetch when there are more
than 31 binding table entries, but most real-world applications will never
hit that limit.

Signed-off-by: Rohan Garg <rohan.garg@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18498>

src/gallium/drivers/iris/iris_state.c
src/intel/vulkan/genX_cmd_buffer.c
src/intel/vulkan/genX_pipeline.c

index 12f13cc..3afb152 100644 (file)
@@ -4741,7 +4741,9 @@ iris_store_cs_state(const struct intel_device_info *devinfo,
       assert(cs_prog_data->push.cross_thread.regs == 0);
 #endif
       desc.BarrierEnable = cs_prog_data->uses_barrier;
-      desc.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
+      /* Typically set to 0 to avoid prefetching on every thread dispatch. */
+      desc.BindingTableEntryCount = devinfo->verx10 == 125 ?
+         0 : MIN2(shader->bt.size_bytes / 4, 31);
       desc.SamplerCount = encode_sampler_count(shader);
 #if GFX_VER >= 12
       /* TODO: Check if we are missing workarounds and enable mid-thread
@@ -7212,7 +7214,9 @@ iris_upload_compute_walker(struct iris_context *ice,
          .SamplerStatePointer = shs->sampler_table.offset,
          .SamplerCount = encode_sampler_count(shader),
          .BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE],
-         .BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31),
+         /* Typically set to 0 to avoid prefetching on every thread dispatch. */
+         .BindingTableEntryCount = devinfo->verx10 == 125 ?
+            0 : MIN2(shader->bt.size_bytes / 4, 31),
       };
 
       assert(brw_cs_push_const_total_size(cs_prog_data, dispatch.threads) == 0);
index 70cbceb..90dd688 100644 (file)
@@ -5096,8 +5096,9 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
             cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
          .BindingTablePointer =
             cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
-         .BindingTableEntryCount =
-            1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
+         /* Typically set to 0 to avoid prefetching on every thread dispatch. */
+         .BindingTableEntryCount = devinfo->verx10 == 125 ?
+            0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
          .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
          .SharedLocalMemorySize = encode_slm_size(GFX_VER,
                                                   prog_data->base.total_shared),
index 4a04be3..44a9855 100644 (file)
@@ -2169,8 +2169,11 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
       .SamplerCount           = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
       /* We add 1 because the CS indirect parameters buffer isn't accounted
        * for in bind_map.surface_count.
+       *
+       * Typically set to 0 to avoid prefetching on every thread dispatch.
        */
-      .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30),
+      .BindingTableEntryCount = devinfo->verx10 == 125 ?
+         0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
       .BarrierEnable          = cs_prog_data->uses_barrier,
       .SharedLocalMemorySize  =
          encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),