intel/fs: put scratch surface in the surface state heap
authorLionel Landwerlin <lionel.g.landwerlin@intel.com>
Mon, 14 Nov 2022 13:54:01 +0000 (15:54 +0200)
committerMarge Bot <emma+marge@anholt.net>
Sat, 19 Nov 2022 14:58:58 +0000 (14:58 +0000)
In 4ceaed7839af we made scratch surface state allocations part of the
internal heap (mapped to STATE_BASE_ADDRESS::SurfaceStateBaseAddress)
so that it doesn't uses slots in the application's expected 1M
descriptors (especially with vkd3d-proton).

But all our compiler code relies on BSS
(STATE_BASE_ADDRESS::BindlessSurfaceStateBaseAddress).

The additional issue is that there is only 26bits of surface offset
available in CS instruction (CFE_STATE, 3DSTATE_VS, etc...) for
scratch surfaces. So we need the drivers to put the scratch surfaces
in the first chunk of STATE_BASE_ADDRESS::SurfaceStateBaseAddress
(hence all the driver changes).

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 4ceaed7839af ("anv: split internal surface states from descriptors")
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7687
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19727>

12 files changed:
src/gallium/drivers/iris/iris_state.c
src/intel/compiler/brw_eu_defines.h
src/intel/compiler/brw_fs_nir.cpp
src/intel/compiler/brw_fs_reg_allocate.cpp
src/intel/compiler/brw_lower_logical_sends.cpp
src/intel/vulkan/anv_allocator.c
src/intel/vulkan/anv_batch_chain.c
src/intel/vulkan/anv_device.c
src/intel/vulkan/anv_image.c
src/intel/vulkan/anv_private.h
src/intel/vulkan/genX_blorp_exec.c
src/intel/vulkan/genX_cmd_buffer.c

index dcef3b4..63051df 100644 (file)
@@ -709,6 +709,9 @@ init_state_base_address(struct iris_batch *batch)
       sba.IndirectObjectMOCS          = mocs;
       sba.InstructionMOCS             = mocs;
       sba.SurfaceStateMOCS            = mocs;
+#if GFX_VER >= 9
+      sba.BindlessSurfaceStateMOCS    = mocs;
+#endif
 
       sba.GeneralStateBaseAddressModifyEnable   = true;
       sba.DynamicStateBaseAddressModifyEnable   = true;
@@ -717,12 +720,6 @@ init_state_base_address(struct iris_batch *batch)
       sba.GeneralStateBufferSizeModifyEnable    = true;
       sba.DynamicStateBufferSizeModifyEnable    = true;
       sba.SurfaceStateBaseAddressModifyEnable   = true;
-#if GFX_VER >= 9
-      sba.BindlessSurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_SCRATCH_START);
-      sba.BindlessSurfaceStateSize = (IRIS_SCRATCH_ZONE_SIZE >> 12) - 1;
-      sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
-      sba.BindlessSurfaceStateMOCS    = mocs;
-#endif
 #if GFX_VER >= 11
       sba.BindlessSamplerStateMOCS    = mocs;
 #endif
index 11996fe..b734804 100644 (file)
@@ -1567,6 +1567,11 @@ enum brw_message_target {
 #define GFX8_BTI_STATELESS_NON_COHERENT  253
 #define GFX9_BTI_BINDLESS                252
 
+/* This ID doesn't map anything HW related value. It exists to inform the
+ * lowering code to not use the bindless heap.
+ */
+#define GFX125_NON_BINDLESS              (1u << 16)
+
 /* Dataport atomic operations for Untyped Atomic Integer Operation message
  * (and others).
  */
index c5ee5ba..b29fc4b 100644 (file)
@@ -5090,6 +5090,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
          ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                           brw_imm_ud(~0x3ffu));
+         srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS);
          srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
       } else if (devinfo->ver >= 8) {
          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
@@ -5156,6 +5157,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
          ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                           brw_imm_ud(~0x3ffu));
+         srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS);
          srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
       } else if (devinfo->ver >= 8) {
          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
index 1b6082e..9a6fe4d 100644 (file)
@@ -843,7 +843,7 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld,
          unspill_inst->sfid = GFX12_SFID_UGM;
          unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                            unspill_inst->exec_size,
-                                           LSC_ADDR_SURFTYPE_BSS,
+                                           LSC_ADDR_SURFTYPE_SS,
                                            LSC_ADDR_SIZE_A32,
                                            1 /* num_coordinates */,
                                            LSC_DATA_SIZE_D32,
@@ -940,7 +940,7 @@ fs_reg_alloc::emit_spill(const fs_builder &bld,
          spill_inst->sfid = GFX12_SFID_UGM;
          spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
                                          bld.dispatch_width(),
-                                         LSC_ADDR_SURFTYPE_BSS,
+                                         LSC_ADDR_SURFTYPE_SS,
                                          LSC_ADDR_SIZE_A32,
                                          1 /* num_coordinates */,
                                          LSC_DATA_SIZE_D32,
index 6d49124..1ff064d 100644 (file)
@@ -1718,13 +1718,20 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
    else
       inst->sfid = GFX12_SFID_UGM;
 
-   /* We must have exactly one of surface and surface_handle */
-   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
+   /* We should have exactly one of surface and surface_handle. For scratch
+    * messages generated by brw_fs_nir.cpp we also allow a special value to
+    * know what heap base we should use in STATE_BASE_ADDRESS (SS = Surface
+    * State Offset, or BSS = Bindless Surface State Offset).
+    */
+   bool non_bindless = surface.file == IMM && surface.ud == GFX125_NON_BINDLESS;
+   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE) ||
+          (non_bindless && surface_handle.file != BAD_FILE));
 
    enum lsc_addr_surface_type surf_type;
-   if (surface_handle.file != BAD_FILE)
-      surf_type = LSC_ADDR_SURFTYPE_BSS;
-   else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
+   if (surface_handle.file != BAD_FILE) {
+      assert(surface.file == IMM && (surface.ud == 0 || surface.ud == GFX125_NON_BINDLESS));
+      surf_type = non_bindless ? LSC_ADDR_SURFTYPE_SS : LSC_ADDR_SURFTYPE_BSS;
+   } else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
       surf_type = LSC_ADDR_SURFTYPE_FLAT;
    else
       surf_type = LSC_ADDR_SURFTYPE_BTI;
@@ -1801,6 +1808,7 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
    case LSC_ADDR_SURFTYPE_FLAT:
       inst->src[1] = brw_imm_ud(0);
       break;
+   case LSC_ADDR_SURFTYPE_SS:
    case LSC_ADDR_SURFTYPE_BSS:
       /* We assume that the driver provided the handle in the top 20 bits so
        * we can use the surface handle directly as the extended descriptor.
index 12f502f..a83ecf0 100644 (file)
@@ -1197,7 +1197,7 @@ anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool
 
    for (unsigned i = 0; i < 16; i++) {
       if (pool->surf_states[i].map != NULL) {
-         anv_state_pool_free(&device->internal_surface_state_pool,
+         anv_state_pool_free(&device->scratch_surface_state_pool,
                              pool->surf_states[i]);
       }
    }
@@ -1274,6 +1274,8 @@ anv_scratch_pool_get_surf(struct anv_device *device,
                           struct anv_scratch_pool *pool,
                           unsigned per_thread_scratch)
 {
+   assert(device->info->verx10 >= 125);
+
    if (per_thread_scratch == 0)
       return 0;
 
@@ -1290,7 +1292,7 @@ anv_scratch_pool_get_surf(struct anv_device *device,
    struct anv_address addr = { .bo = bo };
 
    struct anv_state state =
-      anv_state_pool_alloc(&device->internal_surface_state_pool,
+      anv_state_pool_alloc(&device->scratch_surface_state_pool,
                            device->isl_dev.ss.size, 64);
 
    isl_buffer_fill_state(&device->isl_dev, state.map,
@@ -1305,7 +1307,7 @@ anv_scratch_pool_get_surf(struct anv_device *device,
    uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2],
                                        0, state.offset);
    if (current) {
-      anv_state_pool_free(&device->internal_surface_state_pool, state);
+      anv_state_pool_free(&device->scratch_surface_state_pool, state);
       return current;
    } else {
       pool->surf_states[scratch_size_log2] = state;
index 8615f21..1f618c7 100644 (file)
@@ -1443,7 +1443,7 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
    }
 
    /* Add all the global BOs to the object list for softpin case. */
-   result = pin_state_pool(device, execbuf, &device->internal_surface_state_pool);
+   result = pin_state_pool(device, execbuf, &device->scratch_surface_state_pool);
    if (result != VK_SUCCESS)
       return result;
 
@@ -1451,6 +1451,10 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
    if (result != VK_SUCCESS)
       return result;
 
+   result = pin_state_pool(device, execbuf, &device->internal_surface_state_pool);
+   if (result != VK_SUCCESS)
+      return result;
+
    result = pin_state_pool(device, execbuf, &device->dynamic_state_pool);
    if (result != VK_SUCCESS)
       return result;
index 51c7367..8179a35 100644 (file)
@@ -3063,10 +3063,12 @@ decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
       return ret_bo;
    if (get_bo_from_pool(&ret_bo, &device->binding_table_pool.block_pool, address))
       return ret_bo;
-   if (get_bo_from_pool(&ret_bo, &device->internal_surface_state_pool.block_pool, address))
+   if (get_bo_from_pool(&ret_bo, &device->scratch_surface_state_pool.block_pool, address))
       return ret_bo;
    if (get_bo_from_pool(&ret_bo, &device->bindless_surface_state_pool.block_pool, address))
       return ret_bo;
+   if (get_bo_from_pool(&ret_bo, &device->internal_surface_state_pool.block_pool, address))
+      return ret_bo;
 
    if (!device->cmd_buffer_being_decoded)
       return (struct intel_batch_decode_bo) { };
@@ -3437,11 +3439,27 @@ VkResult anv_CreateDevice(
    if (result != VK_SUCCESS)
       goto fail_dynamic_state_pool;
 
-   result = anv_state_pool_init(&device->internal_surface_state_pool, device,
-                                "internal surface state pool",
-                                INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096);
+   if (device->info->verx10 >= 125) {
+      /* Put the scratch surface states at the beginning of the internal
+       * surface state pool.
+       */
+      result = anv_state_pool_init(&device->scratch_surface_state_pool, device,
+                                   "scratch surface state pool",
+                                   SCRATCH_SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096);
+      if (result != VK_SUCCESS)
+         goto fail_instruction_state_pool;
+
+      result = anv_state_pool_init(&device->internal_surface_state_pool, device,
+                                   "internal surface state pool",
+                                   INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS,
+                                   SCRATCH_SURFACE_STATE_POOL_SIZE, 4096);
+   } else {
+      result = anv_state_pool_init(&device->internal_surface_state_pool, device,
+                                   "internal surface state pool",
+                                   INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096);
+   }
    if (result != VK_SUCCESS)
-      goto fail_instruction_state_pool;
+      goto fail_scratch_surface_state_pool;
 
    result = anv_state_pool_init(&device->bindless_surface_state_pool, device,
                                 "bindless surface state pool",
@@ -3549,7 +3567,9 @@ VkResult anv_CreateDevice(
     * to zero and they have a valid descriptor.
     */
    device->null_surface_state =
-      anv_state_pool_alloc(&device->internal_surface_state_pool,
+      anv_state_pool_alloc(device->info->verx10 >= 125 ?
+                           &device->scratch_surface_state_pool :
+                           &device->internal_surface_state_pool,
                            device->isl_dev.ss.size,
                            device->isl_dev.ss.align);
    isl_null_fill_state(&device->isl_dev, device->null_surface_state.map,
@@ -3650,6 +3670,9 @@ VkResult anv_CreateDevice(
    anv_state_pool_finish(&device->bindless_surface_state_pool);
  fail_internal_surface_state_pool:
    anv_state_pool_finish(&device->internal_surface_state_pool);
+ fail_scratch_surface_state_pool:
+   if (device->info->verx10 >= 125)
+      anv_state_pool_finish(&device->scratch_surface_state_pool);
  fail_instruction_state_pool:
    anv_state_pool_finish(&device->instruction_state_pool);
  fail_dynamic_state_pool:
@@ -3738,6 +3761,8 @@ void anv_DestroyDevice(
    }
 
    anv_state_pool_finish(&device->binding_table_pool);
+   if (device->info->verx10 >= 125)
+      anv_state_pool_finish(&device->scratch_surface_state_pool);
    anv_state_pool_finish(&device->internal_surface_state_pool);
    anv_state_pool_finish(&device->bindless_surface_state_pool);
    anv_state_pool_finish(&device->instruction_state_pool);
index 86d0e32..86d276b 100644 (file)
@@ -2679,6 +2679,8 @@ anv_CreateImageView(VkDevice _device,
                                    .h = image->vk.extent.height,
                                    .d = image->vk.extent.depth,
                                 });
+
+            iview->planes[vplane].lowered_surface_state_is_null = true;
          }
       }
    }
index 467176d..71572cc 100644 (file)
@@ -160,8 +160,10 @@ struct intel_perf_query_result;
 #define BINDING_TABLE_POOL_MAX_ADDRESS             0x00013fffffffULL
 #define INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS    0x000140000000ULL /* 5 GiB */
 #define INTERNAL_SURFACE_STATE_POOL_MAX_ADDRESS    0x0001bfffffffULL
-#define BINDLESS_SURFACE_STATE_POOL_MIN_ADDRESS    0x0001c0000000ULL /* 7 GiB */
-#define BINDLESS_SURFACE_STATE_POOL_MAX_ADDRESS    0x0001bfffffffULL
+#define SCRATCH_SURFACE_STATE_POOL_MIN_ADDRESS     0x000140000000ULL /* 5 GiB (8MiB overlaps surface state pool) */
+#define SCRATCH_SURFACE_STATE_POOL_MAX_ADDRESS     0x0001407fffffULL
+#define BINDLESS_SURFACE_STATE_POOL_MIN_ADDRESS    0x0001c0000000ULL /* 7 GiB (64MiB) */
+#define BINDLESS_SURFACE_STATE_POOL_MAX_ADDRESS    0x0001c3ffffffULL
 #define INSTRUCTION_STATE_POOL_MIN_ADDRESS         0x000200000000ULL /* 8 GiB */
 #define INSTRUCTION_STATE_POOL_MAX_ADDRESS         0x00023fffffffULL
 #define CLIENT_VISIBLE_HEAP_MIN_ADDRESS            0x000240000000ULL /* 9 GiB */
@@ -177,10 +179,12 @@ struct intel_perf_query_result;
 #define BINDING_TABLE_POOL_SIZE     \
    (BINDING_TABLE_POOL_MAX_ADDRESS - BINDING_TABLE_POOL_MIN_ADDRESS + 1)
 #define BINDING_TABLE_POOL_BLOCK_SIZE (65536)
-#define INTERNAL_SURFACE_STATE_POOL_SIZE \
-   (INTERNAL_SURFACE_STATE_POOL_MAX_ADDRESS - INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS + 1)
+#define SCRATCH_SURFACE_STATE_POOL_SIZE \
+   (SCRATCH_SURFACE_STATE_POOL_MAX_ADDRESS - SCRATCH_SURFACE_STATE_POOL_MIN_ADDRESS + 1)
 #define BINDLESS_SURFACE_STATE_POOL_SIZE \
    (BINDLESS_SURFACE_STATE_POOL_MAX_ADDRESS - BINDLESS_SURFACE_STATE_POOL_MIN_ADDRESS + 1)
+#define INTERNAL_SURFACE_STATE_POOL_SIZE \
+   (INTERNAL_SURFACE_STATE_POOL_MAX_ADDRESS - INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS + 1)
 #define INSTRUCTION_STATE_POOL_SIZE \
    (INSTRUCTION_STATE_POOL_MAX_ADDRESS - INSTRUCTION_STATE_POOL_MIN_ADDRESS + 1)
 #define CLIENT_VISIBLE_HEAP_SIZE               \
@@ -1158,6 +1162,7 @@ struct anv_device {
     struct anv_state_pool                       dynamic_state_pool;
     struct anv_state_pool                       instruction_state_pool;
     struct anv_state_pool                       binding_table_pool;
+    struct anv_state_pool                       scratch_surface_state_pool;
     struct anv_state_pool                       internal_surface_state_pool;
     struct anv_state_pool                       bindless_surface_state_pool;
 
@@ -3831,6 +3836,8 @@ struct anv_image_view {
        */
       struct anv_surface_state storage_surface_state;
       struct anv_surface_state lowered_storage_surface_state;
+
+      bool lowered_surface_state_is_null;
    } planes[3];
 };
 
index b45e796..0c14747 100644 (file)
@@ -181,7 +181,11 @@ static uint32_t
 blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
                                       uint32_t offset)
 {
+#if GFX_VERX10 >= 125
+   return SCRATCH_SURFACE_STATE_POOL_SIZE + offset;
+#else
    return offset;
+#endif
 }
 
 static void *
index def1cf2..33cd8dd 100644 (file)
@@ -2421,7 +2421,12 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
             continue;
          }
          const struct anv_descriptor *desc = &set->descriptors[binding->index];
-
+         /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress
+          * heap. Depending on where the descriptor surface state is
+          * allocated, they can either come from
+          * device->internal_surface_state_pool or
+          * device->bindless_surface_state_pool.
+          */
          switch (desc->type) {
          case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
          case VK_DESCRIPTOR_TYPE_SAMPLER:
@@ -2451,10 +2456,11 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
                   binding->lowered_storage_surface
                   ? desc->image_view->planes[binding->plane].lowered_storage_surface_state
                   : desc->image_view->planes[binding->plane].storage_surface_state;
-               surface_state =
-                  anv_bindless_state_for_binding_table(sstate.state);
+               const bool lowered_surface_state_is_null =
+                  desc->image_view->planes[binding->plane].lowered_surface_state_is_null;
+               surface_state = anv_bindless_state_for_binding_table(sstate.state);
                assert(surface_state.alloc_size);
-               if (surface_state.offset == 0) {
+               if (binding->lowered_storage_surface && lowered_surface_state_is_null) {
                   mesa_loge("Bound a image to a descriptor where the "
                             "descriptor does not have NonReadable "
                             "set and the image does not have a "
@@ -2752,7 +2758,12 @@ cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
       anv_batch_emit(&cmd_buffer->batch,
                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
          btp._3DCommandSubOpcode = binding_table_opcodes[s];
+#if GFX_VERX10 >= 125
+         btp.PointertoVSBindingTable = SCRATCH_SURFACE_STATE_POOL_SIZE +
+                                       cmd_buffer->state.binding_tables[s].offset;
+#else
          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
+#endif
       }
    }
 }