From 9c1c1888d9895d05246005620953ee307d1a17f1 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Mon, 14 Nov 2022 15:54:01 +0200 Subject: [PATCH] intel/fs: put scratch surface in the surface state heap In 4ceaed7839af we made scratch surface state allocations part of the internal heap (mapped to STATE_BASE_ADDRESS::SurfaceStateBaseAddress) so that it doesn't uses slots in the application's expected 1M descriptors (especially with vkd3d-proton). But all our compiler code relies on BSS (STATE_BASE_ADDRESS::BindlessSurfaceStateBaseAddress). The additional issue is that there is only 26bits of surface offset available in CS instruction (CFE_STATE, 3DSTATE_VS, etc...) for scratch surfaces. So we need the drivers to put the scratch surfaces in the first chunk of STATE_BASE_ADDRESS::SurfaceStateBaseAddress (hence all the driver changes). Signed-off-by: Lionel Landwerlin Fixes: 4ceaed7839af ("anv: split internal surface states from descriptors") Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7687 Reviewed-by: Kenneth Graunke Part-of: --- src/gallium/drivers/iris/iris_state.c | 9 +++---- src/intel/compiler/brw_eu_defines.h | 5 ++++ src/intel/compiler/brw_fs_nir.cpp | 2 ++ src/intel/compiler/brw_fs_reg_allocate.cpp | 4 +-- src/intel/compiler/brw_lower_logical_sends.cpp | 18 +++++++++---- src/intel/vulkan/anv_allocator.c | 8 +++--- src/intel/vulkan/anv_batch_chain.c | 6 ++++- src/intel/vulkan/anv_device.c | 37 +++++++++++++++++++++----- src/intel/vulkan/anv_image.c | 2 ++ src/intel/vulkan/anv_private.h | 15 ++++++++--- src/intel/vulkan/genX_blorp_exec.c | 4 +++ src/intel/vulkan/genX_cmd_buffer.c | 19 ++++++++++--- 12 files changed, 98 insertions(+), 31 deletions(-) diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index dcef3b4..63051df 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -709,6 +709,9 @@ init_state_base_address(struct iris_batch *batch) sba.IndirectObjectMOCS = mocs; sba.InstructionMOCS = mocs; sba.SurfaceStateMOCS = mocs; +#if GFX_VER >= 9 + sba.BindlessSurfaceStateMOCS = mocs; +#endif sba.GeneralStateBaseAddressModifyEnable = true; sba.DynamicStateBaseAddressModifyEnable = true; @@ -717,12 +720,6 @@ init_state_base_address(struct iris_batch *batch) sba.GeneralStateBufferSizeModifyEnable = true; sba.DynamicStateBufferSizeModifyEnable = true; sba.SurfaceStateBaseAddressModifyEnable = true; -#if GFX_VER >= 9 - sba.BindlessSurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_SCRATCH_START); - sba.BindlessSurfaceStateSize = (IRIS_SCRATCH_ZONE_SIZE >> 12) - 1; - sba.BindlessSurfaceStateBaseAddressModifyEnable = true; - sba.BindlessSurfaceStateMOCS = mocs; -#endif #if GFX_VER >= 11 sba.BindlessSamplerStateMOCS = mocs; #endif diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 11996fe..b734804 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -1567,6 +1567,11 @@ enum brw_message_target { #define GFX8_BTI_STATELESS_NON_COHERENT 253 #define GFX9_BTI_BINDLESS 252 +/* This ID doesn't map anything HW related value. It exists to inform the + * lowering code to not use the bindless heap. + */ +#define GFX125_NON_BINDLESS (1u << 16) + /* Dataport atomic operations for Untyped Atomic Integer Operation message * (and others). */ diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index c5ee5ba..b29fc4b 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -5090,6 +5090,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0); ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), brw_imm_ud(~0x3ffu)); + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS); srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle; } else if (devinfo->ver >= 8) { srcs[SURFACE_LOGICAL_SRC_SURFACE] = @@ -5156,6 +5157,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0); ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), brw_imm_ud(~0x3ffu)); + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS); srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle; } else if (devinfo->ver >= 8) { srcs[SURFACE_LOGICAL_SRC_SURFACE] = diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index 1b6082e..9a6fe4d 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -843,7 +843,7 @@ fs_reg_alloc::emit_unspill(const fs_builder &bld, unspill_inst->sfid = GFX12_SFID_UGM; unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, unspill_inst->exec_size, - LSC_ADDR_SURFTYPE_BSS, + LSC_ADDR_SURFTYPE_SS, LSC_ADDR_SIZE_A32, 1 /* num_coordinates */, LSC_DATA_SIZE_D32, @@ -940,7 +940,7 @@ fs_reg_alloc::emit_spill(const fs_builder &bld, spill_inst->sfid = GFX12_SFID_UGM; spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, bld.dispatch_width(), - LSC_ADDR_SURFTYPE_BSS, + LSC_ADDR_SURFTYPE_SS, LSC_ADDR_SIZE_A32, 1 /* num_coordinates */, LSC_DATA_SIZE_D32, diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index 6d49124..1ff064d 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -1718,13 +1718,20 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst) else inst->sfid = GFX12_SFID_UGM; - /* We must have exactly one of surface and surface_handle */ - assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE)); + /* We should have exactly one of surface and surface_handle. For scratch + * messages generated by brw_fs_nir.cpp we also allow a special value to + * know what heap base we should use in STATE_BASE_ADDRESS (SS = Surface + * State Offset, or BSS = Bindless Surface State Offset). + */ + bool non_bindless = surface.file == IMM && surface.ud == GFX125_NON_BINDLESS; + assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE) || + (non_bindless && surface_handle.file != BAD_FILE)); enum lsc_addr_surface_type surf_type; - if (surface_handle.file != BAD_FILE) - surf_type = LSC_ADDR_SURFTYPE_BSS; - else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM) + if (surface_handle.file != BAD_FILE) { + assert(surface.file == IMM && (surface.ud == 0 || surface.ud == GFX125_NON_BINDLESS)); + surf_type = non_bindless ? LSC_ADDR_SURFTYPE_SS : LSC_ADDR_SURFTYPE_BSS; + } else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM) surf_type = LSC_ADDR_SURFTYPE_FLAT; else surf_type = LSC_ADDR_SURFTYPE_BTI; @@ -1801,6 +1808,7 @@ lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst) case LSC_ADDR_SURFTYPE_FLAT: inst->src[1] = brw_imm_ud(0); break; + case LSC_ADDR_SURFTYPE_SS: case LSC_ADDR_SURFTYPE_BSS: /* We assume that the driver provided the handle in the top 20 bits so * we can use the surface handle directly as the extended descriptor. diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c index 12f502f..a83ecf0 100644 --- a/src/intel/vulkan/anv_allocator.c +++ b/src/intel/vulkan/anv_allocator.c @@ -1197,7 +1197,7 @@ anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool for (unsigned i = 0; i < 16; i++) { if (pool->surf_states[i].map != NULL) { - anv_state_pool_free(&device->internal_surface_state_pool, + anv_state_pool_free(&device->scratch_surface_state_pool, pool->surf_states[i]); } } @@ -1274,6 +1274,8 @@ anv_scratch_pool_get_surf(struct anv_device *device, struct anv_scratch_pool *pool, unsigned per_thread_scratch) { + assert(device->info->verx10 >= 125); + if (per_thread_scratch == 0) return 0; @@ -1290,7 +1292,7 @@ anv_scratch_pool_get_surf(struct anv_device *device, struct anv_address addr = { .bo = bo }; struct anv_state state = - anv_state_pool_alloc(&device->internal_surface_state_pool, + anv_state_pool_alloc(&device->scratch_surface_state_pool, device->isl_dev.ss.size, 64); isl_buffer_fill_state(&device->isl_dev, state.map, @@ -1305,7 +1307,7 @@ anv_scratch_pool_get_surf(struct anv_device *device, uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2], 0, state.offset); if (current) { - anv_state_pool_free(&device->internal_surface_state_pool, state); + anv_state_pool_free(&device->scratch_surface_state_pool, state); return current; } else { pool->surf_states[scratch_size_log2] = state; diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c index 8615f21..1f618c7 100644 --- a/src/intel/vulkan/anv_batch_chain.c +++ b/src/intel/vulkan/anv_batch_chain.c @@ -1443,7 +1443,7 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf, } /* Add all the global BOs to the object list for softpin case. */ - result = pin_state_pool(device, execbuf, &device->internal_surface_state_pool); + result = pin_state_pool(device, execbuf, &device->scratch_surface_state_pool); if (result != VK_SUCCESS) return result; @@ -1451,6 +1451,10 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf, if (result != VK_SUCCESS) return result; + result = pin_state_pool(device, execbuf, &device->internal_surface_state_pool); + if (result != VK_SUCCESS) + return result; + result = pin_state_pool(device, execbuf, &device->dynamic_state_pool); if (result != VK_SUCCESS) return result; diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 51c7367..8179a35 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -3063,10 +3063,12 @@ decode_get_bo(void *v_batch, bool ppgtt, uint64_t address) return ret_bo; if (get_bo_from_pool(&ret_bo, &device->binding_table_pool.block_pool, address)) return ret_bo; - if (get_bo_from_pool(&ret_bo, &device->internal_surface_state_pool.block_pool, address)) + if (get_bo_from_pool(&ret_bo, &device->scratch_surface_state_pool.block_pool, address)) return ret_bo; if (get_bo_from_pool(&ret_bo, &device->bindless_surface_state_pool.block_pool, address)) return ret_bo; + if (get_bo_from_pool(&ret_bo, &device->internal_surface_state_pool.block_pool, address)) + return ret_bo; if (!device->cmd_buffer_being_decoded) return (struct intel_batch_decode_bo) { }; @@ -3437,11 +3439,27 @@ VkResult anv_CreateDevice( if (result != VK_SUCCESS) goto fail_dynamic_state_pool; - result = anv_state_pool_init(&device->internal_surface_state_pool, device, - "internal surface state pool", - INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096); + if (device->info->verx10 >= 125) { + /* Put the scratch surface states at the beginning of the internal + * surface state pool. + */ + result = anv_state_pool_init(&device->scratch_surface_state_pool, device, + "scratch surface state pool", + SCRATCH_SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096); + if (result != VK_SUCCESS) + goto fail_instruction_state_pool; + + result = anv_state_pool_init(&device->internal_surface_state_pool, device, + "internal surface state pool", + INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS, + SCRATCH_SURFACE_STATE_POOL_SIZE, 4096); + } else { + result = anv_state_pool_init(&device->internal_surface_state_pool, device, + "internal surface state pool", + INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096); + } if (result != VK_SUCCESS) - goto fail_instruction_state_pool; + goto fail_scratch_surface_state_pool; result = anv_state_pool_init(&device->bindless_surface_state_pool, device, "bindless surface state pool", @@ -3549,7 +3567,9 @@ VkResult anv_CreateDevice( * to zero and they have a valid descriptor. */ device->null_surface_state = - anv_state_pool_alloc(&device->internal_surface_state_pool, + anv_state_pool_alloc(device->info->verx10 >= 125 ? + &device->scratch_surface_state_pool : + &device->internal_surface_state_pool, device->isl_dev.ss.size, device->isl_dev.ss.align); isl_null_fill_state(&device->isl_dev, device->null_surface_state.map, @@ -3650,6 +3670,9 @@ VkResult anv_CreateDevice( anv_state_pool_finish(&device->bindless_surface_state_pool); fail_internal_surface_state_pool: anv_state_pool_finish(&device->internal_surface_state_pool); + fail_scratch_surface_state_pool: + if (device->info->verx10 >= 125) + anv_state_pool_finish(&device->scratch_surface_state_pool); fail_instruction_state_pool: anv_state_pool_finish(&device->instruction_state_pool); fail_dynamic_state_pool: @@ -3738,6 +3761,8 @@ void anv_DestroyDevice( } anv_state_pool_finish(&device->binding_table_pool); + if (device->info->verx10 >= 125) + anv_state_pool_finish(&device->scratch_surface_state_pool); anv_state_pool_finish(&device->internal_surface_state_pool); anv_state_pool_finish(&device->bindless_surface_state_pool); anv_state_pool_finish(&device->instruction_state_pool); diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c index 86d0e32..86d276b 100644 --- a/src/intel/vulkan/anv_image.c +++ b/src/intel/vulkan/anv_image.c @@ -2679,6 +2679,8 @@ anv_CreateImageView(VkDevice _device, .h = image->vk.extent.height, .d = image->vk.extent.depth, }); + + iview->planes[vplane].lowered_surface_state_is_null = true; } } } diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 467176d..71572cc 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -160,8 +160,10 @@ struct intel_perf_query_result; #define BINDING_TABLE_POOL_MAX_ADDRESS 0x00013fffffffULL #define INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS 0x000140000000ULL /* 5 GiB */ #define INTERNAL_SURFACE_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL -#define BINDLESS_SURFACE_STATE_POOL_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */ -#define BINDLESS_SURFACE_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL +#define SCRATCH_SURFACE_STATE_POOL_MIN_ADDRESS 0x000140000000ULL /* 5 GiB (8MiB overlaps surface state pool) */ +#define SCRATCH_SURFACE_STATE_POOL_MAX_ADDRESS 0x0001407fffffULL +#define BINDLESS_SURFACE_STATE_POOL_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB (64MiB) */ +#define BINDLESS_SURFACE_STATE_POOL_MAX_ADDRESS 0x0001c3ffffffULL #define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000200000000ULL /* 8 GiB */ #define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x00023fffffffULL #define CLIENT_VISIBLE_HEAP_MIN_ADDRESS 0x000240000000ULL /* 9 GiB */ @@ -177,10 +179,12 @@ struct intel_perf_query_result; #define BINDING_TABLE_POOL_SIZE \ (BINDING_TABLE_POOL_MAX_ADDRESS - BINDING_TABLE_POOL_MIN_ADDRESS + 1) #define BINDING_TABLE_POOL_BLOCK_SIZE (65536) -#define INTERNAL_SURFACE_STATE_POOL_SIZE \ - (INTERNAL_SURFACE_STATE_POOL_MAX_ADDRESS - INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS + 1) +#define SCRATCH_SURFACE_STATE_POOL_SIZE \ + (SCRATCH_SURFACE_STATE_POOL_MAX_ADDRESS - SCRATCH_SURFACE_STATE_POOL_MIN_ADDRESS + 1) #define BINDLESS_SURFACE_STATE_POOL_SIZE \ (BINDLESS_SURFACE_STATE_POOL_MAX_ADDRESS - BINDLESS_SURFACE_STATE_POOL_MIN_ADDRESS + 1) +#define INTERNAL_SURFACE_STATE_POOL_SIZE \ + (INTERNAL_SURFACE_STATE_POOL_MAX_ADDRESS - INTERNAL_SURFACE_STATE_POOL_MIN_ADDRESS + 1) #define INSTRUCTION_STATE_POOL_SIZE \ (INSTRUCTION_STATE_POOL_MAX_ADDRESS - INSTRUCTION_STATE_POOL_MIN_ADDRESS + 1) #define CLIENT_VISIBLE_HEAP_SIZE \ @@ -1158,6 +1162,7 @@ struct anv_device { struct anv_state_pool dynamic_state_pool; struct anv_state_pool instruction_state_pool; struct anv_state_pool binding_table_pool; + struct anv_state_pool scratch_surface_state_pool; struct anv_state_pool internal_surface_state_pool; struct anv_state_pool bindless_surface_state_pool; @@ -3831,6 +3836,8 @@ struct anv_image_view { */ struct anv_surface_state storage_surface_state; struct anv_surface_state lowered_storage_surface_state; + + bool lowered_surface_state_is_null; } planes[3]; }; diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index b45e796..0c14747 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -181,7 +181,11 @@ static uint32_t blorp_binding_table_offset_to_pointer(struct blorp_batch *batch, uint32_t offset) { +#if GFX_VERX10 >= 125 + return SCRATCH_SURFACE_STATE_POOL_SIZE + offset; +#else return offset; +#endif } static void * diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index def1cf2..33cd8dd 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2421,7 +2421,12 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, continue; } const struct anv_descriptor *desc = &set->descriptors[binding->index]; - + /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress + * heap. Depending on where the descriptor surface state is + * allocated, they can either come from + * device->internal_surface_state_pool or + * device->bindless_surface_state_pool. + */ switch (desc->type) { case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: case VK_DESCRIPTOR_TYPE_SAMPLER: @@ -2451,10 +2456,11 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, binding->lowered_storage_surface ? desc->image_view->planes[binding->plane].lowered_storage_surface_state : desc->image_view->planes[binding->plane].storage_surface_state; - surface_state = - anv_bindless_state_for_binding_table(sstate.state); + const bool lowered_surface_state_is_null = + desc->image_view->planes[binding->plane].lowered_surface_state_is_null; + surface_state = anv_bindless_state_for_binding_table(sstate.state); assert(surface_state.alloc_size); - if (surface_state.offset == 0) { + if (binding->lowered_storage_surface && lowered_surface_state_is_null) { mesa_loge("Bound a image to a descriptor where the " "descriptor does not have NonReadable " "set and the image does not have a " @@ -2752,7 +2758,12 @@ cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer, anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) { btp._3DCommandSubOpcode = binding_table_opcodes[s]; +#if GFX_VERX10 >= 125 + btp.PointertoVSBindingTable = SCRATCH_SURFACE_STATE_POOL_SIZE + + cmd_buffer->state.binding_tables[s].offset; +#else btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset; +#endif } } } -- 2.7.4