From 98dc179c1e094ab42346b23fe046ebb719b66ed4 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Thu, 7 Nov 2019 11:28:47 -0600
Subject: [PATCH] anv: More carefully dirty state in BindPipeline

Instead of blindly dirtying descriptors and push constants the moment
we see a pipeline change, check to see if it actually changes the bind
layout or push constant layout.  This doubles the runtime performance
of one CPU-limited example running with the Dawn WebGPU implementation
on my laptop.

NOTE: This effectively reverts beca63c6c07.  While it was a nice
optimization, it was based on prog_data and we can't do that anymore
once we start allowing the same binding table to be used with multiple
different pipelines.

Reviewed-by: Lionel Landwerlin
---
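The core of the change: each stage's bind map is hashed once at
pipeline-compile time (SHA-1, via util/mesa-sha1.h), and
vkCmdBindPipeline then does a compare-and-update against the hashes
cached in anv_cmd_state, setting dirty bits only for stages whose
layout actually changed.  Below is a minimal standalone sketch of that
pattern.  Every name in it is a hypothetical stand-in, and a toy
8-byte hash stands in for the driver's SHA-1s; this is not the actual
anv code.

/* Sketch of the compare-and-update dirty tracking added by this patch.
 * Hypothetical stand-in names throughout; a toy 8-byte hash replaces
 * the SHA-1s computed with util/mesa-sha1.h at pipeline-compile time.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NUM_STAGES 2
#define HASH_SIZE  8

/* Stand-in for the per-shader hashes stored in anv_pipeline_bind_map. */
struct bind_map_sketch {
   unsigned char layout_hash[HASH_SIZE];
};

/* Stand-in for anv_cmd_state: one cached hash per shader stage. */
struct cmd_state_sketch {
   unsigned char layout_hashes[NUM_STAGES][HASH_SIZE];
   unsigned descriptors_dirty; /* bitmask of stages */
};

/* Copy src over dst only if it differs; report whether anything changed.
 * Same shape as the mem_update() helper this patch adds. */
static bool
mem_update(void *dst, const void *src, size_t size)
{
   if (memcmp(dst, src, size) == 0)
      return false;

   memcpy(dst, src, size);
   return true;
}

/* Bind-time check: dirty a stage only if its layout hash changed. */
static void
bind_pipeline_sketch(struct cmd_state_sketch *state,
                     const struct bind_map_sketch maps[NUM_STAGES])
{
   for (unsigned s = 0; s < NUM_STAGES; s++) {
      if (mem_update(state->layout_hashes[s], maps[s].layout_hash, HASH_SIZE))
         state->descriptors_dirty |= 1u << s;
   }
}

int
main(void)
{
   struct cmd_state_sketch state = {0};
   const struct bind_map_sketch maps[NUM_STAGES] = {
      { .layout_hash = "aaaaaaa" },
      { .layout_hash = "bbbbbbb" },
   };

   bind_pipeline_sketch(&state, maps);
   printf("first bind:  dirty = 0x%x\n", state.descriptors_dirty); /* 0x3 */

   state.descriptors_dirty = 0;
   bind_pipeline_sketch(&state, maps);
   printf("second bind: dirty = 0x%x\n", state.descriptors_dirty); /* 0x0 */

   return 0;
}

Re-binding a pipeline with an identical bind-map hash leaves the dirty
mask at zero, which is what lets emit_binding_table and the push
constant re-emission be skipped on the hot path.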
 src/intel/vulkan/anv_cmd_buffer.c                | 49 +++++++++++++++++++++---
 src/intel/vulkan/anv_nir_apply_pipeline_layout.c | 13 +++++++
 src/intel/vulkan/anv_nir_compute_push_layout.c   |  9 +++++
 src/intel/vulkan/anv_pipeline.c                  |  6 +++
 src/intel/vulkan/anv_pipeline_cache.c            |  9 +++++
 src/intel/vulkan/anv_private.h                   | 16 +++++++-
 src/intel/vulkan/genX_cmd_buffer.c               | 24 +++--------
 7 files changed, 101 insertions(+), 25 deletions(-)

diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 2a555f1..6ba3cd6 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -380,6 +380,34 @@ anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer)
                          cmd_buffer);
 }
 
+static bool
+mem_update(void *dst, const void *src, size_t size)
+{
+   if (memcmp(dst, src, size) == 0)
+      return false;
+
+   memcpy(dst, src, size);
+   return true;
+}
+
+static void
+set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer,
+                       gl_shader_stage stage,
+                       const struct anv_pipeline_bind_map *map)
+{
+   if (mem_update(cmd_buffer->state.surface_sha1s[stage],
+                  map->surface_sha1, sizeof(map->surface_sha1)))
+      cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage);
+
+   if (mem_update(cmd_buffer->state.sampler_sha1s[stage],
+                  map->sampler_sha1, sizeof(map->sampler_sha1)))
+      cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage);
+
+   if (mem_update(cmd_buffer->state.push_sha1s[stage],
+                  map->push_sha1, sizeof(map->push_sha1)))
+      cmd_buffer->state.push_constants_dirty |= mesa_to_vk_shader_stage(stage);
+}
+
 void anv_CmdBindPipeline(
     VkCommandBuffer                             commandBuffer,
     VkPipelineBindPoint                         pipelineBindPoint,
@@ -389,19 +417,30 @@ void anv_CmdBindPipeline(
    ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
 
    switch (pipelineBindPoint) {
-   case VK_PIPELINE_BIND_POINT_COMPUTE:
+   case VK_PIPELINE_BIND_POINT_COMPUTE: {
+      if (cmd_buffer->state.compute.base.pipeline == pipeline)
+         return;
+
       cmd_buffer->state.compute.base.pipeline = pipeline;
       cmd_buffer->state.compute.pipeline_dirty = true;
-      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
-      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+      const struct anv_pipeline_bind_map *bind_map =
+         &pipeline->shaders[MESA_SHADER_COMPUTE]->bind_map;
+      set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE, bind_map);
       break;
+   }
 
    case VK_PIPELINE_BIND_POINT_GRAPHICS:
+      if (cmd_buffer->state.gfx.base.pipeline == pipeline)
+         return;
+
       cmd_buffer->state.gfx.base.pipeline = pipeline;
       cmd_buffer->state.gfx.vb_dirty |= pipeline->vb_used;
       cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
-      cmd_buffer->state.push_constants_dirty |= pipeline->active_stages;
-      cmd_buffer->state.descriptors_dirty |= pipeline->active_stages;
+
+      anv_foreach_stage(stage, pipeline->active_stages) {
+         set_dirty_for_bind_map(cmd_buffer, stage,
+                                &pipeline->shaders[stage]->bind_map);
+      }
 
       /* Apply the dynamic state from the pipeline */
       cmd_buffer->state.gfx.dirty |=
diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
index 6ebc02c..f37b672 100644
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -25,6 +25,7 @@
 #include "program/prog_parameter.h"
 #include "nir/nir_builder.h"
 #include "compiler/brw_nir.h"
+#include "util/mesa-sha1.h"
 #include "util/set.h"
 
 /* Sampler tables don't actually have a maximum size but we pick one just so
@@ -1318,6 +1319,7 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
                 dim == GLSL_SAMPLER_DIM_SUBPASS_MS)
                pipe_binding[i].input_attachment_index = var->data.index + i;
 
+            /* NOTE: This is a uint8_t so we really do need to != 0 here */
             pipe_binding[i].write_only =
                (var->data.image.access & ACCESS_NON_READABLE) != 0;
          }
@@ -1367,4 +1369,15 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
    }
 
    ralloc_free(mem_ctx);
+
+   /* Now that we're done computing the surface and sampler portions of the
+    * bind map, hash them.  This lets us quickly determine if the actual
+    * mapping has changed and not just a no-op pipeline change.
+    */
+   _mesa_sha1_compute(map->surface_to_descriptor,
+                      map->surface_count * sizeof(struct anv_pipeline_binding),
+                      map->surface_sha1);
+   _mesa_sha1_compute(map->sampler_to_descriptor,
+                      map->sampler_count * sizeof(struct anv_pipeline_binding),
+                      map->sampler_sha1);
 }
diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c
index e624a90..0b696fb 100644
--- a/src/intel/vulkan/anv_nir_compute_push_layout.c
+++ b/src/intel/vulkan/anv_nir_compute_push_layout.c
@@ -23,6 +23,7 @@
 
 #include "anv_nir.h"
 #include "compiler/brw_nir.h"
+#include "util/mesa-sha1.h"
 
 void
 anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
@@ -173,6 +174,14 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
        */
       map->push_ranges[0] = push_constant_range;
    }
+
+   /* Now that we're done computing the push constant portion of the
+    * bind map, hash it.  This lets us quickly determine if the actual
+    * mapping has changed and not just a no-op pipeline change.
+    */
+   _mesa_sha1_compute(map->push_ranges,
+                      sizeof(map->push_ranges),
+                      map->push_sha1);
 }
 
 void
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 144fcf4..aee3b22 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -1550,6 +1550,12 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline,
 
    anv_nir_validate_push_layout(&stage.prog_data.base, &stage.bind_map);
 
+   if (!stage.prog_data.cs.uses_num_work_groups) {
+      assert(stage.bind_map.surface_to_descriptor[0].set ==
+             ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS);
+      stage.bind_map.surface_to_descriptor[0].set = ANV_DESCRIPTOR_SET_NULL;
+   }
+
    const unsigned code_size = stage.prog_data.base.program_size;
    bin = anv_device_upload_kernel(pipeline->device, cache,
                                   &stage.cache_key, sizeof(stage.cache_key),
diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c
index a4294e1..396b75f 100644
--- a/src/intel/vulkan/anv_pipeline_cache.c
+++ b/src/intel/vulkan/anv_pipeline_cache.c
@@ -161,6 +161,12 @@ anv_shader_bin_write_to_blob(const struct anv_shader_bin *shader,
       blob_write_uint32(blob, 0);
    }
 
+   blob_write_bytes(blob, shader->bind_map.surface_sha1,
+                    sizeof(shader->bind_map.surface_sha1));
+   blob_write_bytes(blob, shader->bind_map.sampler_sha1,
+                    sizeof(shader->bind_map.sampler_sha1));
+   blob_write_bytes(blob, shader->bind_map.push_sha1,
+                    sizeof(shader->bind_map.push_sha1));
    blob_write_uint32(blob, shader->bind_map.surface_count);
    blob_write_uint32(blob, shader->bind_map.sampler_count);
    blob_write_bytes(blob, shader->bind_map.surface_to_descriptor,
@@ -206,6 +212,9 @@ anv_shader_bin_create_from_blob(struct anv_device *device,
       xfb_info = blob_read_bytes(blob, xfb_size);
 
    struct anv_pipeline_bind_map bind_map;
+   blob_copy_bytes(blob, bind_map.surface_sha1, sizeof(bind_map.surface_sha1));
+   blob_copy_bytes(blob, bind_map.sampler_sha1, sizeof(bind_map.sampler_sha1));
+   blob_copy_bytes(blob, bind_map.push_sha1, sizeof(bind_map.push_sha1));
    bind_map.surface_count = blob_read_uint32(blob);
    bind_map.sampler_count = blob_read_uint32(blob);
    bind_map.surface_to_descriptor = (void *)
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index ec403ac..7c144d7 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2009,6 +2009,7 @@ anv_descriptor_set_destroy(struct anv_device *device,
                            struct anv_descriptor_pool *pool,
                            struct anv_descriptor_set *set);
 
+#define ANV_DESCRIPTOR_SET_NULL             (UINT8_MAX - 5)
 #define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS   (UINT8_MAX - 4)
 #define ANV_DESCRIPTOR_SET_DESCRIPTORS      (UINT8_MAX - 3)
 #define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS  (UINT8_MAX - 2)
@@ -2042,7 +2043,12 @@ struct anv_pipeline_binding {
    };
 
    /** For a storage image, whether it is write-only */
-   bool write_only;
+   uint8_t write_only;
+
+   /** Pad to 64 bits so that there are no holes and we can safely memcmp
+    * assuming POD zero-initialization.
+    */
+   uint8_t pad;
 };
 
 struct anv_push_range {
@@ -2575,6 +2581,10 @@ struct anv_cmd_state {
    struct anv_state                             binding_tables[MESA_SHADER_STAGES];
    struct anv_state                             samplers[MESA_SHADER_STAGES];
 
+   unsigned char                                sampler_sha1s[MESA_SHADER_STAGES][20];
+   unsigned char                                surface_sha1s[MESA_SHADER_STAGES][20];
+   unsigned char                                push_sha1s[MESA_SHADER_STAGES][20];
+
    /**
    * Whether or not the gen8 PMA fix is enabled.  We ensure that, at the top
    * of any command buffer it is disabled by disabling it in EndCommandBuffer
@@ -2936,6 +2946,10 @@ mesa_to_vk_shader_stage(gl_shader_stage mesa_stage)
         __tmp &= ~(1 << (stage)))
 
 struct anv_pipeline_bind_map {
+   unsigned char                                surface_sha1[20];
+   unsigned char                                sampler_sha1[20];
+   unsigned char                                push_sha1[20];
+
    uint32_t surface_count;
    uint32_t sampler_count;
 
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index d4ed492..a30f675 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -2122,8 +2122,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
       return VK_SUCCESS;
    }
 
-   struct anv_shader_bin *bin = pipeline->shaders[stage];
-   struct anv_pipeline_bind_map *map = &bin->bind_map;
+   struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
    if (map->surface_count == 0) {
       *bt_state = (struct anv_state) { 0, };
       return VK_SUCCESS;
@@ -2149,6 +2148,10 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
       struct anv_state surface_state;
 
       switch (binding->set) {
+      case ANV_DESCRIPTOR_SET_NULL:
+         bt_map[s] = 0;
+         break;
+
       case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
          /* Color attachment binding */
          assert(stage == MESA_SHADER_FRAGMENT);
@@ -2199,8 +2202,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
       case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
         /* This is always the first binding for compute shaders */
         assert(stage == MESA_SHADER_COMPUTE && s == 0);
-        if (!get_cs_prog_data(pipeline)->uses_num_work_groups)
-           break;
 
         struct anv_state surface_state =
            anv_cmd_buffer_alloc_surface_state(cmd_buffer);
@@ -2305,16 +2306,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
          break;
 
       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
-         /* If the shader never does any UBO pulls (this is a fairly common
-          * case) then we don't need to fill out those binding table entries.
-          * The real cost savings here is that we don't have to build the
-          * surface state for them which is surprisingly expensive when it's
-          * on the hot-path.
-          */
-         if (!bin->prog_data->has_ubo_pull)
-            continue;
-         /* Fall through */
-
       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
          /* Compute the offset within the buffer */
         struct anv_push_constants *push =
@@ -2730,11 +2721,6 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
 
-      /* The exact descriptor layout is pulled from the pipeline, so we need
-       * to re-emit binding tables on every pipeline change.
-       */
-      cmd_buffer->state.descriptors_dirty |= pipeline->active_stages;
-
       /* If the pipeline changed, we may need to re-allocate push constant
        * space in the URB.
        */
-- 
2.7.4