From a7f098fb769bdfdac692a04eab6bdd84e061e5cd Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 15 May 2017 23:03:01 +0200 Subject: [PATCH] radeonsi: only upload (dump to L2) those descriptors that are used by shaders MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This decreases the size of CE RAM dumps to L2, or the size of descriptor uploads without CE. Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/si_compute.c | 28 ++++++-- src/gallium/drivers/radeonsi/si_descriptors.c | 89 ++++++++++++++++++++----- src/gallium/drivers/radeonsi/si_state.h | 18 ++++- src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++ 4 files changed, 117 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 22ef111..4c98066 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -208,7 +208,24 @@ static void *si_create_compute_state( static void si_bind_compute_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context*)ctx; - sctx->cs_shader_state.program = (struct si_compute*)state; + struct si_compute *program = (struct si_compute*)state; + + sctx->cs_shader_state.program = program; + if (!program) + return; + + /* Wait because we need active slot usage masks. */ + if (program->ir_type == PIPE_SHADER_IR_TGSI) + util_queue_fence_wait(&program->ready); + + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, + program->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, + program->active_samplers_and_images); } static void si_set_global_binding( @@ -756,12 +773,9 @@ static void si_launch_grid( sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; - if (program->ir_type == PIPE_SHADER_IR_TGSI) { - util_queue_fence_wait(&program->ready); - - if (program->shader.compilation_failed) - return; - } + if (program->ir_type == PIPE_SHADER_IR_TGSI && + program->shader.compilation_failed) + return; si_decompress_compute_textures(sctx); diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index b38b6b5..b514961 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -125,12 +125,14 @@ static void si_release_descriptors(struct si_descriptors *desc) } static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size, - unsigned *out_offset, struct r600_resource **out_buf) { + unsigned *out_offset, struct r600_resource **out_buf) +{ uint64_t va; u_suballocator_alloc(sctx->ce_suballocator, size, - sctx->screen->b.info.tcc_cache_line_size, - out_offset, (struct pipe_resource**)out_buf); + si_optimal_tcc_alignment(sctx, size), + out_offset, + (struct pipe_resource**)out_buf); if (!out_buf) return false; @@ -193,7 +195,16 @@ static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc, struct r600_atom * atom) { - unsigned list_size = desc->num_elements * desc->element_dw_size * 4; + unsigned slot_size = desc->element_dw_size * 4; + unsigned first_slot_offset = desc->first_active_slot * slot_size; + unsigned upload_size = desc->num_active_slots * slot_size; + + /* Skip the upload if no shader is using the descriptors. dirty_mask + * will stay dirty and the descriptors will be uploaded when there is + * a shader using them. + */ + if (!upload_size) + return true; if (sctx->ce_ib && desc->uses_ce) { uint32_t const* list = (uint32_t const*)desc->list; @@ -212,25 +223,32 @@ static bool si_upload_descriptors(struct si_context *sctx, radeon_emit_array(sctx->ce_ib, list + begin, count); } - if (!si_ce_upload(sctx, desc->ce_offset, list_size, - &desc->buffer_offset, &desc->buffer)) + if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset, + upload_size, (unsigned*)&desc->buffer_offset, + &desc->buffer)) return false; } else { - void *ptr; + uint32_t *ptr; - u_upload_alloc(sctx->b.b.const_uploader, 0, list_size, - sctx->screen->b.info.tcc_cache_line_size, - &desc->buffer_offset, - (struct pipe_resource**)&desc->buffer, &ptr); + u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size, + si_optimal_tcc_alignment(sctx, upload_size), + (unsigned*)&desc->buffer_offset, + (struct pipe_resource**)&desc->buffer, + (void**)&ptr); if (!desc->buffer) return false; /* skip the draw call */ - util_memcpy_cpu_to_le32(ptr, desc->list, list_size); - desc->gpu_list = ptr; + util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset, + upload_size); + desc->gpu_list = ptr - first_slot_offset / 4; radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); } + + /* The shader pointer should point to slot 0. */ + desc->buffer_offset -= first_slot_offset; + desc->dirty_mask = 0; if (atom) @@ -1030,7 +1048,7 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) u_upload_alloc(sctx->b.b.const_uploader, 0, desc_list_byte_size, si_optimal_tcc_alignment(sctx, desc_list_byte_size), - &desc->buffer_offset, + (unsigned*)&desc->buffer_offset, (struct pipe_resource**)&desc->buffer, (void**)&ptr); if (!desc->buffer) return false; @@ -1891,7 +1909,8 @@ static void si_emit_shader_pointer(struct si_context *sctx, struct radeon_winsys_cs *cs = sctx->b.gfx.cs; uint64_t va; - assert(desc->buffer); + if (!desc->buffer) + return; /* the pointer is not used by current shaders */ va = desc->buffer->gpu_address + desc->buffer_offset; @@ -2034,6 +2053,8 @@ void si_init_all_descriptors(struct si_context *sctx) RADEON_USAGE_READWRITE, RADEON_USAGE_READ, RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER, &ce_offset); + sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; + si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS, 4, SI_NUM_VERTEX_BUFFERS, NULL); @@ -2156,3 +2177,41 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx) si_shader_userdata_begin_new_cs(sctx); } + +void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, + uint64_t new_active_mask) +{ + struct si_descriptors *desc = &sctx->descriptors[desc_idx]; + + /* Ignore no-op updates and updates that disable all slots. */ + if (!new_active_mask || + new_active_mask == u_bit_consecutive64(desc->first_active_slot, + desc->num_active_slots)) + return; + + int first, count; + u_bit_scan_consecutive_range64(&new_active_mask, &first, &count); + assert(new_active_mask == 0); + + /* Upload/dump descriptors if slots are being enabled. */ + if (first < desc->first_active_slot || + first + count > desc->first_active_slot + desc->num_active_slots) + sctx->descriptors_dirty |= 1u << desc_idx; + + desc->first_active_slot = first; + desc->num_active_slots = count; +} + +void si_set_active_descriptors_for_shader(struct si_context *sctx, + struct si_shader_selector *sel) +{ + if (!sel) + return; + + si_set_active_descriptors(sctx, + si_const_and_shader_buffer_descriptors_idx(sel->type), + sel->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, + si_sampler_and_image_descriptors_idx(sel->type), + sel->active_samplers_and_images); +} diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index f2003a5..dfabaa3 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -42,6 +42,7 @@ struct si_screen; struct si_shader; +struct si_shader_selector; struct si_state_blend { struct si_pm4_state pm4; @@ -222,12 +223,20 @@ struct si_descriptors { /* The buffer where the descriptors have been uploaded. */ struct r600_resource *buffer; - unsigned buffer_offset; + int buffer_offset; /* can be negative if not using lower slots */ /* Offset in CE RAM */ unsigned ce_offset; - /* elements of the list that are changed and need to be uploaded */ + /* Slots that are used by currently-bound shaders. + * With CE: It determines which slots are dumped to L2. + * It doesn't skip uploads to CE RAM. + * Without CE: It determines which slots are uploaded. + */ + unsigned first_active_slot; + unsigned num_active_slots; + + /* Slots that have been changed and need to be uploaded. */ uint64_t dirty_mask; /* Whether CE is used to upload this descriptor array. */ @@ -315,6 +324,11 @@ void si_emit_graphics_shader_userdata(struct si_context *sctx, void si_emit_compute_shader_userdata(struct si_context *sctx); void si_set_rw_buffer(struct si_context *sctx, uint slot, const struct pipe_constant_buffer *input); +void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, + uint64_t new_active_mask); +void si_set_active_descriptors_for_shader(struct si_context *sctx, + struct si_shader_selector *sel); + /* si_state.c */ struct si_shader_selector; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 45d996b..8ac4309 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2151,6 +2151,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) sctx->do_update_shaders = true; si_mark_atom_dirty(sctx, &sctx->clip_regs); r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); + si_set_active_descriptors_for_shader(sctx, sel); } static void si_update_tess_uses_prim_id(struct si_context *sctx) @@ -2188,6 +2189,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state) si_update_tess_uses_prim_id(sctx); } r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); + si_set_active_descriptors_for_shader(sctx, sel); } static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) @@ -2206,6 +2208,8 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) if (enable_changed) sctx->last_tcs = NULL; /* invalidate derived tess state */ + + si_set_active_descriptors_for_shader(sctx, sel); } static void si_bind_tes_shader(struct pipe_context *ctx, void *state) @@ -2230,6 +2234,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) sctx->last_tes_sh_base = -1; /* invalidate derived tess state */ } r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); + si_set_active_descriptors_for_shader(sctx, sel); } static void si_bind_ps_shader(struct pipe_context *ctx, void *state) @@ -2247,6 +2252,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess) si_update_tess_uses_prim_id(sctx); si_mark_atom_dirty(sctx, &sctx->cb_render_state); + si_set_active_descriptors_for_shader(sctx, sel); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) -- 2.7.4